Commit 3501c0fa by ltb

merge reducemax(float16) reducesum(float16) div(float16) negate(float16)…

merge reducemax(float16) reducesum(float16) div(float16) negate(float16) multiplydidm(float16) matrixmul(all,float16/int8) sum(float16/int/int8) sign/sub/sumdim/subdim( float16)
parent fc5a630a
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* backward computation for data operation
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-26
*/
#include "XNoder.h"
#include "XBackwardData.h"
#include "../tensor/XName.h"
#include "../tensor/XUtility.h"
#include "../tensor/core/CHeader.h"
#include "../tensor/core/getandset/SetData.h"
namespace nts{
/* compute dE/dx of a node */
void XDataGrad::MakeGrad(XTensor * node, bool isEfficent)
{
CheckNTErrors(node->grad != NULL, "No gradient found!");
XLink &income = node->income;
int operID = income.typeID;
if(operID == GETANDSET_CONVERTDATATYPE)
GradConvertDataType(node, isEfficent);
else if(operID == GETANDSET_INDEXTOONEHOT)
GradIndexToOnehot(node, isEfficent);
else if(operID == GETANDSET_ONEHOTTOINDEX)
GradOnehotToIndex(node, isEfficent);
else{
ShowNTErrors("TODO!");
}
}
/* indicates whether the node is for a data operation */
bool XDataGrad::IsDataOP(XTensor * node)
{
XLink &income = node->income;
return (income.typeID & DATA_BASE) != 0;
}
/*
gradient computation for convert datatype
for
b = convertdatatype(a)
we have
dE/da = convertdatatype(dE/db)
>> node - the node (b) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XDataGrad::GradConvertDataType(XTensor * node, bool isEfficent)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for ConvertDataType!");
XTensor * input = income.tails[0];
XNoder::MakeGrad(input);
XTensor * tmp = NewTensorBuf(input->grad, input->devID, input->mem);
_ConvertDataType(node->grad, tmp);
_SumMe(input->grad, tmp);
DelTensorBuf(tmp);
node->visitMark = NODE_FINISHED;
}
/*
gradient computation for OnehotToIndex
for
b = OnehotToIndex(a)
we have
dE/da = IndexToOnehot(dE/db)
>> node - the node (b) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XDataGrad::GradOnehotToIndex(XTensor * node, bool isEfficent)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for IndexToOnehot!");
XTensor * input = income.tails[0];
XNoder::MakeGrad(input);
node->visitMark = NODE_FINISHED;
}
/*
gradient computation for IndexToOnehot
for
b = IndexToOnehot(a)
we have
dE/da = OnehotToIndex(dE/db)
>> node - the node (b) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XDataGrad::GradIndexToOnehot(XTensor * node, bool isEfficent)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for IndexToOnehot!");
XTensor * input = income.tails[0];
XNoder::MakeGrad(input);
node->visitMark = NODE_FINISHED;
}
} // namespace nts(NiuTrans.Tensor)
@@ -52,7 +52,15 @@ void XLossGrad::MakeGrad(XTensor * node, bool isEfficient)
     XTensor * dedy = output->grad;
 
     if (income.tailNum == 1) {
-        _SetDataFixed(dedy, 1.0F);
+        if(dedy->dataType == X_FLOAT)
+            _SetDataFixedFloat(dedy, 1.0F);
+        else if(dedy->dataType == X_DOUBLE)
+            _SetDataFixedDouble(dedy, 1.0);
+        else if(dedy->dataType == X_INT)
+            _SetDataFixedInt(dedy, 1);
+        else
+            ShowNTErrors("TODO");
         return;
     }
@@ -136,7 +144,15 @@ void XLossGrad::Compute(XTensor * gold, XTensor * y,
                         LOSS_FUNCTION_NAME lossName)
 {
     if(gold == NULL){
-        _SetDataFixed(dedy, 1.0F);
+        if(dedy->dataType == X_FLOAT)
+            _SetDataFixedFloat(dedy, 1.0F);
+        else if(dedy->dataType == X_DOUBLE)
+            _SetDataFixedDouble(dedy, 1.0);
+        else if(dedy->dataType == X_INT)
+            _SetDataFixedInt(dedy, 1);
+        else{
+            ShowNTErrors("TODO");
+        }
         return;
     }
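The same four-way dtype dispatch appears in both hunks above; a sketch of factoring it into a single helper, using only calls that already appear in this commit (the helper itself is not part of the library):

/* a sketch only -- not library code */
static void SetGradToOnes(XTensor * dedy)
{
    if (dedy->dataType == X_FLOAT)
        _SetDataFixedFloat(dedy, 1.0F);
    else if (dedy->dataType == X_DOUBLE)
        _SetDataFixedDouble(dedy, 1.0);
    else if (dedy->dataType == X_INT)
        _SetDataFixedInt(dedy, 1);
    else
        ShowNTErrors("TODO");
}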
......
@@ -735,7 +735,7 @@ void XMathGrad::GradMultiply(XTensor * node, bool isEfficient)
     if (!isEfficient || b->isGrad) {
         XNoder::MakeGrad(b);
-        _Multiply(node->grad, a, b->grad, 1.0F);;
+        _Multiply(node->grad, a, b->grad, 1.0F);
     }
 
     node->visitMark = NODE_FINISHED;
@@ -855,7 +855,6 @@ void XMathGrad::GradMultiplyBroadcast(XTensor * node, bool isEfficient)
     XTensor * a = income.tails[0];
     XTensor * b = income.tails[1];
-    DTYPE beta = income.GetParam(0);
 
     XNoder::MakeGrad(a);
     _MultiplyBroadcast(node->grad, b, a->grad, 1.0F);
@@ -1319,7 +1318,7 @@ void XMathGrad::GradSumBroadcast(XTensor * node, bool isEfficient)
     XTensor * a = income.tails[0];
     XTensor * b = income.tails[1];
-    DTYPE beta = income.GetParam(0);
+    //DTYPE beta = income.GetParam(0);
 
     XNoder::MakeGrad(a);
     _Sum(a->grad, node->grad, a->grad);
......
@@ -68,7 +68,7 @@ void XShapeGrad::MakeGrad(XTensor * node, bool isEfficent)
 bool XShapeGrad::IsShapeOP(XTensor * node)
 {
     XLink &income = node->income;
-    return (income.typeID & SHAPE_BASE) != 0;
+    return (income.typeID & DATA_BASE) != 0;
 }
 
 /* post processing of a node */
@@ -271,8 +271,8 @@ void XShapeGrad::GradMergeList(XTensor * node, bool isEfficient)
     CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for MERGE!");
 
     XTensor * last = NULL;
-    XList smalls(income.tailNum);
-    XList smallsGrad(income.tailNum);
+    TensorList smalls(income.tailNum);
+    TensorList smallsGrad(income.tailNum);
     bool mergeOnly = true;
 
     for(int i = 0; i < income.tailNum; i++){
         XTensor * tail = income.tails[i];
@@ -440,7 +440,7 @@ void XShapeGrad::GradSplitListPost(XTensor * node, bool isEfficient)
     /* we compute the gradient for current node, rather than for
        child node, i.e., we use the outgoing edge here */
     XLink &outgo = node->outgo;
-    XList splits(outgo.tailNum);
+    TensorList splits(outgo.tailNum);
     int whereToSplit = -1;
     int splitNum = 0;
......
@@ -54,7 +54,7 @@ private:
     static
     void GradGather(XTensor * node, bool isEfficent);
 
-    /* gradient computation for dropout with indexs */
+    /* gradient computation for dropout with index: b = dropoutwithindex(a, index) */
     static
     void GradDropoutWithIndex(XTensor * node, bool isEfficent);
......
@@ -37,16 +37,16 @@ struct XNet
     unsigned int id;
 
     /* tensor nodes of the network (in order) */
-    XList nodes;
+    TensorList nodes;
 
     /* tensor nodes to keep gradient for output (e.g., SGD) */
-    XList gradNodes;
+    TensorList gradNodes;
 
     /* output nodes of the network */
-    XList outputs;
+    TensorList outputs;
 
     /* input nodes of the network */
-    XList inputs;
+    TensorList inputs;
 
     /* indicates whether the network just keeps the gradient for parameter tensors */
     bool isGradEfficient;
@@ -71,15 +71,15 @@ struct XNet
     /* backward propagation to obtain gradient
        with a number of root nodes */
-    void Backward(XList &roots, LOSS_FUNCTION_NAME loss = NOLOSS);
+    void Backward(TensorList &roots, LOSS_FUNCTION_NAME loss = NOLOSS);
 
     /* backward propagation to obtain gradient
        with a number of root nodes */
-    void Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss = NOLOSS);
+    void Backward(TensorList &roots, TensorList &golds, LOSS_FUNCTION_NAME loss = NOLOSS);
 
     /* backward propagation to obtain gradient wrt. the loss/error function
        with a number of root nodes */
-    void Backward(XList &roots, XList &golds, XList &paddings, LOSS_FUNCTION_NAME loss = NOLOSS);
+    void Backward(TensorList &roots, TensorList &golds, TensorList &paddings, LOSS_FUNCTION_NAME loss = NOLOSS);
 
     /* backward computation for a given node */
     void BackwardNode(XTensor * node, bool isEfficent = false);
@@ -93,10 +93,10 @@ struct XNet
     /* traverse the net and find the topological order by
        depth-first search (Tarjan's algorithm) */
-    void Traverse(XList &roots);
+    void Traverse(TensorList &roots);
 
     /* depth-first search given a node (Tarjan's algorithm for topological ordering) */
-    void TarjanVisit(XTensor * node, XList &orders, const unsigned int code);
+    void TarjanVisit(XTensor * node, TensorList &orders, const unsigned int code);
 
     /* dump network information */
     void Dump(FILE * file);
......
@@ -247,13 +247,13 @@ void Check(FNNModel &model)
 /* make a hard copy of the fnn model */
 void Copy(FNNModel &tgt, FNNModel &src)
 {
-    InitTensor(&tgt.embeddingW, &src.embeddingW);
+    InitTensorV2(&tgt.embeddingW, &src.embeddingW);
     for(int i = 0; i < MAX_HIDDEN_NUM; i++){
-        InitTensor(&tgt.hiddenW[i], &src.hiddenW[i]);
-        InitTensor(&tgt.hiddenB[i], &src.hiddenB[i]);
+        InitTensorV2(&tgt.hiddenW[i], &src.hiddenW[i]);
+        InitTensorV2(&tgt.hiddenB[i], &src.hiddenB[i]);
     }
-    InitTensor(&tgt.outputW, &src.outputW);
-    InitTensor(&tgt.outputB, &src.outputB);
+    InitTensorV2(&tgt.outputW, &src.outputW);
+    InitTensorV2(&tgt.outputB, &src.outputB);
 
     tgt.n = src.n;
     tgt.eSize = src.eSize;
@@ -310,7 +310,7 @@ initialize a 1d tensor using the fnn model setting
 */
 void InitModelTensor1D(XTensor &tensor, int num, FNNModel &model)
 {
-    InitTensor1D(&tensor, num, X_FLOAT, model.devID, model.mem);
+    InitTensor1DV2(&tensor, num, X_FLOAT, model.devID);
 }
 
 /*
@@ -322,7 +322,7 @@ initialize a 2d tensor using the fnn model setting
 */
 void InitModelTensor2D(XTensor &tensor, int rowNum, int colNum, FNNModel &model)
 {
-    InitTensor2D(&tensor, rowNum, colNum, X_FLOAT, model.devID, model.mem);
+    InitTensor2DV2(&tensor, rowNum, colNum, X_FLOAT, model.devID);
 }
@@ -449,6 +449,9 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
         /* the gold standard */
         XTensor gold;
 
+        /* the loss tensor */
+        XTensor lossTensor;
+
         /* make the input tensor for position i */
         for(int i = 0; i < model.n - 1; i++)
             MakeWordBatch(inputs[i], ngrams, ngramNum, i, model.vSize, model.devID, model.mem);
@@ -466,6 +469,8 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
             /* forward computation */
             Forward(inputs, output, model, net);
 
             /* backward computation to obtain gradients */
             Backward(inputs, output, gold, CROSSENTROPY, model, grad, net);
@@ -481,11 +486,13 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
             /* this is implemented by gather function */
             ForwardAutoDiff(ngrams, ngramNum, output, model);
 
-            ///* this is implemented by multiply function */
+            /* this is implemented by multiply function */
             //ForwardAutoDiff(inputs, output, model);
 
+            lossTensor = CrossEntropy(output, gold);
+
             /* automatic differentiation */
-            autoDiffer.Backward(output, gold, CROSSENTROPY);
+            autoDiffer.Backward(lossTensor);
+            //autoDiffer.Backward(output, gold, CROSSENTROPY);
 
             /* update model parameters */
             Update(model, grad, learningRate, true);
@@ -494,7 +501,9 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
             /* get probabilities */
             float prob = GetProb(output, gold);
 
-            loss += -prob;
+            prob = ReduceSumAll(lossTensor);
+            loss += prob;
 
             wordCount += ngramNum;
             wordCountTotal += ngramNum;
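Taken together, the hunks above switch the FNNLM auto-diff path from a LogSoftmax output with an implicit loss to a Softmax output plus an explicit loss tensor. In outline (a condensed sketch of the new flow assembled from this commit, not a verbatim excerpt):

/* new training flow, condensed from the hunks above */
output = Softmax(MMul(hidden, model.outputW) + model.outputB, 1);
lossTensor = CrossEntropy(output, gold);
autoDiffer.Backward(lossTensor);     /* backward now starts from the loss node */
prob = ReduceSumAll(lossTensor);     /* scalar loss, replacing the GetProb bookkeeping */
loss += prob;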
@@ -537,8 +546,8 @@ update the model parameters using the delta rule
 */
 void Update(FNNModel &model, FNNModel &grad, float epsilon, bool isNodeGrad)
 {
-    XList paraList(10);
-    XList gradList(10);
+    TensorList paraList(10);
+    TensorList gradList(10);
 
     paraList.Add(&model.outputW);
     paraList.Add(&model.outputB);
@@ -595,14 +604,14 @@ get prediction probabilities of the gold words
 float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs)
 {
     XTensor probs;
-    InitTensor(&probs, &output);
+    InitTensorV2(&probs, &output);
 
     /* probs[i,j] = output[i,j] * gold[i,j] */
     _Multiply(&output, &gold, &probs);
 
     /* probability of each word */
     XTensor wprobs;
-    InitTensor1D(&wprobs, output.GetDim(0), output.dataType, output.devID, output.mem);
+    InitTensor1DV2(&wprobs, output.GetDim(0), output.dataType, output.devID);
     _ReduceSum(&probs, &wprobs, 1);
 
     if(wordProbs != NULL)
         _CopyValues(&wprobs, wordProbs);
@@ -616,7 +625,7 @@ float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs)
     /* probability for the batch */
     XTensor result;
-    InitTensor1D(&result, 1, X_FLOAT, output.devID, output.mem);
+    InitTensor1DV2(&result, 1, X_FLOAT, output.devID);
     _ReduceSum(&probs, &result, 1);
 
     return result.Get1D(0);
@@ -718,7 +727,7 @@ The indexed cell is set to 1, and 0 otherwise.
 void InitZeroOneTensor2D(XTensor &tensor, int rowNum, int colNum, int * rows, int * cols,
                          int itemNum, int devID, XMem * mem)
 {
-    InitTensor2D(&tensor, rowNum, colNum, X_FLOAT, devID, mem);
+    InitTensor2DV2(&tensor, rowNum, colNum, X_FLOAT, devID);
 
     tensor.SetZeroAll();
@@ -765,7 +774,7 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
     int batchSize = -1;
     int n = model.n;
     int depth = model.hDepth;
-    XList eList(n - 1);
+    TensorList eList(n - 1);
 
     /* previous n - 1 words */
     for(int i = 0; i < n - 1; i++){
@@ -811,7 +820,7 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
     /* make a 2d tensor for the bias term */
     XTensor b2D;
-    InitTensor(&b2D, &s);
+    InitTensorV2(&b2D, &s);
     _Unsqueeze(&b, &b2D, 0, batchSize);
 
     /* introduce bias term:
@@ -843,7 +852,7 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
     _MatrixMul(&h_last, X_NOTRANS, &w, X_NOTRANS, &s);
 
     XTensor b2D;
-    InitTensor(&b2D, &s);
+    InitTensorV2(&b2D, &s);
     _Unsqueeze(&b, &b2D, 0, batchSize);
 
     _Sum(&s, &b2D, &s);
@@ -908,8 +917,8 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
     XTensor dedsHidden;
     XTensor dedxBottom;
     if (depth > 0)
-        InitTensor(&dedsHidden, &dedx);
-    InitTensor(&dedxBottom, &net.embeddingCat);
+        InitTensorV2(&dedsHidden, &dedx);
+    InitTensorV2(&dedxBottom, &net.embeddingCat);
 
     /* back-propagation from top to bottom in the stack of hidden layers
        for each layer, h = f(s) */
@@ -943,11 +952,11 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
         _CopyValues(&dedx, &gradPassed);
     }
 
-    XList eList(n - 1);
+    TensorList eList(n - 1);
 
     /* back-propagation for the embedding layer */
     for (int i = 0; i < n - 1; i++) {
-        XTensor * dedy = NewTensor2D(batchSize, model.eSize, X_FLOAT, model.devID, model.mem);
+        XTensor * dedy = NewTensor2DV2(batchSize, model.eSize, X_FLOAT, model.devID);
         eList.Add(dedy);
     }
@@ -999,7 +1008,7 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model
         }
     }
 
-    InitTensor1D(&words, size, X_INT, model.devID, model.mem);
+    InitTensor1DV2(&words, size, X_INT, model.devID);
     words.SetData(index, size);
     embeddingBig = Gather(model.embeddingW, words);
@@ -1017,7 +1026,8 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model
         hidden = HardTanH(MMul(hidden, model.hiddenW[i]) + model.hiddenB[i]);
 
     /* output layer */
-    output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
+    //output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
+    output = Softmax(MMul(hidden, model.outputW) + model.outputB, 1);
 }
 /*
@@ -1036,7 +1046,7 @@ void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model)
     XTensor hidden;
     XTensor b;
 
-    XList inputList(n - 1);
+    TensorList inputList(n - 1);
     for(int i = 0; i < n - 1; i++)
         inputList.Add(inputs + i);
@@ -1177,7 +1187,7 @@ void Test(const char * test, const char * result, FNNModel &model)
         /* prediction probabilities */
         XTensor probs;
-        InitTensor1D(&probs, ngramNum);
+        InitTensor1DV2(&probs, ngramNum);
 
         /* get probabilities */
         float prob = GetProb(output, gold, &probs);
@@ -1200,7 +1210,6 @@ void Test(const char * test, const char * result, FNNModel &model)
     }
 
     fclose(file);
-    fclose(ofile);
 
     double elapsed = GetClockSec() - startT;
......
@@ -127,7 +127,7 @@ XTensor T2TAttention::MakeBig(XTensor &kqv, XTensor &mask, bool isTraining)
     XTensor q2;
     XTensor v2;
     XTensor kqv2;
-    XList split;
+    TensorList split;
 
     kqv2 = MMul(kqv, wbig);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-25
* it is cold today but i'll move to a warm place tomorrow :)
*/
#ifndef __T2TBATCHLOADER_H__
#define __T2TBATCHLOADER_H__
#include "../../network/XNet.h"
using namespace nts;
namespace transformer
{
#define MAX_SEQUENCE_LENGTH 1024 * 16
/* node to keep batch information */
struct BatchNode
{
/* beginning position */
int beg;
/* end position */
int end;
/* maximum word number on the encoder side */
int maxEnc;
/* maximum word number on the decoder side */
int maxDec;
/* a key for sorting */
int key;
};
class T2TBatchLoader
{
public:
/* buffer for loading words */
int * buf;
/* another buffer */
int * buf2;
/* batch buf */
BatchNode * bufBatch;
/* buffer size */
int bufSize;
/* size of batch buffer */
int bufBatchSize;
/* length of each sequence */
int * seqLen;
/* another array */
int * seqLen2;
/* offset of the first word for each sequence */
int * seqOffset;
/* number of sequences in the buffer */
int nseqBuf;
/* offset for next sequence in the buffer */
int nextSeq;
/* offset for next batch */
int nextBatch;
/* indicates whether we double the </s> symbol for the output of lms */
bool isDoubledEnd;
/* indicates whether we use batchsize = max * sc
rather than batchsize = word-number, where max is the maximum
length and sc is the sentence number */
bool isSmallBatch;
/* counterpart of "isSmallBatch" */
bool isBigBatch;
/* randomize batches */
bool isRandomBatch;
/* bucket size */
int bucketSize;
public:
/* constructor */
T2TBatchLoader();
/* de-constructor */
~T2TBatchLoader();
/* initialization */
void Init(int argc, char ** argv);
/* load data to buffer */
int LoadBuf(FILE * file, bool isSorted, int step);
/* clear data buffer */
void ClearBuf();
/* set the random batch flag */
void SetRandomBatch(bool flag = true);
/* load a batch of sequences */
int LoadBatch(FILE * file, bool isLM,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold, XTensor * label,
int * seqs,
int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &ws, int &wCount,
int devID, XMem * mem,
bool isTraining);
/* load a batch of sequences (for language modeling) */
int LoadBatchLM(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold, XTensor * label,
int * seqs, int vs, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem,
bool isTraining);
/* load a batch of sequences (for machine translation) */
int LoadBatchMT(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold, XTensor * label,
int * seqs, int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &ws, int &wCount,
int devID, XMem * mem,
bool isTraining);
/* shuffle the data file */
void Shuffle(const char * srcFile, const char * tgtFile);
};
}
#endif
\ No newline at end of file
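BatchNode carries a key precisely so that buffered batches can be reordered before loading. A minimal sketch of how such a key could drive sorting (the actual comparator inside T2TBatchLoader is not shown in this commit; this one is a hypothetical illustration):

#include <stdlib.h>

/* hypothetical comparator -- an assumption, not the library's code */
int CompareBatchNode(const void * a, const void * b)
{
    return ((BatchNode*)a)->key - ((BatchNode*)b)->key;
}

/* possible usage inside the loader:
   qsort(bufBatch, bufBatchSize, sizeof(BatchNode), CompareBatchNode); */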
@@ -31,6 +31,10 @@ namespace transformer
 /* constructor */
 AttDecoder::AttDecoder()
 {
+    attentions = NULL;
+    fnns = NULL;
+    attLayerNorms = NULL;
+    fnnLayerNorms = NULL;
     attentionsEnde = NULL;
     attEndeLayerNorms = NULL;
 }
@@ -38,6 +42,10 @@ AttDecoder::AttDecoder()
 /* de-constructor */
 AttDecoder::~AttDecoder()
 {
+    delete[] attentions;
+    delete[] fnns;
+    delete[] attLayerNorms;
+    delete[] fnnLayerNorms;
     delete[] attentionsEnde;
     delete[] attEndeLayerNorms;
 }
@@ -68,7 +76,7 @@ void AttDecoder::InitModel(int argc, char ** argv,
     LoadParamFloat(argc, argv, "dropout", &dropoutP, 0);
 
     CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
-    CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsize\"");
+    CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsizetgt\"");
 
     /* embedding model */
     embedder.InitModel(argc, argv, devID, mem, false);
@@ -160,6 +168,8 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
         x = fnnLayerNorms[i].Make(res);
     }
 
+    x.SetName(DECODING_NAME);
+
     return x;
 }
......
@@ -27,6 +27,9 @@
 namespace transformer
 {
 
+#define DECODING_NAME "decoding"
+#define DECODING_INPUT_NAME "decoding_input"
+
 class AttDecoder
 {
 public:
......
@@ -140,6 +140,9 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
         x = fnnLayerNorms[i].Make(res);
     }
 
+    x.SetName(ENCODING_NAME);
+    input.SetName(ENCODING_INPUT_NAME);
+
     return x;
 }
......
@@ -33,6 +33,9 @@ using namespace nts;
 namespace transformer
 {
 
+#define ENCODING_NAME "encoding"
+#define ENCODING_INPUT_NAME "encoding_input"
+
 /*
 base class of the encoder
 */
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "../../tensor/core/CHeader.h"
#include "T2TLengthPenalty.h"
using namespace nts;
namespace transformer
{
/*
GNMT-like length penalty: pl = ((5 + n)/(5 + 1))^\alpha
where n = length of the sequence
>> length - length of the sequence (for each entry)
>> alpha - the parameter that controls the length preference
<< return - length penalty of the sequence (for each entry)
*/
XTensor T2TLengthPenalizer::GNMT(const XTensor & length, float alpha)
{
XTensor base;
XTensor lp;
base = (length + 5)/(1 + 5);
lp = Power(base, alpha);
return lp;
}
}
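For intuition, the same formula in scalar form: with alpha = 0.6, a 10-word hypothesis gets lp = ((5 + 10)/6)^0.6, roughly 1.73, so longer hypotheses are divided by a larger penalty when scores are compared (assuming, as in GNMT, that the model score is probPath / lp; the division itself is not shown in this file):

#include <math.h>

/* scalar sketch of T2TLengthPenalizer::GNMT, for a single length n */
float GNMTScalar(int n, float alpha)
{
    return powf((5.0F + n) / (5.0F + 1.0F), alpha);
}

/* GNMTScalar(10, 0.6F) == powf(2.5F, 0.6F), approximately 1.73 */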
 /* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
+ * Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
  * All rights reserved.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -16,45 +16,33 @@
  */
 
 /*
- * backward computation for data operation
- * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-26
+ * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-08
+ * Start of a new week - I just finished several documents.
+ * Writing document is harder than writing code :)
  */
 
-#include "../tensor/XTensor.h"
-#include "../tensor/function/FHeader.h"
-
-#ifndef __XBACKWARDDATA_H__
-#define __XBACKWARDDATA_H__
-
-namespace nts{
-
-/* this class computes the gradient for tensor data operation given a node */
-class XDataGrad
-{
-public:
-    /* compute dE/dx of a node */
-    static
-    void MakeGrad(XTensor * node, bool isEfficent);
-
-    /* indicates whether the node is for a shaping operation */
-    static
-    bool IsDataOP(XTensor * node);
-
-private:
-    /* gradient computation for ConverDataType: b = converdatatype(a, datatype) */
-    static
-    void GradConvertDataType(XTensor * node, bool isEfficent);
-
-    /* gradient computation for IndexToOnehot: b = indextoonehot(a, num) */
-    static
-    void GradIndexToOnehot(XTensor * node, bool isEfficent);
-
-    /* gradient computation for OnehotToIndex: b = onehottoindex(a, num) */
-    static
-    void GradOnehotToIndex(XTensor * node, bool isEfficent);
-};
-
-} // namespace nts(NiuTrans.Tensor)
+#ifndef __T2TLENGTHPENALTY_H__
+#define __T2TLENGTHPENALTY_H__
+
+#include "../../tensor/XTensor.h"
+
+using namespace nts;
+
+namespace transformer
+{
+
+/* We intend to penalize short sequences because they have higher score
+   in product of a sequence of probability-like terms and have more chances
+   to beat others in search. */
+class T2TLengthPenalizer
+{
+public:
+    /* GNMT-like length penalty: pl = ((5 + n)/(5 + 1))^\alpha
+       where n = length of the sequence */
+    static
+    XTensor GNMT(const XTensor & length, float alpha);
+};
+
+}
 
 #endif
@@ -23,6 +23,7 @@
 #include "T2TModel.h"
 #include "T2TUtility.h"
 #include "../../tensor/core/CHeader.h"
+#include "../../tensor/XUtility.h"
 
 namespace transformer
 {
@@ -44,10 +45,13 @@ T2TModel::T2TModel()
 /* de-constructor */
 T2TModel::~T2TModel()
 {
-    delete mem;
     delete encoder;
     delete decoder;
     delete outputLayer;
+
+    /* we delete "mem" at the end because other members are using it and we must
+       remove the memory space after all tensors are destroyed. */
+    delete mem;
 }
 
 /*
@@ -81,7 +85,7 @@ void T2TModel::InitModel(int argc, char ** argv)
     if(isMT)
         decoder->InitModel(argc, argv, true, 0, devID, mem);
 
-    XList params(10);
+    TensorList params(10);
     GetParams(params);
 
     for(int i = 0; i < params.count; i++){
@@ -359,12 +363,17 @@ void T2TModel::MakeMTMaskDec(XTensor &inputEnc, XTensor &inputDec,
     dims[inputDec.order + 1] = len;
     InitTensor(&maskDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingDec.devID, paddingDec.mem);
 
-    /* an upper triangular matrix where the cells of the upper triangular are set to -1e9.
-       this matrix can be used to prevent the attention to current or following words in
+    /* An upper triangular matrix where the cells of the upper triangular are set to -1e9.
+       This matrix can be used to block the attention to current or following words in
        a given sequence. */
     _SetDataLowTri(&maskDec, 1e9F, 0);
+
+    //maskDec.Dump(stderr, "mask: ");
+
     _ScaleAndShiftMe(&maskDec, 1.0F, -1e9F);
 
+    //maskDec.Dump(stderr, "mask: ");
+
     /* encoder-decoder mask that prevents the attention to padding dummy words */
     dims[inputDec.order + 1] = inputEnc.GetDim(inputEnc.order - 1);
     InitTensor(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingEnc.devID, paddingEnc.mem);
@@ -374,9 +383,18 @@ void T2TModel::MakeMTMaskDec(XTensor &inputEnc, XTensor &inputDec,
     XTensor * maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID, paddingEnc.mem);
 
     _Unsqueeze(&paddingEnc, maskEncDecTMPEnc, paddingEnc.order - 1, paddingDec.GetDim(-1));
+
+    //paddingEnc.Dump(stderr, "paddingenc:");
+    //maskEncDecTMPEnc->Dump(stderr, "maskencdectmpenc:");
+
     _ScaleAndShiftMe(maskEncDecTMPEnc, 1e9F, -1e9F);
+
+    //maskEncDecTMPEnc->Dump(stderr, "maskencdectmpenc:");
+
     _Unsqueeze(maskEncDecTMPEnc, &maskEncDec, 0, dims[0]);
+
+    //maskEncDecTMPEnc->Dump(stderr, "maskencdectmpenc:");
 
     DelTensorBuf(maskEncDecTMPDec);
     DelTensorBuf(maskEncDecTMPEnc);
 
     delete[] dims;
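To see what the two calls on maskDec produce, here is a CPU-side illustration in plain C++ (not library code): _SetDataLowTri writes 1e9 on and below the diagonal, and the scale-and-shift maps that to 0 on/below the diagonal and -1e9 above it, i.e., an additive mask that blocks attention to future positions:

#include <stdio.h>

int main()
{
    const int n = 3;
    float m[3][3];
    /* _SetDataLowTri(&maskDec, 1e9F, 0): 1e9 on and below the diagonal */
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            m[i][j] = (j <= i) ? 1e9F : 0.0F;
    /* _ScaleAndShiftMe(&maskDec, 1.0F, -1e9F): x -> x * 1.0 - 1e9 */
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            m[i][j] = m[i][j] * 1.0F - 1e9F;
    /* prints 0 on/below the diagonal and -1e9 above it */
    for (int i = 0; i < n; i++, printf("\n"))
        for (int j = 0; j < n; j++)
            printf("%12.0f", m[i][j]);
    return 0;
}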
@@ -385,7 +403,7 @@ void T2TModel::MakeMTMaskDec(XTensor &inputEnc, XTensor &inputDec,
 get parameter matrices
 >> list - the list that keeps the parameter matrices
 */
-void T2TModel::GetParams(XList &list)
+void T2TModel::GetParams(TensorList &list)
 {
     list.Clear();
     list.Add(&outputLayer->w);
@@ -442,10 +460,12 @@ dump the parameters
 */
 void T2TModel::Dump(const char * fn)
 {
+    double startT = GetClockSec();
+
     FILE * file = fopen(fn, "wb");
     CheckNTErrors(file, "Cannot open the model file");
 
-    XList params(100);
+    TensorList params(100);
     GetParams(params);
 
@@ -456,16 +476,20 @@ void T2TModel::Dump(const char * fn)
     fclose(file);
 
-    XPRINT(0, stderr, "[INFO] model saved\n");
+    double elapsed = GetClockSec() - startT;
+    XPRINT1(0, stderr, "[INFO] model saved (took %.1fs)\n", elapsed);
 }
 
 /* read the parameters */
 void T2TModel::Read(const char * fn)
 {
+    double startT = GetClockSec();
+
     FILE * file = fopen(fn, "rb");
     CheckNTErrors(file, "Cannot open the model file");
 
-    XList params(100);
+    TensorList params(100);
     GetParams(params);
 
@@ -476,7 +500,9 @@ void T2TModel::Read(const char * fn)
     fclose(file);
 
-    XPRINT(0, stderr, "[INFO] model loaded\n");
+    double elapsed = GetClockSec() - startT;
+    XPRINT1(0, stderr, "[INFO] model loaded (took %.1fs)\n", elapsed);
 }
 
 }
@@ -98,7 +98,7 @@ public:
                        XTensor &maskDec, XTensor &maskEncDec);
 
     /* get parameter matrices */
-    void GetParams(XList &list);
+    void GetParams(TensorList &list);
 
     /* dump the parameters */
     void Dump(const char * fn);
......
@@ -95,6 +95,7 @@ void T2TOutput::Make(XTensor &input, XTensor &output)
     //output = LogSoftmax(MMul(x, w), -1);
     output = Softmax(MMul(x, w), -1);
+    output.SetName(OUTPUT_NAME);
 }
 
 }
@@ -29,6 +29,8 @@ using namespace nts;
 namespace transformer
 {
 
+#define OUTPUT_NAME "output"
+
 /* output layer */
 class T2TOutput
 {
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-13
*/
#include "T2TPredictor.h"
#include "../../tensor/core/CHeader.h"
using namespace nts;
namespace transformer
{
/* constructor */
T2TStateBundle::T2TStateBundle()
{
states = NULL;
isStart = false;
}
/* de-constructor */
T2TStateBundle::~T2TStateBundle()
{
if(states != NULL)
delete[] states;
}
/*
create states
>> num - number of states
*/
void T2TStateBundle::MakeStates(int num)
{
CheckNTErrors(num > 0, "invalid number");
if(states != NULL)
delete[] states;
states = new T2TState[num];
for(int i = 0; i < num; i++){
states[i].prediction = -1;
states[i].pid = T2T_PID_EMPTY;
states[i].isEnd = false;
states[i].isStart = false;
states[i].isCompleted = false;
states[i].prob = 0;
states[i].probPath = 0;
states[i].modelScore = 0;
states[i].nstep = 0;
states[i].last = NULL;
}
stateNum = num;
}
/* constructor */
T2TPredictor::T2TPredictor()
{
startSymbol = -1;
}
/* de-constructor */
T2TPredictor::~T2TPredictor()
{
}
/*
create an initial state
>> model - the t2t model
>> top - the top-most layer of the network
>> input - input of the network
>> beamSize - beam size
>> state - the state to be initialized
*/
void T2TPredictor::Create(T2TModel * model, XTensor * top, const XTensor * input, int beamSize, T2TStateBundle * state)
{
state->layersEnc.Clear();
state->layersDec.Clear();
XTensor * encoding = XLink::SearchNode(top, ENCODING_NAME);
CheckNTErrors(encoding != NULL, "No encoding layers found!");
state->layersEnc.Add(encoding);
state->layersDec.Add(NULL);
int dims[MAX_TENSOR_DIM_NUM];
for (int i = 0; i < input->order - 1; i++)
dims[i] = input->GetDim(i);
dims[input->order - 1] = beamSize;
InitTensor(&state->probPath, input->order, dims, X_FLOAT, 1.0F, input->devID, input->mem);
InitTensor(&state->nstep, input->order, dims, X_FLOAT, 1.0F, input->devID, input->mem);
InitTensor(&state->endMark, input->order, dims, X_INT, 1.0F, input->devID, input->mem);
state->probPath.SetZeroAll();
state->nstep.SetZeroAll();
state->endMark.SetZeroAll();
state->stateNum = 0;
}
/*
set start symbol
>> symbol - the symbol (in integer)
*/
void T2TPredictor::SetStartSymbol(int symbol)
{
startSymbol = symbol;
}
/*
read a state
>> model - the t2t model that keeps the network created so far
>> state - a set of states. It keeps
1) hypotheses (states)
2) probabilities of hypotheses
3) parts of the network for expanding toward the next state
*/
void T2TPredictor::Read(T2TModel * model, T2TStateBundle * state)
{
m = model;
s = state;
}
/*
predict the next state
>> next - next states (assuming that the current state has been read)
>> encoding - encoder output
>> inputEnc - input of the encoder
>> paddingEnc - padding of the encoder
*/
void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
XTensor * inputEnc, XTensor * paddingEnc)
{
int dims[MAX_TENSOR_DIM_NUM];
next->layersEnc.Clear();
next->layersDec.Clear();
AttDecoder &decoder = *m->decoder;
/* word indices of previous positions */
XTensor * inputLast = (XTensor*)s->layersDec.GetItem(0);
/* word indices of positions up to next state */
XTensor inputDec;
/* the first token */
XTensor first;
CheckNTErrors(inputEnc->order >= 2, "Wrong order of the tensor!");
for(int i = 0; i < inputEnc->order - 1; i++)
dims[i] = inputEnc->GetDim(i);
dims[inputEnc->order - 1] = 1;
InitTensor(&first, inputEnc->order, dims, X_INT, 1.0F, inputEnc->devID, inputEnc->mem);
_SetDataFixedInt(&first, startSymbol);
/* add a new word into the input sequence of the decoder side */
if (inputLast == NULL) {
inputDec = Identity(first);
}
else{
inputDec = GeneratePaths(s);
inputDec.SetDevice(inputEnc->devID, inputEnc->mem);
inputDec = Concatenate(first, inputDec, inputDec.order - 1);
}
/* prediction probabilities */
XTensor &output = next->prob;
XTensor decoding;
XTensor decodingStep;
for(int i = 0; i < inputDec.order - 1; i++)
dims[i] = inputDec.GetDim(i);
dims[inputDec.order - 1] = inputDec.GetDim(-1);
XTensor paddingDec;
InitTensor(&paddingDec, inputDec.order, dims, X_INT, 1.0F, paddingEnc->devID, paddingEnc->mem);
SetDataFixedInt(paddingDec, 1);
XTensor maskDec;
XTensor maskEncDec;
/* decoder mask */
m->MakeMTMaskDec(*inputEnc, inputDec, *paddingEnc, paddingDec, maskDec, maskEncDec);
/* make the decoding network */
decoding = decoder.Make(inputDec, *encoding, maskDec, maskEncDec, false);
XTensor selectSrc;
XTensor selectTgt;
CheckNTErrors(decoding.order >= 2, "The tensor must be of order 2 or larger!");
int stride = decoding.GetDim(decoding.order - 2);
InitTensor1D(&selectSrc, 1, X_INT);
InitTensor1D(&selectTgt, 1, X_INT);
selectSrc.SetInt(stride - 1, 0);
selectTgt.SetInt(0, 0);
selectSrc.SetDevice(decoding.devID, decoding.mem);
selectTgt.SetDevice(decoding.devID, decoding.mem);
/* the decoder output of the last position */
decodingStep = CopyIndexed(decoding, decoding.order - 2, selectSrc, selectTgt);
/* generate the output probabilities */
m->outputLayer->Make(decodingStep, output);
_LogMe(&output);
next->layersEnc.AddList(&s->layersEnc);
next->layersDec.Add(&inputDec);
next->layersDec.Add(&output);
}
/*
generate paths up to the states of the current step
>> state - state bundle of the current step
*/
XTensor T2TPredictor::GeneratePaths(T2TStateBundle * state)
{
CheckNTErrors(state->stateNum >= 0, "Illegal state!");
int distance = -1;
for(int i = 0; i < state->stateNum; i++){
T2TState * cur = state->states + i;
int nsteps = 0;
while(cur != NULL){
nsteps++;
cur = cur->last;
}
if(nsteps > distance)
distance = nsteps;
}
XTensor path;
InitTensor2D(&path, state->stateNum, distance, X_INT);
path.SetZeroAll();
for(int i = 0; i < state->stateNum; i++){
T2TState * cur = state->states + i;
int nsteps = 0;
while(cur != NULL){
nsteps++;
path.Set2DInt(cur->prediction, i, distance - nsteps);
cur = cur->last;
}
}
return path;
}
}
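GeneratePaths above recovers token sequences by walking the "last" pointers. A self-contained sketch of the same back-pointer unwinding, with a hypothetical State type standing in for T2TState (the real code right-aligns shorter paths with zero padding in exactly this way):

#include <vector>

struct State { int prediction; State * last; };

/* returns a stateNum x distance matrix; shorter chains are right-aligned */
std::vector<std::vector<int> > Unwind(State * states, int stateNum)
{
    /* first pass: the longest chain fixes the matrix width */
    int distance = 0;
    for (int i = 0; i < stateNum; i++) {
        int n = 0;
        for (State * c = states + i; c != NULL; c = c->last)
            n++;
        if (n > distance)
            distance = n;
    }
    /* second pass: fill predictions from the end of each chain backwards */
    std::vector<std::vector<int> > path(stateNum, std::vector<int>(distance, 0));
    for (int i = 0; i < stateNum; i++) {
        int n = 0;
        for (State * c = states + i; c != NULL; c = c->last) {
            n++;
            path[i][distance - n] = c->prediction;
        }
    }
    return path;
}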
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-13
* This is the first source file I create in 2019 - new start!
*/
#ifndef __T2TPREDICTOR_H__
#define __T2TPREDICTOR_H__
#include "T2TModel.h"
#include "T2TLengthPenalty.h"
namespace transformer
{
#define T2T_PID_EMPTY -1
/* state for search. It keeps the path (back-pointer), prediction distribution,
etc. It can be regarded as a hypothesis in translation. */
class T2TState
{
public:
/* we assume that the prediction is an integer */
int prediction;
/* id of the problem. One can regard it as the sentence id when we
translate a number of sentences in the batched manner. The hypothesis
is empty if id = -1 */
int pid;
/* indicates whether the state is an end */
bool isEnd;
/* indicates whether the state is the start */
bool isStart;
/* indicates whether the state is completed */
bool isCompleted;
/* probability of every prediction (last state of the path) */
float prob;
/* probability of every path */
float probPath;
/* model score of every path. A model score = path probability + some other stuff */
float modelScore;
/* number of steps we go over so far */
int nstep;
/* pointer to the previous state */
T2TState * last;
};
/* a bundle of states */
class T2TStateBundle
{
public:
/* predictions */
XTensor prediction;
/* id of the previous state that generates the current one */
XTensor preID;
/* mark that indicates whether each hypothesis is completed */
XTensor endMark;
/* probability of every prediction (last state of the path) */
XTensor prob;
/* probability of every path */
XTensor probPath;
/* model score of every path */
XTensor modelScore;
/* step number of each hypothesis */
XTensor nstep;
/* layers on the encoder side. We actually use the encoder output instead
of all hidden layers. */
TensorList layersEnc;
/* layers on the decoder side */
TensorList layersDec;
/* list of states */
T2TState * states;
/* number of states */
int stateNum;
/* indicates whether it is the first state */
bool isStart;
public:
/* constructor */
T2TStateBundle();
/* de-constructor */
~T2TStateBundle();
/* create states */
void MakeStates(int num);
};
/* The predictor reads the current state and then predicts the next.
It is exactly the same procedure of MT inference -
we get the state of previous words and then generate the next word.
Here, a state can be regarded as the representation of words (word
indices, hidden states, embeddings, etc.). */
class T2TPredictor
{
private:
/* pointer to the transformer model */
T2TModel * m;
/* current state */
T2TStateBundle * s;
/* start symbol */
int startSymbol;
public:
/* constructor */
T2TPredictor();
/* de-constructor */
~T2TPredictor();
/* create an initial state */
void Create(T2TModel * model, XTensor * top, const XTensor * input, int beamSize, T2TStateBundle * state);
/* set the start symbol */
void SetStartSymbol(int symbol);
/* read a state */
void Read(T2TModel * model, T2TStateBundle * state);
/* predict the next state */
void Predict(T2TStateBundle * next, XTensor * encoding, XTensor * inputEnc, XTensor * paddingEnc);
/* generate paths up to the states of the current step */
XTensor GeneratePaths(T2TStateBundle * state);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
*/
#ifndef __T2TSEARCH_H__
#define __T2TSEARCH_H__
#include "T2TModel.h"
#include "T2TPredictor.h"
namespace transformer
{
/* The class organizes the search process. It calls "predictors" to generate
distributions of the predictions and prunes the search space by beam pruning.
This makes a graph where each path represents a translation hypothesis.
The output can be the path with the highest model score. */
class T2TSearch
{
private:
/* the alpha parameter controls the length preference */
float alpha;
/* predictor */
T2TPredictor predictor;
/* max length of the generated sequence */
int maxLength;
/* beam size */
int beamSize;
/* batch size */
int batchSize;
/* we keep the final hypotheses in a heap for each sentence in the batch. */
XHeap<MIN_HEAP, float> * fullHypos;
/* array of the end symbols */
int * endSymbols;
/* number of the end symbols */
int endSymbolNum;
/* start symbol */
int startSymbol;
public:
/* constructor */
T2TSearch();
/* de-constructor */
~T2TSearch();
/* initialize the model */
void Init(int argc, char ** argv);
/* search for the most promising states */
void Search(T2TModel * model, XTensor * input, XTensor * padding,
XTensor * output, XTensor * score);
/* preparation */
void Prepare(int myBatchSize,int myBeamSize);
/* compute the model score for each hypothesis */
void Score(T2TStateBundle * prev, T2TStateBundle * beam);
/* generate token indices via beam pruning */
void Generate(T2TStateBundle * beam);
/* expand the search graph */
void Expand(T2TStateBundle * prev, T2TStateBundle * beam);
/* collect hypotheses with ending symbol */
void Collect(T2TStateBundle * beam);
/* fill the hypothesis heap with incomplete hypotheses */
void FillHeap(T2TStateBundle * beam);
/* save the output sequences in a tensor */
void Dump(XTensor * output, XTensor * score);
/* check if the token is an end symbol */
bool IsEnd(int token);
/* set end symbols for search */
void SetEnd(const int * tokens, const int tokenNum);
/* check whether all hypotheses are completed */
bool IsAllCompleted(T2TStateBundle * beam);
/* make a mask to prevent duplicated entries in beam expansion for the first position */
XTensor MakeFirstMask(T2TStateBundle * beam);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
*/
#include <math.h>
#include "T2TUtility.h"
#include "T2TTester.h"
#include "T2TSearch.h"
#include "../../tensor/XUtility.h"
#include "../../tensor/core/CHeader.h"
#include "../../network/XNoder.h"
using namespace nts;
namespace transformer
{
/* constructor */
T2TTester::T2TTester()
{
}
/* de-constructor */
T2TTester::~T2TTester()
{
}
/* initialize the model */
void T2TTester::Init(int argc, char ** argv)
{
LoadParamInt(argc, argv, "vsize", &vSize, 1);
LoadParamInt(argc, argv, "vsizetgt", &vSizeTgt, vSize);
batchLoader.Init(argc, argv);
seacher.Init(argc, argv);
}
/*
test the model
>> fn - test data file
>> ofn - output data file
>> model - model that is trained
*/
void T2TTester::Test(const char * fn, const char * ofn, T2TModel * model)
{
int wc = 0;
int ws = 0;
int wordCount = 0;
int wordCountTotal = 0;
int sentCount = 0;
int batchCount = 0;
float loss = 0;
/* data files */
FILE * file = fopen(fn, "rb");
CheckNTErrors(file, "Cannot read the test file");
FILE * ofile = fopen(ofn, "wb");
CheckNTErrors(ofile, "Cannot open the output file");
int devID = model->devID;
XMem * mem = model->mem;
XNet net;
double startT = GetClockSec();
wordCount = 0;
/* batch of input sequences */
XTensor batchEnc;
XTensor batchDec;
/* label */
XTensor label;
/* padding */
XTensor paddingEnc;
XTensor paddingDec;
/* gold standard */
XTensor gold;
/* an array that keeps the sequences */
int * seqs = new int[MILLION];
batchLoader.SetRandomBatch(false);
batchLoader.ClearBuf();
while(batchLoader.LoadBatch(file, model->isLM,
&batchEnc, &paddingEnc, &paddingDec, &paddingDec, &gold, &label,
seqs, vSize, vSizeTgt,
1, 1, false, ws, wc, devID, mem, false))
{
CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch!");
CheckNTErrors(!model->isLM, "Only MT model is supported!");
XTensor output;
XTensor score;
seacher.Search(model, &batchEnc, &paddingEnc, &output, &score);
Dump(ofile, &output);
float prob = 0;
loss += -prob;
wc = batchEnc.GetDim(-1);
wordCount += wc;
wordCountTotal += wc;
sentCount += batchEnc.GetDim(-2);
batchCount += 1;
if (batchCount % 1 == 0) {
double elapsed = GetClockSec() - startT;
XPRINT3(0, stderr,
"[INFO] elapsed=%.1fs, sent=%d, sword=%d\n",
elapsed, sentCount, wordCount);
}
}
fclose(file);
fclose(ofile);
delete[] seqs;
double elapsed = GetClockSec() - startT;
XPRINT4(0, stderr, "[INFO] test finished (took %.1fs, word=%d, sent=%d, and ppl=%.3f)\n",
elapsed, wordCountTotal, sentCount, exp(loss/wordCount));
}
/*
dump the result into the file
>> file - data file
>> output - output tensor
*/
void T2TTester::Dump(FILE * file, XTensor * output)
{
int seqLength = output->GetDim(-1);
for (int i = 0; i < output->unitNum; i += seqLength) {
for (int j = 0; j < seqLength; j++) {
int w = output->GetInt(i + j);
fprintf(file, "%d ", w);
if (w < 0)
break;
}
fprintf(file, "\n");
}
}
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
* A week with no trips :)
*/
#ifndef __T2TTESTER_H__
#define __T2TTESTER_H__
#include "T2TSearch.h"
#include "T2TBatchLoader.h"
namespace transformer
{
/* This class translates test sentences with a trained model. */
class T2TTester
{
public:
/* vocabulary size of the source side */
int vSize;
/* vocabulary size of the target side */
int vSizeTgt;
/* for batching */
T2TBatchLoader batchLoader;
/* decoder for inference */
T2TSearch searcher;
public:
/* constructor */
T2TTester();
/* de-constructor */
~T2TTester();
/* initialize the tester */
void Init(int argc, char ** argv);
/* test the model */
void Test(const char * fn, const char * ofn, T2TModel * model);
/* dump the result into the file */
void Dump(FILE * file, XTensor * output);
};
}
#endif
\ No newline at end of file
@@ -23,35 +23,14 @@
 #define __T2TTRAINER_H__
 #include "T2TModel.h"
+#include "T2TBatchLoader.h"
 #include "../../tensor/function/FHeader.h"
-#define MAX_SEQUENCE_LENGTH 1024 * 4
 using namespace nts;
 namespace transformer
 {
-/* node to keep batch information */
-struct BatchNode
-{
-/* beginning position */
-int beg;
-/* end position */
-int end;
-/* maximum word number on the encoder side */
-int maxEnc;
-/* maximum word number on the decoder side */
-int maxDec;
-/* a key for sorting */
-int key;
-};
 /* trainer of the T2T model */
 class T2TTrainer
 {
@@ -62,42 +41,6 @@ public:
 /* parameter array */
 char ** argArray;
-/* buffer for loading words */
-int * buf;
-/* another buffer */
-int * buf2;
-/* batch buf */
-BatchNode * bufBatch;
-/* buffer size */
-int bufSize;
-/* size of batch buffer */
-int bufBatchSize;
-/* length of each sequence */
-int * seqLen;
-/* another array */
-int * seqLen2;
-/* offset of the first word for each sequence */
-int * seqOffset;
-/* number of sequences in the buffer */
-int nseqBuf;
-/* offset for next sequence in the buffer */
-int nextSeq;
-/* offset for next batch */
-int nextBatch;
-/* indicates whether the sequence is sorted by length */
-bool isLenSorted;
 /* dimension size of each inner layer */
 int d;
@@ -139,10 +82,10 @@ public:
 float adamBeta2T;
 /* list of the moment of the parameter matrices */
-XList moments;
+TensorList moments;
 /* list of the 2nd order moment of the parameter matrices */
-XList moments2nd;
+TensorList moments2nd;
 /* indicates whether the data file is shuffled for training */
 bool isShuffled;
@@ -159,25 +102,14 @@ public:
 /* number of batches on which we do model update */
 int updateStep;
-/* indicates whether we double the </s> symbol for the output of lms */
-bool isDoubledEnd;
-/* indicates whether we use batchsize = max * sc
-rather than batchsize = word-number, where max is the maximum
-length and sc is the sentence number */
-bool isSmallBatch;
-/* counterpart of "isSmallBatch" */
-bool isBigBatch;
-/* randomize batches */
-bool isRandomBatch;
 /* indicates whether we intend to debug the net */
 bool isDebugged;
-/* bucket size */
-int bucketSize;
+/* indicates whether the sequence is sorted by length */
+bool isLenSorted;
+/* for batching */
+T2TBatchLoader batchLoader;
 public:
 /* constructor */
@@ -198,46 +130,6 @@ public:
 /* make a checkpoint */
 void MakeCheckpoint(T2TModel * model, const char * validFN, const char * modelFN, const char * label, int id);
-/* load data to buffer */
-int LoadBuf(FILE * file, bool isSorted, int step);
-/* clear data buffer */
-void ClearBuf();
-/* load a batch of sequences */
-int LoadBatch(FILE * file, bool isLM,
-XTensor * batchEnc, XTensor * paddingEnc,
-XTensor * batchDec, XTensor * paddingDec,
-XTensor * gold, XTensor * label,
-int * seqs,
-int vsEnc, int vsDec, int sBatch, int wBatch,
-bool isSorted, int &ws, int &wCount,
-int devID, XMem * mem,
-bool isTraining);
-/* load a batch of sequences (for language modeling) */
-int LoadBatchLM(FILE * file,
-XTensor * batchEnc, XTensor * paddingEnc,
-XTensor * batchDec, XTensor * paddingDec,
-XTensor * gold, XTensor * label,
-int * seqs, int vs, int sBatch, int wBatch,
-bool isSorted, int &wCount,
-int devID, XMem * mem,
-bool isTraining);
-/* load a batch of sequences (for machine translation) */
-int LoadBatchMT(FILE * file,
-XTensor * batchEnc, XTensor * paddingEnc,
-XTensor * batchDec, XTensor * paddingDec,
-XTensor * gold, XTensor * label,
-int * seqs, int vsEnc, int vsDec, int sBatch, int wBatch,
-bool isSorted, int &ws, int &wCount,
-int devID, XMem * mem,
-bool isTraining);
-/* shuffle the data file */
-void Shuffle(const char * srcFile, const char * tgtFile);
 /* get word probabilities for a batch of sequences */
 float GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs);
...
@@ -25,6 +25,8 @@
 #include "T2TModel.h"
 #include "T2TUtility.h"
 #include "T2TTrainer.h"
+#include "T2TPredictor.h"
+#include "T2TTester.h"
 #include "../../tensor/XDevice.h"
 #include "../../tensor/XUtility.h"
 #include "../../tensor/XGlobal.h"
@@ -47,6 +49,7 @@ int TransformerMain(int argc, const char ** argv)
 ShowParams(argc, args);
+bool isBeamSearch = false;
 char * trainFN = new char[MAX_LINE_LENGTH];
 char * modelFN = new char[MAX_LINE_LENGTH];
 char * testFN = new char[MAX_LINE_LENGTH];
@@ -56,6 +59,7 @@ int TransformerMain(int argc, const char ** argv)
 LoadParamString(argc, args, "model", modelFN, "");
 LoadParamString(argc, args, "test", testFN, "");
 LoadParamString(argc, args, "output", outputFN, "");
+LoadParamBool(argc, args, "beamsearch", &isBeamSearch, false);
 srand((unsigned int)time(NULL));
@@ -65,27 +69,34 @@ int TransformerMain(int argc, const char ** argv)
 T2TModel model;
 model.InitModel(argc, args);
-//if(strcmp(modelFN, ""))
-//    model.Read(modelFN);
 /* learn model parameters */
 if(strcmp(trainFN, ""))
 trainer.Train(trainFN, testFN, strcmp(modelFN, "") ? modelFN : "checkpoint.model", &model);
 /* save the final model */
-//if(strcmp(modelFN, "") && strcmp(trainFN, ""))
-//model.Dump(modelFN);
+if(strcmp(modelFN, "") && strcmp(trainFN, ""))
+model.Dump(modelFN);
 /* load the model if necessary */
-//if(strcmp(modelFN, ""))
-//model.Read(modelFN);
+if(strcmp(modelFN, ""))
+model.Read(modelFN);
+/* test the model on the new data */
+if(strcmp(testFN, "") && strcmp(outputFN, "")){
+/* beam search */
+if(isBeamSearch){
+T2TTester searcher;
+searcher.Init(argc, args);
+searcher.Test(testFN, outputFN, &model);
+}
+/* forced decoding */
+else{
 T2TTrainer tester;
 tester.Init(argc, args);
-/* test the model on the new data */
-if(strcmp(testFN, "") && strcmp(outputFN, ""))
 tester.Test(testFN, outputFN, &model);
+}
+}
 delete[] trainFN;
 delete[] modelFN;
...
@@ -201,7 +201,8 @@ void XDevice::SetGPUDevice(int devID)
 cudaError_t error = cudaSetDevice(devID);
 if (error != cudaSuccess){
-fprintf(stderr, "Error! Calling cudaSetDevice(%d) fails(%d:%s)\n", devID, error, cudaGetErrorString(error));
+fprintf(stderr, "Error! Calling cudaSetDevice(%d) fails(%d:%s)\n",
+devID, error, cudaGetErrorString(error));
 exit(1);
 }
 #else
@@ -216,7 +217,7 @@ void XDevice::SetGPUDeviceFast(int devID)
 SetFastFlags();
 }
-/* switch to a get current dev */
+/* get the id of the current GPU device */
 int XDevice::GetGPUDevice()
 {
 #ifdef USE_CUDA
@@ -224,7 +225,8 @@ int XDevice::GetGPUDevice()
 cudaError_t error = cudaGetDevice(&devID);
 if (error != cudaSuccess){
-fprintf(stderr, "Error! Calling cudaGetDevice(%d) fails(%d:%s)\n", devID, error, cudaGetErrorString(error));
+fprintf(stderr, "Error! Calling cudaGetDevice(%d) fails(%d:%s)\n",
+devID, error, cudaGetErrorString(error));
 exit(1);
 }
@@ -248,7 +250,7 @@ void XDevice::SetFastFlags()
 #endif
 }
-/* reset cuda flag for more efficient cuda execution (all devices) */
+/* reset the cuda flag for more efficient cuda execution (all devices) */
 void XDevice::SetFastFlagsAllDevices()
 {
 #ifdef USE_CUDA
@@ -266,10 +268,6 @@ XDevManager::XDevManager()
 {
 Clear();
 Init();
-#ifndef USE_CPP11
-fprintf(stderr, "Warning!!! c++ 11 is RECOMMENDED for compilation.\n");
-#endif
 }
 /* de-constructor */
@@ -278,7 +276,7 @@ XDevManager::~XDevManager()
 }
-/* initialize it and get the CPU and GPU information */
+/* initialization */
 void XDevManager::Init()
 {
 srand((unsigned int)time(NULL));
@@ -322,7 +320,7 @@ void XDevManager::Clear()
 #ifdef USE_CUDA
-/* get the handle of GPU */
+/* get the handle of a given GPU */
 cublasHandle_t * XDevManager::GetCudaHandle(const int devID)
 {
 CheckNTErrors(devID < nGPU, "index of GPU is out of range.");
@@ -330,7 +328,7 @@ cublasHandle_t * XDevManager::GetCudaHandle(const int devID)
 return GPUs[devID].GetCublasHandle();
 }
-/* get the stream of cuda */
+/* get the stream of a given GPU */
 cudaStream_t * XDevManager::GetCudaStream(const int devID)
 {
 CheckNTErrors(devID < nGPU, "index of GPU is out of range.");
@@ -478,7 +476,7 @@ split a string
 >> items - splitting result
 << return - how many items are there
 */
-int SplitALine(char * inputString, const char * seperator, XList * items)
+int SplitALine(char * inputString, const char * seperator, StrList* items)
 {
 items->Clear();
@@ -527,12 +525,12 @@ get device ids for the given device information
 devInfo = "0:CPU-1 1:GPU-0 2:CPU-1"
 means that the first device is CPU, the second device
 is GPU-0, the third device is CPU.
->> devIDs - device sequence specified by devInfo
+>> devIDs - device IDs specified by devInfo
 << return - number of devices
 */
 int XDevManager::GetDeviceIDs(char * devInfo, int * devIDs)
 {
-XList * terms = new XList(1);
+StrList* terms = new StrList(1);
 SplitALine(devInfo, " ", terms);
 for(int i = 0; i < terms->count; i++){
@@ -569,7 +567,7 @@ int XDevManager::GetDeviceIDs(char * devInfo, int * devIDs)
 return devCount;
 }
-/* show id sequence */
+/* show device IDs */
 void XDevManager::ShowDeviceIDs(char * devInfo, char * msg)
 {
 msg[0] = 0;
...
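A minimal usage sketch for GetDeviceIDs (illustrative, not part of this commit; it assumes the global GDevs manager declared below and that CPU entries map to the id -1):

char devInfo[] = "0:CPU-1 1:GPU-0 2:CPU-1";
int devIDs[16];
int n = GDevs.GetDeviceIDs(devInfo, devIDs);
/* expected: n == 3 and devIDs == {-1, 0, -1} */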
@@ -236,6 +236,18 @@ extern XDevManager GDevs;
 cudaSetDevice(devIDBackup); \
 } \
+#define CheckDev(a, b) \
+{ \
+if((a < 0 && b >= 0) || (a >= 0 && b < 0)){ \
+fprintf(stderr, "[ERROR] (%s line %d): we must run the code on the same device (%d vs %d)\n", __FILENAME__, __LINE__, a, b); \
+exit(1); \
+} \
+else if (a >= 0 && b >= 0 && a != b) { \
+fprintf(stderr, "[ERROR] (%s line %d): we must run the code on the same device (%d vs %d)\n", __FILENAME__, __LINE__, a, b); \
+exit(1); \
+} \
+} \
 } /* end of the nts (NiuTrans.Tensor) namespace */
 #endif
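A minimal usage sketch for the new CheckDev macro (illustrative, not part of this commit): it guards a binary operation so that all operands live on the same device; a, b and c are hypothetical tensors.

void SumSketch(const XTensor * a, const XTensor * b, XTensor * c)
{
    /* abort with an error message if the device ids do not match */
    CheckDev(a->devID, b->devID);
    CheckDev(a->devID, c->devID);
    /* ... the actual element-wise sum would go here ... */
}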
@@ -32,8 +32,6 @@
 #ifndef WIN32
 #include <sys/time.h>
 #include <unistd.h>
-#include <stdint.h>
-typedef int8_t __int8;
 #endif
 // the CUDA stuff
@@ -45,10 +43,6 @@ typedef int8_t __int8;
 /* the nts (NiuTrans.Tensor) namespace */
 namespace nts {
-#if (__cplusplus >= 201103L || _MSC_VER >= 1700)
-#define USE_CPP11
-#endif
 #define _XINLINE_
 //#define DOUBELPRICSION
@@ -159,7 +153,9 @@ extern bool useCUDA;
 #define XPRINT7(VERBOSE,FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7);FFLUSH(FILEH);}}
 #define XPRINT8(VERBOSE,FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7,ARG8) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7,ARG8);FFLUSH(FILEH);}}
-#define B2I(V) V==0?false:true
+#define B2I(V) V == 0 ? false : true
+#define MODX(a, b) int(b == 0 ? a : a - floor(double(a)/b) * b)
 /* BLAS interfaces */
 #ifdef DOUBELPRICSION
...
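A quick check of the new MODX macro (illustrative, not part of this commit; it assumes math.h is available for floor): it implements a floored modulo, so the result follows the sign of b, and b == 0 simply returns a.

int r1 = MODX(7, 3);   /* 1 */
int r2 = MODX(-3, 5);  /* 2, unlike the C operator %, for which -3 % 5 == -3 */
int r3 = MODX(4, 0);   /* 4, by the b == 0 guard */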
@@ -31,15 +31,15 @@ namespace nts{
 /* constructor */
 template<HeapType hType, typename T>
+XHeap<hType, T>::XHeap()
+{
+}
+/* constructor */
+template<HeapType hType, typename T>
 XHeap<hType, T>::XHeap(int mySize, XMem * myMem)
 {
-mem = myMem;
-size = mySize;
-count = 0;
-if (mem == NULL)
-items = new HeapNode<T>[mySize];
-else
-mem->Alloc(mem->devID, mySize * sizeof(T));
+Init(mySize, myMem);
 }
 /* deconstructor */
@@ -50,6 +50,19 @@ XHeap<hType, T>::~XHeap()
 }
 template<HeapType hType, typename T>
+void XHeap<hType, T>::Init(int mySize, XMem * myMem)
+{
+mem = myMem;
+size = mySize;
+count = 0;
+if (mem == NULL)
+items = new HeapNode<T>[mySize];
+else
+mem->Alloc(mem->devID, mySize * sizeof(T));
+}
+template<HeapType hType, typename T>
 void XHeap<hType, T>::Clear(T initValue)
 {
 count = 0;
@@ -89,10 +102,24 @@ _XINLINE_ HeapNode<T> XHeap<hType, T>::End()
 template<HeapType hType, typename T>
 _XINLINE_ void XHeap<hType, T>::Push(HeapNode<T> node)
 {
-//CheckNTErrors((count < size), "Heap is full!");
+if (count < size) {
 items[count] = node;
 Up(count);
 count++;
+}
+else if(count == size){
+HeapNode<T> & item0 = items[0];
+if (hType == MIN_HEAP && item0.value >= node.value)
+return;
+else if (hType == MAX_HEAP && item0.value <= node.value)
+return;
+items[0] = node;
+Down(0);
+}
+else {
+ShowNTErrors("Overflow of the heap!");
+}
 }
 /* replace the top-most item and update the heap */
@@ -107,7 +134,7 @@ _XINLINE_ void XHeap<hType, T>::ReplaceTop(HeapNode<T> node)
 template<HeapType hType, typename T>
 _XINLINE_ HeapNode<T> XHeap<hType, T>::Pop()
 {
-//CheckNTErrors((size > 0), "Empty heap!");
+CheckNTErrors(count > 0, "Empty heap!");
 HeapNode<T> node = items[0];
 items[0] = items[count - 1];
 count--;
...
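A minimal sketch of what the bounded Push enables (illustrative, not part of this commit): a MIN_HEAP of capacity k keeps the k largest values pushed so far, since the heap top is the smallest kept value and is evicted by any larger incoming node.

XHeap<MIN_HEAP, float> heap(4);
for (int i = 0; i < 100; i++)
    heap.Push(HeapNode<float>(i, (float)i));
/* the heap now holds the values 96..99; Pop() returns 96 first */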
@@ -39,7 +39,7 @@ template <typename T>
 struct HeapNode
 {
 /* node index */
-int index;
+long long index;
 /* value of the node */
 T value;
@@ -52,9 +52,16 @@ struct HeapNode
 HeapNode(int i, T v)
 {
-index = i;
+index = (long long)i;
 value = v;
 };
+HeapNode(void * i, T v)
+{
+index = (long long)i;
+value = v;
+}
 };
 /* a heap that keeps a data array of T */
@@ -76,11 +83,17 @@ public:
 public:
 /* constructor */
+XHeap();
+/* constructor */
 XHeap(int mySize, XMem * myMem = NULL);
 /* deconstructor */
 ~XHeap();
+/* initialization */
+void Init(int mySize, XMem * myMem = NULL);
 /* clear the data */
 void Clear(T initValue);
@@ -107,6 +120,9 @@ public:
 /* move item k up the tree */
 void Up(int k);
+/* how many items are kept in the heap */
+inline int Count() { return count; };
 };
 } /* end of the nts (NiuTrans.Tensor) namespace */
...
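The new default constructor plus Init() allows delayed initialization, e.g. for member objects or arrays (a sketch, not part of this commit):

XHeap<MAX_HEAP, float> heaps[8]; /* constructed empty */
for (int i = 0; i < 8; i++)
    heaps[i].Init(16);           /* sized later, when the capacity is known */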
@@ -300,9 +300,9 @@ void XLink::MakeLink(const XTensor * t1, const XTensor * t2, XTensor * h, int id)
 if(h == NULL)
 return;
-XList list(2);
-list.Add(t1);
-list.Add(t2);
+TensorList list(2);
+list.Add((XTensor*)t1);
+list.Add((XTensor*)t2);
 MakeLink(&list, h, id);
 }
@@ -320,10 +320,10 @@ void XLink::MakeLink(const XTensor * t1, const XTensor * t2, const XTensor * t3,
 if (h == NULL)
 return;
-XList list(3);
-list.Add(t1);
-list.Add(t2);
-list.Add(t3);
+TensorList list(3);
+list.Add((XTensor*)t1);
+list.Add((XTensor*)t2);
+list.Add((XTensor*)t3);
 MakeLink(&list, h, id);
 }
@@ -334,7 +334,7 @@ create a hyper edge with a list of tensors and an output tensor
 >> h - head tensor
 >> id - id of the edge type
 */
-void XLink::MakeLink(const XList * list, XTensor * h, int id)
+void XLink::MakeLink(const TensorList * list, XTensor * h, int id)
 {
 /* forward */
 XLink &income = h->income;
@@ -368,7 +368,7 @@ create a hyper edge with an input tensor and a list of output tensors
 >> list - a list of output tensors
 >> id - id of the edge type
 */
-void XLink::MakeLink(XTensor * t, XList * list, int id)
+void XLink::MakeLink(XTensor * t, TensorList * list, int id)
 {
 /* forward */
 for(int i = 0; i < list->count; i++){
@@ -624,7 +624,7 @@ void XLink::CopyIncoming(const XTensor * reference, XTensor * target)
 ClearIncoming(target);
 int tailNum = reference->income.tailNum;
-XList tails(tailNum);
+TensorList tails(tailNum);
 for(int i = 0; i < tailNum; i++){
 XTensor * tail = (XTensor*)reference->income.tails[i];
 tails.Add(tail);
@@ -743,7 +743,7 @@ search for a node in a top-down manner by its name
 >> top - the top most node
 << return - the node we found
 */
-/*XTensor * XLink::SearchNode(XTensor * top, const char * name)
+XTensor * XLink::SearchNode(XTensor * top, const char * name)
 {
 if(!strcmp(top->name, name))
 return top;
@@ -758,7 +758,7 @@ search for a node in a top-down manner by its name
 }
 return NULL;
-}*/
+}
 } // namespace nts(NiuTrans.Tensor)
...
@@ -138,17 +138,17 @@ struct XLink
 static
 void MakeLink(const XTensor * t1, const XTensor * t2, XTensor * h, int id);
-/* create a hyper edge with two input tensors and an output tensor */
+/* create a hyper edge with three input tensors and an output tensor */
 static
 void MakeLink(const XTensor * t1, const XTensor * t2, const XTensor * t3, XTensor * h, int id);
 /* create a hyper edge with a list of input tensors and an output tensor */
 static
-void MakeLink(const XList * list, XTensor * h, int id);
+void MakeLink(const TensorList * list, XTensor * h, int id);
 /* create a hyper edge with an input tensor and a list of output tensors */
 static
-void MakeLink(XTensor * h, XList * list, int id);
+void MakeLink(XTensor * h, TensorList * list, int id);
 /* add a parameter */
 static
@@ -191,8 +191,8 @@ struct XLink
 void ShowNode(FILE * file, XTensor * node);
 /* search a node in a top-down manner by its name */
-//static
-//XTensor * SearchNode(XTensor * top, const char * name);
+static
+XTensor * SearchNode(XTensor * top, const char * name);
 };
 } // namespace nts(NiuTrans.Tensor)
...
 /* NiuTrans.Tensor - an open-source tensor library
- * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+ * Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -15,32 +15,31 @@
 * limitations under the License.
 */
 /*
 *
- * Implementation of list that keeps data items
+ * Implementation of template list that keeps data items
 *
- * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-04-17
- * The first coding job this year!
+ * $Created by: HU Chi (huchinlp@foxmail.com)
 *
 */
-#ifndef __XLIST_H__
-#define __XLIST_H__
 #include "XMem.h"
 #include "XGlobal.h"
-/* the nts (NiuTrans.Tensor) namespace */
-namespace nts{
-typedef int (* ListCompare)(const void * item1, const void * item2);
+#ifndef __TensorList_H__
+#define __TensorList_H__
+/* the nts (NiuTrans.Tensor) namespace */
+namespace nts {
-/* the XList class */
-class XList
-{
+/* the TensorListBase class */
+template <typename T>
+struct TensorListBase {
 public:
 /* data items */
-void ** items;
+T *items;
 /* number of items */
 int count;
@@ -49,56 +48,88 @@ public:
 int maxNum;
 /* the memory pool for data array allocation */
-XMem * mem;
-/* indicates whether data items are integers */
-bool isIntList;
+XMem* mem;
 public:
 /* constructor */
-XList();
+TensorListBase();
 /* constructor */
-XList(int myMaxNum, bool isIntListOrNot = false);
+TensorListBase(int myMaxNum);
 /* constructor */
-XList(int myMaxNum, XMem * myMem, bool isIntListOrNot = false);
+TensorListBase(int myMaxNum, XMem* myMem);
 /* de-constructor */
-~XList();
+~TensorListBase();
-/* utilities */
-void Create(int myMaxNum, XMem * myMem);
-void Add(const void * item);
-void Add(void ** inputItems, int inputItemCount);
-void AddList(XList * l);
-void AddInt(int i);
-void Insert(int pos, void * item);
-void * GetItem(int i) const;
-int GetItemInt(int i);
-void SetItem(int i, void * item);
-void SetItemInt(int i, int item);
-int FindFirst(void * item);
+/* add an item into the list */
+void Add(T&& item);
+/* add an item into the list */
+void Add(const T& item);
+/* add a number of items into the list */
+void Add(T* inputItems, int inputItemCount);
+/* append a list to the current list */
+void AddList(TensorListBase* l);
+/* insert an item to the given position of the list */
+void Insert(int pos, const T& item);
+/* insert an item to the given position of the list */
+void Insert(int pos, T&& item);
+/* get the item at position i */
+T& GetItem(int i) const;
+/* set the item at position i */
+void SetItem(int i, const T& item);
+/* set the item at position i */
+void SetItem(int i, T&& item);
+/* find the position of the first matched item */
+int FindFirst(const T& item);
+/* clear the data array */
 void Clear();
-void ClearStringList();
-void Sort(int itemSize, ListCompare comp);
+/* sort the list */
+void Sort(int itemSize);
+/* reverse the list */
 void Reverse();
+/* remove the item at position i */
 void Remove(int i);
-XList * Copy(XMem * myMem);
+/* copy the list */
+TensorListBase* Copy(XMem* myMem);
+/* shuffle the list */
 void Shuffle(int nround = 10, int beg = -1, int len = 0);
 /* short */
-_XINLINE_ void * Get(int i) {return GetItem(i);};
-_XINLINE_ int GetInt(int i) {return GetItemInt(i);};
-_XINLINE_ void Set(int i, void * item) {SetItem(i, item);};
-_XINLINE_ void SetInt(int i, int item) {SetItemInt(i, item);};
+T& operator[] (int i) {
+return GetItem(i);
+};
+T& Get(int i) { return GetItem(i); };
+void Set(int i, T item) { SetItem(i, item); };
 };
-extern XList NULLList;
+struct XTensor;
+typedef TensorListBase<int> IntList;
+typedef TensorListBase<char> CharList;
+typedef TensorListBase<char*> StrList;
+typedef TensorListBase<long> LongList;
+typedef TensorListBase<float> FloatList;
+typedef TensorListBase<short> ShortList;
+typedef TensorListBase<void*> XList;
+typedef TensorListBase<XTensor*> TensorList;
-}
-/* end of the nts (NiuTrans.Tensor) namespace */
+} /* end of the nts (NiuTrans.Tensor) namespace */
-#endif
+#endif // __TensorList_H__
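A minimal usage sketch of the new template list (illustrative, not part of this commit): element access is now typed, in contrast to the old void**-based XList.

IntList lens(8);
lens.Add(3);
lens.Add(5);
int first = lens[0];            /* 3, via the new operator[] */
TensorList tensors(2);
tensors.Add(NewTensor2D(2, 2)); /* assumes XTensor.h is included */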
@@ -34,6 +34,11 @@ namespace nts{
 int testxmemid = 0;
 void * recordp = NULL;
+/*
+for managing the memories
+*/
+XMemManager GMems;
 XMem * GMem;
 /* constructor */
@@ -48,6 +53,7 @@ XMem::XMem()
 strcpy(name, "xmem");
 signature = 0;
 mergeFreeOTF = true;
+isInitialized = false;
 }
 /*
@@ -58,7 +64,7 @@ constructor
 >> myMode - mode of running the memory pool
 UNI_FREE: free all the space at the end of using the memory pool
 FREE_ON_THE_FLY: normal "malloc" and "free" mode
->> myBlockSize - size of memory block
+>> myBlockSize - size of a memory block
 >> myBlockNum - number of memory blocks
 >> myBufSize - size of buffer
 */
@@ -103,7 +109,7 @@ initialize it
 >> myMode - mode of running the memory pool
 UNI_FREE: free all the space at the end of using the memory pool
 FREE_ON_THE_FLY: normal "malloc" and "free" mode
->> myBlockSize - size of memory block
+>> myBlockSize - size of a memory block
 >> myBlockNum - number of memory blocks
 >> myBufSize - size of buffer
 */
@@ -164,6 +170,7 @@ void XMem::Initialize(int myDevID, MEMPOOL_MODE myMode, MTYPE myBlockSize, int m
 #endif
 signature++;
+isInitialized = true;
 }
 /* free memory */
@@ -217,8 +224,8 @@ void XMem::Free(int myDevID, void * mem)
 }
 /*
-get signature
-<< return - return the signature
+get the signature
+<< return - the signature
 */
 MTYPE XMem::GetSignature()
 {
@@ -226,7 +233,7 @@ MTYPE XMem::GetSignature()
 }
 /*
-use string as the name of the memory pool
+set the name of the memory pool
 >> myName - name of the memory pool
 */
 void XMem::SetName(const char * myName)
@@ -259,7 +266,7 @@ void XMem::SetDevice(int myDevID)
 }
 /*
-switch to the device (with fast cuda execution mode) we want to work
+switch to the device (with fast cuda execution mode) we intend to work on
 >> myDevID - device id(-1: CPU memory, >=0: GPU device ID)
 */
 void XMem::SetDeviceFast(int myDevID)
@@ -275,7 +282,7 @@ void XMem::SetDeviceFast(int myDevID)
 }
 /*
-run in static mode
+run in the static mode
 >> myIsStatic - specify if the memory allocation is static
 */
 void XMem::SetStaticMode(bool myIsStatic)
@@ -1488,4 +1495,179 @@ cublasHandle_t * XMem::GetCublasHandle()
 #endif
+/* constructor */
+XMemManager::XMemManager()
+{
+Initialize();
+}
+/* de-constructor */
+XMemManager::~XMemManager()
+{
+}
+/* get the size of available CPU memory */
+MTYPE XMemManager::GetAvailableMemory()
+{
+unsigned long freeMem = 0;
+#if __APPLE__
+int mib[2] = {CTL_HW, HW_MEMSIZE};
+unsigned int namelen = sizeof(mib) / sizeof(mib[0]);
+unsigned long long size;
+size_t len = sizeof(size);
+if (sysctl(mib, namelen, &size, &len, NULL, 0) < 0){
+ShowNTErrors("Cannot get memory size on Mac!");
+}
+else{
+return size;
+}
+#elif _WIN32
+MEMORYSTATUSEX memoryStatus;
+memoryStatus.dwLength = sizeof(memoryStatus);
+if (GlobalMemoryStatusEx(&memoryStatus)){
+freeMem = memoryStatus.ullAvailPhys;
+}
+#else
+long pages = sysconf(_SC_AVPHYS_PAGES);
+long page_size = sysconf(_SC_PAGE_SIZE);
+freeMem = pages * page_size;
+#endif
+return (MTYPE)freeMem;
+}
+/* get the size of available GPU memory */
+MTYPE XMemManager::GetAvailableGPUMemory(int devID)
+{
+size_t freeMem = 0;
+#ifdef USE_CUDA
+size_t totalMem = 0;
+cudaSetDevice(devID);
+if (cudaMemGetInfo(&freeMem, &totalMem) != cudaSuccess){
+XPRINT(0, stderr, "cannot get GPU memory information.");
+exit(1);
+}
+#endif
+return (MTYPE)freeMem;
+}
+/* choose a buffer size for a given amount of free memory */
+void XMemManager::GetBufferSize(MTYPE freeMem, MTYPE * myBufSize)
+{
+*myBufSize = 0;
+if (freeMem >= MILLION * 128){
+*myBufSize = MILLION * 32;
+if (freeMem >= MILLION * 256){
+*myBufSize = MILLION * 64;
+if (freeMem >= MILLION * 512){
+*myBufSize = MILLION * 128;
+if (freeMem >= MILLION * 1024) {
+*myBufSize = MILLION * 256;
+if (freeMem >= MILLION * 2048)
+*myBufSize = MILLION * 512;
+}
+}
+}
+}
+}
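A worked example of the tiers above (illustrative, not part of this commit): the chosen buffer is roughly a quarter of the free memory, capped at 512 MB.

MTYPE bufSize = 0;
GMems.GetBufferSize((MTYPE)MILLION * 300, &bufSize);
/* 300 MB of free memory passes the 128 MB and 256 MB tests but not
   the 512 MB one, so bufSize ends up as MILLION * 64 */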
+/* initialize it and set the global memory information */
+void XMemManager::Initialize()
+{
+srand((unsigned int)time(NULL));
+Free();
+/* CPUs (we do not care about how many CPUs are actually in use) */
+nCPUMem = 1;
+/* GPUs */
+nGPUMem = 0;
+#ifdef USE_CUDA
+if (cudaGetDeviceCount(&nGPUMem) != cudaSuccess) {
+XPRINT(0, stderr, "cannot get GPU information.");
+exit(1);
+}
+#endif
+}
+/* free it */
+void XMemManager::Free()
+{
+for (int i = 0; i < MAX_CPU_NUM; i++)
+CPUMems[i].Free();
+for (int i = 0; i < MAX_GPU_NUM; i++)
+GPUMems[i].Free();
+}
+/* get a global memory pool (initialized on first use) */
+XMem * XMemManager::GetMem(const int devID)
+{
+XMem * mem = NULL;
+if (devID < 0){
+if(!CPUMems[0].isInitialized){
+MTYPE freeMem = GetAvailableMemory();
+MTYPE myBufSize = 0;
+GetBufferSize(freeMem, &myBufSize);
+CPUMems[0].Initialize(-1, FREE_ON_THE_FLY,
+MIN_BLOCK_SIZE_FOR_MEMPOOL,
+MIN_BLOCK_NUM_FOR_MEMPOOL,
+myBufSize);
+}
+mem = CPUMems;
+}
+else{
+if (devID < nGPUMem){
+if(!GPUMems[devID].isInitialized){
+MTYPE freeMem = GetAvailableGPUMemory(devID);
+MTYPE myBufSize = 0;
+GetBufferSize(freeMem, &myBufSize);
+GPUMems[devID].Initialize(devID, FREE_ON_THE_FLY,
+MIN_BLOCK_SIZE_FOR_MEMPOOL,
+MIN_BLOCK_NUM_FOR_MEMPOOL,
+myBufSize);
+}
+mem = GPUMems + devID;
+}
+else{
+XPRINT1(0, stderr, "Cannot get the memory (%d). Please check your device id!", devID);
+}
+}
+return mem;
+}
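A minimal usage sketch (illustrative, not part of this commit): callers fetch lazily initialized pools from the global manager by device id.

XMem * cpuMem = GMems.GetMem(-1); /* the CPU pool */
XMem * gpuMem = GMems.GetMem(0);  /* the pool of GPU 0, if present */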
+/* get the memory-pool settings of a given device */
+int XMemManager::GetMemSize(const int devID, MTYPE * myBlockSize, int * myBlockNum, MTYPE * myBufSize)
+{
+XMem * mem = GetMem(devID);
+int result = 0;
+if (mem != NULL){
+*myBlockSize = mem->maxBlockSize;
+*myBlockNum = mem->blockNum;
+*myBufSize = mem->bufSize;
+result = 1;
+}
+return result;
+}
+/* show memory information */
+void XMemManager::ShowMemInfo()
+{
+XPRINT(1, stderr, "Memory Information:\n");
+MTYPE myBlockSize, myBufSize;
+int myBlockNum;
+for(int i = 0; i < nCPUMem; i++){
+GetMemSize(-1, &myBlockSize, &myBlockNum, &myBufSize);
+XPRINT3(1, stderr, " - id:-1 CPU, blockSize:%lld, blockNum:%d, bufSize:%lld\n", myBlockSize, myBlockNum, myBufSize);
+}
+for(int i = 0; i < nGPUMem; i++){
+GetMemSize(i, &myBlockSize, &myBlockNum, &myBufSize);
+XPRINT4(1, stderr, " - id:%2d GPU, blockSize:%lld, blockNum:%d, bufSize:%lld\n", i, myBlockSize, myBlockNum, myBufSize);
+}
+}
 } /* end of the nts (NiuTrans.Tensor) namespace */
@@ -39,6 +39,15 @@
 #include <curand.h>
 #endif
+#ifdef __APPLE__
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#elif WIN32
+#include <windows.h>
+#else
+#include <unistd.h>
+#endif
 /* the nts (NiuTrans.Tensor) namespace */
 namespace nts{
@@ -53,6 +62,8 @@ typedef long long INT_64;
 #define BUF_PITCH 256
 #define MIN_BLOCK_SIZE_FOR_MEMPOOL 128 * 1024 * 1024
 #define MIN_BLOCK_NUM_FOR_MEMPOOL 1024
+#define MAX_CPU_NUM 16
+#define MAX_GPU_NUM 16
 /*
 mode of running a memory pool
@@ -202,6 +213,9 @@ public:
 MTYPE curUsedPin;
 MTYPE bufUsedPin;
+/* indicates whether the memory pool is initialized */
+bool isInitialized;
 #ifdef USE_CUDA
 /* handle used for cublas */
 cublasHandle_t cublasHandle;
@@ -413,6 +427,61 @@ public:
 };
+/*
+a class for the management of memory
+*/
+class XMemManager
+{
+private:
+/* cpu memory pool information */
+XMem CPUMems[MAX_CPU_NUM];
+/* number of cpu memory pools */
+int nCPUMem;
+/* gpu memory pool information */
+XMem GPUMems[MAX_GPU_NUM];
+/* number of gpu memory pools */
+int nGPUMem;
+public:
+/* constructor */
+XMemManager();
+/* de-constructor */
+~XMemManager();
+/* get the size of available CPU memory */
+MTYPE GetAvailableMemory();
+/* get the size of available GPU memory */
+MTYPE GetAvailableGPUMemory(int devID);
+/* choose a buffer size */
+void GetBufferSize(MTYPE freeMem, MTYPE * myBufSize);
+/* initialize it and set the global memory information */
+void Initialize();
+/* free it */
+void Free();
+/* get a global memory pool */
+XMem * GetMem(const int devID);
+/* get the memory-pool settings of a given device */
+int GetMemSize(const int devID, MTYPE * myBlockSize, int * myBlockNum, MTYPE * myBufSize);
+/* show memory information */
+void ShowMemInfo();
+};
+/* managing the memories */
+extern XMemManager GMems;
 extern XMem * GMem;
 extern int testxmemid;
...
@@ -108,18 +108,10 @@ const char * GetOPName(int type)
 else if (type == REDUCE_REDUCEVARIANCE)
 return "R_REDUCEVARIANCE";
 }
-else if ((type & DATA_BASE) != 0) {
-if (type == GETANDSET_CONVERTDATATYPE)
-return "G_CONVERTDATATYPE";
-else if (type == GETANDSET_INDEXTOONEHOT)
-return "G_INDEXTOONEHOT";
-else if (type == GETANDSET_ONEHOTTOINDEX)
-return "G_ONEHOTTOINDEX";
-else if (type == GETANDSET_SELECT)
+else if ((type & DATA_BASE) != 0){
+if (type == GETANDSET_SELECT)
 return "G_SELECT";
-}
-else if ((type & SHAPE_BASE) != 0) {
-if (type == MOVEMENT_COPYINDEXED)
+else if (type == MOVEMENT_COPYINDEXED)
 return "M_COPYINDEXED";
 else if (type == MOVEMENT_COPYVALUES)
 return "M_COPYVALUES";
...
@@ -79,13 +79,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 /* data and shape related operations */
 #define DATA_BASE MATH_BASE * 2
 #define GETANDSET DATA_BASE + 1
-#define GETANDSET_CONVERTDATATYPE GETANDSET + 1
-#define GETANDSET_INDEXTOONEHOT GETANDSET_CONVERTDATATYPE + 1
-#define GETANDSET_ONEHOTTOINDEX GETANDSET_INDEXTOONEHOT + 1
-#define GETANDSET_SELECT GETANDSET_ONEHOTTOINDEX + 1
-#define SHAPE_BASE DATA_BASE * 2
-#define MOVEMENT SHAPE_BASE + 1
+#define GETANDSET_SELECT GETANDSET + 1
+#define MOVEMENT GETANDSET_SELECT + 1
 #define MOVEMENT_COPYINDEXED MOVEMENT + 1
 #define MOVEMENT_COPYVALUES MOVEMENT_COPYINDEXED + 1
 #define MOVEMENT_GATHER MOVEMENT_COPYVALUES + 1
@@ -108,7 +104,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #define SORT_TOPK SORT_SORT + 1
 /* activation functions */
-#define FUNCTION_BASE SHAPE_BASE * 2
+#define FUNCTION_BASE DATA_BASE * 2
 #define FUNC_DROPOUT FUNCTION_BASE + 1
 #define FUNC_HARDTANH FUNC_DROPOUT + 1
 #define FUNC_IDENTITY FUNC_HARDTANH + 1
...
@@ -146,7 +146,7 @@ run a set of jobs in parallel
 >> jobArgs - the list of arguments for each job
 >> sleepTime - time to sleep (in ms) for each round
 */
-void XPRunner::Run(XList * jobFunctions, XList * jobArgs, float sleepTime)
+void XPRunner::Run(TensorList * jobFunctions, TensorList * jobArgs, float sleepTime)
 {
 if(threadNum <= 0){
 XPRINT(1, stderr, "Error! No threads were created!\n");
@@ -195,7 +195,7 @@ void XPRunner::Run(XList * jobFunctions, XList * jobArgs, float sleepTime)
 TFunction function = (TFunction)jobFunctions->GetItem(jobArgs->count - c);
 /* the arguments that are passed to the function */
-volatile XList * args = (XList*)jobArgs->GetItem(jobArgs->count - c);
+volatile TensorList * args = (TensorList*)jobArgs->GetItem(jobArgs->count - c);
 /* thread */
 XThread * thread = threads + availableThreads[i];
...
@@ -106,7 +106,7 @@ public:
 void KillThreads();
 /* run a set of jobs in parallel */
-void Run(XList * jobFunctions, XList * jobArgs, float sleepTime = 0);
+void Run(TensorList * jobFunctions, TensorList * jobArgs, float sleepTime = 0);
 /* get the number of parallel jobs to run */
 int GetJobNum(int size);
...
@@ -42,7 +42,7 @@ job item used in queues
 JobQueueNode::JobQueueNode()
 {
 job = NULL;
-args = new XList(1);
+args = new TensorList(1);
 }
 /* de-constructor */
@@ -67,7 +67,7 @@ XQueue::XQueue(int mySize)
 head = 0;
 tail = 0;
 isJobQueue = false;
-jobDequeuerArgs = new XList(1);
+jobDequeuerArgs = new TensorList(1);
 jobDequeuerBreak = false;
 runningJobCount = 0;
 jobStream = NULL;
@@ -188,8 +188,10 @@ void XQueue::RunJobConsumer(int jobDevID)
 isJobQueue = true;
 jobDequeuerArgs->Clear();
-jobDequeuerArgs->Add(this);
-jobDequeuerArgs->Add(jobDevID >= 0 ? devids + jobDevID : &cpuid);
+// warning: this may cause unknown error
+jobDequeuerArgs->Add((XTensor*)this);
+jobDequeuerArgs->Add(jobDevID >= 0 ? (XTensor*)(devids + jobDevID) : (XTensor*)&cpuid);
 jobDequeuer.function = (TFunction)DequeueJobs;
 jobDequeuer.argv = jobDequeuerArgs;
@@ -211,7 +213,7 @@ void XQueue::StopJobConsumer()
 }
 /* add a job item to process */
-void XQueue::EnqueueJob(void * job, XList * jobArgs)
+void XQueue::EnqueueJob(void * job, TensorList * jobArgs)
 {
 MUTEX_LOCK(jobQueueMutex);
 runningJobCount++;
@@ -225,7 +227,7 @@ void XQueue::EnqueueJob(void * job, XList * jobArgs)
 }
 /* job item consumer */
-void XQueue::DequeueJobs(XList * args)
+void XQueue::DequeueJobs(TensorList * args)
 {
 CheckNTErrors((args->count == 2), "Illegal arguments!");
...
@@ -52,7 +52,7 @@ public:
 void * job;
 /* arguments of the job */
-XList * args;
+TensorList * args;
 public:
 /* constructor */
@@ -102,7 +102,7 @@ private:
 XThread jobDequeuer;
 /* argument list of jobDequeuer */
-XList * jobDequeuerArgs;
+TensorList * jobDequeuerArgs;
 /* indicates whether jobDequeuer stops */
 bool jobDequeuerBreak;
@@ -141,11 +141,11 @@ public:
 void StopJobConsumer();
 /* add a job item to process */
-void EnqueueJob(void * job, XList * jobArgs);
+void EnqueueJob(void * job, TensorList * jobArgs);
 /* job item consumer */
 static
-void DequeueJobs(XList * args);
+void DequeueJobs(TensorList * args);
 /* get the break flag */
 bool GetJobBreak();
...
@@ -52,6 +52,7 @@ struct XLink;
 #define MIN_TENSOR_MERGE_NUM 0
 #define MIN_TENSOR_MERGE_LIST_NUM 1024
 #define MIN_TENSOR_CAT_NUM 8
+#define MAX_TENSOR_NAME_SIZE 32
 /* computation flags */
 #define UNSAFE_BUT_FAST_MEM
@@ -61,6 +62,9 @@ struct XLink;
 struct XTensor
 {
 public:
+/* name */
+char name[MAX_TENSOR_NAME_SIZE];
 /* id */
 int id;
@@ -190,13 +194,14 @@ public:
 XTensor(const XTensor &reference);
 /* copy constructor (with right value reference) */
-#ifdef USE_CPP11
 XTensor(const XTensor &&reference);
-#endif
 /* de-constructor */
 ~XTensor();
+/* set the name of the tensor */
+void SetName(const char * myName);
 /* initialize member variables */
 void Init();
@@ -210,9 +215,7 @@ public:
 XTensor& operator= (const XTensor &tensor);
 /* overloading of the equal-sign (with right value reference) */
-#ifdef USE_CPP11
 XTensor& operator= (const XTensor &&tensor);
-#endif
 /* overloading of the plus-sign */
 XTensor operator+ (const XTensor &tensor) const;
@@ -241,6 +244,9 @@ public:
 /* linear transformation */
 XTensor Lin(DTYPE scale, DTYPE shift = 0) const;
+/* relocate the data on the target device */
+void SetDevice(int myDevId, XMem * myMem = NULL);
 /* judge whether the two matrices are in the same type and size */
 static
 bool IsSameShaped(const XTensor * a, const XTensor * b);
@@ -268,6 +274,9 @@ public:
 /* reshape the tensor to a matrix */
 void Reshape(const int rowNum, const int colNum);
+/* reshape the tensor by merging two consecutive dimensions */
+void ReshapeMerged(const int i, const int j = -1);
 /* get the number of items in the data array */
 int GetSize() const;
@@ -316,6 +325,9 @@ public:
 /* get the value of a cell with the index */
 DTYPE Get(int index[], int size = -1);
+/* get the value of a cell with the offset */
+DTYPE Get(int offset);
 /* get the pointer to a cell */
 void * GetCell(int index[], int size = -1) const;
@@ -328,6 +340,9 @@ public:
 /* get the default type value of a cell in a 3d tensor */
 DTYPE Get3D(int d0, int d1, int d2);
+/* get the int value of a cell by its offset */
+int GetInt(int offset);
 /* get the int value of a cell in a 1d tensor */
 int Get1DInt(int i);
@@ -346,6 +361,9 @@ public:
 /* set the value of a cell */
 bool Set(DTYPE value, int index[], int size = -1);
+/* set the value of a cell with its offset in the array */
+bool Set(DTYPE value, int offset);
 /* set the value of a cell in a 1d tensor */
 bool Set1D(DTYPE value, int i);
@@ -355,6 +373,9 @@ public:
 /* set the value of a cell in a 3d tensor */
 bool Set3D(DTYPE value, int d0, int d1, int d2);
+/* set the integer value of a cell by its offset */
+bool SetInt(int value, int offset);
 /* set the integer value of a cell */
 bool SetInt(int value, int index[], int size = -1);
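A minimal sketch of the new offset-based accessors (illustrative, not part of this commit): they index the underlying data array directly and ignore the tensor shape.

XTensor * t = NewTensor2D(2, 3);
t->Set(1.0F, 5);     /* the last cell: offset = 2 * 3 - 1 */
DTYPE v = t->Get(5); /* 1.0F */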
...@@ -430,65 +451,133 @@ void InitTensor(XTensor * tensor, ...@@ -430,65 +451,133 @@ void InitTensor(XTensor * tensor,
const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const float myDenseRatio = 1.0F, const int myDevID = -1, XMem * myMem = NULL); const float myDenseRatio = 1.0F, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense XTensor V2 */
void InitTensorV2(XTensor * tensor,
const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1);
/* initialize a dense vector */ /* initialize a dense vector */
void InitTensor1D(XTensor * tensor, const int num, void InitTensor1D(XTensor * tensor, const int num,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL); const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense vector V2 */
void InitTensor1DV2(XTensor * tensor, const int num,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
/* initialize a dense matrix */ /* initialize a dense matrix */
void InitTensor2D(XTensor * tensor, const int rowNum, const int colNum, void InitTensor2D(XTensor * tensor, const int rowNum, const int colNum,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL); const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense matrix V2 */
void InitTensor2DV2(XTensor * tensor, const int rowNum, const int colNum,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
/* initialize a dense 3d tensor */
void InitTensor3D(XTensor * tensor, const int d0, const int d1, const int d2,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense 3d tensor V2 */
void InitTensor3DV2(XTensor * tensor, const int d0, const int d1, const int d2,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
/* initialize a dense 4d tensor */
void InitTensor4D(XTensor * tensor, const int d0, const int d1, const int d2, const int d3,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense 4d tensor V2 */
void InitTensor4DV2(XTensor * tensor, const int d0, const int d1, const int d2, const int d3,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
/* initialize a dense 5d tensor */
void InitTensor5D(XTensor * tensor, const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense 5d tensor V2 */
void InitTensor5DV2(XTensor * tensor, const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
/* initialize a tensor with a reference tensor */
void InitTensor(XTensor * tensor, const XTensor * reference);
/* initialize a tensor with a reference tensor */
void InitTensorV2(XTensor * tensor, const XTensor * reference);
/* initialize a tensor on the CPU with a reference tensor */
void InitTensorOnCPU(XTensor * tensor, const XTensor * reference);
/* generate a XTensor with no initialization */
XTensor * NewTensor();
/* generate a XTensor */
XTensor * NewTensor(const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const float myDenseRatio = 1.0F, const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense XTensor V2 */
XTensor * NewTensorV2(const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1);
/* generate a XTensor which allocates data on the buffer */
XTensor * NewTensorBuf(const int myOrder, const int * myDimSize,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const float myDenseRatio = 1.0F,
const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense XTensor which allocates data on the buffer V2 */
XTensor * NewTensorBufV2(const int myOrder, const int * myDimSize,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
/* generate a XTensor which allocates data on the buffer */
XTensor * NewTensorBuf(const XTensor * reference, int devID, XMem * myMem);
/* generate a XTensor which allocates data on the buffer V2 */
XTensor * NewTensorBufV2(const XTensor * reference, int devID);
/* generate a dense vector */
XTensor * NewTensor1D(const int num, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1,
XMem * myMem = NULL);
/* generate a dense vector V2 */
XTensor * NewTensor1DV2(const int num, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
/* generate a dense matrix */
XTensor * NewTensor2D(const int rowNum, const int colNum,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense matrix V2 */
XTensor * NewTensor2DV2(const int rowNum, const int colNum,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1);
/* generate a dense 3d tensor */
XTensor * NewTensor3D(const int d0, const int d1, const int d2,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense 3d tensor V2 */
XTensor * NewTensor3DV2(const int d0, const int d1, const int d2,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1);
/* generate a dense 4d tensor */
XTensor * NewTensor4D(const int d0, const int d1, const int d2, const int d3,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense 4d tensor V2 */
XTensor * NewTensor4DV2(const int d0, const int d1, const int d2, const int d3,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1);
/* generate a dense 5d tensor */
XTensor * NewTensor5D(const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense 5d tensor V2 */
XTensor * NewTensor5DV2(const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1);
/* generate a copy of XTensor (with a reference to a given tensor) */
XTensor * NewTensor(const XTensor * a, bool isFilledData = true);
...
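To see how the constructors above fit together, here is a minimal usage sketch; the V2 variants simply drop the XMem argument. This is illustrative only and not part of the diff; it assumes the declarations above are visible through the library's usual headers.

/* sketch: creating and initializing tensors with the V2 API */
void ExampleTensorCreation()
{
    /* a 2 x 3 float matrix on the CPU (devID = -1), heap-allocated */
    XTensor * m = NewTensor2DV2(2, 3, X_FLOAT, -1);

    /* in-place initialization of a stack object */
    XTensor t;
    InitTensor3DV2(&t, 4, 2, 3, X_FLOAT, -1);

    /* clone shape and type from a reference tensor */
    XTensor u;
    InitTensorV2(&u, &t);

    delete m;
}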
...@@ -85,7 +85,7 @@ namespace nts{
#endif
typedef void (*TFunction) (volatile XList*);
typedef void (*TFunction) (volatile TensorList*);
/*
This is a class that wraps the standard implementation of threading
...@@ -133,7 +133,7 @@ public:
/* arguments (for the function to run) */
volatile
XList * argv;
TensorList * argv;
/* a flag to break */
volatile
...
...@@ -97,4 +97,5 @@
#include "utilities/XMatrixSegment.h"
#include "utilities/FlushToMem.h"
#include "../function/DropoutWithIndex.h"
#endif // __CHEADER_H__
...@@ -21,6 +21,7 @@
#include "../../XTensor.h"
#include "../../XName.h"
#include "../../XUtility.h"
#include "Div.h"
#include "Div.cuh"
#include "DivDim.h"
...@@ -41,12 +42,15 @@ where i is the index of the item
*/
void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
{
int leadingDimRDI = a->order - leadingDim - 1;
CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
"Unmatched tensors in division!");
CheckNTErrors((a->order == b->order && a->order == c->order),
"Unmatched tensors!");
CheckDev(a->devID, b->devID);
int leadingDimRDI = a->order - leadingDim - 1;
#ifdef USE_CUDA
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
_CudaDiv(a, b, c, alpha, leadingDim);
...
...@@ -19,10 +19,12 @@
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-15
*/
#include <math.h>
#include "Div.h"
#include "DivDim.h"
#include "DivDim.cuh"
#include "../../XName.h"
#include "../../XUtility.h"
#include "../movement/CopyValues.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -42,6 +44,8 @@ i.e., a is divided with b by broadcasting
*/
void _DivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha)
{
n = MODX(n, a->order);
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in division!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
...@@ -50,6 +54,8 @@ void _DivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alp
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
CheckDev(a->devID, b->devID);
if(XTensor::IsSameShaped(a, b)){
_Div(a, b, c, alpha);
return;
...@@ -152,6 +158,8 @@ XTensor DivDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha)
XTensor c(&a);
c.SetTMPFlag();
n = MODX(n, a.order);
/* call _Div function */
_DivDim(&a, &b, &c, n, alpha);
...
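The new `n = MODX(n, ...)` lines normalize the dimension index before any check reads `a->dimSize[n]`, which is what lets callers pass negative indices such as -1 for the last dimension. The MODX definition itself is not part of this diff; a plausible stand-in, under that assumption:

/* hypothetical stand-in for the MODX macro used above: wraps n from
   [-order, order) into [0, order), so -1 selects the last dimension */
inline int ModXSketch(int n, int order)
{
    return (n % order + order) % order;   /* e.g. ModXSketch(-1, 3) == 2 */
}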
...@@ -17,7 +17,6 @@
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-15
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-15 float16 added
*/
#include "DivDim.cuh"
...@@ -169,34 +168,6 @@ void _CudaDivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE
ShowNTErrors("Something is wrong!");
}
}
else if (a->dataType == X_FLOAT16) {
half alpha1 = __float2half(alpha);
if (stride > 1){
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
if (alpha == (DTYPE)0.0F)
KernelDivWithCol<__half, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((__half*)a->data, (__half*)b->data, (__half*)c->data,
blockSize, stride, blockSize * stride, blockNum, alpha1);
else
KernelDivWithCol<__half, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((__half*)a->data, (__half*)b->data, (__half*)c->data,
blockSize, stride, blockSize * stride, blockNum, alpha1);
}
else if (stride == 1){
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
if (alpha == (DTYPE)0.0F)
KernelDivWithRow<__half, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((__half*)a->data, (__half*)b->data, (__half*)c->data,
blockNum, blockSize, alpha1);
else
KernelDivWithRow<__half, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((__half*)a->data, (__half*)b->data, (__half*)c->data,
blockNum, blockSize, alpha1);
}
else {
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
}
...
...@@ -16,10 +16,10 @@
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2019-04-24
* I'll attend several conferences and workshops in the following weeks -
* busy days :(
*/
#ifndef __MASK_H__
#define __MASK_H__
...@@ -55,4 +55,3 @@ XTensor Mask(const XTensor &a, const XTensor &mask, DTYPE alpha = 0.0);
} // namespace nts(NiuTrans.Tensor)
#endif // __MASK_H__
...@@ -54,6 +54,8 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
XTensor * c, DTYPE alpha, DTYPE beta, XPRunner * parallelRunner)
{
CheckNTErrors(a && b && c, "Empty input tensors!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Input tensors should have the same data type!");
CheckNTErrors(a->order >= 2 && b->order >= 2 && c->order >= 2,
"Input tensors must have an order >= 2!");
CheckNTErrors(c->order == a->order + b->order - 2, "wrong tensor order")
...@@ -106,9 +108,9 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
cBlockNum *= b->dimSizeRDI[i];
}
XList * aList = new XList(10);
XList * bList = new XList(10);
XList * cList = new XList(10);
TensorList * aList = new TensorList(10);
TensorList * bList = new TensorList(10);
TensorList * cList = new TensorList(10);
int aDimSize[2] = { -a->dimSizeRDI[1], a->dimSizeRDI[0] };
int bDimSize[2] = { -b->dimSizeRDI[1], b->dimSizeRDI[0] };
int cDimSize[2] = { -c->dimSizeRDI[1], c->dimSizeRDI[0] };
...
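The new data-type check aside, the order check `c->order == a->order + b->order - 2` is the shape contract here: the trailing two dimensions of each input are the matrix, and the leading dimensions multiply out into blocks. A quick shape sketch, illustrative and not taken from the diff:

/* shape bookkeeping implied by the checks above:
   a: (2, 4, 3)   order 3, trailing matrix 4 x 3
   b: (3, 5)      order 2, trailing matrix 3 x 5
   c must then have order 3 + 2 - 2 = 3, e.g. (2, 4, 5) */
int ExpectedOrder(const XTensor * a, const XTensor * b)
{
    return a->order + b->order - 2;   /* mirrors the CheckNTErrors condition */
}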
...@@ -38,17 +38,23 @@ argument5: matrix a
argument6: matrix b
argument7: matrix c (c=a*b*\alpha + c*beta)
*/
void _MatrixMul2DMultiTheading(XList * args)
void _MatrixMul2DMultiTheading(TensorList * args)
{
int x1 = *(int*)args->GetItem(0);
int y1 = *(int*)args->GetItem(1);
int x2 = *(int*)args->GetItem(2);
int y2 = *(int*)args->GetItem(3);
XTensor * a = (XTensor*)args->GetItem(4);
XTensor * b = (XTensor*)args->GetItem(5);
XTensor * c = (XTensor*)args->GetItem(6);
DTYPE alpha = *(DTYPE*)args->GetItem(7);
DTYPE beta = *(DTYPE*)args->GetItem(8);
CheckNTErrors(args->count == 2, "invalid argument number!");
IntList * indexArgs = (IntList*)args->GetItem(0);
TensorList * matrixArgs = (TensorList*)args->GetItem(1);
CheckNTErrors(indexArgs->count == 4, "invalid argument number!");
CheckNTErrors(matrixArgs->count == 5, "invalid argument number!");
XTensor * a = matrixArgs->GetItem(0);
XTensor * b = matrixArgs->GetItem(1);
XTensor * c = matrixArgs->GetItem(2);
DTYPE alpha = *(DTYPE*)(matrixArgs->GetItem(3));
DTYPE beta = *(DTYPE*)(matrixArgs->GetItem(4));
int x1 = indexArgs->GetItem(0);
int y1 = indexArgs->GetItem(1);
int x2 = indexArgs->GetItem(2);
int y2 = indexArgs->GetItem(3);
#ifdef FAST_MATRIX
int am = a->dimSize[1];
...
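The rewritten unpacking implies a packing convention on the caller's side: a two-element outer list holding an IntList with the four block coordinates and a TensorList with the three matrices plus pointers to the two scalars. A hypothetical packing sketch; the Add signatures and pointer casts are assumptions, inferred from the GetItem casts above rather than confirmed API:

/* hypothetical caller-side packing for _MatrixMul2DMultiTheading */
void PackMatMulArgs(TensorList * args,
                    int x1, int y1, int x2, int y2,
                    XTensor * a, XTensor * b, XTensor * c,
                    DTYPE * alpha, DTYPE * beta)
{
    IntList * indexArgs = new IntList(4);
    indexArgs->Add(x1);
    indexArgs->Add(y1);
    indexArgs->Add(x2);
    indexArgs->Add(y2);

    TensorList * matrixArgs = new TensorList(5);
    matrixArgs->Add(a);
    matrixArgs->Add(b);
    matrixArgs->Add(c);
    matrixArgs->Add((XTensor*)alpha);   /* scalar smuggled as a pointer, read back via *(DTYPE*) */
    matrixArgs->Add((XTensor*)beta);

    args->Add((XTensor*)indexArgs);     /* becomes GetItem(0) in the worker */
    args->Add((XTensor*)matrixArgs);    /* becomes GetItem(1) in the worker */
}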
...@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
matrix multiplication for a block (x1,y1) - (x2,y2)
where (x1,y1) is the upper-left corner and (x2,y2) is the bottom-right corner
*/
void _MatrixMul2DMultiTheading(XList * args);
void _MatrixMul2DMultiTheading(TensorList * args);
} // namespace nts(NiuTrans.Tensor)
...
...@@ -227,9 +227,9 @@ c_i = trans(a_i) * trans(b_i) * \alpha + c_i * \beta for each i in [0,count-1]
>> alpha - scalar
>> beta - scalar
*/
void _MatrixMulBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA,
const XList * b, MATRIX_TRANS_TYPE transposedB,
XList * c, DTYPE alpha, DTYPE beta)
void _MatrixMulBatchedCPU(const TensorList * a, MATRIX_TRANS_TYPE transposedA,
const TensorList * b, MATRIX_TRANS_TYPE transposedB,
TensorList * c, DTYPE alpha, DTYPE beta)
{
CheckNTErrors(a && b && c, "Empty input lists!");
CheckNTErrors(a->count == b->count && a->count == c->count, "Input lists must be of the same size!");
...
...@@ -58,8 +58,8 @@ void _MatrixMulBatchedCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA, cons
matrix multiplication of the two tensors c = trans(a) * trans(b) * alpha + c * beta (for list inputs)
optimized for GPU
*/
void _MatrixMulBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA, const XList * b, MATRIX_TRANS_TYPE transposedB,
XList * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
void _MatrixMulBatchedCPU(const TensorList * a, MATRIX_TRANS_TYPE transposedA, const TensorList * b, MATRIX_TRANS_TYPE transposedB,
TensorList * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
/*
matrix multiplication of the two tensors (return an XTensor structure) c = trans(a) * trans(b) * alpha
...
...@@ -21,6 +21,7 @@
#include "../../XTensor.h"
#include "../../XName.h"
#include "../../XUtility.h"
#include "Multiply.h"
#include "Multiply.cuh"
#include "MultiplyDim.h"
...@@ -41,12 +42,15 @@ where i is the index of the item
*/
void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
{
int leadingDimRDI = a->order - leadingDim - 1;
CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
"Unmatched tensors in multiplication!");
CheckNTErrors((a->order == b->order && a->order == c->order),
"Unmatched tensors!");
CheckDev(a->devID, b->devID);
int leadingDimRDI = a->order - leadingDim - 1;
#ifdef USE_CUDA
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
_CudaMultiply(a, b, c, alpha, leadingDim);
...
...@@ -19,6 +19,7 @@
* $Created by: JIANG Yufan (email: jiangyufan2018@outlook.com) 2018-08-14
*/
#include <math.h>
#include "Multiply.h"
#include "MultiplyDim.h"
#include "MultiplyDim.cuh"
...@@ -42,7 +43,9 @@ i.e., a is multiplied with b by broadcasting
>> n - the dimension index
>> alpha - the scaling factor
*/
void _MultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha) {
void _MultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha)
{
n = MODX(n, a->order);
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in multiplication!");
...@@ -52,6 +55,8 @@ void _MultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYP
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
CheckDev(a->devID, b->devID);
if(XTensor::IsSameShaped(a, b)){
_Multiply(a, b, c, alpha);
return;
...@@ -151,6 +156,8 @@ XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n)
XTensor c(&a);
c.SetTMPFlag();
n = MODX(n, a.order);
/* call _Multiply function */
_MultiplyDim(&a, &b, &c, n, 0);
...
...@@ -44,6 +44,8 @@ void _Sub(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched tensors in subtraction!");
CheckDev(a->devID, b->devID);
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
#ifdef USE_CUDA
...
...@@ -19,10 +19,12 @@
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-13
*/
#include <math.h>
#include "Sub.h"
#include "SubDim.h"
#include "SubDim.cuh"
#include "../../XName.h"
#include "../../XUtility.h"
#include "../movement/CopyValues.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -42,6 +44,8 @@ i.e., a is subtracted with b by broadcasting
*/
void _SubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta)
{
n = MODX(n, a->order);
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in subtraction!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
...@@ -50,6 +54,8 @@ void _SubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE bet
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
CheckDev(a->devID, b->devID);
if (beta == 0) {
_CopyValues(a, c);
return;
...@@ -152,6 +158,8 @@ XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
XTensor c(&a);
c.SetTMPFlag();
n = MODX(n, a.order);
/* call _Sub function */
_SubDim(&a, &b, &c, n, beta);
...
...@@ -45,6 +45,8 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched tensors in addition!");
CheckDev(a->devID, b->devID);
if(beta == 0){
_CopyValues(a, c);
return;
...
...@@ -21,6 +21,7 @@
* Add summation by broadcasting.
*/
#include <math.h>
#include "Sum.h"
#include "SumDim.h"
#include "SumDim.cuh"
...@@ -46,6 +47,8 @@ i.e., a is summed with b by broadcasting
*/
void _SumDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta)
{
n = MODX(n, a->order);
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in addition!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
...@@ -54,6 +57,8 @@ void _SumDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE bet
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
CheckDev(a->devID, b->devID);
if(beta == 0){
_CopyValues(a, c);
return;
...@@ -170,6 +175,8 @@ XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
XTensor c(&a);
c.SetTMPFlag();
n = MODX(n, a.order);
/* call _SumDim function */
_SumDim(&a, &b, &c, n, beta);
...
...@@ -56,8 +56,8 @@ void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
/* matrix multiplication in batch mode via cuda version BLAS */
void _CudaBLASMatrixMULList(cublasHandle_t * handle, const XList * a, MATRIX_TRANS_TYPE transposedA,
const XList * b, MATRIX_TRANS_TYPE transposedB, XList * c,
void _CudaBLASMatrixMULList(cublasHandle_t * handle, const TensorList * a, MATRIX_TRANS_TYPE transposedA,
const TensorList * b, MATRIX_TRANS_TYPE transposedB, TensorList * c,
int count, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
#endif
...
...@@ -20,21 +20,20 @@
*/
#include "../../XTensor.h"
#include "../../XName.h"
#include "ConvertDataType.h"
#include "ConvertDataType.cuh"
#include "../movement/CopyValues.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
convert data type
>> input - input tensor
>> output - output tensor
*/
void _ConvertDataType(const XTensor * input, XTensor * output)
{
//CheckNTErrors((input->unitSize == output->unitSize), "Input and Output must be same in size!");
if (input->dataType == output->dataType)
return;
...@@ -62,35 +61,4 @@ void _ConvertDataType(const XTensor * input, XTensor * output)
ShowNTErrors("Unsupported data types for conversion!");
}
/*
convert data type (return an XTensor structure)
make a new tensor to keep the result and return it
>> input - input tensor
<< return - output tensor with the specified data type
*/
XTensor ConvertDataType(const XTensor & input, TENSOR_DATA_TYPE dataType)
{
if (input.dataType == dataType) {
XTensor output;
output = CopyValues(input);
return output;
}
int order = input.order;
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
XTensor output(order, input.dimSize, dataType, dr, input.devID, input.mem);
output.SetTMPFlag();
_ConvertDataType(&input, &output);
/* tensor connection */
XLink::MakeLink(&input, NULL, &output, GETANDSET_CONVERTDATATYPE);
return output;
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
...@@ -67,49 +67,7 @@ void KernelIntToFloat(int * inputData, float * outputData, int size)
if (i < size){
outputData[i] = (float)(inputData[i]);
}
}
__global__
void KernelFloatToInt8(float * inputData, __int8 * outputData, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
outputData[i] = (__int8)(inputData[i]);
}
}
__global__
void KernelInt8ToFloat(__int8 * inputData, float * outputData, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
outputData[i] = (float)(inputData[i]);
}
}
__global__
void KernelIntToInt8(int * inputData, __int8 * outputData, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
outputData[i] = (__int8)(inputData[i]);
}
}
__global__
void KernelInt8ToInt(__int8 * inputData, int * outputData, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
outputData[i] = (int)(inputData[i]);
}
}
/*
data conversion (cuda code)
...@@ -180,14 +138,6 @@ void _CudaConvertDataType(const XTensor * input, XTensor * output)
KernelFloatToFloat16<<<blocks, threads>>>((float*)input->data, (__half*)output->data, input->unitNum);
else if(input->dataType == X_FLOAT16 && output->dataType == X_FLOAT)
KernelFloat16ToFloat<<<blocks, threads>>>((__half*)input->data, (float*)output->data, input->unitNum);
else if (input->dataType == X_FLOAT && output->dataType == X_INT8)
KernelFloatToInt8 << <blocks, threads >> >((float*)input->data, (__int8*)output->data, input->unitNum);
else if (input->dataType == X_INT8 && output->dataType == X_FLOAT)
KernelInt8ToFloat << <blocks, threads >> >((__int8*)input->data, (float*)output->data, input->unitNum);
else if (input->dataType == X_INT && output->dataType == X_INT8)
KernelIntToInt8 << <blocks, threads >> >((int*)input->data, (__int8*)output->data, input->unitNum);
else if (input->dataType == X_INT8 && output->dataType == X_INT)
KernelInt8ToInt << <blocks, threads >> >((__int8*)input->data, (int*)output->data, input->unitNum);
else{
ShowNTErrors("Unsupported data types for conversion!");
}
...
...@@ -23,16 +23,12 @@
#define __CONVERTDATATYPE_H__
#include "../../XTensor.h"
#include "../../XDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* convert data type */
void _ConvertDataType(const XTensor * input, XTensor * output);
/* convert data type (return an XTensor structure) */
XTensor ConvertDataType(const XTensor & input, TENSOR_DATA_TYPE dataType);
} // namespace nts(NiuTrans.Tensor)
#endif // __CONVERTDATATYPE_H__
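With the high-level ConvertDataType wrapper removed from this header, conversion goes through the remaining low-level call, and the caller pre-allocates both sides. A sketch, assuming a CUDA device 0 and the initialization and SetData helpers that appear elsewhere in this commit:

/* illustrative float -> float16 conversion via the low-level entry point */
XTensor a;
XTensor aHalf;
InitTensor2DV2(&a, 2, 3, X_FLOAT, 0);        /* devID 0 */
InitTensor2DV2(&aHalf, 2, 3, X_FLOAT16, 0);  /* same shape, target type */
_SetDataRand(&a, -1.0F, 1.0F);
_ConvertDataType(&a, &aHalf);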
...@@ -21,7 +21,6 @@
#include "OnehotAndIndex.h"
#include "OnehotAndIndex.cuh"
#include "SetData.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
...@@ -32,65 +31,43 @@ convert onehot tensor to index tensor
>> index - index tensor, which value is an integer num
>> size - the last dimension size of the onehot tensor
*/
void _OnehotToIndex(XTensor * onehot, XTensor * index, int dim)
void _OnehotToIndex(XTensor * onehot, XTensor * index, int size)
{
dim = (dim < 0 ? onehot->GetDim(-1) : dim);
CheckNTErrors(onehot->GetDim(-1) == size, "Illegal tensor dimension!");
CheckNTErrors(onehot->order == index->order + 1, "Illegal tensor order!");
CheckNTErrors(dim < onehot->order, "Illegal specified dimension!")
CheckNTErrors(onehot->dataType == X_INT, "The onehot tensor must be in X_INT!")
CheckNTErrors(index->dataType == X_INT, "The index tensor must be in X_INT!")
for (int i = 0; i < index->order; i++) {
for (int i = 0; i < index->order; i++)
if (i < dim) {
CheckNTErrors(index->GetDim(i) == onehot->GetDim(i), "Illegal tensor order!");
}
else {
CheckNTErrors(index->GetDim(i) == onehot->GetDim(i + 1), "Illegal tensor order!");
}
}
#ifdef USE_CUDA
if(onehot->devID >= 0 && index->devID >= 0) {
_CudaOnehotToIndex(onehot, index, dim);
_CudaOnehotToIndex(onehot, index, size);
return;
}
#endif
int blockNum = 1;
int blockSize = 1;
int blockNum = index->unitNum;
int stride = size;
int dimSize = 1;
int stride = 1;
for (int i = 0; i < dim; i++)
blockNum *= onehot->GetDim(i);
blockSize = onehot->unitNum / blockNum;
dimSize = onehot->GetDim(dim);
for (int i = dim + 1; i < onehot->order; i++)
stride *= onehot->GetDim(i);
int * onehotData = (int *)onehot->data;
int * indexData = (int *)index->data;
for (int i = 0; i < blockNum; i++) {
for (int j = 0; j < stride; j++) {
int * od = onehotData + i * stride;
int * od = onehotData + i * blockSize + j;
int * index = indexData + i * stride + j;
int record = -1;
for (int j = 0; j < dimSize; j++) {
if (od[j*stride] != 0) {
for (int j = 0; j < stride; j++) {
if (od[j] != 0) {
if (record == -1)
record = j;
else
ShowNTErrors("The value of onehot tensor is illegal!");
}
}
*index = record;
indexData[i] = record;
}
}
}
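A worked example of the simplified CPU loop in the new version: each of the `blockNum = index->unitNum` rows scans its `stride = size` onehot entries and records the unique non-zero position.

/* e.g. onehot (2 x 3, X_INT) = { {0, 1, 0},
                                  {0, 0, 1} }
   blockNum = 2, stride = 3
   row 0: od[1] != 0  ->  indexData[0] = 1
   row 1: od[2] != 0  ->  indexData[1] = 2
   a second non-zero entry in a row triggers
   "The value of onehot tensor is illegal!" */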
/*
...@@ -122,7 +99,7 @@ convert index tensor to onehot tensor
>> onehot - onehot tensor, which value is 0 or 1
>> size - the last dimension size of the onehot tensor
*/
void _IndexToOnehot(const XTensor * index, XTensor * onehot, int size, float labelSmoothingP)
void _IndexToOnehot(XTensor * index, XTensor * onehot, int size, float labelSmoothingP)
{
CheckNTErrors(onehot->GetDim(-1) == size, "Illegal tensor dimension!");
CheckNTErrors(onehot->order == index->order + 1, "Illegal tensor order!");
...@@ -134,12 +111,9 @@ void _IndexToOnehot(const XTensor * index, XTensor * onehot, int size, float lab
onehot->SetZeroAll();
#ifdef USE_CUDA
float confidence = 1 - labelSmoothingP;
float lowconfidence = labelSmoothingP / size;
//_SetDataFixedFloat(onehot, lowconfidence);
#ifdef USE_CUDA
if(onehot->devID >= 0 && index->devID >= 0) {
_CudaIndexToOnehot(index, onehot, size, confidence, lowconfidence);
return;
...@@ -155,7 +129,7 @@ void _IndexToOnehot(const XTensor * index, XTensor * onehot, int size, float lab
for (int i = 0; i < blockNum; i++) {
int id = indexData[i];
DTYPE * od = onehotData + i * stride;
od[id] = confidence;
od[id] = 1;
}
}
...
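The label-smoothing arithmetic above in concrete numbers, plus a caveat visible in this revision: the CPU branch now writes `od[id] = 1`, so the smoothed values are only produced on the CUDA path.

/* for labelSmoothingP = 0.1 and size = 4 (CUDA path):
   confidence    = 1 - 0.1 = 0.9     -> target class
   lowconfidence = 0.1 / 4 = 0.025   -> every other class
   CPU path in this revision: target entry = 1, others stay 0 */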
...@@ -110,8 +110,9 @@ void KernelIndexToOnehot(DTYPE * onehotData, int * indexData, int blockNum, int
DTYPE * od = onehotData + i * stride;
int id = indexData[i];
//od[id] = confidence;
//od[id] = 2.0;
//onehotData[i * stride + id] = 0.1;
if (offset == id)
od[offset] = confidence;
else{
...@@ -126,7 +127,7 @@ convert index tensor to onehot tensor (cuda version)
>> onehot - onehot tensor, which value is 0 or 1
>> size - the last dimension size of the onehot tensor
*/
void _CudaIndexToOnehot(const XTensor * index, XTensor * onehot, int size, float confidence, float lowconfidence)
void _CudaIndexToOnehot(XTensor * index, XTensor * onehot, int size, float confidence, float lowconfidence)
{
int devID = onehot->devID;
...
...@@ -30,7 +30,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
void _CudaOnehotToIndex(XTensor * onehot, XTensor * index, int size);
/* convert index tensor to onehot tensor (cuda version) */
void _CudaIndexToOnehot(const XTensor * index, XTensor * onehot, int size, float confidence, float lowconfidence);
void _CudaIndexToOnehot(XTensor * index, XTensor * onehot, int size, float confidence, float lowconfidence);
} // namespace nts(NiuTrans.Tensor)
...
...@@ -27,18 +27,18 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* convert onehot tensor to index tensor */
void _OnehotToIndex(XTensor * onehot, XTensor * index, int dim);
void _OnehotToIndex(XTensor * onehot, XTensor * index, int size);
/* convert onehot tensor to index tensor (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor OnehotToIndex(XTensor & onehot, int size);
XTensor OnehotToIndex(XTensor & onehot, int num);
/* convert index tensor to onehot tensor */
void _IndexToOnehot(const XTensor * index, XTensor * onehot, int size, float labelSmoothingP = 0.0F);
void _IndexToOnehot(XTensor * index, XTensor * onehot, int size, float labelSmoothingP);
/* convert index tensor to onehot tensor (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor IndexToOnehot(XTensor & index, int num, float labelSmoothingP = 0.0F);
XTensor IndexToOnehot(XTensor & index, int num, float labelSmoothingP);
} // namespace nts(NiuTrans.Tensor)
...
...@@ -19,7 +19,6 @@
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-18
* I'm surprised that I did not write this file till today.
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-22 float16 added
*/
#ifndef __SETDATA_CUH__
...@@ -29,9 +28,22 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* generate data items with a fixed value p (in int, float, float16, double) */
template<class T>
void _CudaSetDataFixed(XTensor * tensor, T p);
/* generate data items with a fixed value p (in int) */
void _CudaSetDataFixedInt(XTensor * tensor, int p);
/* generate data items with a fixed value p (in float) */
void _CudaSetDataFixedFloat(XTensor * tensor, float p);
/* generate data items with a fixed value p (in double) */
void _CudaSetDataFixedDouble(XTensor * tensor, double p);
/* generate data items with a fixed value p (in float) only
if the condition entry is non-zero */
void _CudaSetDataFixedCondFloat(XTensor * tensor, XTensor * condition, float p);
/* generate data items with a fixed value p (in int) only
if the condition entry is non-zero */
void _CudaSetDataFixedCondInt(XTensor * tensor, XTensor * condition, int p);
/* set data items along with a given dimension (and keep the remaining items unchanged) */
void _CudaSetDataDim(XTensor * tensor, int beg, int len, int dim, DTYPE p);
...@@ -43,11 +55,11 @@ void _CudaSetDataIndexed(XTensor * source, XTensor * modify, int dim, int index)
void _CudaSetDataLowTri(XTensor * tensor, DTYPE p, int shift);
/* generate data items with a uniform distribution in [lower, upper] */
void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper);
void _CudaSetDataRand(const XTensor * tensor, DTYPE lower, DTYPE upper);
/* generate data items with a uniform distribution in [lower, upper] and set
the item to a pre-defined value if the item >= p, set the item to 0 otherwise */
void _CudaSetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value);
void _CudaSetDataRandP(const XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value);
/* set the data with an array of offsets */
void _CudaSetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYPE num);
...
...@@ -24,19 +24,35 @@
#define __SETDATA_H__
#include "../../XTensor.h"
#include "SetData.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* generate data items with a xavier initialization */
void _SetDataFanInOut(XTensor * tensor, DTYPE gain = 1.0F);
///* generate data items with a fixed value p */
//void _SetDataFixed(XTensor * tensor, void * valuePointer);
/* generate data items with a fixed value p */
void _SetDataFixed(XTensor * tensor, void * valuePointer);
/* generate data items with a fixed value p (in default type) */
template<class T>
void _SetDataFixed(XTensor * tensor, T value);
void SetDataFixed(XTensor &tensor, DTYPE p);
/* generate data items with a fixed value p (in integer) */
void SetDataFixedInt(XTensor &tensor, int p);
/* generate data items with a fixed value p (in int) */
void _SetDataFixedInt(XTensor * tensor, int p);
/* generate data items with a fixed value p (in float) */
void _SetDataFixedFloat(XTensor * tensor, float p);
/* generate data items with a fixed value p (in double) */
void _SetDataFixedDouble(XTensor * tensor, double p);
/* generate data items with a fixed value p only if the condition entry is non-zero */
void _SetDataFixedCond(XTensor * tensor, XTensor * condition, DTYPE p);
/* generate data items with a fixed value p only if the condition entry is non-zero */
void _SetDataFixedCondInt(XTensor * tensor, XTensor * condition, int p);
/* set data items along with a given dimension (and keep the remaining items unchanged) */
void _SetDataDim(XTensor * tensor, int beg, int len, int dim, DTYPE p);
...@@ -48,11 +64,11 @@ void _SetDataIndexed(XTensor * source, XTensor * modify, int dim, int index);
void _SetDataLowTri(XTensor * tensor, DTYPE p, int shift);
/* generate data items with a uniform distribution in [lower, upper] */
void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper);
void _SetDataRand(const XTensor * tensor, DTYPE lower, DTYPE upper);
/* generate data items with a uniform distribution in [lower, upper] and set
the item to a pre-defined value if the item >= p, set the item to 0 otherwise */
void _SetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value);
void _SetDataRandP(const XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value);
/* generate data items with a normal distribution with specified mean and standard deviation */
void _SetDataRandN(XTensor * tensor, DTYPE mean = 0.0F, DTYPE standardDeviation = 1.0F);
...
...@@ -170,14 +170,10 @@ SIMPLE_BINARY_FUNCTION_INT(Mod, _Mod)
#else
/* define three marco separately, specify the respective function names (CPU mode) */
#define _SIMPLE_BINARY_FUNCTION_INT(_funcName, _cudaFuncName, origFunc) \
#define _SIMPLE_BINARY_FUNCTION_INT(_funcName, origFunc) \
void _funcName(const XTensor * a, XTensor * b, int num) \
{ \
/* run it on GPUs */ \
if (a->devID >= 0) { \
_cudaFuncName(a, b, num); \
return; \
} \
CheckNTErrors(a->devID < 0, "No GPU code is supported"); \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same data type!"); \
CheckNTErrors((a->dataType == X_INT&&b->dataType == X_INT), "TODO!"); \
...@@ -187,14 +183,10 @@ void _funcName(const XTensor * a, XTensor * b, int num) \
db[i] = (int)origFunc(d[i], num); \
} \
#define _SIMPLE_BINARY_FUNCTION(_funcName, _cudaFuncName, origFunc) \
#define _SIMPLE_BINARY_FUNCTION(_funcName, origFunc) \
void _funcName(const XTensor * a, XTensor * b, float num) \
{ \
/* run it on GPUs */ \
if (a->devID >= 0) { \
_cudaFuncName(a, b, num); \
return; \
} \
CheckNTErrors(a->devID < 0, "No GPU code is supported"); \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same data type!"); \
CheckNTErrors((a->dataType == X_FLOAT&&b->dataType == X_FLOAT), "TODO!");\
...@@ -228,34 +220,36 @@ void funcName(const XTensor &a, XTensor &b, float num) \
_funcName(&a, &b, num); \
} \
_SIMPLE_BINARY_FUNCTION_INT(_Scale, _CudaScale, scale)
SIMPLE_BINARY_FUNCTION_ME_INT(Scale, _Scale)
_SIMPLE_BINARY_FUNCTION_INT(_Scale, scale)
SIMPLE_BINARY_FUNCTION_ME_INT(_ScaleMe, _Scale)
SIMPLE_BINARY_FUNCTION_INT(Scale, _Scale)
_SIMPLE_BINARY_FUNCTION(_Scale, _CudaScaleFloat, scale)
SIMPLE_BINARY_FUNCTION_ME(Scale, _Scale)
_SIMPLE_BINARY_FUNCTION(_Scale, scale)
SIMPLE_BINARY_FUNCTION_ME(_ScaleMe, _Scale)
SIMPLE_BINARY_FUNCTION(Scale, _Scale)
_SIMPLE_BINARY_FUNCTION_INT(_Descale, _CudaDescale, descale)
SIMPLE_BINARY_FUNCTION_ME_INT(Descale, _Descale)
_SIMPLE_BINARY_FUNCTION_INT(_Descale, descale)
SIMPLE_BINARY_FUNCTION_ME_INT(_DescaleMe, _Descale)
SIMPLE_BINARY_FUNCTION_INT(Descale, _Descale)
_SIMPLE_BINARY_FUNCTION(_Descale, _CudaDescaleFloat, descale)
SIMPLE_BINARY_FUNCTION_ME(Descale, _Descale)
_SIMPLE_BINARY_FUNCTION(_Descale, descale)
SIMPLE_BINARY_FUNCTION_ME(_DescaleMe, _Descale)
SIMPLE_BINARY_FUNCTION(Descale, _Descale)
_SIMPLE_BINARY_FUNCTION_INT(_Shift, _CudaShift, shift)
SIMPLE_BINARY_FUNCTION_ME_INT(Shift, _Shift)
_SIMPLE_BINARY_FUNCTION_INT(_Shift, shift)
SIMPLE_BINARY_FUNCTION_ME_INT(_Shift, _Shift)
SIMPLE_BINARY_FUNCTION_INT(Shift, _Shift)
_SIMPLE_BINARY_FUNCTION(_Shift, _CudaShiftFloat, shift)
SIMPLE_BINARY_FUNCTION_ME(Shift, _Shift)
_SIMPLE_BINARY_FUNCTION(_Shift, shift)
SIMPLE_BINARY_FUNCTION_ME(_ShiftMe, _Shift)
SIMPLE_BINARY_FUNCTION(Shift, _Shift)
_SIMPLE_BINARY_FUNCTION_INT(_Mod, _CudaMod, mod)
SIMPLE_BINARY_FUNCTION_ME_INT(Mod, _Mod)
_SIMPLE_BINARY_FUNCTION_INT(_Mod, mod)
SIMPLE_BINARY_FUNCTION_ME_INT(_ModMe, _Mod)
SIMPLE_BINARY_FUNCTION_INT(Mod, _Mod)
#endif
} // namespace nts(NiuTrans.Tensor)
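To make the macro mechanics concrete, this is approximately what `_SIMPLE_BINARY_FUNCTION_INT(_Scale, scale)` now expands to in the CPU-only branch. The data-pointer setup and loop fall inside the collapsed hunk above, so those three lines are an assumption, and `scale(x, num)` is the per-element helper named by origFunc.

/* approximate expansion (illustrative, not from the diff) */
void _Scale(const XTensor * a, XTensor * b, int num)
{
    CheckNTErrors(a->devID < 0, "No GPU code is supported");
    CheckNTErrors((XTensor::IsSameShaped(a, b)),
                  "Input tensors should have the same data type!");
    CheckNTErrors((a->dataType == X_INT && b->dataType == X_INT), "TODO!");
    int * d = (int*)a->data;             /* assumed from the elided lines */
    int * db = (int*)b->data;
    for (int i = 0; i < a->unitNum; i++)
        db[i] = (int)scale(d[i], num);   /* per-element integer op */
}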
...@@ -104,7 +104,7 @@ void _Cuda##funcName(const XTensor * a, XTensor * b, int num) \
((int*)a->data, (int*)b->data, a->unitNum, num); \
} \
else { \
ShowNTErrors("TODOhaha!"); \
ShowNTErrors("TODO!"); \
} \
\
BacktoCudaDev(a->devID, devIDBackup); \
...
...@@ -17,7 +17,6 @@
/*
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-08-03
* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-06 float16/int/int8 added
*/
#include "../../XDevice.h"
...@@ -36,9 +35,8 @@ set each entry to its clip value (CUDA Kernel)
>> upper - the upper border
>> size - size of the data array
*/
template <class T>
__global__
void KernelClip(T * a, T * b, T lower, T upper, int size)
void KernelClip(DTYPE * a, DTYPE * b, DTYPE lower, DTYPE upper, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
...@@ -53,6 +51,21 @@ void KernelClip(T * a, T * b, T lower, T upper, int size)
}
/*
set each entry to its clip value with float16 data type value (CUDA Kernel)
This is for float16 computation
>> a - pointer to input data array
>> b - pointer to output data array
>> lower - the lower border
>> upper - the upper border
>> size - size of the data array
*/
__global__
void KernelClip(__half * a, __half * b, DTYPE lower, DTYPE upper, int size)
{
return;
}
/*
set each entry to its clip value
>> a - input tensor we are processing
>> b - output tensor we are processing
...@@ -79,22 +92,7 @@ void _CudaClip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper)
KernelClip << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, lower, upper, a->unitNum);
}
else if (a->dataType == X_FLOAT16) {
half lower1 = __float2half(lower);
half upper1 = __float2half(upper);
KernelClip << <blocks, threads >> >((__half*)a->data, (__half*)b->data, lower1, upper1, a->unitNum);
}
else if (a->dataType == X_INT) {
int lower1 = (int)lower;
int upper1 = (int)upper;
KernelClip << <blocks, threads >> >((int *)a->data, (int *)b->data, lower1, upper1, a->unitNum);
}
else if (a->dataType == X_INT8) {
__int8 lower1 = (__int8)lower;
__int8 upper1 = (__int8)upper;
KernelClip << <blocks, threads >> >((__int8 *)a->data, (__int8 *)b->data, lower1, upper1, a->unitNum);
KernelClip << <blocks, threads >> >((__half*)a->data, (__half*)b->data, lower, upper, a->unitNum);
}
else {
ShowNTErrors("TODO!");
...
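The kernel body is collapsed in this view; the rule KernelClip applies per element is the usual clip, sketched host-side for reference:

/* per-element clip rule (host-side sketch of what the kernel computes) */
DTYPE ClipOne(DTYPE x, DTYPE lower, DTYPE upper)
{
    if (x > upper) return upper;
    if (x < lower) return lower;
    return x;
}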
...@@ -29,9 +29,12 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* set each entry to its clip value (CUDA Kernel) */
template <class T>
__global__
void KernelClip(T * a, T * b, T lower, T upper, int size);
void KernelClip(DTYPE * a, DTYPE * b, DTYPE lower, DTYPE upper, int size);
/* set each entry to its clip value (CUDA Kernel) with float16 data type */
__global__
void KernelClip(__half * a, __half * b, DTYPE lower, DTYPE upper, int size);
/* set each entry to its clip value */
void _CudaClip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper);
...
...@@ -28,17 +28,11 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* compare whether every entry is equal to the specified value (cuda kernel) */
__global__
void KernelEqual(DTYPE * a, DTYPE * b, DTYPE * number);
/* compare whether every entry is equal to the specified value (cuda version) */
void _CudaEqual(const XTensor * a, XTensor * b, DTYPE number);
/* compare whether every entry is not equal to the specified value (cuda kernel) */
__global__
void KernelNotEqual(DTYPE * a, DTYPE * b, DTYPE * number);
/* compare whether every entry is not equal to the specified value (cuda version) */
void _CudaNotEqual(const XTensor * a, XTensor * b, DTYPE number);
/* check whether every entry is equal to the given value (cuda version) */
void _CudaEqual(const XTensor * a, XTensor * b, DTYPE value);
/* check whether every entry is not equal to the given value (cuda version) */
void _CudaNotEqual(const XTensor * a, XTensor * b, DTYPE value);
#endif // USE_CUDA
...
@@ -26,23 +26,23 @@
 namespace nts{ // namespace nts(NiuTrans.Tensor)
-/* compare whether every entry is equal to the specified value */
-void _Equal(const XTensor * a, XTensor * b, DTYPE number);
+/* check whether every entry is equal to the given value */
+void _Equal(const XTensor * a, XTensor * b, DTYPE value);
-/* compare whether every entry is equal to the specified value (do it on site)
-keep the result in the input tensor a and return nothing */
-void _EqualMe(XTensor * a, DTYPE number);
+/* check whether every entry is equal to the given value (do it on site) */
+void _EqualMe(XTensor * a, DTYPE value);
-/* compare whether every entry is equal to the specified value (return an XTensor structure)
-make a new tensor to keep the result and return it */
-XTensor Equal(const XTensor & a, DTYPE number);
+/* check whether every entry is equal to the given value (return an XTensor structure) */
+XTensor Equal(const XTensor & a, DTYPE value);
-/* compare whether every entry is not equal to the specified value */
-void _NotEqual(const XTensor * a, XTensor * b, DTYPE number);
+/* check whether every entry is not equal to the given value */
+void _NotEqual(const XTensor * a, XTensor * b, DTYPE value);
-/* compare whether every entry is not equal to the specified value (do it on site)
-keep the result in the input tensor a and return nothing */
-void _NotEqualMe(XTensor * a, DTYPE number);
+/* check whether every entry is not equal to the given value (do it on site) */
+void _NotEqualMe(XTensor * a, DTYPE value);
-/* compare whether every entry is not equal to the specified value (return an XTensor structure)
-make a new tensor to keep the result and return it */
-XTensor NotEqual(const XTensor & a, DTYPE number);
+/* check whether every entry is not equal to the given value (return an XTensor structure) */
+XTensor NotEqual(const XTensor & a, DTYPE value);
 } // namespace nts(NiuTrans.Tensor)
......
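The comparison routines above produce an elementwise mask. Assuming the usual convention, which the declarations themselves do not spell out, each output entry is 1.0 where the comparison holds and 0.0 elsewhere. A sketch kernel under that assumption (hypothetical name, not the library's code):

#include <cuda_runtime.h>

/* sketch only: assumed semantics of _Equal, writes 1.0 where a[i] == value */
__global__
void KernelEqualSketch(float * a, float * b, float value, int size)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < size)
        b[i] = (a[i] == value) ? 1.0F : 0.0F;
}

A NotEqual counterpart would simply flip the predicate.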
@@ -138,12 +138,12 @@ XTensor Normalize(const XTensor &input, int dim, const XTensor &mean, const XTen
     _Normalize(&input, &output, dim, &mean, &var, &a, &b, epsilon);
     /* tensor connections */
-    XList list(5);
-    list.Add(&input);
-    list.Add(&mean);
-    list.Add(&var);
-    list.Add(&a);
-    list.Add(&b);
+    TensorList list(5);
+    list.Add((XTensor*)&input);
+    list.Add((XTensor*)&mean);
+    list.Add((XTensor*)&var);
+    list.Add((XTensor*)&a);
+    list.Add((XTensor*)&b);
     XLink::MakeLink(&list, &output, MATH_NORMALIZE);
     XLink::AddParamToHeadInt(&output, dim);
     XLink::AddParamToHead(&output, epsilon);
......
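The Normalize change is mechanical: the connection list becomes a TensorList whose Add takes a non-const XTensor*, so the const references held by the caller must be cast before insertion. The pattern in isolation, with placeholder types standing in for XTensor and TensorList:

#include <vector>

struct Tensor { };                      /* placeholder for XTensor */

int main()
{
    std::vector<Tensor*> list;          /* placeholder for TensorList: stores non-const pointers */
    const Tensor input = {};            /* the caller only holds const references */
    list.push_back((Tensor*)&input);    /* the same cast as list.Add((XTensor*)&input) */
    return (int)list.size() - 1;        /* 0 on success */
}

The cast is safe here because the autograd bookkeeping only records the pointers; it does not mutate the inputs through them.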
@@ -17,7 +17,6 @@
 /*
  * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
- * $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-06 float16/int added
  */
 #include "ScaleAndShift.cuh"
@@ -35,9 +34,9 @@ scale and shift all tensor entries b = a * scale + shift (CUDA Kernel)
 >> scale - how much we want to scale it
 >> shift - how much we want to shift it
 */
-template<class T, bool isUnitScale, bool isZeroShift>
+template<bool isUnitScale, bool isZeroShift>
 __global__
-void KernelScaleAndShift(T * a, T * b, int size, T scale, T shift)
+void KernelScaleAndShift(DTYPE * a, DTYPE * b, int size, DTYPE scale, DTYPE shift)
 {
     int i = blockDim.x * blockIdx.x + threadIdx.x;
@@ -57,6 +56,28 @@ void KernelScaleAndShift(T * a, T * b, int size, T scale, T shift)
     }
 }
+/*
+scale and shift all tensor entries b = a * scale + shift (CUDA Kernel)
+This is for float16 computation
+>> a - the input data array
+>> b - the output data array
+>> size - the size of the data array
+>> scale - how much we want to scale it
+>> shift - how much we want to shift it
+*/
+__global__
+void KernelScaleAndShift(__half * a, __half * b, int size, __half scale, __half shift)
+{
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
+    if (i < size)
+        b[i] = __hadd(__hmul(a[i], scale), shift);
+#else
+    if (i < size)
+        b[i] = __float2half(__half2float(a[i]) * __half2float(scale) + __half2float(shift));
+#endif
+}
 /*
 scale and shift all tensor entries
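The added kernel guards the native half intrinsics: __hadd and __hmul require compute capability 5.3 or newer, the !defined(__CUDA_ARCH__) clause keeps the host-side pass of nvcc's compilation on the same branch, and older devices fall back to converting through float. A self-contained sketch of the same guard, runnable as written (hypothetical names; error checks omitted; the fast path assumes an sm_53+ device):

#include <cuda_fp16.h>
#include <cstdio>

/* mirrors the guard in the kernel above */
__global__
void ScaleShiftHalfSketch(__half * a, __half * b, int size, __half scale, __half shift)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
    if (i < size)
        b[i] = __hadd(__hmul(a[i], scale), shift);
#else
    if (i < size)
        b[i] = __float2half(__half2float(a[i]) * __half2float(scale) + __half2float(shift));
#endif
}

int main()
{
    const int n = 8;
    __half *a, *b;
    cudaMallocManaged(&a, n * sizeof(__half));   /* managed memory keeps the sketch short */
    cudaMallocManaged(&b, n * sizeof(__half));
    for (int i = 0; i < n; i++)
        a[i] = __float2half((float)i);
    ScaleShiftHalfSketch<<<1, n>>>(a, b, n, __float2half(2.0F), __float2half(1.0F));
    cudaDeviceSynchronize();
    for (int i = 0; i < n; i++)
        printf("%g ", __half2float(b[i]));       /* expected: 1 3 5 7 9 11 13 15 */
    printf("\n");
    cudaFree(a);
    cudaFree(b);
    return 0;
}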
@@ -87,52 +108,20 @@ void _CudaScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift
     if(a->dataType == DEFAULT_DTYPE){
         if(scale == 1.0F && shift == 0)
-            KernelScaleAndShift<DTYPE, true, true> <<<blocks, threads>>>((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
+            KernelScaleAndShift<true, true> <<<blocks, threads>>>((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
         else if (scale == 1.0F && shift != 0)
-            KernelScaleAndShift<DTYPE, true, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
+            KernelScaleAndShift<true, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
         else if(scale != 1.0F && shift == 0)
-            KernelScaleAndShift<DTYPE, false, true> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
+            KernelScaleAndShift<false, true> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
         else
-            KernelScaleAndShift<DTYPE, false, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
+            KernelScaleAndShift<false, false> << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum, scale, shift);
     }
     else if(a->dataType == X_FLOAT16){
-        half scale1 = __float2half(scale);
-        half shift1 = __float2half(shift);
-        if (scale == 1.0F && shift == 0)
-            KernelScaleAndShift<__half, true, true><<<blocks, threads>>>((__half*)a->data, (__half*)b->data, a->unitNum, scale1, shift1);
-        else if (scale == 1.0F && shift != 0)
-            KernelScaleAndShift<__half, true, false><<<blocks, threads>>>((__half*)a->data, (__half*)b->data, a->unitNum, scale1, shift1);
-        else if (scale != 1.0F && shift == 0)
-            KernelScaleAndShift<__half, false, true><<<blocks, threads>>>((__half*)a->data, (__half*)b->data, a->unitNum, scale1, shift1);
-        else
-            KernelScaleAndShift<__half, false, false> << <blocks, threads >> >((__half*)a->data, (__half*)b->data, a->unitNum, scale1, shift1);
-    }
-    else if (a->dataType == X_INT){
-        int scale2 = int(scale);
-        int shift2 = int(shift);
-        if (scale == 1.0F && shift == 0)
-            KernelScaleAndShift<int, true, true><<<blocks, threads>>>((int *)a->data, (int *)b->data, a->unitNum, scale2, shift2);
-        else if (scale == 1.0F && shift != 0)
-            KernelScaleAndShift<int, true, false><<<blocks, threads>>>((int *)a->data, (int *)b->data, a->unitNum, scale2, shift2);
-        else if (scale != 1.0F && shift == 0)
-            KernelScaleAndShift<int, false, true><<<blocks, threads>>>((int *)a->data, (int *)b->data, a->unitNum, scale2, shift2);
-        else
-            KernelScaleAndShift<int, false, false><<<blocks, threads>>>((int *)a->data, (int *)b->data, a->unitNum, scale2, shift2);
-    }
-    else if (a->dataType == X_INT8){
-        __int8 scale2 = __int8(scale);
-        __int8 shift2 = __int8(shift);
-        if (scale == 1.0F && shift == 0)
-            KernelScaleAndShift<__int8, true, true> << <blocks, threads >> >((__int8 *)a->data, (__int8 *)b->data, a->unitNum, scale2, shift2);
-        else if (scale == 1.0F && shift != 0)
-            KernelScaleAndShift<__int8, true, false> << <blocks, threads >> >((__int8 *)a->data, (__int8 *)b->data, a->unitNum, scale2, shift2);
-        else if (scale != 1.0F && shift == 0)
-            KernelScaleAndShift<__int8, false, true> << <blocks, threads >> >((__int8 *)a->data, (__int8 *)b->data, a->unitNum, scale2, shift2);
-        else
-            KernelScaleAndShift<__int8, false, false> << <blocks, threads >> >((__int8 *)a->data, (__int8 *)b->data, a->unitNum, scale2, shift2);
+        unsigned short scale2 = FloatToFloat16(scale);
+        unsigned short shift2 = FloatToFloat16(shift);
+        __half * scaleft16p = (__half*)&scale2;
+        __half * shiftft16p = (__half*)&shift2;
+        KernelScaleAndShift<<<blocks, threads>>>((__half*)a->data, (__half*)b->data, a->unitNum, *scaleft16p, *shiftft16p);
     }
     else{
         ShowNTErrors("TODO!");
......
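Both versions of the dispatch keep the isUnitScale/isZeroShift template flags on the DTYPE path: selecting the specialization at the call site lets the compiler delete the multiply and/or the add instead of testing scale and shift once per element. The pattern reduced to its core, with a hypothetical kernel name:

/* sketch: compile-time flags remove dead arithmetic in each specialization */
template<bool isUnitScale, bool isZeroShift>
__global__
void ScaleShiftSketch(float * a, float * b, int size, float scale, float shift)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < size) {
        float v = isUnitScale ? a[i] : a[i] * scale;   /* multiply elided when isUnitScale */
        b[i] = isZeroShift ? v : v + shift;            /* add elided when isZeroShift */
    }
}

Host code then branches once on the runtime values of scale and shift and launches the matching specialization, exactly as _CudaScaleAndShift does above.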