1. remove old interface of XNet

2. fix the bug of onehotandindex 3. modify the data initilization of transformer 4. add the DumpFormat (need reimplement)

1. remove old interface of XNet
2. fix the bug of onehotandindex 3. modify the data initilization of transformer 4. add the DumpFormat (need reimplement)
d3a0b984 · xuchen · a0aa3d49 · d3a0b984 · d3a0b984 · d3a0b984
Commit d3a0b984 authored Aug 05, 2019 by xuchen
--- a/source/network/XNet.cpp
+++ b/source/network/XNet.cpp
@@ -55,7 +55,7 @@ void XNetClearAll()
 XNet::XNet()
 {
    nodes.Clear();
-    isGradEfficient = false;
+    isGradEfficient = true;
 }
 /* de-constructor */
@@ -77,104 +77,20 @@ backward propagation to obtain gradient
 >> root - root node (output) of the network
 >> loss - name of loss function
 */
-void XNet::Backward(XTensor &root, LOSS_FUNCTION_NAME loss)
+void XNet::Backward(XTensor &root)
 {
    TensorList roots(1);
    roots.Add(&root);
-    TensorList golds(1);
+    Backward(roots);
-    golds.Add(NULL);
-    TensorList paddings(1);
-    paddings.Add(NULL);
-    Backward(roots, golds, paddings, loss);
-}
-/*
-backward propagation to obtain gradient wrt. the loss/error function
->> root - root node (output) of the network
->> gold - gold standard for the output
->> loss - name of loss function
-*/
-void XNet::Backward(XTensor &root, XTensor &gold, LOSS_FUNCTION_NAME loss)
-{
-    TensorList roots(1);
-    roots.Add(&root);
-    TensorList golds(1);
-    golds.Add(&gold);
-    TensorList paddings(1);
-    paddings.Add(NULL);
-    Backward(roots, golds, paddings, loss);
-}
-/* 
-backward propagation to obtain gradient wrt. the loss/error function 
->> root - root node (output) of the network
->> gold - gold standard for the output
->> padding - specify a target value that is ignored and does not contribute to the gradient computation
->> loss - name of loss function
-*/
-void XNet::Backward(XTensor &root, XTensor &gold, XTensor &padding, LOSS_FUNCTION_NAME loss)
-{
-    TensorList roots(1);
-    roots.Add(&root);
-    TensorList golds(1);
-    golds.Add(&gold);
-    TensorList paddings(1);
-    paddings.Add(&padding);
-    Backward(roots, golds, paddings, loss);
-}
-/*
-backward propagation to obtain gradient
-with a number of root nodes
->> roots - a list of root nodes (output) of the network
->> loss - name of loss function
-*/
-void XNet::Backward(TensorList &roots, LOSS_FUNCTION_NAME loss)
-{
-    TensorList golds(roots.count);
-    TensorList paddings(roots.count);
-    for (int i = 0; i < roots.count; i++) {
-        golds.Add(NULL);
-        paddings.Add(NULL);
-    }
-    Backward(roots, golds, paddings, loss);
-}
-/*
-backward propagation to obtain gradient
-with a number of root nodes
->> roots - a list of root nodes (output) of the network
->> golds - a list of gold standard for the output
->> loss - name of loss function
-*/
-void XNet::Backward(TensorList &roots, TensorList &golds, LOSS_FUNCTION_NAME loss)
-{
-    TensorList paddings(roots.count);
-    for (int i = 0; i < roots.count; i++)
-        paddings.Add(NULL);
-    Backward(roots, golds, paddings, loss);
 }
 /* 
 backward propagation to obtain gradient wrt. the loss/error function
 with a number of root nodes 
 >> roots - a list of root nodes (output) of the network
->> golds - a list of gold standard for the output
->> paddings - specify a target value that is ignored
->> loss - name of loss function
 */
-void XNet::Backward(TensorList &roots, TensorList &golds, TensorList &paddings, LOSS_FUNCTION_NAME loss)
+void XNet::Backward(TensorList &roots)
 {
    Traverse(roots);
@@ -187,39 +103,6 @@ void XNet::Backward(TensorList &roots, TensorList &golds, TensorList &paddings, 
        node->visitMark = NODE_UNFINISHED;
    }
-    //XLossGrad lossGrad;
-    /* we start with the gradient with respect to the loss for output layers */
-    /*for(int i = 0; i < roots.count; i++){
-        XTensor * root = (XTensor*)roots.Get(i);
-        XTensor * gold = (XTensor*)golds.Get(i);
-        XTensor * padding = (XTensor*)paddings.Get(i);
-        XLink &income = root->income;
-        int funcID = income.typeID;
-        void * params = income.params;*/
-        /* we compute dE/dx if the output is generated by an activation function y = f(x).
-           Note that we do not need to obtain dE/dy here because it is no use in the 
-           folloing process of back-propagation */
-        /*if(gold != NULL && income.tailNum == 1 && (funcID & FUNCTION_BASE)){
-            if(funcID == FUNC_LOGSOFTMAX || funcID == FUNC_SOFTMAX) {
-                XTensor * x = income.tails[0];
-                XNoder::MakeGrad(x);
-                lossGrad.Compute(gold, root, x, NULL, x->grad, padding, funcID, params, loss);
-                root->visitMark = NODE_FINISHED;
-            }
-            else {
-                XNoder::MakeGrad(root);
-                lossGrad.Compute(gold, root, root->grad, padding, loss);
-            }
-        }*/
-        /* we compuate dE/dy (y is the output) if no predefined activation function is used */
-        /*else{
-            XNoder::MakeGrad(root);
-            lossGrad.Compute(gold, root, root->grad, NULL, loss);
-        }
-    }*/
    /* back-propagation from output to input */
    for(int i = nodes.count - 1; i >= 0; i--){
        XTensor * node = (XTensor*)nodes.Get(i);
@@ -267,15 +150,10 @@ void XNet::BackwardNode(XTensor * node, bool isEfficent)
        else if(XShapeGrad::IsShapeOP(node))
            XShapeGrad::MakeGrad(node, isEfficent);
        else if(XLossGrad::IsLossOP(node))
-			XLossGrad::MakeGrad(node, isEfficent);
+            XLossGrad::MakeGrad(node, isEfficent);
        else{
            ShowNTErrors("Wrong node type!");
        }
-        //FILE *f = fopen("debug", "a");
-        //node->Dump(f, "node", 10);
-        //if (node->grad != NULL)
-        //    node->grad->Dump(f, "node->grad", 10);
    }
    else{
        node->visitMark = NODE_FINISHED;
@@ -465,7 +343,6 @@ void XNet::ShowNetwork(FILE * file, XTensor * node)
    }
 }
 /*
 search for a node in a top-down manner by its name
 >> top - the top most node
@@ -473,7 +350,7 @@ search for a node in a top-down manner by its name
 */
 //XTensor * XNet::SearchNode(XTensor * top, const char * name)
 //{
-	//return XLink::SearchNode(top, name);
+    //return XLink::SearchNode(top, name);
 //}
 }
--- a/source/network/XNet.h
+++ b/source/network/XNet.h
@@ -61,25 +61,11 @@ struct XNet
    void Clear();
    /* backward propagation to obtain gradient */
-    void Backward(XTensor &root, LOSS_FUNCTION_NAME loss = NOLOSS);
+    void Backward(XTensor &root);
-    /* backward propagation to obtain gradient wrt. the loss/error function */
-    void Backward(XTensor &root, XTensor &gold, LOSS_FUNCTION_NAME loss = NOLOSS);
-    /* backward propagation to obtain gradient wrt. the loss/error function */
-    void Backward(XTensor &root, XTensor &gold, XTensor &padding, LOSS_FUNCTION_NAME loss = NOLOSS);
-    /* backward propagation to obtain gradient
-       with a number of root nodes */
-    void Backward(TensorList &roots, LOSS_FUNCTION_NAME loss = NOLOSS);
-    /* backward propagation to obtain gradient
-       with a number of root nodes */
-    void Backward(TensorList &roots, TensorList &golds, LOSS_FUNCTION_NAME loss = NOLOSS);
    /* backward propagation to obtain gradient wrt. the loss/error function
       with a number of root nodes */
-    void Backward(TensorList &roots, TensorList &golds, TensorList &paddings, LOSS_FUNCTION_NAME loss = NOLOSS);
+    void Backward(TensorList &roots);
    /* backward computation for a given node */
    void BackwardNode(XTensor * node, bool isEfficent = false);

--- a/source/sample/fnnlm/FNNLM.cpp
+++ b/source/sample/fnnlm/FNNLM.cpp
@@ -68,8 +68,8 @@ void Read(const char * fn, FNNModel &model);
 void Test(const char * test, const char * result, FNNModel &model);
 int  LoadNGrams(FILE * file, int n, NGram * ngrams, int sentNum, int wordNum);
 void InitZeroOneTensor2D(XTensor &tensor, int rowNum, int colNum, int * rows, int * cols, 
-                         int itemNum, int devID, XMem * mem);
+                         int itemNum, int devID);
-void MakeWordBatch(XTensor &batch, NGram * ngrams, int ngramNum, int n, int vSize, int devID, XMem * mem);
+void MakeWordBatch(XTensor &batch, NGram * ngrams, int ngramNum, int n, int vSize, int devID);
 void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net);
 void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NAME loss, 
              FNNModel &model, FNNModel &grad, FNNNet &net);
@@ -229,11 +229,6 @@ void LoadArgs(int argc, const char ** argv, FNNModel &model)
            fprintf(stderr, " -dev=%d\n", model.devID);
        }
    }
-    for(int i = 0; i < argc; i++){
-        if (!strcmp(argv[i], "-mem"))
-            model.mem = new XMem(model.devID, FREE_ON_THE_FLY, 256 * MILLION, 512, 256 * MILLION);
-    }
 }
 /* check model settings */
@@ -262,11 +257,6 @@ void Copy(FNNModel &tgt, FNNModel &src)
    tgt.vSize = src.vSize;
    tgt.devID = src.devID;
    tgt.useMemPool = src.useMemPool;
-    if(src.mem != NULL){
-        tgt.mem = new XMem(src.mem->devID, src.mem->mode, 
-                           src.mem->maxBlockSize, src.mem->blockNum, 
-                           src.mem->bufSize);
-    }
 }
 /* 
@@ -332,7 +322,7 @@ void Init(FNNModel &model)
    /* create embedding parameter matrix: vSize * eSize */
    InitModelTensor2D(model.embeddingW, model.vSize, model.eSize, model);
    model.embeddingW.SetVarFlag();
    /* create hidden layer parameter matrics */
    for(int i = 0; i < model.hDepth; i++){
        /* hidden layer parameter matrix: (n-1)eSize * hsize if it is the first layer
@@ -351,9 +341,8 @@ void Init(FNNModel &model)
    /* create the output layer parameter matrix and bias term */
    int iSize = model.hDepth == 0 ? (model.n - 1) * model.eSize : model.hSize;
    InitModelTensor2D(model.outputW, iSize, model.vSize, model);
-    model.outputW.SetVarFlag();
    InitModelTensor1D(model.outputB, model.vSize, model);
+    model.outputW.SetVarFlag();
    model.outputB.SetVarFlag();
    /* then, we initialize model parameters using a uniform distribution in range
@@ -460,10 +449,10 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
            /* make the input tensor for position i */
            for(int i = 0; i < model.n - 1; i++)
-                MakeWordBatch(inputs[i], ngrams, ngramNum, i, model.vSize, model.devID, model.mem);
+                MakeWordBatch(inputs[i], ngrams, ngramNum, i, model.vSize, model.devID);
            /* make the gold tensor */
-            MakeWordBatch(gold, ngrams, ngramNum, model.n - 1, model.vSize, model.devID, model.mem);
+            MakeWordBatch(gold, ngrams, ngramNum, model.n - 1, model.vSize, model.devID);
            if(!autoDiff){
                /* prepare an empty network for building the fnn */
@@ -490,7 +479,11 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
                Clear(model, true);
                /* forward + backward process */
+                /* this is implemented by gather function */
                ForwardAutoDiff(ngrams, ngramNum, output, model);
+                /* this is implemented by multiply function */
                lossTensor = CrossEntropy(output, gold);
                /* automatic differentiation */
@@ -719,10 +712,9 @@ The indexed cell is set to 1, and 0 otherwise.
 >> cols - column index
 >> itemNum - number of non-zero items
 >> devID - device id
->> mem - memory pool
 */
 void InitZeroOneTensor2D(XTensor &tensor, int rowNum, int colNum, int * rows, int * cols, 
-                         int itemNum, int devID, XMem * mem)
+                         int itemNum, int devID)
 {
    InitTensor2DV2(&tensor, rowNum, colNum, X_FLOAT, devID);
@@ -741,9 +733,8 @@ make a tensor that encodes a batch of words
 >> n - indicate which word is encode for each ngram
 >> vSize - vocabulary size
 >> devID - device id
->> mem - memory pool
 */
-void MakeWordBatch(XTensor &batch, NGram * ngrams, int ngramNum, int n, int vSize, int devID, XMem * mem)
+void MakeWordBatch(XTensor &batch, NGram * ngrams, int ngramNum, int n, int vSize, int devID)
 {
    int * rows = new int[ngramNum];
    int * cols = new int[ngramNum];
@@ -753,7 +744,7 @@ void MakeWordBatch(XTensor &batch, NGram * ngrams, int ngramNum, int n, int vSiz
        cols[i] = ngrams[i].words[n];
    }
-    InitZeroOneTensor2D(batch, ngramNum, vSize, rows, cols, ngramNum, devID, mem);
+    InitZeroOneTensor2D(batch, ngramNum, vSize, rows, cols, ngramNum, devID);
    delete[] rows;
    delete[] cols;
@@ -1162,10 +1153,10 @@ void Test(const char * test, const char * result, FNNModel &model)
        /* make the input tensor for position i */
        for (int i = 0; i < model.n - 1; i++)
-            MakeWordBatch(inputs[i], ngrams, ngramNum, i, model.vSize, model.devID, model.mem);
+            MakeWordBatch(inputs[i], ngrams, ngramNum, i, model.vSize, model.devID);
        /* make the gold tensor */
-        MakeWordBatch(gold, ngrams, ngramNum, model.n - 1, model.vSize, model.devID, model.mem);
+        MakeWordBatch(gold, ngrams, ngramNum, model.n - 1, model.vSize, model.devID);
        if (!autoDiff) {
            /* prepare an empty network for building the fnn */
@@ -1174,8 +1165,8 @@ void Test(const char * test, const char * result, FNNModel &model)
            /* forward computation */
            Forward(inputs, output, model, net);
        }
-        else {			
+        else {            
-			/* this is implemented by gather function */
+            /* this is implemented by gather function */
            ForwardAutoDiff(ngrams, ngramNum, output, model);
            output = Log(output);

--- a/source/sample/transformer/T2TAttention.cpp
+++ b/source/sample/transformer/T2TAttention.cpp
@@ -51,14 +51,12 @@ initialize the model
 >> myIgnored - number of position ignored in attention (from the begining)
 >> myIsMasked - indicates whether the attention is with a mask
 >> myDevID - device id
->> myMem - the memory pool
 */
 void T2TAttention::InitModel(int argc, char ** argv, 
                             bool myIsMasked, int myIgnored, 
-                             int myDevID, XMem * myMem)
+                             int myDevID)
 {
    devID = myDevID;
-    mem = myMem;
    isMasked = myIsMasked;
    ignored = myIgnored;
@@ -71,23 +69,18 @@ void T2TAttention::InitModel(int argc, char ** argv,
    LoadParamFloat(argc, argv, "attminmax", &minmax, 0.1F);
    LoadParamFloat(argc, argv, "dropoutatt", &dropoutP, 0);
-    InitTensor2D(&wk, d, dk, X_FLOAT, devID, mem);
+    InitTensor2DV2(&wk, d, dk, X_FLOAT, devID);
-    InitTensor2D(&wq, d, dk, X_FLOAT, devID, mem);
+    InitTensor2DV2(&wq, d, dk, X_FLOAT, devID);
-    InitTensor2D(&wv, d, dv, X_FLOAT, devID, mem);
+    InitTensor2DV2(&wv, d, dv, X_FLOAT, devID);
-    InitTensor2D(&wa, d, d, X_FLOAT, devID, mem);
+    InitTensor2DV2(&wa, d, d, X_FLOAT, devID);
-    InitTensor2D(&wbig, d, 3 * d, X_FLOAT, devID, mem);
+    InitTensor2DV2(&wbig, d, 3 * d, X_FLOAT, devID);
    float scale = 1.0F;
-    float finfoutk = (float)sqrt(6.0F * scale/(d + dk));
+    _SetDataFanInOut(&wk, scale);
-    float finfoutv = (float)sqrt(6.0F * scale/(d + dv));
+    _SetDataFanInOut(&wq, scale);
-    float finfouta = (float)sqrt(6.0F * scale / (d + d));
+    _SetDataFanInOut(&wv, scale);
-    float finfoutbig = (float)sqrt(6.0F * scale / (d + 3*d));
+    _SetDataFanInOut(&wa, scale);
+    _SetDataFanInOut(&wbig, scale);
-    wk.SetDataRand(-finfoutk, finfoutk);
-    wq.SetDataRand(-finfoutk, finfoutk);
-    wv.SetDataRand(-finfoutv, finfoutv);
-    wa.SetDataRand(-finfouta, finfouta);
-    wbig.SetDataRand(-finfoutbig, finfoutbig);
 }
 /* 
@@ -135,9 +128,9 @@ XTensor T2TAttention::MakeBig(XTensor &kqv, XTensor &mask, bool isTraining)
    int d2 = kqv2.GetDim(1);
    int d3 = kqv2.GetDim(2) / 3;
-    InitTensor3D(&k2, d1, d2, d3, X_FLOAT, devID, mem);
+    InitTensor3DV2(&k2, d1, d2, d3, X_FLOAT, devID);
-    InitTensor3D(&q2, d1, d2, d3, X_FLOAT, devID, mem);
+    InitTensor3DV2(&q2, d1, d2, d3, X_FLOAT, devID);
-    InitTensor3D(&v2, d1, d2, d3, X_FLOAT, devID, mem);
+    InitTensor3DV2(&v2, d1, d2, d3, X_FLOAT, devID);
    split.Add(&q2);
    split.Add(&k2);

--- a/source/sample/transformer/T2TAttention.h
+++ b/source/sample/transformer/T2TAttention.h
@@ -42,9 +42,6 @@ public:
    /* device id */
    int devID;
-    /* memory pool */
-    XMem * mem;
    /* head number */
    int nhead;
@@ -61,7 +58,7 @@ public:
    XTensor wa;
    XTensor wbig;
    /* size of transformed Q and K */
    int dk;
@@ -94,7 +91,7 @@ public:
    /* initialize the model */
    void InitModel(int argc, char ** argv, 
                   bool myIsMasked, int myIgnored, 
-                   int myDevID = -1, XMem * myMem = NULL);
+                   int myDevID = -1);
    /* make the network */
    XTensor Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining);

--- a/source/sample/transformer/T2TBatchLoader.cpp
+++ b/source/sample/transformer/T2TBatchLoader.cpp
@@ -86,7 +86,7 @@ struct SampleNode
    int * p;
    int size;
    int value;
-	int key;
+    int key;
 };
 int CompareSampleNode(const void * a, const void * b)
@@ -280,7 +280,6 @@ load a batch of sequences
 >> isSorted - indicates whether the sequences are sorted by length
 >> wCount - word count
 >> devID - device id
->> mem - memory pool
 >> isTraining - indicates whether we are training the model
 */
 int T2TBatchLoader::LoadBatch(FILE * file, bool isLM, 
@@ -290,18 +289,17 @@ int T2TBatchLoader::LoadBatch(FILE * file, bool isLM,
                          int * seqs,
                          int vsEnc, int vsDec, int sBatch, int wBatch, 
                          bool isSorted, int &ws, int &wCount,
-                          int devID, XMem * mem, 
+                          int devID, bool isTraining)
-						  bool isTraining)
 {
    if(isLM){
        return LoadBatchLM(file, batchEnc, paddingEnc, batchDec, paddingDec, gold, label,
                           seqs, vsEnc, sBatch, wBatch, 
-                           isSorted, wCount, devID, mem, isTraining);
+                           isSorted, wCount, devID, isTraining);
    }
    else{
        return LoadBatchMT(file, batchEnc, paddingEnc, batchDec, paddingDec, gold, label,
                           seqs, vsEnc, vsDec, sBatch, wBatch, 
-                           isSorted, ws, wCount, devID, mem, isTraining);
+                           isSorted, ws, wCount, devID, isTraining);
    }
 }
@@ -322,7 +320,6 @@ load a batch of sequences (for LM)
 >> isSorted - indicates whether the sequences are sorted by length
 >> wCount - word count
 >> devID - device id
->> mem - memory pool
 >> isTraining - indicates whether we are training the model
 */
 int T2TBatchLoader::LoadBatchLM(FILE * file, 
@@ -332,8 +329,7 @@ int T2TBatchLoader::LoadBatchLM(FILE * file,
                            int * seqs,
                            int vSize, int sBatch, int wBatch, 
                            bool isSorted, int &wCount,
-                            int devID, XMem * mem,
+                            int devID, bool isTraining)
-							bool isTraining)
 {
    if(nextSeq < 0 || nextSeq >= nseqBuf)
        LoadBuf(file, isSorted, 1);
@@ -369,11 +365,11 @@ int T2TBatchLoader::LoadBatchLM(FILE * file,
    dims[1] = max;
    dims[2] = vSize;
-    InitTensor2D(batchEnc, sc, max, X_INT, devID, mem);
+    InitTensor2DV2(batchEnc, sc, max, X_INT, devID);
-    InitTensor2D(label, sc, max, X_INT, devID, mem);
+    InitTensor2DV2(label, sc, max, X_INT, devID);
-    InitTensor(gold, 3, dims, X_FLOAT, 1.0F, devID, mem);
+    InitTensorV2(gold, 3, dims, X_FLOAT, devID);
-    InitTensor2D(paddingEnc, sc, max, X_FLOAT, devID, mem);
+    InitTensor2DV2(paddingEnc, sc, max, X_FLOAT, devID);
-    InitTensor2D(paddingDec, sc, max, X_FLOAT, devID, mem);
+    InitTensor2DV2(paddingDec, sc, max, X_FLOAT, devID);
    batchEnc->SetZeroAll();
    label->SetZeroAll();
@@ -437,12 +433,12 @@ int T2TBatchLoader::LoadBatchLM(FILE * file,
    paddingEnc->SetDataBatched(paddingEncOffsets, 1.0F, wCount);
    paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCount);
-    /*XTensor * tmp = NewTensorBuf(paddingEnc, devID, mem);
+    /*XTensor * tmp = NewTensorBufV2(paddingEnc, devID);
    _ConvertDataType(batchEnc, tmp);
    _NotEqual(tmp, paddingEnc, 0);
    DelTensorBuf(tmp);
-    XTensor * tmp2 = NewTensorBuf(paddingDec, devID, mem);
+    XTensor * tmp2 = NewTensorBufV2(paddingDec, devID);
    _ConvertDataType(batchEnc, tmp2);
    _NotEqual(tmp2, paddingDec, 0);
    DelTensorBuf(tmp2);*/
@@ -481,7 +477,6 @@ load a batch of sequences (for MT)
 >> isSorted - indicates whether the sequences are sorted by length
 >> wCount - word count
 >> devID - device id
->> mem - memory pool
 >> isTraining - indicates whether we are training the model
 */
 int T2TBatchLoader::LoadBatchMT(FILE * file, 
@@ -491,8 +486,7 @@ int T2TBatchLoader::LoadBatchMT(FILE * file,
                            int * seqs,
                            int vSizeEnc, int vSizeDec, int sBatch, int wBatch, 
                            bool isSorted, int &ws, int &wCount,
-                            int devID, XMem * mem, 
+                            int devID, bool isTraining)
-							bool isTraining)
 {
    if (nextBatch < 0 || nextBatch >= bufBatchSize) {
        LoadBuf(file, isSorted, 2);
@@ -569,12 +563,12 @@ int T2TBatchLoader::LoadBatchMT(FILE * file,
    int sCount = sc/2;
    int seqSize = 0;
-    InitTensor2D(batchEnc, sCount, maxEnc, X_INT, devID, mem);
+    InitTensor2DV2(batchEnc, sCount, maxEnc, X_INT, devID);
-    InitTensor2D(paddingEnc, sCount, maxEnc, X_FLOAT, devID, mem);
+    InitTensor2DV2(paddingEnc, sCount, maxEnc, X_FLOAT, devID);
-    InitTensor2D(batchDec, sCount, maxDec, X_INT, devID, mem);
+    InitTensor2DV2(batchDec, sCount, maxDec, X_INT, devID);
-    InitTensor2D(paddingDec, sCount, maxDec, X_FLOAT, devID, mem);
+    InitTensor2DV2(paddingDec, sCount, maxDec, X_FLOAT, devID);
-    InitTensor2D(label, sCount, maxDec, X_INT, devID, mem);
+    InitTensor2DV2(label, sCount, maxDec, X_INT, devID);
-    //InitTensor(gold, 3, dimsDec, X_FLOAT, 1.0F, devID, mem);
+    //InitTensorV2(gold, 3, dimsDec, X_FLOAT, devID);
    batchEnc->SetZeroAll();
    paddingEnc->SetZeroAll();
@@ -613,7 +607,7 @@ int T2TBatchLoader::LoadBatchMT(FILE * file,
    ws = wCountEnc;
    batchEnc->SetData(batchEncValues, batchEnc->unitNum);
    paddingEnc->SetDataBatched(paddingEncOffsets, 1.0F, wCountEnc);
-    //XTensor * tmp = NewTensorBuf(paddingEnc, devID, mem);
+    //XTensor * tmp = NewTensorBufV2(paddingEnc, devID);
    //_ConvertDataType(batchEnc, tmp);
    //tmp->Dump(stderr, "tmp:");
    //_NotEqual(tmp, paddingEnc, 0);
@@ -662,7 +656,7 @@ int T2TBatchLoader::LoadBatchMT(FILE * file,
    label->SetData(labelValues, label->unitNum);
    paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCountPad);
-    //XTensor * tmp2 = NewTensorBuf(paddingDec, devID, mem);
+    //XTensor * tmp2 = NewTensorBufV2(paddingDec, devID);
    //_ConvertDataType(batchDec, tmp2);
    //_NotEqual(tmp2, paddingDec, 0);
    //DelTensorBuf(tmp2);

--- a/source/sample/transformer/T2TBatchLoader.h
+++ b/source/sample/transformer/T2TBatchLoader.h
@@ -131,8 +131,7 @@ public:
                  int * seqs,
                  int vsEnc, int vsDec, int sBatch, int wBatch, 
                  bool isSorted, int &ws, int &wCount,
-                  int devID, XMem * mem, 
+                  int devID, bool isTraining);
-				  bool isTraining);
    /* load a batch of sequences (for language modeling) */
    int LoadBatchLM(FILE * file, 
@@ -141,8 +140,7 @@ public:
                    XTensor * gold, XTensor * label,
                    int * seqs, int vs, int sBatch, int wBatch, 
                    bool isSorted, int &wCount,
-                    int devID, XMem * mem, 
+                    int devID, bool isTraining);
-					bool isTraining);
    /* load a batch of sequences (for machine translation) */
    int LoadBatchMT(FILE * file, 
@@ -151,8 +149,7 @@ public:
                    XTensor * gold, XTensor * label,
                    int * seqs, int vsEnc, int vsDec, int sBatch, int wBatch, 
                    bool isSorted, int &ws, int &wCount,
-                    int devID, XMem * mem, 
+                    int devID, bool isTraining);
-					bool isTraining);
    /* shuffle the data file */
    void Shuffle(const char * srcFile, const char * tgtFile);

--- a/source/sample/transformer/T2TDecoder.cpp
+++ b/source/sample/transformer/T2TDecoder.cpp
@@ -57,16 +57,14 @@ initialize the model
 >> myIsMasked - indicates whether the masked attention is employed
 >> myIgnored - number of positions ignored in attention (from the start)
 >> myDevID - device id
->> myMem - the memory pool
 */
 void AttDecoder::InitModel(int argc, char ** argv, 
                           bool myIsMasked, int myIgnored, 
-                           int myDevID, XMem * myMem)
+                           int myDevID)
 {
-    //AttEncoder::InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
+    //AttEncoder::InitModel(argc, argv, myIsMasked, myIgnored, myDevID);
    devID = myDevID;
-    mem = myMem;
    ignored = myIgnored;
    LoadParamInt(argc, argv, "nlayer", &nlayer, 6);
@@ -79,7 +77,7 @@ void AttDecoder::InitModel(int argc, char ** argv,
    CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsizetgt\"");
    /* embedding model */
-    embedder.InitModel(argc, argv, devID, mem, false);
+    embedder.InitModel(argc, argv, devID, false);
    attentions = new T2TAttention[nlayer];
    fnns = new T2TFNN[nlayer];
@@ -90,12 +88,12 @@ void AttDecoder::InitModel(int argc, char ** argv,
    /* initialize the stacked layers */
    for (int i = 0; i < nlayer; i++) {
-        attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
+        attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID);
-        fnns[i].InitModel(argc, argv, myDevID, myMem);
+        fnns[i].InitModel(argc, argv, myDevID);
-        attLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
+        attLayerNorms[i].InitModel(argc, argv, myDevID);
-        fnnLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
+        fnnLayerNorms[i].InitModel(argc, argv, myDevID);
-        attentionsEnde[i].InitModel(argc, argv, true, myIgnored, myDevID, myMem);
+        attentionsEnde[i].InitModel(argc, argv, true, myIgnored, myDevID);
-        attEndeLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
+        attEndeLayerNorms[i].InitModel(argc, argv, myDevID);
    }
 }

--- a/source/sample/transformer/T2TDecoder.h
+++ b/source/sample/transformer/T2TDecoder.h
@@ -37,9 +37,6 @@ public:
    /* device id */
    int devID;
-    /* memory pool */
-    XMem * mem;
    /* layer number */
    int nlayer;
@@ -95,7 +92,7 @@ public:
    /* initialize the model */
    void InitModel(int argc, char ** argv, 
                   bool myIsMasked, int myIgnored, 
-                   int myDevID = -1, XMem * myMem = NULL);
+                   int myDevID = -1);
    /* make the decoding network */
    XTensor Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, XTensor &maskEncDec, bool isTraining);

--- a/source/sample/transformer/T2TEmbedding.cpp
+++ b/source/sample/transformer/T2TEmbedding.cpp
@@ -31,7 +31,6 @@ namespace transformer
 T2TEmbedder::T2TEmbedder()
 {
    devID = -1;
-    mem = NULL;
    vSize = -1;
    maxLength = -1;
 }
@@ -46,12 +45,10 @@ initialize the model
 >> argc - number of arguments
 >> argv - list of pointers to the arguments
 >> myDevID - device id
->> myMem - the memory pool
 */
-void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem, bool isEnc)
+void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, bool isEnc)
 {
    devID = myDevID;
-    mem = myMem;
    if(isEnc){
        LoadParamInt(argc, argv, "vsize", &vSize, -1);
@@ -64,7 +61,7 @@ void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem, b
    LoadParamInt(argc, argv, "d", &eSize, DEFAULT_EMBEDDING_SIZE);
    LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
-    InitTensor2D(&w, vSize, eSize, X_FLOAT, devID, mem);
+    InitTensor2DV2(&w, vSize, eSize, X_FLOAT, devID);
    DTYPE v = 1.0F/(float)sqrt((float)eSize);
    w.SetDataRandn(0, v);
@@ -81,7 +78,7 @@ make positional embeddings (of size eSize * length)
 */
 void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length)
 {
-    InitTensor2D(&posEmbeddingBase, length, eSize, X_FLOAT, devID, mem);
+    InitTensor2DV2(&posEmbeddingBase, length, eSize, X_FLOAT, devID);
    float * data = new float[posEmbeddingBase.unitNum];
@@ -145,9 +142,9 @@ XTensor T2TEmbedder::Make(XTensor &input)
    /* we make positional embeddings first */
    //if(!match){
    if(true){
-        InitTensor(&posEmbedding, input.order + 1, dims, X_FLOAT, 1.0F, devID, mem);
+        InitTensorV2(&posEmbedding, input.order + 1, dims, X_FLOAT, devID);
-        XTensor * posTMP = NewTensorBuf(2, dims + 1, X_FLOAT, 1.0F, devID, mem);
+        XTensor * posTMP = NewTensorBufV2(2, dims + 1, X_FLOAT, devID);
        _CopyValues(&posEmbeddingBase, 0, posTMP->unitNum, posTMP, 0);
        _Unsqueeze(posTMP, &posEmbedding, 0, dims[0]);

--- a/source/sample/transformer/T2TEmbedding.h
+++ b/source/sample/transformer/T2TEmbedding.h
@@ -41,9 +41,6 @@ public:
    /* device id */
    int devID;
-    /* memory pool */
-    XMem * mem;
    /* vocabulary size */
    int vSize;
@@ -71,7 +68,7 @@ public:
    ~T2TEmbedder();
    /* initialize the model */
-    void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL, bool isEnc = true);
+    void InitModel(int argc, char ** argv, int myDevID = -1, bool isEnc = true);
    /* make positional embeddings */
    void MakePosEmbedding(int eSize, int d, int length);

--- a/source/sample/transformer/T2TEncoder.cpp
+++ b/source/sample/transformer/T2TEncoder.cpp
@@ -52,15 +52,12 @@ initialize the model
 >> argv - list of pointers to the arguments
 >> myIsMasked - indicates whether the masked attention is employed
 >> myIgnored - number of positions ignored in attention (from the start)
->> myDevID - device id
+>> myDevID - device id*/
->> myMem - the memory pool
-*/
 void AttEncoder::InitModel(int argc, char ** argv, 
                           bool myIsMasked, int myIgnored, 
-                           int myDevID, XMem * myMem)
+                           int myDevID)
 {
    devID = myDevID;
-    mem = myMem;
    ignored = myIgnored;
    LoadParamInt(argc, argv, "nlayer", &nlayer, 6);
@@ -73,7 +70,7 @@ void AttEncoder::InitModel(int argc, char ** argv,
    CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsize\"");
    /* embedding model */
-    embedder.InitModel(argc, argv, devID, mem);
+    embedder.InitModel(argc, argv, devID);
    attentions = new T2TAttention[nlayer];
    fnns = new T2TFNN[nlayer];
@@ -82,10 +79,10 @@ void AttEncoder::InitModel(int argc, char ** argv,
    /* initialize the stacked layers */
    for(int i = 0; i < nlayer; i++){
-        attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
+        attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID);
-        fnns[i].InitModel(argc, argv, myDevID, myMem);
+        fnns[i].InitModel(argc, argv, myDevID);
-        attLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
+        attLayerNorms[i].InitModel(argc, argv, myDevID);
-        fnnLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
+        fnnLayerNorms[i].InitModel(argc, argv, myDevID);
    }
 }

--- a/source/sample/transformer/T2TEncoder.h
+++ b/source/sample/transformer/T2TEncoder.h
@@ -65,9 +65,6 @@ public:
    /* device id */
    int devID;
-    /* memory pool */
-    XMem * mem;
    /* layer number */
    int nlayer;
@@ -118,7 +115,7 @@ public:
    /* initialize the model */
    void InitModel(int argc, char ** argv, 
                   bool myIsMasked, int myIgnored, 
-                   int myDevID = -1, XMem * myMem = NULL);
+                   int myDevID = -1);
    /* make the encoding network */
    XTensor Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, bool isTraining);

--- a/source/sample/transformer/T2TFNN.cpp
+++ b/source/sample/transformer/T2TFNN.cpp
@@ -47,12 +47,10 @@ initialize the model
 >> argc - number of arguments
 >> argv - list of pointers to the arguments
 >> myDevID - device id
->> myMem - the memory pool
 */
-void T2TFNN::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
+void T2TFNN::InitModel(int argc, char ** argv, int myDevID)
 {
    devID = myDevID;
-    mem = myMem;
    float minmax = 0;
@@ -62,19 +60,17 @@ void T2TFNN::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
    LoadParamFloat(argc, argv, "fnnminmax", &minmax, 0.1F);
    LoadParamFloat(argc, argv, "dropoutfnn", &dropoutP, 0);
-    InitTensor2D(&w1, inSize, hSize, X_FLOAT, devID, mem);
+    InitTensor2DV2(&w1, inSize, hSize, X_FLOAT, devID);
-    InitTensor1D(&b1, hSize, X_FLOAT, devID, mem);
+    InitTensor1DV2(&b1, hSize, X_FLOAT, devID);
-    InitTensor2D(&w2, hSize, outSize, X_FLOAT, devID, mem);
+    InitTensor2DV2(&w2, hSize, outSize, X_FLOAT, devID);
-    InitTensor1D(&b2, outSize, X_FLOAT, devID, mem);
+    InitTensor1DV2(&b2, outSize, X_FLOAT, devID);
    float scale = 1.0F;
-    float finfout1 = (float)sqrt(6.0F * scale/(inSize + hSize));
+    _SetDataFanInOut(&w1, scale);
-    float finfout2 = (float)sqrt(6.0F * scale/(hSize + outSize));
+    _SetDataFanInOut(&w2, scale);
-    w1.SetDataRand(-finfout1, finfout1);
    b1.SetZeroAll();
-    w2.SetDataRand(-finfout2, finfout2);
    b2.SetZeroAll();
 }

--- a/source/sample/transformer/T2TFNN.h
+++ b/source/sample/transformer/T2TFNN.h
@@ -36,9 +36,6 @@ public:
    /* device id */
    int devID;
-    /* memory pool */
-    XMem * mem;
    /* size of input vector */
    int inSize;
@@ -72,7 +69,7 @@ public:
    ~T2TFNN();
    /* initialize the model */
-    void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL);
+    void InitModel(int argc, char ** argv, int myDevID = -1);
    /* make the network */
    XTensor Make(XTensor &input, bool isTraining);

--- a/source/sample/transformer/T2TLayerNormal.cpp
+++ b/source/sample/transformer/T2TLayerNormal.cpp
@@ -32,7 +32,6 @@ namespace transformer
 T2TLN::T2TLN()
 {
    devID = -1;
-    mem = NULL;
    d = 0;
 }
@@ -46,18 +45,16 @@ initialize the model
 >> argc - number of arguments
 >> argv - list of pointers to the arguments
 >> myDevID - device id
->> myMem - the memory pool
 */
-void T2TLN::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
+void T2TLN::InitModel(int argc, char ** argv, int myDevID)
 {
    devID = myDevID;
-    mem = myMem;
    d = 0;
    LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
-    InitTensor1D(&w, d, X_FLOAT, devID, mem);
+    InitTensor1DV2(&w, d, X_FLOAT, devID);
-    InitTensor1D(&b, d, X_FLOAT, devID, mem);
+    InitTensor1DV2(&b, d, X_FLOAT, devID);
    w.SetDataRand(1.0F, 1.0F);
    b.SetZeroAll();

--- a/source/sample/transformer/T2TLayerNormal.h
+++ b/source/sample/transformer/T2TLayerNormal.h
@@ -36,9 +36,6 @@ class T2TLN
 public:
    /* device id */
    int devID;
-    /* memory pool */
-    XMem * mem;
    /* the transformation matrix w */
    XTensor w;
@@ -57,7 +54,7 @@ public:
    ~T2TLN();
    /* initialize the model */
-    void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL);
+    void InitModel(int argc, char ** argv, int myDevID = -1);
    /* make the network */
    XTensor Make(XTensor &input);

--- a/source/sample/transformer/T2TModel.cpp
+++ b/source/sample/transformer/T2TModel.cpp
@@ -32,7 +32,6 @@ namespace transformer
 T2TModel::T2TModel()
 {
    devID = -1;
-    mem = NULL;
    isLM = false;
    isMT = false;
    nhead = 1;
@@ -48,10 +47,6 @@ T2TModel::~T2TModel()
    delete encoder;
    delete decoder;
    delete outputLayer;
-    /* we delete "mem" at the end because other members are using it and we must
-       remove the memory space before all tensors are destroyed. */
-    delete mem;
 }
 /* 
@@ -61,29 +56,16 @@ initialize the model
 */
 void T2TModel::InitModel(int argc, char ** argv)
 {
-    bool useMem = false;
-    int memSize = 0;
-    bool isMemFreeOTF = false;
    LoadParamInt(argc, argv, "dev", &devID, -1);
-    LoadParamBool(argc, argv, "mem", &useMem, useMem);
-    LoadParamInt(argc, argv, "memsize", &memSize, 1024);
    LoadParamBool(argc, argv, "mt", &isMT, false);
    LoadParamBool(argc, argv, "lm", &isLM, !isMT);
    LoadParamInt(argc, argv, "nhead", &nhead, 8);
-    LoadParamBool(argc, argv, "freeotf", &isMemFreeOTF, false);
-    if(useMem){
-        delete mem;
-        mem = new XMem(devID, FREE_ON_THE_FLY, (MTYPE)MILLION * 256, 1024, MILLION * 128);
-        mem->SetDesiredSize(devID, 0, (MTYPE)memSize * MILLION);
-    }
-    encoder->InitModel(argc, argv, true, 0, devID, mem);
+    encoder->InitModel(argc, argv, true, 0, devID);
-    outputLayer->InitModel(argc, argv, devID, mem);
+    outputLayer->InitModel(argc, argv, devID);
    if(isMT)
-        decoder->InitModel(argc, argv, true, 0, devID, mem);
+        decoder->InitModel(argc, argv, true, 0, devID);
    TensorList params(10);
    GetParams(params);
@@ -149,7 +131,8 @@ void T2TModel::MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool is
        dims[i + 1] = input.GetDim(i);
    dims[0] = nhead;
    dims[input.order + 1] = len;
-    XTensor mask(input.order + 2, dims, X_FLOAT, 1.0F, padding.devID, padding.mem);
+    XTensor mask;
+    InitTensorV2(&mask, input.order + 2, dims, X_FLOAT, padding.devID);
    /* a upper triangular matrix where the cells of the upper triangular are set to -1e-9.
        this matrix can be used to prevent the attention to current or following words in
@@ -163,15 +146,15 @@ void T2TModel::MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool is
    dimsPadding[padding.order - 1] = padding.GetDim(-1);
    dimsPadding[padding.order] = padding.GetDim(-1);
-    XTensor * padding2 = NewTensorBuf(padding.order + 1, dimsPadding, padding.dataType,
+    XTensor * padding2 = NewTensorBufV2(padding.order + 1, dimsPadding, padding.dataType,
-                                      padding.denseRatio, padding.devID, padding.mem);
+                                        padding.devID);
    for(int i = 0; i < padding2->order; i++)
        dimsPadding[i + 1] = padding2->GetDim(i);
    dimsPadding[0] = nhead;
-    //XTensor * padding3 = NewTensorBuf(padding.order + 2, dimsPadding, padding.dataType,
+    //XTensor * padding3 = NewTensorBufV2(padding.order + 2, dimsPadding, padding.dataType,
-    //                                  padding.denseRatio, padding.devID, padding.mem);
+    //                                    padding.devID);
    //    
    ///* mask of the padding */
    //_Unsqueeze(&padding, padding2, padding.order - 1, padding.GetDim(-1));
@@ -241,7 +224,7 @@ void T2TModel::MakeMTMask(XTensor &inputEnc,   XTensor &inputDec,
        dims[i + 1] = inputDec.GetDim(i);
    dims[0] = nhead;
    dims[inputDec.order + 1] = len;
-    InitTensor(&maskDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingDec.devID, paddingDec.mem);
+    InitTensorV2(&maskDec, inputDec.order + 2, dims, X_FLOAT, paddingDec.devID);
    /* an upper triangular matrix where the cells of the upper triangular are set to -1e-9.
       this matrix can be used to prevent the attention to current or following words in
@@ -251,11 +234,11 @@ void T2TModel::MakeMTMask(XTensor &inputEnc,   XTensor &inputDec,
    /* encoder-decoder mask that prevents the attention to padding dummy words */
    dims[inputDec.order + 1] = inputEnc.GetDim(inputEnc.order - 1);
-    InitTensor(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingEnc.devID, paddingEnc.mem);
+    InitTensorV2(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, paddingEnc.devID);
-    XTensor * maskEncDecTMPEnc = NewTensorBuf(paddingEnc.order + 1, dims + 1, paddingEnc.dataType,
+    XTensor * maskEncDecTMPEnc = NewTensorBufV2(paddingEnc.order + 1, dims + 1, paddingEnc.dataType,
-                                              paddingEnc.denseRatio, paddingEnc.devID, paddingEnc.mem);
+                                                paddingEnc.devID);
-    XTensor * maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID, paddingEnc.mem);
+    XTensor * maskEncDecTMPDec = NewTensorBufV2(maskEncDecTMPEnc, paddingEnc.devID);
    _Unsqueeze(&paddingEnc, maskEncDecTMPEnc, paddingEnc.order - 1, paddingDec.GetDim(-1));
    _ScaleAndShiftMe(maskEncDecTMPEnc, 1e9F, -1e9F);
@@ -271,15 +254,15 @@ void T2TModel::MakeMTMask(XTensor &inputEnc,   XTensor &inputDec,
    dimsPadding[paddingEnc.order - 1] = paddingEnc.GetDim(-1);
    dimsPadding[paddingEnc.order] = paddingEnc.GetDim(-1);
-    XTensor * padding2 = NewTensorBuf(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
+    XTensor * padding2 = NewTensorBufV2(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
-                                      paddingEnc.denseRatio, paddingEnc.devID, paddingEnc.mem);
+                                        paddingEnc.devID);
    for (int i = 0; i < padding2->order; i++)
        dimsPadding[i + 1] = padding2->GetDim(i);
    dimsPadding[0] = nhead;
-    XTensor * padding3 = NewTensorBuf(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType,
+    XTensor * padding3 = NewTensorBufV2(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType,
-                                      paddingEnc.denseRatio, paddingEnc.devID, paddingEnc.mem);
+                                        paddingEnc.devID);
    /* mask of the padding */
    _Unsqueeze(&paddingEnc, padding2, paddingEnc.order - 1, paddingEnc.GetDim(-1));
@@ -287,7 +270,7 @@ void T2TModel::MakeMTMask(XTensor &inputEnc,   XTensor &inputDec,
    _ScaleAndShiftMe(padding3, 1e9F, -1e9F);
-    InitTensor(&maskEnc, padding3);
+    InitTensorV2(&maskEnc, padding3);
    maskEnc.SetZeroAll();
    /* generate the mask on the source language side (for padding) */
@@ -315,15 +298,15 @@ void T2TModel::MakeMTMaskEnc(XTensor &inputEnc, XTensor &paddingEnc, XTensor &ma
    dimsPadding[paddingEnc.order - 1] = paddingEnc.GetDim(-1);
    dimsPadding[paddingEnc.order] = paddingEnc.GetDim(-1);
-    XTensor * padding2 = NewTensorBuf(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
+    XTensor * padding2 = NewTensorBufV2(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
-                                      paddingEnc.denseRatio, paddingEnc.devID, paddingEnc.mem);
+                                        paddingEnc.devID);
    for (int i = 0; i < padding2->order; i++)
        dimsPadding[i + 1] = padding2->GetDim(i);
    dimsPadding[0] = nhead;
-    XTensor * padding3 = NewTensorBuf(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType,
+    XTensor * padding3 = NewTensorBufV2(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType,
-                                      paddingEnc.denseRatio, paddingEnc.devID, paddingEnc.mem);
+                                        paddingEnc.devID);
    /* mask of the padding */
    _Unsqueeze(&paddingEnc, padding2, paddingEnc.order - 1, paddingEnc.GetDim(-1));
@@ -331,7 +314,7 @@ void T2TModel::MakeMTMaskEnc(XTensor &inputEnc, XTensor &paddingEnc, XTensor &ma
    _ScaleAndShiftMe(padding3, 1e9F, -1e9F);
-    InitTensor(&maskEnc, padding3);
+    InitTensorV2(&maskEnc, padding3);
    maskEnc.SetZeroAll();
    /* generate the mask on the source language side (for padding) */
@@ -361,7 +344,7 @@ void T2TModel::MakeMTMaskDec(XTensor &inputEnc, XTensor &inputDec,
        dims[i + 1] = inputDec.GetDim(i);
    dims[0] = nhead;
    dims[inputDec.order + 1] = len;
-    InitTensor(&maskDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingDec.devID, paddingDec.mem);
+    InitTensorV2(&maskDec, inputDec.order + 2, dims, X_FLOAT, paddingDec.devID);
    /* An upper triangular matrix where the cells of the upper triangular are set to -1e-9.
       This matrix can be used to block the attention to current or following words in
@@ -376,11 +359,11 @@ void T2TModel::MakeMTMaskDec(XTensor &inputEnc, XTensor &inputDec,
    /* encoder-decoder mask that prevents the attention to padding dummy words */
    dims[inputDec.order + 1] = inputEnc.GetDim(inputEnc.order - 1);
-    InitTensor(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingEnc.devID, paddingEnc.mem);
+    InitTensorV2(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, paddingEnc.devID);
-    XTensor * maskEncDecTMPEnc = NewTensorBuf(paddingEnc.order + 1, dims + 1, paddingEnc.dataType,
+    XTensor * maskEncDecTMPEnc = NewTensorBufV2(paddingEnc.order + 1, dims + 1, paddingEnc.dataType,
-                                              paddingEnc.denseRatio, paddingEnc.devID, paddingEnc.mem);
+                                                paddingEnc.devID);
-    XTensor * maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID, paddingEnc.mem);
+    XTensor * maskEncDecTMPDec = NewTensorBufV2(maskEncDecTMPEnc, paddingEnc.devID);
    _Unsqueeze(&paddingEnc, maskEncDecTMPEnc, paddingEnc.order - 1, paddingDec.GetDim(-1));

--- a/source/sample/transformer/T2TModel.h
+++ b/source/sample/transformer/T2TModel.h
@@ -40,9 +40,6 @@ public:
    /* device id */
    int devID;
-    /* memory pool */
-    XMem * mem;
    /* the encoder */
    AttEncoder * encoder;

--- a/source/sample/transformer/T2TOutput.cpp
+++ b/source/sample/transformer/T2TOutput.cpp
@@ -31,7 +31,6 @@ namespace transformer
 T2TOutput::T2TOutput()
 {
    devID = -1;
-    mem = NULL;
    vSize = -1;
    inSize = -1;
    hSize = -1;
@@ -47,12 +46,10 @@ initialize the model
 >> argc - number of arguments
 >> argv - list of pointers to the arguments
 >> myDevID - device id
->> myMem - the memory pool
 */
-void T2TOutput::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
+void T2TOutput::InitModel(int argc, char ** argv, int myDevID)
 {
    devID = myDevID;
-    mem = myMem;
    float minmax = 0;
@@ -61,7 +58,7 @@ void T2TOutput::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
    LoadParamInt(argc, argv, "d", &hSize, DEFAULT_EMBEDDING_SIZE);
    LoadParamFloat(argc, argv, "outputminmax", &minmax, 0.08F);
-    InitTensor2D(&w, hSize, vSize, X_FLOAT, devID, mem);
+    InitTensor2DV2(&w, hSize, vSize, X_FLOAT, devID);
    float scale = 1.0F;
    float finfout = (float)sqrt(6.0F * scale/(hSize + vSize));

--- a/source/sample/transformer/T2TOutput.h
+++ b/source/sample/transformer/T2TOutput.h
@@ -38,9 +38,6 @@ public:
    /* device id */
    int devID;
-    /* memory pool */
-    XMem * mem;
    /* vocabulary size */
    int vSize;
@@ -61,7 +58,7 @@ public:
    ~T2TOutput();
    /* initialize the model */
-    void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL);
+    void InitModel(int argc, char ** argv, int myDevID = -1);
    /* make the network */
    XTensor Make(XTensor &input);

--- a/source/sample/transformer/T2TPredictor.cpp
+++ b/source/sample/transformer/T2TPredictor.cpp
@@ -105,9 +105,9 @@ void T2TPredictor::Create(T2TModel * model, XTensor * top, const XTensor * input
        dims[i] = input->GetDim(i);
    dims[input->order - 1] = beamSize;
-    InitTensor(&state->probPath, input->order, dims, X_FLOAT, 1.0F, input->devID, input->mem);
+    InitTensorV2(&state->probPath, input->order, dims, X_FLOAT, input->devID);
-    InitTensor(&state->nstep, input->order, dims, X_FLOAT, 1.0F, input->devID, input->mem);
+    InitTensorV2(&state->nstep, input->order, dims, X_FLOAT, input->devID);
-    InitTensor(&state->endMark, input->order, dims, X_INT, 1.0F, input->devID, input->mem);
+    InitTensorV2(&state->endMark, input->order, dims, X_INT, input->devID);
    state->probPath.SetZeroAll();
    state->nstep.SetZeroAll();
@@ -170,7 +170,7 @@ void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
        dims[i] = inputEnc->GetDim(i);
    dims[inputEnc->order - 1] = 1;
-    InitTensor(&first, inputEnc->order, dims, X_INT, 1.0F, inputEnc->devID, inputEnc->mem);
+    InitTensorV2(&first, inputEnc->order, dims, X_INT, inputEnc->devID);
    _SetDataFixedInt(&first, startSymbol);
    /* add a new word into the input sequence of the decoder side */
@@ -179,7 +179,7 @@ void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
    }
    else{
        inputDec = GeneratePaths(s);
-        inputDec.SetDevice(inputEnc->devID, inputEnc->mem);
+        inputDec.SetDevice(inputEnc->devID);
        inputDec = Concatenate(first, inputDec, inputDec.order - 1);
    }
@@ -194,7 +194,7 @@ void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
    dims[inputDec.order - 1] = inputDec.GetDim(-1);
    XTensor paddingDec;
-    InitTensor(&paddingDec, inputDec.order, dims, X_INT, 1.0F, paddingEnc->devID, paddingEnc->mem);
+    InitTensorV2(&paddingDec, inputDec.order, dims, X_INT, paddingEnc->devID);
    SetDataFixedInt(paddingDec, 1);
    XTensor maskDec;
@@ -213,14 +213,14 @@ void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
    int stride = decoding.GetDim(decoding.order - 2);
-    InitTensor1D(&selectSrc, 1, X_INT);
+    InitTensor1DV2(&selectSrc, 1, X_INT);
-    InitTensor1D(&selectTgt, 1, X_INT);
+    InitTensor1DV2(&selectTgt, 1, X_INT);
    selectSrc.SetInt(stride - 1, 0);
    selectTgt.SetInt(0, 0);
-    selectSrc.SetDevice(decoding.devID, decoding.mem);
+    selectSrc.SetDevice(decoding.devID);
-    selectTgt.SetDevice(decoding.devID, decoding.mem);
+    selectTgt.SetDevice(decoding.devID);
    /* the decoder output of the last position */
    decodingStep = CopyIndexed(decoding, decoding.order - 2, selectSrc, selectTgt);
@@ -257,7 +257,7 @@ XTensor T2TPredictor::GeneratePaths(T2TStateBundle * state)
    }
    XTensor path;
-    InitTensor2D(&path, state->stateNum, distance, X_INT);
+    InitTensor2DV2(&path, state->stateNum, distance, X_INT);
    path.SetZeroAll();
    for(int i = 0; i < state->stateNum; i++){

--- a/source/sample/transformer/T2TSearch.cpp
+++ b/source/sample/transformer/T2TSearch.cpp
@@ -141,10 +141,6 @@ void T2TSearch::Search(T2TModel * model, XTensor * input, XTensor * padding, XTe
        /* push complete hypotheses into the heap */
        Collect(next);
-        /* stop searching when all hypotheses are completed */
-        if(IsAllCompleted(next))
-            break;
    }
    /* fill the heap with imcomplete hypotheses if neccesary */
@@ -196,8 +192,8 @@ void T2TSearch::Score(T2TStateBundle * prev, T2TStateBundle * beam)
    for(int i = 0; i < order; i++)
        dims[i] = prob.GetDim(i);
-    InitTensor(&score, &prob);
+    InitTensorV2(&score, &prob);
-    InitTensor(&probPath, &prob);
+    InitTensorV2(&probPath, &prob);
    prob.Reshape(prob.unitNum/outputSize, outputSize);
    score.Reshape(score.unitNum/outputSize, outputSize);
@@ -208,8 +204,8 @@ void T2TSearch::Score(T2TStateBundle * prev, T2TStateBundle * beam)
    _SumDim(&prob, &probPathPrev, &probPath, 0);
-    InitTensor(&len, &lenPrev);
+    InitTensorV2(&len, &lenPrev);
-    InitTensor(&lp, &lenPrev);
+    InitTensorV2(&lp, &lenPrev);
    _ScaleAndShift(&lenPrev, &len, 1.0F, 1.0F);
@@ -229,9 +225,9 @@ void T2TSearch::Score(T2TStateBundle * prev, T2TStateBundle * beam)
        _SumDim(&score, &firstMask, &score, 0);
    }
-    InitTensor(&mask, 
+    InitTensorV2(&mask, 
-               prev->endMark.order, prev->endMark.dimSize, X_FLOAT, 1.0F, 
+               prev->endMark.order, prev->endMark.dimSize, X_FLOAT, 
-               prev->endMark.devID, prev->endMark.mem);
+               prev->endMark.devID);
    _SetDataFixedCond(&mask, &prev->endMark, -1e9F);
    mask.Reshape(mask.unitNum);
@@ -266,15 +262,15 @@ void T2TSearch::Generate(T2TStateBundle * beam)
    XTensor &prob = beam->prob;
    int order = score.order;
+    CheckNTErrors(order >= 3, "The tensor must be of order 2 or larger.");
+    CheckNTErrors(dimsBeam[order - 3] % beamSize == 0, "Wrong dimension size!");
    for (int i = 0; i < order; i++) {
        dims[i] = score.GetDim(i);
        dimsBeam[i] = score.GetDim(i);
        dimsTopK[i] = score.GetDim(i);
    }
-    CheckNTErrors(order >= 3, "The tensor must be of order 2 or larger.");
-    CheckNTErrors(dimsBeam[order - 3] % beamSize == 0, "Wrong dimension size!");
    int sizeVocab = score.GetDim(-1);
    int stride = score.GetDim(-1);
@@ -283,12 +279,11 @@ void T2TSearch::Generate(T2TStateBundle * beam)
    dimsTopK[order - 3] = dimsBeam[order - 3];
    dimsTopK[order - 1] = beamSize;
-    InitTensor(&scoreTopK, order, dimsTopK, score.dataType,
+    InitTensorV2(&scoreTopK, order, dimsTopK, score.dataType,
-               1.0F, score.devID, score.mem);
+                 score.devID);
-    InitTensor(&index, order, dimsTopK, X_INT,
+    InitTensorV2(&index, order, dimsTopK, X_INT,
-               1.0F, score.devID, score.mem);
+                 score.devID);
-    InitTensor(&preID, order, dimsTopK, X_INT,
+    InitTensorV2(&preID, order, dimsTopK, X_INT, -1);
-               1.0F, -1);
    score.Reshape(order, dimsBeam);
@@ -312,7 +307,7 @@ void T2TSearch::Generate(T2TStateBundle * beam)
    score.Reshape(order, dims);
    /* we keep the top-k scores */
-    InitTensor(&score, &scoreTopK);
+    InitTensorV2(&score, &scoreTopK);
    CopyValues(scoreTopK, score);
    /*  CPU data (TODO: remove GPU->CPU data copy!!!) */
@@ -328,9 +323,9 @@ void T2TSearch::Generate(T2TStateBundle * beam)
    /* sequence probability of top-k candidates */
    XTensor probPathTopK;
-    InitTensor(&probPathTopK, &scoreTopK);
+    InitTensorV2(&probPathTopK, &scoreTopK);
    XTensor probTopK;
-    InitTensor(&probTopK, &scoreTopK);
+    InitTensorV2(&probTopK, &scoreTopK);
    for (int i = 0; i < probPath.order; i++) {
        dims[i] = probPath.GetDim(i);
@@ -386,7 +381,7 @@ void T2TSearch::Expand(T2TStateBundle * prev, T2TStateBundle * beam)
    InitTensorOnCPU(&probPath, &probPathRef);
    InitTensorOnCPU(&prediction, &predictionRef);
    InitTensorOnCPU(&endMarkCPU, &predictionRef);
-    InitTensor(&endMark, &predictionRef);
+    InitTensorV2(&endMark, &predictionRef);
    /* we copy the data to CPU because the frequent access to GPU is slow
       and we can speed-up the process by doing the job on CPU. */
@@ -507,7 +502,7 @@ void T2TSearch::Dump(XTensor * output)
    int dims[3] = {batchSize, beamSize, maxLength};
    int * words = new int[maxLength];
-    InitTensor(output, 3, dims, X_INT);
+    InitTensorV2(output, 3, dims, X_INT);
    SetDataFixedInt(*output, -1);
    /* heap for an input sentence in the batch */
@@ -578,23 +573,6 @@ void T2TSearch::SetEnd(const int * tokens, const int tokenNum)
    endSymbolNum = tokenNum;
 }
-/* 
-check whether all hypotheses are completed 
->> beam - the beam that keeps the searching states
-*/
-bool T2TSearch::IsAllCompleted(T2TStateBundle * beam)
-{
-    T2TState * states = beam->states;
-     for (int i = 0; i < beam->stateNum; i++) {
-        T2TState & state = states[i];
-        if(!state.isCompleted)
-            return false;
-     }
-     return true;
-}
 /*
 make a mask to prevent duplicated entries in beam expansion for the first position
 >> beam - the beam that keeps the searching states
@@ -609,7 +587,7 @@ XTensor T2TSearch::MakeFirstMask(T2TStateBundle * beam)
    for (int i = 0; i < order - 1; i++)
        dims[i] = prob.GetDim(i);
-    InitTensor(&mask, order - 1, dims, X_FLOAT);
+    InitTensorV2(&mask, order - 1, dims, X_FLOAT);
    mask.SetZeroAll();
    for (int i = 0; i < mask.unitNum; i++) {
@@ -617,7 +595,7 @@ XTensor T2TSearch::MakeFirstMask(T2TStateBundle * beam)
            mask.Set(-1e9, i);
    }
-    mask.SetDevice(prob.devID, prob.mem);
+    mask.SetDevice(prob.devID);
    return mask;
 }

--- a/source/sample/transformer/T2TSearch.h
+++ b/source/sample/transformer/T2TSearch.h
@@ -102,9 +102,6 @@ public:
    /* set end symbols for search */
    void SetEnd(const int * tokens, const int tokenNum);
-    /* check whether all hypotheses are completed */
-    bool IsAllCompleted(T2TStateBundle * beam);
    /* make a mask to prevent duplicated entries in beam expansion for the first position */
    XTensor MakeFirstMask(T2TStateBundle * beam);
 };

--- a/source/sample/transformer/T2TTester.cpp
+++ b/source/sample/transformer/T2TTester.cpp
@@ -75,7 +75,6 @@ void T2TTester::Test(const char * fn, const char * ofn, T2TModel * model)
    CheckNTErrors(ofile, "Cannot open the output file");
    int devID = model->devID;
-    XMem * mem = model->mem;
    XNet net;
@@ -106,7 +105,7 @@ void T2TTester::Test(const char * fn, const char * ofn, T2TModel * model)
    while(batchLoader.LoadBatch(file, model->isLM, 
                                &batchEnc, &paddingEnc, &paddingDec, &paddingDec, &gold, &label,
                                seqs, vSize, vSizeTgt,
-                                1, 1, false, ws, wc, devID, mem, false))
+                                1, 1, false, ws, wc, devID, false))
    {
        CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch!");
        CheckNTErrors(!model->isLM, "Only MT model is supported!");
@@ -129,7 +128,7 @@ void T2TTester::Test(const char * fn, const char * ofn, T2TModel * model)
        if (batchCount % 1 == 0) {
            double elapsed = GetClockSec() - startT;
            XPRINT3(0, stderr, 
-                   "[INFO] elapsed=%.1fs, sent=%d, sword=%d\n",
+                   "[INFO] elapsed=%.1fs, sentence=%d, sword=%d\n",
                    elapsed, sentCount, wordCount);
        }
    }
@@ -141,8 +140,8 @@ void T2TTester::Test(const char * fn, const char * ofn, T2TModel * model)
    double elapsed = GetClockSec() - startT;
-    XPRINT4(0, stderr, "[INFO] test finished (took %.1fs, word=%d, sent=%d, and ppl=%.3f)\n",
+    XPRINT3(0, stderr, "[INFO] test finished (took %.1fs, word=%d, and ppl=%.3f)\n",
-            elapsed,wordCountTotal, sentCount, exp(loss/wordCount));
+            elapsed,wordCountTotal, exp(loss/wordCount));
 }
 /*

--- a/source/sample/transformer/T2TTrainer.cpp
+++ b/source/sample/transformer/T2TTrainer.cpp
@@ -75,9 +75,6 @@ void T2TTrainer::Init(int argc, char ** argv)
        strcpy(argArray[i], argv[i]);
    }
-    bool useMem = false;
-    LoadParamBool(argc, argv, "mem", &useMem, useMem);
    LoadParamFloat(argc, argv, "lrate", &lrate, 1.0F);
    LoadParamFloat(argc, argv, "lrbias", &lrbias, 0);
    LoadParamInt(argc, argv, "sbatch", &sBatchSize, 1);
@@ -142,7 +139,6 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
 #endif
    int devID = model->devID;
-    XMem * mem = model->mem;
    XNet net;
    if(isDebugged)
@@ -184,7 +180,7 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
        while (batchLoader.LoadBatch(file, model->isLM, 
                                     &batchEnc, &paddingEnc, &batchDec, &paddingDec, &gold, &label,
                                     NULL, vSize, vSizeTgt,
-                                     sBatchSize, wBatchSize, isLenSorted, ws, wc, devID, mem, true)) 
+                                     sBatchSize, wBatchSize, isLenSorted, ws, wc, devID, true)) 
        {
            CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");
@@ -321,7 +317,6 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
    CheckNTErrors(ofile, "Cannot open the output file");
    int devID = model->devID;
-    XMem * mem = model->mem;
    XNet net;
@@ -351,7 +346,7 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
    while(batchLoader.LoadBatch(file, model->isLM, 
                                &batchEnc, &paddingEnc, &batchDec, &paddingDec, &gold, &label,
                                seqs, vSize, vSizeTgt,
-                                1, 1, false, ws, wc, devID, mem, false))
+                                1, 1, false, ws, wc, devID, false))
    {
        CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");
@@ -372,7 +367,7 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
        /* prediction probabilities */
        XTensor probs;
-        InitTensor1D(&probs, bSize * length);
+        InitTensor1DV2(&probs, bSize * length);
        XTensor labelOnehot;
@@ -463,7 +458,7 @@ float T2TTrainer::GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs)
    /* probability of each word */
    XTensor wprobs;
-    InitTensor1D(&wprobs, output->unitNum/output->GetDim(-1), X_FLOAT, output->devID, output->mem);
+    InitTensor1DV2(&wprobs, output->unitNum/output->GetDim(-1), X_FLOAT, output->devID);
    int dims[2] = {output->unitNum/output->GetDim(-1), output->GetDim(-1)};
    probs.Reshape(2, dims);
@@ -480,7 +475,7 @@ float T2TTrainer::GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs)
    /* probability for the batch */
    XTensor result;
-    InitTensor1D(&result, 1, X_FLOAT, output->devID, output->mem);
+    InitTensor1DV2(&result, 1, X_FLOAT, output->devID);
    _ReduceSum(&probs, &result, 1);
    return result.Get1D(0);
@@ -527,7 +522,7 @@ void T2TTrainer::Update(T2TModel * model, const float lr)
            _ScaleAndShiftMe(v, (1.0F - adamBeta2), 0);
            /* v2 = m / (sqrt(v) + delta) */
-            XTensor * v2 = NewTensorBuf(v, v->devID, v->mem);
+            XTensor * v2 = NewTensorBufV2(v, v->devID);
            _Power(v, v2, 0.5F);
            _ScaleAndShiftMe(v2, 1.0F, d);
            _Div(m, v2, v2);
@@ -598,7 +593,7 @@ void T2TTrainer::PadOutput(XTensor * output, XTensor * gold, XTensor * padding)
    output->Reshape(output->unitNum/dimso[output->order - 1], dimso[output->order - 1]);
-    XTensor * padding2 = NewTensorBuf(1, &padding->unitNum, X_FLOAT, 1.0F, padding->devID, padding->mem);
+    XTensor * padding2 = NewTensorBufV2(1, &padding->unitNum, X_FLOAT, padding->devID);
    _CopyValues(padding, padding2);
    _MultiplyDim(output, padding2, output, 0);
@@ -652,7 +647,7 @@ void T2TTrainer::LabelSmooth(XTensor * gold, XTensor * smoothed, DTYPE p)
    DTYPE q = 1.0F - p;
    DTYPE gift = p / n;
-    InitTensor(smoothed, gold);
+    InitTensorV2(smoothed, gold);
    _CopyValues(gold, smoothed);
    if(p == 0)

--- a/source/tensor/XLink.cpp
+++ b/source/tensor/XLink.cpp
@@ -528,8 +528,6 @@ void XLink::Replace(const XTensor * oldOne, XTensor * newOne)
            CheckNTErrors(hit, "No proper node found in parent.income edge!");
        }
    }
-    strcpy(newOne->name, oldOne->name);
 }
@@ -737,11 +735,6 @@ void XLink::ShowNode(FILE * file, XTensor * node)
        }
    }
-    fprintf(file, "shape[%d] ", node->order);
-    for (int i = 0; i < node->order; i++)
-        fprintf(file, "%d ", node->GetDim(i));
    fprintf(stderr, "\n");
 }

--- a/source/tensor/XMem.cpp
+++ b/source/tensor/XMem.cpp
@@ -53,6 +53,7 @@ XMem::XMem()
    strcpy(name, "xmem");
    signature = 0;
    mergeFreeOTF = true;
+    isInitialized = false;
 }
 /* 
@@ -169,6 +170,7 @@ void XMem::Initialize(int myDevID, MEMPOOL_MODE myMode, MTYPE myBlockSize, int m
 #endif
    signature++;
+    isInitialized = true;
 }
 /* free memory */
@@ -305,7 +307,7 @@ void XMem::SetComputationMode(bool myIsForComputation)
        cublasDestroy(cublasHandle);
    if(myIsForComputation)
        CheckNTErrors((enum curandStatus)cublasCreate(&cublasHandle) == CURAND_STATUS_SUCCESS, 
-				      "Cannot create the cublas handle.");
+                      "Cannot create the cublas handle.");
    SetDevice(devIDBackup);
 #endif
@@ -321,11 +323,11 @@ void XMem::SetIndex(INT_64 indexSize, MTYPE minSizeFirst, int minSizeNum)
 {
    delete[] memIndex;
    delete[] memIndex2;
-	delete[] minSizeIndex;
+    delete[] minSizeIndex;
-	nodeNum = indexSize;
+    nodeNum = indexSize;
-	nodeNumUsed = minSizeNum * 2;
+    nodeNumUsed = minSizeNum * 2;
-	indexEntryNum = minSizeNum;
+    indexEntryNum = minSizeNum;
    memIndex = new MPieceNode[nodeNum];
    memset(memIndex, 0, sizeof(MPieceNode) * nodeNum);
@@ -333,12 +335,12 @@ void XMem::SetIndex(INT_64 indexSize, MTYPE minSizeFirst, int minSizeNum)
    memIndex2 = new MPieceNode[nodeNum];
    memset(memIndex2, 0, sizeof(MPieceNode) * nodeNum);
-	minSizeIndex = new MTYPE[indexEntryNum];
+    minSizeIndex = new MTYPE[indexEntryNum];
-	memset(minSizeIndex, 0, sizeof(MTYPE) * indexEntryNum);
+    memset(minSizeIndex, 0, sizeof(MTYPE) * indexEntryNum);
-	minSizeIndex[0] = minSizeFirst;
+    minSizeIndex[0] = minSizeFirst;
-	for(int i = 1; i < indexEntryNum; i++)
+    for(int i = 1; i < indexEntryNum; i++)
-		minSizeIndex[i] = minSizeIndex[i - 1] * 2;
+        minSizeIndex[i] = minSizeIndex[i - 1] * 2;
    indexOffset = GetMSB(minSizeFirst);
 }
@@ -757,8 +759,8 @@ void * XMem::AllocStandard(int myDevID, MTYPE mySize, bool myIsRebuiltIndex)
    /* if all index nodes are used, we rebuild the index to release the nodes that are free */
    if(nodeNumUsed == nodeNum){
-    	RebuildIndex();
+        RebuildIndex();
-    	CheckNTErrors(nodeNumUsed < nodeNum, "No enough index nodes for the memory pool!");
+        CheckNTErrors(nodeNumUsed < nodeNum, "No enough index nodes for the memory pool!");
    }
    /*if(testxmemid == 30){
@@ -961,8 +963,8 @@ release a piece of memory as "free"
 */
 void XMem::ReleaseStandard(int myDevID, void * p, MTYPE size)
 {
-	if(p == NULL)
+    if(p == NULL)
-		return;
+        return;
    if(size <= minSizeIndex[0])
        size = minSizeIndex[0];
@@ -1092,7 +1094,7 @@ void XMem::RebuildIndex()
            block->mem = NULL;
        }
        else{
-        	/* if the block is in use, we build the index */
+            /* if the block is in use, we build the index */
            int pieceCount = 0;
            MTYPE size = 0;
            MHeader * newLast = NULL;
@@ -1579,11 +1581,6 @@ void XMemManager::Initialize()
    /* CPUs (we actually do not care about how many CPUs are using) */
    nCPUMem = 1;
-    MTYPE freeMem = GetAvailableMemory();
-    MTYPE myBufSize = 0;
-    GetBufferSize(freeMem, &myBufSize);
-    CPUMems[0].Initialize(-1, UNI_FREE, MIN_BLOCK_SIZE_FOR_MEMPOOL, MIN_BLOCK_NUM_FOR_MEMPOOL, myBufSize);
    /* GPUs */
    nGPUMem = 0;
@@ -1592,23 +1589,16 @@ void XMemManager::Initialize()
        XPRINT(0, stderr, "cannot get GPU information.");
        exit(1);
    }
-    for (int i = 0; i < nGPUMem; i++) {
-        MTYPE freeMem = GetAvailableGPUMemory(i);
-        MTYPE myBufSize = 0;
-        GetBufferSize(freeMem, &myBufSize);
-        GPUMems[i].Initialize(i, UNI_FREE, MIN_BLOCK_SIZE_FOR_MEMPOOL, MIN_BLOCK_NUM_FOR_MEMPOOL, myBufSize);
-    }
 #endif
 }
 /* free it */
 void XMemManager::Free()
 {
-    for (int i = 0; i < MAX_CPU_NUM; i++)
+    for (int i = 0; i < MAX_CPU_MEM_NUM; i++)
        CPUMems[i].Free();
-    for (int i = 0; i < MAX_GPU_NUM; i++)
+    for (int i = 0; i < MAX_GPU_MEM_NUM; i++)
        GPUMems[i].Free();
 }
@@ -1616,13 +1606,34 @@ void XMemManager::Free()
 XMem * XMemManager::GetMem(const int devID)
 {
    XMem * mem = NULL;
-    if (devID < 0)
+    if (devID < 0){
+        if(!CPUMems[0].isInitialized){
+            MTYPE freeMem = GetAvailableMemory();
+            MTYPE myBufSize = 0;
+            GetBufferSize(freeMem, &myBufSize);
+            CPUMems[0].Initialize(-1, FREE_ON_THE_FLY, 
+                                  MIN_BLOCK_SIZE_FOR_MEMPOOL, 
+                                  MIN_BLOCK_NUM_FOR_MEMPOOL, 
+                                  myBufSize);
+        }
        mem = CPUMems;
+    }
    else{
-        if (devID < nGPUMem)
+        if (devID < nGPUMem){
+            if(!GPUMems[devID].isInitialized){
+                MTYPE freeMem = GetAvailableGPUMemory(devID);
+                MTYPE myBufSize = 0;
+                GetBufferSize(freeMem, &myBufSize);
+                GPUMems[devID].Initialize(devID, FREE_ON_THE_FLY, 
+                                          MIN_BLOCK_SIZE_FOR_MEMPOOL, 
+                                          MIN_BLOCK_NUM_FOR_MEMPOOL, 
+                                          myBufSize);
+            }
            mem = GPUMems + devID;
-        else
+        }
+        else{
            XPRINT1(0, stderr, "Cannot get the memory (%d). Please check your device id!", devID);
+        }
    }
    return mem;

--- a/source/tensor/XMem.h
+++ b/source/tensor/XMem.h
@@ -60,10 +60,10 @@ typedef long long          INT_64;
 #define CUDA_HOST_MALLOC 1
 #define MY_PITCH CUDA_PITCH
 #define BUF_PITCH 256
-#define MIN_BLOCK_SIZE_FOR_MEMPOOL 128 * 1024 * 1024
+#define MIN_BLOCK_SIZE_FOR_MEMPOOL 256 * 1024 * 1024
 #define MIN_BLOCK_NUM_FOR_MEMPOOL 1024
-#define MAX_CPU_NUM 16
+#define MAX_CPU_MEM_NUM 16
-#define MAX_GPU_NUM 16
+#define MAX_GPU_MEM_NUM 16
 /* 
 mode of runnig a memory pool 
@@ -213,6 +213,9 @@ public:
    MTYPE curUsedPin;
    MTYPE bufUsedPin;
+    /* indicates whether the memory pool is initialized */
+    bool isInitialized;
 #ifdef USE_CUDA
    /* handle used for cublas */
    cublasHandle_t cublasHandle;
@@ -429,15 +432,15 @@ a class for the management of memory
 */
 class XMemManager
 {
-public:
+private:
    /* cpu memory pool information */
-    XMem CPUMems[MAX_CPU_NUM];
+    XMem CPUMems[MAX_CPU_MEM_NUM];
    /* number of cpu memory pools */
    int nCPUMem;
    /* gpu memory pool information */
-    XMem GPUMems[MAX_GPU_NUM];
+    XMem GPUMems[MAX_GPU_MEM_NUM];
    /* number of gpu memory pools */
    int nGPUMem;

--- a/source/tensor/XTensor.cpp
+++ b/source/tensor/XTensor.cpp
@@ -526,7 +526,7 @@ void XTensor::SetDevice(int myDevId, XMem * myMem)
        isInGlobalMem = false;
    }
    else {
-        ShowNTErrors("TODO!");
+        myMem = GMems.GetMem(myDevId);
    }
 }
@@ -923,11 +923,9 @@ set the tensor items by a normal distribution
 void XTensor::SetDataRandn(DTYPE mean, DTYPE standardDeviation)
 {
    // TODO: cuda code!!!!!!!
    if (data == NULL)
        return;
-    // srand((unsigned)time(0));
    void * d = NULL;
    if (dataType == X_FLOAT) {
        d = new float[unitNum];
@@ -1330,7 +1328,7 @@ set the value of a cell
 */
 bool XTensor::Set(DTYPE value, int index[], int size)
 {
-	CheckNTErrors(dataType == DEFAULT_DTYPE, "The tensor is not in default type.");
+    CheckNTErrors(dataType == DEFAULT_DTYPE, "The tensor is not in default type.");
    return SetToDevice(devID, GetCell(index, size), value);
 }
@@ -1826,7 +1824,6 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg, 
        fprintf(file, "%s ", label);
    if(isInit){
-        fprintf(file, "id=%d ", id);
        fprintf(file, "order=%d dimsize=", order);
        for (int i = 0; i < order; i++) {
            fprintf(file, "%d", dimSize[i]);
@@ -1879,7 +1876,149 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg, 
            fprintf(file, "[%d]%e ", key, value);
        }
    }
-    fprintf(file, "\n\n");
+    fprintf(file, "\n");
+    if (isNewData) {
+        delete[](char*)d;
+#ifdef USE_CUDA
+        if (devID >= 0)
+            dataHost = NULL;
+#endif
+    }
+}
+void * RecursionData(XTensor * s, int dim, int * index, void * d, FILE * file)
+{
+    if (dim == s->order - 2) {
+        /* print index */
+        printf("Index: ");
+        for (int i = 0; i < s->order-2; i++)
+            printf("[%d]", index[i]);
+        int dimSize1 = s->dimSize[dim];
+        int dimSize2 = s->dimSize[dim+1];
+        printf("  %d * %d\n", dimSize1, dimSize2);
+        /* print 2D data */
+        if (s->dataType == X_FLOAT) {
+            float * data = (float*)d;
+            for (int i = 0; i < dimSize1; i++) {
+                printf("\t");
+                for (int j = 0; j < dimSize2; j++)
+                    fprintf(file, "%e ", *data++);
+                fprintf(file, "\n");
+            }
+            d = (float*)d + dimSize1 *dimSize2;
+        }
+        else if (s->dataType == X_INT) {
+            int * data = (int*)d;
+            for (int i = 0; i < dimSize1; i++) {
+                printf("\t");
+                for (int j = 0; j < dimSize2; j++)
+                    fprintf(file, "%d ", *data++);
+                fprintf(file, "\n");
+            }
+            d = (int*)d + dimSize1 *dimSize2;
+        }
+        else
+            ShowNTErrors("TODO!");
+        return d;
+    }
+    /* recursion for deeper dimsion */
+    int levelSize = s->dimSize[dim];
+    for (int k = 0; k < levelSize; k++) {
+        index[dim] = k;
+        d = RecursionData(s, dim+1, index, d, file);
+    }
+    return d;
+}
+/* 
+dump data to a file 
+>> file - where to domp the data
+>> label - label of the tensor
+>> n - number of items to dump
+>> beg - the first item id
+>> verbose - verbose level
+*/
+void XTensor::DumpFormat(FILE * file, const char * label, const int n, const int beg, const int verbose)
+{
+    if (verbose > verboseLevel)
+        return;
+    void * d = data;
+    bool isNewData = false;
+#ifdef USE_CUDA
+    if (devID >= 0) {
+        CudaGPUToCPUFlush(this);
+        d = dataHost;
+        isNewData = true;
+    }
+#endif
+    if (d == NULL) {
+        if (isSparse) {
+            int num = 0;
+            for (int i = 0; i < order; i++)
+                num *= dimSizeRDI[i];
+            num = int(num * denseRatio + 1);
+            int tupleSize = sizeof(int) + sizeof(DTYPE);
+            int size = sizeof(int) + tupleSize*(num);
+            d = new char[size];
+            memset(d, 0, size);
+        }
+        else {
+            d = new char[unitNum * unitSize];
+            memset(d, 0, unitNum * unitSize);
+        }
+        isNewData = true;
+    }
+    if (label != NULL)
+        fprintf(file, "%s ", label);
+    if(isInit){
+        fprintf(file, "id=%d ", id);
+        fprintf(file, "order=%d dimsize=", order);
+        for (int i = 0; i < order; i++) {
+            fprintf(file, "%d", dimSize[i]);
+            if (i < order - 1)
+                fprintf(file, ",");
+        }
+    }
+    else{
+        fprintf(file, "order=-1 dimsize=-1");
+    }
+    fprintf(file, " dtype=%s dense=%f\n", GetDataTypeName(dataType), denseRatio);
+    if(!isInit){
+        fprintf(file, "NULL");
+    }
+    if (order == 1) {
+        for (int i = 0; i < unitNum; i++) {
+            if (dataType == X_FLOAT)
+                fprintf(file, "%e ", ((float*)d)[i]);
+            else if (dataType == X_INT)
+                fprintf(file, "%d ", ((int*)d)[i]);
+            else
+                ShowNTErrors("TODO!");
+        }
+        printf("\n");
+    }
+    /* print multi-dimensional tensor */
+    else {
+        int * index = new int[order];
+        RecursionData(this, 0, index, d, file);
+        delete[] index;
+    }
+    fprintf(file, "\n");
    if (isNewData) {
        delete[](char*)d;
@@ -2184,6 +2323,11 @@ void InitTensorV2(XTensor * tensor,
                const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType,
                const int myDevID)
 {
+    if (tensor->mem == NULL) {
+        XMem * myMem = GMems.GetMem(myDevID);
+        tensor->mem = myMem;
+        tensor->devID = myMem->devID;
+    }
    if(tensor->mem != NULL){
        tensor->Resize(myOrder, myDimSize, myDataType, 1.0F);
    }
@@ -2488,9 +2632,8 @@ void InitTensorOnCPU(XTensor * tensor, const XTensor * reference)
        return;
    tensor->enableGrad = reference->enableGrad;
-    InitTensor(tensor, reference->order, reference->dimSize,
+    InitTensorV2(tensor, reference->order, reference->dimSize,
-               reference->dataType, reference->denseRatio,
+               reference->dataType, -1);
-               -1);
 }
 /* generate a XTensor with no initialization */
@@ -2565,9 +2708,6 @@ XTensor * NewTensorBuf(const int myOrder, const int * myDimSize,
    XTensor * tensor = NewTensor(myOrder, dims, myDataType, myDenseRatio, devID, myMem);
-    if (tensor->unitNum * tensor->unitSize == 176657664) {
-        tensor->Dump(stderr, "", 200);
-    }
    if(myMem != NULL)
        tensor->data = myMem->AllocBuf(myMem->devID, tensor->unitNum * tensor->unitSize);
    else
@@ -2593,7 +2733,7 @@ XTensor * NewTensorBufV2(const int myOrder, const int * myDimSize,
    dims[0] = -abs(dims[0]);
-    XTensor * tensor = NewTensor(myOrder, dims, myDataType, 1.0F, devID);
+    XTensor * tensor = NewTensorV2(myOrder, dims, myDataType, devID);
    if (tensor->unitNum * tensor->unitSize == 176657664) {
        tensor->Dump(stderr, "", 200);

--- a/source/tensor/XTensor.h
+++ b/source/tensor/XTensor.h
@@ -425,6 +425,9 @@ public:
    /* dump data to a file */
    void Dump(FILE * file, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0);
+    /* dump data to a file */
+    void DumpFormat(FILE * file, const char * label = NULL, const int n = -1, const int beg = 0, const int verbose = 0);
    /* dump data to a file */
    static
@@ -459,7 +462,7 @@ extern int MakeTensorID();
 void InitTensor(XTensor * tensor,
                const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
                const float myDenseRatio = 1.0F, const int myDevID = -1, XMem * myMem = NULL);
 /* initialize a dense XTensor V2 */
 void InitTensorV2(XTensor * tensor,
                const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,

--- a/source/tensor/core/arithmetic/Multiply.cu
+++ b/source/tensor/core/arithmetic/Multiply.cu
@@ -125,7 +125,7 @@ void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alph
    int leadingDimRDI = a->order - leadingDim - 1;
    CheckNTErrors(a->unitNum <= c->unitNum && b->unitNum <= c->unitNum,
                  "Unmatched tensors in multiplication!");
-    CheckNTErrors((a->order == b->order && a->order == c->order), "Unmatched tensors!");
+    CheckNTErrors(a->order == b->order && a->order == c->order, "Unmatched tensors!");
    int stride = 1;
    int blockSizeA = 1;

--- a/source/tensor/core/arithmetic/MultiplyDim.h
+++ b/source/tensor/core/arithmetic/MultiplyDim.h
@@ -33,6 +33,7 @@ void _MultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYP
 /* tensor multiplication a = a * b + \alpha * c where the size of b is equal to the n-th dimension of a,
   i.e., a is multiplied with b by broadcasting. we keep the result in the input tensor a and return nothing */
 void _MultiplyDimMe(XTensor * a, const XTensor * b, int n, DTYPE alpha = 0.0);
+void MultiplyDimMe(XTensor & a, const XTensor & b, int n, DTYPE alpha = 0.0);
 /* tensor multiplication c = a * b where the size of b is equal to the n-th dimension of a,
   i.e., a is multiplied with b by broadcasting. We make a new tensor c to keep the result and return it */
@@ -40,7 +41,7 @@ XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n);
 /* tensor multiplication c = a * b + \alpha * c  where the size of b is equal to the n-th dimension of a,
   i.e., a is multiplied with b by broadcasting */
-void MultiplyDim(const XTensor &a, const XTensor &b, XTensor &c, int n, bool requireLink = false);
+void MultiplyDim(const XTensor &a, const XTensor &b, XTensor &c, int n);
 /* tensor multiplication summation c = a * b + c * \beta where some of dimensions of b can be of size 1 */
 void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
@@ -50,7 +51,7 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
 XTensor MultiplyBroadcast(const XTensor &a, const XTensor &b);
 /* tensor multiplication summation c = a * b + c * \beta where some of dimensions of b can be of size 1 */
-void MultiplyBroadcast(const XTensor &a, const XTensor &b, XTensor &c, bool requireLink = false);
+void MultiplyBroadcast(const XTensor &a, const XTensor &b, XTensor &c);
 } // namespace nts(NiuTrans.Tensor)

--- a/source/tensor/core/arithmetic/SubDim.cpp
+++ b/source/tensor/core/arithmetic/SubDim.cpp
@@ -46,79 +46,79 @@ void _SubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE bet
 {
    n = MODX(n, a->order);
-	CheckNTErrors(a && b && c, "Empty tensor input!");
+    CheckNTErrors(a && b && c, "Empty tensor input!");
-	CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in subtraction!");
+    CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in subtraction!");
-	CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
+    CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
-		          "Unmatched data types in subtraction!");
+                  "Unmatched data types in subtraction!");
-	CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in subtraction!");
+    CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in subtraction!");
-	CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
+    CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
-	CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
+    CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
    CheckDev(a->devID, b->devID);
-	if (beta == 0) {
+    if (beta == 0) {
-		_CopyValues(a, c);
+        _CopyValues(a, c);
-		return;
+        return;
-	}
+    }
-	if (XTensor::IsSameShaped(a, b)) {
+    if (XTensor::IsSameShaped(a, b)) {
-		_Sub(a, b, c, beta);
+        _Sub(a, b, c, beta);
-		return;
+        return;
-	}
+    }
-	if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
+    if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
 #ifdef USE_CUDA
-		_CudaSubDim(a, b, c, n, beta);
+        _CudaSubDim(a, b, c, n, beta);
 #else
-		ShowNTErrors("Please specify USE_CUDA and recompile the code!");
+        ShowNTErrors("Please specify USE_CUDA and recompile the code!");
 #endif
-	}
+    }
-	else {
+    else {
-		int stride = 1;
+        int stride = 1;
-		int blockSize = a->dimSize[n];
+        int blockSize = a->dimSize[n];
-		int blockNum = 1;
+        int blockNum = 1;
-		for (int i = a->order - 1; i >= 0; i--) {
+        for (int i = a->order - 1; i >= 0; i--) {
-			if (i > n)
+            if (i > n)
-				stride *= a->dimSize[i];
+                stride *= a->dimSize[i];
-			else if (i < n)
+            else if (i < n)
-				blockNum *= a->dimSize[i];
+                blockNum *= a->dimSize[i];
-		}
+        }
-		if (a->dataType == DEFAULT_DTYPE) {
+        if (a->dataType == DEFAULT_DTYPE) {
-			int num = a->unitNum;
+            int num = a->unitNum;
-			if (stride > 1) {
+            if (stride > 1) {
-				for (int i = 0, j = 0; i < num; i += stride, j++) {
+                for (int i = 0, j = 0; i < num; i += stride, j++) {
-					DTYPE * ap = (DTYPE*)a->data + i;
+                    DTYPE * ap = (DTYPE*)a->data + i;
-					DTYPE   bv = *((DTYPE*)b->data + j % blockSize) * beta;
+                    DTYPE   bv = *((DTYPE*)b->data + j % blockSize) * beta;
-					DTYPE * cp = (DTYPE*)c->data + i;
+                    DTYPE * cp = (DTYPE*)c->data + i;
-					for (int k = 0; k < stride; k++)
+                    for (int k = 0; k < stride; k++)
-						cp[k] = ap[k] - bv;
+                        cp[k] = ap[k] - bv;
-				}
+                }
-			}
+            }
-			else if (stride == 1) {
+            else if (stride == 1) {
-				DTYPE * bp = (DTYPE*)b->data;
+                DTYPE * bp = (DTYPE*)b->data;
-				for (int i = 0; i < num; i += blockSize) {
+                for (int i = 0; i < num; i += blockSize) {
-					DTYPE * ap = (DTYPE*)a->data + i;
+                    DTYPE * ap = (DTYPE*)a->data + i;
-					DTYPE * cp = (DTYPE*)c->data + i;
+                    DTYPE * cp = (DTYPE*)c->data + i;
-					if (beta == 1.0F) {
+                    if (beta == 1.0F) {
-						for (int j = 0; j < blockSize; j++)
+                        for (int j = 0; j < blockSize; j++)
-							cp[j] = ap[j] - bp[j];
+                            cp[j] = ap[j] - bp[j];
-					}
+                    }
-					else {
+                    else {
-						for (int j = 0; j < blockSize; j++)
+                        for (int j = 0; j < blockSize; j++)
-							cp[j] = ap[j] - bp[j] * beta;
+                            cp[j] = ap[j] - bp[j] * beta;
-					}
+                    }
-				}
+                }
-			}
+            }
-			else {
+            else {
-				ShowNTErrors("Something is wrong!");
+                ShowNTErrors("Something is wrong!");
-			}
+            }
-		}
+        }
-		else {
+        else {
-			ShowNTErrors("TODO!");
+            ShowNTErrors("TODO!");
-		}
+        }
-	}
+    }
 }
 /*
@@ -136,7 +136,7 @@ i.e., a is subtracted with b by broadcasting
 */
 void _SubDim(XTensor * a, const XTensor * b, int n, DTYPE beta)
 {
-	_SubDim(a, b, a, n, beta);
+    _SubDim(a, b, a, n, beta);
 }
 /*
@@ -155,20 +155,20 @@ i.e., a is subtracted with b by broadcasting
 */
 XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
 {
-	XTensor c(&a);
+    XTensor c(&a);
-	c.SetTMPFlag();
+    c.SetTMPFlag();
    n = MODX(n, a.order);
-	/* call _Sub function */
+    /* call _Sub function */
-	_SubDim(&a, &b, &c, n, beta);
+    _SubDim(&a, &b, &c, n, beta);
-	/* tensor connections */
+    /* tensor connections */
-	XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
+    XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
-	XLink::AddParamToHeadInt(&c, n);
+    XLink::AddParamToHeadInt(&c, n);
-	XLink::AddParamToHead(&c, beta);
+    XLink::AddParamToHead(&c, beta);
-	return c;
+    return c;
 }
 /*
@@ -183,9 +183,8 @@ i.e., a is subtracted with b by broadcasting
 >> c - where we put a-b*\beta. we save it in a if c is NULL
 >> n - the dimension index
 >> beta - the scaling factor
->> requireLink - if add operation to network
 */
-void SubDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta, bool requireLink)
+void SubDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta)
 {
    if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
        InitTensor(&c, &a);
@@ -194,7 +193,7 @@ void SubDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta, b
    /* call _Sub function */
    _SubDim(&a, &b, &c, n, beta);
-    if (requireLink) {
+    if (c.enableGrad) {
        /* tensor connections */
        XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
        XLink::AddParamToHeadInt(&c, n);

--- a/source/tensor/core/arithmetic/SubDim.cu
+++ b/source/tensor/core/arithmetic/SubDim.cu
@@ -39,25 +39,25 @@ where a is a tensor and b is a row vector
 */
 template <class T, bool betaFired>
 __global__
-	void KernelSubWithRow(T * a, T * b, T * c, int rowNum, int colNum, T beta)
+    void KernelSubWithRow(T * a, T * b, T * c, int rowNum, int colNum, T beta)
 {
-	__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
+    __shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
-	int col = blockDim.x * blockIdx.x + threadIdx.x;
+    int col = blockDim.x * blockIdx.x + threadIdx.x;
-	int row = blockDim.y * blockIdx.y + threadIdx.y;
+    int row = blockDim.y * blockIdx.y + threadIdx.y;
-	if (col >= colNum || row >= rowNum)
+    if (col >= colNum || row >= rowNum)
-		return;
+        return;
-	if (threadIdx.y == 0)
+    if (threadIdx.y == 0)
-		bv[threadIdx.x] = b[col];
+        bv[threadIdx.x] = b[col];
-	__syncthreads();
+    __syncthreads();
-	int offset = colNum * row + col;
+    int offset = colNum * row + col;
-	if (betaFired)
+    if (betaFired)
-		c[offset] = a[offset] - bv[threadIdx.x] * beta;
+        c[offset] = a[offset] - bv[threadIdx.x] * beta;
-	else
+    else
-		c[offset] = a[offset] - bv[threadIdx.x];
+        c[offset] = a[offset] - bv[threadIdx.x];
 }
 /*
@@ -75,30 +75,30 @@ where a is a tensor and b is a colum vector
 */
 template <class T, bool betaFired>
 __global__
-	void KernelSubWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, T beta)
+    void KernelSubWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, T beta)
 {
-	__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
+    __shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
-	int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
+    int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
-	int row = blockDim.y * blockIdx.y + threadIdx.y;
+    int row = blockDim.y * blockIdx.y + threadIdx.y;
-	int col = colIndex % colNum;
+    int col = colIndex % colNum;
-	int block = colIndex / colNum;
+    int block = colIndex / colNum;
-	if (row >= rowNum || block >= blockNum)
+    if (row >= rowNum || block >= blockNum)
-		return;
+        return;
-	if (threadIdx.x == 0)
+    if (threadIdx.x == 0)
-		bv[threadIdx.y] = b[row];
+        bv[threadIdx.y] = b[row];
-	__syncthreads();
+    __syncthreads();
-	int offset = block * blockSize + row * colNum + col;
+    int offset = block * blockSize + row * colNum + col;
-	if (betaFired)
+    if (betaFired)
-		c[offset] = a[offset] - bv[threadIdx.y] * beta;
+        c[offset] = a[offset] - bv[threadIdx.y] * beta;
-	else
+    else
-		c[offset] = a[offset] - bv[threadIdx.y];
+        c[offset] = a[offset] - bv[threadIdx.y];
 }
 /*
@@ -116,63 +116,63 @@ i.e., a is subtracted with b by broadcasting
 */
 void _CudaSubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta)
 {
-	CheckNTErrors(a && b && c, "Empty tensor input!");
+    CheckNTErrors(a && b && c, "Empty tensor input!");
-	CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in subtraction!");
+    CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in subtraction!");
-	CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
+    CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
-		          "Unmatched data types in subtraction!");
+                  "Unmatched data types in subtraction!");
-	CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in subtraction!");
+    CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in subtraction!");
-	CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
+    CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
-	CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
+    CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
-	int stride = 1;
+    int stride = 1;
-	int blockSize = a->dimSize[n];
+    int blockSize = a->dimSize[n];
-	int blockNum = 1;
+    int blockNum = 1;
-	for (int i = a->order - 1; i >= 0; i--) {
+    for (int i = a->order - 1; i >= 0; i--) {
-		if (i > n)
+        if (i > n)
-			stride *= a->dimSize[i];
+            stride *= a->dimSize[i];
-		else if (i < n)
+        else if (i < n)
-			blockNum *= a->dimSize[i];
+            blockNum *= a->dimSize[i];
-	}
+    }
-	int cudaGrids[3];
+    int cudaGrids[3];
-	int cudaBlocks[3];
+    int cudaBlocks[3];
-	int devIDBackup = 0;
+    int devIDBackup = 0;
-	ProtectCudaDev(a->devID, devIDBackup);
+    ProtectCudaDev(a->devID, devIDBackup);
-	if (a->dataType == DEFAULT_DTYPE) {
+    if (a->dataType == DEFAULT_DTYPE) {
-		if (stride > 1) {
+        if (stride > 1) {
-			GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
+            GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
-			if (beta == (DTYPE)1.0F)
+            if (beta == (DTYPE)1.0F)
-				KernelSubWithCol<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
+                KernelSubWithCol<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
-				                                ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
+                                                ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
-					                              blockSize, stride, blockSize * stride, blockNum, beta);
+                                                  blockSize, stride, blockSize * stride, blockNum, beta);
-			else
+            else
-				KernelSubWithCol<DTYPE, true>  <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
+                KernelSubWithCol<DTYPE, true>  <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
-				                                ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
+                                                ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
-					                              blockSize, stride, blockSize * stride, blockNum, beta);
+                                                  blockSize, stride, blockSize * stride, blockNum, beta);
-		}
+        }
-		else if (stride == 1) {
+        else if (stride == 1) {
-			GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
+            GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
-			if (beta == (DTYPE)1.0F)
+            if (beta == (DTYPE)1.0F)
-				KernelSubWithRow<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
+                KernelSubWithRow<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
-				                                ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
+                                                ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
-					                              blockNum, blockSize, beta);
+                                                  blockNum, blockSize, beta);
-			else
+            else
-				KernelSubWithRow<DTYPE, true>  <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
+                KernelSubWithRow<DTYPE, true>  <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
-				                                ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
+                                                ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
-					                              blockNum, blockSize, beta);
+                                                  blockNum, blockSize, beta);
-		}
+        }
-		else {
+        else {
-			ShowNTErrors("Something is wrong!");
+            ShowNTErrors("Something is wrong!");
-		}
+        }
-	}
+    }
-	else {
+    else {
-		ShowNTErrors("TODO!");
+        ShowNTErrors("TODO!");
-	}
+    }
-	BacktoCudaDev(a->devID, devIDBackup);
+    BacktoCudaDev(a->devID, devIDBackup);
 }
 #endif

--- a/source/tensor/core/getandset/OnehotAndIndex.cpp
+++ b/source/tensor/core/getandset/OnehotAndIndex.cpp
@@ -134,12 +134,12 @@ void _IndexToOnehot(const XTensor * index, XTensor * onehot,
    for (int i = 0; i < index->order; i++)
        CheckNTErrors(index->GetDim(i) == onehot->GetDim(i), "Illegal tensor order!");
-    onehot->SetZeroAll();
+    //onehot->SetZeroAll();
    float confidence = 1 - labelSmoothingP;
    float lowconfidence = labelSmoothingP / size;
-    //_SetDataFixedFloat(onehot, lowconfidence);
+    _SetDataFixedFloat(onehot, lowconfidence);
 #ifdef USE_CUDA
    if(onehot->devID >= 0 && index->devID >= 0) {

--- a/source/tensor/core/getandset/OnehotAndIndex.cu
+++ b/source/tensor/core/getandset/OnehotAndIndex.cu
@@ -110,13 +110,11 @@ void KernelIndexToOnehot(DTYPE * onehotData, int * indexData, int blockNum, int 
    DTYPE * od = onehotData + i * stride;
    int id = indexData[i];
-    //od[id] = confidence;
    if (offset == id)
        od[offset] = confidence;
-    else{
+    //else
-        od[offset] = lowconfidence;
+    //    od[offset] = lowconfidence;
-    }
 }
 /* 
@@ -126,7 +124,8 @@ convert index tensor to onehot tensor (cuda version)
 >> onehot - onehot tensor, which value is 0 or 1
 >> size - the last dimension size of the onehot tensor
 */
-void _CudaIndexToOnehot(const XTensor * index, XTensor * onehot, int size, float confidence, float lowconfidence)
+void _CudaIndexToOnehot(const XTensor * index, XTensor * onehot, 
+                        int size, float confidence, float lowconfidence)
 {
    int devID = onehot->devID;

--- a/source/tensor/core/getandset/SetData.cu
+++ b/source/tensor/core/getandset/SetData.cu
@@ -30,6 +30,8 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
+#ifdef USE_CUDA
 /* 
 set an integer data array with a fixed value p (in int) 
 >> d - pointer to the data array
@@ -740,4 +742,5 @@ void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * va
    BacktoCudaDev(tensor->devID, devIDBackup);
 }
+#endif // USE_CUDA
 } // namespace nts(NiuTrans.Tensor)
--- a/source/tensor/core/math/Clip.cu
+++ b/source/tensor/core/math/Clip.cu
@@ -36,18 +36,18 @@ set each entry to its clip value (CUDA Kernel)
 >> size - size of the data array
 */
 __global__
-	void KernelClip(DTYPE * a, DTYPE * b, DTYPE lower, DTYPE upper, int size)
+    void KernelClip(DTYPE * a, DTYPE * b, DTYPE lower, DTYPE upper, int size)
 {
-	int i = blockDim.x * blockIdx.x + threadIdx.x;
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
-	if (i < size) {
+    if (i < size) {
-		if (a[i] > upper)
+        if (a[i] > upper)
-			b[i] = upper;
+            b[i] = upper;
-		else if (a[i] < lower)
+        else if (a[i] < lower)
-			b[i] = lower;
+            b[i] = lower;
-		else
+        else
-			b[i] = a[i];
+            b[i] = a[i];
-	}
+    }
 }
 /*
@@ -62,7 +62,7 @@ This is for float16 computation
 __global__
 void KernelClip(__half * a, __half * b, DTYPE lower, DTYPE upper, int size)
 {
-	return;
+    return;
 }
 /*
@@ -74,31 +74,31 @@ set each entry to its clip value
 */
 void _CudaClip(const XTensor * a, XTensor * b, DTYPE lower, DTYPE upper)
 {
-	CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
+    CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
-	CheckNTErrors((a->isSparse == false), "TODO!");
+    CheckNTErrors((a->isSparse == false), "TODO!");
-	int gridSize[3];
+    int gridSize[3];
-	int blockSize[3];
+    int blockSize[3];
-	GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
+    GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
-	dim3 blocks(gridSize[0]);
+    dim3 blocks(gridSize[0]);
-	dim3 threads(blockSize[0]);
+    dim3 threads(blockSize[0]);
-	int devIDBackup;
+    int devIDBackup;
-	ProtectCudaDev(a->devID, devIDBackup);
+    ProtectCudaDev(a->devID, devIDBackup);
-	if (a->dataType == DEFAULT_DTYPE) {
+    if (a->dataType == DEFAULT_DTYPE) {
-		KernelClip << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, lower, upper, a->unitNum);
+        KernelClip << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, lower, upper, a->unitNum);
-	}
+    }
-	else if (a->dataType == X_FLOAT16) {
+    else if (a->dataType == X_FLOAT16) {
-		KernelClip << <blocks, threads >> >((__half*)a->data, (__half*)b->data, lower, upper, a->unitNum);
+        KernelClip << <blocks, threads >> >((__half*)a->data, (__half*)b->data, lower, upper, a->unitNum);
-	}
+    }
-	else {
+    else {
-		ShowNTErrors("TODO!");
+        ShowNTErrors("TODO!");
-	}
+    }
-	BacktoCudaDev(a->devID, devIDBackup);
+    BacktoCudaDev(a->devID, devIDBackup);
 }
 /*

--- a/source/tensor/core/reduce/ReduceMax.cu
+++ b/source/tensor/core/reduce/ReduceMax.cu
@@ -41,19 +41,19 @@ float shflDownReduceMax(float input)
        "{"
        ".reg .f32 r0;"
        ".reg .pred p;"
-        "shfl.down.b32  r0, %1, 0x10, 0x1f;"
+        "shfl.sync.down.b32  r0, %1, 0x10, 0x1f,0xffffffff;"
        "setp.lt.f32    p,%1,r0;"
        "@p mov.f32     %1,r0;"
-        "shfl.down.b32  r0, %1, 0x8, 0xf;"
+        "shfl.sync.down.b32  r0, %1, 0x8, 0xf,0xffffffff;"
        "setp.lt.f32    p,%1,r0;"
        "@p mov.f32     %1,r0;"
-        "shfl.down.b32  r0, %1, 0x4, 0x7;"
+        "shfl.sync.down.b32  r0, %1, 0x4, 0x7,0xffffffff;"
        "setp.lt.f32    p,%1,r0;"
        "@p mov.f32     %1,r0;"
-        "shfl.down.b32  r0, %1, 0x2, 0x3;"
+        "shfl.sync.down.b32  r0, %1, 0x2, 0x3,0xffffffff;"
        "setp.lt.f32    p,%1,r0;"
        "@p mov.f32     %1,r0;"
-        "shfl.down.b32  r0, %1, 0x1, 0x1;"
+        "shfl.sync.down.b32  r0, %1, 0x1, 0x1,0xffffffff;"
        "setp.lt.f32    p, %1, r0; "
        "@p mov.f32     %1,r0;"
        "mov.f32        %0,%1;"
@@ -73,19 +73,19 @@ int shflDownReduceMax(int input)
        "{"
        ".reg .s32 r0;"
        ".reg .pred p;"
-        "shfl.down.b32  r0, %1, 0x10, 0x1f;"
+        "shfl.sync.down.b32  r0, %1, 0x10, 0x1f,0xffffffff;"
        "setp.lt.s32    p,%1,r0;"
        "@p mov.s32     %1,r0;"
-        "shfl.down.b32  r0, %1, 0x8, 0xf;"
+        "shfl.sync.down.b32  r0, %1, 0x8, 0xf,0xffffffff;"
        "setp.lt.s32    p,%1,r0;"
        "@p mov.s32     %1,r0;"
-        "shfl.down.b32  r0, %1, 0x4, 0x7;"
+        "shfl.sync.down.b32  r0, %1, 0x4, 0x7,0xffffffff;"
        "setp.lt.s32    p,%1,r0;"
        "@p mov.s32     %1,r0;"
-        "shfl.down.b32  r0, %1, 0x2, 0x3;"
+        "shfl.sync.down.b32  r0, %1, 0x2, 0x3,0xffffffff;"
        "setp.lt.s32    p,%1,r0;"
        "@p mov.s32     %1,r0;"
-        "shfl.down.b32  r0, %1, 0x1, 0x1;"
+        "shfl.sync.down.b32  r0, %1, 0x1, 0x1,0xffffffff;"
        "setp.lt.s32    p, %1, r0; "
        "@p mov.s32     %1,r0;"
        "mov.s32        %0,%1;"

--- a/source/tensor/core/reduce/ReduceSum.cu
+++ b/source/tensor/core/reduce/ReduceSum.cu
@@ -37,15 +37,15 @@ float shflDownReduceSum(float input)
    asm volatile(
        "{"
        ".reg .f32 r0;"
-        "shfl.down.b32  r0, %1, 0x10, 0x1f;"
+        "shfl.sync.down.b32  r0, %1, 0x10, 0x1f,0xffffffff;"
        "add.f32        %1, r0, %1;"
-        "shfl.down.b32  r0, %1, 0x8, 0xf;"
+        "shfl.sync.down.b32  r0, %1, 0x8, 0xf,0xffffffff;"
        "add.f32        %1, r0, %1;"
-        "shfl.down.b32  r0, %1, 0x4, 0x7;"
+        "shfl.sync.down.b32  r0, %1, 0x4, 0x7,0xffffffff;"
        "add.f32        %1, r0, %1;"
-        "shfl.down.b32  r0, %1, 0x2, 0x3;"
+        "shfl.sync.down.b32  r0, %1, 0x2, 0x3,0xffffffff;"
        "add.f32        %1, r0, %1;"
-        "shfl.down.b32  r0, %1, 0x1, 0x1;"
+        "shfl.sync.down.b32  r0, %1, 0x1, 0x1,0xffffffff;"
        "add.f32        %0, r0, %1;"
        "}"
        : "=f"(output) : "f"(input));
@@ -62,15 +62,15 @@ int shflDownReduceSum(int input)
    asm volatile(
        "{"
        ".reg .s32 r0;"
-        "shfl.down.b32  r0, %1, 0x10, 0x1f;"
+        "shfl.sync.down.b32  r0, %1, 0x10, 0x1f,0xffffffff;"
        "add.s32        %1, r0, %1;"
-        "shfl.down.b32  r0, %1, 0x8, 0xf;"
+        "shfl.sync.down.b32  r0, %1, 0x8, 0xf,0xffffffff;"
        "add.s32        %1, r0, %1;"
-        "shfl.down.b32  r0, %1, 0x4, 0x7;"
+        "shfl.sync.down.b32  r0, %1, 0x4, 0x7,0xffffffff;"
        "add.s32        %1, r0, %1;"
-        "shfl.down.b32  r0, %1, 0x2, 0x3;"
+        "shfl.sync.down.b32  r0, %1, 0x2, 0x3,0xffffffff;"
        "add.s32        %1, r0, %1;"
-        "shfl.down.b32  r0, %1, 0x1, 0x1;"
+        "shfl.sync.down.b32  r0, %1, 0x1, 0x1,0xffffffff;"
        "add.s32        %0, r0, %1;"
        "}"
        : "=r"(output) : "r"(input));

--- a/source/tensor/core/sort/Sort.h
+++ b/source/tensor/core/sort/Sort.h
@@ -35,6 +35,12 @@ keep the result in the input tensor a and return nothing
 */
 void _SortMe(XTensor * a, XTensor * index, int dim);
+/*
+sort the data along a given dimension (do it on site)
+keep the result in the input tensor a and return nothing
+*/
+void SortMe(XTensor & a, XTensor & index, int dim);
 /* 
 sort the data along a given dimension (return an XTensor structure)
 make a new tensor to keep the result and return it

--- a/source/tensor/function/Softmax.cu
+++ b/source/tensor/function/Softmax.cu
@@ -171,7 +171,7 @@ float broadcast(float input)
    float output;
    asm(
        "{"
-        "shfl.idx.b32 %0,%1,0x0,0x1f;"
+        "shfl.sync.idx.b32 %0,%1,0x0,0x1f,0xffffffff;"
        "}"
        :"=f"(output) : "f"(input)
    );

--- a/source/tensor/test/Test.cpp
+++ b/source/tensor/test/Test.cpp
@@ -35,7 +35,7 @@ bool Test()
    wrong = !TestConcatenate() || wrong;
    wrong = !TestConcatenateSolely() || wrong;
    wrong = !TestCos() || wrong;
-    wrong = !TestConvertDataType() || wrong;
+    //wrong = !TestConvertDataType() || wrong;
    wrong = !TestCopyIndexed() || wrong;
    wrong = !TestCopyValues() || wrong;
    wrong = !TestDiv() || wrong;
@@ -79,7 +79,7 @@ bool Test()
    wrong = !TestXMem() || wrong;
    wrong = !TestCrossEntropy() || wrong;
-	wrong = !TestDropout() || wrong;
+    wrong = !TestDropout() || wrong;
    wrong = !TestHardTanH() || wrong;
    wrong = !TestIdentity() || wrong;
    wrong = !TestLogSoftmax() || wrong;