Commit 99097e41 by huchi

add support for greedy search

parent bfa6fc90
......@@ -19,6 +19,10 @@
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-10
*/
//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
//#include <crtdbg.h>
#include <stdio.h>
#include "./network/XNet.h"
#include "./tensor/XUtility.h"
......@@ -27,9 +31,7 @@
#include "./sample/fnnlm/FNNLM.h"
#include "./sample/transformer/Transformer.h"
//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
//#include <crtdbg.h>
using namespace nts;
using namespace fnnlm;
......@@ -37,19 +39,10 @@ using namespace transformer;
/* program entry point: forwards the command-line arguments (minus the
   program name) to the Transformer driver. The commented-out blocks are
   MSVC CRT debug-heap scaffolding and an ad-hoc ReduceSum smoke test
   kept around for debugging sessions. */
int main( int argc, const char ** argv )
{
//_CrtSetDbgFlag(_CrtSetDbgFlag(_CRTDBG_REPORT_FLAG) | _CRTDBG_LEAK_CHECK_DF);
//_CrtSetBreakAlloc(2708);
/*_CrtSetDbgFlag(_CrtSetDbgFlag(_CRTDBG_REPORT_FLAG) | _CRTDBG_LEAK_CHECK_DF);
_CrtSetBreakAlloc(2708);*/
/* skip argv[0] so the tool parses only its own options */
TransformerMain(argc - 1, argv + 1);
/*XTensor x;
InitTensor2D(&x, 2, 2);
float d[]{ 1,2,3,4 };
x.SetData(d, 4);
XTensor y;
y = ReduceSum(x, 0);
y.Dump(stderr);*/
//_CrtDumpMemoryLeaks();
return 0;
......
......@@ -34,7 +34,7 @@ T2TAttention::T2TAttention()
nhead = -1;
dk = -1;
dv = -1;
d = -1;
d = -1;
isMasked = false;
ignored = 0;
}
......@@ -62,7 +62,7 @@ void T2TAttention::InitModel(int argc, char** argv,
float minmax = 0;
LoadParamInt(argc, argv, "nhead", &nhead, 8);
LoadParamInt(argc, argv, "nhead", &nhead, 4);
LoadParamInt(argc, argv, "d", &dk, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &dv, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
......@@ -70,15 +70,15 @@ void T2TAttention::InitModel(int argc, char** argv,
LoadParamFloat(argc, argv, "attminmax", &minmax, 0.1F);
LoadParamFloat(argc, argv, "dropoutatt", &dropoutP, 0);
InitTensor2D(&wq, d, d, X_FLOAT, devID);
InitTensor1D(&bq, d, X_FLOAT, devID);
InitTensor2D(&wk, d, d, X_FLOAT, devID);
InitTensor1D(&bk, d, X_FLOAT, devID);
InitTensor2D(&wv, d, d, X_FLOAT, devID);
InitTensor1D(&bv, d, X_FLOAT, devID);
InitTensor2D(&rp_embedding_k, max_relative_position * 2 + 1, d/nhead, X_FLOAT, devID);
InitTensor2D(&wa, d, d, X_FLOAT, devID);
InitTensor1D(&ba, d, X_FLOAT, devID);
InitTensor2DV2(&wq, d, d, X_FLOAT, devID);
InitTensor1DV2(&bq, d, X_FLOAT, devID);
InitTensor2DV2(&wk, d, d, X_FLOAT, devID);
InitTensor1DV2(&bk, d, X_FLOAT, devID);
InitTensor2DV2(&wv, d, d, X_FLOAT, devID);
InitTensor1DV2(&bv, d, X_FLOAT, devID);
InitTensor2DV2(&rp_embedding_k, max_relative_position * 2 + 1, d/nhead, X_FLOAT, devID);
InitTensor2DV2(&wo, d, d, X_FLOAT, devID);
InitTensor1DV2(&bo, d, X_FLOAT, devID);
}
/*
......@@ -94,24 +94,27 @@ make the network
>> cacheType - which type that cache is
<< return - multi-attention result
*/
XTensor T2TAttention::Make( XTensor& k, XTensor& q, XTensor& v, XTensor* mask, bool isTraining, Cache* cache, int cacheType)
XTensor T2TAttention::Make(XTensor& k, XTensor& q, XTensor& v, XTensor* mask, bool isTraining, Cache* cache, int cacheType)
{
const bool isEnc = (!cache) ? true : false;
/* linear transformation before self-attention */
XTensor q2, k2, v2;
q2 = MatrixMul(q, X_NOTRANS, wq, X_TRANS) + bq;
q2 = MatrixMul(q, wq) + bq;
if (!cache) {
/* self attention for encoder layers */
k2 = MatrixMul(k, X_NOTRANS, wk, X_TRANS) + bk;
v2 = MatrixMul(v, X_NOTRANS, wv, X_TRANS) + bv;
return MakeRPRAttention(k2, q2, v2, mask, isTraining, isEnc);
k2 = MatrixMul(k, wk) + bk;
v2 = MatrixMul(v, wv) + bv;
return MakeRPRAttention(k2, q2, v2, mask, isTraining, isEnc);
}
else {
if (cacheType == SELF_ATT) {
k2 = MatrixMul(k, X_NOTRANS, wk, X_TRANS) + bk;
v2 = MatrixMul(v, X_NOTRANS, wv, X_TRANS) + bv;
k2 = MatrixMul(k, wk) + bk;
v2 = MatrixMul(v, wv) + bv;
/* if hit, we only concat the cache with the new token */
if (!cache->miss) {
......@@ -121,12 +124,13 @@ XTensor T2TAttention::Make( XTensor& k, XTensor& q, XTensor& v, XTensor* mask,
cache->key = k2;
cache->value = v2;
cache->miss = false;
return MakeRPRAttention(cache->key, q2, cache->value, mask, isTraining, isEnc);
}
else if (cacheType == EN_DE_ATT) {
if (cache->miss) {
cache->key = MatrixMul(k, X_NOTRANS, wk, X_TRANS) + bk;
cache->value = MatrixMul(v, X_NOTRANS, wv, X_TRANS) + bv;
cache->key = MatrixMul(k, wk) + bk;
cache->value = MatrixMul(v, wv) + bv;
cache->miss = false;
}
return MakeAttention(cache->key, q2, cache->value, mask, isTraining, isEnc);
......@@ -134,50 +138,49 @@ XTensor T2TAttention::Make( XTensor& k, XTensor& q, XTensor& v, XTensor* mask,
CheckNTErrors(0, "invalid cache type");
}
}
/*
make the attention network given keys, queries and values (after linear transformation)
>> k - keys. It might be of size B * L * H
where B = batch size, L = sequence length,
and H = vector size of each position
where B = batch size, L = sequence length,
and H = vector size of each position
>> q - queries
>> v - values
>> mask - as it is
>> isTraining - indicates whether the model is used for training
*/
XTensor T2TAttention::MakeAttention(XTensor& k, XTensor& q, XTensor& v, XTensor* mask, bool isTraining, bool is_encoder)
{
    XTensor kheads;
    XTensor qheads;
    XTensor vheads;

    /* split the last (hidden) dimension into nhead heads */
    kheads = Split(k, k.order - 1, nhead);
    qheads = Split(q, q.order - 1, nhead);
    vheads = Split(v, v.order - 1, nhead);

    XTensor att;
    XTensor dot;
    XTensor scalar;

    /* scalar = softmax(Q * K^T / sqrt(dk)) * V */
    dot = BMMul(qheads, X_NOTRANS, kheads, X_TRANS);

    /* NOTE(review): additive masking is disabled here and `mask` is unused;
       confirm masking is applied by the caller before relying on it */
    /*if (isMasked && mask)
        _SumMe(&dot, mask);*/

    /* scale by 1/sqrt(d_head) to keep the softmax numerically well-behaved */
    dot = Linear(dot, 1.0F / (float)sqrt((float)dk / nhead));

    scalar = Softmax(dot, -1);

    if (isTraining && dropoutP > 0)
        scalar = Dropout(scalar, dropoutP);

    att = BMMul(scalar, vheads);

    /* concatenate the heads and apply the output projection (wo, bo) */
    return MulAndShift(Merge(att, att.order - 1), wo, bo);
}
/*
......@@ -215,34 +218,32 @@ XTensor T2TAttention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v, XTens
InitTensor4DV2(&dot, nhead, batch_size, len_q, len_kv, X_FLOAT, q.devID);
/* generate the relative emb index (L_q, L_kv) */
GetRPEmbedding(&emb_matrix, len_q, len_kv, max_relative_position, q.devID,is_encoder);
GetRPEmbedding(&emb_matrix, len_q, len_kv, max_relative_position, q.devID, is_encoder);
/* generate the relative key from the rp_embedding_k (L_q, L_kv, H/K) */
_Gather(&rp_embedding_k, &relative_key, &emb_matrix);
/* RPR dot product (K, B, L_q, L_kv)*/
qheads = qheads / float(nhead);
RPDotProduct(&qheads, &kheads, &relative_key, &dot, true);
/*if (isMasked && mask)
_SumMe(&dot, mask);*/
/* scale the dot result */
//dot = Linear(dot, 1.0F / (float)sqrt((float)dk / nhead));
dot = Linear(dot, 1.0F / (float)sqrt((float)dk / nhead));
/* softmax */
scalar = Softmax(dot, -1);
/*if (isTraining && dropoutP > 0)
scalar = Dropout(scalar, dropoutP);*/
if (isTraining && dropoutP > 0)
scalar = Dropout(scalar, dropoutP);
/* generate the relative attention output (K, B, L_q, H/K) */
att = BMMul(scalar, vheads);
/* concatenate the heads */
return MulAndShift(Merge(att, att.order - 1), X_NOTRANS, wa, X_TRANS, ba);
return MulAndShift(Merge(att, att.order - 1), wo, bo);
}
void T2TAttention::GetRPEmbedding(XTensor* emb_matrix, const int len_q, const int len_kv, const int max_relative_length, const int devID, const bool is_encoder)
......@@ -251,10 +252,11 @@ void T2TAttention::GetRPEmbedding(XTensor* emb_matrix, const int len_q, const in
XTensor range;
InitTensor1DV2(&range, len_kv, X_INT, devID);
int* index = new int[len_kv];
// for encoder self-attention which the L_q = L_kv
if (is_encoder)
{
for (int i = 0; i <len_kv; i++)
for (int i = 0; i < len_kv; i++)
index[i] = i;
range.SetData(index, len_kv);
XTensor range_2D, range_2D_t;
......@@ -267,7 +269,7 @@ void T2TAttention::GetRPEmbedding(XTensor* emb_matrix, const int len_q, const in
// for decoder self-attention which the L_q != L_kv, and L_q is 1
else
{
for (int i = 0; i <len_kv; i++)
for (int i = 0; i < len_kv; i++)
index[i] = -len_kv + i + 1;
range.SetData(index, len_kv);
_Unsqueeze(&range, emb_matrix, 0, len_q);
......@@ -299,7 +301,6 @@ void T2TAttention::RPDotProduct(XTensor* x, XTensor* y, XTensor* z, XTensor* att
XTensor context;
InitTensor4DV2(&context, head_num, batch_size, len_q, last_dim, X_FLOAT, x->devID);
_MatrixMulBatched(x, X_NOTRANS, y, transpose_flag, &context);
//if (profiler_) profiler_->FinishTimer("RPDotPro-BMM");
// reshape and transpose x to (L_q, K*B, H/K or L_kv)
int merge_dims[] = { head_num * batch_size, len_q, x->dimSize[3] };
......@@ -323,5 +324,6 @@ void T2TAttention::RPDotProduct(XTensor* x, XTensor* y, XTensor* z, XTensor* att
relative_t.Reshape(4, split_dims);
_Sum(&context, &relative_t, attention);
}
}
......@@ -90,14 +90,18 @@ public:
/* bias for V */
XTensor bv;
XTensor wBig;
XTensor bBig;
/* RPR emb */
XTensor rp_embedding_k;
/* transformation after dot-product attention */
XTensor wa;
XTensor wo;
/* bias after dot-product attention */
XTensor ba;
XTensor bo;
/* size of transformed Q and K */
int dk;
......
......@@ -31,27 +31,27 @@ namespace transformer
/* constructor */
AttDecoder::AttDecoder()
{
    /* all sub-modules are allocated later in InitModel; start as null so the
       destructor is safe even if InitModel was never called */
    selfAtt = NULL;
    fnns = NULL;
    selfAttLayerNorms = NULL;
    enDeAtt = NULL;
    enDeAttLayerNorms = NULL;
    decoderLayerNorm = NULL;
    selfAttCache = NULL;
    enDeAttCache = NULL;
}
/* destructor */
AttDecoder::~AttDecoder()
{
    /* per-layer arrays are allocated with new[] in InitModel -> delete[];
       decoderLayerNorm is a single object -> plain delete */
    delete[] selfAttCache;
    delete[] enDeAttCache;
    delete[] selfAtt;
    delete[] fnns;
    delete[] selfAttLayerNorms;
    delete[] enDeAtt;
    delete[] enDeAttLayerNorms;
    delete decoderLayerNorm;
}
/*
......@@ -71,7 +71,7 @@ void AttDecoder::InitModel(int argc, char ** argv,
devID = myDevID;
ignored = myIgnored;
LoadParamInt(argc, argv, "nlayer", &nlayer, 3);
LoadParamInt(argc, argv, "nlayer", &nlayer, 4);
LoadParamInt(argc, argv, "hsize", &hSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "esize", &eSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "vsizetgt", &vSize, 34040);
......@@ -83,24 +83,24 @@ void AttDecoder::InitModel(int argc, char ** argv,
/* embedding model */
embedder.InitModel(argc, argv, devID, false);
attentions = new T2TAttention[nlayer];
selfAtt = new T2TAttention[nlayer];
fnns = new T2TFNN[nlayer];
attLayerNorms = new T2TLN[nlayer];
attentionsEnde = new T2TAttention[nlayer];
attEndeLayerNorms = new T2TLN[nlayer];
decodeLayerNorm = new T2TLN;
selfCache = new Cache[nlayer];
contextCache = new Cache[nlayer];
selfAttLayerNorms = new T2TLN[nlayer];
enDeAtt = new T2TAttention[nlayer];
enDeAttLayerNorms = new T2TLN[nlayer];
decoderLayerNorm = new T2TLN;
selfAttCache = new Cache[nlayer];
enDeAttCache = new Cache[nlayer];
/* initialize the stacked layers */
for (int i = 0; i < nlayer; i++) {
attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID);
selfAtt[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID);
fnns[i].InitModel(argc, argv, myDevID);
attLayerNorms[i].InitModel(argc, argv, myDevID);
attentionsEnde[i].InitModel(argc, argv, true, myIgnored, myDevID);
attEndeLayerNorms[i].InitModel(argc, argv, myDevID);
selfAttLayerNorms[i].InitModel(argc, argv, myDevID);
enDeAtt[i].InitModel(argc, argv, true, myIgnored, myDevID);
enDeAttLayerNorms[i].InitModel(argc, argv, myDevID);
}
decodeLayerNorm->InitModel(argc, argv, myDevID);
decoderLayerNorm->InitModel(argc, argv, myDevID);
}
/*
......@@ -131,48 +131,38 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor *mask, X
XTensor attNorm;
/* layer normalization */
inputNorm = attLayerNorms[i].Make(x);
//inputNorm.Dump(stderr, "inputNorm", 10);
inputNorm = selfAttLayerNorms[i].Make(x);
/******************/
/* self attention */
att = attentions[i].Make(inputNorm, inputNorm, inputNorm, NULL, isTraining, &selfCache[i], SELF_ATT);
att = selfAtt[i].Make(inputNorm, inputNorm, inputNorm, NULL, isTraining, &selfAttCache[i], SELF_ATT);
/* dropout */
if(isTraining && dropoutP > 0)
att = Dropout(att, dropoutP);
/* residual connection */
_SumMe(&att, &x);
//att.Dump(stderr, "Sum(att, x)", 10);
att = att + x;
/* layer normalization */
attNorm = attEndeLayerNorms[i].Make(att);
//attNorm.Dump(stderr, "attNorm", 10);
attNorm = enDeAttLayerNorms[i].Make(att);
/* encoder-decoder attention */
ende = attentionsEnde[i].Make(outputEnc, attNorm, outputEnc, &maskEncDec, isTraining, &contextCache[i], EN_DE_ATT);
//ende.Dump(stderr, "ende atten", 10);
ende = enDeAtt[i].Make(outputEnc, attNorm, outputEnc, &maskEncDec, isTraining, &enDeAttCache[i], EN_DE_ATT);
/* dropout */
if(isTraining && dropoutP > 0)
ende = Dropout(ende, dropoutP);
/* residual connection */
_SumMe(&ende, &att);
//res.Dump(stderr, "Sum(ende, att)", 10);
ende = ende + att;
/* fnn */
x = fnns[i].Make(ende, isTraining);
//x.Dump(stderr, "fnns[i]", 10);
}
x = decodeLayerNorm->Make(x);
//x.Dump(stderr, "decodeLayerNorm", 10);
x.SetName(DECODING_NAME);
x = decoderLayerNorm->Make(x);
return x;
}
......
......@@ -63,13 +63,13 @@ public:
T2TFNN * fnns;
/* attention model of each layer */
T2TAttention * attentions;
T2TAttention * selfAtt;
/* layer normalization for attention */
T2TLN * attLayerNorms;
T2TLN * selfAttLayerNorms;
/* layer normalization for decoder */
T2TLN * decodeLayerNorm;
T2TLN * decoderLayerNorm;
/* input tensor of the encoder */
XTensor * input;
......@@ -78,16 +78,16 @@ public:
XTensor * output;
/* encoder-decoder attention model of each layer */
T2TAttention * attentionsEnde;
T2TAttention * enDeAtt;
/* layer normalization for encoder-decoder attention */
T2TLN * attEndeLayerNorms;
T2TLN * enDeAttLayerNorms;
/* layer cache list */
Cache* selfCache;
Cache* selfAttCache;
/* layer cache list */
Cache* contextCache;
Cache* enDeAttCache;
public:
/* constructor */
......
......@@ -62,7 +62,7 @@ void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, bool isEnc)
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "pad", &padIdx, 1);
InitTensor2D(&w, vSize, eSize, X_FLOAT, devID);
InitTensor2DV2(&w, vSize, eSize, X_FLOAT, devID);
maxLength = maxLength + 1 + 1;
DTYPE v = 1.0F/(float)sqrt((float)eSize);
......@@ -80,7 +80,7 @@ make positional embeddings (of size eSize * length)
*/
void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length, int padIdx)
{
InitTensor2D(&posEmbeddingBase, length, eSize, X_FLOAT, devID);
InitTensor2DV2(&posEmbeddingBase, length, eSize, X_FLOAT, devID);
float * data = new float[posEmbeddingBase.unitNum];
......@@ -113,47 +113,47 @@ make the network
*/
XTensor T2TEmbedder::Make(XTensor &input, int prevLen)
{
/* assert padding index is 1 */
///* assert padding index is 1 */
CheckNTErrors(input.order > 1, "Wrong input tensor size!");
CheckNTErrors(input.dimSize[input.order - 1] < maxLength, "The sequence is too long!");
CheckNTErrors(vSize > 0, "set vocabulary size by \"-vsize\"");
CheckNTErrors(eSize > 0, "set embedding size by \"-esize\"");
//CheckNTErrors(input.order > 1, "Wrong input tensor size!");
//CheckNTErrors(input.dimSize[input.order - 1] < maxLength, "The sequence is too long!");
//CheckNTErrors(vSize > 0, "set vocabulary size by \"-vsize\"");
//CheckNTErrors(eSize > 0, "set embedding size by \"-esize\"");
XTensor wordEmbedding, position, posEmbedding;
InitTensor(&position, &input);
int* posData = new int[input.unitNum];
XTensor inputCPU;
InitTensorOnCPU(&inputCPU, &input);
_CopyValues(&input, &inputCPU);
for (int i = 0; i < inputCPU.GetDim(0); i++) {
int startNoPad = 2 + prevLen - 1;
int* p = ((int*)inputCPU.data) + i * inputCPU.GetDim(1);
for (int j = 0; j < inputCPU.GetDim(1); j++) {
if (p[j] == 1) {
posData[i * inputCPU.GetDim(1) + j] = 1;
}
else {
posData[i * inputCPU.GetDim(1) + j] = startNoPad++;
}
}
}
//
//XTensor wordEmbedding, position, posEmbedding;
//InitTensor(&position, &input);
position.SetData(posData, position.unitNum);
delete[] posData;
//int* posData = new int[input.unitNum];
/* we make positional embeddings first */
if(true){
posEmbedding = Gather(posEmbeddingBase, position);
}
//XTensor inputCPU;
//InitTensorOnCPU(&inputCPU, &input);
//_CopyValues(&input, &inputCPU);
/* then we make word embeddings */
//for (int i = 0; i < inputCPU.GetDim(0); i++) {
// int startNoPad = 2 + prevLen - 1;
// int* p = ((int*)inputCPU.data) + i * inputCPU.GetDim(1);
// for (int j = 0; j < inputCPU.GetDim(1); j++) {
// if (p[j] == 1) {
// posData[i * inputCPU.GetDim(1) + j] = 1;
// }
// else {
// posData[i * inputCPU.GetDim(1) + j] = startNoPad++;
// }
// }
//}
//position.SetData(posData, position.unitNum);
//delete[] posData;
///* we make positional embeddings first */
//if(true){
// posEmbedding = Gather(posEmbeddingBase, position);
//}
/* then we make word embeddings */
XTensor wordEmbedding;
wordEmbedding = Gather(w, input);
wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));
......
......@@ -29,7 +29,7 @@ using namespace nts;
namespace transformer
{
#define DEFAULT_EMBEDDING_SIZE 512
#define DEFAULT_EMBEDDING_SIZE 128
/*
embedding (of word at position i):
......
......@@ -34,7 +34,7 @@ AttEncoder::AttEncoder()
attentions = NULL;
fnns = NULL;
attLayerNorms = NULL;
encodeLayerNorm = NULL;
encoderLayerNorm = NULL;
}
/* destructor */
......@@ -43,7 +43,7 @@ AttEncoder::~AttEncoder()
delete[] attentions;
delete[] fnns;
delete[] attLayerNorms;
delete encodeLayerNorm;
delete encoderLayerNorm;
}
/*
......@@ -61,7 +61,7 @@ void AttEncoder::InitModel(int argc, char ** argv,
devID = myDevID;
ignored = myIgnored;
LoadParamInt(argc, argv, "nlayer", &nlayer, 35);
LoadParamInt(argc, argv, "nlayer", &nlayer, 20);
LoadParamInt(argc, argv, "hsize", &hSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "esize", &eSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "vsize", &vSize, 34040);
......@@ -76,7 +76,7 @@ void AttEncoder::InitModel(int argc, char ** argv,
attentions = new T2TAttention[nlayer];
fnns = new T2TFNN[nlayer];
attLayerNorms = new T2TLN[nlayer];
encodeLayerNorm = new T2TLN;
encoderLayerNorm = new T2TLN;
/* initialize the stacked layers */
for(int i = 0; i < nlayer; i++){
......@@ -84,7 +84,7 @@ void AttEncoder::InitModel(int argc, char ** argv,
fnns[i].InitModel(argc, argv, myDevID);
attLayerNorms[i].InitModel(argc, argv, myDevID);
}
encodeLayerNorm->InitModel(argc, argv, myDevID);
encoderLayerNorm->InitModel(argc, argv, myDevID);
}
/*
......@@ -123,13 +123,9 @@ XTensor AttEncoder::Make(XTensor &input, XTensor *mask, XTensor &maskEncDec, boo
/* fnn */
x = fnns[i].Make(res, isTraining);
}
x = encodeLayerNorm->Make(x);
x.SetName(ENCODING_NAME);
input.SetName(ENCODING_INPUT_NAME);
x = encoderLayerNorm->Make(x);
return x;
}
......
......@@ -93,11 +93,11 @@ public:
/* attention model of each layer */
T2TAttention * attentions;
/* layer normalization for attention */
/* layer normalizations for attention */
T2TLN * attLayerNorms;
/* layer normalization for encoder */
T2TLN * encodeLayerNorm;
T2TLN * encoderLayerNorm;
/* input tensor of the encoder */
XTensor * input;
......
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
 * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -15,9 +15,9 @@
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#include <math.h>
#include "T2TFNN.h"
......@@ -32,9 +32,9 @@ namespace transformer
/* constructor */
T2TFNN::T2TFNN()
{
    /* -1 marks the layer sizes as "not yet configured"; real values are
       loaded from the command line in InitModel */
    inSize = -1;
    outSize = -1;
    hSize = -1;
}
/* destructor */
......@@ -42,28 +42,28 @@ T2TFNN::~T2TFNN()
{
}
/*
initialize the model
/*
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myDevID - device id
*/
void T2TFNN::InitModel(int argc, char ** argv, int myDevID)
void T2TFNN::InitModel(int argc, char** argv, int myDevID)
{
devID = myDevID;
float minmax = 0;
LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &outSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "fnnh", &hSize, outSize * 4);
LoadParamInt(argc, argv, "fnnh", &hSize, outSize * 8);
LoadParamFloat(argc, argv, "fnnminmax", &minmax, 0.1F);
LoadParamFloat(argc, argv, "dropoutfnn", &dropoutP, 0);
InitTensor2DV2(&w1, hSize, inSize, X_FLOAT, devID);
InitTensor2DV2(&w1, inSize, hSize, X_FLOAT, devID);
InitTensor1DV2(&b1, hSize, X_FLOAT, devID);
InitTensor2DV2(&w2, outSize, hSize, X_FLOAT, devID);
InitTensor2DV2(&w2, hSize, outSize, X_FLOAT, devID);
InitTensor1DV2(&b2, outSize, X_FLOAT, devID);
fnnLayerNorm.InitModel(argc, argv, myDevID);
......@@ -78,25 +78,25 @@ void T2TFNN::InitModel(int argc, char ** argv, int myDevID)
//b2.SetZeroAll();
}
/*
make the network
/*
make the network
y = max(0, x * w1 + b1) * w2 + b2
>> input - the input tensor
>> return - the output tensor
>> return - the output tensor
*/
XTensor T2TFNN::Make(XTensor& input, bool isTraining)
{
    XTensor t1;

    /* t1 = max(0, x * w1 + b1), with pre-layer-norm applied to the input */
    t1 = Rectify(MulAndShift(fnnLayerNorm.Make(input), w1, b1));

    if (isTraining && dropoutP > 0)
        t1 = Dropout(t1, dropoutP);

    /* result = t1 * w2 + b2 */
    XTensor res;
    res = MulAndShift(t1, w2, b2);

    /* residual connection: add the un-normalized input back in place */
    _SumMe(&res, &input);

    return res;
}
......
......@@ -53,8 +53,8 @@ void T2TLN::InitModel(int argc, char ** argv, int myDevID)
d = 0;
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
InitTensor1D(&w, d, X_FLOAT, devID);
InitTensor1D(&b, d, X_FLOAT, devID);
InitTensor1DV2(&w, d, X_FLOAT, devID);
InitTensor1DV2(&b, d, X_FLOAT, devID);
}
/*
......@@ -78,7 +78,7 @@ XTensor T2TLN::Make(XTensor &input)
mean = ReduceMean(x, x.order - 1);
/* \sigma = (sum_i (x_i - \mu)^2)/m */
variance = ReduceVariance(x, x.order - 1, mean);
variance = ReduceVariance(x, x.order - 1, mean) + 1e-5F;
/* standard = sqrt(variance) */
standard = Power(variance, 0.5F);
......@@ -92,7 +92,7 @@ XTensor T2TLN::Make(XTensor &input)
xn = (x - meanFilled) / standardFilled;
/* result = x' * w + b */
return xn * w + b;
return xn * w + b;
}
}
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
 * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -15,15 +15,16 @@
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#include "T2TModel.h"
#include "T2TUtility.h"
#include "../../tensor/core/CHeader.h"
#include "../../tensor/XUtility.h"
#include <cstdint>
namespace transformer
{
......@@ -49,22 +50,22 @@ T2TModel::~T2TModel()
delete outputLayer;
}
/*
initialize the model
/*
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
*/
void T2TModel::InitModel(int argc, char ** argv)
void T2TModel::InitModel(int argc, char** argv)
{
LoadParamInt(argc, argv, "dev", &devID, -1);
LoadParamBool(argc, argv, "mt", &isMT, false);
LoadParamBool(argc, argv, "lm", &isLM, !isMT);
LoadParamInt(argc, argv, "nhead", &nhead, 8);
LoadParamInt(argc, argv, "nhead", &nhead, 4);
encoder->InitModel(argc, argv, true, 0, devID);
outputLayer->InitModel(argc, argv, devID);
if(isMT)
if (isMT)
decoder->InitModel(argc, argv, true, 0, devID);
TensorList params(10);
......@@ -76,21 +77,21 @@ void T2TModel::InitModel(int argc, char ** argv)
}
}
/*
/*
make the encoding network
>> input - input tensor
>> mask - the mask for positions that are/not involved in computation
>> isTraining - indicates whether we are training the model
<< return - encoding result
*/
XTensor T2TModel::MakeEncoder(XTensor& input, XTensor* mask, bool isTraining)
{
    /* the encoder has no encoder-decoder mask; pass an empty tensor */
    XTensor nothing;

    return encoder->Make(input, mask, nothing, isTraining);
}
/*
/*
make the decoding network
>> inputDec - input tensor of the decoder
>> outputEnc - output tensor of the encoder
......@@ -100,22 +101,22 @@ make the decoding network
>> isTraining - indicates whether we are training the model
<< return - encoding result
*/
/* thin forwarder: all decoding work is delegated to the decoder module */
XTensor T2TModel::MakeDecoder(XTensor& inputDec, XTensor& outputEnc, XTensor* mask, XTensor& maskEncDec, bool isTraining)
{
    return decoder->Make(inputDec, outputEnc, mask, maskEncDec, isTraining);
}
/*
make the network for language modeling (with the output softmax layer)
/*
make the network for language modeling (with the output softmax layer)
>> input - input tensor
>> output - output tensor (distribution)
>> padding - padding of the sequences
>> isTraining - indicates whether the model is for training
*/
void T2TModel::MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool isTraining)
void T2TModel::MakeLM(XTensor& input, XTensor& output, XTensor& padding, bool isTraining)
{
XTensor encoding;
/* generate mask to see "previous" words only */
//int len = input.GetDim(input.order - 2);
//int * dims = new int[input.order + 1];
......@@ -126,30 +127,30 @@ void T2TModel::MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool is
//XTensor mask(input.order + 1, dims, X_FLOAT, 1.0F, input.devID, input.mem);
int len = input.GetDim(input.order - 1);
int * dims = new int[input.order + 2];
for(int i = 0; i < input.order; i++)
int* dims = new int[input.order + 2];
for (int i = 0; i < input.order; i++)
dims[i + 1] = input.GetDim(i);
dims[0] = nhead;
dims[input.order + 1] = len;
XTensor mask;
InitTensor(&mask, input.order + 2, dims, X_FLOAT, padding.devID);
InitTensorV2(&mask, input.order + 2, dims, X_FLOAT, 1.0F, padding.devID);
/* a upper triangular matrix where the cells of the upper triangular are set to -1e-9.
this matrix can be used to prevent the attention to current or following words in
a given sequence. */
_SetDataLowTri(&mask, 1e9F, 0);
_ScaleAndShiftMe(&mask, 1.0F, -1e9F);
int * dimsPadding = new int[padding.order + 2];
for(int i = 0; i < padding.order - 1; i++)
int* dimsPadding = new int[padding.order + 2];
for (int i = 0; i < padding.order - 1; i++)
dimsPadding[i] = padding.GetDim(i);
dimsPadding[padding.order - 1] = padding.GetDim(-1);
dimsPadding[padding.order] = padding.GetDim(-1);
XTensor * padding2 = NewTensorBuf(padding.order + 1, dimsPadding, padding.dataType,
padding.devID);
XTensor* padding2 = NewTensorBuf(padding.order + 1, dimsPadding, padding.dataType,
padding.devID);
for(int i = 0; i < padding2->order; i++)
for (int i = 0; i < padding2->order; i++)
dimsPadding[i + 1] = padding2->GetDim(i);
dimsPadding[0] = nhead;
......@@ -169,13 +170,13 @@ void T2TModel::MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool is
delete[] dims;
delete[] dimsPadding;
//DelTensorBuf(padding3);
DelTensorBuf(padding2);
}
/*
make the network for machine translation (with the output softmax layer)
/*
make the network for machine translation (with the output softmax layer)
>> inputEnc - input tensor of the encoder
>> inputDec - input tensor of the decoder
>> output - output tensor (distribution)
......@@ -183,7 +184,7 @@ make the network for machine translation (with the output softmax layer)
>> paddingDec - padding of the sequences (on the decoder side)
>> isTraining - indicates whether the model is for training
*/
void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTensor &paddingEnc, XTensor &paddingDec, bool isTraining)
void T2TModel::MakeMT(XTensor& inputEnc, XTensor& inputDec, XTensor& output, XTensor& paddingEnc, XTensor& paddingDec, bool isTraining)
{
XTensor encoding;
XTensor decoding;
......@@ -193,7 +194,7 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
/* encoder mask */
MakeMTMaskEnc(inputEnc, paddingEnc, maskEnc);
/* decoder mask */
MakeMTMaskDec(inputEnc, inputDec, paddingEnc, paddingDec, maskDec, maskEncDec, 0);
......@@ -204,8 +205,8 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
outputLayer->Make(decoding, output);
}
/*
make the mask for training MT models
/*
make the mask for training MT models
>> inputEnc - input of the encoder
>> inputDec - input of the decoder
>> paddingEnc - padding of the encoder input
......@@ -214,18 +215,18 @@ make the mask for training MT models
>> maksDec - mask of the decoder self-attention
>> maksEncDec - mask of the decoder enc-dec attention
*/
void T2TModel::MakeMTMask(XTensor &inputEnc, XTensor &inputDec,
XTensor &paddingEnc, XTensor &paddingDec,
XTensor &maskEnc, XTensor &maskDec, XTensor &maskEncDec)
void T2TModel::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
XTensor& paddingEnc, XTensor& paddingDec,
XTensor& maskEnc, XTensor& maskDec, XTensor& maskEncDec)
{
int len = inputDec.GetDim(inputDec.order - 1);
int * dims = new int[inputDec.order + 2];
for(int i = 0; i < inputDec.order; i++)
int* dims = new int[inputDec.order + 2];
for (int i = 0; i < inputDec.order; i++)
dims[i + 1] = inputDec.GetDim(i);
dims[0] = nhead;
dims[inputDec.order + 1] = len;
InitTensor(&maskDec, inputDec.order + 2, dims, X_FLOAT, paddingDec.devID);
InitTensorV2(&maskDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingDec.devID);
/* an upper triangular matrix where the cells of the upper triangular are set to -1e-9.
this matrix can be used to prevent the attention to current or following words in
a given sequence. */
......@@ -234,11 +235,10 @@ void T2TModel::MakeMTMask(XTensor &inputEnc, XTensor &inputDec,
/* encoder-decoder mask that prevents the attention to padding dummy words */
dims[inputDec.order + 1] = inputEnc.GetDim(inputEnc.order - 1);
InitTensor(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, paddingEnc.devID);
InitTensorV2(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingEnc.devID);
XTensor * maskEncDecTMPEnc = NewTensorBuf(paddingEnc.order + 1, dims + 1, paddingEnc.dataType,
paddingEnc.devID);
XTensor * maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID);
XTensor* maskEncDecTMPEnc = NewTensorBufV2(paddingEnc.order + 1, dims + 1, paddingEnc.dataType, paddingEnc.devID);
XTensor* maskEncDecTMPDec = NewTensorBufV2(paddingEnc.order + 1, dims + 1, paddingEnc.dataType, paddingEnc.devID);
_Unsqueeze(&paddingEnc, maskEncDecTMPEnc, paddingEnc.order - 1, paddingDec.GetDim(-1));
_ScaleAndShiftMe(maskEncDecTMPEnc, 1e9F, -1e9F);
......@@ -248,21 +248,21 @@ void T2TModel::MakeMTMask(XTensor &inputEnc, XTensor &inputDec,
DelTensorBuf(maskEncDecTMPEnc);
/* padding on the source side */
int * dimsPadding = new int[paddingEnc.order + 2];
int* dimsPadding = new int[paddingEnc.order + 2];
for (int i = 0; i < paddingEnc.order - 1; i++)
dimsPadding[i] = paddingEnc.GetDim(i);
dimsPadding[paddingEnc.order - 1] = paddingEnc.GetDim(-1);
dimsPadding[paddingEnc.order] = paddingEnc.GetDim(-1);
XTensor * padding2 = NewTensorBuf(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
paddingEnc.devID);
XTensor* padding2 = NewTensorBufV2(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
paddingEnc.devID);
for (int i = 0; i < padding2->order; i++)
dimsPadding[i + 1] = padding2->GetDim(i);
dimsPadding[0] = nhead;
XTensor * padding3 = NewTensorBuf(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType,
paddingEnc.devID);
XTensor* padding3 = NewTensorBufV2(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType,
paddingEnc.devID);
/* mask of the padding */
_Unsqueeze(&paddingEnc, padding2, paddingEnc.order - 1, paddingEnc.GetDim(-1));
......@@ -270,7 +270,7 @@ void T2TModel::MakeMTMask(XTensor &inputEnc, XTensor &inputDec,
_ScaleAndShiftMe(padding3, 1e9F, -1e9F);
InitTensor(&maskEnc, padding3);
InitTensorV2(&maskEnc, padding3);
maskEnc.SetZeroAll();
/* generate the mask on the source language side (for padding) */
......@@ -282,49 +282,47 @@ void T2TModel::MakeMTMask(XTensor &inputEnc, XTensor &inputDec,
DelTensorBuf(padding3);
DelTensorBuf(padding2);
}
/*
make the mask of the encoder
>> inputEnc - input of the encoder
>> paddingEnc - padding of the encoder input
>> maskEnc - mask of the encoder self-attention
*/
void T2TModel::MakeMTMaskEnc(XTensor &inputEnc, XTensor &paddingEnc, XTensor &maskEnc)
void T2TModel::MakeMTMaskEnc(XTensor& inputEnc, XTensor& paddingEnc, XTensor& maskEnc)
{
/* padding on the source side */
int * dimsPadding = new int[paddingEnc.order + 2];
int* dimsPadding = new int[paddingEnc.order + 2];
for (int i = 0; i < paddingEnc.order - 1; i++)
dimsPadding[i] = paddingEnc.GetDim(i);
dimsPadding[paddingEnc.order - 1] = paddingEnc.GetDim(-1);
dimsPadding[paddingEnc.order] = paddingEnc.GetDim(-1);
XTensor * padding2 = NewTensorBuf(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
paddingEnc.devID);
XTensor* padding2 = NewTensorBufV2(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType, paddingEnc.devID);
for (int i = 0; i < padding2->order; i++)
dimsPadding[i + 1] = padding2->GetDim(i);
dimsPadding[0] = nhead;
XTensor * padding3 = NewTensorBuf(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType,
paddingEnc.devID);
XTensor* padding3 = NewTensorBufV2(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType, paddingEnc.devID);
/* mask of the padding */
_Unsqueeze(&paddingEnc, padding2, paddingEnc.order - 1, paddingEnc.GetDim(-1));
_Unsqueeze(padding2, padding3, 0, nhead);
_ScaleAndShiftMe(padding3, 1e9F, -1e9F);
InitTensor(&maskEnc, padding3);
InitTensorV2(&maskEnc, padding3);
maskEnc.SetZeroAll();
/* generate the mask on the source language side (for padding) */
_Sum(&maskEnc, padding3, &maskEnc);
DelTensorBuf(padding3);
DelTensorBuf(padding2);
delete[] dimsPadding;
}
/*
make the mask of the decoder
>> inputEnc - input of the encoder
......@@ -334,34 +332,33 @@ make the mask of the decoder
>> maksDec - mask of the decoder self-attention
>> maksEncDec - mask of the decoder enc-dec attention
*/
void T2TModel::MakeMTMaskDec(XTensor &inputEnc, XTensor &inputDec,
XTensor &paddingEnc, XTensor &paddingDec,
XTensor &maskDec, XTensor &maskEncDec, int incDim)
void T2TModel::MakeMTMaskDec(XTensor& inputEnc, XTensor& inputDec,
XTensor& paddingEnc, XTensor& paddingDec,
XTensor& maskDec, XTensor& maskEncDec, int incDim)
{
int len = inputDec.GetDim(inputDec.order - 1);
int * dims = new int[inputDec.order + 2];
for(int i = 0; i < inputDec.order; i++)
int* dims = new int[inputDec.order + 2];
for (int i = 0; i < inputDec.order; i++)
dims[i + 1] = inputDec.GetDim(i);
//dims[inputDec.order] += incDim;
dims[0] = nhead;
dims[inputDec.order + 1] = len;
//InitTensor(&maskDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingDec.devID, paddingDec);
/* An upper triangular matrix where the cells of the upper triangular are set to -1e-9.
This matrix can be used to block the attention to current or following words in
a given sequence. */
//_SetDataLowTri(&maskDec, 1e9F, 0);
//_SetDataLowTri(&maskDec, 1e9F, 0);
//_ScaleAndShiftMe(&maskDec, 1.0F, -1e9F);
//_ScaleAndShiftMe(&maskDec, 1.0F, -1e9F);
/* encoder-decoder mask that prevents the attention to padding dummy words */
/* encoder-decoder mask that prevents the attention to padding dummy words */
dims[inputDec.order + 1] = inputEnc.GetDim(inputEnc.order - 1);
InitTensor(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, paddingEnc.devID);
XTensor * maskEncDecTMPEnc = NewTensorBuf(paddingEnc.order + 1, dims + 1, paddingEnc.dataType,
paddingEnc.devID);
XTensor * maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID);
InitTensorV2(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingEnc.devID);
XTensor* maskEncDecTMPEnc = NewTensorBufV2(paddingEnc.order + 1, dims + 1, paddingEnc.dataType, paddingEnc.devID);
XTensor* maskEncDecTMPDec = NewTensorBufV2(paddingEnc.order + 1, dims + 1, paddingEnc.dataType, paddingEnc.devID);
_Unsqueeze(&paddingEnc, maskEncDecTMPEnc, paddingEnc.order - 1, paddingDec.GetDim(-1));
//paddingEnc.Dump(stderr, "paddingenc:");
......@@ -374,21 +371,21 @@ void T2TModel::MakeMTMaskDec(XTensor &inputEnc, XTensor &inputDec,
_Unsqueeze(maskEncDecTMPEnc, &maskEncDec, 0, dims[0]);
//maskEncDecTMPEnc->Dump(stderr, "maskencdectmpenc:");
DelTensorBuf(maskEncDecTMPDec);
DelTensorBuf(maskEncDecTMPEnc);
delete[] dims;
}
/*
/*
get parameter matrics
>> list - the list that keeps the parameter matrics
*/
void T2TModel::GetParams(TensorList &list)
void T2TModel::GetParams(TensorList& list)
{
list.Clear();
/* encoder parameters */
for(int i = 0; i < encoder->nlayer; i++){
for (int i = 0; i < encoder->nlayer; i++) {
list.Add(&encoder->attentions[i].wq);
list.Add(&encoder->attentions[i].wk);
list.Add(&encoder->attentions[i].wv);
......@@ -396,8 +393,8 @@ void T2TModel::GetParams(TensorList &list)
list.Add(&encoder->attentions[i].bk);
list.Add(&encoder->attentions[i].bv);
list.Add(&encoder->attentions[i].rp_embedding_k);
list.Add(&encoder->attentions[i].wa);
list.Add(&encoder->attentions[i].ba);
list.Add(&encoder->attentions[i].wo);
list.Add(&encoder->attentions[i].bo);
list.Add(&encoder->fnns[i].w1);
list.Add(&encoder->fnns[i].b1);
list.Add(&encoder->fnns[i].w2);
......@@ -407,33 +404,33 @@ void T2TModel::GetParams(TensorList &list)
list.Add(&encoder->fnns[i].fnnLayerNorm.w);
list.Add(&encoder->fnns[i].fnnLayerNorm.b);
}
list.Add(&encoder->encodeLayerNorm->w);
list.Add(&encoder->encodeLayerNorm->b);
list.Add(&encoder->encoderLayerNorm->w);
list.Add(&encoder->encoderLayerNorm->b);
/* decoder parameters */
if(isMT){
for(int i = 0; i < decoder->nlayer; i++){
list.Add(&decoder->attentions[i].wq);
list.Add(&decoder->attentions[i].wk);
list.Add(&decoder->attentions[i].wv);
list.Add(&decoder->attentions[i].bq);
list.Add(&decoder->attentions[i].bk);
list.Add(&decoder->attentions[i].bv);
list.Add(&decoder->attentions[i].rp_embedding_k);
list.Add(&decoder->attentions[i].wa);
list.Add(&decoder->attentions[i].ba);
list.Add(&decoder->attLayerNorms[i].w);
list.Add(&decoder->attLayerNorms[i].b);
list.Add(&decoder->attentionsEnde[i].wq);
list.Add(&decoder->attentionsEnde[i].wk);
list.Add(&decoder->attentionsEnde[i].wv);
list.Add(&decoder->attentionsEnde[i].bq);
list.Add(&decoder->attentionsEnde[i].bk);
list.Add(&decoder->attentionsEnde[i].bv);
list.Add(&decoder->attentionsEnde[i].wa);
list.Add(&decoder->attentionsEnde[i].ba);
list.Add(&decoder->attEndeLayerNorms[i].w);
list.Add(&decoder->attEndeLayerNorms[i].b);
if (isMT) {
for (int i = 0; i < decoder->nlayer; i++) {
list.Add(&decoder->selfAtt[i].wq);
list.Add(&decoder->selfAtt[i].wk);
list.Add(&decoder->selfAtt[i].wv);
list.Add(&decoder->selfAtt[i].bq);
list.Add(&decoder->selfAtt[i].bk);
list.Add(&decoder->selfAtt[i].bv);
list.Add(&decoder->selfAtt[i].rp_embedding_k);
list.Add(&decoder->selfAtt[i].wo);
list.Add(&decoder->selfAtt[i].bo);
list.Add(&decoder->selfAttLayerNorms[i].w);
list.Add(&decoder->selfAttLayerNorms[i].b);
list.Add(&decoder->enDeAtt[i].wq);
list.Add(&decoder->enDeAtt[i].wk);
list.Add(&decoder->enDeAtt[i].wv);
list.Add(&decoder->enDeAtt[i].bq);
list.Add(&decoder->enDeAtt[i].bk);
list.Add(&decoder->enDeAtt[i].bv);
list.Add(&decoder->enDeAtt[i].wo);
list.Add(&decoder->enDeAtt[i].bo);
list.Add(&decoder->enDeAttLayerNorms[i].w);
list.Add(&decoder->enDeAttLayerNorms[i].b);
list.Add(&decoder->fnns[i].w1);
list.Add(&decoder->fnns[i].b1);
list.Add(&decoder->fnns[i].w2);
......@@ -441,8 +438,8 @@ void T2TModel::GetParams(TensorList &list)
list.Add(&decoder->fnns[i].fnnLayerNorm.w);
list.Add(&decoder->fnns[i].fnnLayerNorm.b);
}
list.Add(&decoder->decodeLayerNorm->w);
list.Add(&decoder->decodeLayerNorm->b);
list.Add(&decoder->decoderLayerNorm->w);
list.Add(&decoder->decoderLayerNorm->b);
}
/* shared embeddings */
......@@ -452,23 +449,23 @@ void T2TModel::GetParams(TensorList &list)
}
/*
dump the parameters
dump the parameters
>> fn - where to keep the model
>> model - the model
*/
void T2TModel::Dump(const char * fn)
void T2TModel::Dump(const char* fn)
{
double startT = GetClockSec();
FILE * file = fopen(fn, "wb");
FILE* file = fopen(fn, "wb");
CheckNTErrors(file, "Cannot open the model file");
TensorList params(100);
GetParams(params);
for(int i = 0; i < params.count; i++){
XTensor * p = (XTensor*)params.Get(i);
for (int i = 0; i < params.count; i++) {
XTensor* p = (XTensor*)params.Get(i);
p->Dump(file, "param:");
}
......@@ -480,38 +477,37 @@ void T2TModel::Dump(const char * fn)
}
/* read the parameters */
void T2TModel::Read(const char * fn)
void T2TModel::Read(const char* fn)
{
double startT = GetClockSec();
FILE * file = fopen(fn, "rb");
FILE* file = fopen(fn, "rb");
CheckNTErrors(file, "Cannot open the model file");
TensorList params(100);
GetParams(params);
//uint64_t* offsets = new uint64_t[params.Size()];
for(int i = 0; i < params.count; i++){
XTensor * p = (XTensor*)params.Get(i);
FastRead(p, file);
// p->Read(file, "");
}
///* number of parameter */
//uint64_t param_number;
//fread(&param_number, sizeof(param_number), 1, file);
//CheckNTErrors(param_number == params.Size(), "parameter number not matched");
fclose(file);
///* parameter offsets */
//fread(offsets, sizeof(offsets[0]), params.Size(), file);
double elapsed = GetClockSec() - startT;
///* parameter values */
//for (int i = 0; i < params.Size(); i++)
// params[i]->BinaryRead(file, offsets[i]);
XPRINT1(0, stderr, "[INFO] model loaded (took %.1fs)\n", elapsed);
}
void FastRead(XTensor* x, FILE* f) {
float * dataBuf = new float[x->unitNum];
fread(dataBuf, sizeof(char), sizeof(float) * x->unitNum, f);
x->SetData(dataBuf, x->unitNum);
//delete[] offsets;
for (int i = 0; i < params.Size(); i++)
params[i]->BinaryRead(file, 0);
delete[] dataBuf;
fclose(file);
double elapsed = GetClockSec() - startT;
XPRINT1(0, stderr, "[INFO] model loaded (took %.1fs)\n", elapsed);
}
}
\ No newline at end of file
......@@ -103,7 +103,7 @@ public:
/* read the parameters */
void Read(const char * fn);
};
void FastRead(XTensor* x, FILE* f);
}
#endif
......@@ -56,13 +56,11 @@ void T2TOutput::InitModel(int argc, char ** argv, int myDevID)
LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &hSize, DEFAULT_EMBEDDING_SIZE);
LoadParamFloat(argc, argv, "outputminmax", &minmax, 0.08F);
InitTensor2D(&w, hSize, vSize, X_FLOAT, devID);
InitTensor2DV2(&w, vSize, hSize, X_FLOAT, devID);
}
/*
make the network (redefined output tensor)
>> input - input tensor
......@@ -72,9 +70,7 @@ void T2TOutput::Make(XTensor &input, XTensor &output)
{
XTensor &x = input;
output = LogSoftmax(MMul(x, X_NOTRANS, w, X_NOTRANS), -1);
output.SetName(OUTPUT_NAME);
output = LogSoftmax(MMul(x, X_NOTRANS, w, X_TRANS), -1);
}
}
......@@ -15,9 +15,9 @@
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-13
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-13
*/
#include "T2TPredictor.h"
#include "../../tensor/core/CHeader.h"
......@@ -38,24 +38,24 @@ T2TStateBundle::T2TStateBundle()
/* de-constructor */
T2TStateBundle::~T2TStateBundle()
{
if(states != NULL)
if (states != NULL)
delete[] states;
}
/*
create states
/*
create states
>> num - number of states
*/
void T2TStateBundle::MakeStates(int num)
{
CheckNTErrors(num > 0, "invalid number");
if(states != NULL)
if (states != NULL)
delete[] states;
states = new T2TState[num];
for(int i = 0; i < num; i++){
for (int i = 0; i < num; i++) {
states[i].prediction = -1;
states[i].pid = T2T_PID_EMPTY;
states[i].isEnd = false;
......@@ -74,7 +74,7 @@ void T2TStateBundle::MakeStates(int num)
/* constructor */
T2TPredictor::T2TPredictor()
{
startSymbol = -1;
startSymbol = 2;
}
/* de-constructor */
......@@ -82,37 +82,44 @@ T2TPredictor::~T2TPredictor()
{
}
/*
create an initial state
/*
create an initial state
>> model - the t2t model
>> top - the top-most layer of the network
>> input - input of the network
>> beamSize - beam size
>> state - the state to be initialized
*/
void T2TPredictor::Create(T2TModel * model, XTensor * top, const XTensor * input, int beamSize, T2TStateBundle * state)
void T2TPredictor::Create(T2TModel* model, XTensor* top, const XTensor* input, int beamSize, T2TStateBundle* state)
{
int dims[MAX_TENSOR_DIM_NUM];
for (int i = 0; i < input->order - 1; i++)
dims[i] = input->GetDim(i);
dims[input->order - 1] = beamSize;
InitTensor(&state->probPath, input->order, dims, X_FLOAT, input->devID);
InitTensor(&state->nstep, input->order, dims, X_FLOAT, input->devID);
InitTensor(&state->endMark, input->order, dims, X_INT, input->devID);
InitTensorV2(&state->probPath, input->order, dims, X_FLOAT, 1.0F, input->devID);
InitTensorV2(&state->nstep, input->order, dims, X_FLOAT, 1.0F, input->devID);
InitTensorV2(&state->endMark, input->order, dims, X_INT, 1.0F, input->devID);
float* data = new float[state->probPath.unitNum];
/*float* data = new float[state->probPath.unitNum];
for (int i = 0; i < state->probPath.unitNum; ++i) {
data[i] = -1e20F;
if (i % beamSize == 0)
data[i] = 0;
}
state->probPath.SetData(data, state->probPath.unitNum);
delete[] data;*/
SetDataFixed(state->probPath, -1e9F);
for (int i = 0; i < state->probPath.unitNum; ++i) {
if (i % beamSize == 0)
state->probPath.Set(0.0F, i);
}
state->nstep.SetZeroAll();
state->endMark.SetZeroAll();
delete[] data;
state->stateNum = 0;
}
......@@ -125,15 +132,15 @@ void T2TPredictor::SetStartSymbol(int symbol)
startSymbol = symbol;
}
/*
read a state
/*
read a state
>> model - the t2t model that keeps the network created so far
>> state - a set of states. It keeps
1) hypotheses (states)
2) probablities of hypotheses
3) parts of the network for expanding toward the next state
*/
void T2TPredictor::Read(T2TModel * model, T2TStateBundle * state)
void T2TPredictor::Read(T2TModel* model, T2TStateBundle* state)
{
m = model;
s = state;
......@@ -147,8 +154,7 @@ predict the next state
>> paddingEnc - padding of the encoder
>>> isStart - is the start or not
*/
void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
XTensor * inputEnc, XTensor * paddingEnc, bool isStart)
void T2TPredictor::Predict(T2TStateBundle* next, XTensor* encoding, XTensor* inputEnc, XTensor* paddingEnc, bool isStart)
{
int dims[MAX_TENSOR_DIM_NUM];
......@@ -157,42 +163,43 @@ void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
/* the first token */
XTensor first;
CheckNTErrors(inputEnc->order >= 2, "Wrong order of the tensor!");
for(int i = 0; i < inputEnc->order - 1; i++)
for (int i = 0; i < inputEnc->order - 1; i++)
dims[i] = inputEnc->GetDim(i);
dims[inputEnc->order - 1] = 1;
InitTensor(&first, inputEnc->order, dims, X_INT, inputEnc->devID);
InitTensorV2(&first, inputEnc->order, dims, X_INT, 1.0F, inputEnc->devID);
SetDataFixedInt(first, startSymbol);
/* add a new word into the input sequence of the decoder side */
if (isStart) {
inputDec = Identity(first);
}
else{
else {
/* only pass one step to the decoder */
inputDec = GetLastPrediction(s);
inputDec.SetDevice(inputEnc->devID);
}
/* prediction probabilities */
XTensor &output = next->prob;
XTensor& output = next->prob;
XTensor decoding;
for(int i = 0; i < inputDec.order - 1; i++)
for (int i = 0; i < inputDec.order - 1; i++)
dims[i] = inputDec.GetDim(i);
dims[inputDec.order - 1] = inputDec.GetDim(-1);
XTensor paddingDec;
InitTensor(&paddingDec, inputDec.order, dims, X_INT, paddingEnc->devID);
InitTensorV2(&paddingDec, inputDec.order, dims, X_INT, 1.0F, paddingEnc->devID);
SetDataFixedInt(paddingDec, 1);
XTensor maskDec;
XTensor maskEncDec;
/* decoder mask */
m->MakeMTMaskDec(*inputEnc, inputDec, *paddingEnc, paddingDec, maskDec, maskEncDec, 0);
//m->MakeMTMaskDec(*inputEnc, inputDec, *paddingEnc, paddingDec, maskDec, maskEncDec, 0);
/* make the decoding network */
decoding = m->decoder->Make(inputDec, *encoding, NULL, maskEncDec, false);
......@@ -203,38 +210,38 @@ void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
m->outputLayer->Make(decoding, output);
}
/*
generate paths up to the states of the current step
/*
generate paths up to the states of the current step
>> state - state bundle of the current step
*/
XTensor T2TPredictor::GeneratePaths(T2TStateBundle * state)
XTensor T2TPredictor::GeneratePaths(T2TStateBundle* state)
{
CheckNTErrors(state->stateNum >= 0, "Illegal state!");
int distance = -1;
for(int i = 0; i < state->stateNum; i++){
T2TState * cur = state->states + i;
for (int i = 0; i < state->stateNum; i++) {
T2TState* cur = state->states + i;
int nsteps = 0;
while(cur != NULL){
while (cur != NULL) {
nsteps++;
cur = cur->last;
}
if(nsteps > distance)
if (nsteps > distance)
distance = nsteps;
}
XTensor path;
InitTensor2D(&path, state->stateNum, distance, X_INT);
InitTensor2DV2(&path, state->stateNum, distance, X_INT);
path.SetZeroAll();
for(int i = 0; i < state->stateNum; i++){
T2TState * cur = state->states + i;
for (int i = 0; i < state->stateNum; i++) {
T2TState* cur = state->states + i;
int nsteps = 0;
while(cur != NULL){
while (cur != NULL) {
nsteps++;
path.Set2DInt(cur->prediction, i, distance - nsteps);
cur = cur->last;
......@@ -253,7 +260,7 @@ XTensor T2TPredictor::GetLastPrediction(T2TStateBundle* state)
CheckNTErrors(state->stateNum >= 0, "Illegal state!");
XTensor lastPred;
InitTensor2D(&lastPred, state->stateNum, 1, X_INT);
InitTensor2DV2(&lastPred, state->stateNum, 1, X_INT);
for (int i = 0; i < state->stateNum; i++) {
T2TState* cur = state->states + i;
......
......@@ -15,10 +15,10 @@
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-13
* This is the first source file I create in 2019 - new start!
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-13
* This is the first source file I create in 2019 - new start!
*/
#ifndef __T2TPREDICTOR_H__
#define __T2TPREDICTOR_H__
......@@ -39,8 +39,8 @@ public:
/* we assume that the prediction is an integer */
int prediction;
/* id of the problem. One can regard it as the sentence id when we
translate a number of sentences in the batched manner. The hypothesis
/* id of the problem. One can regard it as the sentence id when we
translate a number of sentences in the batched manner. The hypothesis
is empty if id = -1 */
int pid;
......@@ -66,7 +66,7 @@ public:
int nstep;
/* pointer to the previous state */
T2TState * last;
T2TState* last;
};
/* a bundle of states */
......@@ -75,7 +75,7 @@ class T2TStateBundle
public:
/* predictions */
XTensor prediction;
/* id of the previous state that generates the current one */
XTensor preID;
......@@ -95,7 +95,7 @@ public:
XTensor nstep;
/* list of states */
T2TState * states;
T2TState* states;
/* number of states */
int stateNum;
......@@ -114,19 +114,19 @@ public:
void MakeStates(int num);
};
/* The predictor reads the current state and then predicts the next.
/* The predictor reads the current state and then predicts the next.
It is exactly the same procedure of MT inference -
we get the state of previous words and then generate the next word.
Here, a state can be regared as the representation of words (word
Here, a state can be regared as the representation of words (word
indices, hidden states, embeddings and etc.). */
class T2TPredictor
{
private:
/* pointer to the transformer model */
T2TModel * m;
T2TModel* m;
/* current state */
T2TStateBundle * s;
T2TStateBundle* s;
/* start symbol */
int startSymbol;
......@@ -139,19 +139,19 @@ public:
~T2TPredictor();
/* create an initial state */
void Create(T2TModel * model, XTensor * top, const XTensor * input, int beamSize, T2TStateBundle * state);
void Create(T2TModel* model, XTensor* top, const XTensor* input, int beamSize, T2TStateBundle* state);
/* set the start symbol */
void SetStartSymbol(int symbol);
/* read a state */
void Read(T2TModel * model, T2TStateBundle * state);
void Read(T2TModel* model, T2TStateBundle* state);
/* predict the next state */
void Predict(T2TStateBundle * next, XTensor * encoding, XTensor * inputEnc, XTensor * paddingEnc, bool isStart);
void Predict(T2TStateBundle* next, XTensor* encoding, XTensor* inputEnc, XTensor* paddingEnc, bool isStart);
/* generate paths up to the states of the current step */
XTensor GeneratePaths(T2TStateBundle * state);
XTensor GeneratePaths(T2TStateBundle* state);
/* get the predictions of the previous step */
XTensor GetLastPrediction(T2TStateBundle* state);
......
......@@ -15,9 +15,9 @@
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
*/
#include "T2TSearch.h"
#include "T2TUtility.h"
......@@ -27,7 +27,7 @@ using namespace nts;
namespace transformer
{
/* constructor */
T2TSearch::T2TSearch()
{
......@@ -38,15 +38,15 @@ T2TSearch::T2TSearch()
endSymbolNum = 0;
fullHypos = NULL;
endSymbols = new int[32];
startSymbol = -1;
startSymbol = 2;
}
/* de-constructor */
T2TSearch::~T2TSearch()
{
if(fullHypos != NULL)
if (fullHypos != NULL)
delete[] fullHypos;
if(endSymbols != NULL)
if (endSymbols != NULL)
delete[] endSymbols;
}
......@@ -55,7 +55,7 @@ initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
*/
void T2TSearch::Init(int argc, char ** argv)
void T2TSearch::Init(int argc, char** argv)
{
LoadParamInt(argc, argv, "beamsize", &beamSize, 1);
LoadParamInt(argc, argv, "batchsize", &batchSize, 1);
......@@ -63,18 +63,18 @@ void T2TSearch::Init(int argc, char ** argv)
LoadParamInt(argc, argv, "endid", endSymbols, 2);
LoadParamInt(argc, argv, "startid", &startSymbol, 2);
if(endSymbols[0] >= 0)
if (endSymbols[0] >= 0)
endSymbolNum = 1;
}
/*
search for the most promising states
/*
search for the most promising states
>> model - the transformer model
>> input - input of the model
>> padding - padding of the input
>> output - output that represents the sequences as rows
*/
void T2TSearch::Search(T2TModel * model, XTensor * input, XTensor * padding, XTensor * output)
void T2TSearch::Search(T2TModel* model, XTensor* input, XTensor* padding, XTensor* output)
{
T2TPredictor predictor;
XTensor maskEnc;
......@@ -86,11 +86,11 @@ void T2TSearch::Search(T2TModel * model, XTensor * input, XTensor * padding, XTe
CheckNTErrors(endSymbolNum > 0, "The search class is not initialized!");
CheckNTErrors(startSymbol >= 0, "The search class is not initialized!");
Prepare(input->unitNum/input->GetDim(-1), beamSize);
Prepare(input->unitNum / input->GetDim(-1), beamSize);
/* encoder mask */
model->MakeMTMaskEnc(*input, *padding, maskEnc);
//model->MakeMTMaskEnc(*input, *padding, maskEnc);
/* make the encoding network */
encoding = model->MakeEncoder(*input, &maskEnc, false);
......@@ -101,11 +101,11 @@ void T2TSearch::Search(T2TModel * model, XTensor * input, XTensor * padding, XTe
encodingBeam.ReshapeMerged(encodingBeam.order - 4);
inputBeam.ReshapeMerged(inputBeam.order - 3);
paddingBeam.ReshapeMerged(paddingBeam.order - 3);
/* max output-length = 2 * source-length */
maxLength = input->GetDim(-1) * 2;
CheckNTErrors(maxLength > 0, "no max length specified!");
T2TStateBundle* states = new T2TStateBundle[maxLength + 1];
T2TStateBundle* first = states;
T2TStateBundle* cur;
......@@ -118,7 +118,7 @@ void T2TSearch::Search(T2TModel * model, XTensor * input, XTensor * padding, XTe
first->isStart = true;
/* generate the sequence from left to right */
for(int i = 0 ; i < maxLength; i++){
for (int i = 0; i < maxLength; i++) {
cur = states + i;
next = states + i + 1;
......@@ -126,7 +126,7 @@ void T2TSearch::Search(T2TModel * model, XTensor * input, XTensor * padding, XTe
predictor.Read(model, cur);
/* predict the next state */
predictor.Predict(next, &encodingBeam, &inputBeam, &paddingBeam, i==0);
predictor.Predict(next, &encodingBeam, &inputBeam, &paddingBeam, i == 0);
/* compute the model score (given the prediction probability) */
Score(cur, next);
......@@ -143,13 +143,13 @@ void T2TSearch::Search(T2TModel * model, XTensor * input, XTensor * padding, XTe
/* fill the heap with imcomplete hypotheses if neccesary */
FillHeap(next);
Dump(output);
delete[] states;
}
/*
/*
prepare for search
>> batchSize - size of the batch
>> beamSize - size of the beam
......@@ -168,102 +168,100 @@ void T2TSearch::Prepare(int myBatchSize, int myBeamSize)
fullHypos[i].Init(beamSize);
}
/*
compute the model score for each hypothesis
/*
compute the model score for each hypothesis
>> prev - the beam of the previous state
>> beam - the beam that keeps a number of states
*/
void T2TSearch::Score(T2TStateBundle * prev, T2TStateBundle * beam)
void T2TSearch::Score(T2TStateBundle* prev, T2TStateBundle* beam)
{
XTensor &score = beam->modelScore;
XTensor &prob = beam->prob;
XTensor &probPath = beam->probPath;
XTensor &probPathPrev = prev->probPath;
XTensor &lenPrev = prev->nstep;
XTensor &len = beam->nstep;
XTensor& score = beam->modelScore;
XTensor& prob = beam->prob;
XTensor& probPath = beam->probPath;
XTensor& probPathPrev = prev->probPath;
XTensor& lenPrev = prev->nstep;
XTensor& len = beam->nstep;
XTensor lp;
XTensor mask;
int order = prob.order;
int outputSize = prob.GetDim(-1);
int dims[MAX_TENSOR_DIM_NUM];
for(int i = 0; i < order; i++)
for (int i = 0; i < order; i++)
dims[i] = prob.GetDim(i);
InitTensor(&score, &prob);
InitTensor(&probPath, &prob);
prob.Reshape(prob.unitNum/outputSize, outputSize);
score.Reshape(score.unitNum/outputSize, outputSize);
InitTensorV2(&score, &prob);
InitTensorV2(&probPath, &prob);
prob.Reshape(prob.unitNum / outputSize, outputSize);
score.Reshape(score.unitNum / outputSize, outputSize);
probPath.Reshape(score.unitNum / outputSize, outputSize);
probPathPrev.Reshape(probPathPrev.unitNum);
/* the log-scale probability of the entire sequence */
_SumDim(&prob, &probPathPrev, &probPath, 0);
InitTensor(&len, &lenPrev);
InitTensor(&lp, &lenPrev);
InitTensorV2(&len, &lenPrev);
InitTensorV2(&lp, &lenPrev);
_ScaleAndShift(&lenPrev, &len, 1.0F, 1.0F);
/* the GNMT-like length penalty */
//lp = T2TLengthPenalizer::GNMT(len, alpha);
lp = T2TLengthPenalizer::GNMT(len, alpha);
//lp.Reshape(lp.unitNum);
lp.Reshape(lp.unitNum);
/* score = log-prob/lp */
//_DivDim(&probPath, &lp, &score, 0);
_DivDim(&probPath, &lp, &score, 0);
if (prev->isStart) {
XTensor firstMask = MakeFirstMask(beam);
XTensor firstMask;
firstMask = MakeFirstMask(beam);
firstMask.Reshape(firstMask.unitNum);
/* mask the hypotheses in the beam except the first one */
_SumDim(&score, &firstMask, &score, 0);
}
InitTensor(&mask,
prev->endMark.order, prev->endMark.dimSize, X_FLOAT,
prev->endMark.devID);
mask.SetZeroAll();
InitTensorV2(&mask, prev->endMark.order, prev->endMark.dimSize, X_FLOAT, 1.0F, prev->endMark.devID);
mask.SetZeroAll();
_SetDataFixedCond(&mask, &prev->endMark, -1e9F);
mask.Reshape(mask.unitNum);
/* mask the completed hypotheses so that they cannot
/* mask the completed hypotheses so that they cannot
be involved in further sorting and beam search. */
_SumDim(&score, &mask, &score, 0);
prob.Reshape(order, dims);
score.Reshape(order, dims);
probPath.Reshape(order, dims);
probPathPrev.Reshape(order - 1, dims);
lp.Reshape(order - 1, dims);
mask.Reshape(order -1 , dims);
mask.Reshape(order - 1, dims);
}
/*
/*
generate tokens for the next state via beam pruning
>> beam - the beam that keeps a number of states
*/
void T2TSearch::Generate(T2TStateBundle * beam)
void T2TSearch::Generate(T2TStateBundle* beam)
{
int dims[MAX_TENSOR_DIM_NUM];
int dimsBeam[MAX_TENSOR_DIM_NUM];
int dimsTopK[MAX_TENSOR_DIM_NUM];
XTensor scoreTopK;
XTensor &score = beam->modelScore;
XTensor &index = beam->prediction;
XTensor &preID = beam->preID;
XTensor &probPath = beam->probPath;
XTensor &prob = beam->prob;
XTensor& score = beam->modelScore;
XTensor& index = beam->prediction;
XTensor& preID = beam->preID;
XTensor& probPath = beam->probPath;
XTensor& prob = beam->prob;
int order = score.order;
CheckNTErrors(order >= 3, "The tensor must be of order 2 or larger.");
CheckNTErrors(dimsBeam[order - 3] % beamSize == 0, "Wrong dimension size!");
for (int i = 0; i < order; i++) {
dims[i] = score.GetDim(i);
dimsBeam[i] = score.GetDim(i);
......@@ -277,15 +275,15 @@ void T2TSearch::Generate(T2TStateBundle * beam)
dimsBeam[order - 1] *= beamSize;
dimsTopK[order - 3] = dimsBeam[order - 3];
dimsTopK[order - 1] = beamSize;
InitTensor(&scoreTopK, order, dimsTopK, score.dataType, score.devID);
InitTensor(&index, order, dimsTopK, X_INT, score.devID);
InitTensor(&preID, order, dimsTopK, X_INT, -1);
InitTensorV2(&scoreTopK, order, dimsTopK, score.dataType, 1.0F, score.devID);
InitTensorV2(&index, order, dimsTopK, X_INT, 1.0F, score.devID);
InitTensorV2(&preID, order, dimsTopK, X_INT, 1.0F, -1);
/* mask the first and the padding id */
int dimMask[]{ score.GetDim(-1) };
XTensor mask;
InitTensor(&mask, 1, dimMask, X_FLOAT, -1);
InitTensorV2(&mask, 1, dimMask, X_FLOAT, 1.0F, -1);
mask.SetZeroAll();
mask.Set1D(-1e20F, 0);
mask.Set1D(-1e20F, 1);
......@@ -293,21 +291,21 @@ void T2TSearch::Generate(T2TStateBundle * beam)
//_SumDim(&score, &mask, 2);
score.Reshape(order, dimsBeam);
/* keep the most promissing candidates in the beam */
/* TODO: check this line */
TopK(score, scoreTopK, index, -1, beamSize);
CopyValues(index, preID);
/* "preID" represents the id (or the offset) of the previous state used to make the current
hypothesis. Note that we reshape the "score" tensor into a matrix where each
row means a previous state. The column number is size-of-beam \times vocab-size. We,
therefore, divide entries of the top-k index by vocab-size to compute the id of the
previous state for each hypothesis in the top-k list. */
DescaleMe(preID, sizeVocab);
/* Then, we do something similar to "preID". For the top-k predictions, we need
/* Then, we do something similar to "preID". For the top-k predictions, we need
to know their indices in the vocabulary. We compute the offset of each prediction
in the vocabulary by dividing it with vocab-size and computing the remainder. */
ModMe(index, sizeVocab);
......@@ -315,7 +313,7 @@ void T2TSearch::Generate(T2TStateBundle * beam)
score.Reshape(order, dims);
/* we keep the top-k scores */
InitTensor(&score, &scoreTopK);
InitTensorV2(&score, &scoreTopK);
CopyValues(scoreTopK, score);
/* CPU data (TODO: remove GPU->CPU data copy!!!) */
......@@ -334,9 +332,9 @@ void T2TSearch::Generate(T2TStateBundle * beam)
/* sequence probability of top-k candidates */
XTensor probPathTopK;
InitTensor(&probPathTopK, &scoreTopK);
InitTensorV2(&probPathTopK, &scoreTopK);
XTensor probTopK;
InitTensor(&probTopK, &scoreTopK);
InitTensorV2(&probTopK, &scoreTopK);
for (int i = 0; i < probPath.order; i++) {
dims[i] = probPath.GetDim(i);
......@@ -362,38 +360,38 @@ void T2TSearch::Generate(T2TStateBundle * beam)
prob = probTopK;
}
/*
expand the search graph
/*
expand the search graph
>> beam - the beam that keeps a number of states
*/
void T2TSearch::Expand(T2TStateBundle * prev, T2TStateBundle * beam)
void T2TSearch::Expand(T2TStateBundle* prev, T2TStateBundle* beam)
{
CheckNTErrors(beam->prediction.unitNum == beam->preID.unitNum, "A problem occurs in the beam!");
beam->MakeStates(beam->prediction.unitNum);
T2TState * states = beam->states;
XTensor & idRef = beam->preID;
XTensor & modelScoreRef = beam->modelScore;
XTensor & probRef = beam->prob;
XTensor & probPathRef = beam->probPath;
XTensor & predictionRef = beam->prediction;
XTensor & endMark = beam->endMark;
T2TState* states = beam->states;
XTensor& idRef = beam->preID;
XTensor& modelScoreRef = beam->modelScore;
XTensor& probRef = beam->prob;
XTensor& probPathRef = beam->probPath;
XTensor& predictionRef = beam->prediction;
XTensor& endMark = beam->endMark;
XTensor id;
XTensor modelScore;
XTensor prob;
XTensor probPath;
XTensor prediction;
XTensor endMarkCPU;
InitTensorOnCPU(&id, &idRef);
InitTensorOnCPU(&modelScore, &modelScoreRef);
InitTensorOnCPU(&prob, &probRef);
InitTensorOnCPU(&probPath, &probPathRef);
InitTensorOnCPU(&prediction, &predictionRef);
InitTensorOnCPU(&endMarkCPU, &predictionRef);
InitTensor(&endMark, &predictionRef);
InitTensorV2(&endMark, &predictionRef);
/* we copy the data to CPU because the frequent access to GPU is slow
and we can speed-up the process by doing the job on CPU. */
CopyValues(idRef, id);
......@@ -403,19 +401,19 @@ void T2TSearch::Expand(T2TStateBundle * prev, T2TStateBundle * beam)
CopyValues(predictionRef, prediction);
CheckNTErrors(beam->stateNum == id.unitNum, "Errors occur in counting!");
/* Related variables are kept on the states of the graph. All these are
maintained on CPUs to ease the implementation of frequent access and
modification of the states. An alternative is to do this on GPUs but
/* Related variables are kept on the states of the graph. All these are
maintained on CPUs to ease the implementation of frequent access and
modification of the states. An alternative is to do this on GPUs but
it needs much more coding work and the speed-up is not obvious. */
for(int i = 0; i < beam->stateNum; i += beamSize){
for (int i = 0; i < beam->stateNum; i += beamSize) {
for (int j = 0; j < beamSize; j++) {
int k = i + j;
T2TState & state = states[k];
T2TState& state = states[k];
int offset = id.GetInt(k);
int pid = i / beamSize;
T2TState * last = prev->states + pid * beamSize + offset;
T2TState* last = prev->states + pid * beamSize + offset;
CheckNTErrors(offset >= 0, "Wrong state index!");
......@@ -457,48 +455,48 @@ void T2TSearch::Expand(T2TStateBundle * prev, T2TStateBundle * beam)
CopyValues(endMarkCPU, endMark);
}
/*
collect hypotheses with ending symbols. Given a beam of hypotheses,
we remove the finished hypotheses and keep them in a heap.
>> beam - the beam that keeps a number of states
*/
void T2TSearch::Collect(T2TStateBundle* beam)
{
    T2TState* states = beam->states;

    for (int i = 0; i < beam->stateNum; i++) {
        T2TState& state = states[i];

        CheckNTErrors(state.pid >= 0 && state.pid < batchSize,
                      "Invalid sample id!");

        /* we push the hypothesis into the heap when it is completed
           (isEnd != 0 marks a hypothesis that ends with an end symbol) */
        if (state.isEnd != 0)
            fullHypos[state.pid].Push(HeapNode<float>(&state, state.modelScore));
    }
}
/*
fill the hypotheis heap with incomplete hypotheses
/*
fill the hypotheis heap with incomplete hypotheses
>> beam - the beam that keeps a number of states (final)
*/
void T2TSearch::FillHeap(T2TStateBundle * beam)
void T2TSearch::FillHeap(T2TStateBundle* beam)
{
bool * emptyFlags = new bool[batchSize];
bool* emptyFlags = new bool[batchSize];
for (int i = 0; i < batchSize; i++)
emptyFlags[i] = (fullHypos[i].Count() == 0);
T2TState * states = beam->states;
T2TState* states = beam->states;
for (int i = 0; i < beam->stateNum; i++) {
T2TState & state = states[i];
T2TState& state = states[i];
CheckNTErrors(state.pid >= 0 && state.pid < batchSize,
"Invalid sample id!");
"Invalid sample id!");
/* we push the imcomplete hypothesis into the heap */
if (emptyFlags[state.pid] && state.isEnd == 0)
......@@ -508,32 +506,32 @@ void T2TSearch::FillHeap(T2TStateBundle * beam)
delete[] emptyFlags;
}
/*
save the output sequences in a tensor
/*
save the output sequences in a tensor
>> output - output sequences (for return)
*/
void T2TSearch::Dump(XTensor * output)
void T2TSearch::Dump(XTensor* output)
{
int dims[3] = {batchSize, beamSize, maxLength};
int * words = new int[maxLength];
int dims[3] = { batchSize, beamSize, maxLength };
int* words = new int[maxLength];
InitTensor(output, 3, dims, X_INT);
InitTensorV2(output, 3, dims, X_INT);
SetDataFixedInt(*output, -1);
/* heap for an input sentence in the batch */
for(int h = 0; h < batchSize; h++){
for (int h = 0; h < batchSize; h++) {
XHeap<MIN_HEAP, float> &heap = fullHypos[h];
XHeap<MIN_HEAP, float>& heap = fullHypos[h];
/* for each output in the beam */
for(int i = 0; i < beamSize && heap.Count() > 0; i++){
T2TState * state = (T2TState *)heap.Pop().index;
for (int i = 0; i < beamSize && heap.Count() > 0; i++) {
T2TState* state = (T2TState*)heap.Pop().index;
int count = 0;
bool isCompleted = true;
/* we track the state from the end to the beginning */
while(state != NULL){
while (state != NULL) {
if (!state->isCompleted)
isCompleted = false;
if (isCompleted)
......@@ -544,7 +542,7 @@ void T2TSearch::Dump(XTensor * output)
}
/* dump the sentence to the output tensor */
for(int w = 0; w < count; w++)
for (int w = 0; w < count; w++)
output->Set3DInt(words[count - w - 1], h, beamSize - i - 1, w);
}
}
......@@ -552,38 +550,38 @@ void T2TSearch::Dump(XTensor * output)
delete[] words;
}
/*
check if the token is an end symbol
>> token - token to be checked
<< return - true if the token is one of the registered end symbols
*/
bool T2TSearch::IsEnd(int token)
{
    CheckNTErrors(endSymbolNum > 0, "No end symbol?");

    /* linear scan is fine: the end-symbol set is tiny */
    for (int i = 0; i < endSymbolNum; i++) {
        if (endSymbols[i] == token)
            return true;
    }

    return false;
}
/*
/*
set end symbols for search
>> tokens - end symbols
>> tokenNum - number of the end symbols
*/
void T2TSearch::SetEnd(const int * tokens, const int tokenNum)
void T2TSearch::SetEnd(const int* tokens, const int tokenNum)
{
if(endSymbols != NULL)
if (endSymbols != NULL)
delete[] endSymbols;
if(tokenNum <= 0)
if (tokenNum <= 0)
return;
/* we may have multiple end symbols */
tokens = new int[tokenNum];
for(int i = 0; i < tokenNum; i++)
for (int i = 0; i < tokenNum; i++)
endSymbols[i] = tokens[i];
endSymbolNum = tokenNum;
}
......@@ -592,9 +590,9 @@ void T2TSearch::SetEnd(const int * tokens, const int tokenNum)
make a mask to prevent duplicated entries in beam expansion for the first position
>> beam - the beam that keeps the searching states
*/
XTensor T2TSearch::MakeFirstMask(T2TStateBundle * beam)
XTensor T2TSearch::MakeFirstMask(T2TStateBundle* beam)
{
XTensor &prob = beam->prob;
XTensor& prob = beam->prob;
XTensor mask;
int order = prob.order;
......@@ -602,7 +600,7 @@ XTensor T2TSearch::MakeFirstMask(T2TStateBundle * beam)
for (int i = 0; i < order - 1; i++)
dims[i] = prob.GetDim(i);
InitTensor(&mask, order - 1, dims, X_FLOAT);
InitTensorV2(&mask, order - 1, dims, X_FLOAT);
mask.SetZeroAll();
for (int i = 0; i < mask.unitNum; i++) {
......
......@@ -15,9 +15,9 @@
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
*/
#ifndef __T2TSEARCH_H__
#define __T2TSEARCH_H__
......@@ -40,10 +40,10 @@ private:
/* predictor */
T2TPredictor predictor;
/* max length of the generated sequence */
int maxLength;
/* beam size */
int beamSize;
......@@ -51,10 +51,10 @@ private:
int batchSize;
/* we keep the final hypotheses in a heap for each sentence in the batch. */
XHeap<MIN_HEAP, float> * fullHypos;
XHeap<MIN_HEAP, float>* fullHypos;
/* array of the end symbols */
int * endSymbols;
int* endSymbols;
/* number of the end symbols */
int endSymbolNum;
......@@ -68,42 +68,42 @@ public:
/* de-constructor */
~T2TSearch();
/* initialize the model */
void Init(int argc, char ** argv);
void Init(int argc, char** argv);
/* search for the most promising states */
void Search(T2TModel * model, XTensor * input, XTensor * padding, XTensor * output);
void Search(T2TModel* model, XTensor* input, XTensor* padding, XTensor* output);
/* preparation */
void Prepare(int myBatchSize,int myBeamSize);
void Prepare(int myBatchSize, int myBeamSize);
/* compute the model score for each hypothesis */
void Score(T2TStateBundle * prev, T2TStateBundle * beam);
void Score(T2TStateBundle* prev, T2TStateBundle* beam);
/* generate token indices via beam pruning */
void Generate(T2TStateBundle * beam);
void Generate(T2TStateBundle* beam);
/* expand the search graph */
void Expand(T2TStateBundle * prev, T2TStateBundle * beam);
void Expand(T2TStateBundle* prev, T2TStateBundle* beam);
/* collect hypotheses with ending symbol */
void Collect(T2TStateBundle * beam);
void Collect(T2TStateBundle* beam);
/* fill the hypotheis heap with incomplete hypothses */
void FillHeap(T2TStateBundle * beam);
void FillHeap(T2TStateBundle* beam);
/* save the output sequences in a tensor */
void Dump(XTensor * output);
void Dump(XTensor* output);
/* check if the token is an end symbol */
bool IsEnd(int token);
/* set end symbols for search */
void SetEnd(const int * tokens, const int tokenNum);
void SetEnd(const int* tokens, const int tokenNum);
/* make a mask to prevent duplicated entries in beam expansion for the first position */
XTensor MakeFirstMask(T2TStateBundle * beam);
XTensor MakeFirstMask(T2TStateBundle* beam);
};
}
......
......@@ -15,9 +15,9 @@
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
*/
#include <math.h>
#include "T2TUtility.h"
......@@ -44,23 +44,23 @@ T2TTester::~T2TTester()
}
/* initialize the model */
void T2TTester::Init(int argc, char ** argv)
void T2TTester::Init(int argc, char** argv)
{
LoadParamInt(argc, argv, "vsize", &vSize, 34040);
LoadParamInt(argc, argv, "vsizetgt", &vSizeTgt, vSize);
LoadParamInt(argc, argv, "sentbatch", &sentBatch, 1);
LoadParamBool(argc, argv, "sort", &batchLoader.sortBuffer, true);
seacher.Init(argc, argv);
}
/*
/*
test the model
>> fn - test data file
>> ofn - output data file
>> model - model that is trained
*/
void T2TTester::Test(const char * fn, const char * ofn, T2TModel * model)
void T2TTester::Test(const char* fn, const char* ofn, T2TModel* model)
{
int wc = 0;
int wordCount = 0;
......@@ -86,7 +86,7 @@ void T2TTester::Test(const char * fn, const char * ofn, T2TModel * model)
int* seqs = new int[MILLION];
batchLoader.Init(fn);
int count = 0;
while (!batchLoader.IsEmpty())
......@@ -94,23 +94,23 @@ void T2TTester::Test(const char * fn, const char * ofn, T2TModel * model)
count++;
wordCount = 0;
for (int i = 0; i < model->decoder->nlayer; ++i) {
model->decoder->selfCache[i].miss = true;
model->decoder->contextCache[i].miss = true;
model->decoder->selfAttCache[i].miss = true;
model->decoder->enDeAttCache[i].miss = true;
}
vector<int> indices = batchLoader.LoadBatch(&batchEnc, &paddingEnc, sentBatch, devID);
XTensor output;
seacher.Search(model, &batchEnc, &paddingEnc, &output);
output.Dump(stderr);
for (int i = 0; i < indices.size(); ++i) {
Result res;
XTensor sent, srcIdx, tgtIdx;
InitTensor1D(&srcIdx, 1, X_INT, output.devID);
int idx[]{i};
InitTensor1DV2(&srcIdx, 1, X_INT, output.devID);
int idx[]{ i };
srcIdx.SetData(idx, 1);
InitTensor(&tgtIdx, &srcIdx);
InitTensorV2(&tgtIdx, &srcIdx);
SetAscendingOrder(tgtIdx, 0);
sent = CopyIndexed(output, 0, srcIdx, tgtIdx);
......@@ -127,9 +127,9 @@ void T2TTester::Test(const char * fn, const char * ofn, T2TModel * model)
if (batchCount % 1 == 0) {
double elapsed = GetClockSec() - startT;
XPRINT3(0, stderr,
"[INFO] elapsed=%.1fs, sentence=%d, sword=%d\n",
elapsed, sentCount, wordCount);
XPRINT3(0, stderr,
"[INFO] elapsed=%.1fs, sentence=%d, sword=%d\n",
elapsed, sentCount, wordCount);
}
}
......@@ -138,11 +138,11 @@ void T2TTester::Test(const char * fn, const char * ofn, T2TModel * model)
for (auto res : batchLoader.resBuffer) {
Dump(ofile, &res.values);
}
fclose(ofile);
delete[] seqs;
double elapsed = GetClockSec() - startT;
XPRINT3(0, stderr, "[INFO] test finished (took %.1fs, word=%d, sent=%d)\n", elapsed, wordCountTotal, sentCount);
......@@ -153,7 +153,7 @@ dump the result into the file
>> file - data file
>> output - output tensor
*/
void T2TTester::Dump(FILE * file, XTensor * output)
void T2TTester::Dump(FILE* file, XTensor* output)
{
int seqLength = output->GetDim(-1);
......
......@@ -15,10 +15,10 @@
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
* A week with no trips :)
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
* A week with no trips :)
*/
#ifndef __T2TTESTER_H__
#define __T2TTESTER_H__
......@@ -41,7 +41,7 @@ public:
/* batch size for sentences */
int sentBatch;
/* for batching */
DataSet batchLoader;
......@@ -56,13 +56,13 @@ public:
~T2TTester();
/* initialize the model */
void Init(int argc, char ** argv);
void Init(int argc, char** argv);
/* test the model */
void Test(const char * fn, const char * ofn, T2TModel * model);
void Test(const char* fn, const char* ofn, T2TModel* model);
/* dump the result into the file */
void Dump(FILE * file, XTensor * output);
void Dump(FILE* file, XTensor* output);
};
}
......
......@@ -38,7 +38,7 @@ namespace transformer
{
/* entrance of the program */
int TransformerMain(int argc, const char ** argv);
int TransformerMain(int argc, const char** argv);
}
......
......@@ -28,6 +28,7 @@
#include "XList.h"
#include "XGlobal.h"
/* the nts (NiuTrans.Tensor) namespace */
namespace nts {
......@@ -363,6 +364,8 @@ template struct TensorListBase<long>;
template struct TensorListBase<float>;
template struct TensorListBase<short>;
template struct TensorListBase<XTensor*>;
template struct TensorListBase<uint64_t>;
template struct TensorListBase<void*>;
} /* end of the nts (NiuTrans.Tensor) namespace */
\ No newline at end of file
......@@ -26,6 +26,8 @@
#include "XMem.h"
#include "XGlobal.h"
#include <cstdint>
#ifndef __TensorList_H__
#define __TensorList_H__
......@@ -118,7 +120,14 @@ public:
void Shuffle(int nround = 10, int beg = -1, int len = 0);
/* short */
/* indexing operator; negative indices count back from the end of the list */
T& operator[] (int i) {
    CheckNTErrors(i >= -count && i < count, "Index of a list item is out of scope!");
    CheckNTErrors(count > 0, "Cannt index the item in an empty list!");
    if (i < 0)
        return items[count + i];
    else
        return items[i];
};
T& Get(int i) { return GetItem(i); };
void Set(int i, T item) { SetItem(i, item); };
};
......@@ -132,7 +141,7 @@ typedef TensorListBase<char*> StrList;
typedef TensorListBase<long> LongList;
typedef TensorListBase<float> FloatList;
typedef TensorListBase<short> ShortList;
typedef TensorListBase<uint64_t> UInt64List;
typedef TensorListBase<XTensor*> TensorList;
} /* end of the nts (NiuTrans.Tensor) namespace */
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -15,16 +15,16 @@
* limitations under the License.
*/
/*
*
* implementation of tensors used in this work. It it is the basis of XMatrix
* and XVector
*
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2017-07-31
* $Update by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2017-11-18 bug fixes
*
*/
/*
*
* implementation of tensors used in this work. It it is the basis of XMatrix
* and XVector
*
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2017-07-31
* $Update by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2017-11-18 bug fixes
*
*/
#include <stdio.h>
#include <stdlib.h>
......@@ -53,7 +53,7 @@
#ifdef USE_CUDA
// the CUDA stuff
// the CUDA stuff
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda.h>
......@@ -64,7 +64,7 @@
#endif
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
namespace nts {
int tensorIDGlobal = 0;
MUTEX_HANDLE tensorMutex;
......@@ -73,11 +73,11 @@ XTensor NULLTensor;
/* generate a tensor id */
int MakeTensorID()
{
if(tensorIDGlobal == 0)
if (tensorIDGlobal == 0)
MUTEX_INIT(tensorMutex);
MUTEX_LOCK(tensorMutex);
int id = tensorIDGlobal++;
int id = tensorIDGlobal++;
MUTEX_UNLOCK(tensorMutex);
return id;
......@@ -91,13 +91,13 @@ XTensor::XTensor()
id = MakeTensorID();
isDefaultDType = true;
isInGlobalMem = false;
isInGlobalMem = false;
isInit = false;
isTmp = false;
isTmp = false;
}
/* constructor */
XTensor::XTensor(const XTensor * reference)
XTensor::XTensor(const XTensor* reference)
{
Init();
SetDataPointer();
......@@ -106,13 +106,13 @@ XTensor::XTensor(const XTensor * reference)
InitTensorV2(this, reference);
}
/*
constructor
/*
constructor
>> myOrder - order of the tensor
>> myDevID - device id
>> myMem - memory pool used to allocating the data array
*/
XTensor::XTensor(const int myOrder, int myDevID, XMem * myMem)
XTensor::XTensor(const int myOrder, int myDevID, XMem* myMem)
{
CheckNTErrors((myOrder >= 0), "Illegal tensor order1");
......@@ -125,8 +125,8 @@ XTensor::XTensor(const int myOrder, int myDevID, XMem * myMem)
devID = myMem == NULL ? myDevID : myMem->devID;
}
/*
constructor
/*
constructor
>> myOrder - order of the tensor
>> myDimSize - size of each dimension
>> myDataType - unit size (e.g., int, float, and double)
......@@ -134,8 +134,8 @@ constructor
>> myDevID - device id
>> myMem - memory pool used to allocating the data array
*/
XTensor::XTensor(const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType,
const float myDenseRatio, int myDevID, XMem * myMem)
XTensor::XTensor(const int myOrder, const int* myDimSize, const TENSOR_DATA_TYPE myDataType,
const float myDenseRatio, int myDevID, XMem* myMem)
{
Init();
SetDataPointer();
......@@ -145,12 +145,12 @@ XTensor::XTensor(const int myOrder, const int * myDimSize, const TENSOR_DATA_TYP
mem = myMem;
devID = myMem != NULL ? myMem->devID : myDevID;
if(order >= 0)
if (order >= 0)
Resize(myOrder, myDimSize, myDataType, myDenseRatio);
}
/* copy constructor */
XTensor::XTensor(const XTensor &reference)
XTensor::XTensor(const XTensor& reference)
{
Init();
SetDataPointer();
......@@ -158,13 +158,13 @@ XTensor::XTensor(const XTensor &reference)
ShallowCopy(reference);
data = NULL;
dataHost = NULL;
if(reference.isTmp){
if (reference.isTmp) {
devID = reference.devID;
mem = reference.mem;
data = reference.data;
signature = reference.signature;
/* what we really want to do is "reference.data = NULL;"
As "reference" is constant, we cannot reset "reference.data"
here. So we save the ADDRESS of "reference.data" in
......@@ -172,26 +172,26 @@ XTensor::XTensor(const XTensor &reference)
This is VERY tricky and there might be better solutions :) */
*reference.dataP = NULL;
}
else{
else {
devID = reference.devID;
mem = reference.mem;
InitTensorV2(this, &reference);
_CopyValues(&reference, this);
}
if(reference.isTmp)
if (reference.isTmp)
XLink::Replace(&reference, this);
else{
else {
CheckNTErrors(outgo.tailNum == 0, "The node has outgoing edge to other nodes!");
XLink::CopyIncoming(&reference, this);
}
isInit = true;
isTmp = reference.isTmp;
isTmp = reference.isTmp;
}
/* copy constructor (with right value reference) */
XTensor::XTensor(const XTensor &&reference)
XTensor::XTensor(const XTensor&& reference)
{
Init();
SetDataPointer();
......@@ -199,12 +199,12 @@ XTensor::XTensor(const XTensor &&reference)
ShallowCopy(reference);
data = NULL;
dataHost = NULL;
devID = reference.devID;
mem = reference.mem;
data = reference.data;
signature = reference.signature;
/* what we really want to do is "reference.data = NULL;"
As "reference" is constant, we cannot reset "reference.data"
here. So we save the ADDRESS of "reference.data" in
......@@ -215,7 +215,7 @@ XTensor::XTensor(const XTensor &&reference)
XLink::Replace(&reference, this);
isInit = true;
isTmp = reference.isTmp;
isTmp = reference.isTmp;
}
/* de-constructor */
......@@ -225,30 +225,30 @@ XTensor::~XTensor()
the connectivity of the graph. To kill memory
leak, we release the data of the new tensor
when its parent is deleted (see ClearIncoming). */
if(outgo.tailNum > 0){
if (outgo.tailNum > 0) {
int dims[MAX_TENSOR_DIM_NUM];
memcpy(dims, dimSize, order * sizeof(int));
dims[0] = -dims[0];
XTensor * newTensor = new XTensor(order, dims, dataType, denseRatio, devID, mem);
XTensor* newTensor = new XTensor(order, dims, dataType, denseRatio, devID, mem);
newTensor->SetTMPFlag();
newTensor->data = data;
data = NULL;
XLink::Replace(this, newTensor);
}
XLink::ClearOutgoing(this);
XLink::ClearIncoming(this);
DestroyData();
if(grad != NULL)
if (grad != NULL)
delete grad;
}
/* set the name of the tensor */
void XTensor::SetName(const char * myName)
void XTensor::SetName(const char* myName)
{
strcpy(name, myName);
}
......@@ -277,10 +277,10 @@ void XTensor::Init()
isInGlobalMem = false;
memset(isAllValued, 0, sizeof(bool) * MAX_TENSOR_DIM_NUM);
isInit = false;
isTmp = false;
isTmp = false;
isGrad = false;
isVar = false;
enableGrad = true;
isVar = false;
enableGrad = X_ENABLE_GRAD;
visitMark = 0;
grad = NULL;
}
......@@ -288,26 +288,26 @@ void XTensor::Init()
/* delete data arrays */
void XTensor::DestroyData()
{
if(data != NULL && mem == NULL && !isShared)
if (data != NULL && mem == NULL && !isShared)
XMemFree(devID, data);
else if(data != NULL && isInGlobalMem)
else if (data != NULL && isInGlobalMem)
FreeData(this, mem);
else if(data != NULL)
else if (data != NULL)
mem->Release(data, GetDataSizeInChar(), signature);
data = NULL;
if(dataHost != NULL)
delete[] (char*)dataHost;
if (dataHost != NULL)
delete[](char*)dataHost;
dataHost = NULL;
}
/*
/*
shallow copy of the tensor
Note that we do not copy data array here
>> tensor - the source tensor
*/
void XTensor::ShallowCopy(const XTensor &tensor)
void XTensor::ShallowCopy(const XTensor& tensor)
{
strcpy(name, tensor.name);
order = tensor.order;
......@@ -318,7 +318,7 @@ void XTensor::ShallowCopy(const XTensor &tensor)
unitNum = tensor.unitNum;
isSparse = tensor.isSparse;
unitNumNonZero = tensor.unitNumNonZero;
denseRatio = tensor.denseRatio;
denseRatio = tensor.denseRatio;
isShared = tensor.isShared;
isDefaultDType = tensor.isDefaultDType;
isInGlobalMem = tensor.isInGlobalMem;
......@@ -330,17 +330,17 @@ XTensor& XTensor::operator= (const XTensor& tensor)
{
/* we must make a hard copy of the tensor if it is the input
of another node. */
if(outgo.tailNum > 0){
if (outgo.tailNum > 0) {
int dims[MAX_TENSOR_DIM_NUM];
memcpy(dims, dimSize, order * sizeof(int));
dims[0] = -dims[0];
XTensor * newTensor = new XTensor(order, dims, dataType, denseRatio, devID, mem);
XTensor* newTensor = new XTensor(order, dims, dataType, denseRatio, devID, mem);
newTensor->SetTMPFlag();
newTensor->data = data;
newTensor->dataHost = dataHost;
newTensor->signature = tensor.signature;
XLink::Replace(this, newTensor);
XLink::ClearOutgoing(this);
XLink::ClearIncoming(this);
......@@ -350,35 +350,35 @@ XTensor& XTensor::operator= (const XTensor& tensor)
dataHost = NULL;
}
if(false && !tensor.isTmp){
if (false && !tensor.isTmp) {
/* NOTE: this might lead to additional data copy by Mac LLVM compilers */
/* we make an identity transformation here */
if(outgo.tailNum > 0)
if (outgo.tailNum > 0)
XLink::ClearOutgoing(this);
XLink::ClearIncoming(this);
if(!_IsSameShaped(this, &tensor))
if (!_IsSameShaped(this, &tensor))
Resize(tensor.order, tensor.dimSize, tensor.dataType, tensor.denseRatio);
_Identity(&tensor, this);
XLink::MakeLink(&tensor, NULL, this, FUNC_IDENTITY);
}
else{
else {
/* hard copy of the data array */
int size = unitNum * unitSize;
if( isInit && !isSparse && !tensor.isSparse &&
if (isInit && !isSparse && !tensor.isSparse &&
size == tensor.unitNum * tensor.unitSize &&
((devID < 0 && tensor.devID < 0) && devID == tensor.devID) &&
((devID < 0 && tensor.devID < 0) && devID == tensor.devID) &&
data != NULL)
{
XMemCopy(data, devID, tensor.data, tensor.devID, size);
if(dataHost != NULL && tensor.dataHost != NULL)
if (dataHost != NULL && tensor.dataHost != NULL)
XMemCopy(dataHost, -1, tensor.dataHost, tensor.devID, size);
}
else{
else {
DestroyData();
if(!isInit){
if (!isInit) {
devID = tensor.devID;
mem = tensor.mem;
}
......@@ -391,7 +391,7 @@ XTensor& XTensor::operator= (const XTensor& tensor)
ShallowCopy(tensor);
isInit = true;
isTmp = false;
isTmp = false;
CheckNTErrors(outgo.tailNum == 0, "The node has outgoing edge to other nodes!");
......@@ -407,17 +407,17 @@ XTensor& XTensor::operator= (const XTensor&& tensor)
{
/* we must make a hard copy of the tensor if it is the input
of another node. */
if(outgo.tailNum > 0){
if (outgo.tailNum > 0) {
int dims[MAX_TENSOR_DIM_NUM];
memcpy(dims, dimSize, order * sizeof(int));
dims[0] = -dims[0];
XTensor * newTensor = new XTensor(order, dims, dataType, denseRatio, devID, mem);
XTensor* newTensor = new XTensor(order, dims, dataType, denseRatio, devID, mem);
newTensor->SetTMPFlag();
newTensor->data = data;
newTensor->dataHost = dataHost;
newTensor->signature = tensor.signature;
XLink::Replace(this, newTensor);
XLink::ClearOutgoing(this);
XLink::ClearIncoming(this);
......@@ -426,17 +426,17 @@ XTensor& XTensor::operator= (const XTensor&& tensor)
data = NULL;
dataHost = NULL;
}
DestroyData();
ShallowCopy(tensor);
isInit = true;
devID = tensor.devID;
mem = tensor.mem;
mem = tensor.mem;
data = tensor.data;
signature = tensor.signature;
/* what we really want to do is "reference.data = NULL;"
As "reference" is constant, we cannot reset "reference.data"
here. So we save the ADDRESS of "reference.data" in
......@@ -456,7 +456,7 @@ XTensor XTensor::operator+ (const XTensor& tensor) const
}
/* overloading of the plus-sign */
XTensor XTensor::operator+ (const DTYPE shift) const
XTensor XTensor::operator+ (const DTYPE shift) const
{
return ScaleAndShift(*this, 1, shift);
}
......@@ -500,10 +500,10 @@ XTensor XTensor::operator/ (const XTensor& tensor) const
/* overloading of the division-sign */
XTensor XTensor::operator/ (const DTYPE scale) const
{
return ScaleAndShift(*this, (DTYPE)1/scale, 0);
return ScaleAndShift(*this, (DTYPE)1 / scale, 0);
}
/*
/*
linear transformation b = a * \scale + \shift
>> scale - the slope
>> shift - the intercept
......@@ -513,12 +513,12 @@ XTensor XTensor::Lin(DTYPE scale, DTYPE shift) const
return Linear(*this, scale, shift);
}
/*
relocate the data on the target device
/*
relocate the data on the target device
>> myDevId - target device id
>> myMem - memory pool on the target device
*/
void XTensor::SetDevice(int myDevId, XMem * myMem)
void XTensor::SetDevice(int myDevId, XMem* myMem)
{
if (myMem == NULL) {
myMem = GMems.GetMem(myDevId);
......@@ -527,9 +527,9 @@ void XTensor::SetDevice(int myDevId, XMem * myMem)
isInGlobalMem = false;
}
bool XTensor::IsReduceShaped(const XTensor * a, const XTensor * b, int dim)
bool XTensor::IsReduceShaped(const XTensor* a, const XTensor* b, int dim)
{
if(a == NULL || b == NULL)
if (a == NULL || b == NULL)
return false;
if ((a->order - 1) != b->order)
......@@ -541,61 +541,61 @@ bool XTensor::IsReduceShaped(const XTensor * a, const XTensor * b, int dim)
return false;
}
else if (i >= dim) {
if (a->dimSize[i+1] != b->dimSize[i])
if (a->dimSize[i + 1] != b->dimSize[i])
return false;
}
}
if(a->dataType != b->dataType)
if (a->dataType != b->dataType)
return false;
if(a->denseRatio != b->denseRatio)
if (a->denseRatio != b->denseRatio)
return false;
if(a->isSparse != b->isSparse)
if (a->isSparse != b->isSparse)
return false;
return true;
}
/*
set the size of each dimension
/*
set the size of each dimension
>> myDimSize - size of each dimension
*/
void XTensor::SetDim(int * myDimSize)
void XTensor::SetDim(int* myDimSize)
{
for (int i = 0; i < order; i++) {
dimSize[i] = myDimSize[i];
}
}
/*
get the size of a given dimension
>> dim - the given dim we are looking at; a negative value indexes
   from the end (e.g., -1 is the last dimension)
<< return - size of the dimension
*/
int XTensor::GetDim(const int dim) const
{
    CheckNTErrors(dim < order, "dimenision is out of range!");
    CheckNTErrors(dim >= -order, "dimenision is out of range!");

    int d = dim;
    if (dim < 0)
        d = order + dim;

    return dimSize[d];
}
/*
reshape the tensor
/*
reshape the tensor
>> myOrder - order of the tensor
>> myDimSize - size of each dimension
*/
void XTensor::Reshape(const int myOrder, const int * myDimSize)
void XTensor::Reshape(const int myOrder, const int* myDimSize)
{
int dims[MAX_TENSOR_DIM_NUM];
int num = 1;
for(int i = 0; i < myOrder; i++){
for (int i = 0; i < myOrder; i++) {
num *= myDimSize[i];
dims[i] = abs(myDimSize[i]);
}
......@@ -606,7 +606,7 @@ void XTensor::Reshape(const int myOrder, const int * myDimSize)
memcpy(dimSize, dims, sizeof(int) * order);
}
/*
/*
reshape the tensor into a vector
>> num - number of elements
*/
......@@ -616,14 +616,14 @@ void XTensor::Reshape(const int num)
Reshape(1, &dim);
}
/*
reshape the tensor into a matrix
>> rowNum - number of rows
>> colNum - number of columns
*/
void XTensor::Reshape(const int rowNum, const int colNum)
{
    int dims[2] = { rowNum, colNum };
    Reshape(2, dims);
}
......@@ -663,7 +663,7 @@ XTensor XTensor::TypeAs(const XTensor input)
/* get the number of items in the data array */
int XTensor::GetSize() const
{
if(isSparse)
if (isSparse)
return unitNumNonZero;
else
return unitNum;
......@@ -672,39 +672,39 @@ int XTensor::GetSize() const
/* get the size of the memory space used (in bytes) */
int XTensor::GetDataSizeInChar() const
{
    if (isSparse) {
        /* sparse layout: an int header followed by (index, value) tuples;
           the tuple count is estimated from the density ratio */
        int num = int(unitNum * denseRatio + 1);
        int tupleSize = sizeof(int) + sizeof(DTYPE);
        int size = sizeof(int) + tupleSize * (num);
        return size;
    }
    else {
        return unitNum * unitSize;
    }
}
/*
get unit size in terms of "dataType"
>> myDataType - type of unit
<< return - unit size (in bytes)
*/
int XTensor::GetUnitSize(TENSOR_DATA_TYPE myDataType) const
{
    if (myDataType == X_INT)
        return sizeof(int);
    else if (myDataType == X_FLOAT)
        return sizeof(float);
    else if (myDataType == X_DOUBLE)
        return sizeof(double);
    else if (myDataType == X_INT8)
        return 1;
    else if (myDataType == X_FLOAT16)
        return 2;

    /* default unit size for unrecognized types */
    return sizeof(float);
}
/*
get offset (2D)
/*
get offset (2D)
>> row - index of demension 0
>> col - index of demension 1
*/
......@@ -717,8 +717,8 @@ MTYPE XTensor::GetOffset2D(int row, int col) const
return row * dimSize[1] + col;
}
/*
get offset (3D)
/*
get offset (3D)
>> d0 - index of demension 0
>> d1 - index of demension 1
>> d2 - index of demension 2
......@@ -733,49 +733,49 @@ MTYPE XTensor::GetOffset3D(int d0, int d1, int d2) const
return (d0 * dimSize[1] + d1) * dimSize[2] + d2;
}
/*
a vector with all entries of 0
/*
a vector with all entries of 0
>> stream - stream for the job pipeline
*/
void XTensor::SetZeroAll(XStream * stream)
void XTensor::SetZeroAll(XStream* stream)
{
if(data == NULL)
if (data == NULL)
return;
if(isSparse){
if(devID >= 0){
if (isSparse) {
if (devID >= 0) {
#ifdef USE_CUDA
int size = sizeof(int) + (sizeof(int)+sizeof(DTYPE)) * unitNumNonZero;
int size = sizeof(int) + (sizeof(int) + sizeof(DTYPE)) * unitNumNonZero;
int devIDBackup = 0;
cudaGetDevice(&devIDBackup);
cudaSetDevice(devID);
if(stream == NULL)
if (stream == NULL)
cudaMemset(data, 0, size);
else
cudaMemsetAsync(data, 0, size, stream->stream);
cudaSetDevice(devIDBackup);
#endif
}
else
*(int*)data = 0;
unitNumNonZero = 0;
unitNumNonZero = 0;
}
else{
if(devID >= 0){
else {
if (devID >= 0) {
#ifdef USE_CUDA
int devIDBackup = 0;
cudaGetDevice(&devIDBackup);
cudaSetDevice(devID);
if(stream == NULL)
if (stream == NULL)
cudaMemset(data, 0, unitNum * unitSize);
else
cudaMemsetAsync(data, 0, unitNum * unitSize, stream->stream);
cudaSetDevice(devIDBackup);
#endif
}
......@@ -784,14 +784,14 @@ void XTensor::SetZeroAll(XStream * stream)
}
}
/* set the tensor with an data array
/* set the tensor with an data array
>> d - input data. it must be on CPU
>> num - number of data items
>> beg - where we start the data copy in the data array of the tensor
*/
void XTensor::SetData(const void * d, int num, int beg)
void XTensor::SetData(const void* d, int num, int beg)
{
if (data == NULL || d ==NULL)
if (data == NULL || d == NULL)
return;
CheckNTErrors(!isSparse, "TODO");
......@@ -816,7 +816,7 @@ void XTensor::Range(DTYPE lower, DTYPE upper, DTYPE step)
_SetDataRange(this, lower, upper, step);
}
/*
/*
set the tensor items by a uniform distribution in range [lower, upper]
>> lower - lower value of the range
>> upper - upper value of the range
......@@ -830,7 +830,7 @@ void XTensor::SetDataRand(DTYPE lower, DTYPE upper)
// srand((unsigned)time(0));
DTYPE variance = upper - lower;
void * d = NULL;
void* d = NULL;
if (dataType == X_FLOAT) {
d = new float[unitNum];
for (int i = 0; i < unitNum; i++) {
......@@ -849,12 +849,12 @@ void XTensor::SetDataRand(DTYPE lower, DTYPE upper)
}
SetData(d, unitNum);
if (dataType == X_FLOAT) {
delete[] (float*)d;
delete[](float*)d;
}
else {
delete[] (double*)d;
delete[](double*)d;
}
}
......@@ -868,12 +868,12 @@ double GaussRand(DTYPE mean, DTYPE standardDeviation)
double z;
double pi = 3.141592654;
if (phase == 0){
if (phase == 0) {
u = (rand() + 1.0) / (RAND_MAX + 1.0);
v = (rand() + 1.0) / (RAND_MAX + 1.0);
z = sqrt(-2.0 * log(u))* sin(2.0 * pi * v);
z = sqrt(-2.0 * log(u)) * sin(2.0 * pi * v);
}
else{
else {
z = sqrt(-2.0 * log(u)) * cos(2.0 * pi * v);
}
......@@ -881,7 +881,7 @@ double GaussRand(DTYPE mean, DTYPE standardDeviation)
return mean + (z * standardDeviation);
}
/*
/*
set the tensor items by a normal distribution
>> mean - mean or expectation of the distribution
>> standardDeviation - standard deviation of the distribution
......@@ -894,7 +894,7 @@ void XTensor::SetDataRandn(DTYPE mean, DTYPE standardDeviation)
return;
// srand((unsigned)time(0));
void * d = NULL;
void* d = NULL;
if (dataType == X_FLOAT) {
d = new float[unitNum];
for (int i = 0; i < unitNum; i++) {
......@@ -914,31 +914,31 @@ void XTensor::SetDataRandn(DTYPE mean, DTYPE standardDeviation)
SetData(d, unitNum);
if (dataType == X_FLOAT) {
delete[] (float*)d;
delete[](float*)d;
}
else {
delete[] (double*)d;
delete[](double*)d;
}
}
/*
set tensor items with an array of offsets
/*
set tensor items with an array of offsets
>> offsets - offset for each data item
>> value - value for the data items
>> num - number of the data items
*/
void XTensor::SetDataBatched(MTYPE * offsets, DTYPE value, int num)
void XTensor::SetDataBatched(MTYPE* offsets, DTYPE value, int num)
{
_SetDataWithOffset(this, offsets, value, num);
}
/*
set tensor items with an array of values
/*
set tensor items with an array of values
>> offsets - offset for each data item
>> values - value for each data item
>> num - number of the data items
*/
void XTensor::SetDataBatchedWithValues(MTYPE * offsets, void * values, int num)
void XTensor::SetDataBatchedWithValues(MTYPE* offsets, void* values, int num)
{
_SetDataWithOffsetAndValue(this, offsets, values, num);
}
......@@ -949,8 +949,8 @@ void XTensor::SetDataPointer()
dataP = &data;
}
/*
get the value of a cell with the index
/*
get the value of a cell with the index
>> index - index of each dimension
>> size - size of the index
<< return - cell value
......@@ -961,7 +961,7 @@ DTYPE XTensor::Get(int index[], int size) const
return ToCPU(devID, GetCell(index, size));
}
/*
get the value of a cell with its offset
>> offset - offset in the array
......@@ -973,37 +973,37 @@ DTYPE XTensor::Get(int offset) const
CheckNTErrors(offset >= 0 && offset < unitNum, "Invalid index!");
CheckNTErrors(data != NULL, "Cannot use an uninitialized tensor!");
CheckNTErrors(denseRatio == 1.0F, "Only dense tensors are supported in Get(offset).");
DTYPE * address = (DTYPE*)data + offset;
DTYPE* address = (DTYPE*)data + offset;
return ToCPU(devID, address);
}
/*
/*
get the pointer to a cell
>> index - index of each dimension
>> size - size of index
<< return - pointer to the cell
*/
void * XTensor::GetCell(int index[], int size) const
void* XTensor::GetCell(int index[], int size) const
{
CheckNTErrors((size == order), "Illegal index!");
int offset = index[0];
for(int i = 1; i < size; ++i){
for (int i = 1; i < size; ++i) {
CheckNTErrors((index[i] < dimSize[i]), "Index is out of range!");
offset = offset * dimSize[i] + index[i];
}
if(isSparse){
if (isSparse) {
DTYPE value;
void * p;
if(BinarySearch(offset, value, p))
void* p;
if (BinarySearch(offset, value, p))
return (char*)p + sizeof(int);
else
return NULL;
}
else{
else {
return ((char*)data) + offset * unitSize;
}
}
......@@ -1017,8 +1017,8 @@ DTYPE XTensor::Get0D() const
CheckNTErrors((order == 0), "Cannot get a 0d cell for a tensor whose order is not 0!");
CheckNTErrors((dataType == DEFAULT_DTYPE), "The tensor is not in default type.");
int dims[1] = {0};
void * value = GetCell(dims, 0);
int dims[1] = { 0 };
void* value = GetCell(dims, 0);
return ToCPU(devID, value);
}
......@@ -1033,14 +1033,14 @@ DTYPE XTensor::Get1D(int i) const
CheckNTErrors((order == 1), "Cannot get a 1d cell for a tensor whose order is not 1!");
CheckNTErrors((i >= 0 && i < dimSize[0]), "dimension 0 is out of range!");
CheckNTErrors((dataType == DEFAULT_DTYPE), "The tensor is not in default type.");
int dims[1] = {i};
void * value = GetCell(dims, 1);
int dims[1] = { i };
void* value = GetCell(dims, 1);
return ToCPU(devID, value);
}
/*
/*
get the value of a cell in a 2d tensor in default type
>> ni - row index
>> mi - column index
......@@ -1053,14 +1053,14 @@ DTYPE XTensor::Get2D(int ni, int mi) const
CheckNTErrors((mi >= 0 && mi < dimSize[1]), "dimension 1 is out of range!");
CheckNTErrors((dataType == DEFAULT_DTYPE), "The tensor is not in default type.");
int dims[2] = {ni, mi};
void * value = GetCell(dims, 2);
int dims[2] = { ni, mi };
void* value = GetCell(dims, 2);
return ToCPU(devID, value);
}
/*
get the value of a cell in a 3d tensor
/*
get the value of a cell in a 3d tensor
>> d0 - index of dimension 0
>> d1 - index of dimension 1
>> d2 - index of dimension 2
......@@ -1073,12 +1073,12 @@ DTYPE XTensor::Get3D(int d0, int d1, int d2) const
CheckNTErrors((d2 >= 0 && d2 < dimSize[2]), "dimension 2 is out of range!");
CheckNTErrors((dataType == DEFAULT_DTYPE), "The tensor is not in default type.");
int dims[3] = {d0, d1, d2};
void * value = GetCell(dims, 3);
int dims[3] = { d0, d1, d2 };
void* value = GetCell(dims, 3);
return ToCPU(devID, value);
}
/*
get the int value of a cell by its offset
>> offset - offset of the item
......@@ -1089,9 +1089,9 @@ int XTensor::GetInt(int offset) const
CheckNTErrors(offset >= 0 && offset < unitNum, "Invalid index!");
CheckNTErrors(data != NULL, "Cannot use an uninitialized tensor!");
CheckNTErrors(denseRatio == 1.0F, "Only dense tensors are supported in Get(offset).");
int * address = (int*)data + offset;
int* address = (int*)data + offset;
return ToCPUInt(devID, address);
}
......@@ -1104,8 +1104,8 @@ int XTensor::Get0DInt() const
CheckNTErrors(order == 0, "Cannot get a 0d cell for a tensor whose order is not 0!");
CheckNTErrors(dataType == X_INT, "The tensor is not in int type.");
int dims[1] = {0};
void * value = GetCell(dims, 0);
int dims[1] = { 0 };
void* value = GetCell(dims, 0);
return ToCPUInt(devID, value);
}
......@@ -1120,33 +1120,33 @@ int XTensor::Get1DInt(int i) const
CheckNTErrors(order == 1, "Cannot get a 1d cell for a tensor whose order is not 1!");
CheckNTErrors(i >= 0 && i < dimSize[0], "dimension 0 is out of range!");
CheckNTErrors(dataType == X_INT, "The tensor is not in int type.");
int dims[1] = {i};
void * value = GetCell(dims, 1);
int dims[1] = { i };
void* value = GetCell(dims, 1);
return ToCPUInt(devID, value);
}
/*
/*
get the value of a cell in a 2d tensor in int type
>> ni - row index
>> mi - column index
<< return - value of cell(ni, mi) in int
*/
int XTensor::Get2DInt(int ni, int mi) const
int XTensor::Get2DInt(int ni, int mi) const
{
CheckNTErrors(order == 2, "Cannot get a 2d cell for a tensor whose order is not 2!");
CheckNTErrors(ni >= 0 && ni < dimSize[0], "dimension 0 is out of range!");
CheckNTErrors(mi >= 0 && mi < dimSize[1], "dimension 1 is out of range!");
CheckNTErrors(dataType == X_INT, "The tensor is not in default type.");
int dims[2] = {ni, mi};
void * value = GetCell(dims, 2);
int dims[2] = { ni, mi };
void* value = GetCell(dims, 2);
return ToCPUInt(devID, value);
}
/*
/*
get the value of a cell in a 3d tensor in int type
>> d0 - index of dimension 0
>> d1 - index of dimension 1
......@@ -1161,14 +1161,14 @@ int XTensor::Get3DInt(int d0, int d1, int d2) const
CheckNTErrors(d2 >= 0 && d2 < dimSize[2], "dimension 2 is out of range!");
CheckNTErrors(dataType == X_INT, "The tensor is not in default type.");
int dims[3] = {d0, d1, d2};
void * value = GetCell(dims, 3);
int dims[3] = { d0, d1, d2 };
void* value = GetCell(dims, 3);
return ToCPUInt(devID, value);
}
/*
get the value of a cell in the sparse tensor
/*
get the value of a cell in the sparse tensor
>> i - i-th tuple in the tuple list of the sparse tensor
<< return - value of the tuple
*/
......@@ -1177,14 +1177,14 @@ DTYPE XTensor::GetInSparse(int i) const
CheckNTErrors(i >= 0 && i < unitNum, "Index is out of range!");
CheckNTErrors(dataType == DEFAULT_DTYPE, "The tensor is not in default type.");
char * d = (char*)data + sizeof(int);
DTYPE * value = (DTYPE*)(d + (sizeof(int) + sizeof(DTYPE)) * i + sizeof(int));
char* d = (char*)data + sizeof(int);
DTYPE* value = (DTYPE*)(d + (sizeof(int) + sizeof(DTYPE)) * i + sizeof(int));
return ToCPU(devID, value);
}
/*
get the key value of a tuple in a sparse tensor
/*
get the key value of a tuple in a sparse tensor
>> i - i-th tuple in the tuple list of the sparse tensor
<< return - key of the tuple
*/
......@@ -1193,14 +1193,14 @@ int XTensor::GetKeyInSparse(int i) const
CheckNTErrors(i >= 0 && i < unitNum, "Index is out of range!");
CheckNTErrors(dataType == DEFAULT_DTYPE, "The tensor is not in default type.");
char * d = (char*)data + sizeof(int);
int * key = (int*)(d + (sizeof(int) + sizeof(DTYPE)) * i);
char* d = (char*)data + sizeof(int);
int* key = (int*)(d + (sizeof(int) + sizeof(DTYPE)) * i);
return ToCPUInt(devID, key);
}
/*
set the value of a cell
/*
set the value of a cell
>> value - value we tend to set
>> index - index of the cell for each dimension
>> size - size of the index
......@@ -1222,7 +1222,7 @@ bool XTensor::Set(DTYPE value, int offset)
CheckNTErrors(offset >= 0 && offset < unitNum, "Invalid index!");
CheckNTErrors(data != NULL, "Cannot use an uninitialized tensor!");
DTYPE * d = (DTYPE*)data + offset;
DTYPE* d = (DTYPE*)data + offset;
return SetToDevice(devID, d, value);
}
......@@ -1237,13 +1237,13 @@ bool XTensor::Set0D(DTYPE value)
CheckNTErrors(order == 0, "Cannot get a 0d cell for a tensor whose order is not 0!");
CheckNTErrors(dataType == DEFAULT_DTYPE, "The tensor is not in default type.");
int dims[1] = {0};
int dims[1] = { 0 };
return SetToDevice(devID, GetCell(dims, 0), value);
}
/*
set the value of a cell in a 1d tensor
/*
set the value of a cell in a 1d tensor
>> value - value we tend to set
>> i - item offset
<< return - succeeded or not
......@@ -1254,12 +1254,12 @@ bool XTensor::Set1D(DTYPE value, int i)
CheckNTErrors(i >= 0 && i < dimSize[0], "dimension 0 is out of range!");
CheckNTErrors(dataType == DEFAULT_DTYPE, "The tensor is not in default type.");
int dims[1] = {i};
int dims[1] = { i };
return SetToDevice(devID, GetCell(dims, 1), value);
}
/*
/*
set the value of a cell in a 2d tensor in default type
>> value - value we tend to set
>> ni - row index
......@@ -1273,12 +1273,12 @@ bool XTensor::Set2D(DTYPE value, int ni, int mi)
CheckNTErrors(mi >= 0 && mi < dimSize[1], "dimension 1 is out of range!");
CheckNTErrors(dataType == DEFAULT_DTYPE, "The tensor is not in default type.");
int dims[2] = {ni, mi};
int dims[2] = { ni, mi };
return SetToDevice(devID, GetCell(dims, 2), value);
}
/*
/*
set the value of a cell in a 3d tensor in default type
>> value - value we tend to set
>> d0 - index of demension 0
......@@ -1294,11 +1294,11 @@ bool XTensor::Set3D(DTYPE value, int d0, int d1, int d2)
CheckNTErrors(d2 >= 0 && d2 < dimSize[2], "dimension 2 is out of range!");
CheckNTErrors(dataType == DEFAULT_DTYPE, "The tensor is not in default type.");
int dims[3] = {d0, d1, d2};
int dims[3] = { d0, d1, d2 };
return SetToDevice(devID, GetCell(dims, 3), value);
}
/*
set the integer value of a cell by its offset
>> value - value we tend to set to the item
......@@ -1308,15 +1308,15 @@ bool XTensor::SetInt(int value, int offset)
{
CheckNTErrors(offset >= 0 && offset < unitNum, "Invalid index!");
CheckNTErrors(data != NULL, "Cannot use an uninitialized tensor!");
int * d = (int*)data + offset;
int* d = (int*)data + offset;
return SetToDeviceInt(devID, d, value);
}
/*
set the integer value of a cell
/*
set the integer value of a cell
>> value - value we tend to set
>> index - index of the cell for each dimension
>> size - size of the index
......@@ -1339,13 +1339,13 @@ bool XTensor::Set0DInt(int value)
CheckNTErrors(order == 0, "Cannot get a 0d cell for a tensor whose order is not 0!");
CheckNTErrors(dataType == X_INT, "The tensor is not in integer type.");
int dims[1] = {0};
int dims[1] = { 0 };
return SetToDeviceInt(devID, GetCell(dims, 0), value);
}
/*
set the integer value of a cell in a 1d tensor
/*
set the integer value of a cell in a 1d tensor
>> value - value we tend to set
>> i - item offset
<< return - succeeded or not
......@@ -1356,12 +1356,12 @@ bool XTensor::Set1DInt(int value, int i)
CheckNTErrors(i >= 0 && i < dimSize[0], "dimension 0 is out of range!");
CheckNTErrors(dataType == X_INT, "The tensor is not in integer type.");
int dims[1] = {i};
int dims[1] = { i };
return SetToDeviceInt(devID, GetCell(dims, 1), value);
}
/*
/*
set the integer value of a cell in a 2d tensor in default type
>> value - value we tend to set
>> ni - row index
......@@ -1375,12 +1375,12 @@ bool XTensor::Set2DInt(int value, int ni, int mi)
CheckNTErrors(mi >= 0 && mi < dimSize[1], "dimension 1 is out of range!");
CheckNTErrors(dataType == X_INT, "The tensor is not in integer type.");
int dims[2] = {ni, mi};
int dims[2] = { ni, mi };
return SetToDeviceInt(devID, GetCell(dims, 2), value);
}
/*
/*
set the integer value of a cell in a 3d tensor in default type
>> value - value we tend to set
>> d0 - index of demension 0
......@@ -1396,36 +1396,36 @@ bool XTensor::Set3DInt(int value, int d0, int d1, int d2)
CheckNTErrors(d2 >= 0 && d2 < dimSize[2], "dimension 2 is out of range!");
CheckNTErrors((dataType == X_INT), "The tensor is not in integer type.");
int dims[3] = {d0, d1, d2};
int dims[3] = { d0, d1, d2 };
return SetToDeviceInt(devID, GetCell(dims, 3), value);
}
/*
/*
increase the value of a cell in a 2d tensor
>> value - value we tend to set
>> ni - row index
>> mi - column index
<< return - succeeded or not
*/
bool XTensor::Add2D(DTYPE value, int ni, int mi)
bool XTensor::Add2D(DTYPE value, int ni, int mi)
{
CheckNTErrors(ni >= 0 && ni < dimSize[0], "the row index is out of range!");
CheckNTErrors(mi >= 0 && mi < dimSize[1], "the column index is out of range!");
CheckNTErrors(dataType == DEFAULT_DTYPE, "The tensor is not in default type.");
CheckNTErrors(isSparse == false, "TODO!");
if(devID < 0){
DTYPE * p = (DTYPE*)data + ni * dimSize[1] + mi;
if (devID < 0) {
DTYPE* p = (DTYPE*)data + ni * dimSize[1] + mi;
CheckNTErrors((p != NULL), "No data array is found!");
CheckNTErrors((p != NULL), "No data array is found!");
*p = *p + value;
return true;
}
else{
int dims[2] = {ni, mi};
else {
int dims[2] = { ni, mi };
return SetToDevice(devID, GetCell(dims, 2), Get2D(ni, mi) + value);
}
}
......@@ -1433,31 +1433,31 @@ increase the value of a cell in a 2d tensor
/* get the number of non-zero elements (in a sparse tensor) */
int XTensor::GetNonzeroSize() const
{
if(!isSparse){
if (!isSparse) {
XPRINT(1, stderr, "WARNING! Counting non-zero elements in a dense tensor might be slow!\n");
CheckNTErrors(devID < 0, "TODO");
if(dataType == DEFAULT_DTYPE){
if (dataType == DEFAULT_DTYPE) {
int count = 0;
for(int i = 0; i < unitNum; i++){
for (int i = 0; i < unitNum; i++) {
DTYPE value = *(DTYPE*)((char*)data + i * sizeof(DTYPE));
if(value == 0)
if (value == 0)
count++;
}
return count;
}
else{
else {
ShowNTErrors("TODO!");
return -1;
}
}
else{
else {
/* return the head of the tuple list */
return unitNumNonZero;
}
}
/*
set the tensor as "temporary"
/*
set the tensor as "temporary"
>> myIsTMP - the flag
*/
void XTensor::SetTMPFlag(bool myIsTmp)
......@@ -1465,8 +1465,8 @@ void XTensor::SetTMPFlag(bool myIsTmp)
isTmp = myIsTmp;
}
/*
set the tensor as "keep-gradient"
/*
set the tensor as "keep-gradient"
>> myIsGrad - the flag
*/
void XTensor::SetGradFlag(bool myIsGrad)
......@@ -1474,18 +1474,18 @@ void XTensor::SetGradFlag(bool myIsGrad)
isGrad = myIsGrad;
}
/*
set the tensor as "variable"
/*
set the tensor as "variable"
>> myIsVar - the flag
*/
void XTensor::SetVarFlag(bool myIsVar)
{
isVar = myIsVar;
if(isVar)
if (isVar)
SetGradFlag(true);
}
/*
/*
resize a tensor with a specified tensor size
>> myOrder - order of the tensor
>> myDimSize - the size of each dimension
......@@ -1493,11 +1493,11 @@ resize a tensor with a specified tensor size
>> myDenseRatio - how often an element has non-zero value
<< return - succeeded or not
*/
bool XTensor::Resize(const int myOrder, const int * myDimSize,
const TENSOR_DATA_TYPE myDataType, const float myDenseRatio)
bool XTensor::Resize(const int myOrder, const int* myDimSize,
const TENSOR_DATA_TYPE myDataType, const float myDenseRatio)
{
/* free old mem */
if(data != NULL){
if (data != NULL) {
if (mem == NULL)
XMemFree(devID, data);
else
......@@ -1505,7 +1505,7 @@ bool XTensor::Resize(const int myOrder, const int * myDimSize,
}
signature = mem != NULL ? mem->GetSignature() : 0;
order = myOrder;
unitNum = 1;
unitNumNonZero = 0;
......@@ -1513,11 +1513,11 @@ bool XTensor::Resize(const int myOrder, const int * myDimSize,
bool filledData = true;
bool zeroData = false;
for(int i = 0; i < order; i++){
for (int i = 0; i < order; i++) {
dimSize[i] = abs(myDimSize[i]);
if(myDimSize[i] < 0)
if (myDimSize[i] < 0)
filledData = false;
if(myDimSize[i] == 0)
if (myDimSize[i] == 0)
zeroData = true;
unitNum *= dimSize[i];
}
......@@ -1528,20 +1528,20 @@ bool XTensor::Resize(const int myOrder, const int * myDimSize,
dataType = myDataType;
unitSize = GetUnitSize(dataType);
if(myDataType != DEFAULT_DTYPE)
if (myDataType != DEFAULT_DTYPE)
isDefaultDType = false;
else
isDefaultDType = true;
if(zeroData){
if (zeroData) {
unitNum = 0;
return false;
}
if(isSparse){
if (isSparse) {
/*
for sparse matrices, we use a list of tuple (key, value),
ordered by key. Take a (2-dimensional) matrix as an example,
for sparse matrices, we use a list of tuple (key, value),
ordered by key. Take a (2-dimensional) matrix as an example,
we have key = m * i + j;
The data array is
---------
......@@ -1555,23 +1555,23 @@ bool XTensor::Resize(const int myOrder, const int * myDimSize,
(1, 0, 5)
where the first number (2) indicates the number of elements.
*/
int num = int(unitNum * denseRatio + 1);
int tupleSize = sizeof(int)+sizeof(DTYPE);
int size = sizeof(int) + tupleSize*(num);
if(filledData){
int * d = NULL;
int tupleSize = sizeof(int) + sizeof(DTYPE);
int size = sizeof(int) + tupleSize * (num);
if(mem == NULL){
if (filledData) {
int* d = NULL;
if (mem == NULL) {
d = new int[size];
memset(d, 0, size);
}
else{
else {
d = (int*)mem->Alloc(mem->devID, size);
}
if(d == NULL)
if (d == NULL)
return false;
#if !defined(UNSAFE_BUT_FAST_MEM)
......@@ -1581,11 +1581,11 @@ bool XTensor::Resize(const int myOrder, const int * myDimSize,
}
return true;
}
else{
if(filledData){
else {
if (filledData) {
/* allocate the new one */
if(mem == NULL){
data = XMemAlloc(devID, unitNum * unitSize);
if (mem == NULL) {
data = XMemAlloc(devID, unitNum * unitSize);
#if defined(UNSAFE_BUT_FAST_MEM)
XMemSet(devID, data, 0, unitNum * unitSize);
#endif
......@@ -1593,28 +1593,28 @@ bool XTensor::Resize(const int myOrder, const int * myDimSize,
else
data = (void*)mem->Alloc(mem->devID, unitNum * unitSize);
if(data == NULL)
if (data == NULL)
return false;
}
#if !defined(UNSAFE_BUT_FAST_MEM)
if(data != NULL)
if (data != NULL)
XMem::SetZero(data, unitNum * unitSize, mem);
#endif
return true;
}
}
/*
/*
resize a tensor by another
>> myTensor - tensor for reference
*/
bool XTensor::Resize(const XTensor * myTensor)
bool XTensor::Resize(const XTensor* myTensor)
{
denseRatio = myTensor->denseRatio;
TENSOR_DATA_TYPE myDataType = myTensor->dataType;
if(myDataType != DEFAULT_DTYPE)
if (myDataType != DEFAULT_DTYPE)
isDefaultDType = false;
else
isDefaultDType = true;
......@@ -1622,7 +1622,7 @@ bool XTensor::Resize(const XTensor * myTensor)
return Resize(myTensor->order, myTensor->dimSize, myDataType, denseRatio);
}
/*
/*
binary search to find an element in a sparse tensor
>> key - for search
>> value - value for return
......@@ -1630,54 +1630,54 @@ binary search to find an element in a sparse tensor
it is the previous one if there is no hit
<< return - found it or not?
*/
bool XTensor::BinarySearch(int key, DTYPE &value, void * &position) const
bool XTensor::BinarySearch(int key, DTYPE& value, void*& position) const
{
CheckNTErrors((isSparse), "A sparse tensor is required!");
CheckNTErrors((dataType == DEFAULT_DTYPE), "The tensor is not in the default type.");
int * d = (int*)data;
int* d = (int*)data;
if(key < 0 || *d == 0){
if (key < 0 || *d == 0) {
value = 0;
position = NULL;
return false;
}
int low = 0;
int high = *d - 1;
int low = 0;
int high = *d - 1;
int last = -1;
bool ok = false;
int * k = NULL;
int* k = NULL;
int headSize = sizeof(int);
int tupleSize = sizeof(int)+sizeof(DTYPE);
char * p = (char*)data + headSize;
int tupleSize = sizeof(int) + sizeof(DTYPE);
char* p = (char*)data + headSize;
while (low <= high){
int mid = low + (high-low)/2;
while (low <= high) {
int mid = low + (high - low) / 2;
k = (int*)(p + tupleSize * mid);
if (*k == key){
if (*k == key) {
ok = true;
high = mid -1;
high = mid - 1;
break;
}
else if(*k > key){
high = mid -1;
}
else{
low = mid +1;
else if (*k > key) {
high = mid - 1;
}
else {
low = mid + 1;
last = mid;
}
}
}
if(ok){
DTYPE * p = (DTYPE*)((char*)k + sizeof(int));
if (ok) {
DTYPE* p = (DTYPE*)((char*)k + sizeof(int));
value = *p;
position = k;
return true;
}
else{
else {
value = 0;
if(last == -1)
if (last == -1)
position = NULL;
else
position = (char*)data + headSize + tupleSize * last;
......@@ -1685,20 +1685,20 @@ bool XTensor::BinarySearch(int key, DTYPE &value, void * &position) const
}
}
/*
dump data to a file
/*
dump data to a file
>> file - where to domp the data
>> label - label of the tensor
>> n - number of items to dump
>> beg - the first item id
>> verbose - verbose level
*/
void XTensor::Dump(FILE * file, const char * label, const int n, const int beg, const int verbose)
void XTensor::Dump(FILE* file, const char* label, const int n, const int beg, const int verbose)
{
if (verbose > verboseLevel)
return;
void * d = data;
void* d = data;
bool isNewData = false;
#ifdef USE_CUDA
......@@ -1716,7 +1716,7 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg,
num *= dimSize[i];
num = int(num * denseRatio + 1);
int tupleSize = sizeof(int) + sizeof(DTYPE);
int size = sizeof(int) + tupleSize*(num);
int size = sizeof(int) + tupleSize * (num);
d = new char[size];
memset(d, 0, size);
......@@ -1730,8 +1730,8 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg,
if (label != NULL)
fprintf(file, "%s ", label);
if(isInit){
if (isInit) {
fprintf(file, "order=%d dimsize=", order);
for (int i = 0; i < order; i++) {
fprintf(file, "%d", dimSize[i]);
......@@ -1739,21 +1739,21 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg,
fprintf(file, ",");
}
}
else{
else {
fprintf(file, "order=-1 dimsize=-1");
}
fprintf(file, " dtype=%s dense=%f\n", GetDataTypeName(dataType), denseRatio);
if(!isInit){
if (!isInit) {
fprintf(file, "NULL");
}
if (!isSparse) {
if (dataType == DEFAULT_DTYPE) {
int end = MIN(n > 0 ? beg + n : beg + unitNum, unitNum);
for(int i = beg; i < end; i++){
for (int i = beg; i < end; i++) {
DTYPE f = ((DTYPE*)d)[i];
if(i == beg)
if (i == beg)
fprintf(file, "%e", f);
else
fprintf(file, " %e", f);
......@@ -1762,9 +1762,9 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg,
}
else if (dataType == X_INT) {
int end = MIN(n > 0 ? beg + n : beg + unitNum, unitNum);
for(int i = beg; i < end; i++){
for (int i = beg; i < end; i++) {
int f = ((int*)d)[i];
if(i == beg)
if (i == beg)
fprintf(file, "%d", f);
else
fprintf(file, " %d", f);
......@@ -1795,7 +1795,7 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg,
}
}
/*
/*
dump data to a file
>> tensor - the tensor for dumping
>> file - where to domp the data
......@@ -1804,15 +1804,15 @@ dump data to a file
>> beg - the first item id
>> verbose - verbose level
*/
void XTensor::Dump(const XTensor * tensor, FILE * file, const char * label, const int n, const int beg, const int verbose)
void XTensor::Dump(const XTensor* tensor, FILE* file, const char* label, const int n, const int beg, const int verbose)
{
XTensor a(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, tensor->devID, tensor->mem);
_CopyValues(tensor, &a);
a.Dump(file, label, n, beg, verbose);
}
/*
dump data to a binary file
/*
dump data to a binary file
>> file - where to dump the data
*/
void XTensor::BinaryDump(FILE* file)
......@@ -1831,12 +1831,12 @@ void XTensor::BinaryDump(FILE* file)
}
}
/*
/*
read data from a file
>> file - where to load the data
>> label - label of the tensor
*/
void XTensor::Read(FILE * file, const char * label)
void XTensor::Read(FILE* file, const char* label)
{
char typeName[32] = "";
char dimSizeName[128] = "";
......@@ -1855,12 +1855,12 @@ void XTensor::Read(FILE * file, const char * label)
fgetc(file);
if (fscanf(file, "order=%d dimsize=%s dtype=%s dense=%f",
&dimNum, dimSizeName, typeName, &dRatio) < 4) {
&dimNum, dimSizeName, typeName, &dRatio) < 4) {
ShowNTErrors("Incorrect format when reading the tensor!");
}
char c;
do {
c = fgetc(file);
} while (c != '\n' && c != EOF);
......@@ -1869,7 +1869,7 @@ void XTensor::Read(FILE * file, const char * label)
int o = 0;
bool sameSize = true;
char * p = dimSizeName;
char* p = dimSizeName;
while (*p != 0) {
while (*p == ' ' || *p == '\t')
p++;
......@@ -1893,14 +1893,14 @@ void XTensor::Read(FILE * file, const char * label)
if (!sameSize || dRatio > denseRatio || GetDataType(typeName) != dataType)
Resize(dimNum, dims, GetDataType(typeName), dRatio);
void * dataBuf = XMemAlloc(-1, GetDataSizeInChar());
void * dataBackup = data;
void* dataBuf = XMemAlloc(-1, GetDataSizeInChar());
void* dataBackup = data;
data = dataBuf;
if (!isSparse) {
if (dataType == DEFAULT_DTYPE) {
for (int i = 0; i < unitNum; i++) {
DTYPE * f = ((DTYPE*)data) + i;
DTYPE* f = ((DTYPE*)data) + i;
if (fscanf(file, "%e", f) < 1) {
ShowNTErrors("Incorrect tensor format!");
}
......@@ -1943,23 +1943,23 @@ void XTensor::Read(FILE * file, const char * label)
delete[](char*)dataBuf;
}
/*
/*
read data from a binary file
>>> file - the file stream pointer
>>> offset - the distance from the start to this tensor
*/
void XTensor::BinaryRead(FILE* file, size_t offset)
{
fseek(file, offset, 0);
//fseek(file, offset, 0);
switch (dataType) {
case X_INT: {
int * d = new int[unitNum];
int* d = new int[unitNum];
fread(d, sizeof(int), unitNum, file);
SetData(d, unitNum);
delete[] d;
}
default: {
float * d = new float[unitNum];
float* d = new float[unitNum];
fread(d, sizeof(float), unitNum, file);
SetData(d, unitNum);
delete[] d;
......@@ -1971,7 +1971,7 @@ void XTensor::BinaryRead(FILE* file, size_t offset)
flush the data to the target device
>> targetMem - memory pool on the target device
*/
void XTensor::FlushToMem(XMem * targetMem)
void XTensor::FlushToMem(XMem* targetMem)
{
if (targetMem == NULL)
return;
......@@ -1984,7 +1984,7 @@ void XTensor::FlushToMem(XMem * targetMem)
CudaCPUToGPUFlush(&l, targetMem->devID, targetMem);
}
else if (mem != targetMem) {
void * tmpData = targetMem->Alloc(targetMem->devID, GetDataSizeInChar());
void* tmpData = targetMem->Alloc(targetMem->devID, GetDataSizeInChar());
XMemCopy(tmpData, targetMem->devID, data, devID, GetDataSizeInChar());
data = tmpData;
mem = targetMem;
......@@ -2008,29 +2008,29 @@ void XTensor::FlushToMem(XMem * targetMem)
}
/*
allocate the memory space of the tensor (in the global memory)
allocate the memory space of the tensor (in the global memory)
>> tensor - the tensor we intend to process
>> myMem - the memory pool we are using
>> useBuf - indicates whether we use the buffer in the memory pool
*/
void XTensor::AllocateData(XTensor * tensor, XMem * myMem, bool useBuf)
void XTensor::AllocateData(XTensor* tensor, XMem* myMem, bool useBuf)
{
if(tensor == NULL)
if (tensor == NULL)
return;
if(myMem == NULL){
if(tensor->data != NULL)
if (myMem == NULL) {
if (tensor->data != NULL)
FreeData(tensor, NULL, false);
tensor->data = XMemAlloc(tensor->devID, tensor->GetDataSizeInChar());
tensor->isInGlobalMem = true;
}
else{
else {
CheckNTErrors((tensor->data == NULL), "Cannot renew the space for the tensor");
if(useBuf){
if (useBuf) {
tensor->data = myMem->AllocBuf(tensor->devID, tensor->GetDataSizeInChar());
tensor->isInGlobalMem = false;
}
else{
else {
tensor->data = myMem->AllocGlobal(tensor->devID, tensor->GetDataSizeInChar());
tensor->isInGlobalMem = true;
}
......@@ -2039,22 +2039,22 @@ void XTensor::AllocateData(XTensor * tensor, XMem * myMem, bool useBuf)
tensor->signature = 0;
}
/*
free the memory space of the tensor (in the global memory)
/*
free the memory space of the tensor (in the global memory)
>> tensor - the tensor we intend to process
>> myMem - the memory pool we are using
>> useBuf - indicates whether we use the buffer in the memory pool
*/
void XTensor::FreeData(XTensor * tensor, XMem * myMem, bool useBuf)
void XTensor::FreeData(XTensor* tensor, XMem* myMem, bool useBuf)
{
if(tensor == NULL)
if (tensor == NULL)
return;
if(myMem == NULL){
if (myMem == NULL) {
XMemFree(tensor->devID, tensor->data);
}
else{
if(tensor->isInGlobalMem)
else {
if (tensor->isInGlobalMem)
myMem->ReleaseGlobal(tensor->devID, tensor->data);
else
myMem->ReleaseBuf(tensor->devID, tensor->GetDataSizeInChar());
......@@ -2065,27 +2065,27 @@ void XTensor::FreeData(XTensor * tensor, XMem * myMem, bool useBuf)
}
/* overloading of the plus-sign */
XTensor operator+ (const DTYPE shift, const XTensor &tensor)
XTensor operator+ (const DTYPE shift, const XTensor& tensor)
{
return ScaleAndShift(tensor, 1, shift);
}
/* overloading of the minus-sign */
XTensor operator- (const DTYPE shift, const XTensor &tensor)
XTensor operator- (const DTYPE shift, const XTensor& tensor)
{
return ScaleAndShift(tensor, 1, -shift);
}
/* overloading of the multiply-sign */
XTensor operator* (const DTYPE scale, const XTensor &tensor)
XTensor operator* (const DTYPE scale, const XTensor& tensor)
{
return ScaleAndShift(tensor, scale, 0);
}
/* overloading of the division-sign */
XTensor operator/ (const DTYPE scale, const XTensor &tensor)
XTensor operator/ (const DTYPE scale, const XTensor& tensor)
{
return ScaleAndShift(tensor, (DTYPE)1/scale, 0);
return ScaleAndShift(tensor, (DTYPE)1 / scale, 0);
}
} /* end of the nts (NiuTrans.Tensor) namespace */
......@@ -86,7 +86,7 @@ void _funcCPUName(const XTensor * input, XTensor * output, int dim)
vecBuf[j] = VectorBuffer::loadu((DTYPE*)(ip)+j * vecBufLength); \
} \
for (int j = 1; j < strideNum / 32; j++) { \
const DTYPE* ptr = (DTYPE*)(ip + j * vecBufLength); \
const DTYPE* ptr = (DTYPE*)(ip + j * 4 * vecBufLength); \
vecBuf[0] = vecBuf[0]._vectorOp(VectorBuffer::loadu(ptr + 0 * vecBufLength)); \
vecBuf[1] = vecBuf[1]._vectorOp(VectorBuffer::loadu(ptr + 1 * vecBufLength)); \
vecBuf[2] = vecBuf[2]._vectorOp(VectorBuffer::loadu(ptr + 2 * vecBufLength)); \
......@@ -106,7 +106,7 @@ void _funcCPUName(const XTensor * input, XTensor * output, int dim)
else { \
/* data is separated */ \
for(int i = 0; i < blockNum; i++){ \
for(int j = 0; j < input->dimSize[input->order - 1] / 32; j++){ \
for(int j = 0; j < stride / 32; j++){ \
DTYPE * ip = (DTYPE*)input->data + blockSize * i; \
DTYPE * op = (DTYPE*)output->data + stride * i; \
VectorBuffer vecBuf[4]; \
......
......@@ -42,7 +42,7 @@ void _ReduceMean(const XTensor * input, XTensor * output, int dim)
int num = input->dimSize[dim];
_ReduceSum(input, output, dim);
_ScaleAndShiftMe(output, (DTYPE)1/num, 0);
_ScaleAndShiftMe(output, 1.0F/(DTYPE)(num), 0);
}
/*
......
......@@ -105,7 +105,7 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
vecBuf[j] = VectorBuffer::loadu((DTYPE*)(ip) + j * vecBufLength, isExp, power, bias);
}
for(int j = 1; j < strideNum / 32; j++){
const DTYPE* ptr = (DTYPE*)(ip + j * vecBufLength);
const DTYPE* ptr = (DTYPE*)(ip + (j * 4) * vecBufLength);
vecBuf[0] = vecBuf[0] + VectorBuffer::loadu(ptr + 0 * vecBufLength, isExp, power, bias);
vecBuf[1] = vecBuf[1] + VectorBuffer::loadu(ptr + 1 * vecBufLength, isExp, power, bias);
vecBuf[2] = vecBuf[2] + VectorBuffer::loadu(ptr + 2 * vecBufLength, isExp, power, bias);
......@@ -122,7 +122,7 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
} else{
//data is separated
for(int i = 0; i < blockNum; i++){
for(int j = 0; j < input->dimSize[input->order - 1] / 32; j++){
for(int j = 0; j < stride / 32; j++){
DTYPE * ip = (DTYPE*)input->data + blockSize * i;
DTYPE * op = (DTYPE*)output->data + stride * i;
DTYPE * sp = shift != NULL ? (DTYPE*)shift->data + stride * i : NULL;
......@@ -133,8 +133,7 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
}
VectorBuffer vecBuf[4];
for(int k = 0; k < 4; k++){
vecBuf[k] = VectorBuffer::loadu((DTYPE*)(ip) + (j * 4 + k) * 32 / sizeof(DTYPE), isExp, power, bias + j * 32 / sizeof(DTYPE));
vecBuf[k] = VectorBuffer::loadu((DTYPE*)(ip) + (j * 4 + k) * 32 / sizeof(DTYPE), isExp, power, bias + k * 32 / sizeof(DTYPE));
}
for(int k = 1; k < strideNum; k++){
DTYPE * ptr = ip + k * stride + (j * 4) * vecBufLength;
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论