Commit 99097e41 by huchi

add support for greedy search

parent bfa6fc90
......@@ -19,6 +19,10 @@
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-10
*/
//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
//#include <crtdbg.h>
#include <stdio.h>
#include "./network/XNet.h"
#include "./tensor/XUtility.h"
......@@ -27,9 +31,7 @@
#include "./sample/fnnlm/FNNLM.h"
#include "./sample/transformer/Transformer.h"
//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
//#include <crtdbg.h>
using namespace nts;
using namespace fnnlm;
......@@ -37,19 +39,10 @@ using namespace transformer;
int main( int argc, const char ** argv )
{
//_CrtSetDbgFlag(_CrtSetDbgFlag(_CRTDBG_REPORT_FLAG) | _CRTDBG_LEAK_CHECK_DF);
//_CrtSetBreakAlloc(2708);
/*_CrtSetDbgFlag(_CrtSetDbgFlag(_CRTDBG_REPORT_FLAG) | _CRTDBG_LEAK_CHECK_DF);
_CrtSetBreakAlloc(2708);*/
TransformerMain(argc - 1, argv + 1);
/*XTensor x;
InitTensor2D(&x, 2, 2);
float d[]{ 1,2,3,4 };
x.SetData(d, 4);
XTensor y;
y = ReduceSum(x, 0);
y.Dump(stderr);*/
//_CrtDumpMemoryLeaks();
return 0;
......
......@@ -62,7 +62,7 @@ void T2TAttention::InitModel(int argc, char** argv,
float minmax = 0;
LoadParamInt(argc, argv, "nhead", &nhead, 8);
LoadParamInt(argc, argv, "nhead", &nhead, 4);
LoadParamInt(argc, argv, "d", &dk, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &dv, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
......@@ -70,15 +70,15 @@ void T2TAttention::InitModel(int argc, char** argv,
LoadParamFloat(argc, argv, "attminmax", &minmax, 0.1F);
LoadParamFloat(argc, argv, "dropoutatt", &dropoutP, 0);
InitTensor2D(&wq, d, d, X_FLOAT, devID);
InitTensor1D(&bq, d, X_FLOAT, devID);
InitTensor2D(&wk, d, d, X_FLOAT, devID);
InitTensor1D(&bk, d, X_FLOAT, devID);
InitTensor2D(&wv, d, d, X_FLOAT, devID);
InitTensor1D(&bv, d, X_FLOAT, devID);
InitTensor2D(&rp_embedding_k, max_relative_position * 2 + 1, d/nhead, X_FLOAT, devID);
InitTensor2D(&wa, d, d, X_FLOAT, devID);
InitTensor1D(&ba, d, X_FLOAT, devID);
InitTensor2DV2(&wq, d, d, X_FLOAT, devID);
InitTensor1DV2(&bq, d, X_FLOAT, devID);
InitTensor2DV2(&wk, d, d, X_FLOAT, devID);
InitTensor1DV2(&bk, d, X_FLOAT, devID);
InitTensor2DV2(&wv, d, d, X_FLOAT, devID);
InitTensor1DV2(&bv, d, X_FLOAT, devID);
InitTensor2DV2(&rp_embedding_k, max_relative_position * 2 + 1, d/nhead, X_FLOAT, devID);
InitTensor2DV2(&wo, d, d, X_FLOAT, devID);
InitTensor1DV2(&bo, d, X_FLOAT, devID);
}
/*
......@@ -94,24 +94,27 @@ make the network
>> cacheType - which type that cache is
<< return - multi-attention result
*/
XTensor T2TAttention::Make( XTensor& k, XTensor& q, XTensor& v, XTensor* mask, bool isTraining, Cache* cache, int cacheType)
XTensor T2TAttention::Make(XTensor& k, XTensor& q, XTensor& v, XTensor* mask, bool isTraining, Cache* cache, int cacheType)
{
const bool isEnc = (!cache) ? true : false;
/* linear transformation before self-attention */
XTensor q2, k2, v2;
q2 = MatrixMul(q, X_NOTRANS, wq, X_TRANS) + bq;
q2 = MatrixMul(q, wq) + bq;
if (!cache) {
/* self attention for encoder layers */
k2 = MatrixMul(k, X_NOTRANS, wk, X_TRANS) + bk;
v2 = MatrixMul(v, X_NOTRANS, wv, X_TRANS) + bv;
k2 = MatrixMul(k, wk) + bk;
v2 = MatrixMul(v, wv) + bv;
return MakeRPRAttention(k2, q2, v2, mask, isTraining, isEnc);
}
else {
if (cacheType == SELF_ATT) {
k2 = MatrixMul(k, X_NOTRANS, wk, X_TRANS) + bk;
v2 = MatrixMul(v, X_NOTRANS, wv, X_TRANS) + bv;
k2 = MatrixMul(k, wk) + bk;
v2 = MatrixMul(v, wv) + bv;
/* if hit, we only concat the cache with the new token */
if (!cache->miss) {
......@@ -121,12 +124,13 @@ XTensor T2TAttention::Make( XTensor& k, XTensor& q, XTensor& v, XTensor* mask,
cache->key = k2;
cache->value = v2;
cache->miss = false;
return MakeRPRAttention(cache->key, q2, cache->value, mask, isTraining, isEnc);
}
else if (cacheType == EN_DE_ATT) {
if (cache->miss) {
cache->key = MatrixMul(k, X_NOTRANS, wk, X_TRANS) + bk;
cache->value = MatrixMul(v, X_NOTRANS, wv, X_TRANS) + bv;
cache->key = MatrixMul(k, wk) + bk;
cache->value = MatrixMul(v, wv) + bv;
cache->miss = false;
}
return MakeAttention(cache->key, q2, cache->value, mask, isTraining, isEnc);
......@@ -145,7 +149,7 @@ make the attention network given keys, queries and values (after linear transfor
>> mask - as it is
>> isTraining - indicates whether the model is used for training
*/
XTensor T2TAttention::MakeAttention(XTensor &k, XTensor& q, XTensor& v, XTensor* mask, bool isTraining, bool is_encoder)
XTensor T2TAttention::MakeAttention(XTensor& k, XTensor& q, XTensor& v, XTensor* mask, bool isTraining, bool is_encoder)
{
XTensor kheads;
XTensor qheads;
......@@ -163,21 +167,20 @@ XTensor T2TAttention::MakeAttention(XTensor &k, XTensor& q, XTensor& v, XTensor*
/* scalar = softmax(Q * K^T / sqrt(dk)) * V */
dot = BMMul(qheads, X_NOTRANS, kheads, X_TRANS);
/*if (isMasked && mask) {
_SumMe(&dot, mask);
}*/
/*if (isMasked && mask)
_SumMe(&dot, mask);*/
dot = Linear(dot, 1.0F / (float)sqrt((float)dk / nhead));
scalar = Softmax(dot, -1);
/*if(isTraining && dropoutP > 0)
scalar = Dropout(scalar, dropoutP);*/
if(isTraining && dropoutP > 0)
scalar = Dropout(scalar, dropoutP);
att = BMMul(scalar, vheads);
/* concatenate the heads */
return MulAndShift(Merge(att, att.order - 1), X_NOTRANS, wa, X_TRANS, ba);
return MulAndShift(Merge(att, att.order - 1), wo, bo);
}
/*
......@@ -215,14 +218,12 @@ XTensor T2TAttention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v, XTens
InitTensor4DV2(&dot, nhead, batch_size, len_q, len_kv, X_FLOAT, q.devID);
/* generate the relative emb index (L_q, L_kv) */
GetRPEmbedding(&emb_matrix, len_q, len_kv, max_relative_position, q.devID,is_encoder);
GetRPEmbedding(&emb_matrix, len_q, len_kv, max_relative_position, q.devID, is_encoder);
/* generate the relative key from the rp_embedding_k (L_q, L_kv, H/K) */
_Gather(&rp_embedding_k, &relative_key, &emb_matrix);
/* RPR dot product (K, B, L_q, L_kv) */
qheads = qheads / float(nhead);
RPDotProduct(&qheads, &kheads, &relative_key, &dot, true);
......@@ -230,19 +231,19 @@ XTensor T2TAttention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v, XTens
_SumMe(&dot, mask);*/
/* scale the dot result */
//dot = Linear(dot, 1.0F / (float)sqrt((float)dk / nhead));
dot = Linear(dot, 1.0F / (float)sqrt((float)dk / nhead));
/* softmax */
scalar = Softmax(dot, -1);
/*if (isTraining && dropoutP > 0)
scalar = Dropout(scalar, dropoutP);*/
if (isTraining && dropoutP > 0)
scalar = Dropout(scalar, dropoutP);
/* generate the relative attention output (K, B, L_q, H/K) */
att = BMMul(scalar, vheads);
/* concatenate the heads */
return MulAndShift(Merge(att, att.order - 1), X_NOTRANS, wa, X_TRANS, ba);
return MulAndShift(Merge(att, att.order - 1), wo, bo);
}
void T2TAttention::GetRPEmbedding(XTensor* emb_matrix, const int len_q, const int len_kv, const int max_relative_length, const int devID, const bool is_encoder)
......@@ -251,10 +252,11 @@ void T2TAttention::GetRPEmbedding(XTensor* emb_matrix, const int len_q, const in
XTensor range;
InitTensor1DV2(&range, len_kv, X_INT, devID);
int* index = new int[len_kv];
// for encoder self-attention, where L_q = L_kv
if (is_encoder)
{
for (int i = 0; i <len_kv; i++)
for (int i = 0; i < len_kv; i++)
index[i] = i;
range.SetData(index, len_kv);
XTensor range_2D, range_2D_t;
......@@ -267,7 +269,7 @@ void T2TAttention::GetRPEmbedding(XTensor* emb_matrix, const int len_q, const in
// for decoder self-attention, where L_q != L_kv and L_q is 1
else
{
for (int i = 0; i <len_kv; i++)
for (int i = 0; i < len_kv; i++)
index[i] = -len_kv + i + 1;
range.SetData(index, len_kv);
_Unsqueeze(&range, emb_matrix, 0, len_q);
......@@ -299,7 +301,6 @@ void T2TAttention::RPDotProduct(XTensor* x, XTensor* y, XTensor* z, XTensor* att
XTensor context;
InitTensor4DV2(&context, head_num, batch_size, len_q, last_dim, X_FLOAT, x->devID);
_MatrixMulBatched(x, X_NOTRANS, y, transpose_flag, &context);
//if (profiler_) profiler_->FinishTimer("RPDotPro-BMM");
// reshape and transpose x to (L_q, K*B, H/K or L_kv)
int merge_dims[] = { head_num * batch_size, len_q, x->dimSize[3] };
......@@ -323,5 +324,6 @@ void T2TAttention::RPDotProduct(XTensor* x, XTensor* y, XTensor* z, XTensor* att
relative_t.Reshape(4, split_dims);
_Sum(&context, &relative_t, attention);
}
}
......@@ -90,14 +90,18 @@ public:
/* bias for V */
XTensor bv;
XTensor wBig;
XTensor bBig;
/* RPR emb */
XTensor rp_embedding_k;
/* transformation after dot-product attention */
XTensor wa;
XTensor wo;
/* bias after dot-product attention */
XTensor ba;
XTensor bo;
/* size of transformed Q and K */
int dk;
......
......@@ -31,27 +31,27 @@ namespace transformer
/* constructor */
AttDecoder::AttDecoder()
{
attentions = NULL;
selfAtt = NULL;
fnns = NULL;
attLayerNorms = NULL;
attentionsEnde = NULL;
attEndeLayerNorms = NULL;
decodeLayerNorm = NULL;
selfCache = NULL;
contextCache = NULL;
selfAttLayerNorms = NULL;
enDeAtt = NULL;
enDeAttLayerNorms = NULL;
decoderLayerNorm = NULL;
selfAttCache = NULL;
enDeAttCache = NULL;
}
/* destructor */
AttDecoder::~AttDecoder()
{
delete[] selfCache;
delete[] contextCache;
delete[] attentions;
delete[] selfAttCache;
delete[] enDeAttCache;
delete[] selfAtt;
delete[] fnns;
delete[] attLayerNorms;
delete[] attentionsEnde;
delete[] attEndeLayerNorms;
delete decodeLayerNorm;
delete[] selfAttLayerNorms;
delete[] enDeAtt;
delete[] enDeAttLayerNorms;
delete decoderLayerNorm;
}
/*
......@@ -71,7 +71,7 @@ void AttDecoder::InitModel(int argc, char ** argv,
devID = myDevID;
ignored = myIgnored;
LoadParamInt(argc, argv, "nlayer", &nlayer, 3);
LoadParamInt(argc, argv, "nlayer", &nlayer, 4);
LoadParamInt(argc, argv, "hsize", &hSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "esize", &eSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "vsizetgt", &vSize, 34040);
......@@ -83,24 +83,24 @@ void AttDecoder::InitModel(int argc, char ** argv,
/* embedding model */
embedder.InitModel(argc, argv, devID, false);
attentions = new T2TAttention[nlayer];
selfAtt = new T2TAttention[nlayer];
fnns = new T2TFNN[nlayer];
attLayerNorms = new T2TLN[nlayer];
attentionsEnde = new T2TAttention[nlayer];
attEndeLayerNorms = new T2TLN[nlayer];
decodeLayerNorm = new T2TLN;
selfCache = new Cache[nlayer];
contextCache = new Cache[nlayer];
selfAttLayerNorms = new T2TLN[nlayer];
enDeAtt = new T2TAttention[nlayer];
enDeAttLayerNorms = new T2TLN[nlayer];
decoderLayerNorm = new T2TLN;
selfAttCache = new Cache[nlayer];
enDeAttCache = new Cache[nlayer];
/* initialize the stacked layers */
for (int i = 0; i < nlayer; i++) {
attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID);
selfAtt[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID);
fnns[i].InitModel(argc, argv, myDevID);
attLayerNorms[i].InitModel(argc, argv, myDevID);
attentionsEnde[i].InitModel(argc, argv, true, myIgnored, myDevID);
attEndeLayerNorms[i].InitModel(argc, argv, myDevID);
selfAttLayerNorms[i].InitModel(argc, argv, myDevID);
enDeAtt[i].InitModel(argc, argv, true, myIgnored, myDevID);
enDeAttLayerNorms[i].InitModel(argc, argv, myDevID);
}
decodeLayerNorm->InitModel(argc, argv, myDevID);
decoderLayerNorm->InitModel(argc, argv, myDevID);
}
/*
......@@ -131,48 +131,38 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor *mask, X
XTensor attNorm;
/* layer normalization */
inputNorm = attLayerNorms[i].Make(x);
//inputNorm.Dump(stderr, "inputNorm", 10);
inputNorm = selfAttLayerNorms[i].Make(x);
/******************/
/* self attention */
att = attentions[i].Make(inputNorm, inputNorm, inputNorm, NULL, isTraining, &selfCache[i], SELF_ATT);
att = selfAtt[i].Make(inputNorm, inputNorm, inputNorm, NULL, isTraining, &selfAttCache[i], SELF_ATT);
/* dropout */
if(isTraining && dropoutP > 0)
att = Dropout(att, dropoutP);
/* residual connection */
_SumMe(&att, &x);
//att.Dump(stderr, "Sum(att, x)", 10);
att = att + x;
/* layer normalization */
attNorm = attEndeLayerNorms[i].Make(att);
//attNorm.Dump(stderr, "attNorm", 10);
attNorm = enDeAttLayerNorms[i].Make(att);
/* encoder-decoder attention */
ende = attentionsEnde[i].Make(outputEnc, attNorm, outputEnc, &maskEncDec, isTraining, &contextCache[i], EN_DE_ATT);
//ende.Dump(stderr, "ende atten", 10);
ende = enDeAtt[i].Make(outputEnc, attNorm, outputEnc, &maskEncDec, isTraining, &enDeAttCache[i], EN_DE_ATT);
/* dropout */
if(isTraining && dropoutP > 0)
ende = Dropout(ende, dropoutP);
/* residual connection */
_SumMe(&ende, &att);
//res.Dump(stderr, "Sum(ende, att)", 10);
ende = ende + att;
/* fnn */
x = fnns[i].Make(ende, isTraining);
//x.Dump(stderr, "fnns[i]", 10);
}
x = decodeLayerNorm->Make(x);
//x.Dump(stderr, "decodeLayerNorm", 10);
x.SetName(DECODING_NAME);
x = decoderLayerNorm->Make(x);
return x;
}
......
......@@ -63,13 +63,13 @@ public:
T2TFNN * fnns;
/* attention model of each layer */
T2TAttention * attentions;
T2TAttention * selfAtt;
/* layer normalization for attention */
T2TLN * attLayerNorms;
T2TLN * selfAttLayerNorms;
/* layer normalization for decoder */
T2TLN * decodeLayerNorm;
T2TLN * decoderLayerNorm;
/* input tensor of the encoder */
XTensor * input;
......@@ -78,16 +78,16 @@ public:
XTensor * output;
/* encoder-decoder attention model of each layer */
T2TAttention * attentionsEnde;
T2TAttention * enDeAtt;
/* layer normalization for encoder-decoder attention */
T2TLN * attEndeLayerNorms;
T2TLN * enDeAttLayerNorms;
/* layer cache list */
Cache* selfCache;
Cache* selfAttCache;
/* layer cache list */
Cache* contextCache;
Cache* enDeAttCache;
public:
/* constructor */
......
......@@ -62,7 +62,7 @@ void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, bool isEnc)
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "pad", &padIdx, 1);
InitTensor2D(&w, vSize, eSize, X_FLOAT, devID);
InitTensor2DV2(&w, vSize, eSize, X_FLOAT, devID);
maxLength = maxLength + 1 + 1;
DTYPE v = 1.0F/(float)sqrt((float)eSize);
......@@ -80,7 +80,7 @@ make positional embeddings (of size eSize * length)
*/
void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length, int padIdx)
{
InitTensor2D(&posEmbeddingBase, length, eSize, X_FLOAT, devID);
InitTensor2DV2(&posEmbeddingBase, length, eSize, X_FLOAT, devID);
float * data = new float[posEmbeddingBase.unitNum];
......@@ -113,47 +113,47 @@ make the network
*/
XTensor T2TEmbedder::Make(XTensor &input, int prevLen)
{
/* assert padding index is 1 */
///* assert padding index is 1 */
CheckNTErrors(input.order > 1, "Wrong input tensor size!");
CheckNTErrors(input.dimSize[input.order - 1] < maxLength, "The sequence is too long!");
CheckNTErrors(vSize > 0, "set vocabulary size by \"-vsize\"");
CheckNTErrors(eSize > 0, "set embedding size by \"-esize\"");
//CheckNTErrors(input.order > 1, "Wrong input tensor size!");
//CheckNTErrors(input.dimSize[input.order - 1] < maxLength, "The sequence is too long!");
//CheckNTErrors(vSize > 0, "set vocabulary size by \"-vsize\"");
//CheckNTErrors(eSize > 0, "set embedding size by \"-esize\"");
//
//XTensor wordEmbedding, position, posEmbedding;
//InitTensor(&position, &input);
XTensor wordEmbedding, position, posEmbedding;
InitTensor(&position, &input);
//int* posData = new int[input.unitNum];
int* posData = new int[input.unitNum];
//XTensor inputCPU;
//InitTensorOnCPU(&inputCPU, &input);
//_CopyValues(&input, &inputCPU);
XTensor inputCPU;
InitTensorOnCPU(&inputCPU, &input);
_CopyValues(&input, &inputCPU);
//for (int i = 0; i < inputCPU.GetDim(0); i++) {
// int startNoPad = 2 + prevLen - 1;
// int* p = ((int*)inputCPU.data) + i * inputCPU.GetDim(1);
// for (int j = 0; j < inputCPU.GetDim(1); j++) {
// if (p[j] == 1) {
// posData[i * inputCPU.GetDim(1) + j] = 1;
// }
// else {
// posData[i * inputCPU.GetDim(1) + j] = startNoPad++;
// }
// }
//}
for (int i = 0; i < inputCPU.GetDim(0); i++) {
int startNoPad = 2 + prevLen - 1;
int* p = ((int*)inputCPU.data) + i * inputCPU.GetDim(1);
for (int j = 0; j < inputCPU.GetDim(1); j++) {
if (p[j] == 1) {
posData[i * inputCPU.GetDim(1) + j] = 1;
}
else {
posData[i * inputCPU.GetDim(1) + j] = startNoPad++;
}
}
}
//position.SetData(posData, position.unitNum);
//delete[] posData;
position.SetData(posData, position.unitNum);
delete[] posData;
/* we make positional embeddings first */
if(true){
posEmbedding = Gather(posEmbeddingBase, position);
}
///* we make positional embeddings first */
//if(true){
// posEmbedding = Gather(posEmbeddingBase, position);
//}
/* then we make word embeddings */
XTensor wordEmbedding;
wordEmbedding = Gather(w, input);
wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));
......
......@@ -29,7 +29,7 @@ using namespace nts;
namespace transformer
{
#define DEFAULT_EMBEDDING_SIZE 512
#define DEFAULT_EMBEDDING_SIZE 128
/*
embedding (of word at position i):
......
......@@ -34,7 +34,7 @@ AttEncoder::AttEncoder()
attentions = NULL;
fnns = NULL;
attLayerNorms = NULL;
encodeLayerNorm = NULL;
encoderLayerNorm = NULL;
}
/* destructor */
......@@ -43,7 +43,7 @@ AttEncoder::~AttEncoder()
delete[] attentions;
delete[] fnns;
delete[] attLayerNorms;
delete encodeLayerNorm;
delete encoderLayerNorm;
}
/*
......@@ -61,7 +61,7 @@ void AttEncoder::InitModel(int argc, char ** argv,
devID = myDevID;
ignored = myIgnored;
LoadParamInt(argc, argv, "nlayer", &nlayer, 35);
LoadParamInt(argc, argv, "nlayer", &nlayer, 20);
LoadParamInt(argc, argv, "hsize", &hSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "esize", &eSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "vsize", &vSize, 34040);
......@@ -76,7 +76,7 @@ void AttEncoder::InitModel(int argc, char ** argv,
attentions = new T2TAttention[nlayer];
fnns = new T2TFNN[nlayer];
attLayerNorms = new T2TLN[nlayer];
encodeLayerNorm = new T2TLN;
encoderLayerNorm = new T2TLN;
/* initialize the stacked layers */
for(int i = 0; i < nlayer; i++){
......@@ -84,7 +84,7 @@ void AttEncoder::InitModel(int argc, char ** argv,
fnns[i].InitModel(argc, argv, myDevID);
attLayerNorms[i].InitModel(argc, argv, myDevID);
}
encodeLayerNorm->InitModel(argc, argv, myDevID);
encoderLayerNorm->InitModel(argc, argv, myDevID);
}
/*
......@@ -123,13 +123,9 @@ XTensor AttEncoder::Make(XTensor &input, XTensor *mask, XTensor &maskEncDec, boo
/* fnn */
x = fnns[i].Make(res, isTraining);
}
x = encodeLayerNorm->Make(x);
x.SetName(ENCODING_NAME);
input.SetName(ENCODING_INPUT_NAME);
x = encoderLayerNorm->Make(x);
return x;
}
......
......@@ -93,11 +93,11 @@ public:
/* attention model of each layer */
T2TAttention * attentions;
/* layer normalization for attention */
/* layer normalizations for attention */
T2TLN * attLayerNorms;
/* layer normalization for encoder */
T2TLN * encodeLayerNorm;
T2TLN * encoderLayerNorm;
/* input tensor of the encoder */
XTensor * input;
......
......@@ -15,7 +15,7 @@
* limitations under the License.
*/
/*
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
......@@ -48,7 +48,7 @@ initialize the model
>> argv - list of pointers to the arguments
>> myDevID - device id
*/
void T2TFNN::InitModel(int argc, char ** argv, int myDevID)
void T2TFNN::InitModel(int argc, char** argv, int myDevID)
{
devID = myDevID;
......@@ -56,14 +56,14 @@ void T2TFNN::InitModel(int argc, char ** argv, int myDevID)
LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &outSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "fnnh", &hSize, outSize * 4);
LoadParamInt(argc, argv, "fnnh", &hSize, outSize * 8);
LoadParamFloat(argc, argv, "fnnminmax", &minmax, 0.1F);
LoadParamFloat(argc, argv, "dropoutfnn", &dropoutP, 0);
InitTensor2DV2(&w1, hSize, inSize, X_FLOAT, devID);
InitTensor2DV2(&w1, inSize, hSize, X_FLOAT, devID);
InitTensor1DV2(&b1, hSize, X_FLOAT, devID);
InitTensor2DV2(&w2, outSize, hSize, X_FLOAT, devID);
InitTensor2DV2(&w2, hSize, outSize, X_FLOAT, devID);
InitTensor1DV2(&b2, outSize, X_FLOAT, devID);
fnnLayerNorm.InitModel(argc, argv, myDevID);
......@@ -84,19 +84,19 @@ y = max(0, x * w1 + b1) * w2 + b2
>> input - the input tensor
>> return - the output tensor
*/
XTensor T2TFNN::Make(XTensor &input, bool isTraining)
XTensor T2TFNN::Make(XTensor& input, bool isTraining)
{
XTensor t1;
/* t1 = max(0, x * w1 + b1) */
t1 = Rectify(MulAndShift(fnnLayerNorm.Make(input), X_NOTRANS, w1, X_TRANS, b1));
t1 = Rectify(MulAndShift(fnnLayerNorm.Make(input), w1, b1));
if(isTraining && dropoutP > 0)
if (isTraining && dropoutP > 0)
t1 = Dropout(t1, dropoutP);
/* result = t1 * w2 + b2 */
XTensor res;
res = MulAndShift(t1, X_NOTRANS, w2, X_TRANS, b2);
res = MulAndShift(t1, w2, b2);
_SumMe(&res, &input);
return res;
}
......
......@@ -53,8 +53,8 @@ void T2TLN::InitModel(int argc, char ** argv, int myDevID)
d = 0;
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
InitTensor1D(&w, d, X_FLOAT, devID);
InitTensor1D(&b, d, X_FLOAT, devID);
InitTensor1DV2(&w, d, X_FLOAT, devID);
InitTensor1DV2(&b, d, X_FLOAT, devID);
}
/*
......@@ -78,7 +78,7 @@ XTensor T2TLN::Make(XTensor &input)
mean = ReduceMean(x, x.order - 1);
/* \sigma = (sum_i (x_i - \mu)^2)/m */
variance = ReduceVariance(x, x.order - 1, mean);
variance = ReduceVariance(x, x.order - 1, mean) + 1e-5F;
/* standard = sqrt(variance) */
standard = Power(variance, 0.5F);
......
......@@ -15,7 +15,7 @@
* limitations under the License.
*/
/*
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
......@@ -24,6 +24,7 @@
#include "T2TUtility.h"
#include "../../tensor/core/CHeader.h"
#include "../../tensor/XUtility.h"
#include <cstdint>
namespace transformer
{
......@@ -54,17 +55,17 @@ initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
*/
void T2TModel::InitModel(int argc, char ** argv)
void T2TModel::InitModel(int argc, char** argv)
{
LoadParamInt(argc, argv, "dev", &devID, -1);
LoadParamBool(argc, argv, "mt", &isMT, false);
LoadParamBool(argc, argv, "lm", &isLM, !isMT);
LoadParamInt(argc, argv, "nhead", &nhead, 8);
LoadParamInt(argc, argv, "nhead", &nhead, 4);
encoder->InitModel(argc, argv, true, 0, devID);
outputLayer->InitModel(argc, argv, devID);
if(isMT)
if (isMT)
decoder->InitModel(argc, argv, true, 0, devID);
TensorList params(10);
......@@ -83,7 +84,7 @@ make the encoding network
>> isTraining - indicates whether we are training the model
<< return - encoding result
*/
XTensor T2TModel::MakeEncoder(XTensor &input, XTensor *mask, bool isTraining)
XTensor T2TModel::MakeEncoder(XTensor& input, XTensor* mask, bool isTraining)
{
XTensor nothing;
......@@ -100,7 +101,7 @@ make the decoding network
>> isTraining - indicates whether we are training the model
<< return - encoding result
*/
XTensor T2TModel::MakeDecoder(XTensor &inputDec, XTensor &outputEnc, XTensor *mask, XTensor &maskEncDec, bool isTraining)
XTensor T2TModel::MakeDecoder(XTensor& inputDec, XTensor& outputEnc, XTensor* mask, XTensor& maskEncDec, bool isTraining)
{
return decoder->Make(inputDec, outputEnc, mask, maskEncDec, isTraining);
}
......@@ -112,7 +113,7 @@ make the network for language modeling (with the output softmax layer)
>> padding - padding of the sequences
>> isTraining - indicates whether the model is for training
*/
void T2TModel::MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool isTraining)
void T2TModel::MakeLM(XTensor& input, XTensor& output, XTensor& padding, bool isTraining)
{
XTensor encoding;
......@@ -126,13 +127,13 @@ void T2TModel::MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool is
//XTensor mask(input.order + 1, dims, X_FLOAT, 1.0F, input.devID, input.mem);
int len = input.GetDim(input.order - 1);
int * dims = new int[input.order + 2];
for(int i = 0; i < input.order; i++)
int* dims = new int[input.order + 2];
for (int i = 0; i < input.order; i++)
dims[i + 1] = input.GetDim(i);
dims[0] = nhead;
dims[input.order + 1] = len;
XTensor mask;
InitTensor(&mask, input.order + 2, dims, X_FLOAT, padding.devID);
InitTensorV2(&mask, input.order + 2, dims, X_FLOAT, 1.0F, padding.devID);
/* an upper triangular matrix where the cells of the upper triangle are set to -1e-9.
this matrix can be used to prevent the attention to current or following words in
......@@ -140,16 +141,16 @@ void T2TModel::MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool is
_SetDataLowTri(&mask, 1e9F, 0);
_ScaleAndShiftMe(&mask, 1.0F, -1e9F);
int * dimsPadding = new int[padding.order + 2];
for(int i = 0; i < padding.order - 1; i++)
int* dimsPadding = new int[padding.order + 2];
for (int i = 0; i < padding.order - 1; i++)
dimsPadding[i] = padding.GetDim(i);
dimsPadding[padding.order - 1] = padding.GetDim(-1);
dimsPadding[padding.order] = padding.GetDim(-1);
XTensor * padding2 = NewTensorBuf(padding.order + 1, dimsPadding, padding.dataType,
XTensor* padding2 = NewTensorBuf(padding.order + 1, dimsPadding, padding.dataType,
padding.devID);
for(int i = 0; i < padding2->order; i++)
for (int i = 0; i < padding2->order; i++)
dimsPadding[i + 1] = padding2->GetDim(i);
dimsPadding[0] = nhead;
......@@ -183,7 +184,7 @@ make the network for machine translation (with the output softmax layer)
>> paddingDec - padding of the sequences (on the decoder side)
>> isTraining - indicates whether the model is for training
*/
void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTensor &paddingEnc, XTensor &paddingDec, bool isTraining)
void T2TModel::MakeMT(XTensor& inputEnc, XTensor& inputDec, XTensor& output, XTensor& paddingEnc, XTensor& paddingDec, bool isTraining)
{
XTensor encoding;
XTensor decoding;
......@@ -214,17 +215,17 @@ make the mask for training MT models
>> maskDec - mask of the decoder self-attention
>> maskEncDec - mask of the decoder enc-dec attention
*/
void T2TModel::MakeMTMask(XTensor &inputEnc, XTensor &inputDec,
XTensor &paddingEnc, XTensor &paddingDec,
XTensor &maskEnc, XTensor &maskDec, XTensor &maskEncDec)
void T2TModel::MakeMTMask(XTensor& inputEnc, XTensor& inputDec,
XTensor& paddingEnc, XTensor& paddingDec,
XTensor& maskEnc, XTensor& maskDec, XTensor& maskEncDec)
{
int len = inputDec.GetDim(inputDec.order - 1);
int * dims = new int[inputDec.order + 2];
for(int i = 0; i < inputDec.order; i++)
int* dims = new int[inputDec.order + 2];
for (int i = 0; i < inputDec.order; i++)
dims[i + 1] = inputDec.GetDim(i);
dims[0] = nhead;
dims[inputDec.order + 1] = len;
InitTensor(&maskDec, inputDec.order + 2, dims, X_FLOAT, paddingDec.devID);
InitTensorV2(&maskDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingDec.devID);
/* an upper triangular matrix where the cells of the upper triangular are set to -1e-9.
this matrix can be used to prevent the attention to current or following words in
......@@ -234,11 +235,10 @@ void T2TModel::MakeMTMask(XTensor &inputEnc, XTensor &inputDec,
/* encoder-decoder mask that prevents the attention to padding dummy words */
dims[inputDec.order + 1] = inputEnc.GetDim(inputEnc.order - 1);
InitTensor(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, paddingEnc.devID);
InitTensorV2(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingEnc.devID);
XTensor * maskEncDecTMPEnc = NewTensorBuf(paddingEnc.order + 1, dims + 1, paddingEnc.dataType,
paddingEnc.devID);
XTensor * maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID);
XTensor* maskEncDecTMPEnc = NewTensorBufV2(paddingEnc.order + 1, dims + 1, paddingEnc.dataType, paddingEnc.devID);
XTensor* maskEncDecTMPDec = NewTensorBufV2(paddingEnc.order + 1, dims + 1, paddingEnc.dataType, paddingEnc.devID);
_Unsqueeze(&paddingEnc, maskEncDecTMPEnc, paddingEnc.order - 1, paddingDec.GetDim(-1));
_ScaleAndShiftMe(maskEncDecTMPEnc, 1e9F, -1e9F);
......@@ -248,20 +248,20 @@ void T2TModel::MakeMTMask(XTensor &inputEnc, XTensor &inputDec,
DelTensorBuf(maskEncDecTMPEnc);
/* padding on the source side */
int * dimsPadding = new int[paddingEnc.order + 2];
int* dimsPadding = new int[paddingEnc.order + 2];
for (int i = 0; i < paddingEnc.order - 1; i++)
dimsPadding[i] = paddingEnc.GetDim(i);
dimsPadding[paddingEnc.order - 1] = paddingEnc.GetDim(-1);
dimsPadding[paddingEnc.order] = paddingEnc.GetDim(-1);
XTensor * padding2 = NewTensorBuf(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
XTensor* padding2 = NewTensorBufV2(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
paddingEnc.devID);
for (int i = 0; i < padding2->order; i++)
dimsPadding[i + 1] = padding2->GetDim(i);
dimsPadding[0] = nhead;
XTensor * padding3 = NewTensorBuf(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType,
XTensor* padding3 = NewTensorBufV2(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType,
paddingEnc.devID);
/* mask of the padding */
......@@ -270,7 +270,7 @@ void T2TModel::MakeMTMask(XTensor &inputEnc, XTensor &inputDec,
_ScaleAndShiftMe(padding3, 1e9F, -1e9F);
InitTensor(&maskEnc, padding3);
InitTensorV2(&maskEnc, padding3);
maskEnc.SetZeroAll();
/* generate the mask on the source language side (for padding) */
......@@ -289,24 +289,22 @@ make the mask of the encoder
>> paddingEnc - padding of the encoder input
>> maskEnc - mask of the encoder self-attention
*/
void T2TModel::MakeMTMaskEnc(XTensor &inputEnc, XTensor &paddingEnc, XTensor &maskEnc)
void T2TModel::MakeMTMaskEnc(XTensor& inputEnc, XTensor& paddingEnc, XTensor& maskEnc)
{
/* padding on the source side */
int * dimsPadding = new int[paddingEnc.order + 2];
int* dimsPadding = new int[paddingEnc.order + 2];
for (int i = 0; i < paddingEnc.order - 1; i++)
dimsPadding[i] = paddingEnc.GetDim(i);
dimsPadding[paddingEnc.order - 1] = paddingEnc.GetDim(-1);
dimsPadding[paddingEnc.order] = paddingEnc.GetDim(-1);
XTensor * padding2 = NewTensorBuf(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
paddingEnc.devID);
XTensor* padding2 = NewTensorBufV2(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType, paddingEnc.devID);
for (int i = 0; i < padding2->order; i++)
dimsPadding[i + 1] = padding2->GetDim(i);
dimsPadding[0] = nhead;
XTensor * padding3 = NewTensorBuf(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType,
paddingEnc.devID);
XTensor* padding3 = NewTensorBufV2(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType, paddingEnc.devID);
/* mask of the padding */
_Unsqueeze(&paddingEnc, padding2, paddingEnc.order - 1, paddingEnc.GetDim(-1));
......@@ -314,7 +312,7 @@ void T2TModel::MakeMTMaskEnc(XTensor &inputEnc, XTensor &paddingEnc, XTensor &ma
_ScaleAndShiftMe(padding3, 1e9F, -1e9F);
InitTensor(&maskEnc, padding3);
InitTensorV2(&maskEnc, padding3);
maskEnc.SetZeroAll();
/* generate the mask on the source language side (for padding) */
......@@ -334,13 +332,13 @@ make the mask of the decoder
>> maskDec - mask of the decoder self-attention
>> maskEncDec - mask of the decoder enc-dec attention
*/
void T2TModel::MakeMTMaskDec(XTensor &inputEnc, XTensor &inputDec,
XTensor &paddingEnc, XTensor &paddingDec,
XTensor &maskDec, XTensor &maskEncDec, int incDim)
void T2TModel::MakeMTMaskDec(XTensor& inputEnc, XTensor& inputDec,
XTensor& paddingEnc, XTensor& paddingDec,
XTensor& maskDec, XTensor& maskEncDec, int incDim)
{
int len = inputDec.GetDim(inputDec.order - 1);
int * dims = new int[inputDec.order + 2];
for(int i = 0; i < inputDec.order; i++)
int* dims = new int[inputDec.order + 2];
for (int i = 0; i < inputDec.order; i++)
dims[i + 1] = inputDec.GetDim(i);
//dims[inputDec.order] += incDim;
dims[0] = nhead;
......@@ -356,11 +354,10 @@ void T2TModel::MakeMTMaskDec(XTensor &inputEnc, XTensor &inputDec,
/* encoder-decoder mask that prevents the attention to padding dummy words */
dims[inputDec.order + 1] = inputEnc.GetDim(inputEnc.order - 1);
InitTensor(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, paddingEnc.devID);
InitTensorV2(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingEnc.devID);
XTensor * maskEncDecTMPEnc = NewTensorBuf(paddingEnc.order + 1, dims + 1, paddingEnc.dataType,
paddingEnc.devID);
XTensor * maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID);
XTensor* maskEncDecTMPEnc = NewTensorBufV2(paddingEnc.order + 1, dims + 1, paddingEnc.dataType, paddingEnc.devID);
XTensor* maskEncDecTMPDec = NewTensorBufV2(paddingEnc.order + 1, dims + 1, paddingEnc.dataType, paddingEnc.devID);
_Unsqueeze(&paddingEnc, maskEncDecTMPEnc, paddingEnc.order - 1, paddingDec.GetDim(-1));
......@@ -383,12 +380,12 @@ void T2TModel::MakeMTMaskDec(XTensor &inputEnc, XTensor &inputDec,
get parameter matrices
>> list - the list that keeps the parameter matrices
*/
void T2TModel::GetParams(TensorList &list)
void T2TModel::GetParams(TensorList& list)
{
list.Clear();
/* encoder parameters */
for(int i = 0; i < encoder->nlayer; i++){
for (int i = 0; i < encoder->nlayer; i++) {
list.Add(&encoder->attentions[i].wq);
list.Add(&encoder->attentions[i].wk);
list.Add(&encoder->attentions[i].wv);
......@@ -396,8 +393,8 @@ void T2TModel::GetParams(TensorList &list)
list.Add(&encoder->attentions[i].bk);
list.Add(&encoder->attentions[i].bv);
list.Add(&encoder->attentions[i].rp_embedding_k);
list.Add(&encoder->attentions[i].wa);
list.Add(&encoder->attentions[i].ba);
list.Add(&encoder->attentions[i].wo);
list.Add(&encoder->attentions[i].bo);
list.Add(&encoder->fnns[i].w1);
list.Add(&encoder->fnns[i].b1);
list.Add(&encoder->fnns[i].w2);
......@@ -407,33 +404,33 @@ void T2TModel::GetParams(TensorList &list)
list.Add(&encoder->fnns[i].fnnLayerNorm.w);
list.Add(&encoder->fnns[i].fnnLayerNorm.b);
}
list.Add(&encoder->encodeLayerNorm->w);
list.Add(&encoder->encodeLayerNorm->b);
list.Add(&encoder->encoderLayerNorm->w);
list.Add(&encoder->encoderLayerNorm->b);
/* decoder parameters */
if(isMT){
for(int i = 0; i < decoder->nlayer; i++){
list.Add(&decoder->attentions[i].wq);
list.Add(&decoder->attentions[i].wk);
list.Add(&decoder->attentions[i].wv);
list.Add(&decoder->attentions[i].bq);
list.Add(&decoder->attentions[i].bk);
list.Add(&decoder->attentions[i].bv);
list.Add(&decoder->attentions[i].rp_embedding_k);
list.Add(&decoder->attentions[i].wa);
list.Add(&decoder->attentions[i].ba);
list.Add(&decoder->attLayerNorms[i].w);
list.Add(&decoder->attLayerNorms[i].b);
list.Add(&decoder->attentionsEnde[i].wq);
list.Add(&decoder->attentionsEnde[i].wk);
list.Add(&decoder->attentionsEnde[i].wv);
list.Add(&decoder->attentionsEnde[i].bq);
list.Add(&decoder->attentionsEnde[i].bk);
list.Add(&decoder->attentionsEnde[i].bv);
list.Add(&decoder->attentionsEnde[i].wa);
list.Add(&decoder->attentionsEnde[i].ba);
list.Add(&decoder->attEndeLayerNorms[i].w);
list.Add(&decoder->attEndeLayerNorms[i].b);
if (isMT) {
for (int i = 0; i < decoder->nlayer; i++) {
list.Add(&decoder->selfAtt[i].wq);
list.Add(&decoder->selfAtt[i].wk);
list.Add(&decoder->selfAtt[i].wv);
list.Add(&decoder->selfAtt[i].bq);
list.Add(&decoder->selfAtt[i].bk);
list.Add(&decoder->selfAtt[i].bv);
list.Add(&decoder->selfAtt[i].rp_embedding_k);
list.Add(&decoder->selfAtt[i].wo);
list.Add(&decoder->selfAtt[i].bo);
list.Add(&decoder->selfAttLayerNorms[i].w);
list.Add(&decoder->selfAttLayerNorms[i].b);
list.Add(&decoder->enDeAtt[i].wq);
list.Add(&decoder->enDeAtt[i].wk);
list.Add(&decoder->enDeAtt[i].wv);
list.Add(&decoder->enDeAtt[i].bq);
list.Add(&decoder->enDeAtt[i].bk);
list.Add(&decoder->enDeAtt[i].bv);
list.Add(&decoder->enDeAtt[i].wo);
list.Add(&decoder->enDeAtt[i].bo);
list.Add(&decoder->enDeAttLayerNorms[i].w);
list.Add(&decoder->enDeAttLayerNorms[i].b);
list.Add(&decoder->fnns[i].w1);
list.Add(&decoder->fnns[i].b1);
list.Add(&decoder->fnns[i].w2);
......@@ -441,8 +438,8 @@ void T2TModel::GetParams(TensorList &list)
list.Add(&decoder->fnns[i].fnnLayerNorm.w);
list.Add(&decoder->fnns[i].fnnLayerNorm.b);
}
list.Add(&decoder->decodeLayerNorm->w);
list.Add(&decoder->decodeLayerNorm->b);
list.Add(&decoder->decoderLayerNorm->w);
list.Add(&decoder->decoderLayerNorm->b);
}
/* shared embeddings */
......@@ -456,19 +453,19 @@ dump the parameters
>> fn - where to keep the model
>> model - the model
*/
void T2TModel::Dump(const char * fn)
void T2TModel::Dump(const char* fn)
{
double startT = GetClockSec();
FILE * file = fopen(fn, "wb");
FILE* file = fopen(fn, "wb");
CheckNTErrors(file, "Cannot open the model file");
TensorList params(100);
GetParams(params);
for(int i = 0; i < params.count; i++){
XTensor * p = (XTensor*)params.Get(i);
for (int i = 0; i < params.count; i++) {
XTensor* p = (XTensor*)params.Get(i);
p->Dump(file, "param:");
}
......@@ -480,38 +477,37 @@ void T2TModel::Dump(const char * fn)
}
/* read the parameters */
void T2TModel::Read(const char * fn)
void T2TModel::Read(const char* fn)
{
double startT = GetClockSec();
FILE * file = fopen(fn, "rb");
FILE* file = fopen(fn, "rb");
CheckNTErrors(file, "Cannot open the model file");
TensorList params(100);
GetParams(params);
for(int i = 0; i < params.count; i++){
XTensor * p = (XTensor*)params.Get(i);
FastRead(p, file);
// p->Read(file, "");
}
//uint64_t* offsets = new uint64_t[params.Size()];
fclose(file);
double elapsed = GetClockSec() - startT;
///* number of parameter */
//uint64_t param_number;
//fread(&param_number, sizeof(param_number), 1, file);
//CheckNTErrors(param_number == params.Size(), "parameter number not matched");
XPRINT1(0, stderr, "[INFO] model loaded (took %.1fs)\n", elapsed);
}
///* parameter offsets */
//fread(offsets, sizeof(offsets[0]), params.Size(), file);
/*
read the raw data of a tensor from a binary file
The file is expected to hold exactly x->unitNum floats (no meta data)
at the current read position; the caller owns and closes the file.
>> x - the tensor to fill
>> f - an open binary file positioned at the tensor's data
*/
void FastRead(XTensor* x, FILE* f) {
    float* dataBuf = new float[x->unitNum];

    /* the model file stores each parameter as a flat float array */
    size_t readNum = fread(dataBuf, sizeof(float), x->unitNum, f);

    /* copy into the tensor (handles the CPU->device transfer if needed) */
    x->SetData(dataBuf, x->unitNum);

    delete[] dataBuf;

    /* report truncated/corrupted model files instead of silently
       loading garbage weights */
    CheckNTErrors(readNum == (size_t)x->unitNum,
                  "Incomplete tensor data in the model file");
}
}
\ No newline at end of file
......@@ -103,7 +103,7 @@ public:
/* read the parameters */
void Read(const char * fn);
};
void FastRead(XTensor* x, FILE* f);
}
#endif
......@@ -56,13 +56,11 @@ void T2TOutput::InitModel(int argc, char ** argv, int myDevID)
LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &hSize, DEFAULT_EMBEDDING_SIZE);
LoadParamFloat(argc, argv, "outputminmax", &minmax, 0.08F);
InitTensor2D(&w, hSize, vSize, X_FLOAT, devID);
InitTensor2DV2(&w, vSize, hSize, X_FLOAT, devID);
}
/*
make the network (redefined output tensor)
>> input - input tensor
......@@ -72,9 +70,7 @@ void T2TOutput::Make(XTensor &input, XTensor &output)
{
XTensor &x = input;
output = LogSoftmax(MMul(x, X_NOTRANS, w, X_NOTRANS), -1);
output.SetName(OUTPUT_NAME);
output = LogSoftmax(MMul(x, X_NOTRANS, w, X_TRANS), -1);
}
}
......@@ -15,7 +15,7 @@
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-13
*/
......@@ -38,7 +38,7 @@ T2TStateBundle::T2TStateBundle()
/* de-constructor */
T2TStateBundle::~T2TStateBundle()
{
if(states != NULL)
if (states != NULL)
delete[] states;
}
......@@ -50,12 +50,12 @@ void T2TStateBundle::MakeStates(int num)
{
CheckNTErrors(num > 0, "invalid number");
if(states != NULL)
if (states != NULL)
delete[] states;
states = new T2TState[num];
for(int i = 0; i < num; i++){
for (int i = 0; i < num; i++) {
states[i].prediction = -1;
states[i].pid = T2T_PID_EMPTY;
states[i].isEnd = false;
......@@ -74,7 +74,7 @@ void T2TStateBundle::MakeStates(int num)
/* constructor */
T2TPredictor::T2TPredictor()
{
startSymbol = -1;
startSymbol = 2;
}
/* de-constructor */
......@@ -90,29 +90,36 @@ create an initial state
>> beamSize - beam size
>> state - the state to be initialized
*/
/*
create an initial state for decoding
>> model - the transformer model (unused here, kept for interface symmetry)
>> top - the top-most layer output (unused here)
>> input - the input on the encoder side; its leading dimensions shape the state
>> beamSize - beam size
>> state - the state bundle to be initialized
*/
void T2TPredictor::Create(T2TModel* model, XTensor* top, const XTensor* input, int beamSize, T2TStateBundle* state)
{
    int dims[MAX_TENSOR_DIM_NUM];
    for (int i = 0; i < input->order - 1; i++)
        dims[i] = input->GetDim(i);
    dims[input->order - 1] = beamSize;

    InitTensorV2(&state->probPath, input->order, dims, X_FLOAT, 1.0F, input->devID);
    InitTensorV2(&state->nstep, input->order, dims, X_FLOAT, 1.0F, input->devID);
    InitTensorV2(&state->endMark, input->order, dims, X_INT, 1.0F, input->devID);

    /* only the first hypothesis of each beam starts alive: give it
       log-prob 0 and push the rest to -1e9 so they are pruned away
       in the first expansion step */
    SetDataFixed(state->probPath, -1e9F);
    for (int i = 0; i < state->probPath.unitNum; ++i) {
        if (i % beamSize == 0)
            state->probPath.Set(0.0F, i);
    }

    state->nstep.SetZeroAll();
    state->endMark.SetZeroAll();

    state->stateNum = 0;
}
......@@ -133,7 +140,7 @@ read a state
2) probablities of hypotheses
3) parts of the network for expanding toward the next state
*/
void T2TPredictor::Read(T2TModel * model, T2TStateBundle * state)
void T2TPredictor::Read(T2TModel* model, T2TStateBundle* state)
{
m = model;
s = state;
......@@ -147,8 +154,7 @@ predict the next state
>> paddingEnc - padding of the encoder
>>> isStart - is the start or not
*/
void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
XTensor * inputEnc, XTensor * paddingEnc, bool isStart)
void T2TPredictor::Predict(T2TStateBundle* next, XTensor* encoding, XTensor* inputEnc, XTensor* paddingEnc, bool isStart)
{
int dims[MAX_TENSOR_DIM_NUM];
......@@ -159,40 +165,41 @@ void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
XTensor first;
CheckNTErrors(inputEnc->order >= 2, "Wrong order of the tensor!");
for(int i = 0; i < inputEnc->order - 1; i++)
for (int i = 0; i < inputEnc->order - 1; i++)
dims[i] = inputEnc->GetDim(i);
dims[inputEnc->order - 1] = 1;
InitTensor(&first, inputEnc->order, dims, X_INT, inputEnc->devID);
InitTensorV2(&first, inputEnc->order, dims, X_INT, 1.0F, inputEnc->devID);
SetDataFixedInt(first, startSymbol);
/* add a new word into the input sequence of the decoder side */
if (isStart) {
inputDec = Identity(first);
}
else{
else {
/* only pass one step to the decoder */
inputDec = GetLastPrediction(s);
inputDec.SetDevice(inputEnc->devID);
}
/* prediction probabilities */
XTensor &output = next->prob;
XTensor& output = next->prob;
XTensor decoding;
for(int i = 0; i < inputDec.order - 1; i++)
for (int i = 0; i < inputDec.order - 1; i++)
dims[i] = inputDec.GetDim(i);
dims[inputDec.order - 1] = inputDec.GetDim(-1);
XTensor paddingDec;
InitTensor(&paddingDec, inputDec.order, dims, X_INT, paddingEnc->devID);
InitTensorV2(&paddingDec, inputDec.order, dims, X_INT, 1.0F, paddingEnc->devID);
SetDataFixedInt(paddingDec, 1);
XTensor maskDec;
XTensor maskEncDec;
/* decoder mask */
m->MakeMTMaskDec(*inputEnc, inputDec, *paddingEnc, paddingDec, maskDec, maskEncDec, 0);
//m->MakeMTMaskDec(*inputEnc, inputDec, *paddingEnc, paddingDec, maskDec, maskEncDec, 0);
/* make the decoding network */
decoding = m->decoder->Make(inputDec, *encoding, NULL, maskEncDec, false);
......@@ -207,34 +214,34 @@ void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
generate paths up to the states of the current step
>> state - state bundle of the current step
*/
XTensor T2TPredictor::GeneratePaths(T2TStateBundle * state)
XTensor T2TPredictor::GeneratePaths(T2TStateBundle* state)
{
CheckNTErrors(state->stateNum >= 0, "Illegal state!");
int distance = -1;
for(int i = 0; i < state->stateNum; i++){
T2TState * cur = state->states + i;
for (int i = 0; i < state->stateNum; i++) {
T2TState* cur = state->states + i;
int nsteps = 0;
while(cur != NULL){
while (cur != NULL) {
nsteps++;
cur = cur->last;
}
if(nsteps > distance)
if (nsteps > distance)
distance = nsteps;
}
XTensor path;
InitTensor2D(&path, state->stateNum, distance, X_INT);
InitTensor2DV2(&path, state->stateNum, distance, X_INT);
path.SetZeroAll();
for(int i = 0; i < state->stateNum; i++){
T2TState * cur = state->states + i;
for (int i = 0; i < state->stateNum; i++) {
T2TState* cur = state->states + i;
int nsteps = 0;
while(cur != NULL){
while (cur != NULL) {
nsteps++;
path.Set2DInt(cur->prediction, i, distance - nsteps);
cur = cur->last;
......@@ -253,7 +260,7 @@ XTensor T2TPredictor::GetLastPrediction(T2TStateBundle* state)
CheckNTErrors(state->stateNum >= 0, "Illegal state!");
XTensor lastPred;
InitTensor2D(&lastPred, state->stateNum, 1, X_INT);
InitTensor2DV2(&lastPred, state->stateNum, 1, X_INT);
for (int i = 0; i < state->stateNum; i++) {
T2TState* cur = state->states + i;
......
......@@ -15,7 +15,7 @@
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-13
* This is the first source file I create in 2019 - new start!
*/
......@@ -66,7 +66,7 @@ public:
int nstep;
/* pointer to the previous state */
T2TState * last;
T2TState* last;
};
/* a bundle of states */
......@@ -95,7 +95,7 @@ public:
XTensor nstep;
/* list of states */
T2TState * states;
T2TState* states;
/* number of states */
int stateNum;
......@@ -123,10 +123,10 @@ class T2TPredictor
{
private:
/* pointer to the transformer model */
T2TModel * m;
T2TModel* m;
/* current state */
T2TStateBundle * s;
T2TStateBundle* s;
/* start symbol */
int startSymbol;
......@@ -139,19 +139,19 @@ public:
~T2TPredictor();
/* create an initial state */
void Create(T2TModel * model, XTensor * top, const XTensor * input, int beamSize, T2TStateBundle * state);
void Create(T2TModel* model, XTensor* top, const XTensor* input, int beamSize, T2TStateBundle* state);
/* set the start symbol */
void SetStartSymbol(int symbol);
/* read a state */
void Read(T2TModel * model, T2TStateBundle * state);
void Read(T2TModel* model, T2TStateBundle* state);
/* predict the next state */
void Predict(T2TStateBundle * next, XTensor * encoding, XTensor * inputEnc, XTensor * paddingEnc, bool isStart);
void Predict(T2TStateBundle* next, XTensor* encoding, XTensor* inputEnc, XTensor* paddingEnc, bool isStart);
/* generate paths up to the states of the current step */
XTensor GeneratePaths(T2TStateBundle * state);
XTensor GeneratePaths(T2TStateBundle* state);
/* get the predictions of the previous step */
XTensor GetLastPrediction(T2TStateBundle* state);
......
......@@ -15,7 +15,7 @@
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
*/
......@@ -38,15 +38,15 @@ T2TSearch::T2TSearch()
endSymbolNum = 0;
fullHypos = NULL;
endSymbols = new int[32];
startSymbol = -1;
startSymbol = 2;
}
/* de-constructor */
T2TSearch::~T2TSearch()
{
if(fullHypos != NULL)
if (fullHypos != NULL)
delete[] fullHypos;
if(endSymbols != NULL)
if (endSymbols != NULL)
delete[] endSymbols;
}
......@@ -55,7 +55,7 @@ initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
*/
void T2TSearch::Init(int argc, char ** argv)
void T2TSearch::Init(int argc, char** argv)
{
LoadParamInt(argc, argv, "beamsize", &beamSize, 1);
LoadParamInt(argc, argv, "batchsize", &batchSize, 1);
......@@ -63,7 +63,7 @@ void T2TSearch::Init(int argc, char ** argv)
LoadParamInt(argc, argv, "endid", endSymbols, 2);
LoadParamInt(argc, argv, "startid", &startSymbol, 2);
if(endSymbols[0] >= 0)
if (endSymbols[0] >= 0)
endSymbolNum = 1;
}
......@@ -74,7 +74,7 @@ search for the most promising states
>> padding - padding of the input
>> output - output that represents the sequences as rows
*/
void T2TSearch::Search(T2TModel * model, XTensor * input, XTensor * padding, XTensor * output)
void T2TSearch::Search(T2TModel* model, XTensor* input, XTensor* padding, XTensor* output)
{
T2TPredictor predictor;
XTensor maskEnc;
......@@ -86,10 +86,10 @@ void T2TSearch::Search(T2TModel * model, XTensor * input, XTensor * padding, XTe
CheckNTErrors(endSymbolNum > 0, "The search class is not initialized!");
CheckNTErrors(startSymbol >= 0, "The search class is not initialized!");
Prepare(input->unitNum/input->GetDim(-1), beamSize);
Prepare(input->unitNum / input->GetDim(-1), beamSize);
/* encoder mask */
model->MakeMTMaskEnc(*input, *padding, maskEnc);
//model->MakeMTMaskEnc(*input, *padding, maskEnc);
/* make the encoding network */
encoding = model->MakeEncoder(*input, &maskEnc, false);
......@@ -118,7 +118,7 @@ void T2TSearch::Search(T2TModel * model, XTensor * input, XTensor * padding, XTe
first->isStart = true;
/* generate the sequence from left to right */
for(int i = 0 ; i < maxLength; i++){
for (int i = 0; i < maxLength; i++) {
cur = states + i;
next = states + i + 1;
......@@ -126,7 +126,7 @@ void T2TSearch::Search(T2TModel * model, XTensor * input, XTensor * padding, XTe
predictor.Read(model, cur);
/* predict the next state */
predictor.Predict(next, &encodingBeam, &inputBeam, &paddingBeam, i==0);
predictor.Predict(next, &encodingBeam, &inputBeam, &paddingBeam, i == 0);
/* compute the model score (given the prediction probability) */
Score(cur, next);
......@@ -173,59 +173,56 @@ compute the model score for each hypothesis
>> prev - the beam of the previous state
>> beam - the beam that keeps a number of states
*/
void T2TSearch::Score(T2TStateBundle * prev, T2TStateBundle * beam)
void T2TSearch::Score(T2TStateBundle* prev, T2TStateBundle* beam)
{
XTensor &score = beam->modelScore;
XTensor &prob = beam->prob;
XTensor &probPath = beam->probPath;
XTensor &probPathPrev = prev->probPath;
XTensor &lenPrev = prev->nstep;
XTensor &len = beam->nstep;
XTensor& score = beam->modelScore;
XTensor& prob = beam->prob;
XTensor& probPath = beam->probPath;
XTensor& probPathPrev = prev->probPath;
XTensor& lenPrev = prev->nstep;
XTensor& len = beam->nstep;
XTensor lp;
XTensor mask;
int order = prob.order;
int outputSize = prob.GetDim(-1);
int dims[MAX_TENSOR_DIM_NUM];
for(int i = 0; i < order; i++)
for (int i = 0; i < order; i++)
dims[i] = prob.GetDim(i);
InitTensor(&score, &prob);
InitTensor(&probPath, &prob);
InitTensorV2(&score, &prob);
InitTensorV2(&probPath, &prob);
prob.Reshape(prob.unitNum/outputSize, outputSize);
score.Reshape(score.unitNum/outputSize, outputSize);
prob.Reshape(prob.unitNum / outputSize, outputSize);
score.Reshape(score.unitNum / outputSize, outputSize);
probPath.Reshape(score.unitNum / outputSize, outputSize);
probPathPrev.Reshape(probPathPrev.unitNum);
/* the log-scale probability of the entire sequence */
_SumDim(&prob, &probPathPrev, &probPath, 0);
InitTensor(&len, &lenPrev);
InitTensor(&lp, &lenPrev);
InitTensorV2(&len, &lenPrev);
InitTensorV2(&lp, &lenPrev);
_ScaleAndShift(&lenPrev, &len, 1.0F, 1.0F);
/* the GNMT-like length penalty */
//lp = T2TLengthPenalizer::GNMT(len, alpha);
lp = T2TLengthPenalizer::GNMT(len, alpha);
//lp.Reshape(lp.unitNum);
lp.Reshape(lp.unitNum);
/* score = log-prob/lp */
//_DivDim(&probPath, &lp, &score, 0);
_DivDim(&probPath, &lp, &score, 0);
if (prev->isStart) {
XTensor firstMask = MakeFirstMask(beam);
XTensor firstMask;
firstMask = MakeFirstMask(beam);
firstMask.Reshape(firstMask.unitNum);
/* mask the hypotheses in the beam except the first one */
_SumDim(&score, &firstMask, &score, 0);
}
InitTensor(&mask,
prev->endMark.order, prev->endMark.dimSize, X_FLOAT,
prev->endMark.devID);
InitTensorV2(&mask, prev->endMark.order, prev->endMark.dimSize, X_FLOAT, 1.0F, prev->endMark.devID);
mask.SetZeroAll();
_SetDataFixedCond(&mask, &prev->endMark, -1e9F);
......@@ -235,30 +232,31 @@ void T2TSearch::Score(T2TStateBundle * prev, T2TStateBundle * beam)
be involved in further sorting and beam search. */
_SumDim(&score, &mask, &score, 0);
prob.Reshape(order, dims);
score.Reshape(order, dims);
probPath.Reshape(order, dims);
probPathPrev.Reshape(order - 1, dims);
lp.Reshape(order - 1, dims);
mask.Reshape(order -1 , dims);
mask.Reshape(order - 1, dims);
}
/*
generate tokens for the next state via beam pruning
>> beam - the beam that keeps a number of states
*/
void T2TSearch::Generate(T2TStateBundle * beam)
void T2TSearch::Generate(T2TStateBundle* beam)
{
int dims[MAX_TENSOR_DIM_NUM];
int dimsBeam[MAX_TENSOR_DIM_NUM];
int dimsTopK[MAX_TENSOR_DIM_NUM];
XTensor scoreTopK;
XTensor &score = beam->modelScore;
XTensor &index = beam->prediction;
XTensor &preID = beam->preID;
XTensor &probPath = beam->probPath;
XTensor &prob = beam->prob;
XTensor& score = beam->modelScore;
XTensor& index = beam->prediction;
XTensor& preID = beam->preID;
XTensor& probPath = beam->probPath;
XTensor& prob = beam->prob;
int order = score.order;
CheckNTErrors(order >= 3, "The tensor must be of order 2 or larger.");
......@@ -278,14 +276,14 @@ void T2TSearch::Generate(T2TStateBundle * beam)
dimsTopK[order - 3] = dimsBeam[order - 3];
dimsTopK[order - 1] = beamSize;
InitTensor(&scoreTopK, order, dimsTopK, score.dataType, score.devID);
InitTensor(&index, order, dimsTopK, X_INT, score.devID);
InitTensor(&preID, order, dimsTopK, X_INT, -1);
InitTensorV2(&scoreTopK, order, dimsTopK, score.dataType, 1.0F, score.devID);
InitTensorV2(&index, order, dimsTopK, X_INT, 1.0F, score.devID);
InitTensorV2(&preID, order, dimsTopK, X_INT, 1.0F, -1);
/* mask the first and the padding id */
int dimMask[]{ score.GetDim(-1) };
XTensor mask;
InitTensor(&mask, 1, dimMask, X_FLOAT, -1);
InitTensorV2(&mask, 1, dimMask, X_FLOAT, 1.0F, -1);
mask.SetZeroAll();
mask.Set1D(-1e20F, 0);
mask.Set1D(-1e20F, 1);
......@@ -315,7 +313,7 @@ void T2TSearch::Generate(T2TStateBundle * beam)
score.Reshape(order, dims);
/* we keep the top-k scores */
InitTensor(&score, &scoreTopK);
InitTensorV2(&score, &scoreTopK);
CopyValues(scoreTopK, score);
/* CPU data (TODO: remove GPU->CPU data copy!!!) */
......@@ -334,9 +332,9 @@ void T2TSearch::Generate(T2TStateBundle * beam)
/* sequence probability of top-k candidates */
XTensor probPathTopK;
InitTensor(&probPathTopK, &scoreTopK);
InitTensorV2(&probPathTopK, &scoreTopK);
XTensor probTopK;
InitTensor(&probTopK, &scoreTopK);
InitTensorV2(&probTopK, &scoreTopK);
for (int i = 0; i < probPath.order; i++) {
dims[i] = probPath.GetDim(i);
......@@ -366,19 +364,19 @@ void T2TSearch::Generate(T2TStateBundle * beam)
expand the search graph
>> beam - the beam that keeps a number of states
*/
void T2TSearch::Expand(T2TStateBundle * prev, T2TStateBundle * beam)
void T2TSearch::Expand(T2TStateBundle* prev, T2TStateBundle* beam)
{
CheckNTErrors(beam->prediction.unitNum == beam->preID.unitNum, "A problem occurs in the beam!");
beam->MakeStates(beam->prediction.unitNum);
T2TState * states = beam->states;
XTensor & idRef = beam->preID;
XTensor & modelScoreRef = beam->modelScore;
XTensor & probRef = beam->prob;
XTensor & probPathRef = beam->probPath;
XTensor & predictionRef = beam->prediction;
XTensor & endMark = beam->endMark;
T2TState* states = beam->states;
XTensor& idRef = beam->preID;
XTensor& modelScoreRef = beam->modelScore;
XTensor& probRef = beam->prob;
XTensor& probPathRef = beam->probPath;
XTensor& predictionRef = beam->prediction;
XTensor& endMark = beam->endMark;
XTensor id;
XTensor modelScore;
XTensor prob;
......@@ -392,7 +390,7 @@ void T2TSearch::Expand(T2TStateBundle * prev, T2TStateBundle * beam)
InitTensorOnCPU(&probPath, &probPathRef);
InitTensorOnCPU(&prediction, &predictionRef);
InitTensorOnCPU(&endMarkCPU, &predictionRef);
InitTensor(&endMark, &predictionRef);
InitTensorV2(&endMark, &predictionRef);
/* we copy the data to CPU because the frequent access to GPU is slow
and we can speed-up the process by doing the job on CPU. */
......@@ -408,14 +406,14 @@ void T2TSearch::Expand(T2TStateBundle * prev, T2TStateBundle * beam)
maintained on CPUs to ease the implementation of frequent access and
modification of the states. An alternative is to do this on GPUs but
it needs much more coding work and the speed-up is not obvious. */
for(int i = 0; i < beam->stateNum; i += beamSize){
for (int i = 0; i < beam->stateNum; i += beamSize) {
for (int j = 0; j < beamSize; j++) {
int k = i + j;
T2TState & state = states[k];
T2TState& state = states[k];
int offset = id.GetInt(k);
int pid = i / beamSize;
T2TState * last = prev->states + pid * beamSize + offset;
T2TState* last = prev->states + pid * beamSize + offset;
CheckNTErrors(offset >= 0, "Wrong state index!");
......@@ -462,12 +460,12 @@ collect hypotheses with ending symbols. Given a beam of hypotheses,
we remove the finished hypotheses and keep them in a heap.
>> beam - the beam that keeps a number of states
*/
void T2TSearch::Collect(T2TStateBundle * beam)
void T2TSearch::Collect(T2TStateBundle* beam)
{
T2TState * states = beam->states;
T2TState* states = beam->states;
for (int i = 0; i < beam->stateNum; i++) {
T2TState & state = states[i];
T2TState& state = states[i];
CheckNTErrors(state.pid >= 0 && state.pid < batchSize,
"Invalid sample id!");
......@@ -477,7 +475,7 @@ void T2TSearch::Collect(T2TStateBundle * beam)
bool isCompleted = state.isCompleted && (state.last == NULL || !state.last->isCompleted);
/* we push the hypothesis into the heap when it is completed */
if(state.isEnd != 0)
if (state.isEnd != 0)
fullHypos[state.pid].Push(HeapNode<float>(&state, state.modelScore));
}
}
......@@ -486,16 +484,16 @@ void T2TSearch::Collect(T2TStateBundle * beam)
fill the hypotheis heap with incomplete hypotheses
>> beam - the beam that keeps a number of states (final)
*/
void T2TSearch::FillHeap(T2TStateBundle * beam)
void T2TSearch::FillHeap(T2TStateBundle* beam)
{
bool * emptyFlags = new bool[batchSize];
bool* emptyFlags = new bool[batchSize];
for (int i = 0; i < batchSize; i++)
emptyFlags[i] = (fullHypos[i].Count() == 0);
T2TState * states = beam->states;
T2TState* states = beam->states;
for (int i = 0; i < beam->stateNum; i++) {
T2TState & state = states[i];
T2TState& state = states[i];
CheckNTErrors(state.pid >= 0 && state.pid < batchSize,
"Invalid sample id!");
......@@ -512,28 +510,28 @@ void T2TSearch::FillHeap(T2TStateBundle * beam)
save the output sequences in a tensor
>> output - output sequences (for return)
*/
void T2TSearch::Dump(XTensor * output)
void T2TSearch::Dump(XTensor* output)
{
int dims[3] = {batchSize, beamSize, maxLength};
int * words = new int[maxLength];
int dims[3] = { batchSize, beamSize, maxLength };
int* words = new int[maxLength];
InitTensor(output, 3, dims, X_INT);
InitTensorV2(output, 3, dims, X_INT);
SetDataFixedInt(*output, -1);
/* heap for an input sentence in the batch */
for(int h = 0; h < batchSize; h++){
for (int h = 0; h < batchSize; h++) {
XHeap<MIN_HEAP, float> &heap = fullHypos[h];
XHeap<MIN_HEAP, float>& heap = fullHypos[h];
/* for each output in the beam */
for(int i = 0; i < beamSize && heap.Count() > 0; i++){
T2TState * state = (T2TState *)heap.Pop().index;
for (int i = 0; i < beamSize && heap.Count() > 0; i++) {
T2TState* state = (T2TState*)heap.Pop().index;
int count = 0;
bool isCompleted = true;
/* we track the state from the end to the beginning */
while(state != NULL){
while (state != NULL) {
if (!state->isCompleted)
isCompleted = false;
if (isCompleted)
......@@ -544,7 +542,7 @@ void T2TSearch::Dump(XTensor * output)
}
/* dump the sentence to the output tensor */
for(int w = 0; w < count; w++)
for (int w = 0; w < count; w++)
output->Set3DInt(words[count - w - 1], h, beamSize - i - 1, w);
}
}
......@@ -560,8 +558,8 @@ bool T2TSearch::IsEnd(int token)
{
CheckNTErrors(endSymbolNum > 0, "No end symbol?");
for(int i = 0; i < endSymbolNum; i++){
if(endSymbols[i] == token)
for (int i = 0; i < endSymbolNum; i++) {
if (endSymbols[i] == token)
return true;
}
......@@ -573,17 +571,17 @@ set end symbols for search
>> tokens - end symbols
>> tokenNum - number of the end symbols
*/
void T2TSearch::SetEnd(const int * tokens, const int tokenNum)
void T2TSearch::SetEnd(const int* tokens, const int tokenNum)
{
if(endSymbols != NULL)
if (endSymbols != NULL)
delete[] endSymbols;
if(tokenNum <= 0)
if (tokenNum <= 0)
return;
/* we may have multiple end symbols */
tokens = new int[tokenNum];
for(int i = 0; i < tokenNum; i++)
for (int i = 0; i < tokenNum; i++)
endSymbols[i] = tokens[i];
endSymbolNum = tokenNum;
}
......@@ -592,9 +590,9 @@ void T2TSearch::SetEnd(const int * tokens, const int tokenNum)
make a mask to prevent duplicated entries in beam expansion for the first position
>> beam - the beam that keeps the searching states
*/
XTensor T2TSearch::MakeFirstMask(T2TStateBundle * beam)
XTensor T2TSearch::MakeFirstMask(T2TStateBundle* beam)
{
XTensor &prob = beam->prob;
XTensor& prob = beam->prob;
XTensor mask;
int order = prob.order;
......@@ -602,7 +600,7 @@ XTensor T2TSearch::MakeFirstMask(T2TStateBundle * beam)
for (int i = 0; i < order - 1; i++)
dims[i] = prob.GetDim(i);
InitTensor(&mask, order - 1, dims, X_FLOAT);
InitTensorV2(&mask, order - 1, dims, X_FLOAT);
mask.SetZeroAll();
for (int i = 0; i < mask.unitNum; i++) {
......
......@@ -15,7 +15,7 @@
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
*/
......@@ -51,10 +51,10 @@ private:
int batchSize;
/* we keep the final hypotheses in a heap for each sentence in the batch. */
XHeap<MIN_HEAP, float> * fullHypos;
XHeap<MIN_HEAP, float>* fullHypos;
/* array of the end symbols */
int * endSymbols;
int* endSymbols;
/* number of the end symbols */
int endSymbolNum;
......@@ -70,40 +70,40 @@ public:
~T2TSearch();
/* initialize the model */
void Init(int argc, char ** argv);
void Init(int argc, char** argv);
/* search for the most promising states */
void Search(T2TModel * model, XTensor * input, XTensor * padding, XTensor * output);
void Search(T2TModel* model, XTensor* input, XTensor* padding, XTensor* output);
/* preparation */
void Prepare(int myBatchSize,int myBeamSize);
void Prepare(int myBatchSize, int myBeamSize);
/* compute the model score for each hypothesis */
void Score(T2TStateBundle * prev, T2TStateBundle * beam);
void Score(T2TStateBundle* prev, T2TStateBundle* beam);
/* generate token indices via beam pruning */
void Generate(T2TStateBundle * beam);
void Generate(T2TStateBundle* beam);
/* expand the search graph */
void Expand(T2TStateBundle * prev, T2TStateBundle * beam);
void Expand(T2TStateBundle* prev, T2TStateBundle* beam);
/* collect hypotheses with ending symbol */
void Collect(T2TStateBundle * beam);
void Collect(T2TStateBundle* beam);
/* fill the hypotheis heap with incomplete hypothses */
void FillHeap(T2TStateBundle * beam);
void FillHeap(T2TStateBundle* beam);
/* save the output sequences in a tensor */
void Dump(XTensor * output);
void Dump(XTensor* output);
/* check if the token is an end symbol */
bool IsEnd(int token);
/* set end symbols for search */
void SetEnd(const int * tokens, const int tokenNum);
void SetEnd(const int* tokens, const int tokenNum);
/* make a mask to prevent duplicated entries in beam expansion for the first position */
XTensor MakeFirstMask(T2TStateBundle * beam);
XTensor MakeFirstMask(T2TStateBundle* beam);
};
}
......
......@@ -15,7 +15,7 @@
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
*/
......@@ -44,7 +44,7 @@ T2TTester::~T2TTester()
}
/* initialize the model */
void T2TTester::Init(int argc, char ** argv)
void T2TTester::Init(int argc, char** argv)
{
LoadParamInt(argc, argv, "vsize", &vSize, 34040);
LoadParamInt(argc, argv, "vsizetgt", &vSizeTgt, vSize);
......@@ -60,7 +60,7 @@ test the model
>> ofn - output data file
>> model - model that is trained
*/
void T2TTester::Test(const char * fn, const char * ofn, T2TModel * model)
void T2TTester::Test(const char* fn, const char* ofn, T2TModel* model)
{
int wc = 0;
int wordCount = 0;
......@@ -94,8 +94,8 @@ void T2TTester::Test(const char * fn, const char * ofn, T2TModel * model)
count++;
wordCount = 0;
for (int i = 0; i < model->decoder->nlayer; ++i) {
model->decoder->selfCache[i].miss = true;
model->decoder->contextCache[i].miss = true;
model->decoder->selfAttCache[i].miss = true;
model->decoder->enDeAttCache[i].miss = true;
}
vector<int> indices = batchLoader.LoadBatch(&batchEnc, &paddingEnc, sentBatch, devID);
......@@ -103,14 +103,14 @@ void T2TTester::Test(const char * fn, const char * ofn, T2TModel * model)
XTensor output;
seacher.Search(model, &batchEnc, &paddingEnc, &output);
output.Dump(stderr);
for (int i = 0; i < indices.size(); ++i) {
Result res;
XTensor sent, srcIdx, tgtIdx;
InitTensor1D(&srcIdx, 1, X_INT, output.devID);
int idx[]{i};
InitTensor1DV2(&srcIdx, 1, X_INT, output.devID);
int idx[]{ i };
srcIdx.SetData(idx, 1);
InitTensor(&tgtIdx, &srcIdx);
InitTensorV2(&tgtIdx, &srcIdx);
SetAscendingOrder(tgtIdx, 0);
sent = CopyIndexed(output, 0, srcIdx, tgtIdx);
......@@ -153,7 +153,7 @@ dump the result into the file
>> file - data file
>> output - output tensor
*/
void T2TTester::Dump(FILE * file, XTensor * output)
void T2TTester::Dump(FILE* file, XTensor* output)
{
int seqLength = output->GetDim(-1);
......
......@@ -15,7 +15,7 @@
* limitations under the License.
*/
/*
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
* A week with no trips :)
*/
......@@ -56,13 +56,13 @@ public:
~T2TTester();
/* initialize the model */
void Init(int argc, char ** argv);
void Init(int argc, char** argv);
/* test the model */
void Test(const char * fn, const char * ofn, T2TModel * model);
void Test(const char* fn, const char* ofn, T2TModel* model);
/* dump the result into the file */
void Dump(FILE * file, XTensor * output);
void Dump(FILE* file, XTensor* output);
};
}
......
......@@ -38,7 +38,7 @@ namespace transformer
{
/* entrance of the program */
int TransformerMain(int argc, const char ** argv);
int TransformerMain(int argc, const char** argv);
}
......
......@@ -28,6 +28,7 @@
#include "XList.h"
#include "XGlobal.h"
/* the nts (NiuTrans.Tensor) namespace */
namespace nts {
......@@ -363,6 +364,8 @@ template struct TensorListBase<long>;
template struct TensorListBase<float>;
template struct TensorListBase<short>;
template struct TensorListBase<XTensor*>;
template struct TensorListBase<uint64_t>;
template struct TensorListBase<void*>;
} /* end of the nts (NiuTrans.Tensor) namespace */
\ No newline at end of file
......@@ -26,6 +26,8 @@
#include "XMem.h"
#include "XGlobal.h"
#include <cstdint>
#ifndef __TensorList_H__
#define __TensorList_H__
......@@ -118,7 +120,14 @@ public:
void Shuffle(int nround = 10, int beg = -1, int len = 0);
/* short */
T& operator[] (int i) { return GetItem(i); };
T& operator[] (int i) {
CheckNTErrors(i >= -count && i < count, "Index of a list item is out of scope!");
CheckNTErrors(count > 0, "Cannt index the item in an empty list!");
if (i < 0)
return items[count + i];
else
return items[i];
};
T& Get(int i) { return GetItem(i); };
void Set(int i, T item) { SetItem(i, item); };
};
......@@ -132,7 +141,7 @@ typedef TensorListBase<char*> StrList;
typedef TensorListBase<long> LongList;
typedef TensorListBase<float> FloatList;
typedef TensorListBase<short> ShortList;
typedef TensorListBase<uint64_t> UInt64List;
typedef TensorListBase<XTensor*> TensorList;
} /* end of the nts (NiuTrans.Tensor) namespace */
......
......@@ -15,7 +15,7 @@
* limitations under the License.
*/
/*
/*
*
* implementation of tensors used in this work. It it is the basis of XMatrix
* and XVector
......@@ -53,7 +53,7 @@
#ifdef USE_CUDA
// the CUDA stuff
// the CUDA stuff
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda.h>
......@@ -64,7 +64,7 @@
#endif
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
namespace nts {
int tensorIDGlobal = 0;
MUTEX_HANDLE tensorMutex;
......@@ -73,7 +73,7 @@ XTensor NULLTensor;
/* generate a tensor id */
int MakeTensorID()
{
if(tensorIDGlobal == 0)
if (tensorIDGlobal == 0)
MUTEX_INIT(tensorMutex);
MUTEX_LOCK(tensorMutex);
......@@ -97,7 +97,7 @@ XTensor::XTensor()
}
/* constructor */
XTensor::XTensor(const XTensor * reference)
XTensor::XTensor(const XTensor* reference)
{
Init();
SetDataPointer();
......@@ -112,7 +112,7 @@ constructor
>> myDevID - device id
>> myMem - memory pool used to allocating the data array
*/
XTensor::XTensor(const int myOrder, int myDevID, XMem * myMem)
XTensor::XTensor(const int myOrder, int myDevID, XMem* myMem)
{
CheckNTErrors((myOrder >= 0), "Illegal tensor order1");
......@@ -134,8 +134,8 @@ constructor
>> myDevID - device id
>> myMem - memory pool used to allocating the data array
*/
XTensor::XTensor(const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType,
const float myDenseRatio, int myDevID, XMem * myMem)
XTensor::XTensor(const int myOrder, const int* myDimSize, const TENSOR_DATA_TYPE myDataType,
const float myDenseRatio, int myDevID, XMem* myMem)
{
Init();
SetDataPointer();
......@@ -145,12 +145,12 @@ XTensor::XTensor(const int myOrder, const int * myDimSize, const TENSOR_DATA_TYP
mem = myMem;
devID = myMem != NULL ? myMem->devID : myDevID;
if(order >= 0)
if (order >= 0)
Resize(myOrder, myDimSize, myDataType, myDenseRatio);
}
/* copy constructor */
XTensor::XTensor(const XTensor &reference)
XTensor::XTensor(const XTensor& reference)
{
Init();
SetDataPointer();
......@@ -159,7 +159,7 @@ XTensor::XTensor(const XTensor &reference)
data = NULL;
dataHost = NULL;
if(reference.isTmp){
if (reference.isTmp) {
devID = reference.devID;
mem = reference.mem;
data = reference.data;
......@@ -172,16 +172,16 @@ XTensor::XTensor(const XTensor &reference)
This is VERY tricky and there might be better solutions :) */
*reference.dataP = NULL;
}
else{
else {
devID = reference.devID;
mem = reference.mem;
InitTensorV2(this, &reference);
_CopyValues(&reference, this);
}
if(reference.isTmp)
if (reference.isTmp)
XLink::Replace(&reference, this);
else{
else {
CheckNTErrors(outgo.tailNum == 0, "The node has outgoing edge to other nodes!");
XLink::CopyIncoming(&reference, this);
}
......@@ -191,7 +191,7 @@ XTensor::XTensor(const XTensor &reference)
}
/* copy constructor (with right value reference) */
XTensor::XTensor(const XTensor &&reference)
XTensor::XTensor(const XTensor&& reference)
{
Init();
SetDataPointer();
......@@ -225,12 +225,12 @@ XTensor::~XTensor()
the connectivity of the graph. To kill memory
leak, we release the data of the new tensor
when its parent is deleted (see ClearIncoming). */
if(outgo.tailNum > 0){
if (outgo.tailNum > 0) {
int dims[MAX_TENSOR_DIM_NUM];
memcpy(dims, dimSize, order * sizeof(int));
dims[0] = -dims[0];
XTensor * newTensor = new XTensor(order, dims, dataType, denseRatio, devID, mem);
XTensor* newTensor = new XTensor(order, dims, dataType, denseRatio, devID, mem);
newTensor->SetTMPFlag();
newTensor->data = data;
data = NULL;
......@@ -243,12 +243,12 @@ XTensor::~XTensor()
DestroyData();
if(grad != NULL)
if (grad != NULL)
delete grad;
}
/* set the name of the tensor */
void XTensor::SetName(const char * myName)
void XTensor::SetName(const char* myName)
{
strcpy(name, myName);
}
......@@ -280,7 +280,7 @@ void XTensor::Init()
isTmp = false;
isGrad = false;
isVar = false;
enableGrad = true;
enableGrad = X_ENABLE_GRAD;
visitMark = 0;
grad = NULL;
}
......@@ -288,17 +288,17 @@ void XTensor::Init()
/* delete data arrays */
void XTensor::DestroyData()
{
if(data != NULL && mem == NULL && !isShared)
if (data != NULL && mem == NULL && !isShared)
XMemFree(devID, data);
else if(data != NULL && isInGlobalMem)
else if (data != NULL && isInGlobalMem)
FreeData(this, mem);
else if(data != NULL)
else if (data != NULL)
mem->Release(data, GetDataSizeInChar(), signature);
data = NULL;
if(dataHost != NULL)
delete[] (char*)dataHost;
if (dataHost != NULL)
delete[](char*)dataHost;
dataHost = NULL;
}
......@@ -307,7 +307,7 @@ shallow copy of the tensor
Note that we do not copy data array here
>> tensor - the source tensor
*/
void XTensor::ShallowCopy(const XTensor &tensor)
void XTensor::ShallowCopy(const XTensor& tensor)
{
strcpy(name, tensor.name);
order = tensor.order;
......@@ -330,12 +330,12 @@ XTensor& XTensor::operator= (const XTensor& tensor)
{
/* we must make a hard copy of the tensor if it is the input
of another node. */
if(outgo.tailNum > 0){
if (outgo.tailNum > 0) {
int dims[MAX_TENSOR_DIM_NUM];
memcpy(dims, dimSize, order * sizeof(int));
dims[0] = -dims[0];
XTensor * newTensor = new XTensor(order, dims, dataType, denseRatio, devID, mem);
XTensor* newTensor = new XTensor(order, dims, dataType, denseRatio, devID, mem);
newTensor->SetTMPFlag();
newTensor->data = data;
newTensor->dataHost = dataHost;
......@@ -350,35 +350,35 @@ XTensor& XTensor::operator= (const XTensor& tensor)
dataHost = NULL;
}
if(false && !tensor.isTmp){
if (false && !tensor.isTmp) {
/* NOTE: this might lead to additional data copy by Mac LLVM compilers */
/* we make an identity transformation here */
if(outgo.tailNum > 0)
if (outgo.tailNum > 0)
XLink::ClearOutgoing(this);
XLink::ClearIncoming(this);
if(!_IsSameShaped(this, &tensor))
if (!_IsSameShaped(this, &tensor))
Resize(tensor.order, tensor.dimSize, tensor.dataType, tensor.denseRatio);
_Identity(&tensor, this);
XLink::MakeLink(&tensor, NULL, this, FUNC_IDENTITY);
}
else{
else {
/* hard copy of the data array */
int size = unitNum * unitSize;
if( isInit && !isSparse && !tensor.isSparse &&
if (isInit && !isSparse && !tensor.isSparse &&
size == tensor.unitNum * tensor.unitSize &&
((devID < 0 && tensor.devID < 0) && devID == tensor.devID) &&
data != NULL)
{
XMemCopy(data, devID, tensor.data, tensor.devID, size);
if(dataHost != NULL && tensor.dataHost != NULL)
if (dataHost != NULL && tensor.dataHost != NULL)
XMemCopy(dataHost, -1, tensor.dataHost, tensor.devID, size);
}
else{
else {
DestroyData();
if(!isInit){
if (!isInit) {
devID = tensor.devID;
mem = tensor.mem;
}
......@@ -407,12 +407,12 @@ XTensor& XTensor::operator= (const XTensor&& tensor)
{
/* we must make a hard copy of the tensor if it is the input
of another node. */
if(outgo.tailNum > 0){
if (outgo.tailNum > 0) {
int dims[MAX_TENSOR_DIM_NUM];
memcpy(dims, dimSize, order * sizeof(int));
dims[0] = -dims[0];
XTensor * newTensor = new XTensor(order, dims, dataType, denseRatio, devID, mem);
XTensor* newTensor = new XTensor(order, dims, dataType, denseRatio, devID, mem);
newTensor->SetTMPFlag();
newTensor->data = data;
newTensor->dataHost = dataHost;
......@@ -500,7 +500,7 @@ XTensor XTensor::operator/ (const XTensor& tensor) const
/* overloading of the division-sign */
XTensor XTensor::operator/ (const DTYPE scale) const
{
return ScaleAndShift(*this, (DTYPE)1/scale, 0);
return ScaleAndShift(*this, (DTYPE)1 / scale, 0);
}
/*
......@@ -518,7 +518,7 @@ relocate the data on the target device
>> myDevId - target device id
>> myMem - memory pool on the target device
*/
void XTensor::SetDevice(int myDevId, XMem * myMem)
void XTensor::SetDevice(int myDevId, XMem* myMem)
{
if (myMem == NULL) {
myMem = GMems.GetMem(myDevId);
......@@ -527,9 +527,9 @@ void XTensor::SetDevice(int myDevId, XMem * myMem)
isInGlobalMem = false;
}
bool XTensor::IsReduceShaped(const XTensor * a, const XTensor * b, int dim)
bool XTensor::IsReduceShaped(const XTensor* a, const XTensor* b, int dim)
{
if(a == NULL || b == NULL)
if (a == NULL || b == NULL)
return false;
if ((a->order - 1) != b->order)
......@@ -541,18 +541,18 @@ bool XTensor::IsReduceShaped(const XTensor * a, const XTensor * b, int dim)
return false;
}
else if (i >= dim) {
if (a->dimSize[i+1] != b->dimSize[i])
if (a->dimSize[i + 1] != b->dimSize[i])
return false;
}
}
if(a->dataType != b->dataType)
if (a->dataType != b->dataType)
return false;
if(a->denseRatio != b->denseRatio)
if (a->denseRatio != b->denseRatio)
return false;
if(a->isSparse != b->isSparse)
if (a->isSparse != b->isSparse)
return false;
return true;
......@@ -562,7 +562,7 @@ bool XTensor::IsReduceShaped(const XTensor * a, const XTensor * b, int dim)
set the size of each dimension
>> myDimSize - size of each dimension
*/
void XTensor::SetDim(int * myDimSize)
void XTensor::SetDim(int* myDimSize)
{
for (int i = 0; i < order; i++) {
dimSize[i] = myDimSize[i];
......@@ -579,7 +579,7 @@ int XTensor::GetDim(const int dim) const
CheckNTErrors(dim >= -order, "dimenision is out of range!");
int d = dim;
if(dim < 0)
if (dim < 0)
d = order + dim;
return dimSize[d];
......@@ -590,12 +590,12 @@ reshape the tensor
>> myOrder - order of the tensor
>> myDimSize - size of each dimension
*/
void XTensor::Reshape(const int myOrder, const int * myDimSize)
void XTensor::Reshape(const int myOrder, const int* myDimSize)
{
int dims[MAX_TENSOR_DIM_NUM];
int num = 1;
for(int i = 0; i < myOrder; i++){
for (int i = 0; i < myOrder; i++) {
num *= myDimSize[i];
dims[i] = abs(myDimSize[i]);
}
......@@ -623,7 +623,7 @@ reshape the tensor into a matrix
*/
void XTensor::Reshape(const int rowNum, const int colNum)
{
int dims[2] = {rowNum, colNum};
int dims[2] = { rowNum, colNum };
Reshape(2, dims);
}
......@@ -663,7 +663,7 @@ XTensor XTensor::TypeAs(const XTensor input)
/* get the number of items in the data array */
int XTensor::GetSize() const
{
if(isSparse)
if (isSparse)
return unitNumNonZero;
else
return unitNum;
......@@ -672,13 +672,13 @@ int XTensor::GetSize() const
/* get the size of the memory space used */
int XTensor::GetDataSizeInChar() const
{
if(isSparse){
if (isSparse) {
int num = int(unitNum * denseRatio + 1);
int tupleSize = sizeof(int)+sizeof(DTYPE);
int size = sizeof(int) + tupleSize*(num);
int tupleSize = sizeof(int) + sizeof(DTYPE);
int size = sizeof(int) + tupleSize * (num);
return size;
}
else{
else {
return unitNum * unitSize;
}
}
......@@ -690,15 +690,15 @@ get unit size in terms of "dataType"
*/
int XTensor::GetUnitSize(TENSOR_DATA_TYPE myDataType) const
{
if(myDataType == X_INT)
if (myDataType == X_INT)
return sizeof(int);
else if(myDataType == X_FLOAT)
else if (myDataType == X_FLOAT)
return sizeof(float);
else if(myDataType == X_DOUBLE)
else if (myDataType == X_DOUBLE)
return sizeof(double);
else if(myDataType == X_INT8)
else if (myDataType == X_INT8)
return 1;
else if(myDataType == X_FLOAT16)
else if (myDataType == X_FLOAT16)
return 2;
return sizeof(float);
}
......@@ -737,21 +737,21 @@ MTYPE XTensor::GetOffset3D(int d0, int d1, int d2) const
a vector with all entries of 0
>> stream - stream for the job pipeline
*/
void XTensor::SetZeroAll(XStream * stream)
void XTensor::SetZeroAll(XStream* stream)
{
if(data == NULL)
if (data == NULL)
return;
if(isSparse){
if(devID >= 0){
if (isSparse) {
if (devID >= 0) {
#ifdef USE_CUDA
int size = sizeof(int) + (sizeof(int)+sizeof(DTYPE)) * unitNumNonZero;
int size = sizeof(int) + (sizeof(int) + sizeof(DTYPE)) * unitNumNonZero;
int devIDBackup = 0;
cudaGetDevice(&devIDBackup);
cudaSetDevice(devID);
if(stream == NULL)
if (stream == NULL)
cudaMemset(data, 0, size);
else
cudaMemsetAsync(data, 0, size, stream->stream);
......@@ -764,14 +764,14 @@ void XTensor::SetZeroAll(XStream * stream)
unitNumNonZero = 0;
}
else{
if(devID >= 0){
else {
if (devID >= 0) {
#ifdef USE_CUDA
int devIDBackup = 0;
cudaGetDevice(&devIDBackup);
cudaSetDevice(devID);
if(stream == NULL)
if (stream == NULL)
cudaMemset(data, 0, unitNum * unitSize);
else
cudaMemsetAsync(data, 0, unitNum * unitSize, stream->stream);
......@@ -789,9 +789,9 @@ void XTensor::SetZeroAll(XStream * stream)
>> num - number of data items
>> beg - where we start the data copy in the data array of the tensor
*/
void XTensor::SetData(const void * d, int num, int beg)
void XTensor::SetData(const void* d, int num, int beg)
{
if (data == NULL || d ==NULL)
if (data == NULL || d == NULL)
return;
CheckNTErrors(!isSparse, "TODO");
......@@ -830,7 +830,7 @@ void XTensor::SetDataRand(DTYPE lower, DTYPE upper)
// srand((unsigned)time(0));
DTYPE variance = upper - lower;
void * d = NULL;
void* d = NULL;
if (dataType == X_FLOAT) {
d = new float[unitNum];
for (int i = 0; i < unitNum; i++) {
......@@ -851,10 +851,10 @@ void XTensor::SetDataRand(DTYPE lower, DTYPE upper)
SetData(d, unitNum);
if (dataType == X_FLOAT) {
delete[] (float*)d;
delete[](float*)d;
}
else {
delete[] (double*)d;
delete[](double*)d;
}
}
......@@ -868,12 +868,12 @@ double GaussRand(DTYPE mean, DTYPE standardDeviation)
double z;
double pi = 3.141592654;
if (phase == 0){
if (phase == 0) {
u = (rand() + 1.0) / (RAND_MAX + 1.0);
v = (rand() + 1.0) / (RAND_MAX + 1.0);
z = sqrt(-2.0 * log(u))* sin(2.0 * pi * v);
z = sqrt(-2.0 * log(u)) * sin(2.0 * pi * v);
}
else{
else {
z = sqrt(-2.0 * log(u)) * cos(2.0 * pi * v);
}
......@@ -894,7 +894,7 @@ void XTensor::SetDataRandn(DTYPE mean, DTYPE standardDeviation)
return;
// srand((unsigned)time(0));
void * d = NULL;
void* d = NULL;
if (dataType == X_FLOAT) {
d = new float[unitNum];
for (int i = 0; i < unitNum; i++) {
......@@ -914,10 +914,10 @@ void XTensor::SetDataRandn(DTYPE mean, DTYPE standardDeviation)
SetData(d, unitNum);
if (dataType == X_FLOAT) {
delete[] (float*)d;
delete[](float*)d;
}
else {
delete[] (double*)d;
delete[](double*)d;
}
}
......@@ -927,7 +927,7 @@ set tensor items with an array of offsets
>> value - value for the data items
>> num - number of the data items
*/
void XTensor::SetDataBatched(MTYPE * offsets, DTYPE value, int num)
void XTensor::SetDataBatched(MTYPE* offsets, DTYPE value, int num)
{
_SetDataWithOffset(this, offsets, value, num);
}
......@@ -938,7 +938,7 @@ set tensor items with an array of values
>> values - value for each data item
>> num - number of the data items
*/
void XTensor::SetDataBatchedWithValues(MTYPE * offsets, void * values, int num)
void XTensor::SetDataBatchedWithValues(MTYPE* offsets, void* values, int num)
{
_SetDataWithOffsetAndValue(this, offsets, values, num);
}
......@@ -974,7 +974,7 @@ DTYPE XTensor::Get(int offset) const
CheckNTErrors(data != NULL, "Cannot use an uninitialized tensor!");
CheckNTErrors(denseRatio == 1.0F, "Only dense tensors are supported in Get(offset).");
DTYPE * address = (DTYPE*)data + offset;
DTYPE* address = (DTYPE*)data + offset;
return ToCPU(devID, address);
}
......@@ -985,25 +985,25 @@ get the pointer to a cell
>> size - size of index
<< return - pointer to the cell
*/
void * XTensor::GetCell(int index[], int size) const
void* XTensor::GetCell(int index[], int size) const
{
CheckNTErrors((size == order), "Illegal index!");
int offset = index[0];
for(int i = 1; i < size; ++i){
for (int i = 1; i < size; ++i) {
CheckNTErrors((index[i] < dimSize[i]), "Index is out of range!");
offset = offset * dimSize[i] + index[i];
}
if(isSparse){
if (isSparse) {
DTYPE value;
void * p;
if(BinarySearch(offset, value, p))
void* p;
if (BinarySearch(offset, value, p))
return (char*)p + sizeof(int);
else
return NULL;
}
else{
else {
return ((char*)data) + offset * unitSize;
}
}
......@@ -1017,8 +1017,8 @@ DTYPE XTensor::Get0D() const
CheckNTErrors((order == 0), "Cannot get a 0d cell for a tensor whose order is not 0!");
CheckNTErrors((dataType == DEFAULT_DTYPE), "The tensor is not in default type.");
int dims[1] = {0};
void * value = GetCell(dims, 0);
int dims[1] = { 0 };
void* value = GetCell(dims, 0);
return ToCPU(devID, value);
}
......@@ -1034,8 +1034,8 @@ DTYPE XTensor::Get1D(int i) const
CheckNTErrors((i >= 0 && i < dimSize[0]), "dimension 0 is out of range!");
CheckNTErrors((dataType == DEFAULT_DTYPE), "The tensor is not in default type.");
int dims[1] = {i};
void * value = GetCell(dims, 1);
int dims[1] = { i };
void* value = GetCell(dims, 1);
return ToCPU(devID, value);
}
......@@ -1053,8 +1053,8 @@ DTYPE XTensor::Get2D(int ni, int mi) const
CheckNTErrors((mi >= 0 && mi < dimSize[1]), "dimension 1 is out of range!");
CheckNTErrors((dataType == DEFAULT_DTYPE), "The tensor is not in default type.");
int dims[2] = {ni, mi};
void * value = GetCell(dims, 2);
int dims[2] = { ni, mi };
void* value = GetCell(dims, 2);
return ToCPU(devID, value);
}
......@@ -1073,8 +1073,8 @@ DTYPE XTensor::Get3D(int d0, int d1, int d2) const
CheckNTErrors((d2 >= 0 && d2 < dimSize[2]), "dimension 2 is out of range!");
CheckNTErrors((dataType == DEFAULT_DTYPE), "The tensor is not in default type.");
int dims[3] = {d0, d1, d2};
void * value = GetCell(dims, 3);
int dims[3] = { d0, d1, d2 };
void* value = GetCell(dims, 3);
return ToCPU(devID, value);
}
......@@ -1090,7 +1090,7 @@ int XTensor::GetInt(int offset) const
CheckNTErrors(data != NULL, "Cannot use an uninitialized tensor!");
CheckNTErrors(denseRatio == 1.0F, "Only dense tensors are supported in Get(offset).");
int * address = (int*)data + offset;
int* address = (int*)data + offset;
return ToCPUInt(devID, address);
}
......@@ -1104,8 +1104,8 @@ int XTensor::Get0DInt() const
CheckNTErrors(order == 0, "Cannot get a 0d cell for a tensor whose order is not 0!");
CheckNTErrors(dataType == X_INT, "The tensor is not in int type.");
int dims[1] = {0};
void * value = GetCell(dims, 0);
int dims[1] = { 0 };
void* value = GetCell(dims, 0);
return ToCPUInt(devID, value);
}
......@@ -1121,8 +1121,8 @@ int XTensor::Get1DInt(int i) const
CheckNTErrors(i >= 0 && i < dimSize[0], "dimension 0 is out of range!");
CheckNTErrors(dataType == X_INT, "The tensor is not in int type.");
int dims[1] = {i};
void * value = GetCell(dims, 1);
int dims[1] = { i };
void* value = GetCell(dims, 1);
return ToCPUInt(devID, value);
}
......@@ -1133,15 +1133,15 @@ get the value of a cell in a 2d tensor in int type
>> mi - column index
<< return - value of cell(ni, mi) in int
*/
int XTensor::Get2DInt(int ni, int mi) const
int XTensor::Get2DInt(int ni, int mi) const
{
CheckNTErrors(order == 2, "Cannot get a 2d cell for a tensor whose order is not 2!");
CheckNTErrors(ni >= 0 && ni < dimSize[0], "dimension 0 is out of range!");
CheckNTErrors(mi >= 0 && mi < dimSize[1], "dimension 1 is out of range!");
CheckNTErrors(dataType == X_INT, "The tensor is not in default type.");
int dims[2] = {ni, mi};
void * value = GetCell(dims, 2);
int dims[2] = { ni, mi };
void* value = GetCell(dims, 2);
return ToCPUInt(devID, value);
}
......@@ -1161,8 +1161,8 @@ int XTensor::Get3DInt(int d0, int d1, int d2) const
CheckNTErrors(d2 >= 0 && d2 < dimSize[2], "dimension 2 is out of range!");
CheckNTErrors(dataType == X_INT, "The tensor is not in default type.");
int dims[3] = {d0, d1, d2};
void * value = GetCell(dims, 3);
int dims[3] = { d0, d1, d2 };
void* value = GetCell(dims, 3);
return ToCPUInt(devID, value);
}
......@@ -1177,8 +1177,8 @@ DTYPE XTensor::GetInSparse(int i) const
CheckNTErrors(i >= 0 && i < unitNum, "Index is out of range!");
CheckNTErrors(dataType == DEFAULT_DTYPE, "The tensor is not in default type.");
char * d = (char*)data + sizeof(int);
DTYPE * value = (DTYPE*)(d + (sizeof(int) + sizeof(DTYPE)) * i + sizeof(int));
char* d = (char*)data + sizeof(int);
DTYPE* value = (DTYPE*)(d + (sizeof(int) + sizeof(DTYPE)) * i + sizeof(int));
return ToCPU(devID, value);
}
......@@ -1193,8 +1193,8 @@ int XTensor::GetKeyInSparse(int i) const
CheckNTErrors(i >= 0 && i < unitNum, "Index is out of range!");
CheckNTErrors(dataType == DEFAULT_DTYPE, "The tensor is not in default type.");
char * d = (char*)data + sizeof(int);
int * key = (int*)(d + (sizeof(int) + sizeof(DTYPE)) * i);
char* d = (char*)data + sizeof(int);
int* key = (int*)(d + (sizeof(int) + sizeof(DTYPE)) * i);
return ToCPUInt(devID, key);
}
......@@ -1222,7 +1222,7 @@ bool XTensor::Set(DTYPE value, int offset)
CheckNTErrors(offset >= 0 && offset < unitNum, "Invalid index!");
CheckNTErrors(data != NULL, "Cannot use an uninitialized tensor!");
DTYPE * d = (DTYPE*)data + offset;
DTYPE* d = (DTYPE*)data + offset;
return SetToDevice(devID, d, value);
}
......@@ -1237,7 +1237,7 @@ bool XTensor::Set0D(DTYPE value)
CheckNTErrors(order == 0, "Cannot get a 0d cell for a tensor whose order is not 0!");
CheckNTErrors(dataType == DEFAULT_DTYPE, "The tensor is not in default type.");
int dims[1] = {0};
int dims[1] = { 0 };
return SetToDevice(devID, GetCell(dims, 0), value);
}
......@@ -1254,7 +1254,7 @@ bool XTensor::Set1D(DTYPE value, int i)
CheckNTErrors(i >= 0 && i < dimSize[0], "dimension 0 is out of range!");
CheckNTErrors(dataType == DEFAULT_DTYPE, "The tensor is not in default type.");
int dims[1] = {i};
int dims[1] = { i };
return SetToDevice(devID, GetCell(dims, 1), value);
}
......@@ -1273,7 +1273,7 @@ bool XTensor::Set2D(DTYPE value, int ni, int mi)
CheckNTErrors(mi >= 0 && mi < dimSize[1], "dimension 1 is out of range!");
CheckNTErrors(dataType == DEFAULT_DTYPE, "The tensor is not in default type.");
int dims[2] = {ni, mi};
int dims[2] = { ni, mi };
return SetToDevice(devID, GetCell(dims, 2), value);
}
......@@ -1294,7 +1294,7 @@ bool XTensor::Set3D(DTYPE value, int d0, int d1, int d2)
CheckNTErrors(d2 >= 0 && d2 < dimSize[2], "dimension 2 is out of range!");
CheckNTErrors(dataType == DEFAULT_DTYPE, "The tensor is not in default type.");
int dims[3] = {d0, d1, d2};
int dims[3] = { d0, d1, d2 };
return SetToDevice(devID, GetCell(dims, 3), value);
}
......@@ -1309,7 +1309,7 @@ bool XTensor::SetInt(int value, int offset)
CheckNTErrors(offset >= 0 && offset < unitNum, "Invalid index!");
CheckNTErrors(data != NULL, "Cannot use an uninitialized tensor!");
int * d = (int*)data + offset;
int* d = (int*)data + offset;
return SetToDeviceInt(devID, d, value);
}
......@@ -1339,7 +1339,7 @@ bool XTensor::Set0DInt(int value)
CheckNTErrors(order == 0, "Cannot get a 0d cell for a tensor whose order is not 0!");
CheckNTErrors(dataType == X_INT, "The tensor is not in integer type.");
int dims[1] = {0};
int dims[1] = { 0 };
return SetToDeviceInt(devID, GetCell(dims, 0), value);
}
......@@ -1356,7 +1356,7 @@ bool XTensor::Set1DInt(int value, int i)
CheckNTErrors(i >= 0 && i < dimSize[0], "dimension 0 is out of range!");
CheckNTErrors(dataType == X_INT, "The tensor is not in integer type.");
int dims[1] = {i};
int dims[1] = { i };
return SetToDeviceInt(devID, GetCell(dims, 1), value);
}
......@@ -1375,7 +1375,7 @@ bool XTensor::Set2DInt(int value, int ni, int mi)
CheckNTErrors(mi >= 0 && mi < dimSize[1], "dimension 1 is out of range!");
CheckNTErrors(dataType == X_INT, "The tensor is not in integer type.");
int dims[2] = {ni, mi};
int dims[2] = { ni, mi };
return SetToDeviceInt(devID, GetCell(dims, 2), value);
}
......@@ -1396,7 +1396,7 @@ bool XTensor::Set3DInt(int value, int d0, int d1, int d2)
CheckNTErrors(d2 >= 0 && d2 < dimSize[2], "dimension 2 is out of range!");
CheckNTErrors((dataType == X_INT), "The tensor is not in integer type.");
int dims[3] = {d0, d1, d2};
int dims[3] = { d0, d1, d2 };
return SetToDeviceInt(devID, GetCell(dims, 3), value);
}
......@@ -1408,15 +1408,15 @@ increase the value of a cell in a 2d tensor
>> mi - column index
<< return - succeeded or not
*/
bool XTensor::Add2D(DTYPE value, int ni, int mi)
bool XTensor::Add2D(DTYPE value, int ni, int mi)
{
CheckNTErrors(ni >= 0 && ni < dimSize[0], "the row index is out of range!");
CheckNTErrors(mi >= 0 && mi < dimSize[1], "the column index is out of range!");
CheckNTErrors(dataType == DEFAULT_DTYPE, "The tensor is not in default type.");
CheckNTErrors(isSparse == false, "TODO!");
if(devID < 0){
DTYPE * p = (DTYPE*)data + ni * dimSize[1] + mi;
if (devID < 0) {
DTYPE* p = (DTYPE*)data + ni * dimSize[1] + mi;
CheckNTErrors((p != NULL), "No data array is found!");
......@@ -1424,8 +1424,8 @@ increase the value of a cell in a 2d tensor
return true;
}
else{
int dims[2] = {ni, mi};
else {
int dims[2] = { ni, mi };
return SetToDevice(devID, GetCell(dims, 2), Get2D(ni, mi) + value);
}
}
......@@ -1433,24 +1433,24 @@ increase the value of a cell in a 2d tensor
/* get the number of non-zero elements (in a sparse tensor) */
int XTensor::GetNonzeroSize() const
{
if(!isSparse){
if (!isSparse) {
XPRINT(1, stderr, "WARNING! Counting non-zero elements in a dense tensor might be slow!\n");
CheckNTErrors(devID < 0, "TODO");
if(dataType == DEFAULT_DTYPE){
if (dataType == DEFAULT_DTYPE) {
int count = 0;
for(int i = 0; i < unitNum; i++){
for (int i = 0; i < unitNum; i++) {
DTYPE value = *(DTYPE*)((char*)data + i * sizeof(DTYPE));
if(value == 0)
if (value == 0)
count++;
}
return count;
}
else{
else {
ShowNTErrors("TODO!");
return -1;
}
}
else{
else {
/* return the head of the tuple list */
return unitNumNonZero;
}
......@@ -1481,7 +1481,7 @@ set the tensor as "variable"
void XTensor::SetVarFlag(bool myIsVar)
{
isVar = myIsVar;
if(isVar)
if (isVar)
SetGradFlag(true);
}
......@@ -1493,11 +1493,11 @@ resize a tensor with a specified tensor size
>> myDenseRatio - how often an element has non-zero value
<< return - succeeded or not
*/
bool XTensor::Resize(const int myOrder, const int * myDimSize,
bool XTensor::Resize(const int myOrder, const int* myDimSize,
const TENSOR_DATA_TYPE myDataType, const float myDenseRatio)
{
/* free old mem */
if(data != NULL){
if (data != NULL) {
if (mem == NULL)
XMemFree(devID, data);
else
......@@ -1513,11 +1513,11 @@ bool XTensor::Resize(const int myOrder, const int * myDimSize,
bool filledData = true;
bool zeroData = false;
for(int i = 0; i < order; i++){
for (int i = 0; i < order; i++) {
dimSize[i] = abs(myDimSize[i]);
if(myDimSize[i] < 0)
if (myDimSize[i] < 0)
filledData = false;
if(myDimSize[i] == 0)
if (myDimSize[i] == 0)
zeroData = true;
unitNum *= dimSize[i];
}
......@@ -1528,17 +1528,17 @@ bool XTensor::Resize(const int myOrder, const int * myDimSize,
dataType = myDataType;
unitSize = GetUnitSize(dataType);
if(myDataType != DEFAULT_DTYPE)
if (myDataType != DEFAULT_DTYPE)
isDefaultDType = false;
else
isDefaultDType = true;
if(zeroData){
if (zeroData) {
unitNum = 0;
return false;
}
if(isSparse){
if (isSparse) {
/*
for sparse matrices, we use a list of tuple (key, value),
ordered by key. Take a (2-dimensional) matrix as an example,
......@@ -1557,21 +1557,21 @@ bool XTensor::Resize(const int myOrder, const int * myDimSize,
*/
int num = int(unitNum * denseRatio + 1);
int tupleSize = sizeof(int)+sizeof(DTYPE);
int size = sizeof(int) + tupleSize*(num);
int tupleSize = sizeof(int) + sizeof(DTYPE);
int size = sizeof(int) + tupleSize * (num);
if(filledData){
int * d = NULL;
if (filledData) {
int* d = NULL;
if(mem == NULL){
if (mem == NULL) {
d = new int[size];
memset(d, 0, size);
}
else{
else {
d = (int*)mem->Alloc(mem->devID, size);
}
if(d == NULL)
if (d == NULL)
return false;
#if !defined(UNSAFE_BUT_FAST_MEM)
......@@ -1581,10 +1581,10 @@ bool XTensor::Resize(const int myOrder, const int * myDimSize,
}
return true;
}
else{
if(filledData){
else {
if (filledData) {
/* allocate the new one */
if(mem == NULL){
if (mem == NULL) {
data = XMemAlloc(devID, unitNum * unitSize);
#if defined(UNSAFE_BUT_FAST_MEM)
XMemSet(devID, data, 0, unitNum * unitSize);
......@@ -1593,12 +1593,12 @@ bool XTensor::Resize(const int myOrder, const int * myDimSize,
else
data = (void*)mem->Alloc(mem->devID, unitNum * unitSize);
if(data == NULL)
if (data == NULL)
return false;
}
#if !defined(UNSAFE_BUT_FAST_MEM)
if(data != NULL)
if (data != NULL)
XMem::SetZero(data, unitNum * unitSize, mem);
#endif
return true;
......@@ -1609,12 +1609,12 @@ bool XTensor::Resize(const int myOrder, const int * myDimSize,
resize a tensor by another
>> myTensor - tensor for reference
*/
bool XTensor::Resize(const XTensor * myTensor)
bool XTensor::Resize(const XTensor* myTensor)
{
denseRatio = myTensor->denseRatio;
TENSOR_DATA_TYPE myDataType = myTensor->dataType;
if(myDataType != DEFAULT_DTYPE)
if (myDataType != DEFAULT_DTYPE)
isDefaultDType = false;
else
isDefaultDType = true;
......@@ -1630,14 +1630,14 @@ binary search to find an element in a sparse tensor
it is the previous one if there is no hit
<< return - found it or not?
*/
bool XTensor::BinarySearch(int key, DTYPE &value, void * &position) const
bool XTensor::BinarySearch(int key, DTYPE& value, void*& position) const
{
CheckNTErrors((isSparse), "A sparse tensor is required!");
CheckNTErrors((dataType == DEFAULT_DTYPE), "The tensor is not in the default type.");
int * d = (int*)data;
int* d = (int*)data;
if(key < 0 || *d == 0){
if (key < 0 || *d == 0) {
value = 0;
position = NULL;
return false;
......@@ -1647,37 +1647,37 @@ bool XTensor::BinarySearch(int key, DTYPE &value, void * &position) const
int high = *d - 1;
int last = -1;
bool ok = false;
int * k = NULL;
int* k = NULL;
int headSize = sizeof(int);
int tupleSize = sizeof(int)+sizeof(DTYPE);
char * p = (char*)data + headSize;
int tupleSize = sizeof(int) + sizeof(DTYPE);
char* p = (char*)data + headSize;
while (low <= high){
int mid = low + (high-low)/2;
while (low <= high) {
int mid = low + (high - low) / 2;
k = (int*)(p + tupleSize * mid);
if (*k == key){
if (*k == key) {
ok = true;
high = mid -1;
high = mid - 1;
break;
}
else if(*k > key){
high = mid -1;
else if (*k > key) {
high = mid - 1;
}
else{
low = mid +1;
else {
low = mid + 1;
last = mid;
}
}
if(ok){
DTYPE * p = (DTYPE*)((char*)k + sizeof(int));
if (ok) {
DTYPE* p = (DTYPE*)((char*)k + sizeof(int));
value = *p;
position = k;
return true;
}
else{
else {
value = 0;
if(last == -1)
if (last == -1)
position = NULL;
else
position = (char*)data + headSize + tupleSize * last;
......@@ -1693,12 +1693,12 @@ dump data to a file
>> beg - the first item id
>> verbose - verbose level
*/
void XTensor::Dump(FILE * file, const char * label, const int n, const int beg, const int verbose)
void XTensor::Dump(FILE* file, const char* label, const int n, const int beg, const int verbose)
{
if (verbose > verboseLevel)
return;
void * d = data;
void* d = data;
bool isNewData = false;
#ifdef USE_CUDA
......@@ -1716,7 +1716,7 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg,
num *= dimSize[i];
num = int(num * denseRatio + 1);
int tupleSize = sizeof(int) + sizeof(DTYPE);
int size = sizeof(int) + tupleSize*(num);
int size = sizeof(int) + tupleSize * (num);
d = new char[size];
memset(d, 0, size);
......@@ -1731,7 +1731,7 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg,
if (label != NULL)
fprintf(file, "%s ", label);
if(isInit){
if (isInit) {
fprintf(file, "order=%d dimsize=", order);
for (int i = 0; i < order; i++) {
fprintf(file, "%d", dimSize[i]);
......@@ -1739,21 +1739,21 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg,
fprintf(file, ",");
}
}
else{
else {
fprintf(file, "order=-1 dimsize=-1");
}
fprintf(file, " dtype=%s dense=%f\n", GetDataTypeName(dataType), denseRatio);
if(!isInit){
if (!isInit) {
fprintf(file, "NULL");
}
if (!isSparse) {
if (dataType == DEFAULT_DTYPE) {
int end = MIN(n > 0 ? beg + n : beg + unitNum, unitNum);
for(int i = beg; i < end; i++){
for (int i = beg; i < end; i++) {
DTYPE f = ((DTYPE*)d)[i];
if(i == beg)
if (i == beg)
fprintf(file, "%e", f);
else
fprintf(file, " %e", f);
......@@ -1762,9 +1762,9 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg,
}
else if (dataType == X_INT) {
int end = MIN(n > 0 ? beg + n : beg + unitNum, unitNum);
for(int i = beg; i < end; i++){
for (int i = beg; i < end; i++) {
int f = ((int*)d)[i];
if(i == beg)
if (i == beg)
fprintf(file, "%d", f);
else
fprintf(file, " %d", f);
......@@ -1804,7 +1804,7 @@ dump data to a file
>> beg - the first item id
>> verbose - verbose level
*/
void XTensor::Dump(const XTensor * tensor, FILE * file, const char * label, const int n, const int beg, const int verbose)
void XTensor::Dump(const XTensor* tensor, FILE* file, const char* label, const int n, const int beg, const int verbose)
{
XTensor a(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, tensor->devID, tensor->mem);
_CopyValues(tensor, &a);
......@@ -1836,7 +1836,7 @@ read data from a file
>> file - where to load the data
>> label - label of the tensor
*/
void XTensor::Read(FILE * file, const char * label)
void XTensor::Read(FILE* file, const char* label)
{
char typeName[32] = "";
char dimSizeName[128] = "";
......@@ -1869,7 +1869,7 @@ void XTensor::Read(FILE * file, const char * label)
int o = 0;
bool sameSize = true;
char * p = dimSizeName;
char* p = dimSizeName;
while (*p != 0) {
while (*p == ' ' || *p == '\t')
p++;
......@@ -1893,14 +1893,14 @@ void XTensor::Read(FILE * file, const char * label)
if (!sameSize || dRatio > denseRatio || GetDataType(typeName) != dataType)
Resize(dimNum, dims, GetDataType(typeName), dRatio);
void * dataBuf = XMemAlloc(-1, GetDataSizeInChar());
void * dataBackup = data;
void* dataBuf = XMemAlloc(-1, GetDataSizeInChar());
void* dataBackup = data;
data = dataBuf;
if (!isSparse) {
if (dataType == DEFAULT_DTYPE) {
for (int i = 0; i < unitNum; i++) {
DTYPE * f = ((DTYPE*)data) + i;
DTYPE* f = ((DTYPE*)data) + i;
if (fscanf(file, "%e", f) < 1) {
ShowNTErrors("Incorrect tensor format!");
}
......@@ -1950,16 +1950,16 @@ read data from a binary file
*/
void XTensor::BinaryRead(FILE* file, size_t offset)
{
fseek(file, offset, 0);
//fseek(file, offset, 0);
switch (dataType) {
case X_INT: {
int * d = new int[unitNum];
int* d = new int[unitNum];
fread(d, sizeof(int), unitNum, file);
SetData(d, unitNum);
delete[] d;
}
default: {
float * d = new float[unitNum];
float* d = new float[unitNum];
fread(d, sizeof(float), unitNum, file);
SetData(d, unitNum);
delete[] d;
......@@ -1971,7 +1971,7 @@ void XTensor::BinaryRead(FILE* file, size_t offset)
flush the data to the target device
>> targetMem - memory pool on the target device
*/
void XTensor::FlushToMem(XMem * targetMem)
void XTensor::FlushToMem(XMem* targetMem)
{
if (targetMem == NULL)
return;
......@@ -1984,7 +1984,7 @@ void XTensor::FlushToMem(XMem * targetMem)
CudaCPUToGPUFlush(&l, targetMem->devID, targetMem);
}
else if (mem != targetMem) {
void * tmpData = targetMem->Alloc(targetMem->devID, GetDataSizeInChar());
void* tmpData = targetMem->Alloc(targetMem->devID, GetDataSizeInChar());
XMemCopy(tmpData, targetMem->devID, data, devID, GetDataSizeInChar());
data = tmpData;
mem = targetMem;
......@@ -2013,24 +2013,24 @@ allocate the memory space of the tensor (in the global memory)
>> myMem - the memory pool we are using
>> useBuf - indicates whether we use the buffer in the memory pool
*/
void XTensor::AllocateData(XTensor * tensor, XMem * myMem, bool useBuf)
void XTensor::AllocateData(XTensor* tensor, XMem* myMem, bool useBuf)
{
if(tensor == NULL)
if (tensor == NULL)
return;
if(myMem == NULL){
if(tensor->data != NULL)
if (myMem == NULL) {
if (tensor->data != NULL)
FreeData(tensor, NULL, false);
tensor->data = XMemAlloc(tensor->devID, tensor->GetDataSizeInChar());
tensor->isInGlobalMem = true;
}
else{
else {
CheckNTErrors((tensor->data == NULL), "Cannot renew the space for the tensor");
if(useBuf){
if (useBuf) {
tensor->data = myMem->AllocBuf(tensor->devID, tensor->GetDataSizeInChar());
tensor->isInGlobalMem = false;
}
else{
else {
tensor->data = myMem->AllocGlobal(tensor->devID, tensor->GetDataSizeInChar());
tensor->isInGlobalMem = true;
}
......@@ -2045,16 +2045,16 @@ free the memory space of the tensor (in the global memory)
>> myMem - the memory pool we are using
>> useBuf - indicates whether we use the buffer in the memory pool
*/
void XTensor::FreeData(XTensor * tensor, XMem * myMem, bool useBuf)
void XTensor::FreeData(XTensor* tensor, XMem* myMem, bool useBuf)
{
if(tensor == NULL)
if (tensor == NULL)
return;
if(myMem == NULL){
if (myMem == NULL) {
XMemFree(tensor->devID, tensor->data);
}
else{
if(tensor->isInGlobalMem)
else {
if (tensor->isInGlobalMem)
myMem->ReleaseGlobal(tensor->devID, tensor->data);
else
myMem->ReleaseBuf(tensor->devID, tensor->GetDataSizeInChar());
......@@ -2065,27 +2065,27 @@ void XTensor::FreeData(XTensor * tensor, XMem * myMem, bool useBuf)
}
/* overloading of the plus-sign */
XTensor operator+ (const DTYPE shift, const XTensor &tensor)
XTensor operator+ (const DTYPE shift, const XTensor& tensor)
{
return ScaleAndShift(tensor, 1, shift);
}
/* overloading of the minus-sign */
XTensor operator- (const DTYPE shift, const XTensor &tensor)
XTensor operator- (const DTYPE shift, const XTensor& tensor)
{
return ScaleAndShift(tensor, 1, -shift);
}
/* overloading of the multiply-sign */
XTensor operator* (const DTYPE scale, const XTensor &tensor)
XTensor operator* (const DTYPE scale, const XTensor& tensor)
{
return ScaleAndShift(tensor, scale, 0);
}
/* overloading of the division-sign */
XTensor operator/ (const DTYPE scale, const XTensor &tensor)
XTensor operator/ (const DTYPE scale, const XTensor& tensor)
{
return ScaleAndShift(tensor, (DTYPE)1/scale, 0);
return ScaleAndShift(tensor, (DTYPE)1 / scale, 0);
}
} /* end of the nts (NiuTrans.Tensor) namespace */
......@@ -86,7 +86,7 @@ void _funcCPUName(const XTensor * input, XTensor * output, int dim)
vecBuf[j] = VectorBuffer::loadu((DTYPE*)(ip)+j * vecBufLength); \
} \
for (int j = 1; j < strideNum / 32; j++) { \
const DTYPE* ptr = (DTYPE*)(ip + j * vecBufLength); \
const DTYPE* ptr = (DTYPE*)(ip + j * 4 * vecBufLength); \
vecBuf[0] = vecBuf[0]._vectorOp(VectorBuffer::loadu(ptr + 0 * vecBufLength)); \
vecBuf[1] = vecBuf[1]._vectorOp(VectorBuffer::loadu(ptr + 1 * vecBufLength)); \
vecBuf[2] = vecBuf[2]._vectorOp(VectorBuffer::loadu(ptr + 2 * vecBufLength)); \
......@@ -106,7 +106,7 @@ void _funcCPUName(const XTensor * input, XTensor * output, int dim)
else { \
/* data is separated */ \
for(int i = 0; i < blockNum; i++){ \
for(int j = 0; j < input->dimSize[input->order - 1] / 32; j++){ \
for(int j = 0; j < stride / 32; j++){ \
DTYPE * ip = (DTYPE*)input->data + blockSize * i; \
DTYPE * op = (DTYPE*)output->data + stride * i; \
VectorBuffer vecBuf[4]; \
......
......@@ -42,7 +42,7 @@ void _ReduceMean(const XTensor * input, XTensor * output, int dim)
int num = input->dimSize[dim];
_ReduceSum(input, output, dim);
_ScaleAndShiftMe(output, (DTYPE)1/num, 0);
_ScaleAndShiftMe(output, 1.0F/(DTYPE)(num), 0);
}
/*
......
......@@ -105,7 +105,7 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
vecBuf[j] = VectorBuffer::loadu((DTYPE*)(ip) + j * vecBufLength, isExp, power, bias);
}
for(int j = 1; j < strideNum / 32; j++){
const DTYPE* ptr = (DTYPE*)(ip + j * vecBufLength);
const DTYPE* ptr = (DTYPE*)(ip + (j * 4) * vecBufLength);
vecBuf[0] = vecBuf[0] + VectorBuffer::loadu(ptr + 0 * vecBufLength, isExp, power, bias);
vecBuf[1] = vecBuf[1] + VectorBuffer::loadu(ptr + 1 * vecBufLength, isExp, power, bias);
vecBuf[2] = vecBuf[2] + VectorBuffer::loadu(ptr + 2 * vecBufLength, isExp, power, bias);
......@@ -122,7 +122,7 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
} else{
//data is separated
for(int i = 0; i < blockNum; i++){
for(int j = 0; j < input->dimSize[input->order - 1] / 32; j++){
for(int j = 0; j < stride / 32; j++){
DTYPE * ip = (DTYPE*)input->data + blockSize * i;
DTYPE * op = (DTYPE*)output->data + stride * i;
DTYPE * sp = shift != NULL ? (DTYPE*)shift->data + stride * i : NULL;
......@@ -133,8 +133,7 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
}
VectorBuffer vecBuf[4];
for(int k = 0; k < 4; k++){
vecBuf[k] = VectorBuffer::loadu((DTYPE*)(ip) + (j * 4 + k) * 32 / sizeof(DTYPE), isExp, power, bias + j * 32 / sizeof(DTYPE));
vecBuf[k] = VectorBuffer::loadu((DTYPE*)(ip) + (j * 4 + k) * 32 / sizeof(DTYPE), isExp, power, bias + k * 32 / sizeof(DTYPE));
}
for(int k = 1; k < strideNum; k++){
DTYPE * ptr = ip + k * stride + (j * 4) * vecBufLength;
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论