Commit 99097e41 by huchi

add support for greedy search

parent bfa6fc90
......@@ -19,6 +19,10 @@
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-10
*/
//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
//#include <crtdbg.h>
#include <stdio.h>
#include "./network/XNet.h"
#include "./tensor/XUtility.h"
......@@ -27,9 +31,7 @@
#include "./sample/fnnlm/FNNLM.h"
#include "./sample/transformer/Transformer.h"
//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
//#include <crtdbg.h>
using namespace nts;
using namespace fnnlm;
......@@ -37,19 +39,10 @@ using namespace transformer;
int main( int argc, const char ** argv )
{
//_CrtSetDbgFlag(_CrtSetDbgFlag(_CRTDBG_REPORT_FLAG) | _CRTDBG_LEAK_CHECK_DF);
//_CrtSetBreakAlloc(2708);
/*_CrtSetDbgFlag(_CrtSetDbgFlag(_CRTDBG_REPORT_FLAG) | _CRTDBG_LEAK_CHECK_DF);
_CrtSetBreakAlloc(2708);*/
TransformerMain(argc - 1, argv + 1);
/*XTensor x;
InitTensor2D(&x, 2, 2);
float d[]{ 1,2,3,4 };
x.SetData(d, 4);
XTensor y;
y = ReduceSum(x, 0);
y.Dump(stderr);*/
//_CrtDumpMemoryLeaks();
return 0;
......
......@@ -34,7 +34,7 @@ T2TAttention::T2TAttention()
nhead = -1;
dk = -1;
dv = -1;
d = -1;
d = -1;
isMasked = false;
ignored = 0;
}
......@@ -62,7 +62,7 @@ void T2TAttention::InitModel(int argc, char** argv,
float minmax = 0;
LoadParamInt(argc, argv, "nhead", &nhead, 8);
LoadParamInt(argc, argv, "nhead", &nhead, 4);
LoadParamInt(argc, argv, "d", &dk, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &dv, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
......@@ -70,15 +70,15 @@ void T2TAttention::InitModel(int argc, char** argv,
LoadParamFloat(argc, argv, "attminmax", &minmax, 0.1F);
LoadParamFloat(argc, argv, "dropoutatt", &dropoutP, 0);
InitTensor2D(&wq, d, d, X_FLOAT, devID);
InitTensor1D(&bq, d, X_FLOAT, devID);
InitTensor2D(&wk, d, d, X_FLOAT, devID);
InitTensor1D(&bk, d, X_FLOAT, devID);
InitTensor2D(&wv, d, d, X_FLOAT, devID);
InitTensor1D(&bv, d, X_FLOAT, devID);
InitTensor2D(&rp_embedding_k, max_relative_position * 2 + 1, d/nhead, X_FLOAT, devID);
InitTensor2D(&wa, d, d, X_FLOAT, devID);
InitTensor1D(&ba, d, X_FLOAT, devID);
InitTensor2DV2(&wq, d, d, X_FLOAT, devID);
InitTensor1DV2(&bq, d, X_FLOAT, devID);
InitTensor2DV2(&wk, d, d, X_FLOAT, devID);
InitTensor1DV2(&bk, d, X_FLOAT, devID);
InitTensor2DV2(&wv, d, d, X_FLOAT, devID);
InitTensor1DV2(&bv, d, X_FLOAT, devID);
InitTensor2DV2(&rp_embedding_k, max_relative_position * 2 + 1, d/nhead, X_FLOAT, devID);
InitTensor2DV2(&wo, d, d, X_FLOAT, devID);
InitTensor1DV2(&bo, d, X_FLOAT, devID);
}
/*
......@@ -94,24 +94,27 @@ make the network
>> cacheType - which type that cache is
<< return - multi-attention result
*/
XTensor T2TAttention::Make( XTensor& k, XTensor& q, XTensor& v, XTensor* mask, bool isTraining, Cache* cache, int cacheType)
XTensor T2TAttention::Make(XTensor& k, XTensor& q, XTensor& v, XTensor* mask, bool isTraining, Cache* cache, int cacheType)
{
const bool isEnc = (!cache) ? true : false;
/* linear transformation before self-attention */
XTensor q2, k2, v2;
q2 = MatrixMul(q, X_NOTRANS, wq, X_TRANS) + bq;
q2 = MatrixMul(q, wq) + bq;
if (!cache) {
/* self attention for encoder layers */
k2 = MatrixMul(k, X_NOTRANS, wk, X_TRANS) + bk;
v2 = MatrixMul(v, X_NOTRANS, wv, X_TRANS) + bv;
return MakeRPRAttention(k2, q2, v2, mask, isTraining, isEnc);
k2 = MatrixMul(k, wk) + bk;
v2 = MatrixMul(v, wv) + bv;
return MakeRPRAttention(k2, q2, v2, mask, isTraining, isEnc);
}
else {
if (cacheType == SELF_ATT) {
k2 = MatrixMul(k, X_NOTRANS, wk, X_TRANS) + bk;
v2 = MatrixMul(v, X_NOTRANS, wv, X_TRANS) + bv;
k2 = MatrixMul(k, wk) + bk;
v2 = MatrixMul(v, wv) + bv;
/* if hit, we only concat the cache with the new token */
if (!cache->miss) {
......@@ -121,12 +124,13 @@ XTensor T2TAttention::Make( XTensor& k, XTensor& q, XTensor& v, XTensor* mask,
cache->key = k2;
cache->value = v2;
cache->miss = false;
return MakeRPRAttention(cache->key, q2, cache->value, mask, isTraining, isEnc);
}
else if (cacheType == EN_DE_ATT) {
if (cache->miss) {
cache->key = MatrixMul(k, X_NOTRANS, wk, X_TRANS) + bk;
cache->value = MatrixMul(v, X_NOTRANS, wv, X_TRANS) + bv;
cache->key = MatrixMul(k, wk) + bk;
cache->value = MatrixMul(v, wv) + bv;
cache->miss = false;
}
return MakeAttention(cache->key, q2, cache->value, mask, isTraining, isEnc);
......@@ -134,50 +138,49 @@ XTensor T2TAttention::Make( XTensor& k, XTensor& q, XTensor& v, XTensor* mask,
CheckNTErrors(0, "invalid cache type");
}
}
/*
make the attention network given keys, queries and values (after linear transformation)
>> k - keys. It might be of size B * L * H
where B = batch size, L = sequence length,
and H = vector size of each position
where B = batch size, L = sequence length,
and H = vector size of each position
>> q - queries
>> v - values
>> mask - as it is
>> isTraining - indicates whether the model is used for training
*/
XTensor T2TAttention::MakeAttention(XTensor &k, XTensor& q, XTensor& v, XTensor* mask, bool isTraining, bool is_encoder)
XTensor T2TAttention::MakeAttention(XTensor& k, XTensor& q, XTensor& v, XTensor* mask, bool isTraining, bool is_encoder)
{
XTensor kheads;
XTensor qheads;
XTensor vheads;
/* multi head */
kheads = Split(k, k.order - 1, nhead);
qheads = Split(q, q.order - 1, nhead);
vheads = Split(v, v.order - 1, nhead);
XTensor att;
XTensor att;
XTensor dot;
XTensor scalar;
/* scalar = softmax(Q * K^T / sqrt(dk)) * V */
dot = BMMul(qheads, X_NOTRANS, kheads, X_TRANS);
/*if (isMasked && mask) {
_SumMe(&dot, mask);
}*/
/*if (isMasked && mask)
_SumMe(&dot, mask);*/
dot = Linear(dot, 1.0F / (float)sqrt((float)dk / nhead));
scalar = Softmax(dot, -1);
/*if(isTraining && dropoutP > 0)
scalar = Dropout(scalar, dropoutP);*/
if(isTraining && dropoutP > 0)
scalar = Dropout(scalar, dropoutP);
att = BMMul(scalar, vheads);
/* concatenate the heads */
return MulAndShift(Merge(att, att.order - 1), X_NOTRANS, wa, X_TRANS, ba);
return MulAndShift(Merge(att, att.order - 1), wo, bo);
}
/*
......@@ -215,34 +218,32 @@ XTensor T2TAttention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v, XTens
InitTensor4DV2(&dot, nhead, batch_size, len_q, len_kv, X_FLOAT, q.devID);
/* generate the relative emb index (L_q, L_kv) */
GetRPEmbedding(&emb_matrix, len_q, len_kv, max_relative_position, q.devID,is_encoder);
GetRPEmbedding(&emb_matrix, len_q, len_kv, max_relative_position, q.devID, is_encoder);
/* generate the relative key from the rp_embedding_k (L_q, L_kv, H/K) */
_Gather(&rp_embedding_k, &relative_key, &emb_matrix);
/* RPR dot product (K, B, L_q, L_kv)*/
qheads = qheads / float(nhead);
RPDotProduct(&qheads, &kheads, &relative_key, &dot, true);
/*if (isMasked && mask)
_SumMe(&dot, mask);*/
/* scale the dot result */
//dot = Linear(dot, 1.0F / (float)sqrt((float)dk / nhead));
dot = Linear(dot, 1.0F / (float)sqrt((float)dk / nhead));
/* softmax */
scalar = Softmax(dot, -1);
/*if (isTraining && dropoutP > 0)
scalar = Dropout(scalar, dropoutP);*/
if (isTraining && dropoutP > 0)
scalar = Dropout(scalar, dropoutP);
/* generate the relative attention output (K, B, L_q, H/K) */
att = BMMul(scalar, vheads);
/* concatenate the heads */
return MulAndShift(Merge(att, att.order - 1), X_NOTRANS, wa, X_TRANS, ba);
return MulAndShift(Merge(att, att.order - 1), wo, bo);
}
void T2TAttention::GetRPEmbedding(XTensor* emb_matrix, const int len_q, const int len_kv, const int max_relative_length, const int devID, const bool is_encoder)
......@@ -251,10 +252,11 @@ void T2TAttention::GetRPEmbedding(XTensor* emb_matrix, const int len_q, const in
XTensor range;
InitTensor1DV2(&range, len_kv, X_INT, devID);
int* index = new int[len_kv];
// for encoder self-attention which the L_q = L_kv
if (is_encoder)
{
for (int i = 0; i <len_kv; i++)
for (int i = 0; i < len_kv; i++)
index[i] = i;
range.SetData(index, len_kv);
XTensor range_2D, range_2D_t;
......@@ -267,7 +269,7 @@ void T2TAttention::GetRPEmbedding(XTensor* emb_matrix, const int len_q, const in
// for decoder self-attention which the L_q != L_kv, and L_q is 1
else
{
for (int i = 0; i <len_kv; i++)
for (int i = 0; i < len_kv; i++)
index[i] = -len_kv + i + 1;
range.SetData(index, len_kv);
_Unsqueeze(&range, emb_matrix, 0, len_q);
......@@ -299,7 +301,6 @@ void T2TAttention::RPDotProduct(XTensor* x, XTensor* y, XTensor* z, XTensor* att
XTensor context;
InitTensor4DV2(&context, head_num, batch_size, len_q, last_dim, X_FLOAT, x->devID);
_MatrixMulBatched(x, X_NOTRANS, y, transpose_flag, &context);
//if (profiler_) profiler_->FinishTimer("RPDotPro-BMM");
// reshape and transpose x to (L_q, K*B, H/K or L_kv)
int merge_dims[] = { head_num * batch_size, len_q, x->dimSize[3] };
......@@ -323,5 +324,6 @@ void T2TAttention::RPDotProduct(XTensor* x, XTensor* y, XTensor* z, XTensor* att
relative_t.Reshape(4, split_dims);
_Sum(&context, &relative_t, attention);
}
}
......@@ -90,14 +90,18 @@ public:
/* bias for V */
XTensor bv;
XTensor wBig;
XTensor bBig;
/* RPR emb */
XTensor rp_embedding_k;
/* transformation after dot-product attention */
XTensor wa;
XTensor wo;
/* bias after dot-product attention */
XTensor ba;
XTensor bo;
/* size of transformed Q and K */
int dk;
......
......@@ -31,27 +31,27 @@ namespace transformer
/* constructor */
AttDecoder::AttDecoder()
{
attentions = NULL;
selfAtt = NULL;
fnns = NULL;
attLayerNorms = NULL;
attentionsEnde = NULL;
attEndeLayerNorms = NULL;
decodeLayerNorm = NULL;
selfCache = NULL;
contextCache = NULL;
selfAttLayerNorms = NULL;
enDeAtt = NULL;
enDeAttLayerNorms = NULL;
decoderLayerNorm = NULL;
selfAttCache = NULL;
enDeAttCache = NULL;
}
/* de-constructor */
AttDecoder::~AttDecoder()
{
delete[] selfCache;
delete[] contextCache;
delete[] attentions;
delete[] selfAttCache;
delete[] enDeAttCache;
delete[] selfAtt;
delete[] fnns;
delete[] attLayerNorms;
delete[] attentionsEnde;
delete[] attEndeLayerNorms;
delete decodeLayerNorm;
delete[] selfAttLayerNorms;
delete[] enDeAtt;
delete[] enDeAttLayerNorms;
delete decoderLayerNorm;
}
/*
......@@ -71,7 +71,7 @@ void AttDecoder::InitModel(int argc, char ** argv,
devID = myDevID;
ignored = myIgnored;
LoadParamInt(argc, argv, "nlayer", &nlayer, 3);
LoadParamInt(argc, argv, "nlayer", &nlayer, 4);
LoadParamInt(argc, argv, "hsize", &hSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "esize", &eSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "vsizetgt", &vSize, 34040);
......@@ -83,24 +83,24 @@ void AttDecoder::InitModel(int argc, char ** argv,
/* embedding model */
embedder.InitModel(argc, argv, devID, false);
attentions = new T2TAttention[nlayer];
selfAtt = new T2TAttention[nlayer];
fnns = new T2TFNN[nlayer];
attLayerNorms = new T2TLN[nlayer];
attentionsEnde = new T2TAttention[nlayer];
attEndeLayerNorms = new T2TLN[nlayer];
decodeLayerNorm = new T2TLN;
selfCache = new Cache[nlayer];
contextCache = new Cache[nlayer];
selfAttLayerNorms = new T2TLN[nlayer];
enDeAtt = new T2TAttention[nlayer];
enDeAttLayerNorms = new T2TLN[nlayer];
decoderLayerNorm = new T2TLN;
selfAttCache = new Cache[nlayer];
enDeAttCache = new Cache[nlayer];
/* initialize the stacked layers */
for (int i = 0; i < nlayer; i++) {
attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID);
selfAtt[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID);
fnns[i].InitModel(argc, argv, myDevID);
attLayerNorms[i].InitModel(argc, argv, myDevID);
attentionsEnde[i].InitModel(argc, argv, true, myIgnored, myDevID);
attEndeLayerNorms[i].InitModel(argc, argv, myDevID);
selfAttLayerNorms[i].InitModel(argc, argv, myDevID);
enDeAtt[i].InitModel(argc, argv, true, myIgnored, myDevID);
enDeAttLayerNorms[i].InitModel(argc, argv, myDevID);
}
decodeLayerNorm->InitModel(argc, argv, myDevID);
decoderLayerNorm->InitModel(argc, argv, myDevID);
}
/*
......@@ -131,48 +131,38 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor *mask, X
XTensor attNorm;
/* layer normalization */
inputNorm = attLayerNorms[i].Make(x);
//inputNorm.Dump(stderr, "inputNorm", 10);
inputNorm = selfAttLayerNorms[i].Make(x);
/******************/
/* self attention */
att = attentions[i].Make(inputNorm, inputNorm, inputNorm, NULL, isTraining, &selfCache[i], SELF_ATT);
att = selfAtt[i].Make(inputNorm, inputNorm, inputNorm, NULL, isTraining, &selfAttCache[i], SELF_ATT);
/* dropout */
if(isTraining && dropoutP > 0)
att = Dropout(att, dropoutP);
/* residual connection */
_SumMe(&att, &x);
//att.Dump(stderr, "Sum(att, x)", 10);
att = att + x;
/* layer normalization */
attNorm = attEndeLayerNorms[i].Make(att);
//attNorm.Dump(stderr, "attNorm", 10);
attNorm = enDeAttLayerNorms[i].Make(att);
/* encoder-decoder attention */
ende = attentionsEnde[i].Make(outputEnc, attNorm, outputEnc, &maskEncDec, isTraining, &contextCache[i], EN_DE_ATT);
//ende.Dump(stderr, "ende atten", 10);
ende = enDeAtt[i].Make(outputEnc, attNorm, outputEnc, &maskEncDec, isTraining, &enDeAttCache[i], EN_DE_ATT);
/* dropout */
if(isTraining && dropoutP > 0)
ende = Dropout(ende, dropoutP);
/* residual connection */
_SumMe(&ende, &att);
//res.Dump(stderr, "Sum(ende, att)", 10);
ende = ende + att;
/* fnn */
x = fnns[i].Make(ende, isTraining);
//x.Dump(stderr, "fnns[i]", 10);
}
x = decodeLayerNorm->Make(x);
//x.Dump(stderr, "decodeLayerNorm", 10);
x.SetName(DECODING_NAME);
x = decoderLayerNorm->Make(x);
return x;
}
......
......@@ -63,13 +63,13 @@ public:
T2TFNN * fnns;
/* attention model of each layer */
T2TAttention * attentions;
T2TAttention * selfAtt;
/* layer normalization for attention */
T2TLN * attLayerNorms;
T2TLN * selfAttLayerNorms;
/* layer normalization for decoder */
T2TLN * decodeLayerNorm;
T2TLN * decoderLayerNorm;
/* input tensor of the encoder */
XTensor * input;
......@@ -78,16 +78,16 @@ public:
XTensor * output;
/* encoder-decoder attention model of each layer */
T2TAttention * attentionsEnde;
T2TAttention * enDeAtt;
/* layer normalization for encoder-decoder attention */
T2TLN * attEndeLayerNorms;
T2TLN * enDeAttLayerNorms;
/* layer cache list */
Cache* selfCache;
Cache* selfAttCache;
/* layer cache list */
Cache* contextCache;
Cache* enDeAttCache;
public:
/* constructor */
......
......@@ -62,7 +62,7 @@ void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, bool isEnc)
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "pad", &padIdx, 1);
InitTensor2D(&w, vSize, eSize, X_FLOAT, devID);
InitTensor2DV2(&w, vSize, eSize, X_FLOAT, devID);
maxLength = maxLength + 1 + 1;
DTYPE v = 1.0F/(float)sqrt((float)eSize);
......@@ -80,7 +80,7 @@ make positional embeddings (of size eSize * length)
*/
void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length, int padIdx)
{
InitTensor2D(&posEmbeddingBase, length, eSize, X_FLOAT, devID);
InitTensor2DV2(&posEmbeddingBase, length, eSize, X_FLOAT, devID);
float * data = new float[posEmbeddingBase.unitNum];
......@@ -113,47 +113,47 @@ make the network
*/
XTensor T2TEmbedder::Make(XTensor &input, int prevLen)
{
/* assert padding index is 1 */
///* assert padding index is 1 */
CheckNTErrors(input.order > 1, "Wrong input tensor size!");
CheckNTErrors(input.dimSize[input.order - 1] < maxLength, "The sequence is too long!");
CheckNTErrors(vSize > 0, "set vocabulary size by \"-vsize\"");
CheckNTErrors(eSize > 0, "set embedding size by \"-esize\"");
//CheckNTErrors(input.order > 1, "Wrong input tensor size!");
//CheckNTErrors(input.dimSize[input.order - 1] < maxLength, "The sequence is too long!");
//CheckNTErrors(vSize > 0, "set vocabulary size by \"-vsize\"");
//CheckNTErrors(eSize > 0, "set embedding size by \"-esize\"");
XTensor wordEmbedding, position, posEmbedding;
InitTensor(&position, &input);
int* posData = new int[input.unitNum];
XTensor inputCPU;
InitTensorOnCPU(&inputCPU, &input);
_CopyValues(&input, &inputCPU);
for (int i = 0; i < inputCPU.GetDim(0); i++) {
int startNoPad = 2 + prevLen - 1;
int* p = ((int*)inputCPU.data) + i * inputCPU.GetDim(1);
for (int j = 0; j < inputCPU.GetDim(1); j++) {
if (p[j] == 1) {
posData[i * inputCPU.GetDim(1) + j] = 1;
}
else {
posData[i * inputCPU.GetDim(1) + j] = startNoPad++;
}
}
}
//
//XTensor wordEmbedding, position, posEmbedding;
//InitTensor(&position, &input);
position.SetData(posData, position.unitNum);
delete[] posData;
//int* posData = new int[input.unitNum];
/* we make positional embeddings first */
if(true){
posEmbedding = Gather(posEmbeddingBase, position);
}
//XTensor inputCPU;
//InitTensorOnCPU(&inputCPU, &input);
//_CopyValues(&input, &inputCPU);
/* then we make word embeddings */
//for (int i = 0; i < inputCPU.GetDim(0); i++) {
// int startNoPad = 2 + prevLen - 1;
// int* p = ((int*)inputCPU.data) + i * inputCPU.GetDim(1);
// for (int j = 0; j < inputCPU.GetDim(1); j++) {
// if (p[j] == 1) {
// posData[i * inputCPU.GetDim(1) + j] = 1;
// }
// else {
// posData[i * inputCPU.GetDim(1) + j] = startNoPad++;
// }
// }
//}
//position.SetData(posData, position.unitNum);
//delete[] posData;
///* we make positional embeddings first */
//if(true){
// posEmbedding = Gather(posEmbeddingBase, position);
//}
/* then we make word embeddings */
XTensor wordEmbedding;
wordEmbedding = Gather(w, input);
wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));
......
......@@ -29,7 +29,7 @@ using namespace nts;
namespace transformer
{
#define DEFAULT_EMBEDDING_SIZE 512
#define DEFAULT_EMBEDDING_SIZE 128
/*
embedding (of word at position i):
......
......@@ -34,7 +34,7 @@ AttEncoder::AttEncoder()
attentions = NULL;
fnns = NULL;
attLayerNorms = NULL;
encodeLayerNorm = NULL;
encoderLayerNorm = NULL;
}
/* de-constructor */
......@@ -43,7 +43,7 @@ AttEncoder::~AttEncoder()
delete[] attentions;
delete[] fnns;
delete[] attLayerNorms;
delete encodeLayerNorm;
delete encoderLayerNorm;
}
/*
......@@ -61,7 +61,7 @@ void AttEncoder::InitModel(int argc, char ** argv,
devID = myDevID;
ignored = myIgnored;
LoadParamInt(argc, argv, "nlayer", &nlayer, 35);
LoadParamInt(argc, argv, "nlayer", &nlayer, 20);
LoadParamInt(argc, argv, "hsize", &hSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "esize", &eSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "vsize", &vSize, 34040);
......@@ -76,7 +76,7 @@ void AttEncoder::InitModel(int argc, char ** argv,
attentions = new T2TAttention[nlayer];
fnns = new T2TFNN[nlayer];
attLayerNorms = new T2TLN[nlayer];
encodeLayerNorm = new T2TLN;
encoderLayerNorm = new T2TLN;
/* initialize the stacked layers */
for(int i = 0; i < nlayer; i++){
......@@ -84,7 +84,7 @@ void AttEncoder::InitModel(int argc, char ** argv,
fnns[i].InitModel(argc, argv, myDevID);
attLayerNorms[i].InitModel(argc, argv, myDevID);
}
encodeLayerNorm->InitModel(argc, argv, myDevID);
encoderLayerNorm->InitModel(argc, argv, myDevID);
}
/*
......@@ -123,13 +123,9 @@ XTensor AttEncoder::Make(XTensor &input, XTensor *mask, XTensor &maskEncDec, boo
/* fnn */
x = fnns[i].Make(res, isTraining);
}
x = encodeLayerNorm->Make(x);
x.SetName(ENCODING_NAME);
input.SetName(ENCODING_INPUT_NAME);
x = encoderLayerNorm->Make(x);
return x;
}
......
......@@ -93,11 +93,11 @@ public:
/* attention model of each layer */
T2TAttention * attentions;
/* layer normalization for attention */
/* layer normalizations for attention */
T2TLN * attLayerNorms;
/* layer normalization for encoder */
T2TLN * encodeLayerNorm;
T2TLN * encoderLayerNorm;
/* input tensor of the encoder */
XTensor * input;
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -15,9 +15,9 @@
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
*/
#include <math.h>
#include "T2TFNN.h"
......@@ -32,9 +32,9 @@ namespace transformer
/* constructor */
T2TFNN::T2TFNN()
{
inSize = -1;
inSize = -1;
outSize = -1;
hSize = -1;
hSize = -1;
}
/* deconstructor */
......@@ -42,28 +42,28 @@ T2TFNN::~T2TFNN()
{
}
/*
initialize the model
/*
initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myDevID - device id
*/
void T2TFNN::InitModel(int argc, char ** argv, int myDevID)
void T2TFNN::InitModel(int argc, char** argv, int myDevID)
{
devID = myDevID;
float minmax = 0;
LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &outSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "fnnh", &hSize, outSize * 4);
LoadParamInt(argc, argv, "fnnh", &hSize, outSize * 8);
LoadParamFloat(argc, argv, "fnnminmax", &minmax, 0.1F);
LoadParamFloat(argc, argv, "dropoutfnn", &dropoutP, 0);
InitTensor2DV2(&w1, hSize, inSize, X_FLOAT, devID);
InitTensor2DV2(&w1, inSize, hSize, X_FLOAT, devID);
InitTensor1DV2(&b1, hSize, X_FLOAT, devID);
InitTensor2DV2(&w2, outSize, hSize, X_FLOAT, devID);
InitTensor2DV2(&w2, hSize, outSize, X_FLOAT, devID);
InitTensor1DV2(&b2, outSize, X_FLOAT, devID);
fnnLayerNorm.InitModel(argc, argv, myDevID);
......@@ -78,25 +78,25 @@ void T2TFNN::InitModel(int argc, char ** argv, int myDevID)
//b2.SetZeroAll();
}
/*
make the network
/*
make the network
y = max(0, x * w1 + b1) * w2 + b2
>> input - the input tensor
>> return - the output tensor
>> return - the output tensor
*/
XTensor T2TFNN::Make(XTensor &input, bool isTraining)
XTensor T2TFNN::Make(XTensor& input, bool isTraining)
{
XTensor t1;
/* t1 = max(0, x * w1 + b1) */
t1 = Rectify(MulAndShift(fnnLayerNorm.Make(input), X_NOTRANS, w1, X_TRANS, b1));
if(isTraining && dropoutP > 0)
t1 = Rectify(MulAndShift(fnnLayerNorm.Make(input), w1, b1));
if (isTraining && dropoutP > 0)
t1 = Dropout(t1, dropoutP);
/* result = t1 * w2 + b2 */
XTensor res;
res = MulAndShift(t1, X_NOTRANS, w2, X_TRANS, b2);
res = MulAndShift(t1, w2, b2);
_SumMe(&res, &input);
return res;
}
......
......@@ -53,8 +53,8 @@ void T2TLN::InitModel(int argc, char ** argv, int myDevID)
d = 0;
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
InitTensor1D(&w, d, X_FLOAT, devID);
InitTensor1D(&b, d, X_FLOAT, devID);
InitTensor1DV2(&w, d, X_FLOAT, devID);
InitTensor1DV2(&b, d, X_FLOAT, devID);
}
/*
......@@ -78,7 +78,7 @@ XTensor T2TLN::Make(XTensor &input)
mean = ReduceMean(x, x.order - 1);
/* \sigma = (sum_i (x_i - \mu)^2)/m */
variance = ReduceVariance(x, x.order - 1, mean);
variance = ReduceVariance(x, x.order - 1, mean) + 1e-5F;
/* standard = sqrt(variance) */
standard = Power(variance, 0.5F);
......@@ -92,7 +92,7 @@ XTensor T2TLN::Make(XTensor &input)
xn = (x - meanFilled) / standardFilled;
/* result = x' * w + b */
return xn * w + b;
return xn * w + b;
}
}
......@@ -103,7 +103,7 @@ public:
/* read the parameters */
void Read(const char * fn);
};
void FastRead(XTensor* x, FILE* f);
}
#endif
......@@ -56,13 +56,11 @@ void T2TOutput::InitModel(int argc, char ** argv, int myDevID)
LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &hSize, DEFAULT_EMBEDDING_SIZE);
LoadParamFloat(argc, argv, "outputminmax", &minmax, 0.08F);
InitTensor2D(&w, hSize, vSize, X_FLOAT, devID);
InitTensor2DV2(&w, vSize, hSize, X_FLOAT, devID);
}
/*
make the network (redefined output tensor)
>> input - input tensor
......@@ -72,9 +70,7 @@ void T2TOutput::Make(XTensor &input, XTensor &output)
{
XTensor &x = input;
output = LogSoftmax(MMul(x, X_NOTRANS, w, X_NOTRANS), -1);
output.SetName(OUTPUT_NAME);
output = LogSoftmax(MMul(x, X_NOTRANS, w, X_TRANS), -1);
}
}
......@@ -15,9 +15,9 @@
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-13
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-13
*/
#include "T2TPredictor.h"
#include "../../tensor/core/CHeader.h"
......@@ -38,24 +38,24 @@ T2TStateBundle::T2TStateBundle()
/* de-constructor */
T2TStateBundle::~T2TStateBundle()
{
if(states != NULL)
if (states != NULL)
delete[] states;
}
/*
create states
/*
create states
>> num - number of states
*/
void T2TStateBundle::MakeStates(int num)
{
CheckNTErrors(num > 0, "invalid number");
if(states != NULL)
if (states != NULL)
delete[] states;
states = new T2TState[num];
for(int i = 0; i < num; i++){
for (int i = 0; i < num; i++) {
states[i].prediction = -1;
states[i].pid = T2T_PID_EMPTY;
states[i].isEnd = false;
......@@ -74,7 +74,7 @@ void T2TStateBundle::MakeStates(int num)
/* constructor */
T2TPredictor::T2TPredictor()
{
startSymbol = -1;
startSymbol = 2;
}
/* de-constructor */
......@@ -82,37 +82,44 @@ T2TPredictor::~T2TPredictor()
{
}
/*
create an initial state
/*
create an initial state
>> model - the t2t model
>> top - the top-most layer of the network
>> input - input of the network
>> beamSize - beam size
>> state - the state to be initialized
*/
void T2TPredictor::Create(T2TModel * model, XTensor * top, const XTensor * input, int beamSize, T2TStateBundle * state)
void T2TPredictor::Create(T2TModel* model, XTensor* top, const XTensor* input, int beamSize, T2TStateBundle* state)
{
int dims[MAX_TENSOR_DIM_NUM];
for (int i = 0; i < input->order - 1; i++)
dims[i] = input->GetDim(i);
dims[input->order - 1] = beamSize;
InitTensor(&state->probPath, input->order, dims, X_FLOAT, input->devID);
InitTensor(&state->nstep, input->order, dims, X_FLOAT, input->devID);
InitTensor(&state->endMark, input->order, dims, X_INT, input->devID);
InitTensorV2(&state->probPath, input->order, dims, X_FLOAT, 1.0F, input->devID);
InitTensorV2(&state->nstep, input->order, dims, X_FLOAT, 1.0F, input->devID);
InitTensorV2(&state->endMark, input->order, dims, X_INT, 1.0F, input->devID);
float* data = new float[state->probPath.unitNum];
/*float* data = new float[state->probPath.unitNum];
for (int i = 0; i < state->probPath.unitNum; ++i) {
data[i] = -1e20F;
if (i % beamSize == 0)
data[i] = 0;
}
state->probPath.SetData(data, state->probPath.unitNum);
delete[] data;*/
SetDataFixed(state->probPath, -1e9F);
for (int i = 0; i < state->probPath.unitNum; ++i) {
if (i % beamSize == 0)
state->probPath.Set(0.0F, i);
}
state->nstep.SetZeroAll();
state->endMark.SetZeroAll();
delete[] data;
state->stateNum = 0;
}
......@@ -125,15 +132,15 @@ void T2TPredictor::SetStartSymbol(int symbol)
startSymbol = symbol;
}
/*
read a state
/*
read a state
>> model - the t2t model that keeps the network created so far
>> state - a set of states. It keeps
1) hypotheses (states)
2) probabilities of hypotheses
3) parts of the network for expanding toward the next state
*/
void T2TPredictor::Read(T2TModel * model, T2TStateBundle * state)
void T2TPredictor::Read(T2TModel* model, T2TStateBundle* state)
{
m = model;
s = state;
......@@ -147,8 +154,7 @@ predict the next state
>> paddingEnc - padding of the encoder
>>> isStart - is the start or not
*/
void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
XTensor * inputEnc, XTensor * paddingEnc, bool isStart)
void T2TPredictor::Predict(T2TStateBundle* next, XTensor* encoding, XTensor* inputEnc, XTensor* paddingEnc, bool isStart)
{
int dims[MAX_TENSOR_DIM_NUM];
......@@ -157,42 +163,43 @@ void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
/* the first token */
XTensor first;
CheckNTErrors(inputEnc->order >= 2, "Wrong order of the tensor!");
for(int i = 0; i < inputEnc->order - 1; i++)
for (int i = 0; i < inputEnc->order - 1; i++)
dims[i] = inputEnc->GetDim(i);
dims[inputEnc->order - 1] = 1;
InitTensor(&first, inputEnc->order, dims, X_INT, inputEnc->devID);
InitTensorV2(&first, inputEnc->order, dims, X_INT, 1.0F, inputEnc->devID);
SetDataFixedInt(first, startSymbol);
/* add a new word into the input sequence of the decoder side */
if (isStart) {
inputDec = Identity(first);
}
else{
else {
/* only pass one step to the decoder */
inputDec = GetLastPrediction(s);
inputDec.SetDevice(inputEnc->devID);
}
/* prediction probabilities */
XTensor &output = next->prob;
XTensor& output = next->prob;
XTensor decoding;
for(int i = 0; i < inputDec.order - 1; i++)
for (int i = 0; i < inputDec.order - 1; i++)
dims[i] = inputDec.GetDim(i);
dims[inputDec.order - 1] = inputDec.GetDim(-1);
XTensor paddingDec;
InitTensor(&paddingDec, inputDec.order, dims, X_INT, paddingEnc->devID);
InitTensorV2(&paddingDec, inputDec.order, dims, X_INT, 1.0F, paddingEnc->devID);
SetDataFixedInt(paddingDec, 1);
XTensor maskDec;
XTensor maskEncDec;
/* decoder mask */
m->MakeMTMaskDec(*inputEnc, inputDec, *paddingEnc, paddingDec, maskDec, maskEncDec, 0);
//m->MakeMTMaskDec(*inputEnc, inputDec, *paddingEnc, paddingDec, maskDec, maskEncDec, 0);
/* make the decoding network */
decoding = m->decoder->Make(inputDec, *encoding, NULL, maskEncDec, false);
......@@ -203,38 +210,38 @@ void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
m->outputLayer->Make(decoding, output);
}
/*
generate paths up to the states of the current step
/*
generate paths up to the states of the current step
>> state - state bundle of the current step
*/
XTensor T2TPredictor::GeneratePaths(T2TStateBundle * state)
XTensor T2TPredictor::GeneratePaths(T2TStateBundle* state)
{
CheckNTErrors(state->stateNum >= 0, "Illegal state!");
int distance = -1;
for(int i = 0; i < state->stateNum; i++){
T2TState * cur = state->states + i;
for (int i = 0; i < state->stateNum; i++) {
T2TState* cur = state->states + i;
int nsteps = 0;
while(cur != NULL){
while (cur != NULL) {
nsteps++;
cur = cur->last;
}
if(nsteps > distance)
if (nsteps > distance)
distance = nsteps;
}
XTensor path;
InitTensor2D(&path, state->stateNum, distance, X_INT);
InitTensor2DV2(&path, state->stateNum, distance, X_INT);
path.SetZeroAll();
for(int i = 0; i < state->stateNum; i++){
T2TState * cur = state->states + i;
for (int i = 0; i < state->stateNum; i++) {
T2TState* cur = state->states + i;
int nsteps = 0;
while(cur != NULL){
while (cur != NULL) {
nsteps++;
path.Set2DInt(cur->prediction, i, distance - nsteps);
cur = cur->last;
......@@ -253,7 +260,7 @@ XTensor T2TPredictor::GetLastPrediction(T2TStateBundle* state)
CheckNTErrors(state->stateNum >= 0, "Illegal state!");
XTensor lastPred;
InitTensor2D(&lastPred, state->stateNum, 1, X_INT);
InitTensor2DV2(&lastPred, state->stateNum, 1, X_INT);
for (int i = 0; i < state->stateNum; i++) {
T2TState* cur = state->states + i;
......
......@@ -15,10 +15,10 @@
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-13
* This is the first source file I create in 2019 - new start!
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-13
* This is the first source file I create in 2019 - new start!
*/
#ifndef __T2TPREDICTOR_H__
#define __T2TPREDICTOR_H__
......@@ -39,8 +39,8 @@ public:
/* we assume that the prediction is an integer */
int prediction;
/* id of the problem. One can regard it as the sentence id when we
translate a number of sentences in the batched manner. The hypothesis
/* id of the problem. One can regard it as the sentence id when we
translate a number of sentences in the batched manner. The hypothesis
is empty if id = -1 */
int pid;
......@@ -66,7 +66,7 @@ public:
int nstep;
/* pointer to the previous state */
T2TState * last;
T2TState* last;
};
/* a bundle of states */
......@@ -75,7 +75,7 @@ class T2TStateBundle
public:
/* predictions */
XTensor prediction;
/* id of the previous state that generates the current one */
XTensor preID;
......@@ -95,7 +95,7 @@ public:
XTensor nstep;
/* list of states */
T2TState * states;
T2TState* states;
/* number of states */
int stateNum;
......@@ -114,19 +114,19 @@ public:
void MakeStates(int num);
};
/* The predictor reads the current state and then predicts the next.
/* The predictor reads the current state and then predicts the next.
It is exactly the same procedure of MT inference -
we get the state of previous words and then generate the next word.
Here, a state can be regarded as the representation of words (word
Here, a state can be regarded as the representation of words (word
indices, hidden states, embeddings and etc.). */
class T2TPredictor
{
private:
/* pointer to the transformer model */
T2TModel * m;
T2TModel* m;
/* current state */
T2TStateBundle * s;
T2TStateBundle* s;
/* start symbol */
int startSymbol;
......@@ -139,19 +139,19 @@ public:
~T2TPredictor();
/* create an initial state */
void Create(T2TModel * model, XTensor * top, const XTensor * input, int beamSize, T2TStateBundle * state);
void Create(T2TModel* model, XTensor* top, const XTensor* input, int beamSize, T2TStateBundle* state);
/* set the start symbol */
void SetStartSymbol(int symbol);
/* read a state */
void Read(T2TModel * model, T2TStateBundle * state);
void Read(T2TModel* model, T2TStateBundle* state);
/* predict the next state */
void Predict(T2TStateBundle * next, XTensor * encoding, XTensor * inputEnc, XTensor * paddingEnc, bool isStart);
void Predict(T2TStateBundle* next, XTensor* encoding, XTensor* inputEnc, XTensor* paddingEnc, bool isStart);
/* generate paths up to the states of the current step */
XTensor GeneratePaths(T2TStateBundle * state);
XTensor GeneratePaths(T2TStateBundle* state);
/* get the predictions of the previous step */
XTensor GetLastPrediction(T2TStateBundle* state);
......
......@@ -15,9 +15,9 @@
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
*/
#ifndef __T2TSEARCH_H__
#define __T2TSEARCH_H__
......@@ -40,10 +40,10 @@ private:
/* predictor */
T2TPredictor predictor;
/* max length of the generated sequence */
int maxLength;
/* beam size */
int beamSize;
......@@ -51,10 +51,10 @@ private:
int batchSize;
/* we keep the final hypotheses in a heap for each sentence in the batch. */
XHeap<MIN_HEAP, float> * fullHypos;
XHeap<MIN_HEAP, float>* fullHypos;
/* array of the end symbols */
int * endSymbols;
int* endSymbols;
/* number of the end symbols */
int endSymbolNum;
......@@ -68,42 +68,42 @@ public:
/* de-constructor */
~T2TSearch();
/* initialize the model */
void Init(int argc, char ** argv);
void Init(int argc, char** argv);
/* search for the most promising states */
void Search(T2TModel * model, XTensor * input, XTensor * padding, XTensor * output);
void Search(T2TModel* model, XTensor* input, XTensor* padding, XTensor* output);
/* preparation */
void Prepare(int myBatchSize,int myBeamSize);
void Prepare(int myBatchSize, int myBeamSize);
/* compute the model score for each hypothesis */
void Score(T2TStateBundle * prev, T2TStateBundle * beam);
void Score(T2TStateBundle* prev, T2TStateBundle* beam);
/* generate token indices via beam pruning */
void Generate(T2TStateBundle * beam);
void Generate(T2TStateBundle* beam);
/* expand the search graph */
void Expand(T2TStateBundle * prev, T2TStateBundle * beam);
void Expand(T2TStateBundle* prev, T2TStateBundle* beam);
/* collect hypotheses with ending symbol */
void Collect(T2TStateBundle * beam);
void Collect(T2TStateBundle* beam);
/* fill the hypotheis heap with incomplete hypothses */
void FillHeap(T2TStateBundle * beam);
void FillHeap(T2TStateBundle* beam);
/* save the output sequences in a tensor */
void Dump(XTensor * output);
void Dump(XTensor* output);
/* check if the token is an end symbol */
bool IsEnd(int token);
/* set end symbols for search */
void SetEnd(const int * tokens, const int tokenNum);
void SetEnd(const int* tokens, const int tokenNum);
/* make a mask to prevent duplicated entries in beam expansion for the first position */
XTensor MakeFirstMask(T2TStateBundle * beam);
XTensor MakeFirstMask(T2TStateBundle* beam);
};
}
......
......@@ -15,9 +15,9 @@
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
*/
#include <math.h>
#include "T2TUtility.h"
......@@ -44,23 +44,23 @@ T2TTester::~T2TTester()
}
/* initialize the model */
void T2TTester::Init(int argc, char ** argv)
void T2TTester::Init(int argc, char** argv)
{
LoadParamInt(argc, argv, "vsize", &vSize, 34040);
LoadParamInt(argc, argv, "vsizetgt", &vSizeTgt, vSize);
LoadParamInt(argc, argv, "sentbatch", &sentBatch, 1);
LoadParamBool(argc, argv, "sort", &batchLoader.sortBuffer, true);
seacher.Init(argc, argv);
}
/*
/*
test the model
>> fn - test data file
>> ofn - output data file
>> model - model that is trained
*/
void T2TTester::Test(const char * fn, const char * ofn, T2TModel * model)
void T2TTester::Test(const char* fn, const char* ofn, T2TModel* model)
{
int wc = 0;
int wordCount = 0;
......@@ -86,7 +86,7 @@ void T2TTester::Test(const char * fn, const char * ofn, T2TModel * model)
int* seqs = new int[MILLION];
batchLoader.Init(fn);
int count = 0;
while (!batchLoader.IsEmpty())
......@@ -94,23 +94,23 @@ void T2TTester::Test(const char * fn, const char * ofn, T2TModel * model)
count++;
wordCount = 0;
for (int i = 0; i < model->decoder->nlayer; ++i) {
model->decoder->selfCache[i].miss = true;
model->decoder->contextCache[i].miss = true;
model->decoder->selfAttCache[i].miss = true;
model->decoder->enDeAttCache[i].miss = true;
}
vector<int> indices = batchLoader.LoadBatch(&batchEnc, &paddingEnc, sentBatch, devID);
XTensor output;
seacher.Search(model, &batchEnc, &paddingEnc, &output);
output.Dump(stderr);
for (int i = 0; i < indices.size(); ++i) {
Result res;
XTensor sent, srcIdx, tgtIdx;
InitTensor1D(&srcIdx, 1, X_INT, output.devID);
int idx[]{i};
InitTensor1DV2(&srcIdx, 1, X_INT, output.devID);
int idx[]{ i };
srcIdx.SetData(idx, 1);
InitTensor(&tgtIdx, &srcIdx);
InitTensorV2(&tgtIdx, &srcIdx);
SetAscendingOrder(tgtIdx, 0);
sent = CopyIndexed(output, 0, srcIdx, tgtIdx);
......@@ -127,9 +127,9 @@ void T2TTester::Test(const char * fn, const char * ofn, T2TModel * model)
if (batchCount % 1 == 0) {
double elapsed = GetClockSec() - startT;
XPRINT3(0, stderr,
"[INFO] elapsed=%.1fs, sentence=%d, sword=%d\n",
elapsed, sentCount, wordCount);
XPRINT3(0, stderr,
"[INFO] elapsed=%.1fs, sentence=%d, sword=%d\n",
elapsed, sentCount, wordCount);
}
}
......@@ -138,11 +138,11 @@ void T2TTester::Test(const char * fn, const char * ofn, T2TModel * model)
for (auto res : batchLoader.resBuffer) {
Dump(ofile, &res.values);
}
fclose(ofile);
delete[] seqs;
double elapsed = GetClockSec() - startT;
XPRINT3(0, stderr, "[INFO] test finished (took %.1fs, word=%d, sent=%d)\n", elapsed, wordCountTotal, sentCount);
......@@ -153,7 +153,7 @@ dump the result into the file
>> file - data file
>> output - output tensor
*/
void T2TTester::Dump(FILE * file, XTensor * output)
void T2TTester::Dump(FILE* file, XTensor* output)
{
int seqLength = output->GetDim(-1);
......
......@@ -15,10 +15,10 @@
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
* A week with no trips :)
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
* A week with no trips :)
*/
#ifndef __T2TTESTER_H__
#define __T2TTESTER_H__
......@@ -41,7 +41,7 @@ public:
/* batch size for sentences */
int sentBatch;
/* for batching */
DataSet batchLoader;
......@@ -56,13 +56,13 @@ public:
~T2TTester();
/* initialize the model */
void Init(int argc, char ** argv);
void Init(int argc, char** argv);
/* test the model */
void Test(const char * fn, const char * ofn, T2TModel * model);
void Test(const char* fn, const char* ofn, T2TModel* model);
/* dump the result into the file */
void Dump(FILE * file, XTensor * output);
void Dump(FILE* file, XTensor* output);
};
}
......
......@@ -38,7 +38,7 @@ namespace transformer
{
/* entrance of the program */
int TransformerMain(int argc, const char ** argv);
int TransformerMain(int argc, const char** argv);
}
......
......@@ -28,6 +28,7 @@
#include "XList.h"
#include "XGlobal.h"
/* the nts (NiuTrans.Tensor) namespace */
namespace nts {
......@@ -363,6 +364,8 @@ template struct TensorListBase<long>;
template struct TensorListBase<float>;
template struct TensorListBase<short>;
template struct TensorListBase<XTensor*>;
template struct TensorListBase<uint64_t>;
template struct TensorListBase<void*>;
} /* end of the nts (NiuTrans.Tensor) namespace */
\ No newline at end of file
......@@ -26,6 +26,8 @@
#include "XMem.h"
#include "XGlobal.h"
#include <cstdint>
#ifndef __TensorList_H__
#define __TensorList_H__
......@@ -118,7 +120,14 @@ public:
void Shuffle(int nround = 10, int beg = -1, int len = 0);
/* short */
T& operator[] (int i) { return GetItem(i); };
T& operator[] (int i) {
CheckNTErrors(i >= -count && i < count, "Index of a list item is out of scope!");
CheckNTErrors(count > 0, "Cannt index the item in an empty list!");
if (i < 0)
return items[count + i];
else
return items[i];
};
T& Get(int i) { return GetItem(i); };
void Set(int i, T item) { SetItem(i, item); };
};
......@@ -132,7 +141,7 @@ typedef TensorListBase<char*> StrList;
typedef TensorListBase<long> LongList;
typedef TensorListBase<float> FloatList;
typedef TensorListBase<short> ShortList;
typedef TensorListBase<uint64_t> UInt64List;
typedef TensorListBase<XTensor*> TensorList;
} /* end of the nts (NiuTrans.Tensor) namespace */
......
......@@ -86,7 +86,7 @@ void _funcCPUName(const XTensor * input, XTensor * output, int dim)
vecBuf[j] = VectorBuffer::loadu((DTYPE*)(ip)+j * vecBufLength); \
} \
for (int j = 1; j < strideNum / 32; j++) { \
const DTYPE* ptr = (DTYPE*)(ip + j * vecBufLength); \
const DTYPE* ptr = (DTYPE*)(ip + j * 4 * vecBufLength); \
vecBuf[0] = vecBuf[0]._vectorOp(VectorBuffer::loadu(ptr + 0 * vecBufLength)); \
vecBuf[1] = vecBuf[1]._vectorOp(VectorBuffer::loadu(ptr + 1 * vecBufLength)); \
vecBuf[2] = vecBuf[2]._vectorOp(VectorBuffer::loadu(ptr + 2 * vecBufLength)); \
......@@ -106,7 +106,7 @@ void _funcCPUName(const XTensor * input, XTensor * output, int dim)
else { \
/* data is separated */ \
for(int i = 0; i < blockNum; i++){ \
for(int j = 0; j < input->dimSize[input->order - 1] / 32; j++){ \
for(int j = 0; j < stride / 32; j++){ \
DTYPE * ip = (DTYPE*)input->data + blockSize * i; \
DTYPE * op = (DTYPE*)output->data + stride * i; \
VectorBuffer vecBuf[4]; \
......
......@@ -42,7 +42,7 @@ void _ReduceMean(const XTensor * input, XTensor * output, int dim)
int num = input->dimSize[dim];
_ReduceSum(input, output, dim);
_ScaleAndShiftMe(output, (DTYPE)1/num, 0);
_ScaleAndShiftMe(output, 1.0F/(DTYPE)(num), 0);
}
/*
......
......@@ -105,7 +105,7 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
vecBuf[j] = VectorBuffer::loadu((DTYPE*)(ip) + j * vecBufLength, isExp, power, bias);
}
for(int j = 1; j < strideNum / 32; j++){
const DTYPE* ptr = (DTYPE*)(ip + j * vecBufLength);
const DTYPE* ptr = (DTYPE*)(ip + (j * 4) * vecBufLength);
vecBuf[0] = vecBuf[0] + VectorBuffer::loadu(ptr + 0 * vecBufLength, isExp, power, bias);
vecBuf[1] = vecBuf[1] + VectorBuffer::loadu(ptr + 1 * vecBufLength, isExp, power, bias);
vecBuf[2] = vecBuf[2] + VectorBuffer::loadu(ptr + 2 * vecBufLength, isExp, power, bias);
......@@ -122,7 +122,7 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
} else{
//data is separated
for(int i = 0; i < blockNum; i++){
for(int j = 0; j < input->dimSize[input->order - 1] / 32; j++){
for(int j = 0; j < stride / 32; j++){
DTYPE * ip = (DTYPE*)input->data + blockSize * i;
DTYPE * op = (DTYPE*)output->data + stride * i;
DTYPE * sp = shift != NULL ? (DTYPE*)shift->data + stride * i : NULL;
......@@ -133,8 +133,7 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor
}
VectorBuffer vecBuf[4];
for(int k = 0; k < 4; k++){
vecBuf[k] = VectorBuffer::loadu((DTYPE*)(ip) + (j * 4 + k) * 32 / sizeof(DTYPE), isExp, power, bias + j * 32 / sizeof(DTYPE));
vecBuf[k] = VectorBuffer::loadu((DTYPE*)(ip) + (j * 4 + k) * 32 / sizeof(DTYPE), isExp, power, bias + k * 32 / sizeof(DTYPE));
}
for(int k = 1; k < strideNum; k++){
DTYPE * ptr = ip + k * stride + (j * 4) * vecBufLength;
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论