add support for greedy search

99097e41 · huchi · bfa6fc90 · 99097e41 · 99097e41 · 99097e41
Commit 99097e41 authored Feb 17, 2020 by huchi
--- a/source/Main.cpp
+++ b/source/Main.cpp
@@ -19,6 +19,10 @@
 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-10
 */

+//#define CRTDBG_MAP_ALLOC
+//#include <stdlib.h>
+//#include <crtdbg.h>
+
 #include <stdio.h>
 #include "./network/XNet.h"
 #include "./tensor/XUtility.h"
@@ -27,9 +31,7 @@
 #include "./sample/fnnlm/FNNLM.h"
 #include "./sample/transformer/Transformer.h"

-//#define CRTDBG_MAP_ALLOC
-//#include <stdlib.h>
-//#include <crtdbg.h>
+

 using namespace nts;
 using namespace fnnlm;
@@ -37,19 +39,10 @@ using namespace transformer;

 int main( int argc, const char ** argv )
 {
-    //_CrtSetDbgFlag(_CrtSetDbgFlag(_CRTDBG_REPORT_FLAG) | _CRTDBG_LEAK_CHECK_DF);
-    //_CrtSetBreakAlloc(2708);
+    /*_CrtSetDbgFlag(_CrtSetDbgFlag(_CRTDBG_REPORT_FLAG) | _CRTDBG_LEAK_CHECK_DF);
+    _CrtSetBreakAlloc(2708);*/

    TransformerMain(argc - 1, argv + 1);
-    /*XTensor x;
-    InitTensor2D(&x, 2, 2);
-    float d[]{ 1,2,3,4 };
-    x.SetData(d, 4);
-    XTensor y;
-    y = ReduceSum(x, 0);
-    y.Dump(stderr);*/
-
-
    //_CrtDumpMemoryLeaks();
    
    return 0;

--- a/source/sample/transformer/T2TAttention.cpp
+++ b/source/sample/transformer/T2TAttention.cpp
@@ -62,7 +62,7 @@ void T2TAttention::InitModel(int argc, char** argv,

    float minmax = 0;

-    LoadParamInt(argc, argv, "nhead", &nhead, 8);
+    LoadParamInt(argc, argv, "nhead", &nhead, 4);
    LoadParamInt(argc, argv, "d", &dk, DEFAULT_EMBEDDING_SIZE);
    LoadParamInt(argc, argv, "d", &dv, DEFAULT_EMBEDDING_SIZE);
    LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
@@ -70,15 +70,15 @@ void T2TAttention::InitModel(int argc, char** argv,
    LoadParamFloat(argc, argv, "attminmax", &minmax, 0.1F);
    LoadParamFloat(argc, argv, "dropoutatt", &dropoutP, 0);

-    InitTensor2D(&wq, d, d, X_FLOAT, devID);
-    InitTensor1D(&bq, d, X_FLOAT, devID);
-    InitTensor2D(&wk, d, d, X_FLOAT, devID);
-    InitTensor1D(&bk, d, X_FLOAT, devID);
-    InitTensor2D(&wv, d, d, X_FLOAT, devID);
-    InitTensor1D(&bv, d, X_FLOAT, devID);
-    InitTensor2D(&rp_embedding_k, max_relative_position * 2 + 1, d/nhead, X_FLOAT, devID);
-    InitTensor2D(&wa, d, d, X_FLOAT, devID);
-    InitTensor1D(&ba, d, X_FLOAT, devID);
+    InitTensor2DV2(&wq, d, d, X_FLOAT, devID);
+    InitTensor1DV2(&bq, d, X_FLOAT, devID);
+    InitTensor2DV2(&wk, d, d, X_FLOAT, devID);
+    InitTensor1DV2(&bk, d, X_FLOAT, devID);
+    InitTensor2DV2(&wv, d, d, X_FLOAT, devID);
+    InitTensor1DV2(&bv, d, X_FLOAT, devID);
+    InitTensor2DV2(&rp_embedding_k, max_relative_position * 2 + 1, d/nhead, X_FLOAT, devID);
+    InitTensor2DV2(&wo, d, d, X_FLOAT, devID);
+    InitTensor1DV2(&bo, d, X_FLOAT, devID);
 }

 /*
@@ -94,24 +94,27 @@ make the network
 >> cacheType - which type that cache is
 << return - multi-attention result
 */
-XTensor T2TAttention::Make( XTensor& k,  XTensor& q,  XTensor& v, XTensor* mask, bool isTraining, Cache* cache, int cacheType)
+XTensor T2TAttention::Make(XTensor& k, XTensor& q, XTensor& v, XTensor* mask, bool isTraining, Cache* cache, int cacheType)
 {
    const bool isEnc = (!cache) ? true : false;
    
    /* linear transformation before self-attention */
+    
    XTensor q2, k2, v2;
-    q2 = MatrixMul(q, X_NOTRANS, wq, X_TRANS) + bq;
+
+    q2 = MatrixMul(q, wq) + bq;

    if (!cache) {
        /* self attention for encoder layers */
-        k2 = MatrixMul(k, X_NOTRANS, wk, X_TRANS) + bk;
-        v2 = MatrixMul(v, X_NOTRANS, wv, X_TRANS) + bv;
+        k2 = MatrixMul(k, wk) + bk;
+        v2 = MatrixMul(v, wv) + bv;
        return MakeRPRAttention(k2, q2, v2, mask, isTraining, isEnc);
    }
+    
    else {
        if (cacheType == SELF_ATT) {
-            k2 = MatrixMul(k, X_NOTRANS, wk, X_TRANS) + bk;
-            v2 = MatrixMul(v, X_NOTRANS, wv, X_TRANS) + bv;
+            k2 = MatrixMul(k, wk) + bk;
+            v2 = MatrixMul(v, wv) + bv;

            /* if hit, we only concat the cache with the new token */
            if (!cache->miss) {
@@ -121,12 +124,13 @@ XTensor T2TAttention::Make( XTensor& k,  XTensor& q,  XTensor& v, XTensor* mask,
            cache->key = k2;
            cache->value = v2;
            cache->miss = false;
+
            return MakeRPRAttention(cache->key, q2, cache->value, mask, isTraining, isEnc);
        }
        else if (cacheType == EN_DE_ATT) {
            if (cache->miss) {
-                cache->key = MatrixMul(k, X_NOTRANS, wk, X_TRANS) + bk;
-                cache->value = MatrixMul(v, X_NOTRANS, wv, X_TRANS) + bv;
+                cache->key = MatrixMul(k, wk) + bk;
+                cache->value = MatrixMul(v, wv) + bv;
                cache->miss = false;
            }
            return MakeAttention(cache->key, q2, cache->value, mask, isTraining, isEnc);
@@ -145,7 +149,7 @@ make the attention network given keys, queries and values (after linear transfor
 >> mask - as it is
 >> isTraining - indicates whether the model is used for training
 */
-XTensor T2TAttention::MakeAttention(XTensor &k, XTensor& q, XTensor& v, XTensor* mask, bool isTraining, bool is_encoder)
+XTensor T2TAttention::MakeAttention(XTensor& k, XTensor& q, XTensor& v, XTensor* mask, bool isTraining, bool is_encoder)
 {
    XTensor kheads;
    XTensor qheads;
@@ -163,21 +167,20 @@ XTensor T2TAttention::MakeAttention(XTensor &k, XTensor& q, XTensor& v, XTensor*
    /* scalar = softmax(Q * K^T / sqrt(dk)) * V */
    dot = BMMul(qheads, X_NOTRANS, kheads, X_TRANS);
    
-    /*if (isMasked && mask) {
-        _SumMe(&dot, mask);
-    }*/
+    /*if (isMasked && mask)
+        _SumMe(&dot, mask);*/
    
    dot = Linear(dot, 1.0F / (float)sqrt((float)dk / nhead));

    scalar = Softmax(dot, -1);

-    /*if(isTraining && dropoutP > 0)
-        scalar = Dropout(scalar, dropoutP);*/
+    if(isTraining && dropoutP > 0)
+        scalar = Dropout(scalar, dropoutP);

    att = BMMul(scalar, vheads);

    /* concatenate the heads */
-    return MulAndShift(Merge(att, att.order - 1), X_NOTRANS, wa, X_TRANS, ba);
+    return MulAndShift(Merge(att, att.order - 1), wo, bo);
 }

 /*
@@ -215,14 +218,12 @@ XTensor T2TAttention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v, XTens
    InitTensor4DV2(&dot, nhead, batch_size, len_q, len_kv, X_FLOAT, q.devID);

    /* generate the relative emb index (L_q, L_kv) */
-    GetRPEmbedding(&emb_matrix, len_q, len_kv, max_relative_position, q.devID,is_encoder);
-
+    GetRPEmbedding(&emb_matrix, len_q, len_kv, max_relative_position, q.devID, is_encoder);

    /* generate the relative key from the rp_embedding_k (L_q, L_kv, H/K) */
    _Gather(&rp_embedding_k, &relative_key, &emb_matrix);

    /* RPR dot product (K, B, L_q, L_kv)*/
-    qheads = qheads / float(nhead);

    RPDotProduct(&qheads, &kheads, &relative_key, &dot, true);

@@ -230,19 +231,19 @@ XTensor T2TAttention::MakeRPRAttention(XTensor& k, XTensor& q, XTensor& v, XTens
        _SumMe(&dot, mask);*/

    /* scale the dot result */
-    //dot = Linear(dot, 1.0F / (float)sqrt((float)dk / nhead));
+    dot = Linear(dot, 1.0F / (float)sqrt((float)dk / nhead));

    /* softmax */
    scalar = Softmax(dot, -1);

-    /*if (isTraining && dropoutP > 0)
-        scalar = Dropout(scalar, dropoutP);*/
+    if (isTraining && dropoutP > 0)
+        scalar = Dropout(scalar, dropoutP);

    /* generate the relative attention output (K, B, L_q, H/K) */
    att = BMMul(scalar, vheads);
    
    /* concatenate the heads */
-    return MulAndShift(Merge(att, att.order - 1), X_NOTRANS, wa, X_TRANS, ba);
+    return MulAndShift(Merge(att, att.order - 1), wo, bo);
 }

 void T2TAttention::GetRPEmbedding(XTensor* emb_matrix, const int len_q, const int len_kv, const int max_relative_length, const int devID, const bool is_encoder)
@@ -251,10 +252,11 @@ void T2TAttention::GetRPEmbedding(XTensor* emb_matrix, const int len_q, const in
    XTensor range;
    InitTensor1DV2(&range, len_kv, X_INT, devID);
    int* index = new int[len_kv];
+
    // for encoder self-attention which the L_q = L_kv
    if (is_encoder)
    {
-        for (int i = 0; i <len_kv; i++)
+        for (int i = 0; i < len_kv; i++)
            index[i] = i;
        range.SetData(index, len_kv);
        XTensor range_2D, range_2D_t;
@@ -267,7 +269,7 @@ void T2TAttention::GetRPEmbedding(XTensor* emb_matrix, const int len_q, const in
    // for decoder self-attention which the L_q != L_kv, and L_q is 1
    else
    {
-        for (int i = 0; i <len_kv; i++)
+        for (int i = 0; i < len_kv; i++)
            index[i] = -len_kv + i + 1;
        range.SetData(index, len_kv);
        _Unsqueeze(&range, emb_matrix, 0, len_q);
@@ -299,7 +301,6 @@ void T2TAttention::RPDotProduct(XTensor* x, XTensor* y, XTensor* z, XTensor* att
    XTensor context;
    InitTensor4DV2(&context, head_num, batch_size, len_q, last_dim, X_FLOAT, x->devID);
    _MatrixMulBatched(x, X_NOTRANS, y, transpose_flag, &context);
-    //if (profiler_) profiler_->FinishTimer("RPDotPro-BMM");

    // reshape and transpose x to (L_q, K*B, H/K or L_kv)
    int merge_dims[] = { head_num * batch_size, len_q, x->dimSize[3] };
@@ -323,5 +324,6 @@ void T2TAttention::RPDotProduct(XTensor* x, XTensor* y, XTensor* z, XTensor* att
    relative_t.Reshape(4, split_dims);

    _Sum(&context, &relative_t, attention);
+
 }
 }
--- a/source/sample/transformer/T2TAttention.h
+++ b/source/sample/transformer/T2TAttention.h
@@ -90,14 +90,18 @@ public:
    /* bias for V */
    XTensor bv;

+    XTensor wBig;
+
+    XTensor bBig;
+
    /* RPR emb */
    XTensor rp_embedding_k;

    /* transformation after dot-product attention */
-    XTensor wa;
+    XTensor wo;

    /* bias after dot-product attention */
-    XTensor ba;
+    XTensor bo;

    /* size of transformed Q and K */
    int dk;

--- a/source/sample/transformer/T2TDecoder.cpp
+++ b/source/sample/transformer/T2TDecoder.cpp
@@ -31,27 +31,27 @@ namespace transformer
 /* constructor */
 AttDecoder::AttDecoder()
 {
-    attentions = NULL;
+    selfAtt = NULL;
    fnns = NULL;
-    attLayerNorms = NULL;
-    attentionsEnde = NULL;
-    attEndeLayerNorms = NULL;
-    decodeLayerNorm = NULL;
-    selfCache = NULL;
-    contextCache = NULL;
+    selfAttLayerNorms = NULL;
+    enDeAtt = NULL;
+    enDeAttLayerNorms = NULL;
+    decoderLayerNorm = NULL;
+    selfAttCache = NULL;
+    enDeAttCache = NULL;
 }

 /* de-constructor */
 AttDecoder::~AttDecoder()
 {
-    delete[] selfCache;
-    delete[] contextCache;
-    delete[] attentions;
+    delete[] selfAttCache;
+    delete[] enDeAttCache;
+    delete[] selfAtt;
    delete[] fnns;
-    delete[] attLayerNorms;
-    delete[] attentionsEnde;
-    delete[] attEndeLayerNorms;
-    delete decodeLayerNorm;
+    delete[] selfAttLayerNorms;
+    delete[] enDeAtt;
+    delete[] enDeAttLayerNorms;
+    delete decoderLayerNorm;
 }

 /* 
@@ -71,7 +71,7 @@ void AttDecoder::InitModel(int argc, char ** argv,
    devID = myDevID;
    ignored = myIgnored;

-    LoadParamInt(argc, argv, "nlayer", &nlayer, 3);
+    LoadParamInt(argc, argv, "nlayer", &nlayer, 4);
    LoadParamInt(argc, argv, "hsize", &hSize, DEFAULT_EMBEDDING_SIZE);
    LoadParamInt(argc, argv, "esize", &eSize, DEFAULT_EMBEDDING_SIZE);
    LoadParamInt(argc, argv, "vsizetgt", &vSize, 34040);
@@ -83,24 +83,24 @@ void AttDecoder::InitModel(int argc, char ** argv,
    /* embedding model */
    embedder.InitModel(argc, argv, devID, false);

-    attentions = new T2TAttention[nlayer];
+    selfAtt = new T2TAttention[nlayer];
    fnns = new T2TFNN[nlayer];
-    attLayerNorms = new T2TLN[nlayer];
-    attentionsEnde = new T2TAttention[nlayer];
-    attEndeLayerNorms = new T2TLN[nlayer];
-    decodeLayerNorm = new T2TLN;
-    selfCache = new Cache[nlayer];
-    contextCache = new Cache[nlayer];
+    selfAttLayerNorms = new T2TLN[nlayer];
+    enDeAtt = new T2TAttention[nlayer];
+    enDeAttLayerNorms = new T2TLN[nlayer];
+    decoderLayerNorm = new T2TLN;
+    selfAttCache = new Cache[nlayer];
+    enDeAttCache = new Cache[nlayer];

    /* initialize the stacked layers */
    for (int i = 0; i < nlayer; i++) {
-        attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID);
+        selfAtt[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID);
        fnns[i].InitModel(argc, argv, myDevID);
-        attLayerNorms[i].InitModel(argc, argv, myDevID);
-        attentionsEnde[i].InitModel(argc, argv, true, myIgnored, myDevID);
-        attEndeLayerNorms[i].InitModel(argc, argv, myDevID);
+        selfAttLayerNorms[i].InitModel(argc, argv, myDevID);
+        enDeAtt[i].InitModel(argc, argv, true, myIgnored, myDevID);
+        enDeAttLayerNorms[i].InitModel(argc, argv, myDevID);
    }
-    decodeLayerNorm->InitModel(argc, argv, myDevID);
+    decoderLayerNorm->InitModel(argc, argv, myDevID);
 }

 /* 
@@ -131,48 +131,38 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor *mask, X
        XTensor attNorm;

        /* layer normalization */
-        inputNorm = attLayerNorms[i].Make(x);
-        //inputNorm.Dump(stderr, "inputNorm", 10);
+        inputNorm = selfAttLayerNorms[i].Make(x);

        /******************/
        /* self attention */
-        att = attentions[i].Make(inputNorm, inputNorm, inputNorm, NULL, isTraining, &selfCache[i], SELF_ATT);
+        att = selfAtt[i].Make(inputNorm, inputNorm, inputNorm, NULL, isTraining, &selfAttCache[i], SELF_ATT);

        /* dropout */
        if(isTraining && dropoutP > 0)
            att = Dropout(att, dropoutP);

        /* residual connection */
-        _SumMe(&att, &x);
-        //att.Dump(stderr, "Sum(att, x)", 10);
+        att = att + x;

        /* layer normalization */
-        attNorm = attEndeLayerNorms[i].Make(att);
-        //attNorm.Dump(stderr, "attNorm", 10);
+        attNorm = enDeAttLayerNorms[i].Make(att);

        /* encoder-decoder attention */
-        ende = attentionsEnde[i].Make(outputEnc, attNorm, outputEnc, &maskEncDec, isTraining, &contextCache[i], EN_DE_ATT);
-
-        //ende.Dump(stderr, "ende atten", 10);
+        ende = enDeAtt[i].Make(outputEnc, attNorm, outputEnc, &maskEncDec, isTraining, &enDeAttCache[i], EN_DE_ATT);

        /* dropout */
        if(isTraining && dropoutP > 0)
            ende = Dropout(ende, dropoutP);

        /* residual connection */
-        _SumMe(&ende, &att);
-        //res.Dump(stderr, "Sum(ende, att)", 10);
+        ende = ende + att;

        /* fnn */
        x = fnns[i].Make(ende, isTraining);
-        //x.Dump(stderr, "fnns[i]", 10);

    }

-    x = decodeLayerNorm->Make(x);
-    //x.Dump(stderr, "decodeLayerNorm", 10);
-    
-    x.SetName(DECODING_NAME);
+    x = decoderLayerNorm->Make(x);

    return x;
 }

--- a/source/sample/transformer/T2TDecoder.h
+++ b/source/sample/transformer/T2TDecoder.h
@@ -63,13 +63,13 @@ public:
    T2TFNN * fnns;

    /* attention model of each layer */
-    T2TAttention * attentions;
+    T2TAttention * selfAtt;

    /* layer normalization for attention */
-    T2TLN * attLayerNorms;
+    T2TLN * selfAttLayerNorms;

    /* layer normalization for decoder */
-    T2TLN * decodeLayerNorm;
+    T2TLN * decoderLayerNorm;

    /* input tensor of the encoder */
    XTensor * input;
@@ -78,16 +78,16 @@ public:
    XTensor * output;

    /* encoder-decoder attention model of each layer */
-    T2TAttention * attentionsEnde;
+    T2TAttention * enDeAtt;

    /* layer normalization for encoder-decoder attention */
-    T2TLN * attEndeLayerNorms;
+    T2TLN * enDeAttLayerNorms;

    /* layer cache list */
-    Cache* selfCache;
+    Cache* selfAttCache;

    /* layer cache list */
-    Cache* contextCache;
+    Cache* enDeAttCache;

 public:
    /* constructor */

--- a/source/sample/transformer/T2TEmbedding.cpp
+++ b/source/sample/transformer/T2TEmbedding.cpp
@@ -62,7 +62,7 @@ void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, bool isEnc)
    LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
    LoadParamInt(argc, argv, "pad", &padIdx, 1);

-    InitTensor2D(&w, vSize, eSize, X_FLOAT, devID);
+    InitTensor2DV2(&w, vSize, eSize, X_FLOAT, devID);

    maxLength = maxLength + 1 + 1;
    DTYPE v = 1.0F/(float)sqrt((float)eSize);
@@ -80,7 +80,7 @@ make positional embeddings (of size eSize * length)
 */
 void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length, int padIdx)
 {
-    InitTensor2D(&posEmbeddingBase, length, eSize, X_FLOAT, devID);
+    InitTensor2DV2(&posEmbeddingBase, length, eSize, X_FLOAT, devID);

    float * data = new float[posEmbeddingBase.unitNum];

@@ -113,47 +113,47 @@ make the network
 */
 XTensor T2TEmbedder::Make(XTensor &input, int prevLen)
 {
-    /* assert padding index is 1 */
+    ///* assert padding index is 1 */

-    CheckNTErrors(input.order > 1, "Wrong input tensor size!");
-    CheckNTErrors(input.dimSize[input.order - 1] < maxLength, "The sequence is too long!");
-    CheckNTErrors(vSize > 0, "set vocabulary size by \"-vsize\"");
-    CheckNTErrors(eSize > 0, "set embedding size by \"-esize\"");
+    //CheckNTErrors(input.order > 1, "Wrong input tensor size!");
+    //CheckNTErrors(input.dimSize[input.order - 1] < maxLength, "The sequence is too long!");
+    //CheckNTErrors(vSize > 0, "set vocabulary size by \"-vsize\"");
+    //CheckNTErrors(eSize > 0, "set embedding size by \"-esize\"");

+    //
+    //XTensor wordEmbedding, position, posEmbedding;
+    //InitTensor(&position, &input);

-    XTensor wordEmbedding, position, posEmbedding;
-    InitTensor(&position, &input);
+    //int* posData = new int[input.unitNum];

-    int* posData = new int[input.unitNum];
+    //XTensor inputCPU;
+    //InitTensorOnCPU(&inputCPU, &input);
+    //_CopyValues(&input, &inputCPU);

-    XTensor inputCPU;
-    InitTensorOnCPU(&inputCPU, &input);
-    _CopyValues(&input, &inputCPU);

+    //for (int i = 0; i < inputCPU.GetDim(0); i++) {
+    //    int startNoPad = 2 + prevLen - 1;
+    //    int* p = ((int*)inputCPU.data) + i * inputCPU.GetDim(1);
+    //    for (int j = 0; j < inputCPU.GetDim(1); j++) {
+    //        if (p[j] == 1) {
+    //            posData[i * inputCPU.GetDim(1) + j] = 1;
+    //        }
+    //        else {
+    //            posData[i * inputCPU.GetDim(1) + j] = startNoPad++;
+    //        }
+    //    }
+    //}

-    for (int i = 0; i < inputCPU.GetDim(0); i++) {
-        int startNoPad = 2 + prevLen - 1;
-        int* p = ((int*)inputCPU.data) + i * inputCPU.GetDim(1);
-        for (int j = 0; j < inputCPU.GetDim(1); j++) {
-            if (p[j] == 1) {
-                posData[i * inputCPU.GetDim(1) + j] = 1;
-            }
-            else {
-                posData[i * inputCPU.GetDim(1) + j] = startNoPad++;
-            }
-        }
-    }
+    //position.SetData(posData, position.unitNum);
+    //delete[] posData;

-    position.SetData(posData, position.unitNum);
-    delete[] posData;
-
-    /* we make positional embeddings first */
-    if(true){
-        posEmbedding = Gather(posEmbeddingBase, position);
-    }
+    ///* we make positional embeddings first */
+    //if(true){
+    //    posEmbedding = Gather(posEmbeddingBase, position);
+    //}

    /* then we make word embeddings */
-
+    XTensor wordEmbedding;
    wordEmbedding = Gather(w, input);

    wordEmbedding = Linear(wordEmbedding, (float)sqrt((float)eSize));

--- a/source/sample/transformer/T2TEmbedding.h
+++ b/source/sample/transformer/T2TEmbedding.h
@@ -29,7 +29,7 @@ using namespace nts;
 namespace transformer
 {

-#define DEFAULT_EMBEDDING_SIZE 512
+#define DEFAULT_EMBEDDING_SIZE 128

 /* 
 embedding (of word at position i):

--- a/source/sample/transformer/T2TEncoder.cpp
+++ b/source/sample/transformer/T2TEncoder.cpp
@@ -34,7 +34,7 @@ AttEncoder::AttEncoder()
    attentions = NULL;
    fnns = NULL;
    attLayerNorms = NULL;
-    encodeLayerNorm = NULL;
+    encoderLayerNorm = NULL;
 }

 /* de-constructor */
@@ -43,7 +43,7 @@ AttEncoder::~AttEncoder()
    delete[] attentions;
    delete[] fnns;
    delete[] attLayerNorms;
-    delete encodeLayerNorm;
+    delete encoderLayerNorm;
 }

 /* 
@@ -61,7 +61,7 @@ void AttEncoder::InitModel(int argc, char ** argv,
    devID = myDevID;
    ignored = myIgnored;
    
-    LoadParamInt(argc, argv, "nlayer", &nlayer, 35);
+    LoadParamInt(argc, argv, "nlayer", &nlayer, 20);
    LoadParamInt(argc, argv, "hsize", &hSize, DEFAULT_EMBEDDING_SIZE);
    LoadParamInt(argc, argv, "esize", &eSize, DEFAULT_EMBEDDING_SIZE);
    LoadParamInt(argc, argv, "vsize", &vSize, 34040);
@@ -76,7 +76,7 @@ void AttEncoder::InitModel(int argc, char ** argv,
    attentions = new T2TAttention[nlayer];
    fnns = new T2TFNN[nlayer];
    attLayerNorms = new T2TLN[nlayer];
-    encodeLayerNorm = new T2TLN;
+    encoderLayerNorm = new T2TLN;

    /* initialize the stacked layers */
    for(int i = 0; i < nlayer; i++){
@@ -84,7 +84,7 @@ void AttEncoder::InitModel(int argc, char ** argv,
        fnns[i].InitModel(argc, argv, myDevID);
        attLayerNorms[i].InitModel(argc, argv, myDevID);
    }
-    encodeLayerNorm->InitModel(argc, argv, myDevID);
+    encoderLayerNorm->InitModel(argc, argv, myDevID);
 }

 /* 
@@ -123,13 +123,9 @@ XTensor AttEncoder::Make(XTensor &input, XTensor *mask, XTensor &maskEncDec, boo

        /* fnn */
        x = fnns[i].Make(res, isTraining);
-
    }

-    x = encodeLayerNorm->Make(x);
-
-    x.SetName(ENCODING_NAME);
-    input.SetName(ENCODING_INPUT_NAME);
+    x = encoderLayerNorm->Make(x);

    return x;
 }

--- a/source/sample/transformer/T2TEncoder.h
+++ b/source/sample/transformer/T2TEncoder.h
@@ -93,11 +93,11 @@ public:
    /* attention model of each layer */
    T2TAttention * attentions;

-    /* layer normalization for attention */
+    /* layer normalizations for attention */
    T2TLN * attLayerNorms;

    /* layer normalization for encoder */
-    T2TLN * encodeLayerNorm;
+    T2TLN * encoderLayerNorm;

    /* input tensor of the encoder */
    XTensor * input;

--- a/source/sample/transformer/T2TFNN.cpp
+++ b/source/sample/transformer/T2TFNN.cpp
@@ -15,7 +15,7 @@
 * limitations under the License.
 */

-/*
+ /*
  * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-31
  */

@@ -48,7 +48,7 @@ initialize the model
 >> argv - list of pointers to the arguments
 >> myDevID - device id
 */
-void T2TFNN::InitModel(int argc, char ** argv, int myDevID)
+void T2TFNN::InitModel(int argc, char** argv, int myDevID)
 {
    devID = myDevID;

@@ -56,14 +56,14 @@ void T2TFNN::InitModel(int argc, char ** argv, int myDevID)

    LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
    LoadParamInt(argc, argv, "d", &outSize, DEFAULT_EMBEDDING_SIZE);
-    LoadParamInt(argc, argv, "fnnh", &hSize, outSize * 4);
+    LoadParamInt(argc, argv, "fnnh", &hSize, outSize * 8);
    LoadParamFloat(argc, argv, "fnnminmax", &minmax, 0.1F);
    LoadParamFloat(argc, argv, "dropoutfnn", &dropoutP, 0);

-    InitTensor2DV2(&w1, hSize, inSize, X_FLOAT, devID);
+    InitTensor2DV2(&w1,  inSize, hSize, X_FLOAT, devID);
    InitTensor1DV2(&b1, hSize, X_FLOAT, devID);

-    InitTensor2DV2(&w2, outSize, hSize, X_FLOAT, devID);
+    InitTensor2DV2(&w2, hSize, outSize,  X_FLOAT, devID);
    InitTensor1DV2(&b2, outSize, X_FLOAT, devID);

    fnnLayerNorm.InitModel(argc, argv, myDevID);
@@ -84,19 +84,19 @@ y = max(0, x * w1 + b1) * w2 + b2
 >> input - the input tensor
 >> return - the output tensor
 */
-XTensor T2TFNN::Make(XTensor &input, bool isTraining)
+XTensor T2TFNN::Make(XTensor& input, bool isTraining)
 {
    XTensor t1;

    /* t1 = max(0, x * w1 + b1) */
-    t1 = Rectify(MulAndShift(fnnLayerNorm.Make(input), X_NOTRANS, w1, X_TRANS, b1));
+    t1 = Rectify(MulAndShift(fnnLayerNorm.Make(input), w1, b1));

-    if(isTraining && dropoutP > 0)
+    if (isTraining && dropoutP > 0)
        t1 = Dropout(t1, dropoutP);

    /* result = t1 * w2 + b2 */
    XTensor res;
-    res = MulAndShift(t1, X_NOTRANS, w2, X_TRANS, b2);
+    res = MulAndShift(t1, w2, b2);
    _SumMe(&res, &input);
    return  res;
 }

--- a/source/sample/transformer/T2TLayerNormal.cpp
+++ b/source/sample/transformer/T2TLayerNormal.cpp
@@ -53,8 +53,8 @@ void T2TLN::InitModel(int argc, char ** argv, int myDevID)
    d = 0;
    LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);

-    InitTensor1D(&w, d, X_FLOAT, devID);
-    InitTensor1D(&b, d, X_FLOAT, devID);
+    InitTensor1DV2(&w, d, X_FLOAT, devID);
+    InitTensor1DV2(&b, d, X_FLOAT, devID);
 }

 /*
@@ -78,7 +78,7 @@ XTensor T2TLN::Make(XTensor &input)
    mean = ReduceMean(x, x.order - 1);

    /* \sigma = (sum_i (x_i - \mu)^2)/m */
-    variance = ReduceVariance(x, x.order - 1, mean);
+    variance = ReduceVariance(x, x.order - 1, mean) + 1e-5F;

    /* standard = sqrt(variance) */
    standard = Power(variance, 0.5F);

--- a/source/sample/transformer/T2TModel.cpp
+++ b/source/sample/transformer/T2TModel.cpp
--- a/source/sample/transformer/T2TModel.h
+++ b/source/sample/transformer/T2TModel.h
@@ -103,7 +103,7 @@ public:
    /* read the parameters */
    void Read(const char * fn);
 };
-void FastRead(XTensor* x, FILE* f);
+
 }

 #endif
--- a/source/sample/transformer/T2TOutput.cpp
+++ b/source/sample/transformer/T2TOutput.cpp
@@ -56,13 +56,11 @@ void T2TOutput::InitModel(int argc, char ** argv, int myDevID)
    LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
    LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
    LoadParamInt(argc, argv, "d", &hSize, DEFAULT_EMBEDDING_SIZE);
-    LoadParamFloat(argc, argv, "outputminmax", &minmax, 0.08F);

-    InitTensor2D(&w, hSize, vSize, X_FLOAT, devID);
+    InitTensor2DV2(&w, vSize, hSize, X_FLOAT, devID);
 }


-
 /* 
 make the network (redefined output tensor) 
 >> input - input tensor
@@ -72,9 +70,7 @@ void T2TOutput::Make(XTensor &input, XTensor &output)
 {
    XTensor &x = input;

-    output = LogSoftmax(MMul(x, X_NOTRANS, w, X_NOTRANS), -1);
-
-    output.SetName(OUTPUT_NAME);
+    output = LogSoftmax(MMul(x, X_NOTRANS, w, X_TRANS), -1);
 }

 }
--- a/source/sample/transformer/T2TPredictor.cpp
+++ b/source/sample/transformer/T2TPredictor.cpp
@@ -15,7 +15,7 @@
 * limitations under the License.
 */

-/*
+ /*
  * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-13
  */

@@ -38,7 +38,7 @@ T2TStateBundle::T2TStateBundle()
 /* de-constructor */
 T2TStateBundle::~T2TStateBundle()
 {
-    if(states != NULL)
+    if (states != NULL)
        delete[] states;
 }

@@ -50,12 +50,12 @@ void T2TStateBundle::MakeStates(int num)
 {
    CheckNTErrors(num > 0, "invalid number");

-    if(states != NULL)
+    if (states != NULL)
        delete[] states;

    states = new T2TState[num];

-    for(int i = 0; i < num; i++){
+    for (int i = 0; i < num; i++) {
        states[i].prediction = -1;
        states[i].pid = T2T_PID_EMPTY;
        states[i].isEnd = false;
@@ -74,7 +74,7 @@ void T2TStateBundle::MakeStates(int num)
 /* constructor */
 T2TPredictor::T2TPredictor()
 {
-    startSymbol = -1;
+    startSymbol = 2;
 }

 /* de-constructor */
@@ -90,29 +90,36 @@ create an initial state
 >> beamSize - beam size
 >> state - the state to be initialized
 */
-void T2TPredictor::Create(T2TModel * model, XTensor * top, const XTensor * input, int beamSize, T2TStateBundle * state)
+void T2TPredictor::Create(T2TModel* model, XTensor* top, const XTensor* input, int beamSize, T2TStateBundle* state)
 {
    int dims[MAX_TENSOR_DIM_NUM];
    for (int i = 0; i < input->order - 1; i++)
        dims[i] = input->GetDim(i);
    dims[input->order - 1] = beamSize;

-    InitTensor(&state->probPath, input->order, dims, X_FLOAT, input->devID);
-    InitTensor(&state->nstep, input->order, dims, X_FLOAT, input->devID);
-    InitTensor(&state->endMark, input->order, dims, X_INT, input->devID);
+    InitTensorV2(&state->probPath, input->order, dims, X_FLOAT, 1.0F, input->devID);
+    InitTensorV2(&state->nstep, input->order, dims, X_FLOAT, 1.0F, input->devID);
+    InitTensorV2(&state->endMark, input->order, dims, X_INT, 1.0F, input->devID);

-    float* data = new float[state->probPath.unitNum];
+    /*float* data = new float[state->probPath.unitNum];
    for (int i = 0; i < state->probPath.unitNum; ++i) {
        data[i] = -1e20F;
        if (i % beamSize == 0)
            data[i] = 0;
    }
    state->probPath.SetData(data, state->probPath.unitNum);
+    delete[] data;*/
+
+    SetDataFixed(state->probPath, -1e9F);
+    for (int i = 0; i < state->probPath.unitNum; ++i) {
+        if (i % beamSize == 0)
+            state->probPath.Set(0.0F, i);
+    }
    
    state->nstep.SetZeroAll();
    state->endMark.SetZeroAll();

-    delete[] data;
+    
    state->stateNum = 0;
 }

@@ -133,7 +140,7 @@ read a state
             2) probablities of hypotheses
             3) parts of the network for expanding toward the next state
 */
-void T2TPredictor::Read(T2TModel * model, T2TStateBundle * state)
+void T2TPredictor::Read(T2TModel* model, T2TStateBundle* state)
 {
    m = model;
    s = state;
@@ -147,8 +154,7 @@ predict the next state
 >> paddingEnc - padding of the encoder
 >>> isStart - is the start or not
 */
-void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
-                           XTensor * inputEnc, XTensor * paddingEnc, bool isStart)
+void T2TPredictor::Predict(T2TStateBundle* next, XTensor* encoding, XTensor* inputEnc, XTensor* paddingEnc, bool isStart)
 {
    int dims[MAX_TENSOR_DIM_NUM];

@@ -159,40 +165,41 @@ void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
    XTensor first;

    CheckNTErrors(inputEnc->order >= 2, "Wrong order of the tensor!");
-    for(int i = 0; i < inputEnc->order - 1; i++)
+    for (int i = 0; i < inputEnc->order - 1; i++)
        dims[i] = inputEnc->GetDim(i);
    dims[inputEnc->order - 1] = 1;

-    InitTensor(&first, inputEnc->order, dims, X_INT, inputEnc->devID);
+    InitTensorV2(&first, inputEnc->order, dims, X_INT, 1.0F, inputEnc->devID);
    SetDataFixedInt(first, startSymbol);

    /* add a new word into the input sequence of the decoder side */
    if (isStart) {
        inputDec = Identity(first);
    }
-    else{
+    else {
        /* only pass one step to the decoder */
        inputDec = GetLastPrediction(s);
        inputDec.SetDevice(inputEnc->devID);
    }
    
+
    /* prediction probabilities */
-    XTensor &output = next->prob;
+    XTensor& output = next->prob;
    XTensor decoding;

-    for(int i = 0; i < inputDec.order - 1; i++)
+    for (int i = 0; i < inputDec.order - 1; i++)
        dims[i] = inputDec.GetDim(i);
    dims[inputDec.order - 1] = inputDec.GetDim(-1);

    XTensor paddingDec;
-    InitTensor(&paddingDec, inputDec.order, dims, X_INT, paddingEnc->devID);
+    InitTensorV2(&paddingDec, inputDec.order, dims, X_INT, 1.0F, paddingEnc->devID);
    SetDataFixedInt(paddingDec, 1);

    XTensor maskDec;
    XTensor maskEncDec;

    /* decoder mask */
-    m->MakeMTMaskDec(*inputEnc, inputDec, *paddingEnc, paddingDec, maskDec, maskEncDec, 0);
+    //m->MakeMTMaskDec(*inputEnc, inputDec, *paddingEnc, paddingDec, maskDec, maskEncDec, 0);

    /* make the decoding network */
    decoding = m->decoder->Make(inputDec, *encoding, NULL, maskEncDec, false);
@@ -207,34 +214,34 @@ void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
 generate paths up to the states of the current step
 >> state - state bundle of the current step
 */
-XTensor T2TPredictor::GeneratePaths(T2TStateBundle * state)
+XTensor T2TPredictor::GeneratePaths(T2TStateBundle* state)
 {
    CheckNTErrors(state->stateNum >= 0, "Illegal state!");

    int distance = -1;

-    for(int i = 0; i < state->stateNum; i++){
-        T2TState * cur = state->states + i;
+    for (int i = 0; i < state->stateNum; i++) {
+        T2TState* cur = state->states + i;
        int nsteps = 0;

-        while(cur != NULL){
+        while (cur != NULL) {
            nsteps++;
            cur = cur->last;
        }

-        if(nsteps > distance)
+        if (nsteps > distance)
            distance = nsteps;
    }

    XTensor path;
-    InitTensor2D(&path, state->stateNum, distance, X_INT);
+    InitTensor2DV2(&path, state->stateNum, distance, X_INT);
    path.SetZeroAll();

-    for(int i = 0; i < state->stateNum; i++){
-        T2TState * cur = state->states + i;
+    for (int i = 0; i < state->stateNum; i++) {
+        T2TState* cur = state->states + i;
        int nsteps = 0;

-        while(cur != NULL){
+        while (cur != NULL) {
            nsteps++;
            path.Set2DInt(cur->prediction, i, distance - nsteps);
            cur = cur->last;
@@ -253,7 +260,7 @@ XTensor T2TPredictor::GetLastPrediction(T2TStateBundle* state)
    CheckNTErrors(state->stateNum >= 0, "Illegal state!");

    XTensor lastPred;
-    InitTensor2D(&lastPred, state->stateNum, 1, X_INT);
+    InitTensor2DV2(&lastPred, state->stateNum, 1, X_INT);

    for (int i = 0; i < state->stateNum; i++) {
        T2TState* cur = state->states + i;

--- a/source/sample/transformer/T2TPredictor.h
+++ b/source/sample/transformer/T2TPredictor.h
@@ -15,7 +15,7 @@
 * limitations under the License.
 */

-/*
+ /*
  * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-13
  * This is the first source file I create in 2019 - new start!
  */
@@ -66,7 +66,7 @@ public:
    int nstep;

    /* pointer to the previous state */
-    T2TState * last;
+    T2TState* last;
 };

 /* a bundle of states */
@@ -95,7 +95,7 @@ public:
    XTensor nstep;

    /* list of states */
-    T2TState * states;
+    T2TState* states;

    /* number of states */
    int stateNum;
@@ -123,10 +123,10 @@ class T2TPredictor
 {
 private:
    /* pointer to the transformer model */
-    T2TModel * m;
+    T2TModel* m;

    /* current state */
-    T2TStateBundle * s;
+    T2TStateBundle* s;

    /* start symbol */
    int startSymbol;
@@ -139,19 +139,19 @@ public:
    ~T2TPredictor();

    /* create an initial state */
-    void Create(T2TModel * model, XTensor * top, const XTensor * input, int beamSize, T2TStateBundle * state);
+    void Create(T2TModel* model, XTensor* top, const XTensor* input, int beamSize, T2TStateBundle* state);

    /* set the start symbol */
    void SetStartSymbol(int symbol);

    /* read a state */
-    void Read(T2TModel * model, T2TStateBundle * state);
+    void Read(T2TModel* model, T2TStateBundle* state);

    /* predict the next state */
-    void Predict(T2TStateBundle * next, XTensor * encoding, XTensor * inputEnc, XTensor * paddingEnc, bool isStart);
+    void Predict(T2TStateBundle* next, XTensor* encoding, XTensor* inputEnc, XTensor* paddingEnc, bool isStart);

    /* generate paths up to the states of the current step */
-    XTensor GeneratePaths(T2TStateBundle * state);
+    XTensor GeneratePaths(T2TStateBundle* state);

    /* get the predictions of the previous step */
    XTensor GetLastPrediction(T2TStateBundle* state);

--- a/source/sample/transformer/T2TSearch.cpp
+++ b/source/sample/transformer/T2TSearch.cpp
--- a/source/sample/transformer/T2TSearch.h
+++ b/source/sample/transformer/T2TSearch.h
@@ -15,7 +15,7 @@
 * limitations under the License.
 */

-/*
+ /*
  * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
  */

@@ -51,10 +51,10 @@ private:
    int batchSize;

    /* we keep the final hypotheses in a heap for each sentence in the batch. */
-    XHeap<MIN_HEAP, float> * fullHypos;
+    XHeap<MIN_HEAP, float>* fullHypos;

    /* array of the end symbols */
-    int * endSymbols;
+    int* endSymbols;

    /* number of the end symbols */
    int endSymbolNum;
@@ -70,40 +70,40 @@ public:
    ~T2TSearch();

    /* initialize the model */
-    void Init(int argc, char ** argv);
+    void Init(int argc, char** argv);

    /* search for the most promising states */
-    void Search(T2TModel * model, XTensor * input, XTensor * padding, XTensor * output);
+    void Search(T2TModel* model, XTensor* input, XTensor* padding, XTensor* output);

    /* preparation */
-    void Prepare(int myBatchSize,int myBeamSize);
+    void Prepare(int myBatchSize, int myBeamSize);

    /* compute the model score for each hypothesis */
-    void Score(T2TStateBundle * prev, T2TStateBundle * beam);
+    void Score(T2TStateBundle* prev, T2TStateBundle* beam);

    /* generate token indices via beam pruning */
-    void Generate(T2TStateBundle * beam);
+    void Generate(T2TStateBundle* beam);

    /* expand the search graph */
-    void Expand(T2TStateBundle * prev, T2TStateBundle * beam);
+    void Expand(T2TStateBundle* prev, T2TStateBundle* beam);

    /* collect hypotheses with ending symbol */
-    void Collect(T2TStateBundle * beam);
+    void Collect(T2TStateBundle* beam);

    /* fill the hypotheis heap with incomplete hypothses */
-    void FillHeap(T2TStateBundle * beam);
+    void FillHeap(T2TStateBundle* beam);

    /* save the output sequences in a tensor */
-    void Dump(XTensor * output);
+    void Dump(XTensor* output);

    /* check if the token is an end symbol */
    bool IsEnd(int token);

    /* set end symbols for search */
-    void SetEnd(const int * tokens, const int tokenNum);
+    void SetEnd(const int* tokens, const int tokenNum);

    /* make a mask to prevent duplicated entries in beam expansion for the first position */
-    XTensor MakeFirstMask(T2TStateBundle * beam);
+    XTensor MakeFirstMask(T2TStateBundle* beam);
 };

 }

--- a/source/sample/transformer/T2TTester.cpp
+++ b/source/sample/transformer/T2TTester.cpp
@@ -15,7 +15,7 @@
 * limitations under the License.
 */

-/*
+ /*
  * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
  */

@@ -44,7 +44,7 @@ T2TTester::~T2TTester()
 }

 /* initialize the model */
-void T2TTester::Init(int argc, char ** argv)
+void T2TTester::Init(int argc, char** argv)
 {
    LoadParamInt(argc, argv, "vsize", &vSize, 34040);
    LoadParamInt(argc, argv, "vsizetgt", &vSizeTgt, vSize);
@@ -60,7 +60,7 @@ test the model
 >> ofn - output data file
 >> model - model that is trained
 */
-void T2TTester::Test(const char * fn, const char * ofn, T2TModel * model)
+void T2TTester::Test(const char* fn, const char* ofn, T2TModel* model)
 {
    int wc = 0;
    int wordCount = 0;
@@ -94,8 +94,8 @@ void T2TTester::Test(const char * fn, const char * ofn, T2TModel * model)
        count++;
        wordCount = 0;
        for (int i = 0; i < model->decoder->nlayer; ++i) {
-            model->decoder->selfCache[i].miss = true;
-            model->decoder->contextCache[i].miss = true;
+            model->decoder->selfAttCache[i].miss = true;
+            model->decoder->enDeAttCache[i].miss = true;
        }

        vector<int> indices = batchLoader.LoadBatch(&batchEnc, &paddingEnc, sentBatch, devID);
@@ -103,14 +103,14 @@ void T2TTester::Test(const char * fn, const char * ofn, T2TModel * model)
        XTensor output;

        seacher.Search(model, &batchEnc, &paddingEnc, &output);
-
+        output.Dump(stderr);
        for (int i = 0; i < indices.size(); ++i) {
            Result res;
            XTensor sent, srcIdx, tgtIdx;
-            InitTensor1D(&srcIdx, 1, X_INT, output.devID);
-            int idx[]{i};
+            InitTensor1DV2(&srcIdx, 1, X_INT, output.devID);
+            int idx[]{ i };
            srcIdx.SetData(idx, 1);
-            InitTensor(&tgtIdx, &srcIdx);
+            InitTensorV2(&tgtIdx, &srcIdx);
            SetAscendingOrder(tgtIdx, 0);

            sent = CopyIndexed(output, 0, srcIdx, tgtIdx);
@@ -153,7 +153,7 @@ dump the result into the file
 >> file - data file
 >> output - output tensor
 */
-void T2TTester::Dump(FILE * file, XTensor * output)
+void T2TTester::Dump(FILE* file, XTensor* output)
 {
    int seqLength = output->GetDim(-1);


--- a/source/sample/transformer/T2TTester.h
+++ b/source/sample/transformer/T2TTester.h
@@ -15,7 +15,7 @@
 * limitations under the License.
 */

-/*
+ /*
  * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
  * A week with no trips :)
  */
@@ -56,13 +56,13 @@ public:
    ~T2TTester();

    /* initialize the model */
-    void Init(int argc, char ** argv);
+    void Init(int argc, char** argv);

    /* test the model */
-    void Test(const char * fn, const char * ofn, T2TModel * model);
+    void Test(const char* fn, const char* ofn, T2TModel* model);

    /* dump the result into the file */
-    void Dump(FILE * file, XTensor * output);
+    void Dump(FILE* file, XTensor* output);
 };

 }

--- a/source/sample/transformer/Transformer.h
+++ b/source/sample/transformer/Transformer.h
@@ -38,7 +38,7 @@ namespace transformer
 {

 /* entrance of the program */
-int TransformerMain(int argc, const char ** argv);
+int TransformerMain(int argc, const char** argv);

 }


--- a/source/tensor/XList.cpp
+++ b/source/tensor/XList.cpp
@@ -28,6 +28,7 @@
 #include "XList.h"
 #include "XGlobal.h"

+
 /* the nts (NiuTrans.Tensor) namespace */
 namespace nts {

@@ -363,6 +364,8 @@ template struct TensorListBase<long>;
 template struct TensorListBase<float>;
 template struct TensorListBase<short>;
 template struct TensorListBase<XTensor*>;
+template struct TensorListBase<uint64_t>;
 template struct TensorListBase<void*>;

+
 } /* end of the nts (NiuTrans.Tensor) namespace */
\ No newline at end of file
--- a/source/tensor/XList.h
+++ b/source/tensor/XList.h
@@ -26,6 +26,8 @@
 #include "XMem.h"
 #include "XGlobal.h"

+#include <cstdint>
+
 #ifndef __TensorList_H__
 #define __TensorList_H__

@@ -118,7 +120,14 @@ public:
    void Shuffle(int nround = 10, int beg = -1, int len = 0);

    /* short */
-    T& operator[] (int i) { return GetItem(i); };
+    T& operator[] (int i) { 
+        CheckNTErrors(i >= -count && i < count, "Index of a list item is out of scope!");
+        CheckNTErrors(count > 0, "Cannt index the item in an empty list!");
+        if (i < 0)
+            return items[count + i];
+        else
+            return items[i];
+    };
    T& Get(int i) { return GetItem(i); };
    void Set(int i, T item) { SetItem(i, item); };
 };
@@ -132,7 +141,7 @@ typedef TensorListBase<char*> StrList;
 typedef TensorListBase<long> LongList;
 typedef TensorListBase<float> FloatList;
 typedef TensorListBase<short> ShortList;
-
+typedef TensorListBase<uint64_t> UInt64List;
 typedef TensorListBase<XTensor*> TensorList;

 } /* end of the nts (NiuTrans.Tensor) namespace */

--- a/source/tensor/XTensor.cpp
+++ b/source/tensor/XTensor.cpp
--- a/source/tensor/core/reduce/ReduceMax.cpp
+++ b/source/tensor/core/reduce/ReduceMax.cpp
@@ -86,7 +86,7 @@ void _funcCPUName(const XTensor * input, XTensor * output, int dim)             
                    vecBuf[j] = VectorBuffer::loadu((DTYPE*)(ip)+j * vecBufLength);                                 \
                }                                                                                                   \
                for (int j = 1; j < strideNum / 32; j++) {                                                          \
-                    const DTYPE* ptr = (DTYPE*)(ip + j * vecBufLength);                                             \
+                    const DTYPE* ptr = (DTYPE*)(ip + j * 4 * vecBufLength);                                         \
                    vecBuf[0] = vecBuf[0]._vectorOp(VectorBuffer::loadu(ptr + 0 * vecBufLength));                   \
                    vecBuf[1] = vecBuf[1]._vectorOp(VectorBuffer::loadu(ptr + 1 * vecBufLength));                   \
                    vecBuf[2] = vecBuf[2]._vectorOp(VectorBuffer::loadu(ptr + 2 * vecBufLength));                   \
@@ -106,7 +106,7 @@ void _funcCPUName(const XTensor * input, XTensor * output, int dim)             
        else {                                                                                                      \
            /* data is separated */                                                                                 \
            for(int i = 0; i < blockNum; i++){                                                                      \
-                for(int j = 0; j < input->dimSize[input->order - 1] / 32; j++){                                     \
+                for(int j = 0; j < stride / 32; j++){                                                               \
                    DTYPE * ip = (DTYPE*)input->data + blockSize * i;                                               \
                    DTYPE * op = (DTYPE*)output->data + stride * i;                                                 \
                    VectorBuffer vecBuf[4];                                                                         \

--- a/source/tensor/core/reduce/ReduceMean.cpp
+++ b/source/tensor/core/reduce/ReduceMean.cpp
@@ -42,7 +42,7 @@ void _ReduceMean(const XTensor * input, XTensor * output, int dim)
    int num = input->dimSize[dim];

    _ReduceSum(input, output, dim);
-    _ScaleAndShiftMe(output, (DTYPE)1/num, 0);
+    _ScaleAndShiftMe(output, 1.0F/(DTYPE)(num), 0);
 }

 /* 

--- a/source/tensor/core/reduce/ReduceSum.cpp
+++ b/source/tensor/core/reduce/ReduceSum.cpp
@@ -105,7 +105,7 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor 
                        vecBuf[j] = VectorBuffer::loadu((DTYPE*)(ip) + j * vecBufLength, isExp, power, bias);
                    }
                    for(int j = 1; j < strideNum / 32; j++){
-                        const DTYPE* ptr = (DTYPE*)(ip + j * vecBufLength);
+                        const DTYPE* ptr = (DTYPE*)(ip + (j * 4) * vecBufLength);
                        vecBuf[0] = vecBuf[0] + VectorBuffer::loadu(ptr + 0 * vecBufLength, isExp, power, bias);
                        vecBuf[1] = vecBuf[1] + VectorBuffer::loadu(ptr + 1 * vecBufLength, isExp, power, bias);
                        vecBuf[2] = vecBuf[2] + VectorBuffer::loadu(ptr + 2 * vecBufLength, isExp, power, bias);
@@ -122,7 +122,7 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor 
            } else{
                //data is separated
                for(int i = 0; i < blockNum; i++){
-                    for(int j = 0; j < input->dimSize[input->order - 1] / 32; j++){
+                    for(int j = 0; j < stride / 32; j++){
                        DTYPE * ip = (DTYPE*)input->data + blockSize * i;
                        DTYPE * op = (DTYPE*)output->data + stride * i;
                        DTYPE * sp = shift != NULL ? (DTYPE*)shift->data + stride * i : NULL;
@@ -133,8 +133,7 @@ void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor 
                        }
                        VectorBuffer vecBuf[4];
                        for(int k = 0; k < 4; k++){
-                            vecBuf[k] = VectorBuffer::loadu((DTYPE*)(ip) + (j * 4 + k) * 32 / sizeof(DTYPE), isExp, power, bias + j * 32 / sizeof(DTYPE));
-
+                            vecBuf[k] = VectorBuffer::loadu((DTYPE*)(ip) + (j * 4 + k) * 32 / sizeof(DTYPE), isExp, power, bias + k * 32 / sizeof(DTYPE));
                        }
                        for(int k = 1; k < strideNum; k++){
                            DTYPE * ptr = ip + k * stride + (j * 4) * vecBufLength;