Implement MulAndShift and fix bugs

3bf085db · 姜雨帆 · 411faffa · 3bf085db · 3bf085db · 3bf085db
Commit 3bf085db authored Mar 13, 2019 by 姜雨帆
--- a/source/network/XBackwardMath.cpp
+++ b/source/network/XBackwardMath.cpp
@@ -99,6 +99,8 @@ void XMathGrad::MakeGrad(XTensor * node, bool isEfficient)
        GradReduceSumSquared(node, isEfficient);
    else if(operID == REDUCE_REDUCEVARIANCE)
        GradReduceVariance(node, isEfficient);
+    else if (operID == MATH_MULANDSHIFT)
+        GradMulAndShift(node, isEfficient);
    else{
        ShowNTErrors("TODO!");
    }
@@ -1487,4 +1489,126 @@ void XMathGrad::GradReduceVariance(XTensor * node, bool isEfficient)
    node->visitMark = NODE_FINISHED;
 }
+/*
+gradient for operation MulAndShift
+for c = matmul(x, w) + b 
+we have
+dE/dx = dE/dc * w^T
+dE/dw = x^T * dE/dc
+dE/db = dE/dc.reduce(0,...,n-1,n+1,...)
+i.e., dE/dc summed over every dimension except n, where n is the
+dimension index stored as the first parameter of the edge
+>> node - the node (c) for backward computation
+>> isEfficient - indicates whether the computation is in
+an efficient manner; if so, the gradient of an input is only
+prepared when its isGrad flag is set
+*/
+void XMathGrad::GradMulAndShift(XTensor * node, bool isEfficient)
+{
+    XLink &income = node->income;
+    CheckNTErrors(income.tailNum == 3, "wrong input tensor number")
+    XTensor * x = income.tails[0];
+    XTensor * w = income.tails[1];
+    XTensor * b = income.tails[2];
+    /* parameters stored on the edge: the dimension n and two transposition flags */
+    int n = income.GetParamInt(0);
+    MATRIX_TRANS_TYPE transW = income.GetParamTrans(1);
+    MATRIX_TRANS_TYPE transX = income.GetParamTrans(2);
+    if (!isEfficient || w->isGrad)
+        XNoder::MakeGrad(w);
+    if (!isEfficient || x->isGrad)
+        XNoder::MakeGrad(x);
+    if (!isEfficient || b->isGrad)
+        XNoder::MakeGrad(b);
+    int order = node->order;
+    int dimSize[MAX_TENSOR_DIM_NUM];
+    memcpy(dimSize, node->dimSize, sizeof(int) * node->order);
+    /* compute dE/db. b->grad is only prepared above when b needs a gradient,
+       so the step is skipped otherwise instead of touching b->grad */
+    if (!isEfficient || b->isGrad) {
+        if (n == order - 1) {
+            int reshapedSize[MAX_TENSOR_DIM_NUM];
+            reshapedSize[0] = node->unitNum / dimSize[order - 1];
+            reshapedSize[1] = dimSize[order - 1];
+            /* we reshape dE/dc to a matrix whose column number is equal to the
+            size of b. Then we can reduce the matrix into a row vector. */
+            node->grad->Reshape(2, reshapedSize);
+            XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
+            _ReduceSum(node->grad, bGradTMP, 0);
+            /* accumulate into the gradient that b already holds */
+            _Sum(bGradTMP, b->grad, b->grad);
+            DelTensorBuf(bGradTMP);
+            /* restore the original shape of dE/dc */
+            node->grad->Reshape(order, dimSize);
+        }
+        else {
+            int reshapedSize[MAX_TENSOR_DIM_NUM];
+            reshapedSize[0] = 1;
+            reshapedSize[1] = dimSize[n];
+            reshapedSize[2] = 1;
+            /* product of the sizes of the dimensions in front of n */
+            for (int i = 0; i < order; i++) {
+                if (i < n)
+                    reshapedSize[0] *= dimSize[i];
+            }
+            reshapedSize[2] = node->unitNum / (reshapedSize[0] * reshapedSize[1]);
+            /* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|.
+            Then reduce along with z and x to obtain dE/db. */
+            node->grad->Reshape(3, reshapedSize);
+            XTensor * interGrad = NewTensorBuf(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
+            _ReduceSum(node->grad, interGrad, 2);
+            XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
+            _ReduceSum(interGrad, bGradTMP, 0);
+            /* accumulate into the gradient that b already holds */
+            _Sum(bGradTMP, b->grad, b->grad);
+            DelTensorBuf(bGradTMP);
+            /* restore the original shape of dE/dc */
+            node->grad->Reshape(order, dimSize);
+            DelTensorBuf(interGrad);
+        }
+    }
+    /* compute dE/dx, dE/dw */
+    XTensor * c = node;
+    XTensor * dedc = node->grad;
+    XTensor * dedw = w->grad;
+    XTensor * dedx = x->grad;
+    if (x->order == 2 && w->order == 2)
+        GradMatrixMul(x, dedx, transX, w, dedw, transW, dedc, 1.0F, isEfficient);
+    else if (transX == X_NOTRANS && x->order > 2 && w->order == 2){
+        /* x, c and their gradients are viewed as matrices (all leading
+           dimensions merged into one) for the call and restored afterwards */
+        int orderBackupX = x->order;
+        int orderBackupC = c->order;
+        int dimsBackupX[MAX_TENSOR_DIM_NUM];
+        int dimsBackupC[MAX_TENSOR_DIM_NUM];
+        memcpy(dimsBackupX, x->dimSize, sizeof(int) * x->order);
+        memcpy(dimsBackupC, c->dimSize, sizeof(int) * c->order);
+        x->Reshape(x->unitNum / x->GetDim(-1), x->GetDim(-1));
+        c->Reshape(c->unitNum / c->GetDim(-1), c->GetDim(-1));
+        if (!isEfficient || x->isGrad)
+            dedx->Reshape(dedx->unitNum / dedx->GetDim(-1), dedx->GetDim(-1));
+        dedc->Reshape(dedc->unitNum / dedc->GetDim(-1), dedc->GetDim(-1));
+        GradMatrixMul(x, dedx, transX, w, dedw, transW, dedc, 1.0F, isEfficient);
+        x->Reshape(orderBackupX, dimsBackupX);
+        c->Reshape(orderBackupC, dimsBackupC);
+        if (!isEfficient || x->isGrad)
+            dedx->Reshape(orderBackupX, dimsBackupX);
+        dedc->Reshape(orderBackupC, dimsBackupC);
+    }
+    else{
+        /* other shapes are not supported yet; report it instead of
+           silently leaving dE/dx and dE/dw without this contribution */
+        ShowNTErrors("TODO!");
+    }
+    node->visitMark = NODE_FINISHED;
+}
 }
--- a/source/network/XBackwardMath.h
+++ b/source/network/XBackwardMath.h
@@ -168,6 +168,10 @@ private:
    /* gradient for reduceVariance */
    static
    void GradReduceVariance(XTensor * node, bool isEfficient);
+    /* gradient for operation */
+    static
+    void GradMulAndShift(XTensor * node, bool isEfficient);
 };
 }

--- a/source/sample/transformer/T2TAttention.cpp
+++ b/source/sample/transformer/T2TAttention.cpp
@@ -75,16 +75,19 @@ void T2TAttention::InitModel(int argc, char ** argv,
    InitTensor2D(&wq, d, dk, X_FLOAT, devID, mem);
    InitTensor2D(&wv, d, dv, X_FLOAT, devID, mem);
    InitTensor2D(&wa, d, d, X_FLOAT, devID, mem);
+    InitTensor2D(&wbig, d, 3 * d, X_FLOAT, devID, mem);
    float scale = 1.0F;
    float finfoutk = (float)sqrt(6.0F * scale/(d + dk));
    float finfoutv = (float)sqrt(6.0F * scale/(d + dv));
    float finfouta = (float)sqrt(6.0F * scale / (d + d));
+    float finfoutbig = (float)sqrt(6.0F * scale / (d + 3*d));
    wk.SetDataRand(-finfoutk, finfoutk);
    wq.SetDataRand(-finfoutk, finfoutk);
    wv.SetDataRand(-finfoutv, finfoutv);
    wa.SetDataRand(-finfouta, finfouta);
+    wbig.SetDataRand(-finfoutbig, finfoutbig);
 }
 /* 
@@ -98,16 +101,40 @@ make the network
 >> isTraining - indicates whether the model is used for training
 << return - multi-attention result
 */
-XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining)
+XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining, bool selfatt)
 {
    XTensor k2;
    XTensor q2;
    XTensor v2;
-    /* linear transofmration before self-attention */
+    if (selfatt){
-    k2 = MMul(k, wk);
-    q2 = MMul(q, wq);
+        XTensor con;
-    v2 = MMul(v, wv);
+        XList split;
+        con = MMul(k, wbig);
+        int d1 = con.GetDim(0);
+        int d2 = con.GetDim(1);
+        int d3 = con.GetDim(2) / 3;
+        InitTensor3D(&k2, d1, d2, d3, X_FLOAT, devID, mem);
+        InitTensor3D(&q2, d1, d2, d3, X_FLOAT, devID, mem);
+        InitTensor3D(&v2, d1, d2, d3, X_FLOAT, devID, mem);
+        split.Add(&q2);
+        split.Add(&k2);
+        split.Add(&v2);
+        Split(con, split, 2, 3);
+    }
+    else{
+        /* linear transofmration before self-attention */
+        k2 = MMul(k, wk);
+        q2 = MMul(q, wq);
+        v2 = MMul(v, wv);
+    }
    XTensor kheads;
    XTensor qheads;

--- a/source/sample/transformer/T2TAttention.h
+++ b/source/sample/transformer/T2TAttention.h
@@ -59,7 +59,9 @@ public:
    /* transformation after dot-product attention */
    XTensor wa;
+    XTensor wbig;
    /* size of transformed Q and K */
    int dk;
@@ -95,7 +97,7 @@ public:
                   int myDevID = -1, XMem * myMem = NULL);
    /* make the network */
-    XTensor Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining);
+    XTensor Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining, bool selfatt);
 };
 }

--- a/source/sample/transformer/T2TDecoder.cpp
+++ b/source/sample/transformer/T2TDecoder.cpp
@@ -21,6 +21,8 @@
 #include <math.h>
 #include "T2TDecoder.h"
+#include "T2TUtility.h"
+#include "T2TLayerNormal.h"
 #include "../../tensor/core/CHeader.h"
 namespace transformer
@@ -53,14 +55,38 @@ void AttDecoder::InitModel(int argc, char ** argv,
                           bool myIsMasked, int myIgnored, 
                           int myDevID, XMem * myMem)
 {
-    AttEncoder::InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
+    //AttEncoder::InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
+    devID = myDevID;
+    mem = myMem;
+    ignored = myIgnored;
+    LoadParamInt(argc, argv, "nlayer", &nlayer, 6);
+    LoadParamInt(argc, argv, "hsize", &hSize, DEFAULT_EMBEDDING_SIZE);
+    LoadParamInt(argc, argv, "esize", &eSize, DEFAULT_EMBEDDING_SIZE);
+    LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
+    LoadParamFloat(argc, argv, "dropout", &dropoutP, 0);
+    CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
+    CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsize\"");
+    /* embedding model */
+    embedder.InitModel(argc, argv, devID, mem, false);
+    attentions = new T2TAttention[nlayer];
+    fnns = new T2TFNN[nlayer];
+    attLayerNorms = new T2TLN[nlayer];
+    fnnLayerNorms = new T2TLN[nlayer];
    attentionsEnde = new T2TAttention[nlayer];
    attEndeLayerNorms = new T2TLN[nlayer];
    /* initialize the stacked layers */
-    for(int i = 0; i < nlayer; i++){
+    for (int i = 0; i < nlayer; i++) {
-        attentionsEnde[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
+        attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
+        fnns[i].InitModel(argc, argv, myDevID, myMem);
+        attLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
+        fnnLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
+        attentionsEnde[i].InitModel(argc, argv, true, myIgnored, myDevID, myMem);
        attEndeLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
    }
 }
@@ -93,7 +119,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
        /******************/
        /* self attention */
-        att = attentions[i].Make(x, x, x, mask, isTraining);
+        att = attentions[i].Make(x, x, x, mask, isTraining, true);
        /* dropout */
        if(isTraining && dropoutP > 0)
@@ -107,7 +133,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
        /*****************************/
        /* encoder-decoder attention */
-        ende = attentionsEnde[i].Make(outputEnc, x, outputEnc, maskEncDec, isTraining);
+        ende = attentionsEnde[i].Make(outputEnc, x, outputEnc, maskEncDec, isTraining, false);
        /* dropout */
        if(isTraining && dropoutP > 0)

--- a/source/sample/transformer/T2TDecoder.h
+++ b/source/sample/transformer/T2TDecoder.h
@@ -27,9 +27,56 @@
 namespace transformer
 {
-class AttDecoder : public AttEncoder
+class AttDecoder
 {
 public:
+    /* device id */
+    int devID;
+    /* memory pool */
+    XMem * mem;
+    /* layer number */
+    int nlayer;
+    /* hidden layer size of the FNN layer */
+    int hSize;
+    /* embedding size */
+    int eSize;
+    /* vocabulary size */
+    int vSize;
+    /* dropout probability */
+    DTYPE dropoutP;
+    /* some positions can be ignored in attention. this is useful in lm where the first position needs
+ *     special design for the attention model. */
+    int ignored;
+    /* embedding of word at each position */
+    T2TEmbedder embedder;
+    /* FNN model of each layer */
+    T2TFNN * fnns;
+    /* attention model of each layer */
+    T2TAttention * attentions;
+    /* layer normalization for fnn */
+    T2TLN * fnnLayerNorms;
+    /* layer normalization for attention */
+    T2TLN * attLayerNorms;
+    /* input tensor of the encoder */
+    XTensor * input;
+    /* output tensor of the encoder */
+    XTensor * output;
    /* encoder-decoder attention model of each layer */
    T2TAttention * attentionsEnde;

--- a/source/sample/transformer/T2TEmbedding.cpp
+++ b/source/sample/transformer/T2TEmbedding.cpp
@@ -48,12 +48,18 @@ initialize the model
 >> myDevID - device id
 >> myMem - the memory pool
 */
-void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
+void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem, bool isEnc)
 {
    devID = myDevID;
    mem = myMem;
-    LoadParamInt(argc, argv, "vsize", &vSize, -1);
+    if(isEnc){
+        LoadParamInt(argc, argv, "vsize", &vSize, -1);
+    }
+    else{
+        LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
+    }
+    //LoadParamInt(argc, argv, "vsize", &vSize, -1);
    LoadParamInt(argc, argv, "maxlen", &maxLength, 512);
    LoadParamInt(argc, argv, "d", &eSize, DEFAULT_EMBEDDING_SIZE);
    LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);

--- a/source/sample/transformer/T2TEmbedding.h
+++ b/source/sample/transformer/T2TEmbedding.h
@@ -71,7 +71,7 @@ public:
    ~T2TEmbedder();
    /* initialize the model */
-    void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL);
+    void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL, bool isEnc = true);
    /* make positional embeddings */
    void MakePosEmbedding(int eSize, int d, int length);

--- a/source/sample/transformer/T2TEncoder.cpp
+++ b/source/sample/transformer/T2TEncoder.cpp
@@ -114,7 +114,7 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
        XTensor res;
        /* self attention */
-        att = attentions[i].Make(x, x, x, mask, isTraining);
+        att = attentions[i].Make(x, x, x, mask, isTraining, true);
        /* dropout */
        if(isTraining && dropoutP > 0)

--- a/source/sample/transformer/T2TFNN.cpp
+++ b/source/sample/transformer/T2TFNN.cpp
@@ -89,13 +89,15 @@ XTensor T2TFNN::Make(XTensor &input, bool isTraining)
    XTensor t1;
    /* t1 = max(0, x * w1 + b1) */
-    t1 = Rectify(MMul(input, w1) + b1);
+    //t1 = Rectify(MMul(input, w1) + b1);
+    t1 = Rectify(MulAndShift(input, w1, b1));
    if(isTraining && dropoutP > 0)
        t1 = Dropout(t1, dropoutP);
    /* result = t1 * w2 + b2 */
-    return MMul(t1, w2) + b2;
+    //return MMul(t1, w2) + b2;
+    return MulAndShift(t1, w2, b2);
 }

--- a/source/sample/transformer/T2TModel.cpp
+++ b/source/sample/transformer/T2TModel.cpp
@@ -219,7 +219,7 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
        dims[i + 1] = inputDec.GetDim(i);
    dims[0] = nhead;
    dims[inputDec.order + 1] = len;
-    InitTensor(&maskDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingEnc.devID, paddingEnc.mem);
+    InitTensor(&maskDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingDec.devID, paddingDec.mem);
    /* a upper triangular matrix where the cells of the upper triangular are set to -1e-9.
       this matrix can be used to prevent the attention to current or following words in
@@ -236,10 +236,10 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
    XTensor * maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID, paddingEnc.mem);
    _Unsqueeze(&paddingEnc, maskEncDecTMPEnc, paddingEnc.order - 1, paddingDec.GetDim(-1));
-    _Unsqueeze(&paddingDec, maskEncDecTMPDec, paddingEnc.order, paddingEnc.GetDim(-1));
+    //_Unsqueeze(&paddingDec, maskEncDecTMPDec, paddingEnc.order, paddingEnc.GetDim(-1));
-    _Multiply(maskEncDecTMPDec, maskEncDecTMPEnc, maskEncDecTMPDec);
+    //_Multiply(maskEncDecTMPDec, maskEncDecTMPEnc, maskEncDecTMPDec);
-    _ScaleAndShiftMe(maskEncDecTMPDec, 1e9F, -1e9F);
+    _ScaleAndShiftMe(maskEncDecTMPEnc, 1e9F, -1e9F);
-    _Unsqueeze(maskEncDecTMPDec, &maskEncDec, 0, dims[0]);
+    _Unsqueeze(maskEncDecTMPEnc, &maskEncDec, 0, dims[0]);
    DelTensorBuf(maskEncDecTMPDec);
    DelTensorBuf(maskEncDecTMPEnc);
@@ -300,9 +300,10 @@ void T2TModel::GetParams(XList &list)
        list.Add(&encoder->fnns[i].b1);
        list.Add(&encoder->fnns[i].w2);
        list.Add(&encoder->fnns[i].b2);
-        list.Add(&encoder->attentions[i].wk);
+        //list.Add(&encoder->attentions[i].wk);
-        list.Add(&encoder->attentions[i].wq);
+        //list.Add(&encoder->attentions[i].wq);
-        list.Add(&encoder->attentions[i].wv);
+        //list.Add(&encoder->attentions[i].wv);
+        list.Add(&encoder->attentions[i].wbig);
        list.Add(&encoder->attentions[i].wa);
        list.Add(&encoder->fnnLayerNorms[i].w);
        list.Add(&encoder->fnnLayerNorms[i].b);
@@ -324,9 +325,10 @@ void T2TModel::GetParams(XList &list)
            list.Add(&decoder->attentionsEnde[i].wa);
            list.Add(&decoder->attEndeLayerNorms[i].w);
            list.Add(&decoder->attEndeLayerNorms[i].b);
-            list.Add(&decoder->attentions[i].wk);
+            //list.Add(&decoder->attentions[i].wk);
-            list.Add(&decoder->attentions[i].wq);
+            //list.Add(&decoder->attentions[i].wq);
-            list.Add(&decoder->attentions[i].wv);
+            //list.Add(&decoder->attentions[i].wv);
+            list.Add(&decoder->attentions[i].wbig);
            list.Add(&decoder->attentions[i].wa);
            list.Add(&decoder->fnnLayerNorms[i].w);
            list.Add(&decoder->fnnLayerNorms[i].b);

--- a/source/sample/transformer/T2TOutput.cpp
+++ b/source/sample/transformer/T2TOutput.cpp
@@ -56,7 +56,7 @@ void T2TOutput::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
    float minmax = 0;
-    LoadParamInt(argc, argv, "vsize", &vSize, -1);
+    LoadParamInt(argc, argv, "vsizetgt", &vSize, -1);
    LoadParamInt(argc, argv, "d", &inSize, DEFAULT_EMBEDDING_SIZE);
    LoadParamInt(argc, argv, "d", &hSize, DEFAULT_EMBEDDING_SIZE);
    LoadParamFloat(argc, argv, "outputminmax", &minmax, 0.08F);

--- a/source/sample/transformer/T2TTrainer.cpp
+++ b/source/sample/transformer/T2TTrainer.cpp
--- a/source/sample/transformer/T2TTrainer.h
+++ b/source/sample/transformer/T2TTrainer.h
@@ -208,10 +208,10 @@ public:
    int LoadBatch(FILE * file, bool isLM,
                  XTensor * batchEnc, XTensor * paddingEnc, 
                  XTensor * batchDec, XTensor * paddingDec,
-                  XTensor * gold,
+                  XTensor * gold, XTensor * label,
                  int * seqs,
                  int vsEnc, int vsDec, int sBatch, int wBatch, 
-                  bool isSorted, int &wCount,
+                  bool isSorted, int &ws, int &wCount,
                  int devID, XMem * mem, 
 				  bool isTraining);
@@ -219,7 +219,7 @@ public:
    int LoadBatchLM(FILE * file, 
                    XTensor * batchEnc, XTensor * paddingEnc,
                    XTensor * batchDec, XTensor * paddingDec,
-                    XTensor * gold,
+                    XTensor * gold, XTensor * label,
                    int * seqs, int vs, int sBatch, int wBatch, 
                    bool isSorted, int &wCount,
                    int devID, XMem * mem, 
@@ -229,9 +229,9 @@ public:
    int LoadBatchMT(FILE * file, 
                    XTensor * batchEnc, XTensor * paddingEnc, 
                    XTensor * batchDec, XTensor * paddingDec,
-                    XTensor * gold,
+                    XTensor * gold, XTensor * label,
                    int * seqs, int vsEnc, int vsDec, int sBatch, int wBatch, 
-                    bool isSorted, int &wCount,
+                    bool isSorted, int &ws, int &wCount,
                    int devID, XMem * mem, 
 					bool isTraining);

--- a/source/sample/transformer/Transformer.cpp
+++ b/source/sample/transformer/Transformer.cpp
@@ -57,6 +57,8 @@ int TransformerMain(int argc, const char ** argv)
    LoadParamString(argc, args, "test", testFN, "");
    LoadParamString(argc, args, "output", outputFN, "");
+    srand((unsigned int)time(NULL));
    T2TTrainer trainer;
    trainer.Init(argc, args);
@@ -68,12 +70,12 @@ int TransformerMain(int argc, const char ** argv)
        trainer.Train(trainFN, testFN, strcmp(modelFN, "") ? modelFN : "checkpoint.model", &model);
    /* save the final model */
-    if(strcmp(modelFN, "") && strcmp(trainFN, ""))
+    //if(strcmp(modelFN, "") && strcmp(trainFN, ""))
-        model.Dump(modelFN);
+        //model.Dump(modelFN);
    /* load the model if neccessary */
-    if(strcmp(modelFN, ""))
+    //if(strcmp(modelFN, ""))
-        model.Read(modelFN);
+        //model.Read(modelFN);
    T2TTrainer tester;
    tester.Init(argc, args);

--- a/source/tensor/XLink.cpp
+++ b/source/tensor/XLink.cpp
@@ -307,6 +307,27 @@ void XLink::MakeLink(const XTensor * t1, const XTensor * t2, XTensor * h, int id
    MakeLink(&list, h, id);
 }
+/*
+create a hyper edge with three input tensors and a output tensor
+>> t1 - the first tail tensor
+>> t2 - the second tail tensor
+>> t3 - the third tail tensor
+>> h - head tensor
+>> id - id of the edge type
+*/
+void XLink::MakeLink(const XTensor * t1, const XTensor * t2, const XTensor * t3, XTensor * h, int id)
+{
+    /* nothing is linked when there is no head tensor */
+    if (h != NULL) {
+        XList tails(3);
+        tails.Add(t1);
+        tails.Add(t2);
+        tails.Add(t3);
+        /* hand over to the list-based version */
+        MakeLink(&tails, h, id);
+    }
+}
 /* 
 create a hyper edge with a list of tensors and a output tensor 
 >> list - a list of input tensors

--- a/source/tensor/XLink.h
+++ b/source/tensor/XLink.h
@@ -138,6 +138,10 @@ struct XLink
    static
    void MakeLink(const XTensor * t1, const XTensor * t2, XTensor * h, int id);
+    /* create a hyper edge with three input tensors and a output tensor */
+    static
+    void MakeLink(const XTensor * t1, const XTensor * t2, const XTensor * t3, XTensor * h, int id);
    /* create a hyper edge with a list of input tensors and a output tensor */
    static
    void MakeLink(const XList * list, XTensor * h, int id);

--- a/source/tensor/XName.cpp
+++ b/source/tensor/XName.cpp
@@ -77,6 +77,8 @@ const char * GetOPName(int type)
            return "M_POWER";
        else if (type == MATH_SCALEANDSHIFT)
            return "M_SCALEANDSHIFT";
+        /* name of the MulAndShift operation, following the M_<NAME> pattern of the other math operations */
+        else if (type == MATH_MULANDSHIFT)
+            return "M_MULANDSHIFT";
        else if (type == MATH_SIGN)
            return "M_SIGN";
        else if (type == MATH_SUB)

--- a/source/tensor/XName.h
+++ b/source/tensor/XName.h
@@ -57,7 +57,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #define MATH_NORMALIZE          MATH_NEGATE + 1
 #define MATH_POWER              MATH_NORMALIZE + 1
 #define MATH_SCALEANDSHIFT      MATH_POWER + 1
-#define MATH_SIGN               MATH_SCALEANDSHIFT + 1
+#define MATH_MULANDSHIFT        MATH_SCALEANDSHIFT + 1
+#define MATH_SIGN               MATH_MULANDSHIFT + 1
 #define MATH_SUB                MATH_SIGN + 1
 #define MATH_SUBDIM             MATH_SUB + 1
 #define MATH_SUM                MATH_SUBDIM + 1

--- a/source/tensor/core/CHeader.h
+++ b/source/tensor/core/CHeader.h
@@ -44,6 +44,7 @@
 #include "arithmetic/SumByColumnVT.h"
 #include "arithmetic/SumDim.h"
 #include "arithmetic/XTensorBLAS.h"
+#include "arithmetic/MulAndShift.h"
 #include "getandset/ConvertDataType.h"
 #include "getandset/OnehotAndIndex.h"

--- a/source/tensor/core/arithmetic/MulAndShift.cpp
+++ b/source/tensor/core/arithmetic/MulAndShift.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: JIANG Yufan (email: jiangyufan2018@outlook.com) 2019-02-27
+*/
+#include "../../XTensor.h"
+#include "../../XDevice.h"
+#include "../../XName.h"
+#include "MulAndShift.h"
+#include "MatrixMul.h"
+#include "Sum.h"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+/*
+return a dimension if the sum is performed as SumDim (in more details in SumDim.h)
+The dimensions of a and b are aligned from the last one backwards. Dimensions of b
+of size 1 are skipped; a dimension of b whose size equals the size of the aligned
+dimension of a counts as a hit.
+>> a - a tensor
+>> b - another tensor for sum
+<< return - the index (in a) of the hit dimension if there is exactly one hit;
+   -1 if a has a lower order than b, if a and b have the same shape,
+   or if the number of hits is not one
+*/
+int GetSumIndex(const XTensor &a, const XTensor &b)
+{
+    if (a.order < b.order)
+        return -1;
+    if (XTensor::IsSameShaped(&a, &b))
+        return -1;
+    int hitCount = 0;
+    int hitDim = -1;
+    for (int i = 0; i < b.order; i++) {
+        /* the i-th dimensions of a and b counted from the end */
+        int dimA = a.order - 1 - i;
+        int dimB = b.order - 1 - i;
+        if (b.dimSize[dimB] == 1)
+            continue;
+        else if (b.dimSize[dimB] == a.dimSize[dimA]) {
+            hitCount++;
+            /* report the dimension of a that was actually compared
+               (a.order - b.order + i only agrees with it for a 1D b) */
+            hitDim = dimA;
+        }
+    }
+    if (hitCount == 1)
+        return hitDim;
+    else
+        return -1;
+}
+/*
+operation c = x * w + b  MulAndShift
+The product x * w is written to a temporary buffer and b is then added to it
+by _SumDim along the dimension found by GetSumIndex.
+>> x - tensor x
+>> w - tensor w
+>> b - tensor b
+>> alpha - scalar forwarded to _MatrixMul
+   (presumably the scaling factor of x * w - confirm against MatrixMul.h)
+>> parallelRunner - parallel processing module
+<< return - the result c of the matrix multiplication followed by the addition of b
+*/
+XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
+                  DTYPE alpha, XPRunner * parallelRunner)
+{
+    CheckNTErrors(x.dataType == w.dataType, "Input tensors should have the same data type!");
+    CheckNTErrors(x.order >= 2 && w.order >= 2, "Input tensors must have a order >= 2!");
+    /* sizes of the two dimensions of x and w that take part in the product
+       (assumes dimSizeRDI holds the dimension sizes in reversed order - TODO confirm) */
+    int xn = x.dimSizeRDI[1];
+    int xm = x.dimSizeRDI[0];
+    int wn = w.dimSizeRDI[1];
+    int wm = w.dimSizeRDI[0];
+    CheckNTErrors(xm == wn, "Unmatched tensors in multiplication!");
+    /* shape of the product: the remaining dimensions of x, then the remaining
+       dimensions of w, then xn and wm */
+    int order = x.order + w.order - 2;
+    int sub = 0;
+    int * dimSize = new int[order];
+    for (int i = 2; i < x.order; i++)
+        dimSize[sub++] = x.dimSizeRDI[x.order + 1 - i];
+    for (int i = 2; i < w.order; i++)
+        dimSize[sub++] = w.dimSizeRDI[w.order + 1 - i];
+    dimSize[sub++] = xn;
+    dimSize[sub++] = wm;
+    /* the result is treated as dense unless both inputs are sparse */
+    float dr = (!x.isSparse || !w.isSparse) ? 1.0F : MAX(x.denseRatio, w.denseRatio);
+    /* temporary buffer that holds x * w before b is added */
+    XTensor * tmp = NewTensorBuf(order, dimSize, x.dataType, dr, x.devID, x.mem);
+    /* call _MatrixMul function */
+    _MatrixMul(&x, X_NOTRANS, &w, X_NOTRANS, tmp, alpha, 0, parallelRunner);
+    XTensor c(tmp);
+    c.SetTMPFlag();
+    /* dimension of the product along which b is added, or -1 if none is found */
+    int n = GetSumIndex(tmp, b);
+    if (n == -1) {
+        /* call _Sum function */
+        _Sum(tmp, &b, &c);
+        // TODO!!
+        /* this case is not supported yet: an error is reported right after the sum */
+        ShowNTErrors("TODO!");
+    }
+    else if (n >= 0 && n < tmp->order) {
+        /* call _SumDim function */
+        _SumDim(tmp, &b, &c, n);
+    }
+    else {
+        ShowNTErrors("Something is wrong!");
+    }
+    /* tensor connections */
+    /* x, w and b become the tails of c; n and the two transposition flags are
+       stored as parameters 0, 1 and 2 of the edge */
+    XLink::MakeLink(&x, &w, &b, &c, MATH_MULANDSHIFT);
+    XLink::AddParamToHeadInt(&c, n);
+    XLink::AddParamToHeadTrans(&c, X_NOTRANS);
+    XLink::AddParamToHeadTrans(&c, X_NOTRANS);
+    /* NOTE(review): alpha is not stored on the edge, and GradMulAndShift calls
+       GradMatrixMul with 1.0F - gradients look wrong for alpha != 1, confirm */
+    //XLink::AddParamToHead(&c, beta);
+    /* destroy variables */
+    delete[] dimSize;
+    DelTensorBuf(tmp);
+    return c;
+}
+}
\ No newline at end of file
--- a/source/tensor/core/arithmetic/MulAndShift.h
+++ b/source/tensor/core/arithmetic/MulAndShift.h
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: JIANG Yufan (email: jiangyufan2018@outlook.com) 2019-02-27
+*/
+#ifndef __MULANDSHIFT_H__
+#define __MULANDSHIFT_H__
+#include "../../XTensor.h"
+#include "../CHeader.h"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+/*
+operation c = x * w + b (return an XTensor structure)
+alpha defaults to 1.0 and parallelRunner to NULL
+*/
+XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
+                  DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
+} // namespace nts(NiuTrans.Tensor)
+#endif // __MULANDSHIFT_H__
--- a/source/tensor/core/getandset/OnehotAndIndex.cpp
+++ b/source/tensor/core/getandset/OnehotAndIndex.cpp
@@ -99,11 +99,11 @@ convert index tensor to onehot tensor
 >> onehot - onehot tensor, which value is 0 or 1
 >> size - the last dimension size of the onehot tensor
 */
-void _IndexToOnehot(XTensor * index, XTensor * onehot, int size)
+void _IndexToOnehot(XTensor * index, XTensor * onehot, int size, float labelSmoothingP)
 {
    CheckNTErrors(onehot->GetDim(-1) == size, "Illegal tensor dimension!");
    CheckNTErrors(onehot->order == index->order + 1, "Illegal tensor order!");
-    CheckNTErrors(onehot->dataType == X_INT, "The onehot tensor must be in X_INT!")
+    //CheckNTErrors(onehot->dataType == X_INT, "The onehot tensor must be in X_INT!")
    CheckNTErrors(index->dataType == X_INT, "The index tensor must be in X_INT!")
    for (int i = 0; i < index->order; i++)
@@ -111,9 +111,12 @@ void _IndexToOnehot(XTensor * index, XTensor * onehot, int size)
    onehot->SetZeroAll();
+    float confidence = 1 - labelSmoothingP;
+    float lowconfidence = labelSmoothingP / size;
 #ifdef USE_CUDA
    if(onehot->devID >= 0 && index->devID >= 0) {
-        _CudaIndexToOnehot(index, onehot, size);
+        _CudaIndexToOnehot(index, onehot, size, confidence, lowconfidence);
        return;
    }
 #endif
@@ -122,12 +125,13 @@ void _IndexToOnehot(XTensor * index, XTensor * onehot, int size)
    int stride = size;
    int * indexData = (int *)index->data;
-    int * onehotData = (int *)onehot->data;
+    DTYPE * onehotData = (DTYPE *)onehot->data;
    for (int i = 0; i < blockNum; i++) {
        int id = indexData[i];
-        int * od = onehotData + i * stride;
+        DTYPE * od = onehotData + i * stride;
-        od[id] = 1;
+        /* same result as the CUDA kernel: every entry of the block gets
+           lowconfidence and the entry of the indexed class gets confidence */
+        for (int j = 0; j < stride; j++)
+            od[j] = lowconfidence;
+        od[id] = confidence;
    }
 }
@@ -138,9 +142,10 @@ make a new tensor to keep the result and return it
 >> index - index tensor, which value is an integer num
 >> size - the last dimension size of the onehot tensor
+>> labelSmoothingP - label smoothing probability, passed on to _IndexToOnehot
 << return - the onehot tensor
 */
-XTensor IndexToOnehot(XTensor & index, int size)
+XTensor IndexToOnehot(XTensor & index, int size, float labelSmoothingP)
 {
    CheckNTErrors(index.dataType == X_INT, "The onehot tensor must be in X_INT!")
@@ -151,9 +156,9 @@ XTensor IndexToOnehot(XTensor & index, int size)
    int * dim = new int[order + 1];
    memcpy(dim, index.dimSize, order * sizeof(int));
    dim[order] = size;
-    InitTensor(&onehot, index.order + 1, dim, X_INT, 1.0F, index.devID, index.mem);
+    InitTensor(&onehot, index.order + 1, dim, X_FLOAT, 1.0F, index.devID, index.mem);
-    _IndexToOnehot(&index, &onehot, size);
+    _IndexToOnehot(&index, &onehot, size, labelSmoothingP);
    delete[] dim;

--- a/source/tensor/core/getandset/OnehotAndIndex.cu
+++ b/source/tensor/core/getandset/OnehotAndIndex.cu
@@ -96,7 +96,7 @@ convert index tensor to onehot tensor (kernel version)
 >> stride - stride of a data block
 */
 __global__
-void KernelIndexToOnehot(int * onehotData, int * indexData, int blockNum, int stride)
+void KernelIndexToOnehot(DTYPE * onehotData, int * indexData, int blockNum, int stride, float confidence, float lowconfidence)
 {
    /* block id */
    int i = blockDim.x * blockIdx.x + threadIdx.x;
@@ -107,10 +107,17 @@ void KernelIndexToOnehot(int * onehotData, int * indexData, int blockNum, int st
    if (i >= blockNum || offset >= stride)
        return;
-    int * od = onehotData + i * stride;
+    DTYPE * od = onehotData + i * stride;
    int id = indexData[i];
-    od[id] = 1;
+    //od[id] = 2.0;
+    //onehotData[i * stride + id] = 0.1;
+    if (offset == id)
+        od[offset] = confidence;
+    else{
+        od[offset] = lowconfidence;
+    }
 }
 /* 
@@ -120,7 +127,7 @@ convert index tensor to onehot tensor (cuda version)
 >> onehot - onehot tensor, which value is 0 or 1
 >> size - the last dimension size of the onehot tensor
 */
-void _CudaIndexToOnehot(XTensor * index, XTensor * onehot, int size)
+void _CudaIndexToOnehot(XTensor * index, XTensor * onehot, int size, float confidence, float lowconfidence)
 {
    int devID = onehot->devID;
@@ -138,10 +145,10 @@ void _CudaIndexToOnehot(XTensor * index, XTensor * onehot, int size)
    dim3 blocks(cudaGrids[0], cudaGrids[1]);
    dim3 threads(cudaBlocks[0], cudaBlocks[1]);
-    int * onehotData = (int *)onehot->data;
+    DTYPE * onehotData = (DTYPE *)onehot->data;
    int * indexData = (int *)index->data;
-    KernelIndexToOnehot<<<blocks, threads >>>(onehotData, indexData, blockNum, stride);
+    KernelIndexToOnehot<<<blocks, threads >>>(onehotData, indexData, blockNum, stride, confidence, lowconfidence);
    BacktoCudaDev(devID, devIDBackup);
 }

--- a/source/tensor/core/getandset/OnehotAndIndex.cuh
+++ b/source/tensor/core/getandset/OnehotAndIndex.cuh
@@ -30,7 +30,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
 void _CudaOnehotToIndex(XTensor * onehot, XTensor * index, int size);
 /* convert index tensor to onehot tensor (cuda version) */
-void _CudaIndexToOnehot(XTensor * index, XTensor * onehot, int size);
+void _CudaIndexToOnehot(XTensor * index, XTensor * onehot, int size, float confidence, float lowconfidence);
 } // namespace nts(NiuTrans.Tensor)

--- a/source/tensor/core/getandset/OnehotAndIndex.h
+++ b/source/tensor/core/getandset/OnehotAndIndex.h
@@ -34,11 +34,11 @@ make a new tensor to keep the result and return it */
 XTensor OnehotToIndex(XTensor & onehot, int num);
 /* convert index tensor to onehot tensor */
-void _IndexToOnehot(XTensor * index, XTensor * onehot, int size);
+void _IndexToOnehot(XTensor * index, XTensor * onehot, int size, float labelSmoothingP);
 /* convert index tensor to onehot tensor (return an XTensor structure)
 make a new tensor to keep the result and return it */
-XTensor IndexToOnehot(XTensor & index, int num);
+XTensor IndexToOnehot(XTensor & index, int num, float labelSmoothingP);
 } // namespace nts(NiuTrans.Tensor)