implement MulAndShift and bug fix

591d6121 · 姜雨帆 · 0b43acf6 · 591d6121 · 591d6121 · 591d6121
Commit 591d6121 authored Mar 13, 2019 by 姜雨帆
--- a/source/network/XBackwardMath.cpp
+++ b/source/network/XBackwardMath.cpp
@@ -99,6 +99,8 @@ void XMathGrad::MakeGrad(XTensor * node, bool isEfficient)
        GradReduceSumSquared(node, isEfficient);
    else if(operID == REDUCE_REDUCEVARIANCE)
        GradReduceVariance(node, isEfficient);
+    else if (operID == MATH_MULANDSHIFT)
+        GradMulAndShift(node, isEfficient);
    else{
        ShowNTErrors("TODO!");
    }
@@ -1487,4 +1489,126 @@ void XMathGrad::GradReduceVariance(XTensor * node, bool isEfficient)
    node->visitMark = NODE_FINISHED;
 }

+
+/*
+gradient for the MulAndShift operation
+for c = matmul(x, w) + b
+we have
+dE/dx = dE/dc * w^T
+dE/dw = x^T * dE/dc
+dE/db = dE/dc.reduce(0,...,n-1,n+1,...)
+where n is the dimension of c along which b is added
+>> node - the node (c) for backward computation
+>> isEfficient - indicates whether the computation is in
+an efficient manner
+*/
+void XMathGrad::GradMulAndShift(XTensor * node, bool isEfficient)
+{
+    XLink &income = node->income;
+    CheckNTErrors(income.tailNum == 3, "wrong input tensor number");
+
+    XTensor * x = income.tails[0];
+    XTensor * w = income.tails[1];
+    XTensor * b = income.tails[2];
+
+    /* parameters of the edge: the dimension n, then two transposition flags.
+       NOTE(review): parameter 1 is read as the flag of w and parameter 2 as
+       the flag of x - confirm that this is the order in which the forward
+       pass records them before transposed operands are supported. */
+    int n = income.GetParamInt(0);
+    MATRIX_TRANS_TYPE transW = income.GetParamTrans(1);
+    MATRIX_TRANS_TYPE transX = income.GetParamTrans(2);
+
+    /* in the efficient mode a gradient is made only for tensors with isGrad set */
+    bool needGradW = !isEfficient || w->isGrad;
+    bool needGradX = !isEfficient || x->isGrad;
+    bool needGradB = !isEfficient || b->isGrad;
+
+    if (needGradW)
+        XNoder::MakeGrad(w);
+    if (needGradX)
+        XNoder::MakeGrad(x);
+    if (needGradB)
+        XNoder::MakeGrad(b);
+
+    int order = node->order;
+    int dimSize[MAX_TENSOR_DIM_NUM];
+    memcpy(dimSize, node->dimSize, sizeof(int) * order);
+
+    /* n indexes dimSize below, so it must be a dimension of c */
+    CheckNTErrors(n >= 0 && n < order, "Illegal dimension for the shift!");
+
+    /* compute dE/db. This is skipped when no gradient is made for b, because
+       MakeGrad(b) has not been called above and b->grad must not be used. */
+    if (needGradB && n == order - 1) {
+        int reshapedSize[MAX_TENSOR_DIM_NUM];
+        reshapedSize[0] = node->unitNum / dimSize[order - 1];
+        reshapedSize[1] = dimSize[order - 1];
+
+        /* we reshape dE/dc to a matrix whose column number is equal to the
+        size of b. Then we can reduce the matrix into a row vector. */
+        node->grad->Reshape(2, reshapedSize);
+
+        XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
+        _ReduceSum(node->grad, bGradTMP, 0);
+        _Sum(bGradTMP, b->grad, b->grad);
+        DelTensorBuf(bGradTMP);
+
+        node->grad->Reshape(order, dimSize);
+    }
+    else if (needGradB) {
+        int reshapedSize[MAX_TENSOR_DIM_NUM];
+        reshapedSize[0] = 1;
+        reshapedSize[1] = dimSize[n];
+
+        /* the product of all dimensions in front of n */
+        for (int i = 0; i < n; i++)
+            reshapedSize[0] *= dimSize[i];
+
+        reshapedSize[2] = node->unitNum / (reshapedSize[0] * reshapedSize[1]);
+
+        /* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|.
+        Then reduce along with z and x to obtain dE/db. */
+        node->grad->Reshape(3, reshapedSize);
+
+        /* the intermediate result is of size (x, y) */
+        XTensor * interGrad = NewTensorBuf(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
+
+        _ReduceSum(node->grad, interGrad, 2);
+
+        XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
+        _ReduceSum(interGrad, bGradTMP, 0);
+        _Sum(bGradTMP, b->grad, b->grad);
+        DelTensorBuf(bGradTMP);
+
+        node->grad->Reshape(order, dimSize);
+
+        DelTensorBuf(interGrad);
+    }
+
+    /* compute dE/dx, dE/dw.
+       NOTE(review): 1.0F is passed as the scale here - confirm that this is
+       right when the forward pass is called with alpha != 1. */
+    XTensor * c = node;
+    XTensor * dedc = node->grad;
+    XTensor * dedw = w->grad;
+    XTensor * dedx = x->grad;
+
+    if (x->order == 2 && w->order == 2)
+        GradMatrixMul(x, dedx, transX, w, dedw, transW, dedc, 1.0F, isEfficient);
+    else if (transX == X_NOTRANS && x->order > 2 && w->order == 2){
+        int orderBackupX = x->order;
+        int orderBackupC = c->order;
+        int dimsBackupX[MAX_TENSOR_DIM_NUM];
+        int dimsBackupC[MAX_TENSOR_DIM_NUM];
+        memcpy(dimsBackupX, x->dimSize, sizeof(int) * x->order);
+        memcpy(dimsBackupC, c->dimSize, sizeof(int) * c->order);
+
+        /* merge all leading dimensions so that the tensors become matrices */
+        x->Reshape(x->unitNum / x->GetDim(-1), x->GetDim(-1));
+        c->Reshape(c->unitNum / c->GetDim(-1), c->GetDim(-1));
+        if (needGradX)
+            dedx->Reshape(dedx->unitNum / dedx->GetDim(-1), dedx->GetDim(-1));
+        dedc->Reshape(dedc->unitNum / dedc->GetDim(-1), dedc->GetDim(-1));
+
+        GradMatrixMul(x, dedx, transX, w, dedw, transW, dedc, 1.0F, isEfficient);
+
+        /* restore the original shapes */
+        x->Reshape(orderBackupX, dimsBackupX);
+        c->Reshape(orderBackupC, dimsBackupC);
+        if (needGradX)
+            dedx->Reshape(orderBackupX, dimsBackupX);
+        dedc->Reshape(orderBackupC, dimsBackupC);
+    }
+    else {
+        /* other shapes are not supported yet; report it instead of
+           finishing the node without dE/dx and dE/dw */
+        ShowNTErrors("TODO!");
+    }
+
+    node->visitMark = NODE_FINISHED;
+}
+
 }
--- a/source/network/XBackwardMath.h
+++ b/source/network/XBackwardMath.h
@@ -168,6 +168,10 @@ private:
    /* gradient for reduceVariance */
    static
    void GradReduceVariance(XTensor * node, bool isEfficient);
+
+    /* gradient for MulAndShift, i.e., c = matmul(x, w) + b */
+    static
+    void GradMulAndShift(XTensor * node, bool isEfficient);
 };

 }

--- a/source/sample/transformer/T2TAttention.h
+++ b/source/sample/transformer/T2TAttention.h
@@ -61,6 +61,7 @@ public:
    XTensor wa;
    
    XTensor wbig;
+	
    /* size of transformed Q and K */
    int dk;


--- a/source/sample/transformer/T2TDecoder.cpp
+++ b/source/sample/transformer/T2TDecoder.cpp
@@ -80,7 +80,6 @@ void AttDecoder::InitModel(int argc, char ** argv,
    attentionsEnde = new T2TAttention[nlayer];
    attEndeLayerNorms = new T2TLN[nlayer];

-
    /* initialize the stacked layers */
    for (int i = 0; i < nlayer; i++) {
        attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
@@ -89,9 +88,7 @@ void AttDecoder::InitModel(int argc, char ** argv,
        fnnLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
        attentionsEnde[i].InitModel(argc, argv, true, myIgnored, myDevID, myMem);
        attEndeLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
-
    }
-
 }

 /* 

--- a/source/sample/transformer/T2TDecoder.h
+++ b/source/sample/transformer/T2TDecoder.h
--- a/source/sample/transformer/T2TEncoder.cpp
+++ b/source/sample/transformer/T2TEncoder.cpp
@@ -103,8 +103,6 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo

    x = embedder.Make(input);

-    //x.Dump(tmpFILE, "embedding: ");
-
    /* dropout */
    if(isTraining && dropoutP > 0)
        x = Dropout(x, dropoutP);
@@ -160,4 +158,3 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool isTraining)
 }

 }
-
--- a/source/sample/transformer/T2TFNN.cpp
+++ b/source/sample/transformer/T2TFNN.cpp
@@ -89,13 +89,15 @@ XTensor T2TFNN::Make(XTensor &input, bool isTraining)
    XTensor t1;

    /* t1 = max(0, x * w1 + b1) */
-    t1 = Rectify(MMul(input, w1) + b1);
+    //t1 = Rectify(MMul(input, w1) + b1);
+    t1 = Rectify(MulAndShift(input, w1, b1));
    
    if(isTraining && dropoutP > 0)
        t1 = Dropout(t1, dropoutP);

    /* result = t1 * w2 + b2 */
-    return MMul(t1, w2) + b2;
+    //return MMul(t1, w2) + b2;
+    return MulAndShift(t1, w2, b2);
 }



--- a/source/sample/transformer/T2TModel.cpp
+++ b/source/sample/transformer/T2TModel.cpp
@@ -219,7 +219,7 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
        dims[i + 1] = inputDec.GetDim(i);
    dims[0] = nhead;
    dims[inputDec.order + 1] = len;
-    InitTensor(&maskDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingEnc.devID, paddingEnc.mem);
+    InitTensor(&maskDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingDec.devID, paddingDec.mem);
        
    /* a upper triangular matrix where the cells of the upper triangular are set to -1e-9.
       this matrix can be used to prevent the attention to current or following words in
@@ -236,10 +236,10 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
    XTensor * maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID, paddingEnc.mem);

    _Unsqueeze(&paddingEnc, maskEncDecTMPEnc, paddingEnc.order - 1, paddingDec.GetDim(-1));
-    _Unsqueeze(&paddingDec, maskEncDecTMPDec, paddingEnc.order, paddingEnc.GetDim(-1));
-    _Multiply(maskEncDecTMPDec, maskEncDecTMPEnc, maskEncDecTMPDec);
-    _ScaleAndShiftMe(maskEncDecTMPDec, 1e9F, -1e9F);
-    _Unsqueeze(maskEncDecTMPDec, &maskEncDec, 0, dims[0]);
+    //_Unsqueeze(&paddingDec, maskEncDecTMPDec, paddingEnc.order, paddingEnc.GetDim(-1));
+    //_Multiply(maskEncDecTMPDec, maskEncDecTMPEnc, maskEncDecTMPDec);
+    _ScaleAndShiftMe(maskEncDecTMPEnc, 1e9F, -1e9F);
+    _Unsqueeze(maskEncDecTMPEnc, &maskEncDec, 0, dims[0]);

    DelTensorBuf(maskEncDecTMPDec);
    DelTensorBuf(maskEncDecTMPEnc);
@@ -274,10 +274,9 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
    _Sum(&maskEnc, padding3, &maskEnc);

    encoding = MakeEncoder(inputEnc, maskEnc, isTraining);
-    //encoding.Dump(stderr, "encoding",10);

    decoding = MakeDecoder(inputDec, encoding, maskDec, maskEncDec, isTraining);
-    //decoding.Dump(stderr, "decoding", 10);
+
    outputLayer->Make(decoding, output);

    delete[] dims;

--- a/source/sample/transformer/T2TTrainer.cpp
+++ b/source/sample/transformer/T2TTrainer.cpp
--- a/source/sample/transformer/T2TTrainer.h
+++ b/source/sample/transformer/T2TTrainer.h
@@ -176,6 +176,9 @@ public:
    /* indicates whether we intend to debug the net */
    bool isDebugged;

+    /* bucket size */
+    int bucketSize;
+
 public:
    /* constructor */
    T2TTrainer();
@@ -205,10 +208,10 @@ public:
    int LoadBatch(FILE * file, bool isLM,
                  XTensor * batchEnc, XTensor * paddingEnc, 
                  XTensor * batchDec, XTensor * paddingDec,
-                  XTensor * gold,
+                  XTensor * gold, XTensor * label,
                  int * seqs,
                  int vsEnc, int vsDec, int sBatch, int wBatch, 
-                  bool isSorted, int &wCount,
+                  bool isSorted, int &ws, int &wCount,
                  int devID, XMem * mem, 
 				  bool isTraining);

@@ -216,7 +219,7 @@ public:
    int LoadBatchLM(FILE * file, 
                    XTensor * batchEnc, XTensor * paddingEnc,
                    XTensor * batchDec, XTensor * paddingDec,
-                    XTensor * gold,
+                    XTensor * gold, XTensor * label,
                    int * seqs, int vs, int sBatch, int wBatch, 
                    bool isSorted, int &wCount,
                    int devID, XMem * mem, 
@@ -226,9 +229,9 @@ public:
    int LoadBatchMT(FILE * file, 
                    XTensor * batchEnc, XTensor * paddingEnc, 
                    XTensor * batchDec, XTensor * paddingDec,
-                    XTensor * gold,
+                    XTensor * gold, XTensor * label,
                    int * seqs, int vsEnc, int vsDec, int sBatch, int wBatch, 
-                    bool isSorted, int &wCount,
+                    bool isSorted, int &ws, int &wCount,
                    int devID, XMem * mem, 
 					bool isTraining);


--- a/source/sample/transformer/Transformer.cpp
+++ b/source/sample/transformer/Transformer.cpp
@@ -37,8 +37,6 @@ int TransformerMain(int argc, const char ** argv)
    if(argc == 0)
        return 1;

-    fprintf(stderr, "%e\n", log(1e-8F));
-
    char ** args = new char*[argc];
    for(int i = 0; i < argc; i++){
        args[i] = new char[strlen(argv[i]) + 1];
@@ -67,9 +65,6 @@ int TransformerMain(int argc, const char ** argv)
    T2TModel model;
    model.InitModel(argc, args);
    
-    //if(strcmp(modelFN, ""))
-        //model.Read(modelFN);    
-
    /* learn model parameters */
    if(strcmp(trainFN, ""))
        trainer.Train(trainFN, testFN, strcmp(modelFN, "") ? modelFN : "checkpoint.model", &model);

--- a/source/tensor/XLink.cpp
+++ b/source/tensor/XLink.cpp
@@ -308,6 +308,27 @@ void XLink::MakeLink(const XTensor * t1, const XTensor * t2, XTensor * h, int id
 }

 /*
+create a hyper edge with three input tensors and an output tensor
+>> t1 - a tail tensor
+>> t2 - the second tail tensor
+>> t3 - the third tail tensor
+>> h - head tensor
+>> id - id of the edge type
+*/
+void XLink::MakeLink(const XTensor * t1, const XTensor * t2, const XTensor * t3,XTensor * h, int id)
+{
+    /* without a head tensor there is nothing to connect */
+    if (h != NULL) {
+        XList tails(3);
+        tails.Add(t1);
+        tails.Add(t2);
+        tails.Add(t3);
+
+        MakeLink(&tails, h, id);
+    }
+}
+
+/* 
 create a hyper edge with a list of tensors and a output tensor 
 >> list - a list of input tensors
 >> h - head tensor

--- a/source/tensor/XLink.h
+++ b/source/tensor/XLink.h
@@ -138,6 +138,10 @@ struct XLink
    static
    void MakeLink(const XTensor * t1, const XTensor * t2, XTensor * h, int id);

+    /* create a hyper edge with three input tensors and a output tensor */
+    static
+    void MakeLink(const XTensor * t1, const XTensor * t2, const XTensor * t3, XTensor * h, int id);
+
    /* create a hyper edge with a list of input tensors and a output tensor */
    static
    void MakeLink(const XList * list, XTensor * h, int id);

--- a/source/tensor/XName.cpp
+++ b/source/tensor/XName.cpp
@@ -77,6 +77,8 @@ const char * GetOPName(int type)
            return "M_POWER";
        else if (type == MATH_SCALEANDSHIFT)
            return "M_SCALEANDSHIFT";
+        else if (type == MATH_MULANDSHIFT)
+            return "M_OPERATION";
        else if (type == MATH_SIGN)
            return "M_SIGN";
        else if (type == MATH_SUB)

--- a/source/tensor/XName.h
+++ b/source/tensor/XName.h
@@ -57,7 +57,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #define MATH_NORMALIZE          MATH_NEGATE + 1
 #define MATH_POWER              MATH_NORMALIZE + 1
 #define MATH_SCALEANDSHIFT      MATH_POWER + 1
-#define MATH_SIGN               MATH_SCALEANDSHIFT + 1
+#define MATH_MULANDSHIFT        MATH_SCALEANDSHIFT + 1
+#define MATH_SIGN               MATH_MULANDSHIFT + 1
 #define MATH_SUB                MATH_SIGN + 1
 #define MATH_SUBDIM             MATH_SUB + 1
 #define MATH_SUM                MATH_SUBDIM + 1

--- a/source/tensor/XTensor.cpp
+++ b/source/tensor/XTensor.cpp
@@ -1614,17 +1614,11 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg, 
        else if (dataType == X_INT) {
            int end = MIN(n > 0 ? beg + n : beg + unitNum, unitNum);
            for(int i = beg; i < end; i++){
-                if((i%(dimSize[1]) == 0)&&(i!=0)) {
-                    fprintf(file, " \n");
-                }
                int f = ((int*)d)[i];
                if(i == beg)
                    fprintf(file, "%d", f);
                else
                    fprintf(file, " %d", f);
-                //if((i%(dimSize[1]-1) == 0)&&(i!=0)) {
-                    //fprintf(file, " \n");
-                //}
            }
        }
        else

--- a/source/tensor/core/CHeader.h
+++ b/source/tensor/core/CHeader.h
@@ -44,6 +44,7 @@
 #include "arithmetic/SumByColumnVT.h"
 #include "arithmetic/SumDim.h"
 #include "arithmetic/XTensorBLAS.h"
+#include "arithmetic/MulAndShift.h"

 #include "getandset/ConvertDataType.h"
 #include "getandset/OnehotAndIndex.h"

--- a/source/tensor/core/arithmetic/MulAndShift.cpp
+++ b/source/tensor/core/arithmetic/MulAndShift.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: JIANG Yufan (email: jiangyufan2018@outlook.com) 2019-02-27
+*/
+
+#include "../../XTensor.h"
+#include "../../XDevice.h"
+#include "../../XName.h"
+#include "MulAndShift.h"
+#include "MatrixMul.h"
+#include "Sum.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+/*
+return the dimension of a along which b is added if the sum can be performed
+as SumDim (in more details in SumDim.h)
+>> a - a tensor
+>> b - another tensor for sum
+<< return - the index of the dimension in a, or -1 if a and b are of the same
+            shape, a has fewer dimensions than b, or the number of the matched
+            dimensions is not exactly one
+*/
+int GetSumIndex(const XTensor &a, const XTensor &b)
+{
+    if (a.order < b.order)
+        return -1;
+    if (XTensor::IsSameShaped(&a, &b))
+        return -1;
+
+    int hitCount = 0;
+    int hitDim = -1;
+
+    /* compare the dimensions of a and b pairwise, starting from the last one */
+    for (int i = 0; i < b.order; i++) {
+        int dimA = a.order - 1 - i;
+        int dimB = b.order - 1 - i;
+
+        if (b.dimSize[dimB] == 1)
+            continue;
+        else if (b.dimSize[dimB] == a.dimSize[dimA]) {
+            hitCount++;
+            /* keep the index that has just been compared in a. The former
+               a.order - b.order + i is the same index only when b.order is 1. */
+            hitDim = dimA;
+        }
+    }
+
+    if (hitCount == 1)
+        return hitDim;
+    else
+        return -1;
+}
+
+/*
+operation c = x * w + b  MulAndShift
+make a new tensor to keep the result and return it
+>> x - tensor x
+>> w - tensor w
+>> b - tensor b
+>> alpha - a scalar that is passed on to _MatrixMul
+           (presumably the scale of x * w - TODO confirm)
+>> parallelRunner - parallel processing module
+<< return - the result of the matrix multiplication with b added
+*/
+XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
+                  DTYPE alpha, XPRunner * parallelRunner)
+{
+    CheckNTErrors(x.dataType == w.dataType, "Input tensors should have the same data type!");
+    CheckNTErrors(x.order >= 2 && w.order >= 2, "Input tensors must have a order >= 2!");
+
+    int xn = x.dimSizeRDI[1];
+    int xm = x.dimSizeRDI[0];
+    int wn = w.dimSizeRDI[1];
+    int wm = w.dimSizeRDI[0];
+
+    CheckNTErrors(xm == wn, "Unmatched tensors in multiplication!");
+
+    /* shape of the product: the entries of x and then of w that are taken
+       from dimSizeRDI at the indices order - 1, ..., 2, followed by xn and wm */
+    int order = x.order + w.order - 2;
+    int sub = 0;
+    int * dimSize = new int[order];
+    for (int i = 2; i < x.order; i++)
+        dimSize[sub++] = x.dimSizeRDI[x.order + 1 - i];
+    for (int i = 2; i < w.order; i++)
+        dimSize[sub++] = w.dimSizeRDI[w.order + 1 - i];
+    dimSize[sub++] = xn;
+    dimSize[sub++] = wm;
+
+    float dr = (!x.isSparse || !w.isSparse) ? 1.0F : MAX(x.denseRatio, w.denseRatio);
+
+    /* the product x * w is kept in a buffer tensor */
+    XTensor * tmp = NewTensorBuf(order, dimSize, x.dataType, dr, x.devID, x.mem);
+
+    /* call _MatrixMul function */
+    _MatrixMul(&x, X_NOTRANS, &w, X_NOTRANS, tmp, alpha, 0, parallelRunner);
+
+    XTensor c(tmp);
+    c.SetTMPFlag();
+
+    /* the tensor itself is passed here. A pointer argument would first be
+       converted into a temporary XTensor for the reference parameter. */
+    int n = GetSumIndex(*tmp, b);
+
+    if (n == -1) {
+        /* call _Sum function */
+        _Sum(tmp, &b, &c);
+
+        /* this case is reported as unfinished */
+        // TODO!!
+        ShowNTErrors("TODO!");
+    }
+    else if (n >= 0 && n < tmp->order) {
+        /* call _SumDim function */
+        _SumDim(tmp, &b, &c, n);
+    }
+    else {
+        ShowNTErrors("Something is wrong!");
+    }
+
+    /* tensor connections: the dimension n and the transposition flags of the
+       two matrices (both X_NOTRANS) are recorded as the parameters of the edge.
+       NOTE(review): alpha is not recorded - confirm whether the backward
+       computation needs it when alpha != 1. */
+    XLink::MakeLink(&x, &w, &b, &c, MATH_MULANDSHIFT);
+    XLink::AddParamToHeadInt(&c, n);
+    XLink::AddParamToHeadTrans(&c, X_NOTRANS);
+    XLink::AddParamToHeadTrans(&c, X_NOTRANS);
+
+    /* destroy variables */
+    delete[] dimSize;
+    DelTensorBuf(tmp);
+
+    return c;
+}
+
+
+
+}
\ No newline at end of file
--- a/source/tensor/core/arithmetic/MulAndShift.h
+++ b/source/tensor/core/arithmetic/MulAndShift.h
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+
+/*
+* $Created by: JIANG Yufan (email: jiangyufan2018@outlook.com) 2019-02-27
+*/
+
+#ifndef __MULANDSHIFT_H__
+#define __MULANDSHIFT_H__
+
+#include "../../XTensor.h"
+#include "../CHeader.h"
+
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+
+/* operation c = x * w + b (return an XTensor structure)
+make a new tensor to keep the result and return it */
+XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
+                  DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
+
+
+} // namespace nts(NiuTrans.Tensor)
+
+#endif // __MULANDSHIFT_H__
--- a/source/tensor/core/getandset/OnehotAndIndex.cpp
+++ b/source/tensor/core/getandset/OnehotAndIndex.cpp
@@ -99,11 +99,11 @@ convert index tensor to onehot tensor
 >> onehot - onehot tensor, which value is 0 or 1
 >> size - the last dimension size of the onehot tensor
 */
-void _IndexToOnehot(XTensor * index, XTensor * onehot, int size)
+void _IndexToOnehot(XTensor * index, XTensor * onehot, int size, float labelSmoothingP)
 {
    CheckNTErrors(onehot->GetDim(-1) == size, "Illegal tensor dimension!");
    CheckNTErrors(onehot->order == index->order + 1, "Illegal tensor order!");
-    CheckNTErrors(onehot->dataType == X_INT, "The onehot tensor must be in X_INT!")
+    //CheckNTErrors(onehot->dataType == X_INT, "The onehot tensor must be in X_INT!")
    CheckNTErrors(index->dataType == X_INT, "The index tensor must be in X_INT!")

    for (int i = 0; i < index->order; i++)
@@ -111,9 +111,12 @@ void _IndexToOnehot(XTensor * index, XTensor * onehot, int size)

    onehot->SetZeroAll();

+    float confidence = 1 - labelSmoothingP;
+    float lowconfidence = labelSmoothingP / size;
+
 #ifdef USE_CUDA
    if(onehot->devID >= 0 && index->devID >= 0) {
-        _CudaIndexToOnehot(index, onehot, size);
+        _CudaIndexToOnehot(index, onehot, size, confidence, lowconfidence);
        return;
    }
 #endif
@@ -122,12 +125,13 @@ void _IndexToOnehot(XTensor * index, XTensor * onehot, int size)
    int stride = size;

    int * indexData = (int *)index->data;
-    int * onehotData = (int *)onehot->data;
+    DTYPE * onehotData = (DTYPE *)onehot->data;

    for (int i = 0; i < blockNum; i++) {
        int id = indexData[i];
-        int * od = onehotData + i * stride;
-        od[id] = 1;
+        DTYPE * od = onehotData + i * stride;
+        for (int j = 0; j < stride; j++)   /* same values as the CUDA kernel writes */
+            od[j] = (j == id) ? confidence : lowconfidence;
    }

 }
@@ -138,9 +142,10 @@ make a new tensor to keep the result and return it

 >> index - index tensor, which value is an integer num
 >> size - the last dimension size of the onehot tensor
+>> labelSmoothingP - the label smoothing parameter that is passed on to _IndexToOnehot
 << return - the onehot tensor
 */
-XTensor IndexToOnehot(XTensor & index, int size)
+XTensor IndexToOnehot(XTensor & index, int size, float labelSmoothingP)
 {
    CheckNTErrors(index.dataType == X_INT, "The onehot tensor must be in X_INT!")

@@ -151,9 +156,9 @@ XTensor IndexToOnehot(XTensor & index, int size)
    int * dim = new int[order + 1];
    memcpy(dim, index.dimSize, order * sizeof(int));
    dim[order] = size;
-    InitTensor(&onehot, index.order + 1, dim, X_INT, 1.0F, index.devID, index.mem);
+    InitTensor(&onehot, index.order + 1, dim, X_FLOAT, 1.0F, index.devID, index.mem);

-    _IndexToOnehot(&index, &onehot, size);
+    _IndexToOnehot(&index, &onehot, size, labelSmoothingP);

    delete[] dim;


--- a/source/tensor/core/getandset/OnehotAndIndex.cu
+++ b/source/tensor/core/getandset/OnehotAndIndex.cu
@@ -96,7 +96,7 @@ convert index tensor to onehot tensor (kernel version)
 >> stride - stride of a data block
 */
 __global__
-void KernelIndexToOnehot(int * onehotData, int * indexData, int blockNum, int stride)
+void KernelIndexToOnehot(DTYPE * onehotData, int * indexData, int blockNum, int stride, float confidence, float lowconfidence)
 {
    /* block id */
    int i = blockDim.x * blockIdx.x + threadIdx.x;
@@ -107,10 +107,17 @@ void KernelIndexToOnehot(int * onehotData, int * indexData, int blockNum, int st
    if (i >= blockNum || offset >= stride)
        return;

-    int * od = onehotData + i * stride;
+    DTYPE * od = onehotData + i * stride;

    int id = indexData[i];
-    od[id] = 1;
+
+    //od[id] = 2.0;
+    //onehotData[i * stride + id] = 0.1;
+    if (offset == id)
+        od[offset] = confidence;
+    else{
+        od[offset] = lowconfidence;
+    }
 }

 /* 
@@ -120,7 +127,7 @@ convert index tensor to onehot tensor (cuda version)
 >> onehot - onehot tensor, which value is 0 or 1
 >> size - the last dimension size of the onehot tensor
 */
-void _CudaIndexToOnehot(XTensor * index, XTensor * onehot, int size)
+void _CudaIndexToOnehot(XTensor * index, XTensor * onehot, int size, float confidence, float lowconfidence)
 {
    int devID = onehot->devID;

@@ -138,10 +145,10 @@ void _CudaIndexToOnehot(XTensor * index, XTensor * onehot, int size)
    dim3 blocks(cudaGrids[0], cudaGrids[1]);
    dim3 threads(cudaBlocks[0], cudaBlocks[1]);

-    int * onehotData = (int *)onehot->data;
+    DTYPE * onehotData = (DTYPE *)onehot->data;
    int * indexData = (int *)index->data;

-    KernelIndexToOnehot<<<blocks, threads >>>(onehotData, indexData, blockNum, stride);
+    KernelIndexToOnehot<<<blocks, threads >>>(onehotData, indexData, blockNum, stride, confidence, lowconfidence);

    BacktoCudaDev(devID, devIDBackup);
 }

--- a/source/tensor/core/getandset/OnehotAndIndex.cuh
+++ b/source/tensor/core/getandset/OnehotAndIndex.cuh
@@ -30,7 +30,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
 void _CudaOnehotToIndex(XTensor * onehot, XTensor * index, int size);

 /* convert index tensor to onehot tensor (cuda version) */
-void _CudaIndexToOnehot(XTensor * index, XTensor * onehot, int size);
+void _CudaIndexToOnehot(XTensor * index, XTensor * onehot, int size, float confidence, float lowconfidence);

 } // namespace nts(NiuTrans.Tensor)


--- a/source/tensor/core/getandset/OnehotAndIndex.h
+++ b/source/tensor/core/getandset/OnehotAndIndex.h
@@ -34,11 +34,11 @@ make a new tensor to keep the result and return it */
 XTensor OnehotToIndex(XTensor & onehot, int num);

 /* convert index tensor to onehot tensor */
-void _IndexToOnehot(XTensor * index, XTensor * onehot, int size);
+void _IndexToOnehot(XTensor * index, XTensor * onehot, int size, float labelSmoothingP);

 /* convert index tensor to onehot tensor (return an XTensor structure)
 make a new tensor to keep the result and return it */
-XTensor IndexToOnehot(XTensor & index, int num);
+XTensor IndexToOnehot(XTensor & index, int num, float labelSmoothingP);

 } // namespace nts(NiuTrans.Tensor)