Commit 9a477f7d by linye

Clean the codes

parent 44bf9fa6
...@@ -74,12 +74,12 @@ void XDataGrad::GradConvertDataType(XTensor * node, bool isEfficent)
XNoder::MakeGrad(input);
XTensor * tmp = NewTensorBuf(input->grad, input->devID, input->mem);
_ConvertDataType(node->grad, tmp);
_SumMe(input->grad, tmp);
DelTensorBuf(tmp);
node->visitMark = NODE_FINISHED;
}
/*
...@@ -92,7 +92,7 @@ dE/da = IndexToOnehot(b)
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XDataGrad::GradOnehotToIndex(XTensor * node, bool isEfficent)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for IndexToOnehot!");
...@@ -101,11 +101,20 @@ void XDataGrad::GradOnehotToIndex(XTensor * node, bool isEfficent)
XNoder::MakeGrad(input);
node->visitMark = NODE_FINISHED;
}
/*
gradient computation for IndexToOnehot
for
b = IndexToOnehot(a)
we have
dE/da = IndexToOnehot(b)
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XDataGrad::GradIndexToOnehot(XTensor * node, bool isEfficent)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for IndexToOnehot!");
...@@ -114,8 +123,7 @@ void XDataGrad::GradOnehotToIndex(XTensor * node, bool isEfficent)
XNoder::MakeGrad(input);
node->visitMark = NODE_FINISHED;
}
} // namespace nts(NiuTrans.Tensor)
...@@ -20,7 +20,9 @@
*/
#include "XBackwardLoss.h"
#include "XNoder.h"
#include "../tensor/XName.h"
#include "../tensor/function/FHeader.h"
#include "../tensor/core/getandset/SetData.h"
#include "../tensor/function/HardTanH.h"
#include "../tensor/function/Identity.h"
...@@ -31,6 +33,60 @@
namespace nts{
/* compute dE/dx of a node */
void XLossGrad::MakeGrad(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
int operID = income.typeID;
CheckNTErrors(income.tailNum >= 1, "Wrong number of tensors for loss computation!");
XTensor * output = income.tails[0];
XTensor * gold = NULL;
XTensor * weight = NULL;
XTensor * padding = NULL;
int leadingDim;
XNoder::MakeGrad(output);
XTensor * dedy = output->grad;
if (income.tailNum == 1) {
if(dedy->dataType == X_FLOAT)
_SetDataFixedFloat(dedy, 1.0F);
else if(dedy->dataType == X_DOUBLE)
_SetDataFixedDouble(dedy, 1.0);
else if(dedy->dataType == X_INT)
_SetDataFixedInt(dedy, 1);
else
ShowNTErrors("TODO");
return;
}
gold = income.tails[1];
if(operID == LOSS_CROSSENTROPY) {
if (income.tailNum == 3)
padding = income.tails[2];
leadingDim = income.GetParamInt(0);
CheckNTErrors(leadingDim >= 0 && leadingDim < output->order, "wrong leading dimension in logsoftmax!");
_CrossEntropyBackward(dedy, output, gold, weight, padding, leadingDim);
}
else{
ShowNTErrors("Wrong activation function type!");
}
node->visitMark = NODE_FINISHED;
}
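/*
A worked note on the branch above (an illustration added for this write-up, not part of the commit):
when the loss node has a single tail, the node itself is the final error E, so dE/dE = 1 and the
gradient is simply filled with ones by the _SetDataFixed* calls. When the operator is LOSS_CROSSENTROPY,
and assuming the cross-entropy definition used elsewhere in NiuTrans.Tensor,
E = -sum_i gold_i * log(output_i)
so the backward step writes
dE/d(output_i) = -gold_i / output_i
(optionally multiplied by a class weight and zeroed where padding == 0), which is what
_CrossEntropyBackward(dedy, output, gold, weight, padding, leadingDim) stores in dedy.
*/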
/* indicates whether the node is for a loss computation */
bool XLossGrad::IsLossOP(XTensor * node)
{
XLink &income = node->income;
return (income.typeID & LOSS_BASE) != 0;
}
/*
compute dE/dx for a given function y = f(x)
>> gold - gold standard to measure error (or loss)
......
...@@ -23,6 +23,7 @@
#include "../tensor/XTensor.h"
#include "../tensor/function/FHeader.h"
#include "../tensor/loss/LHeader.h"
#ifndef __XBACKWARDLOSS_H__
#define __XBACKWARDLOSS_H__
...@@ -34,6 +35,14 @@ namespace nts{
class XLossGrad
{
public:
/* compute dE/dx of a node */
static
void MakeGrad(XTensor * node, bool isEfficient);
/* indicates whether the node is for a Loss computation */
static
bool IsLossOP(XTensor * node);
/* compute dE/dx for a given function y = f(x) */
void Compute(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx, XTensor * padding,
......
...@@ -81,6 +81,12 @@ void XMathGrad::MakeGrad(XTensor * node, bool isEfficient)
GradPower(node, isEfficient);
else if(operID == MATH_SCALEANDSHIFT)
GradScaleAndShift(node, isEfficient);
else if(operID == MATH_SCALE)
GradScale(node, isEfficient);
else if(operID == MATH_DESCALE)
GradDescale(node, isEfficient);
else if(operID == MATH_SHIFT)
GradShift(node, isEfficient);
else if(operID == MATH_SUB)
GradSub(node, isEfficient);
else if(operID == MATH_SUBDIM)
...@@ -99,6 +105,8 @@ void XMathGrad::MakeGrad(XTensor * node, bool isEfficient)
GradReduceSumSquared(node, isEfficient);
else if(operID == REDUCE_REDUCEVARIANCE)
GradReduceVariance(node, isEfficient);
else if (operID == MATH_MULANDSHIFT)
GradMulAndShift(node, isEfficient);
else{
ShowNTErrors("TODO!");
}
...@@ -717,12 +725,18 @@ void XMathGrad::GradMultiply(XTensor * node, bool isEfficient)
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
CheckNTErrors(XTensor::IsSameShaped(a, b), "Wrong sized input tensors!");
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_Multiply(node->grad, b, a->grad, 1.0F);
}
if (!isEfficient || b->isGrad) {
XNoder::MakeGrad(b);
_Multiply(node->grad, a, b->grad, 1.0F);
}
node->visitMark = NODE_FINISHED;
}
...@@ -887,88 +901,8 @@ gradient for normalize
*/
void XMathGrad::GradNormalize(XTensor * node, bool isEfficient)
{
ShowNTErrors("TODO!");
XLink &income = node->income;
CheckNTErrors(income.tailNum == 5, "Wrong input tensor number for NORMALIZE!");
XTensor * input = income.tails[0];
XTensor * mean = income.tails[1];
XTensor * var = income.tails[2];
XTensor * a = income.tails[3];
XTensor * b = income.tails[4];
XTensor * c = NewTensor(var);
XTensor * d = NewTensor(a);
XTensor * e = NewTensor(a);
XTensor * f = NewTensor(a);
XTensor * g = NewTensor(a);
XTensor * h = NewTensor(a);
XTensor * i = NewTensor(a);
XTensor * j = NewTensor(a);
XTensor * k = NewTensor(var);
XTensor * p = NewTensor(var);
XTensor * q = NewTensor(var);
XTensor * r = NewTensor(a);
XTensor * x = NewTensor(mean);
XTensor * y = NewTensor(mean);
XTensor * z = NewTensor(mean);
DTYPE epsilon = income.GetParam(1);
int dim = income.GetParamInt(0);
int n = a->GetDim(dim);
XNoder::MakeGrad(input);
XNoder::MakeGrad(mean);
XNoder::MakeGrad(var);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
/* dEdinput */
_ScaleAndShift(var, c, 1.0F, epsilon);
_Unsqueeze(c, d, dim, n);
_Power(d, e, -0.5F);
_Multiply(a, e, f);
_Multiply(node->grad, f, input->grad, 1.0F);
/* dEdmean */
_ScaleAndShift(f, g, -1.0F);
_ReduceSum(g, x, dim);
_ReduceSum(node->grad, y, dim);
_Multiply(y, x, mean->grad, 1.0F);
/* dEdvar */
_Unsqueeze(mean, h, dim, n);
_Sub(input, h, i);
_Multiply(a, i, j);
_Power(var, k, -1.5F);
_ScaleAndShift(k, p, -0.5F);
_ReduceSum(j, z, dim);
_Multiply(z, p, q);
_Multiply(y, q, var->grad, 1.0F);
/* dEda */
_Multiply(i, e, r);
_Multiply(node->grad, r, a->grad, 1.0F);
/* dEdb */
_Sum(b->grad, node->grad, b->grad);
node->visitMark = NODE_FINISHED;
delete c;
delete d;
delete e;
delete f;
delete g;
delete h;
delete i;
delete j;
delete k;
delete p;
delete q;
delete r;
delete x;
delete y;
delete z;
}
/*
...@@ -1029,6 +963,82 @@ void XMathGrad::GradScaleAndShift(XTensor * node, bool isEfficient)
}
/*
gradient for Scale
for
c = a * scale
we have
dE/da = dE/dc * scale
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradScale(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SCALE!");
XTensor * a = income.tails[0];
DTYPE scale = income.GetParam(0);
XNoder::MakeGrad(a);
_Sum(a->grad, node->grad, a->grad, scale);
node->visitMark = NODE_FINISHED;
}
/*
gradient for Descale
for
c = a / descale
we have
dE/da = dE/dc / descale
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradDescale(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for DESCALE!");
XTensor * a = income.tails[0];
DTYPE descale = income.GetParam(0);
XNoder::MakeGrad(a);
_Sum(a->grad, node->grad, a->grad, 1/descale);
node->visitMark = NODE_FINISHED;
}
/*
gradient for Shift
for
c = a + shift
we have
dE/da = dE/dc
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradShift(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SHIFT!");
XTensor * a = income.tails[0];
XNoder::MakeGrad(a);
_Sum(a->grad, node->grad, a->grad);
node->visitMark = NODE_FINISHED;
}
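/*
A combined example for the three rules above (illustration only, not part of the commit):
with scale = 2, descale = 4, shift = 3 and an upstream gradient dE/dc = [1, 2],
Scale: c = a * 2 -> a->grad += [1, 2] * 2 = [2, 4] (the _Sum call with beta = scale)
Descale: c = a / 4 -> a->grad += [1, 2] * (1/4) = [0.25, 0.5] (the _Sum call with beta = 1/descale)
Shift: c = a + 3 -> a->grad += [1, 2] (a plain _Sum, since d(a + shift)/da = 1)
*/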
/*
gradient for minus
for
c = a - b * \beta
...@@ -1487,4 +1497,126 @@ void XMathGrad::GradReduceVariance(XTensor * node, bool isEfficient)
node->visitMark = NODE_FINISHED;
}
/*
gradient for MulAndShift
for c = matmul(x, w) + b
we have
dE/dx = dE/dc * w^T
dE/dw = x^T * dE/dc
dE/db = dE/dc reduced (summed) over all dimensions except n (the dimension that matches b)
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradMulAndShift(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 3, "wrong input tensor number")
XTensor * x = income.tails[0];
XTensor * w = income.tails[1];
XTensor * b = income.tails[2];
int n = income.GetParamInt(0);
MATRIX_TRANS_TYPE transW = income.GetParamTrans(1);
MATRIX_TRANS_TYPE transX = income.GetParamTrans(2);
if (!isEfficient || w->isGrad)
XNoder::MakeGrad(w);
if (!isEfficient || x->isGrad)
XNoder::MakeGrad(x);
if (!isEfficient || b->isGrad)
XNoder::MakeGrad(b);
int order = node->order;
int dimSize[MAX_TENSOR_DIM_NUM];
memcpy(dimSize, node->dimSize, sizeof(int) * node->order);
/* compute dE/db */
if (n == order - 1) {
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = node->unitNum / dimSize[order - 1];
reshapedSize[1] = dimSize[order - 1];
/* we reshape dE/dc to a matrix whose column number is equal to the
size of b. Then we can reduce the matrix into a row vector. */
node->grad->Reshape(2, reshapedSize);
XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(node->grad, bGradTMP, 0);
_Sum(bGradTMP, b->grad, b->grad);
DelTensorBuf(bGradTMP);
node->grad->Reshape(order, dimSize);
}
else {
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = 1;
reshapedSize[1] = dimSize[n];
reshapedSize[2] = 1;
for (int i = 0; i < order; i++) {
if (i < n)
reshapedSize[0] *= dimSize[i];
}
reshapedSize[2] = node->unitNum / (reshapedSize[0] * reshapedSize[1]);
/* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|.
Then reduce along with z and x to obtain dE/db. */
node->grad->Reshape(3, reshapedSize);
XTensor * interGrad = NewTensorBuf(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(node->grad, interGrad, 2);
XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(interGrad, bGradTMP, 0);
_Sum(bGradTMP, b->grad, b->grad);
DelTensorBuf(bGradTMP);
node->grad->Reshape(order, dimSize);
DelTensorBuf(interGrad);
}
/* compute dE/dx, dE/dw */
XTensor * c = node;
XTensor * dedc = node->grad;
XTensor * dedw = w->grad;
XTensor * dedx = x->grad;
if (x->order == 2 && w->order == 2)
GradMatrixMul(x, dedx, transX, w, dedw, transW, dedc, 1.0F, isEfficient);
else if (transX == X_NOTRANS && x->order > 2 && w->order == 2){
int orderBackupX = x->order;
int orderBackupC = c->order;
int dimsBackupX[MAX_TENSOR_DIM_NUM];
int dimsBackupC[MAX_TENSOR_DIM_NUM];
memcpy(dimsBackupX, x->dimSize, sizeof(int) * x->order);
memcpy(dimsBackupC, c->dimSize, sizeof(int) * c->order);
x->Reshape(x->unitNum / x->GetDim(-1), x->GetDim(-1));
c->Reshape(c->unitNum / c->GetDim(-1), c->GetDim(-1));
if (!isEfficient || x->isGrad)
dedx->Reshape(dedx->unitNum / dedx->GetDim(-1), dedx->GetDim(-1));
dedc->Reshape(dedc->unitNum / dedc->GetDim(-1), dedc->GetDim(-1));
GradMatrixMul(x, dedx, transX, w, dedw, transW, dedc, 1.0F, isEfficient);
x->Reshape(orderBackupX, dimsBackupX);
c->Reshape(orderBackupC, dimsBackupC);
if (!isEfficient || x->isGrad)
dedx->Reshape(orderBackupX, dimsBackupX);
dedc->Reshape(orderBackupC, dimsBackupC);
}
node->visitMark = NODE_FINISHED;
}
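/*
Shape sketch for the dE/db branch above (illustration only, not part of the commit):
suppose c = matmul(x, w) + b with dE/dc of size (batch, len, |b|) = (2, 3, 4) and n = order - 1.
dE/dc is reshaped to a (2 * 3, 4) matrix, _ReduceSum over dimension 0 gives a length-4 row vector,
and that vector is added into b->grad, i.e. dE/db_j = sum over all rows of dE/dc[., j].
When n is an inner dimension, the same idea is applied with a 3D reshape (x, |b|, z) and two
reductions (over z, then over x). The dE/dx and dE/dw parts are the usual matrix rules
dE/dx = dE/dc * w^T and dE/dw = x^T * dE/dc, delegated to GradMatrixMul.
*/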
}
...@@ -130,6 +130,18 @@ private:
static
void GradScaleAndShift(XTensor * node, bool isEfficient);
/* gradient for Scale */
static
void GradScale(XTensor * node, bool isEfficient);
/* gradient for Shift */
static
void GradShift(XTensor * node, bool isEfficient);
/* gradient for Descale */
static
void GradDescale(XTensor * node, bool isEfficient);
/* gradient for Minus */
static
void GradSub(XTensor * node, bool isEfficient);
...@@ -168,6 +180,10 @@ private:
/* gradient for reduceVariance */
static
void GradReduceVariance(XTensor * node, bool isEfficient);
/* gradient for MulAndShift: c = matmul(x, w) + b */
static
void GradMulAndShift(XTensor * node, bool isEfficient);
};
}
......
...@@ -43,6 +43,8 @@ void XShapeGrad::MakeGrad(XTensor * node, bool isEfficent)
GradCopyIndexed(node, isEfficent);
else if(operID == MOVEMENT_GATHER)
GradGather(node, isEfficent);
else if (operID == MOVEMENT_DROPOUTWITHINDEX)
GradDropoutWithIndex(node, isEfficent);
else if(operID == SHAPE_MERGE)
GradMerge(node, isEfficent);
else if(operID == SHAPE_MERGE_LIST)
...@@ -115,7 +117,7 @@ dE/da = spreadforgather(b)
void XShapeGrad::GradGather(XTensor * node, bool isEfficent)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for Gather!");
XTensor * input = income.tails[0];
XTensor * index = income.tails[1];
...@@ -126,6 +128,43 @@ void XShapeGrad::GradGather(XTensor * node, bool isEfficent)
node->visitMark = NODE_FINISHED;
}
/*
gradient computation for DropoutWithIndex function
*/
void XShapeGrad::GradDropoutWithIndex(XTensor * node, bool isEfficent)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for DropoutWithIndex!");
XTensor * input = income.tails[0];
XTensor * index = income.tails[1];
DTYPE scale = income.GetParam(0);
XNoder::MakeGrad(input);
//_Identity(node->grad, input->grad);
_CopyValues(node->grad, input->grad);
int order = node->grad->order;
int * dimSize = new int[order];
for (int i = 0; i < order; i++) {
dimSize[i] = node->grad->dimSize[i];
}
int order1 = 1;
int * dimSize1 = new int[order1];
dimSize1[0] = input->grad->unitNum;
input->grad->Reshape(order1, dimSize1);
_DropoutWithIndex(node->grad, index, input->grad);
_ScaleAndShiftMe(input->grad, scale);
input->grad->Reshape(order, dimSize);
node->visitMark = NODE_FINISHED;
}
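/*
Reading of the backward pass above (illustration only; the exact semantics of _DropoutWithIndex are
assumed from its use here): the gradient starts as a copy of dE/dc (_CopyValues into input->grad),
input->grad is temporarily viewed as a 1D tensor so that the entries of index address absolute
positions, the same index-based dropout pattern is applied to it via _DropoutWithIndex, the surviving
values are rescaled by "scale" (for inverted dropout this would be 1 / (1 - p)), and the tensor is
reshaped back to its original order.
*/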
/*
gradient for merge
for
......
...@@ -54,6 +54,10 @@ private:
static
void GradGather(XTensor * node, bool isEfficent);
/* gradient computation for dropout with indices */
static
void GradDropoutWithIndex(XTensor * node, bool isEfficent);
/* gradient computation for merge: c = merge(a, b, ...) */
static
void GradMerge(XTensor * node, bool isEfficent);
......
...@@ -23,6 +23,7 @@
#include "../tensor/XTensor.h"
#include "../tensor/function/FHeader.h"
#include "../tensor/loss/LHeader.h"
#ifndef __XNET_H__
#define __XNET_H__
...@@ -111,6 +112,10 @@ struct XNet
/* show network topology */
void ShowNetwork(FILE * file, XTensor * node);
/* search a node in a top-down manner by its name */
//static
//XTensor * SearchNode(XTensor * top, const char * name);
};
/* we make a unique id for every tensor */
......
...@@ -839,38 +839,14 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
InitModelTensor2D(s, batchSize, model.vSize, model);
InitModelTensor2D(y, batchSize, model.vSize, model);
/* s = h_last * w */
_MatrixMul(&h_last, X_NOTRANS, &w, X_NOTRANS, &s);
XTensor h_last1;
h_last1 = ScaleAndShift(h_last, 100, 0);
XTensor w1;
w1 = ScaleAndShift(w, 100, 0);
XTensor int8H_last;
XTensor int8W;
int8H_last = ConvertDataType(h_last1, X_INT8);
int8W = ConvertDataType(w1, X_INT8);
XTensor s1;
InitTensor2D(&s1, batchSize, model.vSize, X_INT, model.devID, model.mem);
_MatrixMul2D(&int8H_last, X_NOTRANS, &int8W, X_NOTRANS, &s1);
XTensor b2D;
InitTensor(&b2D, &s);
_Unsqueeze(&b, &b2D, 0, batchSize);
_Sum(&s, &b2D, &s);
XTensor b2D1;
b2D1 = ConvertDataType(b2D, X_INT);
_Sum(&s1, &b2D1, &s1);
s = ConvertDataType(s1, X_FLOAT);
s = ScaleAndShift(s, 0.0001, 0);
/* y = softmax(s) */
_LogSoftmax(&s, &y, 1);
...@@ -1224,6 +1200,7 @@ void Test(const char * test, const char * result, FNNModel &model)
}
fclose(file);
fclose(ofile);
double elapsed = GetClockSec() - startT;
......
...@@ -53,52 +53,15 @@ initialize the model
>> myDevID - device id
>> myMem - the memory pool
*/
//void T2TAttention::InitModel(int argc, char ** argv,
// bool myIsMasked, int myIgnored,
// int myDevID, XMem * myMem)
//{
// devID = myDevID;
// mem = myMem;
// isMasked = myIsMasked;
// ignored = myIgnored;
//
// float minmax = 0;
//
// LoadParamInt(argc, argv, "nhead", &nhead, 8);
// LoadParamInt(argc, argv, "d", &dk, DEFAULT_EMBEDDING_SIZE);
// LoadParamInt(argc, argv, "d", &dv, DEFAULT_EMBEDDING_SIZE);
// LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
// LoadParamFloat(argc, argv, "attminmax", &minmax, 0.1F);
// LoadParamFloat(argc, argv, "dropoutatt", &dropoutP, 0);
//
// InitTensor2D(&wk, d, dk, X_FLOAT, devID, mem);
// InitTensor2D(&wq, d, dk, X_FLOAT, devID, mem);
// InitTensor2D(&wv, d, dv, X_FLOAT, devID, mem);
// InitTensor2D(&wa, d, d, X_FLOAT, devID, mem);
// InitTensor2D(&wbig, d, 3 * d, X_FLOAT, devID, mem);
//
// float scale = 1.0F;
// float finfoutk = (float)sqrt(6.0F * scale/(d + dk));
// float finfoutv = (float)sqrt(6.0F * scale/(d + dv));
// float finfouta = (float)sqrt(6.0F * scale / (d + d));
// float finfoutbig = (float)sqrt(6.0F * scale / (d + 3*d));
//
// wk.SetDataRand(-finfoutk, finfoutk);
// wq.SetDataRand(-finfoutk, finfoutk);
// wv.SetDataRand(-finfoutv, finfoutv);
// wa.SetDataRand(-finfouta, finfouta);
// wbig.SetDataRand(-finfoutbig, finfoutbig);
//}
void T2TAttention::InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID, XMem * myMem)
{
devID = myDevID;
mem = myMem;
isMasked = myIsMasked;
ignored = myIgnored;
float minmax = 0;
LoadParamInt(argc, argv, "nhead", &nhead, 8);
...@@ -108,17 +71,17 @@ void T2TAttention::InitModel(int argc, char ** argv,
LoadParamFloat(argc, argv, "attminmax", &minmax, 0.1F);
LoadParamFloat(argc, argv, "dropoutatt", &dropoutP, 0);
InitTensor2D(&wk, d, dk, X_FLOAT, devID, mem);
InitTensor2D(&wq, d, dk, X_FLOAT, devID, mem);
InitTensor2D(&wv, d, dv, X_FLOAT, devID, mem);
InitTensor2D(&wa, d, d, X_FLOAT, devID, mem);
InitTensor2D(&wbig, d, 3 * d, X_FLOAT, devID, mem);
float scale = 1.0F;
float finfoutk = (float)sqrt(6.0F * scale/(d + dk));
float finfoutv = (float)sqrt(6.0F * scale/(d + dv));
float finfouta = (float)sqrt(6.0F * scale / (d + d));
float finfoutbig = (float)sqrt(6.0F * scale / (d + 3*d));
wk.SetDataRand(-finfoutk, finfoutk);
wq.SetDataRand(-finfoutk, finfoutk);
...@@ -138,150 +101,95 @@ make the network
>> isTraining - indicates whether the model is used for training
<< return - multi-attention result
*/
//XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining, bool selfatt)
//{
// XTensor k2;
// XTensor q2;
// XTensor v2;
//
// if (selfatt){
//
// XTensor con;
// XList split;
//
// con = MMul(k, wbig);
//
// int d1 = con.GetDim(0);
// int d2 = con.GetDim(1);
// int d3 = con.GetDim(2) / 3;
//
// InitTensor3D(&k2, d1, d2, d3, X_FLOAT, devID, mem);
// InitTensor3D(&q2, d1, d2, d3, X_FLOAT, devID, mem);
// InitTensor3D(&v2, d1, d2, d3, X_FLOAT, devID, mem);
//
// split.Add(&q2);
// split.Add(&k2);
// split.Add(&v2);
//
// Split(con, split, 2, 3);
// }
//
// else{
// /* linear transofmration before self-attention */
// k2 = MMul(k, wk);
// q2 = MMul(q, wq);
// v2 = MMul(v, wv);
// }
//
// XTensor kheads;
// XTensor qheads;
// XTensor vheads;
//
// /* multi head */
// kheads = Split(k2, k2.order - 1, nhead);
// qheads = Split(q2, q2.order - 1, nhead);
// vheads = Split(v2, v2.order - 1, nhead);
//
// XTensor att;
// XTensor dot;
// XTensor scalar;
//
// /* scalar = softmax(Q * K^T / sqrt(dk)) * V */
// dot = BMMul(qheads, X_NOTRANS, kheads, X_TRANS);
//
// if(isMasked)
// dot = dot + mask;
//
// dot = Linear(dot, 1.0F/(float)sqrt((float)dk/nhead));
//
// scalar = Softmax(dot, -1);
//
// if(isTraining && dropoutP > 0)
// scalar = Dropout(scalar, dropoutP);
//
// att = BMMul(scalar, vheads);
//
// /* concatenate the heads */
// return MMul(Merge(att, att.order - 1), wa);
//}
XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining, bool selfatt)
{
XTensor halfK2;
XTensor halfQ2;
XTensor halfV2;
XTensor halfK;
halfK = ConvertDataType(k, X_FLOAT16);
if (selfatt) {
XTensor halfCon;
XList halfSplit;
halfCon = MMul(halfK, wbig);
int d1 = halfCon.GetDim(0);
int d2 = halfCon.GetDim(1);
int d3 = halfCon.GetDim(2) / 3;
InitTensor3D(&halfK2, d1, d2, d3, X_FLOAT16, devID, mem);
InitTensor3D(&halfQ2, d1, d2, d3, X_FLOAT16, devID, mem);
InitTensor3D(&halfV2, d1, d2, d3, X_FLOAT16, devID, mem);
halfSplit.Add(&halfQ2);
halfSplit.Add(&halfK2);
halfSplit.Add(&halfV2);
Split(halfCon, halfSplit, 2, 3);
}
else {
XTensor halfQ;
XTensor halfV;
halfQ = ConvertDataType(q, X_FLOAT16);
halfV = ConvertDataType(v, X_FLOAT16);
/* linear transformation before self-attention */
halfK2 = MMul(halfK, wk);
halfQ2 = MMul(halfQ, wq);
halfV2 = MMul(halfV, wv);
}
XTensor halfKheads;
XTensor halfQheads;
XTensor halfVheads;
/* multi head */
halfKheads = Split(halfK2, halfK2.order - 1, nhead);
halfQheads = Split(halfQ2, halfQ2.order - 1, nhead);
halfVheads = Split(halfV2, halfV2.order - 1, nhead);
XTensor halfAtt;
XTensor halfDot;
XTensor halfScalar;
/* scalar = softmax(Q * K^T / sqrt(dk)) * V */
halfDot = BMMul(halfQheads, X_NOTRANS, halfKheads, X_TRANS);
//XTensor halfMask(mask.order, mask.dimSize, X_FLOAT16, mask.denseRatio, mask.devID, mask.mem);
if (isMasked) {
XTensor halfMask;
halfMask = ConvertDataType(mask, X_FLOAT16);
halfDot = Sum(halfDot, halfMask);
}
halfDot = Linear(halfDot, 1.0F / (float)sqrt((float)dk / nhead));
halfScalar = Softmax(halfDot, -1);
if (isTraining && dropoutP > 0)
halfScalar = Dropout(halfScalar, dropoutP);
halfAtt = BMMul(halfScalar, halfVheads);
/* concatenate the heads */
return ConvertDataType(MMul(Merge(halfAtt, halfAtt.order - 1), wa), X_FLOAT);
}
XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining)
{
XTensor k2;
XTensor q2;
XTensor v2;
/* linear transformation before self-attention */
k2 = MMul(k, wk);
q2 = MMul(q, wq);
v2 = MMul(v, wv);
return MakeAttention(k2, q2, v2, mask, isTraining);
}
/*
make the network given a big tensor that keeps keys, queries and values
>> kqv - the big tensor
>> mask - as it is
>> isTraining - indicates whether the model is used for training
*/
XTensor T2TAttention::MakeBig(XTensor &kqv, XTensor &mask, bool isTraining)
{
XTensor k2;
XTensor q2;
XTensor v2;
XTensor kqv2;
XList split;
kqv2 = MMul(kqv, wbig);
int d1 = kqv2.GetDim(0);
int d2 = kqv2.GetDim(1);
int d3 = kqv2.GetDim(2) / 3;
InitTensor3D(&k2, d1, d2, d3, X_FLOAT, devID, mem);
InitTensor3D(&q2, d1, d2, d3, X_FLOAT, devID, mem);
InitTensor3D(&v2, d1, d2, d3, X_FLOAT, devID, mem);
split.Add(&q2);
split.Add(&k2);
split.Add(&v2);
Split(kqv2, split, 2, 3);
return MakeAttention(k2, q2, v2, mask, isTraining);
}
/*
make the attention network given keys, queries and values (after linear transformation)
>> k - keys. It might be of size B * L * H
where B = batch size, L = sequence length,
and H = vector size of each position
>> q - queries
>> v - values
>> mask - as it is
>> isTraining - indicates whether the model is used for training
*/
XTensor T2TAttention::MakeAttention(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining)
{
XTensor kheads;
XTensor qheads;
XTensor vheads;
/* multi head */
kheads = Split(k, k.order - 1, nhead);
qheads = Split(q, q.order - 1, nhead);
vheads = Split(v, v.order - 1, nhead);
XTensor att;
XTensor dot;
XTensor scalar;
/* scalar = softmax(Q * K^T / sqrt(dk)) * V */
dot = BMMul(qheads, X_NOTRANS, kheads, X_TRANS);
if(isMasked)
dot = dot + mask;
dot = Linear(dot, 1.0F/(float)sqrt((float)dk/nhead));
scalar = Softmax(dot, -1);
if(isTraining && dropoutP > 0)
scalar = Dropout(scalar, dropoutP);
att = BMMul(scalar, vheads);
/* concatenate the heads */
return MMul(Merge(att, att.order - 1), wa);
}
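/*
The computation implemented by MakeAttention, written out (illustration only, not part of the commit):
with nhead heads of width dk/nhead, obtained by splitting the last dimension, each head computes
att = softmax( (Q * K^T + mask) / sqrt(dk / nhead) ) * V
where the mask adds 0 to visible positions and a large negative value (about -1e9) to blocked ones,
so those positions vanish after the softmax; note this implementation adds the mask before the
1/sqrt scaling, which does not change its effect. The heads are then merged back along the last
dimension and projected with wa.
*/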
}
...@@ -61,6 +61,7 @@ public:
XTensor wa;
XTensor wbig;
/* size of transformed Q and K */
int dk;
...@@ -96,7 +97,13 @@ public:
int myDevID = -1, XMem * myMem = NULL);
/* make the network */
XTensor Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining);
/* make the network given a big tensor that keeps keys, queries and values */
XTensor MakeBig(XTensor &kqv, XTensor &mask, bool isTraining);
/* make the attention network given keys, queries and values (after linear transformation) */
XTensor MakeAttention(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining);
};
}
......
...@@ -80,7 +80,6 @@ void AttDecoder::InitModel(int argc, char ** argv,
attentionsEnde = new T2TAttention[nlayer];
attEndeLayerNorms = new T2TLN[nlayer];
/* initialize the stacked layers */
for (int i = 0; i < nlayer; i++) {
attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
...@@ -89,9 +88,7 @@ void AttDecoder::InitModel(int argc, char ** argv,
fnnLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
attentionsEnde[i].InitModel(argc, argv, true, myIgnored, myDevID, myMem);
attEndeLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
}
}
/*
...@@ -122,7 +119,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
/******************/
/* self attention */
att = attentions[i].MakeBig(x, mask, isTraining);
/* dropout */
if(isTraining && dropoutP > 0)
...@@ -136,7 +133,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
/*****************************/
/* encoder-decoder attention */
ende = attentionsEnde[i].Make(outputEnc, x, outputEnc, maskEncDec, isTraining);
/* dropout */
if(isTraining && dropoutP > 0)
......
...@@ -100,4 +100,4 @@ public:
}
#endif
\ No newline at end of file
...@@ -103,8 +103,6 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
x = embedder.Make(input);
//x.Dump(tmpFILE, "embedding: ");
/* dropout */
if(isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
...@@ -116,7 +114,7 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
XTensor res;
/* self attention */
att = attentions[i].MakeBig(x, mask, isTraining);
/* dropout */
if(isTraining && dropoutP > 0)
...@@ -160,4 +158,3 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool isTraining)
}
}
...@@ -89,13 +89,15 @@ XTensor T2TFNN::Make(XTensor &input, bool isTraining)
XTensor t1;
/* t1 = max(0, x * w1 + b1) */
//t1 = Rectify(MMul(input, w1) + b1);
t1 = Rectify(MulAndShift(input, w1, b1));
if(isTraining && dropoutP > 0)
t1 = Dropout(t1, dropoutP);
/* result = t1 * w2 + b2 */
//return MMul(t1, w2) + b2;
return MulAndShift(t1, w2, b2);
}
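/*
Note on the change above (illustration only, not part of the commit): MulAndShift(x, w, b) is used
here as a fused form of MMul(x, w) + b, so the layer still computes the usual position-wise
feed-forward FFN(x) = max(0, x * w1 + b1) * w2 + b2, with Rectify providing the max(0, .) part;
the fused call lets the backward pass go through GradMulAndShift (see XBackwardMath) instead of
separate matmul and sum nodes.
*/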
......
...@@ -203,31 +203,49 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
XTensor maskEnc;
XTensor maskDec;
XTensor maskEncDec;
/* encoder mask */
MakeMTMaskEnc(inputEnc, paddingEnc, maskEnc);
/* decoder mask */
MakeMTMaskDec(inputEnc, inputDec, paddingEnc, paddingDec, maskDec, maskEncDec);
encoding = MakeEncoder(inputEnc, maskEnc, isTraining);
decoding = MakeDecoder(inputDec, encoding, maskDec, maskEncDec, isTraining);
outputLayer->Make(decoding, output);
}
/*
make the mask for training MT models
>> inputEnc - input of the encoder
>> inputDec - input of the decoder
>> paddingEnc - padding of the encoder input
>> paddingDec - padding of the decoder input
>> maskEnc - mask of the encoder self-attention
>> maskDec - mask of the decoder self-attention
>> maskEncDec - mask of the decoder enc-dec attention
*/
void T2TModel::MakeMTMask(XTensor &inputEnc, XTensor &inputDec,
XTensor &paddingEnc, XTensor &paddingDec,
XTensor &maskEnc, XTensor &maskDec, XTensor &maskEncDec)
{
int len = inputDec.GetDim(inputDec.order - 1);
int * dims = new int[inputDec.order + 2];
for(int i = 0; i < inputDec.order; i++)
dims[i + 1] = inputDec.GetDim(i);
dims[0] = nhead;
dims[inputDec.order + 1] = len;
InitTensor(&maskDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingDec.devID, paddingDec.mem);
/* an upper triangular matrix where the cells of the upper triangular are set to -1e9.
this matrix can be used to prevent the attention to current or following words in
a given sequence. */
_SetDataLowTri(&maskDec, 1e9F, 0);
_ScaleAndShiftMe(&maskDec, 1.0F, -1e9F);
/* encoder-decoder mask that prevents the attention to padding dummy words */
dims[inputDec.order + 1] = inputEnc.GetDim(inputEnc.order - 1);
InitTensor(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingEnc.devID, paddingEnc.mem);
...@@ -236,8 +254,6 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
XTensor * maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID, paddingEnc.mem);
_Unsqueeze(&paddingEnc, maskEncDecTMPEnc, paddingEnc.order - 1, paddingDec.GetDim(-1));
//_Unsqueeze(&paddingDec, maskEncDecTMPDec, paddingEnc.order, paddingEnc.GetDim(-1));
//_Multiply(maskEncDecTMPDec, maskEncDecTMPEnc, maskEncDecTMPDec);
_ScaleAndShiftMe(maskEncDecTMPEnc, 1e9F, -1e9F);
_Unsqueeze(maskEncDecTMPEnc, &maskEncDec, 0, dims[0]);
...@@ -273,20 +289,98 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
/* generate the mask on the source language side (for padding) */
_Sum(&maskEnc, padding3, &maskEnc);
encoding = MakeEncoder(inputEnc, maskEnc, isTraining);
//encoding.Dump(stderr, "encoding",10);
decoding = MakeDecoder(inputDec, encoding, maskDec, maskEncDec, isTraining);
//decoding.Dump(stderr, "decoding", 10);
outputLayer->Make(decoding, output);
delete[] dims;
delete[] dimsPadding;
DelTensorBuf(padding3);
DelTensorBuf(padding2);
}
/*
make the mask of the encoder
>> inputEnc - input of the encoder
>> paddingEnc - padding of the encoder input
>> maskEnc - mask of the encoder self-attention
*/
void T2TModel::MakeMTMaskEnc(XTensor &inputEnc, XTensor &paddingEnc, XTensor &maskEnc)
{
/* padding on the source side */
int * dimsPadding = new int[paddingEnc.order + 2];
for (int i = 0; i < paddingEnc.order - 1; i++)
dimsPadding[i] = paddingEnc.GetDim(i);
dimsPadding[paddingEnc.order - 1] = paddingEnc.GetDim(-1);
dimsPadding[paddingEnc.order] = paddingEnc.GetDim(-1);
XTensor * padding2 = NewTensorBuf(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
paddingEnc.denseRatio, paddingEnc.devID, paddingEnc.mem);
for (int i = 0; i < padding2->order; i++)
dimsPadding[i + 1] = padding2->GetDim(i);
dimsPadding[0] = nhead;
XTensor * padding3 = NewTensorBuf(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType,
paddingEnc.denseRatio, paddingEnc.devID, paddingEnc.mem);
/* mask of the padding */
_Unsqueeze(&paddingEnc, padding2, paddingEnc.order - 1, paddingEnc.GetDim(-1));
_Unsqueeze(padding2, padding3, 0, nhead);
_ScaleAndShiftMe(padding3, 1e9F, -1e9F);
InitTensor(&maskEnc, padding3);
maskEnc.SetZeroAll();
/* generate the mask on the source language side (for padding) */
_Sum(&maskEnc, padding3, &maskEnc);
DelTensorBuf(padding3);
DelTensorBuf(padding2);
delete[] dimsPadding;
}
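/*
Worked example for the padding mask above (illustration only, not part of the commit): assume a
source padding row [1, 1, 0] (two real tokens, one pad). _Unsqueeze copies it along a new length
dimension and then along a head dimension, and _ScaleAndShiftMe(padding3, 1e9F, -1e9F) maps
1 -> 0 and 0 -> -1e9, so every attention row sees [0, 0, -1e9]: padded positions get (near) zero
probability after the softmax while real positions are left untouched.
*/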
/*
make the mask of the decoder
>> inputEnc - input of the encoder
>> inputDec - input of the decoder
>> paddingEnc - padding of the encoder input
>> paddingDec - padding of the decoder input
>> maskDec - mask of the decoder self-attention
>> maskEncDec - mask of the decoder enc-dec attention
*/
void T2TModel::MakeMTMaskDec(XTensor &inputEnc, XTensor &inputDec,
XTensor &paddingEnc, XTensor &paddingDec,
XTensor &maskDec, XTensor &maskEncDec)
{
int len = inputDec.GetDim(inputDec.order - 1);
int * dims = new int[inputDec.order + 2];
for(int i = 0; i < inputDec.order; i++)
dims[i + 1] = inputDec.GetDim(i);
dims[0] = nhead;
dims[inputDec.order + 1] = len;
InitTensor(&maskDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingDec.devID, paddingDec.mem);
/* an upper triangular matrix where the cells of the upper triangular are set to -1e9.
this matrix can be used to prevent the attention to current or following words in
a given sequence. */
_SetDataLowTri(&maskDec, 1e9F, 0);
_ScaleAndShiftMe(&maskDec, 1.0F, -1e9F);
/* encoder-decoder mask that prevents the attention to padding dummy words */
dims[inputDec.order + 1] = inputEnc.GetDim(inputEnc.order - 1);
InitTensor(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingEnc.devID, paddingEnc.mem);
XTensor * maskEncDecTMPEnc = NewTensorBuf(paddingEnc.order + 1, dims + 1, paddingEnc.dataType,
paddingEnc.denseRatio, paddingEnc.devID, paddingEnc.mem);
XTensor * maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID, paddingEnc.mem);
_Unsqueeze(&paddingEnc, maskEncDecTMPEnc, paddingEnc.order - 1, paddingDec.GetDim(-1));
_ScaleAndShiftMe(maskEncDecTMPEnc, 1e9F, -1e9F);
_Unsqueeze(maskEncDecTMPEnc, &maskEncDec, 0, dims[0]);
DelTensorBuf(maskEncDecTMPDec);
DelTensorBuf(maskEncDecTMPEnc);
delete[] dims;
}
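/*
Worked example for the decoder self-attention mask above (illustration only, not part of the commit):
for a target length of 3, _SetDataLowTri(&maskDec, 1e9F, 0) fills the lower triangle (including the
diagonal) with 1e9 and the rest with 0; _ScaleAndShiftMe(&maskDec, 1.0F, -1e9F) then yields
0 -1e9 -1e9
0 0 -1e9
0 0 0
so position i can attend to positions j <= i only, which is the usual autoregressive constraint.
*/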
/*
get parameter matrices
>> list - the list that keeps the parameter matrices
......
...@@ -31,6 +31,9 @@
namespace transformer
{
/* a transformer model that keeps parameters of the encoder,
the decoder and the output layer (softmax). Also, it creates
the network used in transformer. */
class T2TModel
{
public:
...@@ -78,7 +81,21 @@ public:
void MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool isTraining);
/* make the network for machine translation (with the output softmax layer) */
void MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output,
XTensor &paddingEnc, XTensor &paddingDec, bool isTraining);
/* make the mask for training MT models */
void MakeMTMask(XTensor &inputEnc, XTensor &inputDec,
XTensor &paddingEnc, XTensor &paddingDec,
XTensor &maskEnc, XTensor &maskDec, XTensor &maskEncDec);
/* make the mask of the encoder */
void MakeMTMaskEnc(XTensor &inputEnc, XTensor &paddingEnc, XTensor &maskEnc);
/* make the mask of the decoder */
void MakeMTMaskDec(XTensor &inputEnc, XTensor &inputDec,
XTensor &paddingEnc, XTensor &paddingDec,
XTensor &maskDec, XTensor &maskEncDec);
/* get parameter matrices */
void GetParams(XList &list);
......
...@@ -93,8 +93,8 @@ void T2TOutput::Make(XTensor &input, XTensor &output)
{
XTensor &x = input;
//output = LogSoftmax(MMul(x, w), -1);
output = Softmax(MMul(x, w), -1);
}
}
...@@ -176,6 +176,9 @@ public:
/* indicates whether we intend to debug the net */
bool isDebugged;
/* bucket size */
int bucketSize;
public:
/* constructor */
T2TTrainer();
...@@ -205,10 +208,10 @@ public:
int LoadBatch(FILE * file, bool isLM,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold, XTensor * label,
int * seqs,
int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &ws, int &wCount,
int devID, XMem * mem,
bool isTraining);
...@@ -216,7 +219,7 @@ public:
int LoadBatchLM(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold, XTensor * label,
int * seqs, int vs, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem,
...@@ -226,9 +229,9 @@ public:
int LoadBatchMT(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold, XTensor * label,
int * seqs, int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &ws, int &wCount,
int devID, XMem * mem,
bool isTraining);
......
...@@ -36,8 +36,6 @@ int TransformerMain(int argc, const char ** argv)
{
if(argc == 0)
return 1;
fprintf(stderr, "%e\n", log(1e-8F));
char ** args = new char*[argc];
for(int i = 0; i < argc; i++){
...@@ -65,19 +63,22 @@ int TransformerMain(int argc, const char ** argv)
trainer.Init(argc, args);
T2TModel model;
model.InitModel(argc, args);
//if(strcmp(modelFN, ""))
// model.Read(modelFN);
/* learn model parameters */
if(strcmp(trainFN, ""))
trainer.Train(trainFN, testFN, strcmp(modelFN, "") ? modelFN : "checkpoint.model", &model);
/* save the final model */
//if(strcmp(modelFN, "") && strcmp(trainFN, ""))
//model.Dump(modelFN);
/* load the model if necessary */
//if(strcmp(modelFN, ""))
//model.Read(modelFN);
T2TTrainer tester;
tester.Init(argc, args);
......
...@@ -30,6 +30,7 @@
#include "XDevice.h"
#include "./test/Test.h"
#include "./core/CHeader.h"
#include "./loss/CrossEntropy.h"
//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
......
...@@ -47,15 +47,8 @@ extern const char * GetDataTypeName(TENSOR_DATA_TYPE type);
extern TENSOR_DATA_TYPE GetDataType(const char * typeName);
/* data conversion (for lower precision computation) */
inline unsigned short cal_complement(unsigned short sig, unsigned short tal);
unsigned short Float16Add(unsigned short a, unsigned short b);
unsigned short Float16Sub(unsigned short a, unsigned short b);
unsigned short Float16Mul(unsigned short a, unsigned short b);
unsigned short Float16Div(unsigned short a, unsigned short b);
unsigned short FloatToFloat16(float f);
float Float16ToFloat(unsigned short h);
unsigned short FloatbitsToHalfbits(float ff);
float HalfbitsToFloatbits(unsigned short h);
void ConvertDataType(int devID,
void * s, TENSOR_DATA_TYPE typeS,
void * t, TENSOR_DATA_TYPE typeT, int size);
......
...@@ -266,6 +266,10 @@ XDevManager::XDevManager()
{
Clear();
Init();
#ifndef USE_CPP11
fprintf(stderr, "Warning!!! c++ 11 is RECOMMENDED for compilation.\n");
#endif
}
/* de-constructor */
......
...@@ -43,13 +43,17 @@
/* the nts (NiuTrans.Tensor) namespace */
namespace nts {
#if (__cplusplus >= 201103L || _MSC_VER >= 1700)
#define USE_CPP11
#endif
#define _XINLINE_
//#define DOUBELPRICSION
#ifdef DOUBELPRICSION
#define DTYPE double
#define DTYPE_MIN (DTYPE)-1.79E+308
#else
#define DTYPE float
#define DTYPE_MIN (DTYPE)-3.40E+38
......
...@@ -307,6 +307,27 @@ void XLink::MakeLink(const XTensor * t1, const XTensor * t2, XTensor * h, int id
MakeLink(&list, h, id);
}
/*
create a hyperedge with three input tensors and an output tensor
>> t1 - a tail tensor
>> t2 - the second tail tensor
>> t3 - the third tail tensor
>> h - head tensor
>> id - id of the edge type
*/
void XLink::MakeLink(const XTensor * t1, const XTensor * t2, const XTensor * t3, XTensor * h, int id)
{
if (h == NULL)
return;
XList list(3);
list.Add(t1);
list.Add(t2);
list.Add(t3);
MakeLink(&list, h, id);
}
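/*
Usage sketch for the three-tail overload above (hypothetical call, for illustration only): an operator
with three inputs, such as the fused multiply-and-shift c = matmul(x, w) + b, could record its incoming
edge with
XLink::MakeLink(&x, &w, &b, &c, MATH_MULANDSHIFT);
so that backward propagation can later find x, w and b as income.tails[0..2] of c.
*/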
/*
create a hyper edge with a list of tensors and an output tensor
>> list - a list of input tensors
...@@ -509,6 +530,88 @@ void XLink::Replace(const XTensor * oldOne, XTensor * newOne)
}
}
/*
copy a node with another, i.e., we add the links to the new node
>> reference - the node to be copied
>> target - the new node
*/
void XLink::Copy(const XTensor * reference, XTensor * target)
{
if (reference == NULL || target == NULL)
return;
XLink &newIncome = target->income;
XLink &newOutgo = target->outgo;
XLink::ClearOutgoing(target);
XLink::ClearIncoming(target);
/* incoming nodes */
if (reference->income.typeID != 0) {
if (newIncome.tailNum < reference->income.tailNum) {
delete[] newIncome.tails;
newIncome.tails = new XTensor*[reference->income.tailNum];
}
newIncome.SetType(reference->income.typeID);
newIncome.head = target;
newIncome.tailNum = reference->income.tailNum;
memcpy(newIncome.tails, reference->income.tails, sizeof(XTensor*) * newIncome.tailNum);
int paraArraySize = reference->income.paramNum * reference->income.paramSize;
newIncome.params = new char[paraArraySize];
memcpy(newIncome.params, reference->income.params, paraArraySize);
newIncome.paramNum = reference->income.paramNum;
/* update the link to each child node */
for (int i = 0; i < newIncome.tailNum; i++) {
XTensor * child = newIncome.tails[i];
XLink &childOutgo = child->outgo;
bool hit = false;
for (int j = 0; j < childOutgo.tailNum; j++) {
if (childOutgo.tails[j] == reference) {
//childOutgo.tails[j] = target;
childOutgo.AddTail(target);
hit = true;
break;
}
}
if (childOutgo.tailNum > 0) {
CheckNTErrors(hit, "No proper node found in child.outgo edge!");
}
}
}
if (newOutgo.tailNum < reference->outgo.tailNum) {
delete[] newOutgo.tails;
newOutgo.tails = new XTensor*[reference->outgo.tailNum];
}
/* outgoing nodes */
newOutgo.head = target;
newOutgo.tailNum = reference->outgo.tailNum;
memcpy(newOutgo.tails, reference->outgo.tails, sizeof(XTensor*) * newOutgo.tailNum);
/* update the link to each parent node */
for (int i = 0; i < newOutgo.tailNum; i++) {
XTensor * parent = newOutgo.tails[i];
XLink &parentIncome = parent->income;
bool hit = false;
for (int j = 0; j < parentIncome.tailNum; j++) {
if (parentIncome.tails[j] == reference) {
//parentIncome.tails[j] = target;
parentIncome.AddTail(target);
hit = true;
}
}
if (parentIncome.tailNum > 0) {
CheckNTErrors(hit, "No proper node found in parent.income edge!");
}
}
}
/* /*
copy incoming edges of a given node copy incoming edges of a given node
>> reference - the node we copy from >> reference - the node we copy from
...@@ -634,6 +737,29 @@ void XLink::ShowNode(FILE * file, XTensor * node) ...@@ -634,6 +737,29 @@ void XLink::ShowNode(FILE * file, XTensor * node)
fprintf(stderr, "\n"); fprintf(stderr, "\n");
} }
/*
search for a node in a top-down manner by its name
>> top - the top most node
<< return - the node we found
*/
/*XTensor * XLink::SearchNode(XTensor * top, const char * name)
{
if(!strcmp(top->name, name))
return top;
XLink &incoming = top->income;
for(int i = 0; i < incoming.tailNum; i++){
XTensor * child = incoming.tails[i];
XTensor * hit = SearchNode(child, name);
if(hit != NULL)
return hit;
}
return NULL;
}*/
} // namespace nts(NiuTrans.Tensor)
...@@ -33,7 +33,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/* cross reference */
struct XTensor;
#define MAX_OP_NAME_LENGTH 64
#define PARAM_UNTI_SIZE 64
/*
...@@ -138,6 +138,10 @@ struct XLink
static
void MakeLink(const XTensor * t1, const XTensor * t2, XTensor * h, int id);
/* create a hyper edge with three input tensors and an output tensor */
static
void MakeLink(const XTensor * t1, const XTensor * t2, const XTensor * t3, XTensor * h, int id);
/* create a hyper edge with a list of input tensors and an output tensor */
static
void MakeLink(const XList * list, XTensor * h, int id);
...@@ -170,6 +174,10 @@ struct XLink
static
void Replace(const XTensor * oldOne, XTensor * newOne);
/* copy the links of a reference node to another node, i.e., the target node inherits the reference node's links */
static
void Copy(const XTensor * reference, XTensor * target);
/* copy links of a given node */ /* copy links of a given node */
static static
void CopyIncoming(const XTensor * reference, XTensor * target); void CopyIncoming(const XTensor * reference, XTensor * target);
...@@ -181,6 +189,10 @@ struct XLink ...@@ -181,6 +189,10 @@ struct XLink
/* show a node */ /* show a node */
static static
void ShowNode(FILE * file, XTensor * node); void ShowNode(FILE * file, XTensor * node);
/* search for a node in a top-down manner by its name */
//static
//XTensor * SearchNode(XTensor * top, const char * name);
}; };
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -77,6 +77,14 @@ const char * GetOPName(int type) ...@@ -77,6 +77,14 @@ const char * GetOPName(int type)
return "M_POWER"; return "M_POWER";
else if (type == MATH_SCALEANDSHIFT) else if (type == MATH_SCALEANDSHIFT)
return "M_SCALEANDSHIFT"; return "M_SCALEANDSHIFT";
else if (type == MATH_SCALE)
return "M_SCALE";
else if (type == MATH_DESCALE)
return "M_DESCALE";
else if (type == MATH_SHIFT)
return "M_SHIFT";
else if (type == MATH_MULANDSHIFT)
return "M_OPERATION";
else if (type == MATH_SIGN) else if (type == MATH_SIGN)
return "M_SIGN"; return "M_SIGN";
else if (type == MATH_SUB) else if (type == MATH_SUB)
...@@ -100,23 +108,25 @@ const char * GetOPName(int type) ...@@ -100,23 +108,25 @@ const char * GetOPName(int type)
else if (type == REDUCE_REDUCEVARIANCE) else if (type == REDUCE_REDUCEVARIANCE)
return "R_REDUCEVARIANCE"; return "R_REDUCEVARIANCE";
} }
else if ((type & DATA_BASE) != 0) { else if ((type & DATA_BASE) != 0) {
if (type == GETANDSET_CONVERTDATATYPE) if (type == GETANDSET_CONVERTDATATYPE)
return "G_CONVERTDATATYPE"; return "G_CONVERTDATATYPE";
else if (type == GETANDSET_INDEXTOONEHOT) else if (type == GETANDSET_INDEXTOONEHOT)
return "G_INDEXTOONEHOT"; return "G_INDEXTOONEHOT";
else if (type == GETANDSET_ONEHOTTOINDEX) else if (type == GETANDSET_ONEHOTTOINDEX)
return "G_ONEHOTTOINDEX"; return "G_ONEHOTTOINDEX";
} else if (type == GETANDSET_SELECT)
else if ((type & SHAPE_BASE) != 0) { return "G_SELECT";
if (type == GETANDSET_SELECT) }
return "G_SELECT"; else if ((type & SHAPE_BASE) != 0) {
else if (type == MOVEMENT_COPYINDEXED) if (type == MOVEMENT_COPYINDEXED)
return "M_COPYINDEXED"; return "M_COPYINDEXED";
else if (type == MOVEMENT_COPYVALUES) else if (type == MOVEMENT_COPYVALUES)
return "M_COPYVALUES"; return "M_COPYVALUES";
else if (type == MOVEMENT_GATHER) else if (type == MOVEMENT_GATHER)
return "M_GATHER"; return "M_GATHER";
else if (type == MOVEMENT_DROPOUTWITHINDEX)
return "M_DROPOUTWITHINDEX";
else if (type == SHAPE_CONCATENATE) else if (type == SHAPE_CONCATENATE)
return "S_CONCATENATE"; return "S_CONCATENATE";
else if (type == SHAPE_MERGE) else if (type == SHAPE_MERGE)
...@@ -158,6 +168,10 @@ const char * GetOPName(int type) ...@@ -158,6 +168,10 @@ const char * GetOPName(int type)
else if (type == FUNC_SOFTMAX) else if (type == FUNC_SOFTMAX)
return "F_SOFTMAX"; return "F_SOFTMAX";
} }
else if ((type & LOSS_BASE) != 0) {
if (type == LOSS_CROSSENTROPY)
return "L_CROSSENTROPY";
}
return "NULL"; return "NULL";
} }
......
...@@ -57,7 +57,12 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -57,7 +57,12 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_NORMALIZE MATH_NEGATE + 1 #define MATH_NORMALIZE MATH_NEGATE + 1
#define MATH_POWER MATH_NORMALIZE + 1 #define MATH_POWER MATH_NORMALIZE + 1
#define MATH_SCALEANDSHIFT MATH_POWER + 1 #define MATH_SCALEANDSHIFT MATH_POWER + 1
#define MATH_SIGN MATH_SCALEANDSHIFT + 1 #define MATH_MULANDSHIFT MATH_SCALEANDSHIFT + 1
#define MATH_SCALE MATH_MULANDSHIFT + 1
#define MATH_DESCALE MATH_SCALE + 1
#define MATH_SHIFT MATH_DESCALE + 1
#define MATH_MOD MATH_SHIFT + 1
#define MATH_SIGN MATH_MOD + 1
#define MATH_SUB MATH_SIGN + 1 #define MATH_SUB MATH_SIGN + 1
#define MATH_SUBDIM MATH_SUB + 1 #define MATH_SUBDIM MATH_SUB + 1
#define MATH_SUM MATH_SUBDIM + 1 #define MATH_SUM MATH_SUBDIM + 1
...@@ -84,8 +89,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -84,8 +89,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MOVEMENT_COPYINDEXED MOVEMENT + 1 #define MOVEMENT_COPYINDEXED MOVEMENT + 1
#define MOVEMENT_COPYVALUES MOVEMENT_COPYINDEXED + 1 #define MOVEMENT_COPYVALUES MOVEMENT_COPYINDEXED + 1
#define MOVEMENT_GATHER MOVEMENT_COPYVALUES + 1 #define MOVEMENT_GATHER MOVEMENT_COPYVALUES + 1
#define MOVEMENT_DROPOUTWITHINDEX MOVEMENT_GATHER + 1
#define SHAPE MOVEMENT_GATHER + 1 #define SHAPE MOVEMENT_DROPOUTWITHINDEX + 1
#define SHAPE_CONCATENATE SHAPE + 1 #define SHAPE_CONCATENATE SHAPE + 1
#define SHAPE_MERGE SHAPE_CONCATENATE + 1 #define SHAPE_MERGE SHAPE_CONCATENATE + 1
#define SHAPE_MERGE_LIST SHAPE_MERGE + 1 #define SHAPE_MERGE_LIST SHAPE_MERGE + 1
...@@ -111,6 +117,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -111,6 +117,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define FUNC_SIGMOID FUNC_RECTIFY + 1 #define FUNC_SIGMOID FUNC_RECTIFY + 1
#define FUNC_SOFTMAX FUNC_SIGMOID + 1 #define FUNC_SOFTMAX FUNC_SIGMOID + 1
#define LOSS_BASE FUNCTION_BASE * 2
#define LOSS_CROSSENTROPY LOSS_BASE + 1
/* get operator name */ /* get operator name */
const char * GetOPName(int type); const char * GetOPName(int type);
......
...@@ -48,7 +48,6 @@ ...@@ -48,7 +48,6 @@
#include "core/math/ScaleAndShift.h" #include "core/math/ScaleAndShift.h"
#include "core/getandset/SetData.h" #include "core/getandset/SetData.h"
#include "function/Identity.h" #include "function/Identity.h"
#include "core/getandset/ConvertDataType.h"
#ifdef USE_CUDA #ifdef USE_CUDA
...@@ -60,7 +59,6 @@ ...@@ -60,7 +59,6 @@
#include "core/utilities/FlushToMem.cuh" #include "core/utilities/FlushToMem.cuh"
#include "core/utilities/SetAscendingOrder.cuh" #include "core/utilities/SetAscendingOrder.cuh"
#endif #endif
/* the nts (NiuTrans.Tensor) namespace */ /* the nts (NiuTrans.Tensor) namespace */
...@@ -70,8 +68,6 @@ int tensorIDGlobal = 0; ...@@ -70,8 +68,6 @@ int tensorIDGlobal = 0;
MUTEX_HANDLE tensorMutex; MUTEX_HANDLE tensorMutex;
XTensor NULLTensor; XTensor NULLTensor;
#define RAND_MAX16 0xff
/* generate a tensor id */ /* generate a tensor id */
int MakeTensorID() int MakeTensorID()
{ {
...@@ -196,6 +192,36 @@ XTensor::XTensor(const XTensor &reference) ...@@ -196,6 +192,36 @@ XTensor::XTensor(const XTensor &reference)
isTmp = reference.isTmp; isTmp = reference.isTmp;
} }
/* copy constructor (with right value reference) */
#ifdef USE_CPP11
XTensor::XTensor(const XTensor &&reference)
{
Init();
SetDataPointer();
id = MakeTensorID();
ShallowCopy(reference);
data = NULL;
dataHost = NULL;
devID = reference.devID;
mem = reference.mem;
data = reference.data;
signature = reference.signature;
/* what we really want to do is "reference.data = NULL;"
As "reference" is constant, we cannot reset reference.data
here. So we save the ADDRESS of reference.data in
reference.dataP, and do this work by updating "*reference.dataP".
This is VERY tricky and might not be the best solution :) */
*reference.dataP = NULL;
XLink::Replace(&reference, this);
isInit = true;
isTmp = reference.isTmp;
}
#endif
/* de-constructor */ /* de-constructor */
XTensor::~XTensor() XTensor::~XTensor()
{ {
...@@ -215,7 +241,6 @@ XTensor::~XTensor() ...@@ -215,7 +241,6 @@ XTensor::~XTensor()
XLink::Replace(this, newTensor); XLink::Replace(this, newTensor);
} }
XLink::ClearOutgoing(this); XLink::ClearOutgoing(this);
XLink::ClearIncoming(this); XLink::ClearIncoming(this);
...@@ -298,7 +323,7 @@ void XTensor::ShallowCopy(const XTensor &tensor) ...@@ -298,7 +323,7 @@ void XTensor::ShallowCopy(const XTensor &tensor)
/* overloading of the equal-sign */ /* overloading of the equal-sign */
XTensor& XTensor::operator= (const XTensor& tensor) XTensor& XTensor::operator= (const XTensor& tensor)
{ {
/* we must make a hard copy of the tensor if it is the input /* we must make a hard copy of the tensor if it is the input
of another node. */ of another node. */
if(outgo.tailNum > 0){ if(outgo.tailNum > 0){
...@@ -373,50 +398,97 @@ XTensor& XTensor::operator= (const XTensor& tensor) ...@@ -373,50 +398,97 @@ XTensor& XTensor::operator= (const XTensor& tensor)
return *this; return *this;
} }
/* overloading of the equal-sign (with right value reference) */
XTensor& XTensor::operator= (const XTensor&& tensor)
{
/* we must make a hard copy of the tensor if it is the input
of another node. */
if(outgo.tailNum > 0){
int dims[MAX_TENSOR_DIM_NUM];
memcpy(dims, dimSize, order * sizeof(int));
dims[0] = -dims[0];
XTensor * newTensor = new XTensor(order, dims, dataType, denseRatio, devID, mem);
newTensor->SetTMPFlag();
newTensor->data = data;
newTensor->dataHost = dataHost;
newTensor->signature = tensor.signature;
XLink::Replace(this, newTensor);
XLink::ClearOutgoing(this);
XLink::ClearIncoming(this);
newTensor->ShallowCopy(this);
data = NULL;
dataHost = NULL;
}
DestroyData();
ShallowCopy(tensor);
isInit = true;
devID = tensor.devID;
mem = tensor.mem;
data = tensor.data;
signature = tensor.signature;
/* what we really want to do is "tensor.data = NULL;"
As "tensor" is constant, we cannot reset tensor.data
here. So we save the ADDRESS of tensor.data in
tensor.dataP, and do this work by updating "*tensor.dataP".
This is VERY tricky and might not be the best solution :) */
*tensor.dataP = NULL;
XLink::Replace(&tensor, this);
return *this;
}
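/* a minimal sketch of what the right-value overloads buy us (assuming
   USE_CPP11 is defined so they are compiled in): assigning a temporary hands
   its data buffer over instead of triggering a deep copy */
void MoveAssignSketch(const XTensor &a, const XTensor &b)
{
    XTensor c;
    c = Sum(a, b);  /* the temporary's data pointer is taken over by c and
                       reset through dataP, so the buffer is not freed twice */
}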
/* overloading of the plus-sign */ /* overloading of the plus-sign */
XTensor XTensor::operator+ (const XTensor& tensor) XTensor XTensor::operator+ (const XTensor& tensor) const
{ {
return Sum(*this, tensor); return Sum(*this, tensor);
} }
/* overloading of the plus-sign */ /* overloading of the plus-sign */
XTensor XTensor::operator+ (const DTYPE shift) XTensor XTensor::operator+ (const DTYPE shift) const
{ {
return ScaleAndShift(*this, 1, shift); return ScaleAndShift(*this, 1, shift);
} }
/* overloading of the multiply-sign */ /* overloading of the multiply-sign */
XTensor XTensor::operator* (const XTensor& tensor) XTensor XTensor::operator* (const XTensor& tensor) const
{ {
return Multiply(*this, tensor); return Multiply(*this, tensor);
} }
/* overloading of the multiply-sign */ /* overloading of the multiply-sign */
XTensor XTensor::operator* (const DTYPE scale) XTensor XTensor::operator* (const DTYPE scale) const
{ {
return ScaleAndShift(*this, scale, 0); return ScaleAndShift(*this, scale, 0);
} }
/* overloading of the minus-sign */ /* overloading of the minus-sign */
XTensor XTensor::operator- (const XTensor& tensor) XTensor XTensor::operator- (const XTensor& tensor) const
{ {
return Sub(*this, tensor); return Sub(*this, tensor);
} }
/* overloading of the minus-sign */ /* overloading of the minus-sign */
XTensor XTensor::operator- (const DTYPE shift) XTensor XTensor::operator- (const DTYPE shift) const
{ {
return ScaleAndShift(*this, 1, -shift); return ScaleAndShift(*this, 1, -shift);
} }
/* overloading of the division-sign */ /* overloading of the division-sign */
XTensor XTensor::operator/ (const XTensor& tensor) XTensor XTensor::operator/ (const XTensor& tensor) const
{ {
return Div(*this, tensor); return Div(*this, tensor);
} }
/* overloading of the division-sign */ /* overloading of the division-sign */
XTensor XTensor::operator/ (const DTYPE scale) XTensor XTensor::operator/ (const DTYPE scale) const
{ {
return ScaleAndShift(*this, (DTYPE)1/scale, 0); return ScaleAndShift(*this, (DTYPE)1/scale, 0);
} }
...@@ -426,7 +498,7 @@ linear transformation b = a * \scale + \shift ...@@ -426,7 +498,7 @@ linear transformation b = a * \scale + \shift
>> scale - the slope >> scale - the slope
>> shift - the intercept >> shift - the intercept
*/ */
XTensor XTensor::Lin(DTYPE scale, DTYPE shift) XTensor XTensor::Lin(DTYPE scale, DTYPE shift) const
{ {
return Linear(*this, scale, shift); return Linear(*this, scale, shift);
} }
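/* since the arithmetic operators are now const-qualified, they can be applied
   to const references directly; a small illustrative sketch (a and b are
   assumed to have the same shape): */
XTensor LinComboSketch(const XTensor &a, const XTensor &b)
{
    return a * 2.0F + b.Lin(0.5F, 1.0F) - 1.0F;
}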
...@@ -462,6 +534,37 @@ bool XTensor::IsSameShaped(const XTensor * a, const XTensor * b) ...@@ -462,6 +534,37 @@ bool XTensor::IsSameShaped(const XTensor * a, const XTensor * b)
return true; return true;
} }
bool XTensor::IsReduceShaped(const XTensor * a, const XTensor * b, int dim)
{
if (a == NULL || b == NULL)
return false;
if ((a->order - 1) != b->order)
return false;
for (int i = 0; i < b->order; i++) {
if (i < dim) {
if (a->dimSize[i] != b->dimSize[i])
return false;
}
else if (i >= dim) {
if (a->dimSize[i+1] != b->dimSize[i])
return false;
}
}
if(a->dataType != b->dataType)
return false;
if(a->denseRatio != b->denseRatio)
return false;
if(a->isSparse != b->isSparse)
return false;
return true;
}
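/* a quick illustration of IsReduceShaped: b must have exactly the shape of a
   with dimension "dim" removed, e.g. a: (4, 5, 6), b: (4, 6), dim = 1 */
void ReduceShapeCheckSketch(const XTensor * a, const XTensor * b)
{
    CheckNTErrors(XTensor::IsReduceShaped(a, b, 1),
                  "b is not a reduction of a along dimension 1!");
}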
/* /*
judge whether the three matrices are in the same type and size judge whether the three matrices are in the same type and size
>> a - input tensor >> a - input tensor
...@@ -712,15 +815,6 @@ void XTensor::SetDataRand(DTYPE lower, DTYPE upper) ...@@ -712,15 +815,6 @@ void XTensor::SetDataRand(DTYPE lower, DTYPE upper)
d = new double[unitNum]; d = new double[unitNum];
for (int i = 0; i < unitNum; i++) { for (int i = 0; i < unitNum; i++) {
*((double*)d + i) = lower + variance * rand() / RAND_MAX; *((double*)d + i) = lower + variance * rand() / RAND_MAX;
}
}
else if (dataType == X_FLOAT16) {
unsigned short random;
unsigned short ulower = FloatToFloat16(lower), uvariance = FloatToFloat16(variance);
d = new unsigned short[unitNum];
for (int i = 0; i < unitNum; i++) {
random = FloatToFloat16(rand() % RAND_MAX16 * 1.0 / RAND_MAX16);
*((unsigned short*)d + i) = Float16Add(ulower, Float16Mul(uvariance, random));
} }
} }
else { else {
...@@ -1634,17 +1728,6 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg, ...@@ -1634,17 +1728,6 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg,
fprintf(file, " %d", f); fprintf(file, " %d", f);
} }
} }
else if (dataType == X_FLOAT16) {
int end = MIN(n > 0 ? beg + n : beg + unitNum, unitNum);
for (int i = beg; i < end; i++) {
unsigned short f = ((unsigned short*)d)[i];
if (i == beg)
fprintf(file, "%u", f);
else
fprintf(file, " %u", f);
}
}
else else
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
} }
...@@ -1681,22 +1764,9 @@ dump data to a file ...@@ -1681,22 +1764,9 @@ dump data to a file
*/ */
void XTensor::Dump(const XTensor * tensor, FILE * file, const char * label, const int n, const int beg, const int verbose) void XTensor::Dump(const XTensor * tensor, FILE * file, const char * label, const int n, const int beg, const int verbose)
{ {
if (tensor->dataType == X_FLOAT) XTensor a(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, tensor->devID, tensor->mem);
{ _CopyValues(tensor, &a);
XTensor a(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, tensor->devID, tensor->mem); a.Dump(file, label, n, beg, verbose);
_CopyValues(tensor, &a);
a.Dump(file, label, n, beg, verbose);
}
else if (tensor->dataType == X_FLOAT16)
{
XTensor a(tensor->order, tensor->dimSize, X_FLOAT, tensor->denseRatio, tensor->devID, tensor->mem);
_ConvertDataType(tensor, &a);
a.Dump(file, label, n, beg, verbose);
}
else
{
ShowNTErrors("TO DO!");
}
} }
/* /*
...@@ -1774,14 +1844,6 @@ void XTensor::Read(FILE * file, const char * label) ...@@ -1774,14 +1844,6 @@ void XTensor::Read(FILE * file, const char * label)
} }
} }
} }
else if (dataType == X_FLOAT16) {
for (int i = 0; i < unitNum; i++) {
unsigned short * f = ((unsigned short*)data) + i;
if (fscanf(file, "%u", f) < 1) {
ShowNTErrors("Incorrect tensor format!");
}
}
}
else { else {
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
} }
......
...@@ -189,6 +189,11 @@ public: ...@@ -189,6 +189,11 @@ public:
/* copy constructor */ /* copy constructor */
XTensor(const XTensor &reference); XTensor(const XTensor &reference);
/* copy constructor (with right value reference) */
#ifdef USE_CPP11
XTensor(const XTensor &&reference);
#endif
/* de-constructor */ /* de-constructor */
~XTensor(); ~XTensor();
...@@ -204,32 +209,37 @@ public: ...@@ -204,32 +209,37 @@ public:
/* overloading of the equal-sign */ /* overloading of the equal-sign */
XTensor& operator= (const XTensor &tensor); XTensor& operator= (const XTensor &tensor);
/* overloading of the equal-sign (with right value reference) */
#ifdef USE_CPP11
XTensor& operator= (const XTensor &&tensor);
#endif
/* overloading of the plus-sign */ /* overloading of the plus-sign */
XTensor operator+ (const XTensor &tensor); XTensor operator+ (const XTensor &tensor) const;
/* overloading of the plus-sign */ /* overloading of the plus-sign */
XTensor operator+ (const DTYPE shift); XTensor operator+ (const DTYPE shift) const;
/* overloading of the multiply-sign */ /* overloading of the multiply-sign */
XTensor operator* (const XTensor &tensor); XTensor operator* (const XTensor &tensor) const;
/* overloading of the multiply-sign */ /* overloading of the multiply-sign */
XTensor operator* (const DTYPE scale); XTensor operator* (const DTYPE scale) const;
/* overloading of the minus-sign */ /* overloading of the minus-sign */
XTensor operator- (const XTensor &tensor); XTensor operator- (const XTensor &tensor) const;
/* overloading of the minus-sign */ /* overloading of the minus-sign */
XTensor operator- (const DTYPE shift); XTensor operator- (const DTYPE shift) const;
/* overloading of the division-sign */ /* overloading of the division-sign */
XTensor operator/ (const XTensor &tensor); XTensor operator/ (const XTensor &tensor) const;
/* overloading of the division-sign */ /* overloading of the division-sign */
XTensor operator/ (const DTYPE scale); XTensor operator/ (const DTYPE scale) const;
/* linear transformation */ /* linear transformation */
XTensor Lin(DTYPE scale, DTYPE shift = 0); XTensor Lin(DTYPE scale, DTYPE shift = 0) const;
/* judge whether the two matrices are in the same type and size */ /* judge whether the two matrices are in the same type and size */
static static
...@@ -239,6 +249,10 @@ public: ...@@ -239,6 +249,10 @@ public:
static static
bool IsSameShaped(const XTensor * a, const XTensor * b, const XTensor * c); bool IsSameShaped(const XTensor * a, const XTensor * b, const XTensor * c);
/* judge whether b is the reduced shape of a ?? */
static
bool IsReduceShaped(const XTensor * a, const XTensor * b, int dim);
/* set the size of each dimension */ /* set the size of each dimension */
void SetDim(int * myDimSize); void SetDim(int * myDimSize);
......
...@@ -28,6 +28,7 @@ ...@@ -28,6 +28,7 @@
#include "arithmetic/Div.h" #include "arithmetic/Div.h"
#include "arithmetic/DivDim.h" #include "arithmetic/DivDim.h"
#include "arithmetic/Mask.h"
#include "arithmetic/MatrixMul.h" #include "arithmetic/MatrixMul.h"
#include "arithmetic/MatrixMul2D.h" #include "arithmetic/MatrixMul2D.h"
#include "arithmetic/MatrixMul2DMultiTheading.h" #include "arithmetic/MatrixMul2DMultiTheading.h"
...@@ -44,12 +45,14 @@ ...@@ -44,12 +45,14 @@
#include "arithmetic/SumByColumnVT.h" #include "arithmetic/SumByColumnVT.h"
#include "arithmetic/SumDim.h" #include "arithmetic/SumDim.h"
#include "arithmetic/XTensorBLAS.h" #include "arithmetic/XTensorBLAS.h"
#include "arithmetic/MulAndShift.h"
#include "getandset/ConvertDataType.h" #include "getandset/ConvertDataType.h"
#include "getandset/OnehotAndIndex.h" #include "getandset/OnehotAndIndex.h"
#include "getandset/Select.h" #include "getandset/Select.h"
#include "getandset/SetData.h" #include "getandset/SetData.h"
#include "math/Binary.h"
#include "math/Clip.h" #include "math/Clip.h"
#include "math/Compare.h" #include "math/Compare.h"
#include "math/Normalize.h" #include "math/Normalize.h"
......
...@@ -214,4 +214,55 @@ XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim) ...@@ -214,4 +214,55 @@ XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim)
return c; return c;
} }
/*
element-wise division of two tensors
c(i) = a(i)/b(i) + \alpha * c(i)
where i is the index of the item
>> a - tensor a
>> b - tensor b
>> c - result tensor
>> alpha - the coefficient
>> leadingDim - the dimension along which we perform broadcasting
>> requireLink - whether to add the operation to the computation network
*/
void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadingDim, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
int n = GetDivDimIndex(a, b);
if (n == -1) {
CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
/* call _Div function */
_Div(&a, &b, &c, 0, leadingDim);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIV);
XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHeadInt(&c, leadingDim);
}
}
else if (n >= 0 && n < a.order) {
/* call _DivDim function */
_DivDim(&a, &b, &c, n, alpha);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
}
}
else {
ShowNTErrors("Something is wrong!");
}
}
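/* a minimal usage sketch of the in-place interface above (the shapes and the
   InitTensor2D call are illustrative; InitTensor2D is assumed to be available
   from XTensor.h): */
void DivSketch()
{
    XTensor a, b, c;
    InitTensor2D(&a, 2, 3);
    InitTensor2D(&b, 2, 3);
    a.SetDataRand(-1.0F, 1.0F);
    b.SetDataRand(1.0F, 2.0F);
    /* c(i) = a(i)/b(i); the link is recorded only when the last flag is true */
    Div(a, b, c, 0.0F, 0, true);
}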
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
...@@ -43,15 +43,6 @@ void KernelDivElementWise(DTYPE * a, DTYPE * b, DTYPE * c, int size) ...@@ -43,15 +43,6 @@ void KernelDivElementWise(DTYPE * a, DTYPE * b, DTYPE * c, int size)
c[i] = a[i] / b[i]; c[i] = a[i] / b[i];
} }
__global__
void KernelDivElementWiseHalf(__half * a, __half * b, __half * c, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
c[i] = a[i] / b[i];
}
/* /*
division of data arrays in a element-wise manner c(i) = a(i)/b(i) + \alpha*c(i) division of data arrays in a element-wise manner c(i) = a(i)/b(i) + \alpha*c(i)
>> a - data array a >> a - data array a
...@@ -69,18 +60,6 @@ void KernelDivElementWiseV2(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE alp ...@@ -69,18 +60,6 @@ void KernelDivElementWiseV2(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE alp
c[i] = a[i] / b[i] + alpha * c[i]; c[i] = a[i] / b[i] + alpha * c[i];
} }
__global__
void KernelDivElementWiseV2Half(__half * a, __half * b, __half * c, int size, DTYPE alpha)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
__half alpha1 = __float2half(alpha);
if (i < size)
c[i] = a[i] / b[i] + alpha1 * c[i];
#endif
}
/* /*
division of two tensors in a element-wise manner c(i) = a(i)/b(i). division of two tensors in a element-wise manner c(i) = a(i)/b(i).
Note that a and b can be of different sizes here, i.e., Note that a and b can be of different sizes here, i.e.,
...@@ -201,25 +180,6 @@ void _CudaDiv(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, in ...@@ -201,25 +180,6 @@ void _CudaDiv(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, in
} }
} }
} }
else if (a->dataType == X_FLOAT16 && b->dataType == X_FLOAT16) {
int cudaGridSize[3];
int cudaBlockSize[3];
if (a->unitNum == c->unitNum && b->unitNum == c->unitNum) {
GDevs.GetCudaThread(a->devID, c->unitNum, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[0]), threads(cudaBlockSize[0]);
if (alpha == 0)
KernelDivElementWiseHalf << <blocks, threads >> >((__half*)a->data, (__half*)b->data, (__half*)c->data, c->unitNum);
else
KernelDivElementWiseV2Half << <blocks, threads >> >((__half*)a->data, (__half*)b->data, (__half*)c->data, c->unitNum, alpha);
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
else { else {
// TODO!! // TODO!!
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
......
...@@ -49,6 +49,13 @@ where i is the index of the element ...@@ -49,6 +49,13 @@ where i is the index of the element
*/ */
XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha = 0.0, int leadingDim = 0); XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha = 0.0, int leadingDim = 0);
/*
element-wise division of two tensors:
c(i) = a(i)/b(i) + \alpha * c(i)
where i is the index of the element
*/
void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha = 0.0, int leadingDim = 0, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __DIV_H__ #endif // __DIV_H__
\ No newline at end of file
...@@ -162,5 +162,36 @@ XTensor DivDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha) ...@@ -162,5 +162,36 @@ XTensor DivDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha)
return c; return c;
} }
/*
tensor division
c = a / b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - the result tensor
>> n - the dimension index
>> alpha - the scaling factor
>> requireLink - whether to add the operation to the computation network
*/
void DivDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE alpha, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
/* call _DivDim function */
_DivDim(&a, &b, &c, n, alpha);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
}
}
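/* an illustrative sketch of the broadcasting case: divide every row of a
   (shape (3, 4)) by a length-4 vector b along dimension 1 */
void DivDimSketch(const XTensor &a, const XTensor &b)
{
    XTensor c;
    DivDim(a, b, c, 1, (DTYPE)0.0, false);
}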
} }
...@@ -52,6 +52,14 @@ i.e., a is divided with b by broadcasting ...@@ -52,6 +52,14 @@ i.e., a is divided with b by broadcasting
we make a new tensor c to keep the result and return it we make a new tensor c to keep the result and return it
*/ */
XTensor DivDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha = (DTYPE)0.0); XTensor DivDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha = (DTYPE)0.0);
/*
tensor division of two tensors:
c = a / b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
*/
void DivDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE alpha = (DTYPE)0.0, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2019-04-24
* I'll attend several conferences and workshops in the following weeks -
* busy days :(
*/
#include "../../XTensor.h"
#include "../../XName.h"
#include "../../XUtility.h"
#include "Mask.h"
#include "Mask.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
mask entries of a given tensor:
c(i) = a(i) if mask(i) is non-zero
c(i) = alpha if mask(i) = 0
where i is the index of the element
*/
void _Mask(const XTensor * a, const XTensor * mask, XTensor * c, DTYPE alpha)
{
CheckNTErrors(a && mask && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == mask->unitNum && a->unitNum == c->unitNum,
"Unmatched tensors in masking!");
CheckNTErrors(mask->dataType == X_INT, "The mask tensor must be in X_INT!");
//CheckNTErrors(a->dataType == mask->dataType && a->dataType == c->dataType,
// "Unmatched tensors in addition!");
if (a->devID >= 0 || mask->devID >= 0 || c->devID >= 0) {
#ifdef USE_CUDA
if (a == c) {
int P2PAccesible = 0;
#ifdef CUDA_UVA
cudaDeviceCanAccessPeer(&P2PAccesible, a->devID, mask->devID);
#endif
if ((a->devID < 0 && mask->devID >= 0) ||
(a->devID >= 0 && mask->devID < 0) ||
(a->devID >= 0 && mask->devID >= 0 && a->devID != mask->devID && !P2PAccesible))
{
ShowNTErrors("Cannot run this method on multiple devices simultaneously!");
}
else
_CudaMask(a, mask, c, alpha);
}
else
_CudaMask(a, mask, c, alpha);
#endif
}
else {
if (!a->isSparse && !mask->isSparse) {
CheckNTErrors(!c->isSparse, "Illegal use of sparse tensor in masking!");
if (a->dataType == DEFAULT_DTYPE &&
mask->dataType == X_INT &&
c->dataType == DEFAULT_DTYPE)
{
DTYPE * ap = (DTYPE*)a->data;
int * maskp = (int*)mask->data;
DTYPE * cp = (DTYPE*)c->data;
/* unrolling */
int num = a->unitNum;
if (num % 2 == 0) {
for (int i = 0; i < num; i += 2) {
if (maskp[i] == 0) {
cp[i] = alpha;
}
else {
cp[i] = ap[i];
}
if (maskp[i + 1] == 0) {
cp[i + 1] = alpha;
}
else {
cp[i + 1] = ap[i + 1];
}
}
}
else {
for (int i = 0; i < num; i++) {
if (maskp[i] == 0) {
cp[i] = alpha;
}
else {
cp[i] = ap[i];
}
}
}
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
}
/*
mask entries of a given tensor (on site):
a(i) = a(i) if mask(i) is non-zero
a(i) = alpha if mask(i) = 0
where i is the index of the element
*/
void _MaskMe(XTensor * a, const XTensor * mask, DTYPE alpha)
{
_Mask(a, mask, a, alpha);
}
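/* a small usage sketch (names are illustrative): entries of "a" whose mask
   value is 0 are overwritten with a large negative constant, e.g. before a
   softmax over padded positions */
void MaskPaddingSketch(XTensor * a, XTensor * padMask)
{
    /* padMask must be an X_INT tensor with the same number of entries as a */
    _MaskMe(a, padMask, (DTYPE)-1e9F);
}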
/*
mask entries of a given tensor (return an XTensor structure):
a(i) = a(i) if mask(i) is non-zero
a(i) = alpha if mask(i) = 0
where i is the index of the element
*/
XTensor Mask(const XTensor &a, const XTensor &mask, DTYPE alpha)
{
XTensor c(&a);
c.SetTMPFlag();
/* call _Sum function */
_Mask(&a, &mask, &c, alpha);
/* tensor connections */
//XLink::MakeLink(&a, &mask, &c, MATH_SUM);
//XLink::AddParamToHead(&c, alpha);
// TODO!!
ShowNTErrors("TODO!");
return c;
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2019-04-24
* I'll attend several conferences and workshops in the following weeks -
* busy days :(
*/
#include "../../XDevice.h"
#include "../../XUtility.h"
#include "Sub.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
mask entries of a given tensor (CUDA Kernel)
c(i) = a(i) if mask(i) is non-zero
c(i) = alpha if mask(i) = 0
>> a - the input data array
>> mask - the mask data array (in X_INT)
>> c - where we put the masked a
>> size - the size of a/mask/c
>> alpha - the value used for masked-out entries
*/
__global__
void KernelMASK(DTYPE * a, int * mask, DTYPE * c, int size, DTYPE alpha)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
if (mask[i] == 0) {
c[i] = alpha;
}
else {
c[i] = a[i];
}
}
}
/*
mask entries of a given tensor (cuda version)
>> a - a tensor
>> mask - mask tensor
>> c - where we put masked a
>> alpha - the value used for masked-out entries
*/
void _CudaMask(const XTensor * a, const XTensor * mask, XTensor * c, DTYPE alpha)
{
CheckNTErrors(a && mask && c, "Empty tensor input!");
CheckNTErrors((a->unitNum == mask->unitNum && a->unitNum == c->unitNum),
"Unmatched tensors in masking!");
CheckNTErrors(mask->dataType == X_INT, "The mask tensor must be in X_INT!");
//CheckNTErrors((a->dataType == mask->dataType && a->dataType == c->dataType),
// "Unmatched tensors in addition!");
CheckNTErrors((a->devID == mask->devID && a->devID == c->devID),
"The tensors must be on the same!");
int devIDBackup = XDevice::GetGPUDevice();
XDevice::SetGPUDevice(a->devID);
if (!a->isSparse && !mask->isSparse) {
CheckNTErrors(!c->isSparse, "Illegal use of sparse matrix in masking!");
if (a->dataType == DEFAULT_DTYPE &&
mask->dataType == X_INT &&
c->dataType == DEFAULT_DTYPE)
{
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
KernelMASK << <blocks, threads >> >((DTYPE*)a->data, (int *)mask->data, (DTYPE*)c->data, a->unitNum, alpha);
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
XDevice::SetGPUDevice(devIDBackup);
}
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2019-04-24
* I'll attend several conferences and workshops in the following weeks -
* busy days :(
*/
#ifndef __MASK_CUH__
#define __MASK_CUH__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* mask entries of a given tensor (cuda version) */
void _CudaMask(const XTensor * a, const XTensor * mask, XTensor * c = NULL, DTYPE alpha = (DTYPE)1.0);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif // __MASK_CUH__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2019-04-24
* I'll attend several conferences and workshops in the following weeks -
* busy days :(
*/
#ifndef __MASK_H__
#define __MASK_H__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
mask entries of a given tensor:
c(i) = a(i) if mask(i) is non-zero
c(i) = alpha if mask(i) = 0
where i is the index of the element
*/
void _Mask(const XTensor * a, const XTensor * mask, XTensor * c, DTYPE alpha);
/*
mask entries of a given tensor (on site):
a(i) = a(i) if mask(i) is non-zero
a(i) = alpha if mask(i) = 0
where i is the index of the element
*/
void _MaskMe(XTensor * a, const XTensor * mask, DTYPE alpha);
/*
mask entries of a given tensor (return an XTensor structure):
a(i) = a(i) if mask(i) is non-zero
a(i) = alpha if mask(i) = 0
where i is the index of the element
*/
XTensor Mask(const XTensor &a, const XTensor &mask, DTYPE alpha = 0.0);
} // namespace nts(NiuTrans.Tensor)
#endif // __MASK_H__
...@@ -59,10 +59,17 @@ Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x ...@@ -59,10 +59,17 @@ Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x
XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB, XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL); DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
void MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
XTensor &c, DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL, bool requireLink = false);
/* matrix multiplication with no transposition c = a * b * alpha*/ /* matrix multiplication with no transposition c = a * b * alpha*/
XTensor MatrixMul(const XTensor &a, const XTensor &b, XTensor MatrixMul(const XTensor &a, const XTensor &b,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL); DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
void MatrixMul(const XTensor &a, const XTensor &b, XTensor &c,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __MATRIXMUL_H__ #endif // __MATRIXMUL_H__
\ No newline at end of file
...@@ -50,7 +50,7 @@ void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -50,7 +50,7 @@ void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
XPRunner * parallelRunner, XStream * stream) XPRunner * parallelRunner, XStream * stream)
{ {
CheckNTErrors((a && b && c), "Empty input tensors!"); CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((a->dataType == b->dataType), "Input tensors should have the same data type!"); CheckNTErrors((a->dataType == b->dataType), "Input tensors should have the same data type!");
CheckNTErrors((a->order == 2 && b->order == 2 && c->order == 2), CheckNTErrors((a->order == 2 && b->order == 2 && c->order == 2),
"Input tensors must have a order = 2!"); "Input tensors must have a order = 2!");
...@@ -78,11 +78,19 @@ void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -78,11 +78,19 @@ void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
if (!a->isSparse && !b->isSparse) { if (!a->isSparse && !b->isSparse) {
CheckNTErrors(!c->isSparse, "Illegal use of sparse matrix in multiplication!"); CheckNTErrors(!c->isSparse, "Illegal use of sparse matrix in multiplication!");
if (useBLAS) if (a->dataType == DEFAULT_DTYPE &&
_MatrixMULCPU(a, transposedA, b, transposedB, c, alpha, beta); b->dataType == DEFAULT_DTYPE &&
else c->dataType == DEFAULT_DTYPE)
_MatrixMul2DParallel(a, transposedA, b, transposedB, c, alpha, beta, parallelRunner); {
if (useBLAS)
_MatrixMULCPU(a, transposedA, b, transposedB, c, alpha, beta);
else
_MatrixMul2DParallel(a, transposedA, b, transposedB, c, alpha, beta, parallelRunner);
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
} }
/* a dense matrix multiply a sparse matrix */ /* a dense matrix multiply a sparse matrix */
else if (!a->isSparse && b->isSparse) { else if (!a->isSparse && b->isSparse) {
......
...@@ -156,12 +156,18 @@ void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -156,12 +156,18 @@ void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
if (stream != NULL) if (stream != NULL)
cublasSetStream(*handle, stream->stream); cublasSetStream(*handle, stream->stream);
_CudaBLASMatrixMUL(handle, a->data, transposedA, a->dataType, if (a->dataType == X_FLOAT && b->dataType == X_FLOAT && c->dataType == X_FLOAT) {
b->data, transposedB, a->dataType, c->data, c->dataType, _CudaBLASMatrixMUL(handle, a->data, transposedA, a->dataType,
a->dimSize[0], a->dimSize[1], b->data, transposedB, a->dataType, c->data, c->dataType,
b->dimSize[0], b->dimSize[1], a->dimSize[0], a->dimSize[1],
c->dimSize[0], c->dimSize[1], b->dimSize[0], b->dimSize[1],
alpha, beta); c->dimSize[0], c->dimSize[1],
alpha, beta);
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
} }
/* a dense matrix multiply a sparse matrix */ /* a dense matrix multiply a sparse matrix */
else if (!a->isSparse && b->isSparse) { else if (!a->isSparse && b->isSparse) {
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: JIANG Yufan (email: jiangyufan2018@outlook.com) 2019-02-27
*/
#include "../../XTensor.h"
#include "../../XDevice.h"
#include "../../XName.h"
#include "MulAndShift.h"
#include "MatrixMul.h"
#include "Sum.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
return the dimension index if the sum can be performed as SumDim (see SumDim.h for details)
>> a - a tensor
>> b - another tensor for sum
*/
int GetSumIndex(const XTensor &a, const XTensor &b)
{
if (a.order < b.order)
return -1;
if (XTensor::IsSameShaped(&a, &b))
return -1;
int hitCount = 0;
int hitDim = -1;
for (int i = 0; i < b.order; i++) {
if (b.dimSize[b.order - 1 - i] == 1)
continue;
else if (b.dimSize[b.order - 1 - i] == a.dimSize[a.order - 1 - i]) {
hitCount++;
hitDim = a.order - b.order + i;
}
}
if (hitCount == 1)
return hitDim;
else
return -1;
}
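/* a quick sanity sketch (shapes illustrative): for a product of shape
   (8, 32, 512) and a bias of shape (512), the hit dimension is the last one,
   so the bias can be added with _SumDim rather than a full _Sum */
void GetSumIndexSketch(const XTensor &prod, const XTensor &bias)
{
    int n = GetSumIndex(prod, bias);
    CheckNTErrors(n == prod.order - 1, "Unexpected broadcasting dimension!");
}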
/*
the MulAndShift operation: c = x * w + b
>> x - tensor x
>> w - tensor w
>> b - tensor b
>> alpha - the scaling factor of the product x * w
>> parallelRunner - parallel processing module
<< return - the result of x * w + b
*/
XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
DTYPE alpha, XPRunner * parallelRunner)
{
CheckNTErrors(x.dataType == w.dataType, "Input tensors should have the same data type!");
CheckNTErrors(x.order >= 2 && w.order >= 2, "Input tensors must have a order >= 2!");
int xn = x.dimSizeRDI[1];
int xm = x.dimSizeRDI[0];
int wn = w.dimSizeRDI[1];
int wm = w.dimSizeRDI[0];
CheckNTErrors(xm == wn, "Unmatched tensors in multiplication!");
int order = x.order + w.order - 2;
int sub = 0;
int * dimSize = new int[order];
for (int i = 2; i < x.order; i++)
dimSize[sub++] = x.dimSizeRDI[x.order + 1 - i];
for (int i = 2; i < w.order; i++)
dimSize[sub++] = w.dimSizeRDI[w.order + 1 - i];
dimSize[sub++] = xn;
dimSize[sub++] = wm;
float dr = (!x.isSparse || !w.isSparse) ? 1.0F : MAX(x.denseRatio, w.denseRatio);
XTensor * tmp = NewTensorBuf(order, dimSize, x.dataType, dr, x.devID, x.mem);
/* call _MatrixMul function */
_MatrixMul(&x, X_NOTRANS, &w, X_NOTRANS, tmp, alpha, 0, parallelRunner);
XTensor c(tmp);
c.SetTMPFlag();
int n = GetSumIndex(tmp, b);
if (n == -1) {
/* call _Sum function */
_Sum(tmp, &b, &c);
// TODO!!
ShowNTErrors("TODO!");
}
else if (n >= 0 && n < tmp->order) {
/* call _SumDim function */
_SumDim(tmp, &b, &c, n);
}
else {
ShowNTErrors("Something is wrong!");
}
/* tensor connections */
XLink::MakeLink(&x, &w, &b, &c, MATH_MULANDSHIFT);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHeadTrans(&c, X_NOTRANS);
XLink::AddParamToHeadTrans(&c, X_NOTRANS);
//XLink::AddParamToHead(&c, beta);
/* destroy variables */
delete[] dimSize;
DelTensorBuf(tmp);
return c;
}
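/* a minimal usage sketch of MulAndShift as a fused linear layer (names and
   shapes are illustrative): x: (batch, inDim), w: (inDim, outDim), b: (outDim) */
XTensor LinearLayerSketch(const XTensor &x, const XTensor &w, const XTensor &b)
{
    return MulAndShift(x, w, b);  /* y = x * w + b */
}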
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: JIANG Yufan (email: jiangyufan2018@outlook.com) 2019-02-27
*/
#ifndef __MULANDSHIFT_H__
#define __MULANDSHIFT_H__
#include "../../XTensor.h"
#include "../CHeader.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
} // namespace nts(NiuTrans.Tensor)
#endif // __MULANDSHIFT_H__
...@@ -215,4 +215,55 @@ XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim ...@@ -215,4 +215,55 @@ XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim
return c; return c;
} }
/*
element-wise product of two tensors
c(i) = a(i)*b(i) + \alpha * c(i)
where i is the index of the item
>> a - tensor a
>> b - tensor b
>> c - result tensor
>> alpha - the coefficient
>> leadingDim - the dimension along which we perform broadcasting
>> requireLink - whether to add the operation to the computation network
*/
void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadingDim, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
int n = GetMultiplyDimIndex(a, b);
if (n == -1) {
CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
/* call _Multiply function */
_Multiply(&a, &b, &c, 0, leadingDim);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLY);
XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHeadInt(&c, leadingDim);
}
}
else if (n >= 0 && n < a.order) {
/* call _MultiplyDim function */
_MultiplyDim(&a, &b, &c, n, alpha);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
}
}
else {
ShowNTErrors("Something is wrong!");
}
}
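/* because GetMultiplyDimIndex is consulted first, one call covers both the
   element-wise and the broadcast case; a sketch (shapes illustrative): */
void MultiplySketch(const XTensor &a, const XTensor &gate)
{
    XTensor c;
    /* a: (batch, dim), gate: (dim) -> dispatched to _MultiplyDim internally */
    Multiply(a, gate, c);
}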
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
...@@ -43,15 +43,6 @@ void KernelMulElementWise(DTYPE * a, DTYPE * b, DTYPE * c, int size) ...@@ -43,15 +43,6 @@ void KernelMulElementWise(DTYPE * a, DTYPE * b, DTYPE * c, int size)
c[i] = a[i] * b[i]; c[i] = a[i] * b[i];
} }
__global__
void KernelMulElementWiseHalf(__half * a, __half * b, __half * c, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
c[i] = a[i] * b[i];
}
/* /*
multiplication of data arrays in a element-wise manner c(i) = a(i)*b(i) + \alpha*c(i) multiplication of data arrays in a element-wise manner c(i) = a(i)*b(i) + \alpha*c(i)
>> a - data array a >> a - data array a
...@@ -69,18 +60,6 @@ void KernelMulElementWiseV2(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE alp ...@@ -69,18 +60,6 @@ void KernelMulElementWiseV2(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE alp
c[i] = a[i] * b[i] + alpha * c[i]; c[i] = a[i] * b[i] + alpha * c[i];
} }
__global__
void KernelMulElementWiseV2Half(__half * a, __half * b, __half * c, int size, DTYPE alpha)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
__half alpha1 = __float2half(alpha);
if (i < size)
c[i] = a[i] * b[i] + alpha1 * c[i];
#endif
}
/* /*
multiplication of two tensors in a element-wise manner c(i) = a(i)*b(i). multiplication of two tensors in a element-wise manner c(i) = a(i)*b(i).
Note that a and b can be of different sizes here, i.e., Note that a and b can be of different sizes here, i.e.,
...@@ -201,25 +180,6 @@ void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alph ...@@ -201,25 +180,6 @@ void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alph
} }
} }
} }
else if (a->dataType == X_FLOAT16 && b->dataType == X_FLOAT16) {
int cudaGridSize[3];
int cudaBlockSize[3];
if (a->unitNum == c->unitNum && b->unitNum == c->unitNum) {
GDevs.GetCudaThread(a->devID, c->unitNum, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[0]), threads(cudaBlockSize[0]);
if (alpha == 0)
KernelMulElementWiseHalf << <blocks, threads >> >((__half *)a->data, (half *)b->data, (half *)c->data, c->unitNum);
else
KernelMulElementWiseV2Half << <blocks, threads >> >((__half*)a->data, (__half*)b->data, (__half*)c->data, c->unitNum, alpha);
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
else { else {
// TODO!! // TODO!!
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
......
...@@ -49,6 +49,13 @@ where i is the index of the element ...@@ -49,6 +49,13 @@ where i is the index of the element
*/ */
XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha = 0.0, int leadingDim = 0); XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha = 0.0, int leadingDim = 0);
/*
element-wise product of two tensors:
c(i) = a(i)*b(i) + \alpha * c(i)
where i is the index of the element
*/
void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha = 0.0, int leadingDim = 0, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __MULTIPLY_H__ #endif // __MULTIPLY_H__
\ No newline at end of file
...@@ -162,6 +162,36 @@ XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n) ...@@ -162,6 +162,36 @@ XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n)
return c; return c;
} }
/*
tensor multiplication
c = a * b
where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - the result tensor
>> n - the dimension index
>> requireLink - whether to add the operation to the computation network
*/
void MultiplyDim(const XTensor &a, const XTensor &b, XTensor &c, int n, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
/* call _MultiplyDim function */
_MultiplyDim(&a, &b, &c, n, 0);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, 0);
}
}
/* /*
tensor broadcast multiplication tensor broadcast multiplication
c = a * b + c * \beta c = a * b + c * \beta
...@@ -302,4 +332,30 @@ XTensor MultiplyBroadcast(const XTensor &a, const XTensor &b) ...@@ -302,4 +332,30 @@ XTensor MultiplyBroadcast(const XTensor &a, const XTensor &b)
return c; return c;
} }
/*
tensor broadcast multiplication
c = a * b
where some of dimensions of b can be of size 1
>> a - a tensor
>> b - another tensor that would be broadcasted
>> c - the resulting tensor
>> requireLink - whether to add the operation to the computation network
*/
void MultiplyBroadcast(const XTensor &a, const XTensor &b, XTensor &c, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
/* call _MultiplyBroadcast function */
_MultiplyBroadcast(&a, &b, &c, 0);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYBROADCAST);
XLink::AddParamToHead(&c, 0);
}
}
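/* MultiplyBroadcast, unlike MultiplyDim, lets b keep size-1 dimensions; a
   sketch (shapes illustrative): a of shape (8, 32, 512) scaled by a tensor of
   shape (8, 1, 512) */
void MultiplyBroadcastSketch(const XTensor &a, const XTensor &scale)
{
    XTensor c;
    MultiplyBroadcast(a, scale, c);
}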
} }
...@@ -22,10 +22,6 @@ ...@@ -22,10 +22,6 @@
#include "../../XDevice.h" #include "../../XDevice.h"
#include "../../XUtility.h" #include "../../XUtility.h"
#include "MultiplyDim.cuh" #include "MultiplyDim.cuh"
#include "../getandset/ConvertDataType.h"
#include "../arithmetic/XTensorBLAS.h"
#include "../math/ScaleAndShift.h"
#include "cuda_fp16.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -65,25 +61,6 @@ void KernelMultiplyWithRow(T * a, T * b, T * c, int rowNum, int colNum, T alpha) ...@@ -65,25 +61,6 @@ void KernelMultiplyWithRow(T * a, T * b, T * c, int rowNum, int colNum, T alpha)
c[offset] = a[offset] * bv[threadIdx.x]; c[offset] = a[offset] * bv[threadIdx.x];
} }
__global__
void KernelMultiplyWithRowHalf(__half * a, __half * b, __half * c, int rowNum, int colNum)
{
__shared__ __half bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int col = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
if (col >= colNum || row >= rowNum)
return;
if (threadIdx.y == 0)
bv[threadIdx.x] = b[col];
__syncthreads();
int offset = colNum * row + col;
c[offset] = a[offset] * bv[threadIdx.x];
}
/* /*
tensor multiplication of a tensor and a colum vector tensor multiplication of a tensor and a colum vector
c = a * b + \alpha * c c = a * b + \alpha * c
...@@ -125,30 +102,6 @@ void KernelMultiplyWithCol(T * a, T * b, T * c, int rowNum, int colNum, int bloc ...@@ -125,30 +102,6 @@ void KernelMultiplyWithCol(T * a, T * b, T * c, int rowNum, int colNum, int bloc
c[offset] = a[offset] * bv[threadIdx.y]; c[offset] = a[offset] * bv[threadIdx.y];
} }
__global__
void KernelMultiplyWithColHalf(__half * a, __half * b, __half * c, int rowNum, int colNum, int blockSize, int blockNum)
{
__shared__ __half bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int col = colIndex % colNum;
int block = colIndex / colNum;
if (row >= rowNum || block >= blockNum)
return;
if (threadIdx.x == 0)
bv[threadIdx.y] = b[row];
__syncthreads();
int offset = block * blockSize + row * colNum + col;
c[offset] = a[offset] * bv[threadIdx.y];
}
/* /*
tensor multiplication tensor multiplication
...@@ -182,13 +135,14 @@ void _CudaMultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, ...@@ -182,13 +135,14 @@ void _CudaMultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n,
else if (i < n) else if (i < n)
blockNum *= a->dimSize[i]; blockNum *= a->dimSize[i];
} }
int cudaGrids[3]; int cudaGrids[3];
int cudaBlocks[3]; int cudaBlocks[3];
int devIDBackup = 0; int devIDBackup = 0;
ProtectCudaDev(a->devID, devIDBackup); ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) { if (a->dataType == DEFAULT_DTYPE) {
if (stride > 1) { if (stride > 1) {
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks); GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
if(alpha == (DTYPE)0.0F) if(alpha == (DTYPE)0.0F)
...@@ -202,48 +156,23 @@ void _CudaMultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, ...@@ -202,48 +156,23 @@ void _CudaMultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n,
} }
else if (stride == 1) { else if (stride == 1) {
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks); GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
if (alpha == (DTYPE)0.0F) if(alpha == (DTYPE)0.0F)
KernelMultiplyWithRow<DTYPE, false> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> > KernelMultiplyWithRow<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, alpha); blockNum, blockSize, alpha);
else else
KernelMultiplyWithRow<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>> KernelMultiplyWithRow<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, alpha); blockNum, blockSize, alpha);
} }
else {
ShowNTErrors("Something is wrong!");
}
}
else if (a->dataType == X_FLOAT16) {
if (stride > 1) {
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
KernelMultiplyWithColHalf<< <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((__half *)a->data, (__half *)b->data, (__half *)c->data,
blockSize, stride, blockSize * stride, blockNum);
}
else if (stride == 1) {
/*__half alpha1 = float2half_rn(alpha);*/
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
KernelMultiplyWithRowHalf<< <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((__half *)a->data, (__half *)b->data, (__half *)c->data,
blockNum, blockSize);
}
else { else {
ShowNTErrors("Something is wrong!"); ShowNTErrors("Something is wrong!");
} }
} }
else { else {
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
} }
BacktoCudaDev(a->devID, devIDBackup); BacktoCudaDev(a->devID, devIDBackup);
} }
......
...@@ -38,6 +38,10 @@ void _MultiplyDimMe(XTensor * a, const XTensor * b, int n, DTYPE alpha = 0.0); ...@@ -38,6 +38,10 @@ void _MultiplyDimMe(XTensor * a, const XTensor * b, int n, DTYPE alpha = 0.0);
i.e., a is multiplied with b by broadcasting. We make a new tensor c to keep the result and return it */ i.e., a is multiplied with b by broadcasting. We make a new tensor c to keep the result and return it */
XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n); XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n);
/* tensor multiplication c = a * b where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting */
void MultiplyDim(const XTensor &a, const XTensor &b, XTensor &c, int n, bool requireLink = false);
/* tensor multiplication summation c = a * b + c * \beta where some of dimensions of b can be of size 1 */ /* tensor multiplication summation c = a * b + c * \beta where some of dimensions of b can be of size 1 */
void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0); void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
...@@ -45,6 +49,9 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE ...@@ -45,6 +49,9 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
we return the resulting tensor here */ we return the resulting tensor here */
XTensor MultiplyBroadcast(const XTensor &a, const XTensor &b); XTensor MultiplyBroadcast(const XTensor &a, const XTensor &b);
/* tensor multiplication summation c = a * b + c * \beta where some of dimensions of b can be of size 1 */
void MultiplyBroadcast(const XTensor &a, const XTensor &b, XTensor &c, bool requireLink = false);
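/* a minimal usage sketch for the two in-place overloads above (illustrative only;
   x, row and y are made-up names, and InitTensor2D/InitTensor1D are assumed from XTensor.h):
       XTensor x, row, y;
       InitTensor2D(&x, 4, 8, X_FLOAT);
       InitTensor1D(&row, 8, X_FLOAT);
       MultiplyDim(x, row, y, 1);           // y[i][j] = x[i][j] * row[j]
       MultiplyDim(x, row, y, 1, true);     // same, with the op linked into the network
   MultiplyBroadcast behaves likewise but expands every size-1 dimension of b
   rather than one named dimension n. */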
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __MULTIPLYDIM_H__ #endif // __MULTIPLYDIM_H__
...@@ -79,4 +79,25 @@ XTensor Negate(const XTensor & a) ...@@ -79,4 +79,25 @@ XTensor Negate(const XTensor & a)
return b; return b;
} }
/*
set every entry to its minus value
>> a - input tensor we are processing
>> b - output tensor we are processing
>> requireLink - whether to add the operation to the computation network
*/
void Negate(const XTensor & a, XTensor & b, bool requireLink)
{
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {
InitTensor(&b, &a);
}
/* call _Negate function */
_Negate(&a, &b);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_NEGATE);
}
}
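/* a minimal usage sketch of the overload above (illustrative; x and y are made-up
   names and InitTensor1D is assumed from XTensor.h):
       XTensor x, y;
       InitTensor1D(&x, 8, X_FLOAT);
       Negate(x, y);          // y is initialized to x's shape on first use, then y = -x
       Negate(x, y, true);    // same, but the op is linked into the network
   note that y is re-initialized only when it is uninitialized or shaped differently from x. */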
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
...@@ -41,6 +41,9 @@ make a new tensor to keep the result and return it ...@@ -41,6 +41,9 @@ make a new tensor to keep the result and return it
*/ */
XTensor Negate(const XTensor & a); XTensor Negate(const XTensor & a);
/* set every entry to its minus value */
void Negate(const XTensor & a, XTensor & b, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __NEGATE_H__ #endif // __NEGATE_H__
...@@ -84,4 +84,25 @@ XTensor Sign(const XTensor & a) ...@@ -84,4 +84,25 @@ XTensor Sign(const XTensor & a)
return b; return b;
} }
/*
set every entry to its sign value
>> a - input tensor we are processing
>> b - output tensor we are processing
>> requireLink - whether to add the operation to the computation network
*/
void Sign(const XTensor & a, XTensor & b, bool requireLink)
{
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {
InitTensor(&b, &a);
}
/* call _Sign function */
_Sign(&a, &b);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_SIGN);
}
}
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
...@@ -23,7 +23,6 @@ ...@@ -23,7 +23,6 @@
#include "../../XTensor.h" #include "../../XTensor.h"
#include "Sign.h" #include "Sign.h"
#include "Sign.cuh" #include "Sign.cuh"
#include "cuda_fp16.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -57,25 +56,9 @@ This is for float16 computation ...@@ -57,25 +56,9 @@ This is for float16 computation
>> size - size of the data array >> size - size of the data array
*/ */
__global__ __global__
void KernelSignHalf(__half * a, __half * b, int size) void KernelSign(__half * a, __half * b, int size)
{ {
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) return;
__half zero = __float2half(0.0F);
__half one = __float2half(1.0F);
__half one_1 = __float2half(-1.0F);
int i = blockDim.x * blockIdx.x + threadIdx.x;
DTYPE flag = __half2float(a[i]);
if (i < size) {
if (flag > 0)
b[i] = one;
else if (flag < 0)
b[i] = one_1;
else
b[i] = zero;
}
#endif
} }
/* /*
...@@ -103,7 +86,7 @@ void _CudaSign(const XTensor * a, XTensor * b) ...@@ -103,7 +86,7 @@ void _CudaSign(const XTensor * a, XTensor * b)
KernelSign << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum); KernelSign << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum);
} }
else if (a->dataType == X_FLOAT16) { else if (a->dataType == X_FLOAT16) {
KernelSignHalf << <blocks, threads >> >((__half*)a->data, (__half*)b->data, a->unitNum); KernelSign << <blocks, threads >> >((__half*)a->data, (__half*)b->data, a->unitNum);
} }
else { else {
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
......
...@@ -41,6 +41,9 @@ make a new tensor to keep the result and return it ...@@ -41,6 +41,9 @@ make a new tensor to keep the result and return it
*/ */
XTensor Sign(const XTensor & a); XTensor Sign(const XTensor & a);
/* set every entry to its sign value */
void Sign(const XTensor & a, XTensor & b, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __SIGN_H__ #endif // __SIGN_H__
...@@ -194,4 +194,47 @@ XTensor Sub(const XTensor &a, const XTensor &b, DTYPE beta) ...@@ -194,4 +194,47 @@ XTensor Sub(const XTensor &a, const XTensor &b, DTYPE beta)
return c; return c;
} }
/*
tensor subtraction c = a - b * \beta
>> a - a tensor
>> b - another tensor
>> c - where we put a-b*\beta
>> beta - the scaling factor
>> requireLink - whether to add the operation to the computation network
*/
void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
int n = GetSubDimIndex(a, b);
if (n == -1) {
/* call _Sub function */
_Sub(&a, &b, &c, beta);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUB);
XLink::AddParamToHead(&c, beta);
}
}
else if (n >= 0 && n < a.order) {
/* call _SubDim function */
_SubDim(&a, &b, &c, n, beta);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
}
}
else {
ShowNTErrors("Something is wrong!");
}
}
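/* an illustrative sketch of how the overload above dispatches (a, row and c are
   made-up names; InitTensor2D/InitTensor1D are assumed from XTensor.h):
       XTensor a, row, c;
       InitTensor2D(&a, 4, 8, X_FLOAT);
       InitTensor1D(&row, 8, X_FLOAT);
       Sub(a, row, c);        // row matches dimension 1 of a -> broadcast via _SubDim(&a, &row, &c, 1)
       Sub(a, a, c, 0.5F);    // same shape -> plain _Sub, c = a - a * 0.5
   GetSubDimIndex returns -1 for same-shaped inputs and the matched dimension index otherwise. */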
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
...@@ -22,8 +22,6 @@ ...@@ -22,8 +22,6 @@
#include "../../XDevice.h" #include "../../XDevice.h"
#include "../../XUtility.h" #include "../../XUtility.h"
#include "Sub.cuh" #include "Sub.cuh"
#include "cuda_fp16.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -48,30 +46,6 @@ void KernelSUB(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta) ...@@ -48,30 +46,6 @@ void KernelSUB(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta)
} }
/* /*
subtraction of data arrays (CUDA Kernel) Half Precision
c = a - b * \beta
>> a - A matrix
>> b - another matrix
>> c - where we put a-b
>> size - the size of a/b/c
>> beta - the coefficient
*/
__global__
void KernelSUBHalf(half * a, half * b, half * c, int size, DTYPE beta)
{
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
__half beta1 = __float2half(beta);
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
c[i] = a[i] - b[i] * beta1;
#endif
}
/*
tensor subtraction c = a - b * \beta (cuda version) tensor subtraction c = a - b * \beta (cuda version)
>> a - a tensor >> a - a tensor
>> b - another tensor >> b - another tensor
...@@ -105,22 +79,10 @@ void _CudaSub(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta) ...@@ -105,22 +79,10 @@ void _CudaSub(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
dim3 threads(blockSize[0]); dim3 threads(blockSize[0]);
KernelSUB << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, a->unitNum, beta); KernelSUB << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, a->unitNum, beta);
} }
else if(a->dataType == X_FLOAT16 && else {
b->dataType == X_FLOAT16 && // TODO!!
c->dataType == X_FLOAT16){ ShowNTErrors("TODO!");
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
KernelSUBHalf << <blocks, threads >> >((__half*)a->data, (__half*)b->data, (__half*)c->data, a->unitNum, beta);
} }
else {
// TODO!!
ShowNTErrors("TODO!");
}
} }
else { else {
// TODO!! // TODO!!
......
...@@ -42,6 +42,9 @@ make a new tensor c to keep the result and return it ...@@ -42,6 +42,9 @@ make a new tensor c to keep the result and return it
*/ */
XTensor Sub(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0); XTensor Sub(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0);
/* tensor subtraction c = a - b * \beta */
void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta = (DTYPE)1.0, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __SUB_H__ #endif // __SUB_H__
...@@ -163,4 +163,35 @@ XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta) ...@@ -163,4 +163,35 @@ XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
return c; return c;
} }
/*
tensor subtraction
c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a-b*\beta
>> n - the dimension index
>> beta - the scaling factor
>> requireLink - whether to add the operation to the computation network
*/
void SubDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
/* call _Sub function */
_SubDim(&a, &b, &c, n, beta);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
}
}
} }
...@@ -21,9 +21,6 @@ ...@@ -21,9 +21,6 @@
#include "SubDim.cuh" #include "SubDim.cuh"
#include "../../XDevice.h" #include "../../XDevice.h"
#include "cuda_fp16.h"
#include "device_launch_parameters.h"
#include "../../XDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -40,10 +37,11 @@ where a is a tensor and b is a row vector ...@@ -40,10 +37,11 @@ where a is a tensor and b is a row vector
>> colNum - number of columns of a and c (i.e., the size of b) >> colNum - number of columns of a and c (i.e., the size of b)
>> beta - the scaling factor >> beta - the scaling factor
*/ */
template <class T, bool betaFired>
__global__ __global__
void KernelSubWithRow(DTYPE * a, DTYPE * b, DTYPE * c, int rowNum, int colNum, DTYPE beta,bool betaFired) void KernelSubWithRow(T * a, T * b, T * c, int rowNum, int colNum, T beta)
{ {
__shared__ DTYPE bv[MAX_CUDA_THREAD_NUM_PER_BLOCK]; __shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int col = blockDim.x * blockIdx.x + threadIdx.x; int col = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y; int row = blockDim.y * blockIdx.y + threadIdx.y;
...@@ -62,59 +60,6 @@ void KernelSubWithRow(DTYPE * a, DTYPE * b, DTYPE * c, int rowNum, int colNum, D ...@@ -62,59 +60,6 @@ void KernelSubWithRow(DTYPE * a, DTYPE * b, DTYPE * c, int rowNum, int colNum, D
c[offset] = a[offset] - bv[threadIdx.x]; c[offset] = a[offset] - bv[threadIdx.x];
} }
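/* note on the template parameters above: betaFired is a compile-time constant, so the
   per-element branch disappears from the generated kernel; the host side picks the
   instantiation once per call (see _CudaSubDim below), along the lines of
       if (beta == (DTYPE)1.0F)
           KernelSubWithRow<DTYPE, false> <<<blocks, threads>>> (a, b, c, rowNum, colNum, beta);
       else
           KernelSubWithRow<DTYPE, true> <<<blocks, threads>>> (a, b, c, rowNum, colNum, beta);
   (blocks/threads and the pointer names here are illustrative only). */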
__global__
void KernelSubWithRowHalf(half * a, half * b, half * c, int rowNum, int colNum, half beta, bool betaFired)
{
__shared__ half bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int col = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
if (col >= colNum || row >= rowNum)
return;
if (threadIdx.y == 0)
bv[threadIdx.x] = b[col];
__syncthreads();
int offset = colNum * row + col;
if (betaFired)
c[offset] = a[offset] - bv[threadIdx.x] * beta;
else
c[offset] = a[offset] - bv[threadIdx.x];
}
//template <class T, bool betaFired>
//__global__
//void KernelSubWithRow(T * a, T * b, T * c, int rowNum, int colNum, DTYPE beta)
//{
// __shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
// int col = blockDim.x * blockIdx.x + threadIdx.x;
// int row = blockDim.y * blockIdx.y + threadIdx.y;
//
// if (col >= colNum || row >= rowNum)
// return;
//
// if (threadIdx.y == 0)
// bv[threadIdx.x] = b[col];
//
// __syncthreads();
//
// T beta1;
// if (sizeof(T) - sizeof(half) == 0) {
// beta1 =__float2half(beta);
// }
// else {
// beta1 = beta;
// }
//
// int offset = colNum * row + col;
// if (betaFired)
// c[offset] = a[offset] - bv[threadIdx.x] * beta1;
// else
// c[offset] = a[offset] - bv[threadIdx.x];
//}
/* /*
tensor subtraction of a tensor and a column vector tensor subtraction of a tensor and a column vector
c = a - b * \beta c = a - b * \beta
...@@ -128,11 +73,11 @@ where a is a tensor and b is a colum vector ...@@ -128,11 +73,11 @@ where a is a tensor and b is a colum vector
>> blockNum - number of matrices >> blockNum - number of matrices
>> beta - the scaling factor >> beta - the scaling factor
*/ */
template <class T, bool betaFired>
__global__ __global__
void KernelSubWithCol(DTYPE * a, DTYPE * b, DTYPE * c, int rowNum, int colNum, int blockSize, int blockNum, DTYPE beta,bool betaFired) void KernelSubWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, T beta)
{ {
__shared__ DTYPE bv[MAX_CUDA_THREAD_NUM_PER_BLOCK]; __shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int colIndex = blockDim.x * blockIdx.x + threadIdx.x; int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y; int row = blockDim.y * blockIdx.y + threadIdx.y;
...@@ -156,71 +101,6 @@ void KernelSubWithCol(DTYPE * a, DTYPE * b, DTYPE * c, int rowNum, int colNum, i ...@@ -156,71 +101,6 @@ void KernelSubWithCol(DTYPE * a, DTYPE * b, DTYPE * c, int rowNum, int colNum, i
c[offset] = a[offset] - bv[threadIdx.y]; c[offset] = a[offset] - bv[threadIdx.y];
} }
__global__
void KernelSubWithColHalf(half * a, half * b, half * c, int rowNum, int colNum, int blockSize, int blockNum, half beta, bool betaFired)
{
__shared__ half bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int col = colIndex % colNum;
int block = colIndex / colNum;
if (row >= rowNum || block >= blockNum)
return;
if (threadIdx.x == 0)
bv[threadIdx.y] = b[row];
__syncthreads();
int offset = block * blockSize + row * colNum + col;
if (betaFired)
c[offset] = a[offset] - bv[threadIdx.y] * beta;
else
c[offset] = a[offset] - bv[threadIdx.y];
}
//
//template <class T, bool betaFired>
//__global__
// void KernelSubWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, DTYPE beta)
//{
// __shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
//
// int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
// int row = blockDim.y * blockIdx.y + threadIdx.y;
//
// int col = colIndex % colNum;
// int block = colIndex / colNum;
//
// if (row >= rowNum || block >= blockNum)
// return;
//
// if (threadIdx.x == 0)
// bv[threadIdx.y] = b[row];
//
// __syncthreads();
//
// T beta1;
//
// if (sizeof(T) - sizeof(half) == 0) {
// beta1 = __float2half(beta);
// }
// else {
// beta1 = beta;
// }
//
// int offset = block * blockSize + row * colNum + col;
//
// if (betaFired)
// c[offset] = a[offset] - bv[threadIdx.y] * beta1;
// else
// c[offset] = a[offset] - bv[threadIdx.y];
//}
/* /*
tensor subtraction (cuda version) tensor subtraction (cuda version)
...@@ -265,72 +145,28 @@ void _CudaSubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE ...@@ -265,72 +145,28 @@ void _CudaSubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE
if (stride > 1) { if (stride > 1) {
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks); GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
if (beta == (DTYPE)1.0F) if (beta == (DTYPE)1.0F)
KernelSubWithCol <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>> KernelSubWithCol<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta,false); blockSize, stride, blockSize * stride, blockNum, beta);
else else
KernelSubWithCol <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>> KernelSubWithCol<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta,true); blockSize, stride, blockSize * stride, blockNum, beta);
} }
else if (stride == 1) { else if (stride == 1) {
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks); GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
if (beta == (DTYPE)1.0F) if (beta == (DTYPE)1.0F)
KernelSubWithRow <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> > KernelSubWithRow<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, beta,false); blockNum, blockSize, beta);
else else
KernelSubWithRow<<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> > KernelSubWithRow<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, beta,true); blockNum, blockSize, beta);
}
else {
ShowNTErrors("Something is wrong!");
}
}
else if (a->dataType == X_FLOAT16) {
if (stride > 1) {
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
if (beta == (DTYPE)1.0F){
unsigned short temp = FloatToFloat16(beta);
half beta1 = *((half *)&temp);
KernelSubWithColHalf << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((half*)a->data, (half*)b->data, (half*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta1, false);
}
else {
unsigned short temp = FloatToFloat16(beta);
half beta1 = *((half *)&temp);
KernelSubWithColHalf << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((half*)a->data, (half*)b->data, (half*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta1, true);
}
}
else if (stride == 1) {
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
if (beta == (DTYPE)1.0F) {
unsigned short temp = FloatToFloat16(beta);
half beta1 = *((half *)&temp);
KernelSubWithRowHalf << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((half*)a->data, (half*)b->data, (half*)c->data,
blockNum, blockSize, beta1, false);
}
else{
unsigned short temp = FloatToFloat16(beta);
half beta1 = *((half *)&temp);
KernelSubWithRowHalf << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((half*)a->data, (half*)b->data, (half*)c->data,
blockNum, blockSize, beta1, true);
}
} }
else { else {
ShowNTErrors("Something is wrong!"); ShowNTErrors("Something is wrong!");
} }
} }
else { else {
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
......
...@@ -38,6 +38,10 @@ void _SubDim(XTensor * a, const XTensor * b, int n, DTYPE beta = (DTYPE)1.0); ...@@ -38,6 +38,10 @@ void _SubDim(XTensor * a, const XTensor * b, int n, DTYPE beta = (DTYPE)1.0);
i.e., a is subtracted with b by broadcasting. We make a new tensor c to keep the result and return it */ i.e., a is subtracted with b by broadcasting. We make a new tensor c to keep the result and return it */
XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta = (DTYPE)1.0); XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta = (DTYPE)1.0);
/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting */
void SubDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta = (DTYPE)1.0, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __SUBDIM_H__ #endif // __SUBDIM_H__
...@@ -21,7 +21,6 @@ ...@@ -21,7 +21,6 @@
#include "../../XTensor.h" #include "../../XTensor.h"
#include "../../XName.h" #include "../../XName.h"
#include "../getandset/ConvertDataType.h"
#include "../../XUtility.h" #include "../../XUtility.h"
#include "../movement/CopyValues.h" #include "../movement/CopyValues.h"
#include "Sum.h" #include "Sum.h"
...@@ -38,58 +37,15 @@ tensor summation c = a + b * \beta ...@@ -38,58 +37,15 @@ tensor summation c = a + b * \beta
>> c - where we put a+b*\beta. we save it in a if c is NULL >> c - where we put a+b*\beta. we save it in a if c is NULL
>> beta - the scaling factor >> beta - the scaling factor
*/ */
void _MySum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == b->unitNum && a->unitNum == c->unitNum,
"Unmatched tensors in addition!");
if (beta == 0) {
_CopyValues(a, c);
return;
}
XTensor b1(b->order, b->dimSize, a->dataType, b->denseRatio, b->devID, b->mem);
b1.SetTMPFlag();
_ConvertDataType(b, &b1);
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
#ifdef USE_CUDA
if (a == c) {
int P2PAccesible = 0;
#ifdef CUDA_UVA
cudaDeviceCanAccessPeer(&P2PAccesible, a->devID, b->devID);
#endif
if ((a->devID < 0 && b->devID >= 0) ||
(a->devID >= 0 && b->devID < 0) ||
(a->devID >= 0 && b->devID >= 0 && a->devID != b->devID && !P2PAccesible))
{
ShowNTErrors("Cannot run this method on multiple devices simultaneously!");
}
else
_CudaSum(a, &b1, c, beta);
}
else
_CudaSum(a, &b1, c, beta);
#endif
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta) void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{ {
CheckNTErrors(a && b && c, "Empty tensor input!"); CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == b->unitNum && a->unitNum == c->unitNum, CheckNTErrors(a->unitNum == b->unitNum && a->unitNum == c->unitNum,
"Unmatched tensors in addition!"); "Unmatched tensors in addition!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched tensors in addition!");
if (beta == 0) { if(beta == 0){
_CopyValues(a, c); _CopyValues(a, c);
return; return;
} }
...@@ -118,7 +74,7 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta) ...@@ -118,7 +74,7 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
else { else {
if (!a->isSparse && !b->isSparse) { if (!a->isSparse && !b->isSparse) {
CheckNTErrors(!c->isSparse, "Illegal use of sparse tensor in addition!"); CheckNTErrors(!c->isSparse, "Illegal use of sparse tensor in addition!");
if (a->dataType == DEFAULT_DTYPE && if (a->dataType == DEFAULT_DTYPE &&
b->dataType == DEFAULT_DTYPE && b->dataType == DEFAULT_DTYPE &&
c->dataType == DEFAULT_DTYPE) c->dataType == DEFAULT_DTYPE)
...@@ -126,7 +82,7 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta) ...@@ -126,7 +82,7 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
DTYPE * ap = (DTYPE*)a->data; DTYPE * ap = (DTYPE*)a->data;
DTYPE * bp = (DTYPE*)b->data; DTYPE * bp = (DTYPE*)b->data;
DTYPE * cp = (DTYPE*)c->data; DTYPE * cp = (DTYPE*)c->data;
/* unrolling */ /* unrolling */
int num = a->unitNum; int num = a->unitNum;
if (num % 4 == 0) { if (num % 4 == 0) {
...@@ -243,4 +199,46 @@ XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta) ...@@ -243,4 +199,46 @@ XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta)
return c; return c;
} }
/*
tensor summation c = a + b * \beta
>> a - a tensor
>> b - another tensor
>> c - where we put a+b*\beta
>> beta - the scaling factor
>> requireLink - whether to add the operation to the computation network
*/
void Sum(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
int n = GetSumDimIndex(a, b);
if (n == -1) {
/* call _Sum function */
_Sum(&a, &b, &c, beta);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUM);
XLink::AddParamToHead(&c, beta);
}
}
else if (n >= 0 && n < a.order) {
/* call _SumDim function */
_SumDim(&a, &b, &c, n, beta);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUMDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
}
}
else {
ShowNTErrors("Something is wrong!");
}
}
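/* worked numbers for the overload above (illustrative): with a = {1, 2}, b = {10, 20}
   and beta = 0.5, the result is c = {1 + 10 * 0.5, 2 + 20 * 0.5} = {6, 12}; as with Sub,
   the call routes to _SumDim instead of _Sum when b matches exactly one dimension of a. */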
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
...@@ -23,7 +23,6 @@ ...@@ -23,7 +23,6 @@
#include "../../XUtility.h" #include "../../XUtility.h"
#include "Sum.cuh" #include "Sum.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
...@@ -46,31 +45,6 @@ void KernelADD(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta) ...@@ -46,31 +45,6 @@ void KernelADD(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta)
c[i] = a[i] + b[i] * beta; c[i] = a[i] + b[i] * beta;
} }
__global__
void KernelADDHalf(__half * a, __half * b, __half * c, int size, DTYPE beta)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
__half beta1 = __float2half(beta);
if (i < size)
c[i] = a[i] + b[i] * beta1;
#endif
}
__global__
void KernelADDInt(int * a, int * b, int * c, int size, DTYPE beta)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
c[i] = a[i] + b[i] * (int)beta;
}
/* /*
tensor summation c = a + b * \beta (cuda version) tensor summation c = a + b * \beta (cuda version)
>> a - a tensor >> a - a tensor
...@@ -126,36 +100,6 @@ void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta) ...@@ -126,36 +100,6 @@ void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
KernelADD << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, a->unitNum, beta); KernelADD << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, a->unitNum, beta);
} }
} }
else if (a->dataType == X_FLOAT16 &&
b->dataType == X_FLOAT16 &&
c->dataType == X_FLOAT16)
{
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
//KernelADD << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, a->unitNum, beta);
KernelADDHalf << <blocks, threads >> >((__half *)a->data, (__half *)b->data, (__half *)c->data, a->unitNum, beta);
}
else if (a->dataType == X_INT &&
b->dataType == X_INT &&
c->dataType == X_INT)
{
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
//KernelADD << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, a->unitNum, beta);
KernelADDInt << <blocks, threads >> >((int *)a->data, (int *)b->data, (int *)c->data, a->unitNum, beta);
}
else { else {
// TODO!! // TODO!!
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
......
...@@ -27,8 +27,6 @@ ...@@ -27,8 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* tensor summation c = a + b * \beta */ /* tensor summation c = a + b * \beta */
void _MySum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0); void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
/* /*
...@@ -43,6 +41,9 @@ make a new tensor c to keep the result and return it ...@@ -43,6 +41,9 @@ make a new tensor c to keep the result and return it
*/ */
XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0); XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0);
/* tensor summation c = a + b * \beta */
void Sum(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta = (DTYPE)1.0, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __SUM_H__ #endif // __SUM_H__
...@@ -64,6 +64,20 @@ void _SumDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE bet ...@@ -64,6 +64,20 @@ void _SumDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE bet
return; return;
} }
/*int dims[MAX_TENSOR_DIM_NUM];
for(int i = 0; i < a->order; i++)
dims[i] = 1;
dims[n] = a->GetDim(n);
XTensor * b2 = NewTensor(a->order, dims, b->dataType, b->denseRatio, b->devID, b->mem);
_CopyValues(b, b2);
_SumBroadcast(a, b2, c, beta);
DelTensor(b2);
return;*/
if(a->devID >= 0 || b->devID >= 0 || c->devID >= 0){ if(a->devID >= 0 || b->devID >= 0 || c->devID >= 0){
#ifdef USE_CUDA #ifdef USE_CUDA
_CudaSumDim(a, b, c, n, beta); _CudaSumDim(a, b, c, n, beta);
...@@ -167,6 +181,37 @@ XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta) ...@@ -167,6 +181,37 @@ XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
return c; return c;
} }
/*
tensor summation
c = a + b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is summed with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a+b*\beta
>> n - the dimension index
>> beta - the scaling factor
>> requireLink - whether to add the operation to the computation network
*/
void SumDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
/* call _SumDim function */
_SumDim(&a, &b, &c, n, beta);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUMDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
}
}
/* /*
tensor broadcast summation c = a + b * \beta where some of dimensions of b can be of size 1 tensor broadcast summation c = a + b * \beta where some of dimensions of b can be of size 1
c = a + b * \beta c = a + b * \beta
...@@ -307,5 +352,31 @@ XTensor SumBroadcast(const XTensor &a, const XTensor &b, DTYPE beta) ...@@ -307,5 +352,31 @@ XTensor SumBroadcast(const XTensor &a, const XTensor &b, DTYPE beta)
return c; return c;
} }
/*
tensor broadcast summation c = a + b * \beta where some of dimensions of b can be of size 1
c = a + b * \beta
>> a - a tensor
>> b - another tensor that would be broadcasted
>> c - the resulting tensor
>> beta - the scaling factor
>> requireLink - whether to add the operation to the computation network
*/
void SumBroadcast(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
/* call _SumBroadcast function */
_SumBroadcast(&a, &b, &c, beta);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUMBROADCAST);
XLink::AddParamToHead(&c, beta);
}
}
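/* a shape sketch for SumBroadcast above (illustrative; a, b and c are made-up names,
   InitTensor2D is assumed from XTensor.h): with a of shape (4, 8) and b of shape (1, 8),
   c[i][j] = a[i][j] + b[0][j] * beta:
       XTensor a, b, c;
       InitTensor2D(&a, 4, 8, X_FLOAT);
       InitTensor2D(&b, 1, 8, X_FLOAT);
       SumBroadcast(a, b, c);               // or SumBroadcast(a, b, c, 1.0F, true) to link the op
*/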
} }
...@@ -42,12 +42,19 @@ void _SumDim(XTensor * a, const XTensor * b, int n, DTYPE beta = (DTYPE)1.0); ...@@ -42,12 +42,19 @@ void _SumDim(XTensor * a, const XTensor * b, int n, DTYPE beta = (DTYPE)1.0);
i.e., a is summed with b by broadcasting. We make a new tensor c to keep the result and return it */ i.e., a is summed with b by broadcasting. We make a new tensor c to keep the result and return it */
XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta = (DTYPE)1.0); XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta = (DTYPE)1.0);
/* tensor summation c = a + b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is summed with b by broadcasting */
void SumDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta = (DTYPE)1.0, bool requireLink = false);
/* tensor broadcast summation c = a + b * \beta where some of dimensions of b can be of size 1 */ /* tensor broadcast summation c = a + b * \beta where some of dimensions of b can be of size 1 */
void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0); void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
/* tensor broadcast summation c = a + b * \beta where some of dimensions of b can be of size 1. /* tensor broadcast summation c = a + b * \beta where some of dimensions of b can be of size 1.
we return the resulting tensor here */ we return the resulting tensor here */
XTensor SumBroadcast(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0); XTensor SumBroadcast(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0);
/* tensor broadcast summation c = a + b * \beta where some of dimensions of b can be of size 1 */
void SumBroadcast(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta = (DTYPE)1.0, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -23,7 +23,6 @@ ...@@ -23,7 +23,6 @@
#include "../../XDevice.h" #include "../../XDevice.h"
#include "../../XTensor.h" #include "../../XTensor.h"
#include "XTensorBLAS.h" #include "XTensorBLAS.h"
#include <stdint.h>
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -81,38 +80,6 @@ void _CudaBLASMatrixMUL(cublasHandle_t * handle, ...@@ -81,38 +80,6 @@ void _CudaBLASMatrixMUL(cublasHandle_t * handle,
else if (transposedA == X_TRANS && transposedB == X_TRANS) else if (transposedA == X_TRANS && transposedB == X_TRANS)
cublasHgemm(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, alpha3, (const __half*)b, mb, (const __half*)a, ma, beta3, (__half*)c, mc); cublasHgemm(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, alpha3, (const __half*)b, mb, (const __half*)a, ma, beta3, (__half*)c, mc);
} }
else if (dataTypeA == X_INT8 && dataTypeB == X_INT8 && dataTypeC == X_FLOAT) {
float alpha2 = (float)alpha;
float beta2 = (float)beta;
if (transposedA == X_NOTRANS && transposedB == X_NOTRANS)
cublasGemmEx(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (float*)c, CUDA_R_32F, mc, CUDA_R_32F, CUBLAS_GEMM_DEFAULT);
else if (transposedA == X_TRANS && transposedB == X_NOTRANS)
cublasGemmEx(*handle, CUBLAS_OP_N, CUBLAS_OP_T, mc, nc, na, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (float*)c, CUDA_R_32F, mc, CUDA_R_32F, CUBLAS_GEMM_DEFAULT);
else if (transposedA == X_NOTRANS && transposedB == X_TRANS)
cublasGemmEx(*handle, CUBLAS_OP_T, CUBLAS_OP_N, mc, nc, ma, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (float*)c, CUDA_R_32F, mc, CUDA_R_32F, CUBLAS_GEMM_DEFAULT);
else if (transposedA == X_TRANS && transposedB == X_TRANS)
cublasGemmEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (float*)c, CUDA_R_32F, mc, CUDA_R_32F, CUBLAS_GEMM_DEFAULT);
}
else if (dataTypeA == X_INT8 && dataTypeB == X_INT8 && dataTypeC == X_INT) {
int alpha2 = (int)alpha;
int beta2 = (int)beta;
/*
CUDA requires that the dimension of two tensor( lda, ldb ) should be multiples of 4.
details in https://devtalk.nvidia.com/default/topic/999101/about-cublasgemm-int8-support/
*/
if (mb % 4 != 0 || ma % 4 != 0) {
ShowNTErrors("mb, ma( lda, ldb ) should be multiples of 4!");
return;
}
if (transposedA == X_NOTRANS && transposedB == X_NOTRANS)
cublasGemmEx(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (int*)c, CUDA_R_32I, mc, CUDA_R_32I, CUBLAS_GEMM_DEFAULT);
else if (transposedA == X_TRANS && transposedB == X_NOTRANS)
cublasGemmEx(*handle, CUBLAS_OP_N, CUBLAS_OP_T, mc, nc, na, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (int*)c, CUDA_R_32I, mc, CUDA_R_32I, CUBLAS_GEMM_DEFAULT);
else if (transposedA == X_NOTRANS && transposedB == X_TRANS)
cublasGemmEx(*handle, CUBLAS_OP_T, CUBLAS_OP_N, mc, nc, ma, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (int*)c, CUDA_R_32I, mc, CUDA_R_32I, CUBLAS_GEMM_DEFAULT);
else if (transposedA == X_TRANS && transposedB == X_TRANS)
cublasGemmEx(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, (const int8_t*)a, CUDA_R_8I, ma, &beta2, (int*)c, CUDA_R_32I, mc, CUDA_R_32I, CUBLAS_GEMM_DEFAULT);
}
else { else {
ShowNTErrors("Unsupported data type!"); ShowNTErrors("Unsupported data type!");
} }
...@@ -160,7 +127,7 @@ void _CudaBLASMatrixMULBatched(cublasHandle_t * handle, ...@@ -160,7 +127,7 @@ void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
unsigned short alpha2 = FloatToFloat16(alpha); unsigned short alpha2 = FloatToFloat16(alpha);
unsigned short beta2 = FloatToFloat16(beta); unsigned short beta2 = FloatToFloat16(beta);
__half * alpha3 = (__half*)&alpha2; __half * alpha3 = (__half*)&alpha2;
__half * beta3 = (__half*)&beta2; __half * beta3 = (__half*)&beta2;
if (transposedA == X_NOTRANS && transposedB == X_NOTRANS) if (transposedA == X_NOTRANS && transposedB == X_NOTRANS)
cublasHgemmBatched(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, alpha3, (const __half**)b, mb, (const __half**)a, ma, beta3, (__half**)c, mc, count); cublasHgemmBatched(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, alpha3, (const __half**)b, mb, (const __half**)a, ma, beta3, (__half**)c, mc, count);
else if (transposedA == X_TRANS && transposedB == X_NOTRANS) else if (transposedA == X_TRANS && transposedB == X_NOTRANS)
...@@ -217,19 +184,13 @@ void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle, ...@@ -217,19 +184,13 @@ void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
__half * alpha3 = (__half*)&alpha2; __half * alpha3 = (__half*)&alpha2;
__half * beta3 = (__half*)&beta2; __half * beta3 = (__half*)&beta2;
if (transposedA == X_NOTRANS && transposedB == X_NOTRANS) if (transposedA == X_NOTRANS && transposedB == X_NOTRANS)
cublasHgemmStridedBatched(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, (const __half*)alpha3, (const __half*)b, mb, strideB, (const __half*)a, ma, strideA, (const __half*)beta3, (__half*)c, mc, strideC, count); cublasHgemmStridedBatched(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, (const __half*)alpha3, (const __half*)b, mb, strideB, (const __half*)a, ma, strideA, (const __half*)beta3, (__half*)c, mc, strideC, count);
else if (transposedA == X_TRANS && transposedB == X_NOTRANS) else if (transposedA == X_TRANS && transposedB == X_NOTRANS)
cublasHgemmStridedBatched(*handle, CUBLAS_OP_N, CUBLAS_OP_T, mc, nc, na, (const __half*)alpha3, (const __half*)b, mb, strideB, (const __half*)a, ma, strideA, (const __half*)beta3, (__half*)c, mc, strideC, count); cublasHgemmStridedBatched(*handle, CUBLAS_OP_N, CUBLAS_OP_T, mc, nc, na, (const __half*)alpha3, (const __half*)b, mb, strideB, (const __half*)a, ma, strideA, (const __half*)beta3, (__half*)c, mc, strideC, count);
else if (transposedA == X_NOTRANS && transposedB == X_TRANS) else if (transposedA == X_NOTRANS && transposedB == X_TRANS)
cublasHgemmStridedBatched(*handle, CUBLAS_OP_T, CUBLAS_OP_N, mc, nc, ma, (const __half*)alpha3, (const __half*)b, mb, strideB, (const __half*)a, ma, strideA, (const __half*)beta3, (__half*)c, mc, strideC, count); cublasHgemmStridedBatched(*handle, CUBLAS_OP_T, CUBLAS_OP_N, mc, nc, ma, (const __half*)alpha3, (const __half*)b, mb, strideB, (const __half*)a, ma, strideA, (const __half*)beta3, (__half*)c, mc, strideC, count);
else if (transposedA == X_TRANS && transposedB == X_TRANS) else if (transposedA == X_TRANS && transposedB == X_TRANS)
cublasHgemmStridedBatched(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, (const __half*)alpha3, (const __half*)b, mb, strideB, (const __half*)a, ma, strideA, (const __half*)beta3, (__half*)c, mc, strideC, count); cublasHgemmStridedBatched(*handle, CUBLAS_OP_T, CUBLAS_OP_T, mc, nc, na, (const __half*)alpha3, (const __half*)b, mb, strideB, (const __half*)a, ma, strideA, (const __half*)beta3, (__half*)c, mc, strideC, count);
}
else if (dataTypeA == X_INT8 && dataTypeB == X_INT8 && dataTypeC == X_FLOAT) {
//float alpha2 = (float)alpha;
//float beta2 = (float)beta;
//cublasGemmStridedBatchedEx(*handle, CUBLAS_OP_N, CUBLAS_OP_N, mc, nc, ma, &alpha2, (const int8_t*)b, CUDA_R_8I, mb, strideB, (const int8_t*)a, CUDA_R_8I, ma, strideA, &beta2, (float*)c, CUDA_R_32I, mc, strideC, count, CUDA_R_32I, CUBLAS_GEMM_DEFAULT);
ShowNTErrors("TO DO!");
} }
else { else {
ShowNTErrors("Unsupported data type!"); ShowNTErrors("Unsupported data type!");
......
...@@ -35,10 +35,8 @@ convert data type ...@@ -35,10 +35,8 @@ convert data type
*/ */
void _ConvertDataType(const XTensor * input, XTensor * output) void _ConvertDataType(const XTensor * input, XTensor * output)
{ {
if (input->dataType == output->dataType) { if (input->dataType == output->dataType)
_CopyValues(input, output); return;
return ;
}
#ifdef USE_CUDA #ifdef USE_CUDA
/* run it on GPUs */ /* run it on GPUs */
......
...@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <cublas_v2.h> #include <cublas_v2.h>
#include <cuda_fp16.h> #include <cuda_fp16.h>
#include <stdint.h>
__global__ __global__
void KernelFloatToFloat16(float * s, __half * t, int size) void KernelFloatToFloat16(float * s, __half * t, int size)
...@@ -61,16 +60,6 @@ void KernelFloatToInt(float * inputData, int * outputData, int size) ...@@ -61,16 +60,6 @@ void KernelFloatToInt(float * inputData, int * outputData, int size)
} }
} }
__global__
void KernelFloatToInt8(float * inputData, int8_t * outputData, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
outputData[i] = (int8_t)(inputData[i]);
}
}
__global__ __global__
void KernelIntToFloat(int * inputData, float * outputData, int size) void KernelIntToFloat(int * inputData, float * outputData, int size)
{ {
...@@ -78,18 +67,7 @@ void KernelIntToFloat(int * inputData, float * outputData, int size) ...@@ -78,18 +67,7 @@ void KernelIntToFloat(int * inputData, float * outputData, int size)
if (i < size){ if (i < size){
outputData[i] = (float)(inputData[i]); outputData[i] = (float)(inputData[i]);
} }}
}
__global__
void KernelIntToFloat8(int8_t * inputData, float * outputData, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
outputData[i] = (float)(inputData[i]);
}
}
/* /*
data conversion (cuda code) data conversion (cuda code)
...@@ -145,6 +123,7 @@ void _CudaConvertDataType(const XTensor * input, XTensor * output) ...@@ -145,6 +123,7 @@ void _CudaConvertDataType(const XTensor * input, XTensor * output)
int blockSize[3]; int blockSize[3];
GDevs.GetCudaThread(input->devID, input->unitNum, gridSize, blockSize); GDevs.GetCudaThread(input->devID, input->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]); dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]); dim3 threads(blockSize[0]);
...@@ -155,14 +134,10 @@ void _CudaConvertDataType(const XTensor * input, XTensor * output) ...@@ -155,14 +134,10 @@ void _CudaConvertDataType(const XTensor * input, XTensor * output)
KernelFloatToInt<<<blocks, threads>>>((float*)input->data, (int*)output->data, input->unitNum); KernelFloatToInt<<<blocks, threads>>>((float*)input->data, (int*)output->data, input->unitNum);
else if(input->dataType == X_INT && output->dataType == X_FLOAT) else if(input->dataType == X_INT && output->dataType == X_FLOAT)
KernelIntToFloat<<<blocks, threads>>>((int*)input->data, (float*)output->data, input->unitNum); KernelIntToFloat<<<blocks, threads>>>((int*)input->data, (float*)output->data, input->unitNum);
else if (input->dataType == X_FLOAT && output->dataType == X_FLOAT16) else if(input->dataType == X_FLOAT && output->dataType == X_FLOAT16)
KernelFloatToFloat16 << <blocks, threads >> >((float*)input->data, (__half*)output->data, input->unitNum); KernelFloatToFloat16<<<blocks, threads>>>((float*)input->data, (__half*)output->data, input->unitNum);
else if (input->dataType == X_FLOAT16 && output->dataType == X_FLOAT) else if(input->dataType == X_FLOAT16 && output->dataType == X_FLOAT)
KernelFloat16ToFloat << <blocks, threads >> >((__half*)input->data, (float*)output->data, input->unitNum); KernelFloat16ToFloat<<<blocks, threads>>>((__half*)input->data, (float*)output->data, input->unitNum);
else if (input->dataType == X_FLOAT && output->dataType == X_INT8)
KernelFloatToInt8 << <blocks, threads >> >((float*)input->data, (int8_t*)output->data, input->unitNum);
else if (input->dataType == X_INT8 && output->dataType == X_FLOAT)
KernelIntToFloat8 << <blocks, threads >> >((int8_t*)input->data, (float*)output->data, input->unitNum);
else{ else{
ShowNTErrors("Unsupported data types for conversion!"); ShowNTErrors("Unsupported data types for conversion!");
} }
...@@ -170,91 +145,6 @@ void _CudaConvertDataType(const XTensor * input, XTensor * output) ...@@ -170,91 +145,6 @@ void _CudaConvertDataType(const XTensor * input, XTensor * output)
ProtectCudaDev(input->devID, devIDBackup); ProtectCudaDev(input->devID, devIDBackup);
} }
__global__
void KernelConvertf32tof16(float* input, __half *output, int stride, int strideNum, int blockNum, int size)
{
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int idy = blockDim.y * blockIdx.y + threadIdx.y;
int blockIndex = idy / stride;
int offsetInBlock = idy% stride;
#pragma unroll
for (int i = idx * stride + stride * strideNum * blockIndex + offsetInBlock;
i < stride * strideNum * blockIndex + offsetInBlock + stride * strideNum && i < size;
i += stride * blockDim.x) {
output[i] = __float2half(input[i]);
}
}
__global__
void KernelConvertf16tof32(__half* input, float *output, int stride, int strideNum, int blockNum, int size)
{
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int idy = blockDim.y * blockIdx.y + threadIdx.y;
int blockIndex = idy / stride;
int offsetInBlock = idy% stride;
#pragma unroll
for (int i = idx * stride + stride * strideNum * blockIndex + offsetInBlock;
i < stride * strideNum * blockIndex + offsetInBlock + stride * strideNum && i < size;
i += stride * blockDim.x) {
output[i] = __half2float(input[i]);
}
}
/*
convert data type (cuda code)
>> input - input tensor
>> output - output tensor
*/
void _CudaConvertDataTypeNew(const XTensor * input, XTensor * output)
{
//CheckNTErrors((input->unitSize == output->unitSize), "Input and Output must be same in size!");
if (input->dataType == output->dataType)
return;
//int dimRDI = input->order - 1;
int dimRDI = 0;
int stride = 1;
int strideNumA = input->dimSizeRDI[dimRDI];
for (int i = 0; i < dimRDI; i++)
stride *= input->dimSizeRDI[i];
int blockNum = 1;
for (int i = dimRDI + 1; i < input->order; i++)
blockNum *= input->dimSizeRDI[i];
int gridSize[3];
int blockSize[3];
/*for (int i = 0; i < input->order; ++i)
{
printf("%d ", input->dimSizeRDI[i]);
}
printf("\n");
printf("%d %d\n", dimRDI, input->dimSizeRDI[dimRDI]);
printf("%d %d %d\n", stride, strideNumA, blockNum);*/
int workerNum = 64;
GDevs.GetCudaThread2D(input->devID, workerNum, stride * blockNum, MAX_INT, gridSize, blockSize);
dim3 blocks(gridSize[0], gridSize[1]);
dim3 threads(blockSize[0], blockSize[1]);
int devIDBackup;
ProtectCudaDev(input->devID, devIDBackup);
if (input->dataType == X_FLOAT && output->dataType == X_INT)
KernelFloatToInt << <blocks, threads >> > ((float*)input->data, (int*)output->data, input->unitNum);
else if (input->dataType == X_INT && output->dataType == X_FLOAT)
KernelIntToFloat << <blocks, threads >> > ((int*)input->data, (float*)output->data, input->unitNum);
else if (input->dataType == X_FLOAT && output->dataType == X_FLOAT16)
KernelConvertf32tof16 << <blocks, threads >> > ((float*)input->data, (__half*)output->data, stride, strideNumA, blockNum, input->unitNum);
else if (input->dataType == X_FLOAT16 && output->dataType == X_FLOAT)
KernelConvertf16tof32 << <blocks, threads >> >((__half*)input->data, (float*)output->data, stride, strideNumA, blockNum, input->unitNum);
else {
ShowNTErrors("Unsupported data types for conversion!");
}
ProtectCudaDev(input->devID, devIDBackup);
}
#endif // USE_CUDA #endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
...@@ -47,9 +47,6 @@ void KernelIntToFloat(int * inputData, float * outputData, int size); ...@@ -47,9 +47,6 @@ void KernelIntToFloat(int * inputData, float * outputData, int size);
/* convert data type */ /* convert data type */
void _CudaConvertDataType(const XTensor * input, XTensor * output); void _CudaConvertDataType(const XTensor * input, XTensor * output);
/* convert data type */
void _CudaConvertDataTypeNew(const XTensor * input, XTensor * output);
#endif // USE_CUDA #endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include "OnehotAndIndex.h" #include "OnehotAndIndex.h"
#include "OnehotAndIndex.cuh" #include "OnehotAndIndex.cuh"
#include "SetData.h"
namespace nts{ // namespace nts(NiuTrans.Tensor) namespace nts{ // namespace nts(NiuTrans.Tensor)
...@@ -31,43 +32,65 @@ convert onehot tensor to index tensor ...@@ -31,43 +32,65 @@ convert onehot tensor to index tensor
>> index - index tensor, which value is an integer num >> index - index tensor, which value is an integer num
>> size - the last dimension size of the onehot tensor >> dim - the dimension of the onehot representation
*/ */
void _OnehotToIndex(XTensor * onehot, XTensor * index, int size) void _OnehotToIndex(XTensor * onehot, XTensor * index, int dim)
{ {
CheckNTErrors(onehot->GetDim(-1) == size, "Illegal tensor dimension!"); dim = (dim < 0 ? onehot->order - 1 : dim);
CheckNTErrors(onehot->order == index->order + 1, "Illegal tensor order!"); CheckNTErrors(onehot->order == index->order + 1, "Illegal tensor order!");
CheckNTErrors(dim < onehot->order, "Illegal dimension specified!")
CheckNTErrors(onehot->dataType == X_INT, "The onehot tensor must be in X_INT!") CheckNTErrors(onehot->dataType == X_INT, "The onehot tensor must be in X_INT!")
CheckNTErrors(index->dataType == X_INT, "The index tensor must be in X_INT!") CheckNTErrors(index->dataType == X_INT, "The index tensor must be in X_INT!")
for (int i = 0; i < index->order; i++) for (int i = 0; i < index->order; i++) {
CheckNTErrors(index->GetDim(i) == onehot->GetDim(i), "Illegal tensor order!"); if (i < dim) {
CheckNTErrors(index->GetDim(i) == onehot->GetDim(i), "Illegal tensor order!");
}
else {
CheckNTErrors(index->GetDim(i) == onehot->GetDim(i + 1), "Illegal tensor order!");
}
}
#ifdef USE_CUDA #ifdef USE_CUDA
if(onehot->devID >= 0 && index->devID >= 0) { if(onehot->devID >= 0 && index->devID >= 0) {
_CudaOnehotToIndex(onehot, index, size); _CudaOnehotToIndex(onehot, index, dim);
return; return;
} }
#endif #endif
int blockNum = index->unitNum; int blockNum = 1;
int stride = size; int blockSize = 1;
int dimSize = 1;
int stride = 1;
for (int i = 0; i < dim; i++)
blockNum *= onehot->GetDim(i);
blockSize = onehot->unitNum / blockNum;
dimSize = onehot->GetDim(dim);
for (int i = dim + 1; i < onehot->order; i++)
stride *= onehot->GetDim(i);
int * onehotData = (int *)onehot->data; int * onehotData = (int *)onehot->data;
int * indexData = (int *)index->data; int * indexData = (int *)index->data;
for (int i = 0; i < blockNum; i++) { for (int i = 0; i < blockNum; i++) {
int * od = onehotData + i * stride;
int record = -1;
for (int j = 0; j < stride; j++) { for (int j = 0; j < stride; j++) {
if (od[j] != 0) { int * od = onehotData + i * blockSize + j;
if (record == -1) int * index = indexData + i * stride + j;
record = j;
else int record = -1;
ShowNTErrors("The value of onehot tensor is illegal!"); for (int j = 0; j < dimSize; j++) {
if (od[j*stride] != 0) {
if (record == -1)
record = j;
else
ShowNTErrors("The value of onehot tensor is illegal!");
}
} }
*index = record;
} }
indexData[i] = record;
} }
} }
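/* a worked example of the index arithmetic above (illustrative): for a onehot tensor of
   shape (2, 5, 3) with dim = 1, blockNum = 2, blockSize = 15, dimSize = 5 and stride = 3;
   element (i, k, j) of the onehot tensor lies at i * blockSize + k * stride + j, which is
   exactly what the od[j * stride] access scans in the innermost loop, and the result for
   block i and inner offset j is written to indexData[i * stride + j]. */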
/* /*
...@@ -99,11 +122,11 @@ convert index tensor to onehot tensor ...@@ -99,11 +122,11 @@ convert index tensor to onehot tensor
>> onehot - onehot tensor, which value is 0 or 1 >> onehot - onehot tensor, which value is 0 or 1
>> size - the last dimension size of the onehot tensor >> size - the last dimension size of the onehot tensor
>> labelSmoothingP - the label smoothing factor
*/ */
void _IndexToOnehot(XTensor * index, XTensor * onehot, int size) void _IndexToOnehot(const XTensor * index, XTensor * onehot, int size, float labelSmoothingP)
{ {
CheckNTErrors(onehot->GetDim(-1) == size, "Illegal tensor dimension!"); CheckNTErrors(onehot->GetDim(-1) == size, "Illegal tensor dimension!");
CheckNTErrors(onehot->order == index->order + 1, "Illegal tensor order!"); CheckNTErrors(onehot->order == index->order + 1, "Illegal tensor order!");
CheckNTErrors(onehot->dataType == X_INT, "The onehot tensor must be in X_INT!") //CheckNTErrors(onehot->dataType == X_INT, "The onehot tensor must be in X_INT!")
CheckNTErrors(index->dataType == X_INT, "The index tensor must be in X_INT!") CheckNTErrors(index->dataType == X_INT, "The index tensor must be in X_INT!")
for (int i = 0; i < index->order; i++) for (int i = 0; i < index->order; i++)
...@@ -111,9 +134,14 @@ void _IndexToOnehot(XTensor * index, XTensor * onehot, int size) ...@@ -111,9 +134,14 @@ void _IndexToOnehot(XTensor * index, XTensor * onehot, int size)
onehot->SetZeroAll(); onehot->SetZeroAll();
float confidence = 1 - labelSmoothingP;
float lowconfidence = labelSmoothingP / size;
//_SetDataFixedFloat(onehot, lowconfidence);
#ifdef USE_CUDA #ifdef USE_CUDA
if(onehot->devID >= 0 && index->devID >= 0) { if(onehot->devID >= 0 && index->devID >= 0) {
_CudaIndexToOnehot(index, onehot, size); _CudaIndexToOnehot(index, onehot, size, confidence, lowconfidence);
return; return;
} }
#endif #endif
...@@ -122,12 +150,12 @@ void _IndexToOnehot(XTensor * index, XTensor * onehot, int size) ...@@ -122,12 +150,12 @@ void _IndexToOnehot(XTensor * index, XTensor * onehot, int size)
int stride = size; int stride = size;
int * indexData = (int *)index->data; int * indexData = (int *)index->data;
int * onehotData = (int *)onehot->data; DTYPE * onehotData = (DTYPE *)onehot->data;
for (int i = 0; i < blockNum; i++) { for (int i = 0; i < blockNum; i++) {
int id = indexData[i]; int id = indexData[i];
int * od = onehotData + i * stride; DTYPE * od = onehotData + i * stride;
od[id] = 1; od[id] = confidence;
} }
} }
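/* worked numbers for the label smoothing above (illustrative): with size = 4 and
   labelSmoothingP = 0.1F, confidence = 1 - 0.1 = 0.9 and lowconfidence = 0.1 / 4 = 0.025;
   the CUDA kernel writes 0.9 at the gold index and 0.025 everywhere else, while this CPU
   branch writes only the 0.9 entry on top of the zero-initialized tensor. */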
...@@ -138,9 +166,10 @@ make a new tensor to keep the result and return it ...@@ -138,9 +166,10 @@ make a new tensor to keep the result and return it
>> index - index tensor, which value is an integer num >> index - index tensor, which value is an integer num
>> size - the last dimension size of the onehot tensor >> size - the last dimension size of the onehot tensor
>> labelSmoothingP - the label smoothing factor
<< return - the onehot tensor << return - the onehot tensor
*/ */
XTensor IndexToOnehot(XTensor & index, int size) XTensor IndexToOnehot(XTensor & index, int size, float labelSmoothingP)
{ {
CheckNTErrors(index.dataType == X_INT, "The onehot tensor must be in X_INT!") CheckNTErrors(index.dataType == X_INT, "The onehot tensor must be in X_INT!")
...@@ -151,9 +180,9 @@ XTensor IndexToOnehot(XTensor & index, int size) ...@@ -151,9 +180,9 @@ XTensor IndexToOnehot(XTensor & index, int size)
int * dim = new int[order + 1]; int * dim = new int[order + 1];
memcpy(dim, index.dimSize, order * sizeof(int)); memcpy(dim, index.dimSize, order * sizeof(int));
dim[order] = size; dim[order] = size;
InitTensor(&onehot, index.order + 1, dim, X_INT, 1.0F, index.devID, index.mem); InitTensor(&onehot, index.order + 1, dim, X_FLOAT, 1.0F, index.devID, index.mem);
_IndexToOnehot(&index, &onehot, size); _IndexToOnehot(&index, &onehot, size, labelSmoothingP);
delete[] dim; delete[] dim;
......
...@@ -96,7 +96,7 @@ convert index tensor to onehot tensor (kernel version) ...@@ -96,7 +96,7 @@ convert index tensor to onehot tensor (kernel version)
>> stride - stride of a data block >> stride - stride of a data block
*/ */
__global__ __global__
void KernelIndexToOnehot(int * onehotData, int * indexData, int blockNum, int stride) void KernelIndexToOnehot(DTYPE * onehotData, int * indexData, int blockNum, int stride, float confidence, float lowconfidence)
{ {
/* block id */ /* block id */
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
...@@ -107,10 +107,16 @@ void KernelIndexToOnehot(int * onehotData, int * indexData, int blockNum, int st ...@@ -107,10 +107,16 @@ void KernelIndexToOnehot(int * onehotData, int * indexData, int blockNum, int st
if (i >= blockNum || offset >= stride) if (i >= blockNum || offset >= stride)
return; return;
int * od = onehotData + i * stride; DTYPE * od = onehotData + i * stride;
int id = indexData[i]; int id = indexData[i];
od[id] = 1; //od[id] = confidence;
if (offset == id)
od[offset] = confidence;
else{
od[offset] = lowconfidence;
}
} }
/* /*
...@@ -120,7 +126,7 @@ convert index tensor to onehot tensor (cuda version) ...@@ -120,7 +126,7 @@ convert index tensor to onehot tensor (cuda version)
>> onehot - onehot tensor, which value is 0 or 1 >> onehot - onehot tensor whose values are confidence (at the gold index) or lowconfidence (elsewhere)
>> size - the last dimension size of the onehot tensor >> size - the last dimension size of the onehot tensor
*/ */
void _CudaIndexToOnehot(XTensor * index, XTensor * onehot, int size) void _CudaIndexToOnehot(const XTensor * index, XTensor * onehot, int size, float confidence, float lowconfidence)
{ {
int devID = onehot->devID; int devID = onehot->devID;
...@@ -138,10 +144,10 @@ void _CudaIndexToOnehot(XTensor * index, XTensor * onehot, int size) ...@@ -138,10 +144,10 @@ void _CudaIndexToOnehot(XTensor * index, XTensor * onehot, int size)
dim3 blocks(cudaGrids[0], cudaGrids[1]); dim3 blocks(cudaGrids[0], cudaGrids[1]);
dim3 threads(cudaBlocks[0], cudaBlocks[1]); dim3 threads(cudaBlocks[0], cudaBlocks[1]);
int * onehotData = (int *)onehot->data; DTYPE * onehotData = (DTYPE *)onehot->data;
int * indexData = (int *)index->data; int * indexData = (int *)index->data;
KernelIndexToOnehot<<<blocks, threads >>>(onehotData, indexData, blockNum, stride); KernelIndexToOnehot<<<blocks, threads >>>(onehotData, indexData, blockNum, stride, confidence, lowconfidence);
BacktoCudaDev(devID, devIDBackup); BacktoCudaDev(devID, devIDBackup);
} }
......
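The kernel writes one element per thread; its net effect on each row is restated below as plain CPU code for clarity (a sketch, not code from the commit):

// per-row effect of KernelIndexToOnehot: the gold position gets "confidence",
// every other position gets "lowconfidence"
void FillSmoothedRow(DTYPE * row, int gold, int size,
                     float confidence, float lowconfidence)
{
    for (int j = 0; j < size; j++)
        row[j] = (j == gold) ? confidence : lowconfidence;
}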
...@@ -30,7 +30,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor) ...@@ -30,7 +30,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
void _CudaOnehotToIndex(XTensor * onehot, XTensor * index, int size); void _CudaOnehotToIndex(XTensor * onehot, XTensor * index, int size);
/* convert index tensor to onehot tensor (cuda version) */ /* convert index tensor to onehot tensor (cuda version) */
void _CudaIndexToOnehot(XTensor * index, XTensor * onehot, int size); void _CudaIndexToOnehot(const XTensor * index, XTensor * onehot, int size, float confidence, float lowconfidence);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -27,18 +27,18 @@ ...@@ -27,18 +27,18 @@
namespace nts{ // namespace nts(NiuTrans.Tensor) namespace nts{ // namespace nts(NiuTrans.Tensor)
/* convert onehot tensor to index tensor */ /* convert onehot tensor to index tensor */
void _OnehotToIndex(XTensor * onehot, XTensor * index, int size); void _OnehotToIndex(XTensor * onehot, XTensor * index, int dim);
/* convert onehot tensor to index tensor (return an XTensor structure) /* convert onehot tensor to index tensor (return an XTensor structure)
make a new tensor to keep the result and return it */ make a new tensor to keep the result and return it */
XTensor OnehotToIndex(XTensor & onehot, int num); XTensor OnehotToIndex(XTensor & onehot, int size);
/* convert index tensor to onehot tensor */ /* convert index tensor to onehot tensor */
void _IndexToOnehot(XTensor * index, XTensor * onehot, int size); void _IndexToOnehot(const XTensor * index, XTensor * onehot, int size, float labelSmoothingP = 0.0F);
/* convert index tensor to onehot tensor (return an XTensor structure) /* convert index tensor to onehot tensor (return an XTensor structure)
make a new tensor to keep the result and return it */ make a new tensor to keep the result and return it */
XTensor IndexToOnehot(XTensor & index, int num); XTensor IndexToOnehot(XTensor & index, int num, float labelSmoothingP = 0.0F);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
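The header gives labelSmoothingP a default of 0.0F, so existing call sites keep compiling. A minimal usage sketch of the new interface (the tensor sizes and the InitTensor1D call are illustrative assumptions):

// build a smoothed one-hot matrix from integer labels (sketch)
XTensor label;
InitTensor1D(&label, 8, X_INT);                    // 8 positions, each holding a label id
SetDataFixedInt(label, 3);                         // e.g. every position is class 3
XTensor onehot = IndexToOnehot(label, 32, 0.1F);   // last-dimension size 32, smoothing 0.1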
...@@ -168,6 +168,17 @@ void SetDataFixed(XTensor &tensor, DTYPE p) ...@@ -168,6 +168,17 @@ void SetDataFixed(XTensor &tensor, DTYPE p)
{ {
_SetDataFixed(&tensor, &p); _SetDataFixed(&tensor, &p);
} }
/*
generate data items with a fixed value p (in integer)
>> tensor - the tensor whose data array would be initialized
>> p - an integer
*/
void SetDataFixedInt(XTensor &tensor, int p)
{
CheckNTErrors(tensor.dataType == X_INT, "An integer tensor is required!");
_SetDataFixed(&tensor, &p);
}
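The new overload simply forwards to _SetDataFixed after checking the data type, so callers can write, for example (the tensor construction here is an illustrative assumption):

XTensor mask;
InitTensor1D(&mask, 16, X_INT);   // an integer tensor
SetDataFixedInt(mask, 1);         // every item becomes 1; a non-X_INT tensor trips the CheckNTErrors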
/* /*
generate data items with a fixed value p (in integer) generate data items with a fixed value p (in integer)
...@@ -387,7 +398,7 @@ generate data items with a uniform distribution in [lower, upper] ...@@ -387,7 +398,7 @@ generate data items with a uniform distribution in [lower, upper]
>> lower - lower value of the range >> lower - lower value of the range
>> upper - upper value of the range >> upper - upper value of the range
*/ */
void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper) void _SetDataRand(const XTensor * tensor, DTYPE lower, DTYPE upper)
{ {
CheckNTErrors(upper > lower, "the high value must be greater than low value!"); CheckNTErrors(upper > lower, "the high value must be greater than low value!");
...@@ -440,9 +451,9 @@ the item to a pre-defined value if the item >= p, set the item to 0 otherwise ...@@ -440,9 +451,9 @@ the item to a pre-defined value if the item >= p, set the item to 0 otherwise
>> p - the threshold >> p - the threshold
>> value - the value we intend to assign to the item >> value - the value we intend to assign to the item
*/ */
void _SetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value) void _SetDataRandP(const XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value)
{ {
//CheckNTErrors(tensor->dataType == DEFAULT_DTYPE, "TODO"); CheckNTErrors(tensor->dataType == DEFAULT_DTYPE, "TODO");
if (tensor->devID < 0) { if (tensor->devID < 0) {
_SetDataRand(tensor, lower, upper); _SetDataRand(tensor, lower, upper);
......
...@@ -27,8 +27,6 @@ ...@@ -27,8 +27,6 @@
#include <curand_kernel.h> #include <curand_kernel.h>
#include "../../XDevice.h" #include "../../XDevice.h"
#include "../../XUtility.h" #include "../../XUtility.h"
#include "../getandset/ConvertDataType.h"
#include "../movement/CopyValues.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -188,30 +186,6 @@ void KernelSetDataRandDouble(double * d, int size, DTYPE lower, DTYPE variance) ...@@ -188,30 +186,6 @@ void KernelSetDataRandDouble(double * d, int size, DTYPE lower, DTYPE variance)
} }
/* /*
set data array with a uniform distribution in [low, high]
>> deviceStates - the state of curand
>> d - float datatype pointer to the data array
>> size - size of the array
>> lower - low value of the range
>> variance - the variance of the range
*/
__global__
void KernelSetDataRandHalf(half * d, int size, DTYPE lower, DTYPE variance)
{
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
half lowerHalf = __float2half(lower);
half varianceHalf = __float2half(variance);
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
d[i] = d[i] * varianceHalf + lowerHalf;
}
#endif
}
/*
set data items to a pre-defined value if its value >= p, set it to 0 otherwise set data items to a pre-defined value if its value >= p, set it to 0 otherwise
>> d - pointer to the data array >> d - pointer to the data array
>> size - size of the array >> size - size of the array
...@@ -231,24 +205,6 @@ void KernelSetDataPCut(DTYPE * d, int size, DTYPE p, DTYPE value) ...@@ -231,24 +205,6 @@ void KernelSetDataPCut(DTYPE * d, int size, DTYPE p, DTYPE value)
} }
} }
__global__
void KernelSetDataPCutHalf(half * d, int size, DTYPE p, DTYPE value)
{
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
half halfP = __float2half(p);
half halfValue = __float2half(value);
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
if (d[i] >= halfP)
d[i] = halfValue;
else
d[i] = 0;
}
#endif
}
/* /*
set data items along with a given dimension (and keep the remaining items unchanged) - kernel version set data items along with a given dimension (and keep the remaining items unchanged) - kernel version
>> tensor - the tensor whose data array would be initialized >> tensor - the tensor whose data array would be initialized
...@@ -501,7 +457,7 @@ generate data items with a uniform distribution in [lower, upper] ...@@ -501,7 +457,7 @@ generate data items with a uniform distribution in [lower, upper]
>> lower - lower value of the range >> lower - lower value of the range
>> upper - upper value of the range >> upper - upper value of the range
*/ */
void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper) void _CudaSetDataRand(const XTensor * tensor, DTYPE lower, DTYPE upper)
{ {
CheckNTErrors(upper > lower, "the high value must be greater than low value!"); CheckNTErrors(upper > lower, "the high value must be greater than low value!");
...@@ -516,44 +472,21 @@ void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper) ...@@ -516,44 +472,21 @@ void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
int devIDBackup; int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup); ProtectCudaDev(tensor->devID, devIDBackup);
XTensor tensor1(tensor->order, tensor->dimSize, X_FLOAT, tensor->denseRatio, tensor->devID, tensor->mem); curandGenerator_t & gen = GDevs.GPUs[tensor->devID].gen;
curandGenerateUniform(gen , (float*)tensor->data , tensor->unitNum);
if (tensor->dataType == X_FLOAT){
curandGenerator_t & gen = GDevs.GPUs[tensor->devID].gen;
curandGenerateUniform(gen, (float*)tensor->data, tensor->unitNum);
}
else{
curandGenerator_t & gen = GDevs.GPUs[tensor->devID].gen;
curandGenerateUniform(gen, (float*)tensor1.data, tensor1.unitNum);
}
//curandGenerator_t & gen = GDevs.GPUs[tensor->devID].gen;
//curandGenerateUniform(gen, (float*)tensor->data, tensor->unitNum);
DTYPE variance = upper - lower; DTYPE variance = upper - lower;
if (variance != 1.0F || lower != 0) { if(variance != 1.0F || lower != 0){
if (tensor->dataType == X_FLOAT) { if (tensor->dataType == X_FLOAT)
KernelSetDataRandFloat << <blocks, threads >> >((float*)tensor->data, tensor->unitNum, lower, variance); KernelSetDataRandFloat <<<blocks, threads >>>((float*) tensor->data, tensor->unitNum, lower, variance);
} else if (tensor->dataType == X_DOUBLE)
else if (tensor->dataType == X_DOUBLE) { KernelSetDataRandDouble <<<blocks, threads >>>((double*)tensor->data, tensor->unitNum, lower, variance);
KernelSetDataRandDouble << <blocks, threads >> >((double*)tensor->data, tensor->unitNum, lower, variance);
}
else if (tensor->dataType == X_FLOAT16) {
_ConvertDataType(&tensor1, tensor);
KernelSetDataRandHalf << <blocks, threads >> >((half*)tensor->data, tensor->unitNum, lower, variance);
}
else {
ShowNTErrors("TODO!");
}
} }
else if (tensor->dataType == X_FLOAT16) {
_ConvertDataType(&tensor1, tensor);
}
BacktoCudaDev(tensor->devID, devIDBackup); BacktoCudaDev(tensor->devID, devIDBackup);
} }
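The cleaned-up version drops the FLOAT16 path and keeps only the float/double branches: curandGenerateUniform first fills the buffer with values in (0, 1], and the type-specific kernels then apply the same affine map. Restated as a sketch:

// what KernelSetDataRandFloat / KernelSetDataRandDouble compute per element:
// u drawn from (0, 1]  ->  lower + u * (upper - lower)
DTYPE MapUniform(DTYPE u, DTYPE lower, DTYPE upper)
{
    return u * (upper - lower) + lower;   // "variance" = upper - lower in the code above
}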
/* /*
generate data items with a uniform distribution in [lower, upper] and set generate data items with a uniform distribution in [lower, upper] and set
the item to a pre-defined value if the item >= p, set the item to 0 otherwise the item to a pre-defined value if the item >= p, set the item to 0 otherwise
...@@ -563,7 +496,7 @@ the item to a pre-defined value if the item >= p, set the item to 0 otherwise ...@@ -563,7 +496,7 @@ the item to a pre-defined value if the item >= p, set the item to 0 otherwise
>> p - the threshold >> p - the threshold
>> value - the value we intend to assign to the item >> value - the value we intend to assign to the item
*/ */
void _CudaSetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value) void _CudaSetDataRandP(const XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value)
{ {
_CudaSetDataRand(tensor, lower, upper); _CudaSetDataRand(tensor, lower, upper);
...@@ -577,16 +510,8 @@ void _CudaSetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYP ...@@ -577,16 +510,8 @@ void _CudaSetDataRandP(XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYP
int devIDBackup; int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup); ProtectCudaDev(tensor->devID, devIDBackup);
if (tensor->dataType == X_FLOAT) { KernelSetDataPCut << <blocks, threads >> >((float*)tensor->data, tensor->unitNum, p, value);
KernelSetDataPCut << <blocks, threads >> >((float*)tensor->data, tensor->unitNum, p, value);
}
else if (tensor->dataType == X_FLOAT16) {
KernelSetDataPCutHalf << <blocks, threads >> >((__half*)tensor->data, tensor->unitNum, p, value);
}
else {
ShowNTErrors("TODO!")
}
BacktoCudaDev(tensor->devID, devIDBackup); BacktoCudaDev(tensor->devID, devIDBackup);
} }
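After the uniform fill, the p-cut kernel thresholds each element; the simplified launcher now assumes DEFAULT_DTYPE data, matching the check re-enabled in _SetDataRandP above. The per-element rule, restated on the CPU as a sketch:

// per-element effect of KernelSetDataPCut
void PCut(DTYPE * d, int size, DTYPE p, DTYPE value)
{
    for (int i = 0; i < size; i++)
        d[i] = (d[i] >= p) ? value : 0.0F;
}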
......