Commit 9a477f7d by linye

Clean the codes

parent 44bf9fa6
......@@ -92,7 +92,7 @@ dE/da = IndexToOnehot(b)
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XDataGrad::GradIndexToOnehot(XTensor * node, bool isEfficent)
void XDataGrad::GradOnehotToIndex(XTensor * node, bool isEfficent)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for OnehotToIndex!");
......@@ -102,10 +102,19 @@ void XDataGrad::GradIndexToOnehot(XTensor * node, bool isEfficent)
XNoder::MakeGrad(input);
node->visitMark = NODE_FINISHED;
}
void XDataGrad::GradOnehotToIndex(XTensor * node, bool isEfficent)
/*
gradient computation for IndexToOnehot
for
b = IndexToOnehot(a)
we have
dE/da = IndexToOnehot(b)
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XDataGrad::GradIndexToOnehot(XTensor * node, bool isEfficent)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for IndexToOnehot!");
......@@ -115,7 +124,6 @@ void XDataGrad::GradOnehotToIndex(XTensor * node, bool isEfficent)
XNoder::MakeGrad(input);
node->visitMark = NODE_FINISHED;
}
} // namespace nts(NiuTrans.Tensor)
......@@ -20,7 +20,9 @@
*/
#include "XBackwardLoss.h"
#include "XNoder.h"
#include "../tensor/XName.h"
#include "../tensor/function/FHeader.h"
#include "../tensor/core/getandset/SetData.h"
#include "../tensor/function/HardTanH.h"
#include "../tensor/function/Identity.h"
......@@ -31,6 +33,60 @@
namespace nts{
/* compute dE/dx of a node */
void XLossGrad::MakeGrad(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
int operID = income.typeID;
CheckNTErrors(income.tailNum >= 1, "Wrong number of tensors for loss computation!");
XTensor * output = income.tails[0];
XTensor * gold = NULL;
XTensor * weight = NULL;
XTensor * padding = NULL;
int leadingDim;
XNoder::MakeGrad(output);
XTensor * dedy = output->grad;
if (income.tailNum == 1) {
if(dedy->dataType == X_FLOAT)
_SetDataFixedFloat(dedy, 1.0F);
else if(dedy->dataType == X_DOUBLE)
_SetDataFixedDouble(dedy, 1.0);
else if(dedy->dataType == X_INT)
_SetDataFixedInt(dedy, 1);
else
ShowNTErrors("TODO");
return;
}
gold = income.tails[1];
if(operID == LOSS_CROSSENTROPY) {
if (income.tailNum == 3)
padding = income.tails[2];
leadingDim = income.GetParamInt(0);
CheckNTErrors(leadingDim >= 0 && leadingDim < output->order, "Wrong leading dimension for cross entropy!");
_CrossEntropyBackward(dedy, output, gold, weight, padding, leadingDim);
}
else{
ShowNTErrors("Wrong activation function type!");
}
node->visitMark = NODE_FINISHED;
}
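/* A worked note on the cross-entropy branch above (this is the textbook
   identity only, stated as a sketch; the exact convention inside
   _CrossEntropyBackward is not shown in this hunk): for the per-position loss
       E = - sum_i gold_i * log(output_i)
   the gradient with respect to the output is
       dE/doutput_i = - gold_i / output_i ,
   with leadingDim giving the class/vocabulary dimension and the optional
   padding tensor masking out padded positions. */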
/* indicates whether the node is for a loss computation */
bool XLossGrad::IsLossOP(XTensor * node)
{
XLink &income = node->income;
return (income.typeID & LOSS_BASE) != 0;
}
/*
compute dE/dx for a given function y = f(x)
>> gold - gold standard to measure error (or loss)
......
......@@ -23,6 +23,7 @@
#include "../tensor/XTensor.h"
#include "../tensor/function/FHeader.h"
#include "../tensor/loss/LHeader.h"
#ifndef __XBACKWARDLOSS_H__
#define __XBACKWARDLOSS_H__
......@@ -34,6 +35,14 @@ namespace nts{
class XLossGrad
{
public:
/* compute dE/dx of a node */
static
void MakeGrad(XTensor * node, bool isEfficient);
/* indicates whether the node is for a Loss computation */
static
bool IsLossOP(XTensor * node);
/* compute dE/dx for a given function y = f(x) */
void Compute(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx, XTensor * padding,
......
......@@ -81,6 +81,12 @@ void XMathGrad::MakeGrad(XTensor * node, bool isEfficient)
GradPower(node, isEfficient);
else if(operID == MATH_SCALEANDSHIFT)
GradScaleAndShift(node, isEfficient);
else if(operID == MATH_SCALE)
GradScale(node, isEfficient);
else if(operID == MATH_DESCALE)
GradDescale(node, isEfficient);
else if(operID == MATH_SHIFT)
GradShift(node, isEfficient);
else if(operID == MATH_SUB)
GradSub(node, isEfficient);
else if(operID == MATH_SUBDIM)
......@@ -99,6 +105,8 @@ void XMathGrad::MakeGrad(XTensor * node, bool isEfficient)
GradReduceSumSquared(node, isEfficient);
else if(operID == REDUCE_REDUCEVARIANCE)
GradReduceVariance(node, isEfficient);
else if (operID == MATH_MULANDSHIFT)
GradMulAndShift(node, isEfficient);
else{
ShowNTErrors("TODO!");
}
......@@ -717,12 +725,18 @@ void XMathGrad::GradMultiply(XTensor * node, bool isEfficient)
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
CheckNTErrors(XTensor::IsSameShaped(a, b), "Wrong sized input tensors!");
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_Multiply(node->grad, b, a->grad, 1.0F);
_Multiply(node->grad, a, b->grad, 1.0F);
}
if (!isEfficient || b->isGrad) {
XNoder::MakeGrad(b);
_Multiply(node->grad, a, b->grad, 1.0F);
}
node->visitMark = NODE_FINISHED;
}
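/* A worked example of the element-wise rule used above (the numbers are
   illustrative only): for c = a * b, dE/da = dE/dc * b and dE/db = dE/dc * a.
   With a = [2, 3], b = [4, 5] and dE/dc = [1, 1] we get dE/da = [4, 5] and
   dE/db = [2, 3]. In the efficient mode, the update for an operand is skipped
   when its isGrad flag is false, so no gradient tensor is allocated for
   branches of the graph that do not need one. */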
......@@ -887,88 +901,8 @@ gradient for normalize
*/
void XMathGrad::GradNormalize(XTensor * node, bool isEfficient)
{
ShowNTErrors("This is really a bad piece of code!!!");
XLink &income = node->income;
CheckNTErrors(income.tailNum == 5, "Wrong input tensor number for NORMALIZE!");
XTensor * input = income.tails[0];
XTensor * mean = income.tails[1];
XTensor * var = income.tails[2];
XTensor * a = income.tails[3];
XTensor * b = income.tails[4];
XTensor * c = NewTensor(var);
XTensor * d = NewTensor(a);
XTensor * e = NewTensor(a);
XTensor * f = NewTensor(a);
XTensor * g = NewTensor(a);
XTensor * h = NewTensor(a);
XTensor * i = NewTensor(a);
XTensor * j = NewTensor(a);
XTensor * k = NewTensor(var);
XTensor * p = NewTensor(var);
XTensor * q = NewTensor(var);
XTensor * r = NewTensor(a);
XTensor * x = NewTensor(mean);
XTensor * y = NewTensor(mean);
XTensor * z = NewTensor(mean);
DTYPE epsilon = income.GetParam(1);
int dim = income.GetParamInt(0);
int n = a->GetDim(dim);
XNoder::MakeGrad(input);
XNoder::MakeGrad(mean);
XNoder::MakeGrad(var);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
/* dEdinput */
_ScaleAndShift(var, c, 1.0F, epsilon);
_Unsqueeze(c, d, dim, n);
_Power(d, e, -0.5F);
_Multiply(a, e, f);
_Multiply(node->grad, f, input->grad, 1.0F);
/* dEdmean */
_ScaleAndShift(f, g, -1.0F);
_ReduceSum(g, x, dim);
_ReduceSum(node->grad, y, dim);
_Multiply(y, x, mean->grad, 1.0F);
/* dEdvar */
_Unsqueeze(mean, h, dim, n);
_Sub(input, h, i);
_Multiply(a, i, j);
_Power(var, k, -1.5F);
_ScaleAndShift(k, p, -0.5F);
_ReduceSum(j, z, dim);
_Multiply(z, p, q);
_Multiply(y, q, var->grad, 1.0F);
/* dEda */
_Multiply(i, e, r);
_Multiply(node->grad, r, a->grad, 1.0F);
/* dEdb */
_Sum(b->grad, node->grad, b->grad);
node->visitMark = NODE_FINISHED;
ShowNTErrors("TODO!");
delete c;
delete d;
delete e;
delete f;
delete g;
delete h;
delete i;
delete j;
delete k;
delete p;
delete q;
delete r;
delete x;
delete y;
delete z;
}
/*
......@@ -1029,6 +963,82 @@ void XMathGrad::GradScaleAndShift(XTensor * node, bool isEfficient)
}
/*
gradient for Scale
for
c = a * scale
we have
dE/da = dE/dc * scale
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradScale(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SCALE!");
XTensor * a = income.tails[0];
DTYPE scale = income.GetParam(0);
XNoder::MakeGrad(a);
_Sum(a->grad, node->grad, a->grad, scale);
node->visitMark = NODE_FINISHED;
}
/*
gradient for Descale
for
c = a / descale
we have
dE/da = dE/dc / descale
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradDescale(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for DESCALE!");
XTensor * a = income.tails[0];
DTYPE descale = income.GetParam(0);
XNoder::MakeGrad(a);
_Sum(a->grad, node->grad, a->grad, 1/descale);
node->visitMark = NODE_FINISHED;
}
/*
gradient for Shift
for
c = a + shift
we have
dE/da = dE/dc
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradShift(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SHIFT!");
XTensor * a = income.tails[0];
XNoder::MakeGrad(a);
_Sum(a->grad, node->grad, a->grad);
node->visitMark = NODE_FINISHED;
}
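/* A minimal, self-contained numeric check of the three scalar rules above
   (a separate toy program using standard C++ only -- it deliberately does not
   call the NiuTrans API, and the loss E(c) = c * c is just an illustrative
   choice): */
#include <cstdio>

int main()
{
    const float a = 0.7F, s = 2.5F, eps = 1e-3F;
    auto E    = [](float c) { return c * c; };     /* toy loss, so dE/dc = 2c */
    auto dEdc = [](float c) { return 2.0F * c; };

    /* c = a * s  ->  dE/da = dE/dc * s (GradScale) */
    float gScale = dEdc(a * s) * s;
    float nScale = (E((a + eps) * s) - E((a - eps) * s)) / (2.0F * eps);

    /* c = a / s  ->  dE/da = dE/dc / s (GradDescale) */
    float gDescale = dEdc(a / s) / s;
    float nDescale = (E((a + eps) / s) - E((a - eps) / s)) / (2.0F * eps);

    /* c = a + s  ->  dE/da = dE/dc (GradShift) */
    float gShift = dEdc(a + s);
    float nShift = (E(a + eps + s) - E(a - eps + s)) / (2.0F * eps);

    printf("scale:   analytic %.5f vs numeric %.5f\n", gScale,   nScale);
    printf("descale: analytic %.5f vs numeric %.5f\n", gDescale, nDescale);
    printf("shift:   analytic %.5f vs numeric %.5f\n", gShift,   nShift);
    return 0;
}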
/*
gradient for minus
for
c = a - b * \beta
......@@ -1487,4 +1497,126 @@ void XMathGrad::GradReduceVariance(XTensor * node, bool isEfficient)
node->visitMark = NODE_FINISHED;
}
/*
gradient for MulAndShift
for c = matmul(x, w) + b
we have
dE/dx = dE/dc * w^T
dE/dw = x^T * dE/dc
dE/db = dE/dc summed over every dimension except the one that matches b (i.e., dims 0, ..., n-1, n+1, ...)
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradMulAndShift(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 3, "wrong input tensor number");
XTensor * x = income.tails[0];
XTensor * w = income.tails[1];
XTensor * b = income.tails[2];
int n = income.GetParamInt(0);
MATRIX_TRANS_TYPE transW = income.GetParamTrans(1);
MATRIX_TRANS_TYPE transX = income.GetParamTrans(2);
if (!isEfficient || w->isGrad)
XNoder::MakeGrad(w);
if (!isEfficient || x->isGrad)
XNoder::MakeGrad(x);
if (!isEfficient || b->isGrad)
XNoder::MakeGrad(b);
int order = node->order;
int dimSize[MAX_TENSOR_DIM_NUM];
memcpy(dimSize, node->dimSize, sizeof(int) * node->order);
/* compute dE/db */
if (n == order - 1) {
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = node->unitNum / dimSize[order - 1];
reshapedSize[1] = dimSize[order - 1];
/* we reshape dE/dc to a matrix whose column number is equal to the
size of b. Then we can reduce the matrix into a row vector. */
node->grad->Reshape(2, reshapedSize);
XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(node->grad, bGradTMP, 0);
_Sum(bGradTMP, b->grad, b->grad);
DelTensorBuf(bGradTMP);
node->grad->Reshape(order, dimSize);
}
else {
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = 1;
reshapedSize[1] = dimSize[n];
reshapedSize[2] = 1;
for (int i = 0; i < order; i++) {
if (i < n)
reshapedSize[0] *= dimSize[i];
}
reshapedSize[2] = node->unitNum / (reshapedSize[0] * reshapedSize[1]);
/* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|.
Then reduce along z and then x to obtain dE/db. */
node->grad->Reshape(3, reshapedSize);
XTensor * interGrad = NewTensorBuf(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(node->grad, interGrad, 2);
XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(interGrad, bGradTMP, 0);
_Sum(bGradTMP, b->grad, b->grad);
DelTensorBuf(bGradTMP);
node->grad->Reshape(order, dimSize);
DelTensorBuf(interGrad);
}
/* compute dE/dx, dE/dw */
XTensor * c = node;
XTensor * dedc = node->grad;
XTensor * dedw = w->grad;
XTensor * dedx = x->grad;
if (x->order == 2 && w->order == 2)
GradMatrixMul(x, dedx, transX, w, dedw, transW, dedc, 1.0F, isEfficient);
else if (transX == X_NOTRANS && x->order > 2 && w->order == 2){
int orderBackupX = x->order;
int orderBackupC = c->order;
int dimsBackupX[MAX_TENSOR_DIM_NUM];
int dimsBackupC[MAX_TENSOR_DIM_NUM];
memcpy(dimsBackupX, x->dimSize, sizeof(int) * x->order);
memcpy(dimsBackupC, c->dimSize, sizeof(int) * c->order);
x->Reshape(x->unitNum / x->GetDim(-1), x->GetDim(-1));
c->Reshape(c->unitNum / c->GetDim(-1), c->GetDim(-1));
if (!isEfficient || x->isGrad)
dedx->Reshape(dedx->unitNum / dedx->GetDim(-1), dedx->GetDim(-1));
dedc->Reshape(dedc->unitNum / dedc->GetDim(-1), dedc->GetDim(-1));
GradMatrixMul(x, dedx, transX, w, dedw, transW, dedc, 1.0F, isEfficient);
x->Reshape(orderBackupX, dimsBackupX);
c->Reshape(orderBackupC, dimsBackupC);
if (!isEfficient || x->isGrad)
dedx->Reshape(orderBackupX, dimsBackupX);
dedc->Reshape(orderBackupC, dimsBackupC);
}
node->visitMark = NODE_FINISHED;
}
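/* A worked shape example for the bias gradient above (the sizes are
   illustrative): suppose c = matmul(x, w) + b with c of shape
   (batch 2, length 3, |b| 4) and the bias attached to the last dimension
   (n == order - 1). dE/dc is reshaped to a 2 * 3 = 6 by 4 matrix and reduced
   over dimension 0, giving a length-4 vector that is accumulated into b->grad.
   If the bias dimension is not the last one, dE/dc is reshaped to (x, |b|, z)
   instead and reduced over z and then over x. */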
}
......@@ -130,6 +130,18 @@ private:
static
void GradScaleAndShift(XTensor * node, bool isEfficient);
/* gradient for Scale */
static
void GradScale(XTensor * node, bool isEfficient);
/* gradient for Shift */
static
void GradShift(XTensor * node, bool isEfficient);
/* gradient for Descale */
static
void GradDescale(XTensor * node, bool isEfficient);
/* gradient for Minus */
static
void GradSub(XTensor * node, bool isEfficient);
......@@ -168,6 +180,10 @@ private:
/* gradient for reduceVariance */
static
void GradReduceVariance(XTensor * node, bool isEfficient);
/* gradient for MulAndShift */
static
void GradMulAndShift(XTensor * node, bool isEfficient);
};
}
......
......@@ -43,6 +43,8 @@ void XShapeGrad::MakeGrad(XTensor * node, bool isEfficent)
GradCopyIndexed(node, isEfficent);
else if(operID == MOVEMENT_GATHER)
GradGather(node, isEfficent);
else if (operID == MOVEMENT_DROPOUTWITHINDEX)
GradDropoutWithIndex(node, isEfficent);
else if(operID == SHAPE_MERGE)
GradMerge(node, isEfficent);
else if(operID == SHAPE_MERGE_LIST)
......@@ -115,7 +117,7 @@ dE/da = spreadforgather(b)
void XShapeGrad::GradGather(XTensor * node, bool isEfficent)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for CopyIndexed!");
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for Gather!");
XTensor * input = income.tails[0];
XTensor * index = income.tails[1];
......@@ -127,6 +129,43 @@ void XShapeGrad::GradGather(XTensor * node, bool isEfficent)
}
/*
gradient computation for DropoutWithIndex function
*/
void XShapeGrad::GradDropoutWithIndex(XTensor * node, bool isEfficent)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for DropoutWithIndex!");
XTensor * input = income.tails[0];
XTensor * index = income.tails[1];
DTYPE scale = income.GetParam(0);
XNoder::MakeGrad(input);
//_Identity(node->grad, input->grad);
_CopyValues(node->grad, input->grad);
int order = node->grad->order;
int * dimSize = new int[order];
for (int i = 0; i < order; i++) {
dimSize[i] = node->grad->dimSize[i];
}
int order1 = 1;
int * dimSize1 = new int[order1];
dimSize1[0] = input->grad->unitNum;
input->grad->Reshape(order1, dimSize1);
_DropoutWithIndex(node->grad, index, input->grad);
_ScaleAndShiftMe(input->grad, scale);
input->grad->Reshape(order, dimSize);
delete[] dimSize;
delete[] dimSize1;
node->visitMark = NODE_FINISHED;
}
/*
gradient for merge
for
c = merge(a_0, a_1, ...)
......
......@@ -54,6 +54,10 @@ private:
static
void GradGather(XTensor * node, bool isEfficent);
/* gradient computation for dropout with index */
static
void GradDropoutWithIndex(XTensor * node, bool isEfficent);
/* gradient computation for merge: c = merge(a, b, ...) */
static
void GradMerge(XTensor * node, bool isEfficent);
......
......@@ -21,14 +21,14 @@
#include "XNet.h"
#include "XNoder.h"
#include "XBackwardData.h"
#include "XBackwardLoss.h"
#include "XBackwardMath.h"
#include "XBackwardFunc.h"
#include "XBackwardData.h"
#include "XBackwardShape.h"
#include "../tensor/XName.h"
namespace nts{
namespace nts {
unsigned int netIDGlobal = 0;
MUTEX_HANDLE netMutex;
......@@ -36,7 +36,7 @@ MUTEX_HANDLE netMutex;
/* generate a network id */
unsigned int MakeNetID()
{
if(netIDGlobal == 0)
if (netIDGlobal == 0)
MUTEX_INIT(netMutex);
MUTEX_LOCK(netMutex);
......@@ -180,66 +180,67 @@ void XNet::Backward(XList &roots, XList &golds, XList &paddings, LOSS_FUNCTION_N
Traverse(roots);
/* label tensors where the backward computation is necessary */
if(isGradEfficient)
if (isGradEfficient)
MakeEfficientNet();
for(int i = 0; i < nodes.count; i++){
for (int i = 0; i < nodes.count; i++) {
XTensor * node = (XTensor*)nodes.Get(i);
node->visitMark = NODE_UNFINISHED;
}
XLossGrad lossGrad;
/* we start with the gradient with respect to the loss for output layers */
for(int i = 0; i < roots.count; i++){
XTensor * root = (XTensor*)roots.Get(i);
XTensor * gold = (XTensor*)golds.Get(i);
XTensor * padding = (XTensor*)paddings.Get(i);
XLink &income = root->income;
int funcID = income.typeID;
void * params = income.params;
/* we compute dE/dx if the output is generated by an activation function y = f(x).
Note that we do not need to obtain dE/dy here because it is of no use in the
following process of back-propagation */
if(gold != NULL && income.tailNum == 1 && (funcID & FUNCTION_BASE)){
if(funcID == FUNC_LOGSOFTMAX || funcID == FUNC_SOFTMAX) {
XTensor * x = income.tails[0];
XNoder::MakeGrad(x);
lossGrad.Compute(gold, root, x, NULL, x->grad, padding, funcID, params, loss);
root->visitMark = NODE_FINISHED;
}
else {
XNoder::MakeGrad(root);
lossGrad.Compute(gold, root, root->grad, padding, loss);
}
}
/* we compute dE/dy (y is the output) if no predefined activation function is used */
else{
XNoder::MakeGrad(root);
lossGrad.Compute(gold, root, root->grad, NULL, loss);
}
}
//XLossGrad lossGrad;
///* we start with the gradient with respect to the loss for output layers */
//for (int i = 0; i < roots.count; i++) {
// XTensor * root = (XTensor*)roots.Get(i);
// XTensor * gold = (XTensor*)golds.Get(i);
// XTensor * padding = (XTensor*)paddings.Get(i);
// XLink &income = root->income;
// int funcID = income.typeID;
// void * params = income.params;
// /* we compute dE/dx if the output is generated by an activation function y = f(x).
// Note that we do not need to obtain dE/dy here because it is of no use in the
// following process of back-propagation */
// if (gold != NULL && income.tailNum == 1 && (funcID & FUNCTION_BASE)) {
// if (funcID == FUNC_LOGSOFTMAX || funcID == FUNC_SOFTMAX) {
// XTensor * x = income.tails[0];
// XNoder::MakeGrad(x);
// lossGrad.Compute(gold, root, x, NULL, x->grad, padding, funcID, params, loss);
// root->visitMark = NODE_FINISHED;
// }
// else {
// XNoder::MakeGrad(root);
// lossGrad.Compute(gold, root, root->grad, padding, loss);
// }
// }
// /* we compute dE/dy (y is the output) if no predefined activation function is used */
// else {
// XNoder::MakeGrad(root);
// lossGrad.Compute(gold, root, root->grad, NULL, loss);
// }
//}
/* back-propagation from output to input */
for(int i = nodes.count - 1; i >= 0; i--){
for (int i = nodes.count - 1; i >= 0; i--) {
XTensor * node = (XTensor*)nodes.Get(i);
if(node->mem != NULL){
if (node->mem != NULL) {
CheckNTErrors(node->mem->bufUsed < BUF_PITCH, "Illegal access of buffer!");
}
if(node->visitMark != NODE_FINISHED)
if (node->visitMark != NODE_FINISHED)
BackwardNode(node, isGradEfficient);
if(isGradEfficient){
if (isGradEfficient) {
XLink & outgo = node->outgo;
for(int i = 0; i < outgo.tailNum; i++){
for (int i = 0; i < outgo.tailNum; i++) {
XTensor * parent = outgo.tails[i];
ClearGrad(parent);
}
if(XNoder::IsLeaf(node))
if (XNoder::IsLeaf(node))
ClearGrad(node);
}
}
......@@ -253,27 +254,29 @@ backward computation for a given node
*/
void XNet::BackwardNode(XTensor * node, bool isEfficent)
{
if(node == NULL || node->visitMark == NODE_FINISHED)
if (node == NULL || node->visitMark == NODE_FINISHED)
return;
if(!XNoder::IsLeaf(node)){
if (!XNoder::IsLeaf(node)) {
/* post processing for parent nodes */
BackwardNodePost(node, isEfficent);
/* process the current node */
if(XMathGrad::IsMathOP(node))
if (XMathGrad::IsMathOP(node))
XMathGrad::MakeGrad(node, isEfficent);
else if(XFuncGrad::IsFunc(node))
else if (XFuncGrad::IsFunc(node))
XFuncGrad::MakeGrad(node, isEfficent);
else if (XDataGrad::IsDataOP(node))
XDataGrad::MakeGrad(node, isEfficent);
else if(XShapeGrad::IsShapeOP(node))
else if (XShapeGrad::IsShapeOP(node))
XShapeGrad::MakeGrad(node, isEfficent);
else{
else if (XLossGrad::IsLossOP(node))
XLossGrad::MakeGrad(node, isEfficent);
else {
ShowNTErrors("Wrong node type!");
}
}
else{
else {
node->visitMark = NODE_FINISHED;
}
}
......@@ -287,12 +290,12 @@ void XNet::BackwardNodePost(XTensor * node, bool isEfficent)
{
bool isSplitList = false;
XLink &outgo = node->outgo;
for(int i = 0; i < outgo.tailNum; i++){
if(outgo.tails[i]->income.typeID == SHAPE_SPLIT_LIST)
for (int i = 0; i < outgo.tailNum; i++) {
if (outgo.tails[i]->income.typeID == SHAPE_SPLIT_LIST)
isSplitList = true;
}
if(isSplitList)
if (isSplitList)
XShapeGrad::PostProcessing(node, SHAPE_SPLIT_LIST, isEfficent);
}
......@@ -322,13 +325,13 @@ void XNet::Traverse(XList &roots)
for (int i = 0; i < roots.count; i++)
TarjanVisit((XTensor*)roots.Get(i), nodes, id);
for(int i = 0; i < nodes.count; i++){
for (int i = 0; i < nodes.count; i++) {
XTensor * node = (XTensor*)nodes.Get(i);
if(XNoder::IsRoot(node))
if (XNoder::IsRoot(node))
outputs.Add(node);
if(XNoder::IsLeaf(node))
if (XNoder::IsLeaf(node))
inputs.Add(node);
if(XNoder::IsGrad(node))
if (XNoder::IsGrad(node))
gradNodes.Add(node);
}
}
......@@ -341,26 +344,26 @@ depth-first search given a node (Tarjan's algorithm for topological ordering)
*/
void XNet::TarjanVisit(XTensor * node, XList &orders, const unsigned int code)
{
if(node == NULL)
if (node == NULL)
return;
//fprintf(stderr, "%d\n", node->id);
if(node->visitMark == code + 1){
if (node->visitMark == code + 1) {
ShowNTErrors("There is a circle in the network\n");
}
else if(node->visitMark <= code){
else if (node->visitMark <= code) {
node->visitMark = code + 1;
XLink &income = node->income;
for(int i = 0; i < income.tailNum; i++){
for (int i = 0; i < income.tailNum; i++) {
XTensor * child = income.tails[i];
if(child == NULL)
if (child == NULL)
continue;
TarjanVisit(child, orders, code);
}
node->visitMark = code + 2;
orders.Add(node);
}
else if(node->visitMark == code + 2){
else if (node->visitMark == code + 2) {
}
}
......@@ -370,11 +373,11 @@ dump network information
*/
void XNet::Dump(FILE * file)
{
for(int i = 0; i < nodes.count; i++){
for (int i = 0; i < nodes.count; i++) {
XTensor * node = (XTensor*)nodes.Get(i);
fprintf(file, "node %d: %d\n", i, node->id);
node->Dump(file, "tensor: ");
if(node->grad != NULL)
if (node->grad != NULL)
node->grad->Dump(file, "grad: ");
else
fprintf(file, "no gradient!\n");
......@@ -395,12 +398,12 @@ void XNet::SetGradEfficientFlag(bool flag)
void XNet::MakeEfficientNet()
{
/* back-propagation from output to input */
for(int i = 0; i < nodes.count; i++){
for (int i = 0; i < nodes.count; i++) {
XTensor * node = (XTensor*)nodes.Get(i);
XLink &income = node->income;
for(int j = 0; j < income.tailNum; j++){
for (int j = 0; j < income.tailNum; j++) {
XTensor * child = income.tails[j];
if(child->isGrad || child->isVar){
if (child->isGrad || child->isVar) {
node->SetGradFlag(true);
break;
}
......@@ -415,25 +418,25 @@ clear the gradient information if the node is of no use
*/
void XNet::ClearGrad(XTensor * node)
{
if(node->isVar)
if (node->isVar)
return;
if(node->grad == NULL)
if (node->grad == NULL)
return;
if(node->visitMark != NODE_FINISHED)
if (node->visitMark != NODE_FINISHED)
return;
XLink & income = node->income;
bool finished = true;
for(int i = 0; i < income.tailNum; i++){
for (int i = 0; i < income.tailNum; i++) {
XTensor * child = income.tails[i];
if(child->visitMark != NODE_FINISHED){
if (child->visitMark != NODE_FINISHED) {
finished = false;
break;
}
}
if(finished){
if (finished) {
//fprintf(stderr, "del %d %ld\n", node->id, node->grad->unitNum);
delete node->grad;
node->grad = NULL;
......@@ -455,10 +458,21 @@ void XNet::ShowNetwork(FILE * file, XTensor * node)
XLink::ShowNode(file, node);
/* go over nodes in its topological order */
for(int i = nodes.count - 1; i >= 0; i--){
for (int i = nodes.count - 1; i >= 0; i--) {
XTensor * n = (XTensor*)nodes.Get(i);
XLink::ShowNode(file, n);
}
}
/*
search for a node in a top-down manner by its name
>> top - the top most node
<< return - the node we found
*/
//XTensor * XNet::SearchNode(XTensor * top, const char * name)
//{
//return XLink::SearchNode(top, name);
//}
}
\ No newline at end of file
......@@ -23,6 +23,7 @@
#include "../tensor/XTensor.h"
#include "../tensor/function/FHeader.h"
#include "../tensor/loss/LHeader.h"
#ifndef __XNET_H__
#define __XNET_H__
......@@ -111,6 +112,10 @@ struct XNet
/* show network topology */
void ShowNetwork(FILE * file, XTensor * node);
/* search a node in a top-down manner by its name */
//static
//XTensor * SearchNode(XTensor * top, const char * name);
};
/* we make a unique id for every tensor */
......
......@@ -839,38 +839,14 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
InitModelTensor2D(s, batchSize, model.vSize, model);
InitModelTensor2D(y, batchSize, model.vSize, model);
///* s = h_last * w */
//_MatrixMul(&h_last, X_NOTRANS, &w, X_NOTRANS, &s);
XTensor h_last1;
h_last1 = ScaleAndShift(h_last, 100, 0);
XTensor w1;
w1 = ScaleAndShift(w, 100, 0);
XTensor int8H_last;
XTensor int8W;
int8H_last = ConvertDataType(h_last1, X_INT8);
int8W = ConvertDataType(w1, X_INT8);
XTensor s1;
InitTensor2D(&s1, batchSize, model.vSize, X_INT, model.devID, model.mem);
_MatrixMul2D(&int8H_last, X_NOTRANS, &int8W, X_NOTRANS, &s1);
/* s = h_last * w */
_MatrixMul(&h_last, X_NOTRANS, &w, X_NOTRANS, &s);
XTensor b2D;
InitTensor2D(&b2D, batchSize, model.vSize, X_FLOAT, model.devID, model.mem);
InitTensor(&b2D, &s);
_Unsqueeze(&b, &b2D, 0, batchSize);
b2D = ScaleAndShift(b2D, 10000, 0);
XTensor b2D1;
b2D1 = ConvertDataType(b2D, X_INT);
_Sum(&s1, &b2D1, &s1);
s = ConvertDataType(s1, X_FLOAT);
s = ScaleAndShift(s, 0.0001, 0);
_Sum(&s, &b2D, &s);
/* y = softmax(s) */
_LogSoftmax(&s, &y, 1);
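/* In equation form, the output layer above computes (a summary of the code,
   not new behaviour):
       s = h_last * w + b    with b broadcast over the batch via _Unsqueeze,
       y = log-softmax(s)    taken over the vocabulary dimension (dim 1). */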
......@@ -1224,6 +1200,7 @@ void Test(const char * test, const char * result, FNNModel &model)
}
fclose(file);
fclose(ofile);
double elapsed = GetClockSec() - startT;
......
......@@ -53,43 +53,6 @@ initialize the model
>> myDevID - device id
>> myMem - the memory pool
*/
//void T2TAttention::InitModel(int argc, char ** argv,
// bool myIsMasked, int myIgnored,
// int myDevID, XMem * myMem)
//{
// devID = myDevID;
// mem = myMem;
// isMasked = myIsMasked;
// ignored = myIgnored;
//
// float minmax = 0;
//
// LoadParamInt(argc, argv, "nhead", &nhead, 8);
// LoadParamInt(argc, argv, "d", &dk, DEFAULT_EMBEDDING_SIZE);
// LoadParamInt(argc, argv, "d", &dv, DEFAULT_EMBEDDING_SIZE);
// LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
// LoadParamFloat(argc, argv, "attminmax", &minmax, 0.1F);
// LoadParamFloat(argc, argv, "dropoutatt", &dropoutP, 0);
//
// InitTensor2D(&wk, d, dk, X_FLOAT, devID, mem);
// InitTensor2D(&wq, d, dk, X_FLOAT, devID, mem);
// InitTensor2D(&wv, d, dv, X_FLOAT, devID, mem);
// InitTensor2D(&wa, d, d, X_FLOAT, devID, mem);
// InitTensor2D(&wbig, d, 3 * d, X_FLOAT, devID, mem);
//
// float scale = 1.0F;
// float finfoutk = (float)sqrt(6.0F * scale/(d + dk));
// float finfoutv = (float)sqrt(6.0F * scale/(d + dv));
// float finfouta = (float)sqrt(6.0F * scale / (d + d));
// float finfoutbig = (float)sqrt(6.0F * scale / (d + 3*d));
//
// wk.SetDataRand(-finfoutk, finfoutk);
// wq.SetDataRand(-finfoutk, finfoutk);
// wv.SetDataRand(-finfoutv, finfoutv);
// wa.SetDataRand(-finfouta, finfouta);
// wbig.SetDataRand(-finfoutbig, finfoutbig);
//}
void T2TAttention::InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID, XMem * myMem)
......@@ -108,17 +71,17 @@ void T2TAttention::InitModel(int argc, char ** argv,
LoadParamFloat(argc, argv, "attminmax", &minmax, 0.1F);
LoadParamFloat(argc, argv, "dropoutatt", &dropoutP, 0);
InitTensor2D(&wk, d, dk, X_FLOAT16, devID, mem);
InitTensor2D(&wq, d, dk, X_FLOAT16, devID, mem);
InitTensor2D(&wv, d, dv, X_FLOAT16, devID, mem);
InitTensor2D(&wa, d, d, X_FLOAT16, devID, mem);
InitTensor2D(&wbig, d, 3 * d, X_FLOAT16, devID, mem);
InitTensor2D(&wk, d, dk, X_FLOAT, devID, mem);
InitTensor2D(&wq, d, dk, X_FLOAT, devID, mem);
InitTensor2D(&wv, d, dv, X_FLOAT, devID, mem);
InitTensor2D(&wa, d, d, X_FLOAT, devID, mem);
InitTensor2D(&wbig, d, 3 * d, X_FLOAT, devID, mem);
float scale = 1.0F;
float finfoutk = (float)sqrt(6.0F * scale / (d + dk));
float finfoutv = (float)sqrt(6.0F * scale / (d + dv));
float finfoutk = (float)sqrt(6.0F * scale/(d + dk));
float finfoutv = (float)sqrt(6.0F * scale/(d + dv));
float finfouta = (float)sqrt(6.0F * scale / (d + d));
float finfoutbig = (float)sqrt(6.0F * scale / (d + 3 * d));
float finfoutbig = (float)sqrt(6.0F * scale / (d + 3*d));
wk.SetDataRand(-finfoutk, finfoutk);
wq.SetDataRand(-finfoutk, finfoutk);
......@@ -138,150 +101,95 @@ make the network
>> isTraining - indicates whether the model is used for training
<< return - multi-attention result
*/
//XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining, bool selfatt)
//{
// XTensor k2;
// XTensor q2;
// XTensor v2;
//
// if (selfatt){
//
// XTensor con;
// XList split;
//
// con = MMul(k, wbig);
//
// int d1 = con.GetDim(0);
// int d2 = con.GetDim(1);
// int d3 = con.GetDim(2) / 3;
//
// InitTensor3D(&k2, d1, d2, d3, X_FLOAT, devID, mem);
// InitTensor3D(&q2, d1, d2, d3, X_FLOAT, devID, mem);
// InitTensor3D(&v2, d1, d2, d3, X_FLOAT, devID, mem);
//
// split.Add(&q2);
// split.Add(&k2);
// split.Add(&v2);
//
// Split(con, split, 2, 3);
// }
//
// else{
// /* linear transformation before self-attention */
// k2 = MMul(k, wk);
// q2 = MMul(q, wq);
// v2 = MMul(v, wv);
// }
//
// XTensor kheads;
// XTensor qheads;
// XTensor vheads;
//
// /* multi head */
// kheads = Split(k2, k2.order - 1, nhead);
// qheads = Split(q2, q2.order - 1, nhead);
// vheads = Split(v2, v2.order - 1, nhead);
//
// XTensor att;
// XTensor dot;
// XTensor scalar;
//
// /* scalar = softmax(Q * K^T / sqrt(dk)) * V */
// dot = BMMul(qheads, X_NOTRANS, kheads, X_TRANS);
//
// if(isMasked)
// dot = dot + mask;
//
// dot = Linear(dot, 1.0F/(float)sqrt((float)dk/nhead));
//
// scalar = Softmax(dot, -1);
//
// if(isTraining && dropoutP > 0)
// scalar = Dropout(scalar, dropoutP);
//
// att = BMMul(scalar, vheads);
//
// /* concatenate the heads */
// return MMul(Merge(att, att.order - 1), wa);
//}
XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining, bool selfatt)
XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining)
{
XTensor halfK2;
XTensor halfQ2;
XTensor halfV2;
XTensor k2;
XTensor q2;
XTensor v2;
XTensor halfK;
halfK = ConvertDataType(k, X_FLOAT16);
/* linear transformation before self-attention */
k2 = MMul(k, wk);
q2 = MMul(q, wq);
v2 = MMul(v, wv);
if (selfatt) {
return MakeAttention(k2, q2, v2, mask, isTraining);
}
XTensor halfCon;
XList halfSplit;
halfCon = MMul(halfK, wbig);
/*
make the network given a big tensor that keeps keys, queries and values
>> kqv - the big tensor
>> mask - as it is
>> isTraining - indicates whether the model is used for training
*/
XTensor T2TAttention::MakeBig(XTensor &kqv, XTensor &mask, bool isTraining)
{
XTensor k2;
XTensor q2;
XTensor v2;
XTensor kqv2;
XList split;
int d1 = halfCon.GetDim(0);
int d2 = halfCon.GetDim(1);
int d3 = halfCon.GetDim(2) / 3;
kqv2 = MMul(kqv, wbig);
InitTensor3D(&halfK2, d1, d2, d3, X_FLOAT16, devID, mem);
InitTensor3D(&halfQ2, d1, d2, d3, X_FLOAT16, devID, mem);
InitTensor3D(&halfV2, d1, d2, d3, X_FLOAT16, devID, mem);
int d1 = kqv2.GetDim(0);
int d2 = kqv2.GetDim(1);
int d3 = kqv2.GetDim(2) / 3;
halfSplit.Add(&halfQ2);
halfSplit.Add(&halfK2);
halfSplit.Add(&halfV2);
InitTensor3D(&k2, d1, d2, d3, X_FLOAT, devID, mem);
InitTensor3D(&q2, d1, d2, d3, X_FLOAT, devID, mem);
InitTensor3D(&v2, d1, d2, d3, X_FLOAT, devID, mem);
Split(halfCon, halfSplit, 2, 3);
}
split.Add(&q2);
split.Add(&k2);
split.Add(&v2);
else {
XTensor halfQ;
XTensor halfV;
halfQ = ConvertDataType(q, X_FLOAT16);
halfV = ConvertDataType(v, X_FLOAT16);
Split(kqv2, split, 2, 3);
/* linear transformation before self-attention */
halfK2 = MMul(halfK, wk);
halfQ2 = MMul(halfQ, wq);
halfV2 = MMul(halfV, wv);
}
return MakeAttention(k2, q2, v2, mask, isTraining);
}
XTensor halfKheads;
XTensor halfQheads;
XTensor halfVheads;
/*
make the attention network given keys, queries and values (after linear transformation)
>> k - keys. It might be of size B * L * H
where B = batch size, L = sequence length,
and H = vector size of each position
>> q - queries
>> v - values
>> mask - as it is
>> isTraining - indicates whether the model is used for training
*/
XTensor T2TAttention::MakeAttention(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining)
{
XTensor kheads;
XTensor qheads;
XTensor vheads;
/* multi head */
halfKheads = Split(halfK2, halfK2.order - 1, nhead);
halfQheads = Split(halfQ2, halfQ2.order - 1, nhead);
halfVheads = Split(halfV2, halfV2.order - 1, nhead);
kheads = Split(k, k.order - 1, nhead);
qheads = Split(q, q.order - 1, nhead);
vheads = Split(v, v.order - 1, nhead);
XTensor halfAtt;
XTensor halfDot;
XTensor halfScalar;
XTensor att;
XTensor dot;
XTensor scalar;
/* scalar = softmax(Q * K^T / sqrt(dk)) * V */
halfDot = BMMul(halfQheads, X_NOTRANS, halfKheads, X_TRANS);
//XTensor halfMask(mask.order, mask.dimSize, X_FLOAT16, mask.denseRatio, mask.devID, mask.mem);
dot = BMMul(qheads, X_NOTRANS, kheads, X_TRANS);
if (isMasked) {
XTensor halfMask;
halfMask = ConvertDataType(mask, X_FLOAT16);
halfDot = Sum(halfDot, halfMask);
}
if(isMasked)
dot = dot + mask;
halfDot = Linear(halfDot, 1.0F / (float)sqrt((float)dk / nhead));
dot = Linear(dot, 1.0F/(float)sqrt((float)dk/nhead));
halfScalar = Softmax(halfDot, -1);
scalar = Softmax(dot, -1);
if (isTraining && dropoutP > 0)
halfScalar = Dropout(halfScalar, dropoutP);
if(isTraining && dropoutP > 0)
scalar = Dropout(scalar, dropoutP);
halfAtt = BMMul(halfScalar, halfVheads);
att = BMMul(scalar, vheads);
/* concatenate the heads */
return ConvertDataType(MMul(Merge(halfAtt, halfAtt.order - 1), wa), X_FLOAT);
return MMul(Merge(att, att.order - 1), wa);
}
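/* In equation form, MakeAttention above computes, per head (a summary of the
   code rather than a new definition):
       att = softmax( (Q * K^T + mask) / sqrt(dk / nhead) ) * V ,
   where Q, K and V come from splitting the last dimension of q, k and v into
   nhead parts; the mask term is added only when isMasked is set, dropout is
   applied to the attention weights during training, and the heads are merged
   back and projected with wa at the end. */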
}
......@@ -61,6 +61,7 @@ public:
XTensor wa;
XTensor wbig;
/* size of transformed Q and K */
int dk;
......@@ -96,7 +97,13 @@ public:
int myDevID = -1, XMem * myMem = NULL);
/* make the network */
XTensor Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining, bool selfatt);
XTensor Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining);
/* make the network given a big tensor that keeps keys, queries and values */
XTensor MakeBig(XTensor &kqv, XTensor &mask, bool isTraining);
/* make the attention network given keys, queries and values (after linear transformation) */
XTensor MakeAttention(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining);
};
}
......
......@@ -80,7 +80,6 @@ void AttDecoder::InitModel(int argc, char ** argv,
attentionsEnde = new T2TAttention[nlayer];
attEndeLayerNorms = new T2TLN[nlayer];
/* initialize the stacked layers */
for (int i = 0; i < nlayer; i++) {
attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
......@@ -89,9 +88,7 @@ void AttDecoder::InitModel(int argc, char ** argv,
fnnLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
attentionsEnde[i].InitModel(argc, argv, true, myIgnored, myDevID, myMem);
attEndeLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
}
}
/*
......@@ -122,7 +119,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
/******************/
/* self attention */
att = attentions[i].Make(x, x, x, mask, isTraining, true);
att = attentions[i].MakeBig(x, mask, isTraining);
/* dropout */
if(isTraining && dropoutP > 0)
......@@ -136,7 +133,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
/*****************************/
/* encoder-decoder attention */
ende = attentionsEnde[i].Make(outputEnc, x, outputEnc, maskEncDec, isTraining, false);
ende = attentionsEnde[i].Make(outputEnc, x, outputEnc, maskEncDec, isTraining);
/* dropout */
if(isTraining && dropoutP > 0)
......
......@@ -103,8 +103,6 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
x = embedder.Make(input);
//x.Dump(tmpFILE, "embedding: ");
/* dropout */
if(isTraining && dropoutP > 0)
x = Dropout(x, dropoutP);
......@@ -116,7 +114,7 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
XTensor res;
/* self attention */
att = attentions[i].Make(x, x, x, mask, isTraining, true);
att = attentions[i].MakeBig(x, mask, isTraining);
/* dropout */
if(isTraining && dropoutP > 0)
......@@ -160,4 +158,3 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool isTraining)
}
}
......@@ -89,13 +89,15 @@ XTensor T2TFNN::Make(XTensor &input, bool isTraining)
XTensor t1;
/* t1 = max(0, x * w1 + b1) */
t1 = Rectify(MMul(input, w1) + b1);
//t1 = Rectify(MMul(input, w1) + b1);
t1 = Rectify(MulAndShift(input, w1, b1));
if(isTraining && dropoutP > 0)
t1 = Dropout(t1, dropoutP);
/* result = t1 * w2 + b2 */
return MMul(t1, w2) + b2;
//return MMul(t1, w2) + b2;
return MulAndShift(t1, w2, b2);
}
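/* In equation form, the feed-forward block above is
       FFN(x) = max(0, x * w1 + b1) * w2 + b2 ,
   where MulAndShift fuses the matrix product and the bias addition into a
   single node whose backward pass is handled by the GradMulAndShift routine
   added elsewhere in this commit. */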
......
......@@ -204,30 +204,48 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
XTensor maskDec;
XTensor maskEncDec;
/* generate mask to see "previous" words on the decoder side */
//int len = inputDec.GetDim(inputDec.order - 2);
//int * dims = new int[inputDec.order + 1];
//for(int i = 0; i < inputDec.order; i++)
// dims[i + 1] = inputDec.GetDim(i);
//dims[0] = nhead;
//dims[inputDec.order] = len;
//InitTensor(&maskDec, inputDec.order + 1, dims, X_FLOAT, 1.0F, inputDec.devID, inputDec.mem);
/* encoder mask */
MakeMTMaskEnc(inputEnc, paddingEnc, maskEnc);
/* decoder mask */
MakeMTMaskDec(inputEnc, inputDec, paddingEnc, paddingDec, maskDec, maskEncDec);
encoding = MakeEncoder(inputEnc, maskEnc, isTraining);
decoding = MakeDecoder(inputDec, encoding, maskDec, maskEncDec, isTraining);
outputLayer->Make(decoding, output);
}
/*
make the mask for training MT models
>> inputEnc - input of the encoder
>> inputDec - input of the decoder
>> paddingEnc - padding of the encoder input
>> paddingDec - padding of the decoder input
>> maskEnc - mask of the encoder self-attention
>> maskDec - mask of the decoder self-attention
>> maskEncDec - mask of the decoder enc-dec attention
*/
void T2TModel::MakeMTMask(XTensor &inputEnc, XTensor &inputDec,
XTensor &paddingEnc, XTensor &paddingDec,
XTensor &maskEnc, XTensor &maskDec, XTensor &maskEncDec)
{
int len = inputDec.GetDim(inputDec.order - 1);
int * dims = new int[inputDec.order + 2];
for(int i = 0; i < inputDec.order; i++)
dims[i + 1] = inputDec.GetDim(i);
dims[0] = nhead;
dims[inputDec.order + 1] = len;
InitTensor(&maskDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingEnc.devID, paddingEnc.mem);
InitTensor(&maskDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingDec.devID, paddingDec.mem);
/* a upper triangular matrix where the cells of the upper triangular are set to -1e9.
/* an upper triangular matrix where the cells of the upper triangular are set to -1e9.
this matrix can be used to prevent the attention to current or following words in
a given sequence. */
_SetDataLowTri(&maskDec, 1e9F, 0);
_ScaleAndShiftMe(&maskDec, 1.0F, -1e9F);
/* encoder-decoder mask that prevent the attention to padding dummy words */
/* encoder-decoder mask that prevents the attention to padding dummy words */
dims[inputDec.order + 1] = inputEnc.GetDim(inputEnc.order - 1);
InitTensor(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingEnc.devID, paddingEnc.mem);
......@@ -236,8 +254,6 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
XTensor * maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID, paddingEnc.mem);
_Unsqueeze(&paddingEnc, maskEncDecTMPEnc, paddingEnc.order - 1, paddingDec.GetDim(-1));
//_Unsqueeze(&paddingDec, maskEncDecTMPDec, paddingEnc.order, paddingEnc.GetDim(-1));
//_Multiply(maskEncDecTMPDec, maskEncDecTMPEnc, maskEncDecTMPDec);
_ScaleAndShiftMe(maskEncDecTMPEnc, 1e9F, -1e9F);
_Unsqueeze(maskEncDecTMPEnc, &maskEncDec, 0, dims[0]);
......@@ -273,13 +289,6 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
/* generate the mask on the source language side (for padding) */
_Sum(&maskEnc, padding3, &maskEnc);
encoding = MakeEncoder(inputEnc, maskEnc, isTraining);
//encoding.Dump(stderr, "encoding",10);
decoding = MakeDecoder(inputDec, encoding, maskDec, maskEncDec, isTraining);
//decoding.Dump(stderr, "decoding", 10);
outputLayer->Make(decoding, output);
delete[] dims;
delete[] dimsPadding;
......@@ -288,6 +297,91 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
}
/*
make the mask of the encoder
>> inputEnc - input of the encoder
>> paddingEnc - padding of the encoder input
>> maskEnc - mask of the encoder self-attention
*/
void T2TModel::MakeMTMaskEnc(XTensor &inputEnc, XTensor &paddingEnc, XTensor &maskEnc)
{
/* padding on the source side */
int * dimsPadding = new int[paddingEnc.order + 2];
for (int i = 0; i < paddingEnc.order - 1; i++)
dimsPadding[i] = paddingEnc.GetDim(i);
dimsPadding[paddingEnc.order - 1] = paddingEnc.GetDim(-1);
dimsPadding[paddingEnc.order] = paddingEnc.GetDim(-1);
XTensor * padding2 = NewTensorBuf(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
paddingEnc.denseRatio, paddingEnc.devID, paddingEnc.mem);
for (int i = 0; i < padding2->order; i++)
dimsPadding[i + 1] = padding2->GetDim(i);
dimsPadding[0] = nhead;
XTensor * padding3 = NewTensorBuf(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType,
paddingEnc.denseRatio, paddingEnc.devID, paddingEnc.mem);
/* mask of the padding */
_Unsqueeze(&paddingEnc, padding2, paddingEnc.order - 1, paddingEnc.GetDim(-1));
_Unsqueeze(padding2, padding3, 0, nhead);
_ScaleAndShiftMe(padding3, 1e9F, -1e9F);
InitTensor(&maskEnc, padding3);
maskEnc.SetZeroAll();
/* generate the mask on the source language side (for padding) */
_Sum(&maskEnc, padding3, &maskEnc);
DelTensorBuf(padding3);
DelTensorBuf(padding2);
delete[] dimsPadding;
}
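/* A worked example of the padding mask built above (an illustrative row, not
   real data): for a source row of paddingEnc equal to
       [1, 1, 1, 0, 0]
   _ScaleAndShiftMe(padding3, 1e9F, -1e9F) turns it into
       [0, 0, 0, -1e9, -1e9] ,
   so once this is added to the attention logits before the softmax, the two
   padded positions receive (almost) zero attention weight. */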
/*
make the mask of the decoder
>> inputEnc - input of the encoder
>> inputDec - input of the decoder
>> paddingEnc - padding of the encoder input
>> paddingDec - padding of the decoder input
>> maskDec - mask of the decoder self-attention
>> maskEncDec - mask of the decoder enc-dec attention
*/
void T2TModel::MakeMTMaskDec(XTensor &inputEnc, XTensor &inputDec,
XTensor &paddingEnc, XTensor &paddingDec,
XTensor &maskDec, XTensor &maskEncDec)
{
int len = inputDec.GetDim(inputDec.order - 1);
int * dims = new int[inputDec.order + 2];
for(int i = 0; i < inputDec.order; i++)
dims[i + 1] = inputDec.GetDim(i);
dims[0] = nhead;
dims[inputDec.order + 1] = len;
InitTensor(&maskDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingDec.devID, paddingDec.mem);
/* an upper triangular matrix where the cells of the upper triangular are set to -1e9.
this matrix can be used to prevent the attention to current or following words in
a given sequence. */
_SetDataLowTri(&maskDec, 1e9F, 0);
_ScaleAndShiftMe(&maskDec, 1.0F, -1e9F);
/* encoder-decoder mask that prevents the attention to padding dummy words */
dims[inputDec.order + 1] = inputEnc.GetDim(inputEnc.order - 1);
InitTensor(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingEnc.devID, paddingEnc.mem);
XTensor * maskEncDecTMPEnc = NewTensorBuf(paddingEnc.order + 1, dims + 1, paddingEnc.dataType,
paddingEnc.denseRatio, paddingEnc.devID, paddingEnc.mem);
XTensor * maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID, paddingEnc.mem);
_Unsqueeze(&paddingEnc, maskEncDecTMPEnc, paddingEnc.order - 1, paddingDec.GetDim(-1));
_ScaleAndShiftMe(maskEncDecTMPEnc, 1e9F, -1e9F);
_Unsqueeze(maskEncDecTMPEnc, &maskEncDec, 0, dims[0]);
DelTensorBuf(maskEncDecTMPDec);
DelTensorBuf(maskEncDecTMPEnc);
delete[] dims;
}
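/* A worked example of the decoder self-attention mask built above, for a
   target length of 3 (illustrative, and assuming _SetDataLowTri with a zero
   shift includes the main diagonal in the lower triangle):
       after _SetDataLowTri(&maskDec, 1e9F, 0):      after the shift by -1e9F:
       [ 1e9    0    0 ]                             [   0  -1e9  -1e9 ]
       [ 1e9  1e9    0 ]                             [   0     0  -1e9 ]
       [ 1e9  1e9  1e9 ]                             [   0     0     0 ]
   so each position can attend only to itself and to earlier positions. */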
/*
get parameter matrices
>> list - the list that keeps the parameter matrices
*/
......
......@@ -31,6 +31,9 @@
namespace transformer
{
/* a transformer model that keeps parameters of the encoder,
the decoder and the output layer (softmax). Also, it creates
the network used in transformer. */
class T2TModel
{
public:
......@@ -78,7 +81,21 @@ public:
void MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool isTraining);
/* make the network for machine translation (with the output softmax layer) */
void MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTensor &paddingEnc, XTensor &paddingDec, bool isTraining);
void MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output,
XTensor &paddingEnc, XTensor &paddingDec, bool isTraining);
/* make the mask for training MT models */
void MakeMTMask(XTensor &inputEnc, XTensor &inputDec,
XTensor &paddingEnc, XTensor &paddingDec,
XTensor &maskEnc, XTensor &maskDec, XTensor &maskEncDec);
/* make the mask of the encoder */
void MakeMTMaskEnc(XTensor &inputEnc, XTensor &paddingEnc, XTensor &maskEnc);
/* make the mask of the decoder */
void MakeMTMaskDec(XTensor &inputEnc, XTensor &inputDec,
XTensor &paddingEnc, XTensor &paddingDec,
XTensor &maskDec, XTensor &maskEncDec);
/* get parameter matrices */
void GetParams(XList &list);
......
......@@ -93,8 +93,8 @@ void T2TOutput::Make(XTensor &input, XTensor &output)
{
XTensor &x = input;
output = LogSoftmax(MMul(x, w), -1);
//output = Softmax(MMul(x, w), -1);
//output = LogSoftmax(MMul(x, w), -1);
output = Softmax(MMul(x, w), -1);
}
}
......@@ -176,6 +176,9 @@ public:
/* indicates whether we intend to debug the net */
bool isDebugged;
/* bucket size */
int bucketSize;
public:
/* constructor */
T2TTrainer();
......@@ -205,10 +208,10 @@ public:
int LoadBatch(FILE * file, bool isLM,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold,
XTensor * gold, XTensor * label,
int * seqs,
int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &wCount,
bool isSorted, int &ws, int &wCount,
int devID, XMem * mem,
bool isTraining);
......@@ -216,7 +219,7 @@ public:
int LoadBatchLM(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold,
XTensor * gold, XTensor * label,
int * seqs, int vs, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem,
......@@ -226,9 +229,9 @@ public:
int LoadBatchMT(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold,
XTensor * gold, XTensor * label,
int * seqs, int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &wCount,
bool isSorted, int &ws, int &wCount,
int devID, XMem * mem,
bool isTraining);
......
......@@ -37,8 +37,6 @@ int TransformerMain(int argc, const char ** argv)
if(argc == 0)
return 1;
fprintf(stderr, "%e\n", log(1e-8F));
char ** args = new char*[argc];
for(int i = 0; i < argc; i++){
args[i] = new char[strlen(argv[i]) + 1];
......@@ -67,17 +65,20 @@ int TransformerMain(int argc, const char ** argv)
T2TModel model;
model.InitModel(argc, args);
//if(strcmp(modelFN, ""))
// model.Read(modelFN);
/* learn model parameters */
if(strcmp(trainFN, ""))
trainer.Train(trainFN, testFN, strcmp(modelFN, "") ? modelFN : "checkpoint.model", &model);
/* save the final model */
if(strcmp(modelFN, "") && strcmp(trainFN, ""))
model.Dump(modelFN);
//if(strcmp(modelFN, "") && strcmp(trainFN, ""))
//model.Dump(modelFN);
/* load the model if necessary */
if(strcmp(modelFN, ""))
model.Read(modelFN);
//if(strcmp(modelFN, ""))
//model.Read(modelFN);
T2TTrainer tester;
tester.Init(argc, args);
......
......@@ -30,6 +30,7 @@
#include "XDevice.h"
#include "./test/Test.h"
#include "./core/CHeader.h"
#include "./loss/CrossEntropy.h"
//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
......
......@@ -47,15 +47,8 @@ extern const char * GetDataTypeName(TENSOR_DATA_TYPE type);
extern TENSOR_DATA_TYPE GetDataType(const char * typeName);
/* data conversion (for lower precision computation) */
inline unsigned short cal_complement(unsigned short sig, unsigned short tal);
unsigned short Float16Add(unsigned short a, unsigned short b);
unsigned short Float16Sub(unsigned short a, unsigned short b);
unsigned short Float16Mul(unsigned short a, unsigned short b);
unsigned short Float16Div(unsigned short a, unsigned short b);
unsigned short FloatToFloat16(float f);
float Float16ToFloat(unsigned short h);
unsigned short FloatbitsToHalfbits(float ff);
float HalfbitsToFloatbits(unsigned short h);
void ConvertDataType(int devID,
void * s, TENSOR_DATA_TYPE typeS,
void * t, TENSOR_DATA_TYPE typeT, int size);
......
......@@ -266,6 +266,10 @@ XDevManager::XDevManager()
{
Clear();
Init();
#ifndef USE_CPP11
fprintf(stderr, "Warning!!! C++11 is RECOMMENDED for compilation.\n");
#endif
}
/* de-constructor */
......
......@@ -43,13 +43,17 @@
/* the nts (NiuTrans.Tensor) namespace */
namespace nts {
#if (__cplusplus >= 201103L || _MSC_VER >= 1700)
#define USE_CPP11
#endif
#define _XINLINE_
//#define DOUBELPRICSION
#ifdef DOUBELPRICSION
#define DTYPE double
#define DTYPE_MIN (DTYPE)1.79E+308
#define DTYPE_MIN (DTYPE)-1.79E+308
#else
#define DTYPE float
#define DTYPE_MIN (DTYPE)-3.40E+38
......
......@@ -308,6 +308,27 @@ void XLink::MakeLink(const XTensor * t1, const XTensor * t2, XTensor * h, int id
}
/*
create a hyperedge with three input tensors and an output tensor
>> t1 - the first tail tensor
>> t2 - the second tail tensor
>> t3 - the third tail tensor
>> h - head tensor
>> id - id of the edge type
*/
void XLink::MakeLink(const XTensor * t1, const XTensor * t2, const XTensor * t3, XTensor * h, int id)
{
if (h == NULL)
return;
XList list(3);
list.Add(t1);
list.Add(t2);
list.Add(t3);
MakeLink(&list, h, id);
}
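/* A hypothetical usage sketch (MATH_MULANDSHIFT is defined in this commit;
   x, w, b and c are made-up tensors and the call below is illustrative, not
   taken from the library): an operation that fuses a matrix product with a
   bias addition can record its three inputs in one hyperedge, e.g.
       XLink::MakeLink(&x, &w, &b, &c, MATH_MULANDSHIFT);
   so that the backward pass can later recover x, w and b from c->income. */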
/*
create a hyper edge with a list of tensors and a output tensor
>> list - a list of input tensors
>> h - head tensor
......@@ -509,6 +530,88 @@ void XLink::Replace(const XTensor * oldOne, XTensor * newOne)
}
}
/*
copy a node to another, i.e., we add the links to the new node
>> reference - the node to be copied
>> target - the new node
*/
void XLink::Copy(const XTensor * reference, XTensor * target)
{
if (reference == NULL || target == NULL)
return;
XLink &newIncome = target->income;
XLink &newOutgo = target->outgo;
XLink::ClearOutgoing(target);
XLink::ClearIncoming(target);
/* incoming nodes */
if (reference->income.typeID != 0) {
if (newIncome.tailNum < reference->income.tailNum) {
delete[] newIncome.tails;
newIncome.tails = new XTensor*[reference->income.tailNum];
}
newIncome.SetType(reference->income.typeID);
newIncome.head = target;
newIncome.tailNum = reference->income.tailNum;
memcpy(newIncome.tails, reference->income.tails, sizeof(XTensor*) * newIncome.tailNum);
int paraArraySize = reference->income.paramNum * reference->income.paramSize;
newIncome.params = new char[paraArraySize];
memcpy(newIncome.params, reference->income.params, paraArraySize);
newIncome.paramNum = reference->income.paramNum;
/* update the link to each child node */
for (int i = 0; i < newIncome.tailNum; i++) {
XTensor * child = newIncome.tails[i];
XLink &childOutgo = child->outgo;
bool hit = false;
for (int j = 0; j < childOutgo.tailNum; j++) {
if (childOutgo.tails[j] == reference) {
//childOutgo.tails[j] = target;
childOutgo.AddTail(target);
hit = true;
break;
}
}
if (childOutgo.tailNum > 0) {
CheckNTErrors(hit, "No proper node found in child.outgo edge!");
}
}
}
if (newOutgo.tailNum < reference->outgo.tailNum) {
delete[] newOutgo.tails;
newOutgo.tails = new XTensor*[reference->outgo.tailNum];
}
/* outgoing nodes */
newOutgo.head = target;
newOutgo.tailNum = reference->outgo.tailNum;
memcpy(newOutgo.tails, reference->outgo.tails, sizeof(XTensor*) * newOutgo.tailNum);
/* update the link to each parent node */
for (int i = 0; i < newOutgo.tailNum; i++) {
XTensor * parent = newOutgo.tails[i];
XLink &parentIncome = parent->income;
bool hit = false;
for (int j = 0; j < parentIncome.tailNum; j++) {
if (parentIncome.tails[j] == reference) {
//parentIncome.tails[j] = target;
parentIncome.AddTail(target);
hit = true;
}
}
if (parentIncome.tailNum > 0) {
CheckNTErrors(hit, "No proper node found in parent.income edge!");
}
}
}
/*
copy incoming edges of a given node
>> reference - the node we copy from
......@@ -635,5 +738,28 @@ void XLink::ShowNode(FILE * file, XTensor * node)
fprintf(stderr, "\n");
}
/*
search for a node in a top-down manner by its name
>> top - the top most node
<< return - the node we found
*/
/*XTensor * XLink::SearchNode(XTensor * top, const char * name)
{
if(!strcmp(top->name, name))
return top;
XLink &incoming = top->income;
for(int i = 0; i < incoming.tailNum; i++){
XTensor * child = incoming.tails[i];
XTensor * hit = SearchNode(child, name);
if(hit != NULL)
return hit;
}
return NULL;
}*/
} // namespace nts(NiuTrans.Tensor)
......@@ -33,7 +33,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/* cross reference */
struct XTensor;
#define MAX_OP_NAME_LENGTH 16
#define MAX_OP_NAME_LENGTH 64
#define PARAM_UNTI_SIZE 64
/*
......@@ -138,6 +138,10 @@ struct XLink
static
void MakeLink(const XTensor * t1, const XTensor * t2, XTensor * h, int id);
/* create a hyper edge with three input tensors and an output tensor */
static
void MakeLink(const XTensor * t1, const XTensor * t2, const XTensor * t3, XTensor * h, int id);
/* create a hyper edge with a list of input tensors and a output tensor */
static
void MakeLink(const XList * list, XTensor * h, int id);
......@@ -170,6 +174,10 @@ struct XLink
static
void Replace(const XTensor * oldOne, XTensor * newOne);
/* copy a node to another, i.e., we add the links to the new node */
static
void Copy(const XTensor * reference, XTensor * target);
/* copy links of a given node */
static
void CopyIncoming(const XTensor * reference, XTensor * target);
......@@ -181,6 +189,10 @@ struct XLink
/* show a node */
static
void ShowNode(FILE * file, XTensor * node);
/* search a node in a top-down manner by its name */
//static
//XTensor * SearchNode(XTensor * top, const char * name);
};
} // namespace nts(NiuTrans.Tensor)
......
......@@ -77,6 +77,14 @@ const char * GetOPName(int type)
return "M_POWER";
else if (type == MATH_SCALEANDSHIFT)
return "M_SCALEANDSHIFT";
else if (type == MATH_SCALE)
return "M_SCALE";
else if (type == MATH_DESCALE)
return "M_DESCALE";
else if (type == MATH_SHIFT)
return "M_SHIFT";
else if (type == MATH_MULANDSHIFT)
return "M_OPERATION";
else if (type == MATH_SIGN)
return "M_SIGN";
else if (type == MATH_SUB)
......@@ -107,16 +115,18 @@ const char * GetOPName(int type)
return "G_INDEXTOONEHOT";
else if (type == GETANDSET_ONEHOTTOINDEX)
return "G_ONEHOTTOINDEX";
else if (type == GETANDSET_SELECT)
return "G_SELECT";
}
else if ((type & SHAPE_BASE) != 0) {
if (type == GETANDSET_SELECT)
return "G_SELECT";
else if (type == MOVEMENT_COPYINDEXED)
if (type == MOVEMENT_COPYINDEXED)
return "M_COPYINDEXED";
else if (type == MOVEMENT_COPYVALUES)
return "M_COPYVALUES";
else if (type == MOVEMENT_GATHER)
return "M_GATHER";
else if (type == MOVEMENT_DROPOUTWITHINDEX)
return "M_DROPOUTWITHINDEX";
else if (type == SHAPE_CONCATENATE)
return "S_CONCATENATE";
else if (type == SHAPE_MERGE)
......@@ -158,6 +168,10 @@ const char * GetOPName(int type)
else if (type == FUNC_SOFTMAX)
return "F_SOFTMAX";
}
else if ((type & LOSS_BASE) != 0) {
if (type == LOSS_CROSSENTROPY)
return "L_CROSSENTROPY";
}
return "NULL";
}
......
......@@ -57,7 +57,12 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_NORMALIZE MATH_NEGATE + 1
#define MATH_POWER MATH_NORMALIZE + 1
#define MATH_SCALEANDSHIFT MATH_POWER + 1
#define MATH_SIGN MATH_SCALEANDSHIFT + 1
#define MATH_MULANDSHIFT MATH_SCALEANDSHIFT + 1
#define MATH_SCALE MATH_MULANDSHIFT + 1
#define MATH_DESCALE MATH_SCALE + 1
#define MATH_SHIFT MATH_DESCALE + 1
#define MATH_MOD MATH_SHIFT + 1
#define MATH_SIGN MATH_MOD + 1
#define MATH_SUB MATH_SIGN + 1
#define MATH_SUBDIM MATH_SUB + 1
#define MATH_SUM MATH_SUBDIM + 1
......@@ -84,8 +89,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MOVEMENT_COPYINDEXED MOVEMENT + 1
#define MOVEMENT_COPYVALUES MOVEMENT_COPYINDEXED + 1
#define MOVEMENT_GATHER MOVEMENT_COPYVALUES + 1
#define MOVEMENT_DROPOUTWITHINDEX MOVEMENT_GATHER + 1
#define SHAPE MOVEMENT_GATHER + 1
#define SHAPE MOVEMENT_DROPOUTWITHINDEX + 1
#define SHAPE_CONCATENATE SHAPE + 1
#define SHAPE_MERGE SHAPE_CONCATENATE + 1
#define SHAPE_MERGE_LIST SHAPE_MERGE + 1
......@@ -111,6 +117,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define FUNC_SIGMOID FUNC_RECTIFY + 1
#define FUNC_SOFTMAX FUNC_SIGMOID + 1
#define LOSS_BASE FUNCTION_BASE * 2
#define LOSS_CROSSENTROPY LOSS_BASE + 1
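/* A note on the layout assumed here (an inference from the doubling pattern of
   the *_BASE constants, not a guarantee): if FUNCTION_BASE is a power of two
   and all non-loss operator ids stay below LOSS_BASE = FUNCTION_BASE * 2, then
   the LOSS_BASE bit is set only by loss operators, so a single test such as
   (typeID & LOSS_BASE) != 0 -- the check used by XLossGrad::IsLossOP in this
   commit -- is enough to recognize a loss node. */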
/* get operator name */
const char * GetOPName(int type);
......
......@@ -48,7 +48,6 @@
#include "core/math/ScaleAndShift.h"
#include "core/getandset/SetData.h"
#include "function/Identity.h"
#include "core/getandset/ConvertDataType.h"
#ifdef USE_CUDA
......@@ -60,7 +59,6 @@
#include "core/utilities/FlushToMem.cuh"
#include "core/utilities/SetAscendingOrder.cuh"
#endif
/* the nts (NiuTrans.Tensor) namespace */
......@@ -70,8 +68,6 @@ int tensorIDGlobal = 0;
MUTEX_HANDLE tensorMutex;
XTensor NULLTensor;
#define RAND_MAX16 0xff
/* generate a tensor id */
int MakeTensorID()
{
......@@ -196,6 +192,36 @@ XTensor::XTensor(const XTensor &reference)
isTmp = reference.isTmp;
}
/* copy constructor (with right value reference) */
#ifdef USE_CPP11
XTensor::XTensor(const XTensor &&reference)
{
Init();
SetDataPointer();
id = MakeTensorID();
ShallowCopy(reference);
data = NULL;
dataHost = NULL;
devID = reference.devID;
mem = reference.mem;
data = reference.data;
signature = reference.signature;
/* what we really want to do is "reference.data = NULL;"
As "reference" is constant, we cannot reset reference.data
here. So we save the ADDRESS of reference.data in
reference.dataP, and do this work by updating "*reference.dataP".
This is VERY tricky and might not be the best solution :) */
*reference.dataP = NULL;
XLink::Replace(&reference, this);
isInit = true;
isTmp = reference.isTmp;
}
#endif
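The dataP indirection above is how the class imitates move semantics while the parameter is a const reference. A standalone C++11 sketch of the same idea (the Buf type and its members are hypothetical, not the library API):

#include <cstdio>

struct Buf {
    float * data;      /* the owned payload */
    float ** dataP;    /* the ADDRESS of the data pointer, saved at construction */

    Buf() : data(new float[4]), dataP(&data) {}

    /* "move" from a const rvalue reference: we cannot write src.data = NULL,
       but we can clear it through the saved address *src.dataP */
    Buf(const Buf && src) : data(src.data), dataP(&data) {
        *src.dataP = NULL;
    }

    ~Buf() { delete[] data; }  /* safe: the moved-from object now holds NULL */
};

int main()
{
    Buf a;
    Buf b(static_cast<Buf &&>(a));   /* a no longer owns the buffer, so no double free */
    printf("a.data = %p, b.data = %p\n", (void*)a.data, (void*)b.data);
    return 0;
}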
/* de-constructor */
XTensor::~XTensor()
{
......@@ -215,7 +241,6 @@ XTensor::~XTensor()
XLink::Replace(this, newTensor);
}
XLink::ClearOutgoing(this);
XLink::ClearIncoming(this);
......@@ -373,50 +398,97 @@ XTensor& XTensor::operator= (const XTensor& tensor)
return *this;
}
/* overloading of the equal-sign (with right value reference) */
XTensor& XTensor::operator= (const XTensor&& tensor)
{
/* we must make a hard copy of the tensor if it is the input
of another node. */
if(outgo.tailNum > 0){
int dims[MAX_TENSOR_DIM_NUM];
memcpy(dims, dimSize, order * sizeof(int));
dims[0] = -dims[0];
XTensor * newTensor = new XTensor(order, dims, dataType, denseRatio, devID, mem);
newTensor->SetTMPFlag();
newTensor->data = data;
newTensor->dataHost = dataHost;
newTensor->signature = tensor.signature;
XLink::Replace(this, newTensor);
XLink::ClearOutgoing(this);
XLink::ClearIncoming(this);
newTensor->ShallowCopy(this);
data = NULL;
dataHost = NULL;
}
DestroyData();
ShallowCopy(tensor);
isInit = true;
devID = tensor.devID;
mem = tensor.mem;
data = tensor.data;
signature = tensor.signature;
/* what we really want to do is "reference.data = NULL;"
As "reference" is constant, we cannot reset reference.data
here. So we save the ADDRESS of reference.data in
reference.dataP, and do this work by updating "*reference.dataP".
This is VERY tricky and might not be the best solution :) */
*tensor.dataP = NULL;
XLink::Replace(&tensor, this);
return *this;
}
/* overloading of the plus-sign */
XTensor XTensor::operator+ (const XTensor& tensor)
XTensor XTensor::operator+ (const XTensor& tensor) const
{
return Sum(*this, tensor);
}
/* overloading of the plus-sign */
XTensor XTensor::operator+ (const DTYPE shift)
XTensor XTensor::operator+ (const DTYPE shift) const
{
return ScaleAndShift(*this, 1, shift);
}
/* overloading of the multiply-sign */
XTensor XTensor::operator* (const XTensor& tensor)
XTensor XTensor::operator* (const XTensor& tensor) const
{
return Multiply(*this, tensor);
}
/* overloading of the multiply-sign */
XTensor XTensor::operator* (const DTYPE scale)
XTensor XTensor::operator* (const DTYPE scale) const
{
return ScaleAndShift(*this, scale, 0);
}
/* overloading of the minus-sign */
XTensor XTensor::operator- (const XTensor& tensor)
XTensor XTensor::operator- (const XTensor& tensor) const
{
return Sub(*this, tensor);
}
/* overloading of the minus-sign */
XTensor XTensor::operator- (const DTYPE shift)
XTensor XTensor::operator- (const DTYPE shift) const
{
return ScaleAndShift(*this, 1, -shift);
}
/* overloading of the division-sign */
XTensor XTensor::operator/ (const XTensor& tensor)
XTensor XTensor::operator/ (const XTensor& tensor) const
{
return Div(*this, tensor);
}
/* overloading of the division-sign */
XTensor XTensor::operator/ (const DTYPE scale)
XTensor XTensor::operator/ (const DTYPE scale) const
{
return ScaleAndShift(*this, (DTYPE)1/scale, 0);
}
......@@ -426,7 +498,7 @@ linear transformation b = a * \scale + \shift
>> scale - the slope
>> shift - the intercept
*/
XTensor XTensor::Lin(DTYPE scale, DTYPE shift)
XTensor XTensor::Lin(DTYPE scale, DTYPE shift) const
{
return Linear(*this, scale, shift);
}
......@@ -462,6 +534,37 @@ bool XTensor::IsSameShaped(const XTensor * a, const XTensor * b)
return true;
}
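/*
judge whether b has the shape of a reduced along dimension dim
>> a - the input tensor
>> b - the tensor to compare, which should look like a with dimension dim removed
>> dim - the dimension that is reduced
*/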
bool XTensor::IsReduceShaped(const XTensor * a, const XTensor * b, int dim)
{
if (a == NULL || b == NULL)
return false;
if ((a->order - 1) != b->order)
return false;
for (int i = 0; i < b->order; i++) {
if (i < dim) {
if (a->dimSize[i] != b->dimSize[i])
return false;
}
else if (i >= dim) {
if (a->dimSize[i+1] != b->dimSize[i])
return false;
}
}
if(a->dataType != b->dataType)
return false;
if(a->denseRatio != b->denseRatio)
return false;
if(a->isSparse != b->isSparse)
return false;
return true;
}
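For illustration, b must have exactly the shape of a with dimension dim removed. A minimal sketch (assuming NewTensor3D/NewTensor2D exist with the usual (sizes..., dataType) signatures):

XTensor * a = NewTensor3D(2, 3, 4, X_FLOAT);
XTensor * b = NewTensor2D(2, 4, X_FLOAT);
bool ok  = XTensor::IsReduceShaped(a, b, 1);   /* true: 2 x 3 x 4 reduced along dim 1 is 2 x 4 */
bool bad = XTensor::IsReduceShaped(a, b, 0);   /* false: reducing dim 0 would give 3 x 4 */
delete a;
delete b;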
/*
judge whether the three matrices are in the same type and size
>> a - input tensor
......@@ -714,15 +817,6 @@ void XTensor::SetDataRand(DTYPE lower, DTYPE upper)
*((double*)d + i) = lower + variance * rand() / RAND_MAX;
}
}
else if (dataType == X_FLOAT16) {
unsigned short random;
unsigned short ulower = FloatToFloat16(lower), uvariance = FloatToFloat16(variance);
d = new unsigned short[unitNum];
for (int i = 0; i < unitNum; i++) {
random = FloatToFloat16(rand() % RAND_MAX16 * 1.0 / RAND_MAX16);
*((unsigned short*)d + i) = Float16Add(ulower, Float16Mul(uvariance, random));
}
}
else {
ShowNTErrors("Data type must be X_FLOAT or X_Double!");
}
......@@ -1634,17 +1728,6 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg,
fprintf(file, " %d", f);
}
}
else if (dataType == X_FLOAT16) {
int end = MIN(n > 0 ? beg + n : beg + unitNum, unitNum);
for (int i = beg; i < end; i++) {
unsigned short f = ((unsigned short*)d)[i];
if (i == beg)
fprintf(file, "%u", f);
else
fprintf(file, " %u", f);
}
}
else
ShowNTErrors("TODO!");
}
......@@ -1681,22 +1764,9 @@ dump data to a file
*/
void XTensor::Dump(const XTensor * tensor, FILE * file, const char * label, const int n, const int beg, const int verbose)
{
if (tensor->dataType == X_FLOAT)
{
XTensor a(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, tensor->devID, tensor->mem);
_CopyValues(tensor, &a);
a.Dump(file, label, n, beg, verbose);
}
else if (tensor->dataType == X_FLOAT16)
{
XTensor a(tensor->order, tensor->dimSize, X_FLOAT, tensor->denseRatio, tensor->devID, tensor->mem);
_ConvertDataType(tensor, &a);
a.Dump(file, label, n, beg, verbose);
}
else
{
ShowNTErrors("TO DO!");
}
}
/*
......@@ -1774,14 +1844,6 @@ void XTensor::Read(FILE * file, const char * label)
}
}
}
else if (dataType == X_FLOAT16) {
for (int i = 0; i < unitNum; i++) {
unsigned short * f = ((unsigned short*)data) + i;
if (fscanf(file, "%u", f) < 1) {
ShowNTErrors("Incorrect tensor format!");
}
}
}
else {
ShowNTErrors("TODO!");
}
......
......@@ -189,6 +189,11 @@ public:
/* copy constructor */
XTensor(const XTensor &reference);
/* copy constructor (with right value reference) */
#ifdef USE_CPP11
XTensor(const XTensor &&reference);
#endif
/* de-constructor */
~XTensor();
......@@ -204,32 +209,37 @@ public:
/* overloading of the equal-sign */
XTensor& operator= (const XTensor &tensor);
/* overloading of the equal-sign (with right value reference) */
#ifdef USE_CPP11
XTensor& operator= (const XTensor &&tensor);
#endif
/* overloading of the plus-sign */
XTensor operator+ (const XTensor &tensor);
XTensor operator+ (const XTensor &tensor) const;
/* overloading of the plus-sign */
XTensor operator+ (const DTYPE shift);
XTensor operator+ (const DTYPE shift) const;
/* overloading of the multiply-sign */
XTensor operator* (const XTensor &tensor);
XTensor operator* (const XTensor &tensor) const;
/* overloading of the multiply-sign */
XTensor operator* (const DTYPE scale);
XTensor operator* (const DTYPE scale) const;
/* overloading of the minus-sign */
XTensor operator- (const XTensor &tensor);
XTensor operator- (const XTensor &tensor) const;
/* overloading of the minus-sign */
XTensor operator- (const DTYPE shift);
XTensor operator- (const DTYPE shift) const;
/* overloading of the division-sign */
XTensor operator/ (const XTensor &tensor);
XTensor operator/ (const XTensor &tensor) const;
/* overloading of the division-sign */
XTensor operator/ (const DTYPE scale);
XTensor operator/ (const DTYPE scale) const;
/* linear transformation */
XTensor Lin(DTYPE scale, DTYPE shift = 0);
XTensor Lin(DTYPE scale, DTYPE shift = 0) const;
/* judge whether the two matrices are in the same type and size */
static
......@@ -239,6 +249,10 @@ public:
static
bool IsSameShaped(const XTensor * a, const XTensor * b, const XTensor * c);
/* judge whether b has the shape of a reduced along dimension dim */
static
bool IsReduceShaped(const XTensor * a, const XTensor * b, int dim);
/* set the size of each dimension */
void SetDim(int * myDimSize);
......
......@@ -28,6 +28,7 @@
#include "arithmetic/Div.h"
#include "arithmetic/DivDim.h"
#include "arithmetic/Mask.h"
#include "arithmetic/MatrixMul.h"
#include "arithmetic/MatrixMul2D.h"
#include "arithmetic/MatrixMul2DMultiTheading.h"
......@@ -44,12 +45,14 @@
#include "arithmetic/SumByColumnVT.h"
#include "arithmetic/SumDim.h"
#include "arithmetic/XTensorBLAS.h"
#include "arithmetic/MulAndShift.h"
#include "getandset/ConvertDataType.h"
#include "getandset/OnehotAndIndex.h"
#include "getandset/Select.h"
#include "getandset/SetData.h"
#include "math/Binary.h"
#include "math/Clip.h"
#include "math/Compare.h"
#include "math/Normalize.h"
......
......@@ -214,4 +214,55 @@ XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim)
return c;
}
/*
element-wise division of two tensors
c(i) = a(i)/b(i) + \alpha * c(i)
where i is the index of the element
>> a - tensor a
>> b - tensor b
>> c - result tensor
>> alpha - the coefficient
>> leadingDim - the dimension along which we perform broadcasting
>> requireLink - whether to add the operation to the computation network
*/
void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadingDim, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
int n = GetDivDimIndex(a, b);
if (n == -1) {
CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
/* call _Div function */
_Div(&a, &b, &c, 0, leadingDim);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIV);
XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHeadInt(&c, leadingDim);
}
}
else if (n >= 0 && n < a.order) {
/* call _DivDim function */
_DivDim(&a, &b, &c, n, alpha);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
}
}
else {
ShowNTErrors("Something is wrong!");
}
}
} // namespace nts(NiuTrans.Tensor)
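A minimal usage sketch of the new in-place interface (assuming the usual InitTensor2D and SetDataRand helpers exist with these signatures):

XTensor a, b, c;
InitTensor2D(&a, 4, 8, X_FLOAT);
InitTensor2D(&b, 4, 8, X_FLOAT);
a.SetDataRand(1.0F, 2.0F);
b.SetDataRand(1.0F, 2.0F);

/* c is (re)initialized to the shape of a if needed, then c(i) = a(i)/b(i);
   pass requireLink = true to record the operation for backward */
Div(a, b, c);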
......@@ -43,15 +43,6 @@ void KernelDivElementWise(DTYPE * a, DTYPE * b, DTYPE * c, int size)
c[i] = a[i] / b[i];
}
__global__
void KernelDivElementWiseHalf(__half * a, __half * b, __half * c, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
c[i] = a[i] / b[i];
}
/*
division of data arrays in an element-wise manner c(i) = a(i)/b(i) + \alpha*c(i)
>> a - data array a
......@@ -69,18 +60,6 @@ void KernelDivElementWiseV2(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE alp
c[i] = a[i] / b[i] + alpha * c[i];
}
__global__
void KernelDivElementWiseV2Half(__half * a, __half * b, __half * c, int size, DTYPE alpha)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
__half alpha1 = __float2half(alpha);
if (i < size)
c[i] = a[i] / b[i] + alpha1 * c[i];
#endif
}
/*
division of two tensors in an element-wise manner c(i) = a(i)/b(i).
Note that a and b can be of different sizes here, i.e.,
......@@ -201,25 +180,6 @@ void _CudaDiv(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, in
}
}
}
else if (a->dataType == X_FLOAT16 && b->dataType == X_FLOAT16) {
int cudaGridSize[3];
int cudaBlockSize[3];
if (a->unitNum == c->unitNum && b->unitNum == c->unitNum) {
GDevs.GetCudaThread(a->devID, c->unitNum, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[0]), threads(cudaBlockSize[0]);
if (alpha == 0)
KernelDivElementWiseHalf << <blocks, threads >> >((__half*)a->data, (__half*)b->data, (__half*)c->data, c->unitNum);
else
KernelDivElementWiseV2Half << <blocks, threads >> >((__half*)a->data, (__half*)b->data, (__half*)c->data, c->unitNum, alpha);
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
else {
// TODO!!
ShowNTErrors("TODO!");
......
......@@ -49,6 +49,13 @@ where i is the index of the element
*/
XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha = 0.0, int leadingDim = 0);
/*
element-wise division of two tensors:
c(i) = a(i)/b(i) + \alpha * c(i)
where i is the index of the element
*/
void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha = 0.0, int leadingDim = 0, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor)
#endif // __DIV_H__
\ No newline at end of file
......@@ -163,4 +163,35 @@ XTensor DivDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha)
return c;
}
/*
tensor division
c = a / b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided by b via broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put the result
>> n - the dimension index
>> alpha - the scaling factor
>> requireLink - whether to add the operation to the computation network
*/
void DivDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE alpha, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
/* call _Div function */
_DivDim(&a, &b, &c, n, alpha);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
}
}
}
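A sketch of the broadcasting case handled above (shapes are illustrative; InitTensor2D/InitTensor1D are assumed):

XTensor a, row, c;
InitTensor2D(&a, 4, 8, X_FLOAT);   /* a is 4 x 8 */
InitTensor1D(&row, 8, X_FLOAT);    /* row matches dimension 1 of a */
a.SetDataRand(1.0F, 2.0F);
row.SetDataRand(1.0F, 2.0F);

DivDim(a, row, c, 1);              /* c(i, j) = a(i, j) / row(j) */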
......@@ -53,6 +53,14 @@ we make a new tensor c to keep the result and return it
*/
XTensor DivDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha = (DTYPE)0.0);
/*
tensor division of two tensors:
c = a / b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided by b via broadcasting
*/
void DivDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE alpha = (DTYPE)0.0, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor)
#endif // __DIVDIM_H__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2019-04-24
* I'll attend several conferences and workshops in the following weeks -
* busy days :(
*/
#include "../../XTensor.h"
#include "../../XName.h"
#include "../../XUtility.h"
#include "Mask.h"
#include "Mask.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
mask entries of a given tensor:
c(i) = a(i) if mask(i) is non-zero
c(i) = alpha if mask(i) = 0
where i is the index of the element
*/
void _Mask(const XTensor * a, const XTensor * mask, XTensor * c, DTYPE alpha)
{
CheckNTErrors(a && mask && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == mask->unitNum && a->unitNum == c->unitNum,
"Unmatched tensors in addition!");
CheckNTErrors(mask->dataType == X_INT, "The mask tensor must be in X_INT!");
//CheckNTErrors(a->dataType == mask->dataType && a->dataType == c->dataType,
// "Unmatched tensors in addition!");
if (a->devID >= 0 || mask->devID >= 0 || c->devID >= 0) {
#ifdef USE_CUDA
if (a == c) {
int P2PAccesible = 0;
#ifdef CUDA_UVA
cudaDeviceCanAccessPeer(&P2PAccesible, a->devID, mask->devID);
#endif
if ((a->devID < 0 && mask->devID >= 0) ||
(a->devID >= 0 && mask->devID < 0) ||
(a->devID >= 0 && mask->devID >= 0 && a->devID != mask->devID && !P2PAccesible))
{
ShowNTErrors("Cannot run this method on multiple devices simultaneously!");
}
else
_CudaMask(a, mask, c, alpha);
}
else
_CudaMask(a, mask, c, alpha);
#endif
}
else {
if (!a->isSparse && !mask->isSparse) {
CheckNTErrors(!c->isSparse, "Illegal use of sparse tensor in masking!");
if (a->dataType == DEFAULT_DTYPE &&
mask->dataType == X_INT &&
c->dataType == DEFAULT_DTYPE)
{
DTYPE * ap = (DTYPE*)a->data;
int * maskp = (int*)mask->data;
DTYPE * cp = (DTYPE*)c->data;
/* unrolling */
int num = a->unitNum;
if (num % 2 == 0) {
for (int i = 0; i < num; i += 2) {
if (maskp[i] == 0) {
cp[i] = alpha;
}
else {
cp[i] = ap[i];
}
if (maskp[i + 1] == 0) {
cp[i + 1] = alpha;
}
else {
cp[i + 1] = ap[i + 1];
}
}
}
else {
for (int i = 0; i < num; i++) {
if (maskp[i] == 0) {
cp[i] = alpha;
}
else {
cp[i] = ap[i];
}
}
}
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
}
/*
mask entries of a given tensor (on site):
a(i) = a(i) if mask(i) is non-zero
a(i) = alpha if mask(i) = 0
where i is the index of the element
*/
void _MaskMe(XTensor * a, const XTensor * mask, DTYPE alpha)
{
_Mask(a, mask, a, alpha);
}
/*
mask entries of a given tensor (return an XTensor structure):
a(i) = a(i) if mask(i) is non-zero
a(i) = alpha if mask(i) = 0
where i is the index of the element
*/
XTensor Mask(const XTensor &a, const XTensor &mask, DTYPE alpha)
{
XTensor c(&a);
c.SetTMPFlag();
/* call _Mask function */
_Mask(&a, &mask, &c, alpha);
/* tensor connections */
//XLink::MakeLink(&a, &mask, &c, MATH_SUM);
//XLink::AddParamToHead(&c, alpha);
// TODO!!
ShowNTErrors("TODO!");
return c;
}
}
\ No newline at end of file
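A small numeric sketch of the mask semantics (assuming InitTensor1D and XTensor::SetData with these signatures; the XTensor-returning Mask above still ends in a TODO in this commit, so the sketch calls _Mask directly):

XTensor a, mask, c;
InitTensor1D(&a, 4, X_FLOAT);
InitTensor1D(&mask, 4, X_INT);
InitTensor1D(&c, 4, X_FLOAT);

float av[4] = {1.0F, 2.0F, 3.0F, 4.0F};
int   mv[4] = {1, 0, 1, 0};
a.SetData(av, 4);
mask.SetData(mv, 4);

_Mask(&a, &mask, &c, 0.0F);   /* c = {1, 0, 3, 0}: entries with mask 0 become alpha */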
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2019-04-24
* I'll attend several conferences and workshops in the following weeks -
* busy days :(
*/
#include "../../XDevice.h"
#include "../../XUtility.h"
#include "Sub.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
mask entries of a given tensor (CUDA Kernel)
c(i) = a(i) if mask(i) is non-zero, alpha otherwise
>> a - the input data array
>> mask - the mask array
>> c - where we put the masked a
>> size - the size of a/mask/c
>> alpha - the value used where the mask is 0
*/
__global__
void KernelMASK(DTYPE * a, int * mask, DTYPE * c, int size, DTYPE alpha)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
if (mask[i] == 0) {
c[i] = alpha;
}
else {
c[i] = a[i];
}
}
}
/*
mask entries of a given tensor (cuda version)
>> a - a tensor
>> mask - mask tensor
>> c - where we put masked a
>> alpha - the value used where the mask is 0
*/
void _CudaMask(const XTensor * a, const XTensor * mask, XTensor * c, DTYPE alpha)
{
CheckNTErrors(a && mask && c, "Empty tensor input!");
CheckNTErrors((a->unitNum == mask->unitNum && a->unitNum == c->unitNum),
"Unmatched tensors in addition!");
CheckNTErrors(mask->dataType == X_INT, "The mask tensor must be in X_INT!");
//CheckNTErrors((a->dataType == mask->dataType && a->dataType == c->dataType),
// "Unmatched tensors in addition!");
CheckNTErrors((a->devID == mask->devID && a->devID == c->devID),
"The tensors must be on the same!");
int devIDBackup = XDevice::GetGPUDevice();
XDevice::SetGPUDevice(a->devID);
if (!a->isSparse && !mask->isSparse) {
CheckNTErrors(!c->isSparse, "Illegal use of sparse matrix in masking!");
if (a->dataType == DEFAULT_DTYPE &&
mask->dataType == X_INT &&
c->dataType == DEFAULT_DTYPE)
{
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
KernelMASK << <blocks, threads >> >((DTYPE*)a->data, (int *)mask->data, (DTYPE*)c->data, a->unitNum, alpha);
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
XDevice::SetGPUDevice(devIDBackup);
}
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2019-04-24
* I'll attend several conferences and workshops in the following weeks -
* busy days :(
*/
#ifndef __MASK_CUH__
#define __MASK_CUH__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* mask entries of a given tensor (cuda version) */
void _CudaMask(const XTensor * a, const XTensor * mask, XTensor * c = NULL, DTYPE alpha = (DTYPE)1.0);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif // __MASK_CUH__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2019-04-24
* I'll attend several conferences and workshops in the following weeks -
* busy days :(
*/
#ifndef __MASK_H__
#define __MASK_H__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
mask entries of a given tensor:
c(i) = a(i) if mask(i) is non-zero
c(i) = alpha if mask(i) = 0
where i is the index of the element
*/
void _Mask(const XTensor * a, const XTensor * mask, XTensor * c, DTYPE alpha);
/*
mask entries of a given tensor (on site):
a(i) = a(i) if mask(i) is non-zero
a(i) = alpha if mask(i) = 0
where i is the index of the element
*/
void _MaskMe(XTensor * a, const XTensor * mask, DTYPE alpha);
/*
mask entries of a given tensor (return an XTensor structure):
a(i) = a(i) if mask(i) is non-zero
a(i) = alpha if mask(i) = 0
where i is the index of the element
*/
XTensor Mask(const XTensor &a, const XTensor &mask, DTYPE alpha = 0.0);
} // namespace nts(NiuTrans.Tensor)
#endif // __MASK_H__
......@@ -62,11 +62,11 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
/* we transform a higher-order tensor into a matrix to reduce the number
of calls to matrix multiplication */
if (transposedA == X_NOTRANS && a->order > 2 && b->order == 2) {
if(transposedA == X_NOTRANS && a->order > 2 && b->order == 2){
int ncolA = a->dimSize[a->order - 1];
int ncolC = c->dimSize[c->order - 1];
XTensor * a2 = NewTensor2D(a->unitNum / ncolA, -ncolA, a->dataType, a->devID, a->mem);
XTensor * c2 = NewTensor2D(c->unitNum / ncolC, -ncolC, c->dataType, c->devID, c->mem);
XTensor * a2 = NewTensor2D(a->unitNum/ncolA, -ncolA, a->dataType, a->devID, a->mem);
XTensor * c2 = NewTensor2D(c->unitNum/ncolC, -ncolC, c->dataType, c->devID, c->mem);
a2->data = a->data;
c2->data = c->data;
_MatrixMul2D(a2, transposedA, b, transposedB, c2, alpha, beta, parallelRunner);
......@@ -173,7 +173,7 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
#endif
}
else {
//CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
_MatrixMulBatchedCPU(aList, transposedA,
bList, transposedB,
cList, alpha, beta);
......@@ -202,8 +202,44 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
delete cList;
}
bool CheckMMulShape(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c)
{
if (!(a && b && c))
return false;
if(!(a->dataType == b->dataType && a->dataType == c->dataType))
return false;
if (!(a->order >= 2 && b->order >= 2 && c->order >= 2))
return false;
int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
int order = a->order + b->order - 2;
int sub = 0;
int * dimSize = new int[order];
for (int i = 2; i < a->order; i++)
dimSize[sub++] = a->dimSizeRDI[a->order + 1 - i];
for (int i = 2; i < b->order; i++)
dimSize[sub++] = b->dimSizeRDI[b->order + 1 - i];
dimSize[sub++] = an;
dimSize[sub++] = bm;
/* compare with the shape of c and release the temporary dimension array */
bool match = true;
for (int i = 0; i < order; i++) {
if (dimSize[i] != c->dimSize[i]) {
match = false;
break;
}
}

delete[] dimSize;

return match;
}
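A shape sketch for the check above (NewTensor3D/NewTensor2D assumed): the leading dimensions of a and b are kept, and the last two dimensions come from the matrix product.

XTensor * a = NewTensor3D(2, 3, 4, X_FLOAT);   /* 2 slices of 3 x 4 */
XTensor * b = NewTensor2D(4, 5, X_FLOAT);
XTensor * c = NewTensor3D(2, 3, 5, X_FLOAT);   /* (3 x 4) * (4 x 5) = 3 x 5 per slice */
bool ok = CheckMMulShape(a, X_NOTRANS, b, X_NOTRANS, c);   /* true */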
/*
matrix multiplication (return a XTensor structure) c = trans(a) * trans(b) * alpha
matrix multiplication (return an XTensor structure) c = trans(a) * trans(b) * alpha
make a new tensor to keep the result and return it
For the input tensors a and b, we perform matrix multiplication on the first two dimensions.
......@@ -266,6 +302,53 @@ XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
return c;
}
void MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
const XTensor &b, MATRIX_TRANS_TYPE transposedB, XTensor &c,
DTYPE alpha, XPRunner * parallelRunner, bool requireLink)
{
CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have an order >= 2!");
if (!c.isInit || !CheckMMulShape(&a, transposedA, &b, transposedB, &c)) {
int an = transposedA == X_TRANS ? a.dimSizeRDI[0] : a.dimSizeRDI[1];
int am = transposedA == X_TRANS ? a.dimSizeRDI[1] : a.dimSizeRDI[0];
int bn = transposedB == X_TRANS ? b.dimSizeRDI[0] : b.dimSizeRDI[1];
int bm = transposedB == X_TRANS ? b.dimSizeRDI[1] : b.dimSizeRDI[0];
CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
int order = a.order + b.order - 2;
int sub = 0;
int * dimSize = new int[order];
for (int i = 2; i < a.order; i++)
dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
for (int i = 2; i < b.order; i++)
dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
dimSize[sub++] = an;
dimSize[sub++] = bm;
float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
InitTensor(&c, order, dimSize, a.dataType, dr, a.devID, a.mem);
/* destroy variables */
delete[] dimSize;
}
/* call _MatrixMul function */
_MatrixMul(&a, transposedA, &b, transposedB, &c, alpha, 0, parallelRunner);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL);
XLink::AddParamToHeadTrans(&c, transposedA);
XLink::AddParamToHeadTrans(&c, transposedB);
XLink::AddParamToHead(&c, alpha);
}
}
/*
matrix multiplication with no transposition c = a * b * alpha
>> a - tensor a
......@@ -316,4 +399,53 @@ XTensor MatrixMul(const XTensor &a, const XTensor &b,
return c;
}
}// namespace nts(NiuTrans.Tensor)
\ No newline at end of file
void MatrixMul(const XTensor &a, const XTensor &b, XTensor &c,
DTYPE alpha, XPRunner * parallelRunner, bool requireLink)
{
CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have an order >= 2!");
if (!c.isInit || !CheckMMulShape(&a, X_NOTRANS, &b, X_NOTRANS, &c)) {
int an = a.dimSizeRDI[1];
int am = a.dimSizeRDI[0];
int bn = b.dimSizeRDI[1];
int bm = b.dimSizeRDI[0];
CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
int order = a.order + b.order - 2;
int sub = 0;
int * dimSize = new int[order];
for (int i = 2; i < a.order; i++)
dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
for (int i = 2; i < b.order; i++)
dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
dimSize[sub++] = an;
dimSize[sub++] = bm;
float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
InitTensor(&c, order, dimSize, a.dataType, dr, a.devID, a.mem);
/* destroy variables */
delete[] dimSize;
}
/* call _MatrixMul function */
_MatrixMul(&a, X_NOTRANS, &b, X_NOTRANS, &c, alpha, 0, parallelRunner);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL);
XLink::AddParamToHeadTrans(&c, X_NOTRANS);
XLink::AddParamToHeadTrans(&c, X_NOTRANS);
XLink::AddParamToHead(&c, alpha);
}
}
} // namespace nts(NiuTrans.Tensor)
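A minimal usage sketch of the in-place overloads (InitTensor2D/SetDataRand assumed); c is sized on first use and the graph link is made only when requireLink is true:

XTensor x, w, y;
InitTensor2D(&x, 16, 32, X_FLOAT);
InitTensor2D(&w, 32, 64, X_FLOAT);
x.SetDataRand(-1.0F, 1.0F);
w.SetDataRand(-1.0F, 1.0F);

MatrixMul(x, w, y);                  /* y becomes 16 x 64 */
MatrixMul(x, X_NOTRANS, w, X_NOTRANS, y,
          1.0F, NULL, true);         /* same product, recorded for backward */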
......@@ -59,10 +59,17 @@ Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x
XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
void MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
XTensor &c, DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL, bool requireLink = false);
/* matrix multiplication with no transposition c = a * b * alpha*/
XTensor MatrixMul(const XTensor &a, const XTensor &b,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
void MatrixMul(const XTensor &a, const XTensor &b, XTensor &c,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor)
#endif // __MATRIXMUL_H__
\ No newline at end of file
......@@ -78,11 +78,19 @@ void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
if (!a->isSparse && !b->isSparse) {
CheckNTErrors(!c->isSparse, "Illegal use of sparse matrix in multiplication!");
if (a->dataType == DEFAULT_DTYPE &&
b->dataType == DEFAULT_DTYPE &&
c->dataType == DEFAULT_DTYPE)
{
if (useBLAS)
_MatrixMULCPU(a, transposedA, b, transposedB, c, alpha, beta);
else
_MatrixMul2DParallel(a, transposedA, b, transposedB, c, alpha, beta, parallelRunner);
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
/* a dense matrix multiply a sparse matrix */
else if (!a->isSparse && b->isSparse) {
......
......@@ -156,6 +156,7 @@ void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
if (stream != NULL)
cublasSetStream(*handle, stream->stream);
if (a->dataType == X_FLOAT && b->dataType == X_FLOAT && c->dataType == X_FLOAT) {
_CudaBLASMatrixMUL(handle, a->data, transposedA, a->dataType,
b->data, transposedB, a->dataType, c->data, c->dataType,
a->dimSize[0], a->dimSize[1],
......@@ -163,6 +164,11 @@ void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
c->dimSize[0], c->dimSize[1],
alpha, beta);
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
/* a dense matrix multiply a sparse matrix */
else if (!a->isSparse && b->isSparse) {
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: JIANG Yufan (email: jiangyufan2018@outlook.com) 2019-02-27
*/
#include "../../XTensor.h"
#include "../../XDevice.h"
#include "../../XName.h"
#include "MulAndShift.h"
#include "MatrixMul.h"
#include "Sum.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
return a dimension if the sum is performed as SumDim (in more details in SumDim.h)
>> a - a tensor
>> b - another tensor for sum
*/
int GetSumIndex(const XTensor &a, const XTensor &b)
{
if (a.order < b.order)
return -1;
if (XTensor::IsSameShaped(&a, &b))
return -1;
int hitCount = 0;
int hitDim = -1;
for (int i = 0; i < b.order; i++) {
if (b.dimSize[b.order - 1 - i] == 1)
continue;
else if (b.dimSize[b.order - 1 - i] == a.dimSize[a.order - 1 - i]) {
hitCount++;
hitDim = a.order - b.order + i;
}
}
if (hitCount == 1)
return hitDim;
else
return -1;
}
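For example (illustrative shapes only):

/* a: 64 x 1024, b: 1024       -> GetSumIndex returns 1, so _SumDim adds b to every row
   a: 64 x 1024, b: 64 x 1024  -> returns -1, so a plain _Sum is used instead */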
/*
MulAndShift: the operation c = x * w + b
>> x - tensor x
>> w - tensor w
>> b - tensor b
>> parallelRunner - parallel processing module
<< return - the result of x * w + b
*/
XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
DTYPE alpha, XPRunner * parallelRunner)
{
CheckNTErrors(x.dataType == w.dataType, "Input tensors should have the same data type!");
CheckNTErrors(x.order >= 2 && w.order >= 2, "Input tensors must have an order >= 2!");
int xn = x.dimSizeRDI[1];
int xm = x.dimSizeRDI[0];
int wn = w.dimSizeRDI[1];
int wm = w.dimSizeRDI[0];
CheckNTErrors(xm == wn, "Unmatched tensors in multiplication!");
int order = x.order + w.order - 2;
int sub = 0;
int * dimSize = new int[order];
for (int i = 2; i < x.order; i++)
dimSize[sub++] = x.dimSizeRDI[x.order + 1 - i];
for (int i = 2; i < w.order; i++)
dimSize[sub++] = w.dimSizeRDI[w.order + 1 - i];
dimSize[sub++] = xn;
dimSize[sub++] = wm;
float dr = (!x.isSparse || !w.isSparse) ? 1.0F : MAX(x.denseRatio, w.denseRatio);
XTensor * tmp = NewTensorBuf(order, dimSize, x.dataType, dr, x.devID, x.mem);
/* call _MatrixMul function */
_MatrixMul(&x, X_NOTRANS, &w, X_NOTRANS, tmp, alpha, 0, parallelRunner);
XTensor c(tmp);
c.SetTMPFlag();
int n = GetSumIndex(*tmp, b);
if (n == -1) {
/* call _Sum function */
_Sum(tmp, &b, &c);
// TODO!!
ShowNTErrors("TODO!");
}
else if (n >= 0 && n < tmp->order) {
/* call _SumDim function */
_SumDim(tmp, &b, &c, n);
}
else {
ShowNTErrors("Something is wrong!");
}
/* tensor connections */
XLink::MakeLink(&x, &w, &b, &c, MATH_MULANDSHIFT);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHeadTrans(&c, X_NOTRANS);
XLink::AddParamToHeadTrans(&c, X_NOTRANS);
//XLink::AddParamToHead(&c, beta);
/* destroy variables */
delete[] dimSize;
DelTensorBuf(tmp);
return c;
}
}
\ No newline at end of file
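A usage sketch of MulAndShift as a linear layer y = x * W + b (InitTensor2D/InitTensor1D/SetDataRand assumed):

XTensor x, W, b;
InitTensor2D(&x, 64, 512, X_FLOAT);
InitTensor2D(&W, 512, 1024, X_FLOAT);
InitTensor1D(&b, 1024, X_FLOAT);
x.SetDataRand(-1.0F, 1.0F);
W.SetDataRand(-0.1F, 0.1F);
b.SetDataRand(-0.1F, 0.1F);

XTensor y = MulAndShift(x, W, b);   /* y is 64 x 1024; b is added to every row by broadcasting */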
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: JIANG Yufan (email: jiangyufan2018@outlook.com) 2019-02-27
*/
#ifndef __MULANDSHIFT_H__
#define __MULANDSHIFT_H__
#include "../../XTensor.h"
#include "../CHeader.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
} // namespace nts(NiuTrans.Tensor)
#endif // __OPERATION_H__
......@@ -215,4 +215,55 @@ XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim
return c;
}
/*
element-wise product of two tensors
c(i) = a(i)*b(i) + \alpha * c(i)
where i is the index of the element
>> a - tensor a
>> b - tensor b
>> c - result tensor
>> alpha - the coefficient
>> leadingDim - the dimension along which we perform broadcasting
>> requireLink - whether to add the operation to the computation network
*/
void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadingDim, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
int n = GetMultiplyDimIndex(a, b);
if (n == -1) {
CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
/* call _Multiply function */
_Multiply(&a, &b, &c, 0, leadingDim);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLY);
XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHeadInt(&c, leadingDim);
}
}
else if (n >= 0 && n < a.order) {
/* call _MultiplyDim function */
_MultiplyDim(&a, &b, &c, n, alpha);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
}
}
else {
ShowNTErrors("Something is wrong!");
}
}
} // namespace nts(NiuTrans.Tensor)
......@@ -43,15 +43,6 @@ void KernelMulElementWise(DTYPE * a, DTYPE * b, DTYPE * c, int size)
c[i] = a[i] * b[i];
}
__global__
void KernelMulElementWiseHalf(__half * a, __half * b, __half * c, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
c[i] = a[i] * b[i];
}
/*
multiplication of data arrays in an element-wise manner c(i) = a(i)*b(i) + \alpha*c(i)
>> a - data array a
......@@ -69,18 +60,6 @@ void KernelMulElementWiseV2(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE alp
c[i] = a[i] * b[i] + alpha * c[i];
}
__global__
void KernelMulElementWiseV2Half(__half * a, __half * b, __half * c, int size, DTYPE alpha)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
__half alpha1 = __float2half(alpha);
if (i < size)
c[i] = a[i] * b[i] + alpha1 * c[i];
#endif
}
/*
multiplication of two tensors in an element-wise manner c(i) = a(i)*b(i).
Note that a and b can be of different sizes here, i.e.,
......@@ -201,25 +180,6 @@ void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alph
}
}
}
else if (a->dataType == X_FLOAT16 && b->dataType == X_FLOAT16) {
int cudaGridSize[3];
int cudaBlockSize[3];
if (a->unitNum == c->unitNum && b->unitNum == c->unitNum) {
GDevs.GetCudaThread(a->devID, c->unitNum, cudaGridSize, cudaBlockSize);
dim3 blocks(cudaGridSize[0]), threads(cudaBlockSize[0]);
if (alpha == 0)
KernelMulElementWiseHalf << <blocks, threads >> >((__half *)a->data, (half *)b->data, (half *)c->data, c->unitNum);
else
KernelMulElementWiseV2Half << <blocks, threads >> >((__half*)a->data, (__half*)b->data, (__half*)c->data, c->unitNum, alpha);
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
else {
// TODO!!
ShowNTErrors("TODO!");
......
......@@ -49,6 +49,13 @@ where i is the index of the element
*/
XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha = 0.0, int leadingDim = 0);
/*
element-wise product of two tensors:
c(i) = a(i)*b(i) + \alpha * c(i)
where i is the index of the element
*/
void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha = 0.0, int leadingDim = 0, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor)
#endif // __MULTIPLY_H__
\ No newline at end of file
......@@ -163,6 +163,36 @@ XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n)
}
/*
tensor multiplication
c = a * b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a * b + \alpha * c
>> n - the dimension index
>> requireLink - whether to add the operation to the computation network
*/
void MultiplyDim(const XTensor &a, const XTensor &b, XTensor &c, int n, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
/* call _Multiply function */
_MultiplyDim(&a, &b, &c, n, 0);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, 0);
}
}
/*
tensor broadcast multiplication
c = a * b + c * \beta
where some of dimensions of b can be of size 1
......@@ -302,4 +332,30 @@ XTensor MultiplyBroadcast(const XTensor &a, const XTensor &b)
return c;
}
/*
tensor broadcast multiplication
c = a * b + c * \beta
where some of dimensions of b can be of size 1
>> a - a tensor
>> b - another tensor that would be broadcasted
>> c - the resulting tensor
>> requireLink - whether to add the operation to the computation network
*/
void MultiplyBroadcast(const XTensor &a, const XTensor &b, XTensor &c, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
/* call _SumBroadcast function */
_MultiplyBroadcast(&a, &b, &c, 0);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYBROADCAST);
XLink::AddParamToHead(&c, 0);
}
}
}
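Illustrative shapes for the two broadcast variants above (InitTensor3D/InitTensor1D assumed):

XTensor a, v, m, c1, c2;
InitTensor3D(&a, 2, 4, 8, X_FLOAT);
InitTensor1D(&v, 8, X_FLOAT);          /* matches dimension 2 of a */
InitTensor3D(&m, 2, 1, 8, X_FLOAT);    /* size-1 dimensions are broadcast */
a.SetDataRand(0.0F, 1.0F);
v.SetDataRand(0.0F, 1.0F);
m.SetDataRand(0.0F, 1.0F);

MultiplyDim(a, v, c1, 2);              /* c1(i,j,k) = a(i,j,k) * v(k) */
MultiplyBroadcast(a, m, c2);           /* c2(i,j,k) = a(i,j,k) * m(i,0,k) */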
......@@ -22,10 +22,6 @@
#include "../../XDevice.h"
#include "../../XUtility.h"
#include "MultiplyDim.cuh"
#include "../getandset/ConvertDataType.h"
#include "../arithmetic/XTensorBLAS.h"
#include "../math/ScaleAndShift.h"
#include "cuda_fp16.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -65,25 +61,6 @@ void KernelMultiplyWithRow(T * a, T * b, T * c, int rowNum, int colNum, T alpha)
c[offset] = a[offset] * bv[threadIdx.x];
}
__global__
void KernelMultiplyWithRowHalf(__half * a, __half * b, __half * c, int rowNum, int colNum)
{
__shared__ __half bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int col = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
if (col >= colNum || row >= rowNum)
return;
if (threadIdx.y == 0)
bv[threadIdx.x] = b[col];
__syncthreads();
int offset = colNum * row + col;
c[offset] = a[offset] * bv[threadIdx.x];
}
/*
tensor multiplication of a tensor and a column vector
c = a * b + \alpha * c
......@@ -125,30 +102,6 @@ void KernelMultiplyWithCol(T * a, T * b, T * c, int rowNum, int colNum, int bloc
c[offset] = a[offset] * bv[threadIdx.y];
}
__global__
void KernelMultiplyWithColHalf(__half * a, __half * b, __half * c, int rowNum, int colNum, int blockSize, int blockNum)
{
__shared__ __half bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int col = colIndex % colNum;
int block = colIndex / colNum;
if (row >= rowNum || block >= blockNum)
return;
if (threadIdx.x == 0)
bv[threadIdx.y] = b[row];
__syncthreads();
int offset = block * blockSize + row * colNum + col;
c[offset] = a[offset] * bv[threadIdx.y];
}
/*
tensor multiplication
......@@ -182,13 +135,14 @@ void _CudaMultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n,
else if (i < n)
blockNum *= a->dimSize[i];
}
int cudaGrids[3];
int cudaBlocks[3];
int devIDBackup = 0;
ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) {
if (stride > 1) {
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
if(alpha == (DTYPE)0.0F)
......@@ -202,8 +156,8 @@ void _CudaMultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n,
}
else if (stride == 1) {
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
if (alpha == (DTYPE)0.0F)
KernelMultiplyWithRow<DTYPE, false> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
if(alpha == (DTYPE)0.0F)
KernelMultiplyWithRow<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, alpha);
else
......@@ -211,39 +165,14 @@ void _CudaMultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n,
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, alpha);
}
else {
ShowNTErrors("Something is wrong!");
}
}
else if (a->dataType == X_FLOAT16) {
if (stride > 1) {
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
KernelMultiplyWithColHalf<< <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((__half *)a->data, (__half *)b->data, (__half *)c->data,
blockSize, stride, blockSize * stride, blockNum);
}
else if (stride == 1) {
/*__half alpha1 = float2half_rn(alpha);*/
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
KernelMultiplyWithRowHalf<< <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((__half *)a->data, (__half *)b->data, (__half *)c->data,
blockNum, blockSize);
}
else {
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
}
BacktoCudaDev(a->devID, devIDBackup);
}
......
......@@ -38,6 +38,10 @@ void _MultiplyDimMe(XTensor * a, const XTensor * b, int n, DTYPE alpha = 0.0);
i.e., a is multiplied with b by broadcasting. We make a new tensor c to keep the result and return it */
XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n);
/* tensor multiplication c = a * b + \alpha * c where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting */
void MultiplyDim(const XTensor &a, const XTensor &b, XTensor &c, int n, bool requireLink = false);
/* tensor multiplication summation c = a * b + c * \beta where some of dimensions of b can be of size 1 */
void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
......@@ -45,6 +49,9 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
we return the resulting tensor here */
XTensor MultiplyBroadcast(const XTensor &a, const XTensor &b);
/* tensor multiplication summation c = a * b + c * \beta where some of dimensions of b can be of size 1 */
void MultiplyBroadcast(const XTensor &a, const XTensor &b, XTensor &c, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor)
#endif // __MULTIPLYDIM_H__
......@@ -79,4 +79,25 @@ XTensor Negate(const XTensor & a)
return b;
}
/*
set every entry to its minus value
>> a - input tensor we are processing
>> b - output tensor we are processing
>> requireLink - whether to add the operation to the computation network
*/
void Negate(const XTensor & a, XTensor & b, bool requireLink)
{
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {
InitTensor(&b, &a);
}
/* call _Negate function */
_Negate(&a, &b);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_NEGATE);
}
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -41,6 +41,9 @@ make a new tensor to keep the result and return it
*/
XTensor Negate(const XTensor & a);
/* set every entry to its minus value */
void Negate(const XTensor & a, XTensor & b, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor)
#endif // __NEGATE_H__
......@@ -84,4 +84,25 @@ XTensor Sign(const XTensor & a)
return b;
}
/*
set every entry to its sign value
>> a - input tensor we are processing
>> b - output tensor we are processing
>> requireLink - whether to add the operation to the computation network
*/
void Sign(const XTensor & a, XTensor & b, bool requireLink)
{
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {
InitTensor(&b, &a);
}
/* call _Sign function */
_Sign(&a, &b);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_SIGN);
}
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -23,7 +23,6 @@
#include "../../XTensor.h"
#include "Sign.h"
#include "Sign.cuh"
#include "cuda_fp16.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -57,25 +56,9 @@ This is for float16 computation
>> size - size of the data array
*/
__global__
void KernelSignHalf(__half * a, __half * b, int size)
void KernelSign(__half * a, __half * b, int size)
{
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
__half zero = __float2half(0.0F);
__half one = __float2half(1.0F);
__half one_1 = __float2half(-1.0F);
int i = blockDim.x * blockIdx.x + threadIdx.x;
DTYPE flag = __half2float(a[i]);
if (i < size) {
if (flag > 0)
b[i] = one;
else if (flag < 0)
b[i] = one_1;
else
b[i] = zero;
}
#endif
return;
}
/*
......@@ -103,7 +86,7 @@ void _CudaSign(const XTensor * a, XTensor * b)
KernelSign << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum);
}
else if (a->dataType == X_FLOAT16) {
KernelSignHalf << <blocks, threads >> >((__half*)a->data, (__half*)b->data, a->unitNum);
KernelSign << <blocks, threads >> >((__half*)a->data, (__half*)b->data, a->unitNum);
}
else {
ShowNTErrors("TODO!");
......
......@@ -41,6 +41,9 @@ make a new tensor to keep the result and return it
*/
XTensor Sign(const XTensor & a);
/* set every entry to its sign value */
void Sign(const XTensor & a, XTensor & b, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor)
#endif // __SIGN_H__
......@@ -194,4 +194,47 @@ XTensor Sub(const XTensor &a, const XTensor &b, DTYPE beta)
return c;
}
/*
tensor subtraction c = a - b * \beta
>> a - a tensor
>> b - another tensor
>> c - where we put a - b * \beta
>> beta - the scaling factor
>> requireLink - whether to add the operation to the computation network
*/
void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
int n = GetSubDimIndex(a, b);
if (n == -1) {
/* call _Sub function */
_Sub(&a, &b, &c, beta);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUB);
XLink::AddParamToHead(&c, beta);
}
}
else if (n >= 0 && n < a.order) {
/* call _SubDim function */
_SubDim(&a, &b, &c, n, beta);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
}
}
else {
ShowNTErrors("Something is wrong!");
}
}
} // namespace nts(NiuTrans.Tensor)
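A minimal sketch of the in-place subtraction (InitTensor2D/SetDataRand assumed):

XTensor a, b, c;
InitTensor2D(&a, 4, 8, X_FLOAT);
InitTensor2D(&b, 4, 8, X_FLOAT);
a.SetDataRand(0.0F, 1.0F);
b.SetDataRand(0.0F, 1.0F);

Sub(a, b, c, 0.5F);   /* c = a - 0.5 * b; with an 8-element b, GetSubDimIndex would
                         pick n = 1 and _SubDim would broadcast b over the rows */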
......@@ -22,8 +22,6 @@
#include "../../XDevice.h"
#include "../../XUtility.h"
#include "Sub.cuh"
#include "cuda_fp16.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -48,30 +46,6 @@ void KernelSUB(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta)
}
/*
subtraction of data arrays (CUDA Kernel) Half Precision
c = a - b * \beta
>> a - A matrix
>> b - another matrix
>> c - where we put a-b
>> size - the size of a/b/c
>> beta - the coefficient
*/
__global__
void KernelSUBHalf(half * a, half * b, half * c, int size, DTYPE beta)
{
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
__half beta1 = __float2half(beta);
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
c[i] = a[i] - b[i] * beta1;
#endif
}
/*
tensor subtraction c = a - b * \beta (cuda version)
>> a - a tensor
>> b - another tensor
......@@ -105,22 +79,10 @@ void _CudaSub(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
dim3 threads(blockSize[0]);
KernelSUB << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, a->unitNum, beta);
}
else if(a->dataType == X_FLOAT16 &&
b->dataType == X_FLOAT16 &&
c->dataType == X_FLOAT16){
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
KernelSUBHalf << <blocks, threads >> >((__half*)a->data, (__half*)b->data, (__half*)c->data, a->unitNum, beta);
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
else {
// TODO!!
......
......@@ -42,6 +42,9 @@ make a new tensor c to keep the result and return it
*/
XTensor Sub(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0);
/* tensor subtraction c = a - b * \beta */
void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta = (DTYPE)1.0, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor)
#endif // __SUB_H__
......@@ -163,4 +163,35 @@ XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
return c;
}
/*
tensor subtraction
c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., b is subtracted from a by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a - b * \beta
>> n - the dimension index
>> beta - the scaling factor
>> requireLink - whether to add the operation to the computation network
*/
void SubDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
/* call _Sub function */
_SubDim(&a, &b, &c, n, beta);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
}
}
}
......@@ -21,9 +21,6 @@
#include "SubDim.cuh"
#include "../../XDevice.h"
#include "cuda_fp16.h"
#include "device_launch_parameters.h"
#include "../../XDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -40,10 +37,11 @@ where a is a tensor and b is a row vector
>> colNum - number of columns of a and c (i.e., the size of b)
>> beta - the scaling factor
*/
template <class T, bool betaFired>
__global__
void KernelSubWithRow(DTYPE * a, DTYPE * b, DTYPE * c, int rowNum, int colNum, DTYPE beta,bool betaFired)
void KernelSubWithRow(T * a, T * b, T * c, int rowNum, int colNum, T beta)
{
__shared__ DTYPE bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int col = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
......@@ -62,59 +60,6 @@ void KernelSubWithRow(DTYPE * a, DTYPE * b, DTYPE * c, int rowNum, int colNum, D
c[offset] = a[offset] - bv[threadIdx.x];
}
__global__
void KernelSubWithRowHalf(half * a, half * b, half * c, int rowNum, int colNum, half beta, bool betaFired)
{
__shared__ half bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int col = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
if (col >= colNum || row >= rowNum)
return;
if (threadIdx.y == 0)
bv[threadIdx.x] = b[col];
__syncthreads();
int offset = colNum * row + col;
if (betaFired)
c[offset] = a[offset] - bv[threadIdx.x] * beta;
else
c[offset] = a[offset] - bv[threadIdx.x];
}
//template <class T, bool betaFired>
//__global__
//void KernelSubWithRow(T * a, T * b, T * c, int rowNum, int colNum, DTYPE beta)
//{
// __shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
// int col = blockDim.x * blockIdx.x + threadIdx.x;
// int row = blockDim.y * blockIdx.y + threadIdx.y;
//
// if (col >= colNum || row >= rowNum)
// return;
//
// if (threadIdx.y == 0)
// bv[threadIdx.x] = b[col];
//
// __syncthreads();
//
// T beta1;
// if (sizeof(T) - sizeof(half) == 0) {
// beta1 =__float2half(beta);
// }
// else {
// beta1 = beta;
// }
//
// int offset = colNum * row + col;
// if (betaFired)
// c[offset] = a[offset] - bv[threadIdx.x] * beta1;
// else
// c[offset] = a[offset] - bv[threadIdx.x];
//}
/*
tensor subtraction of a tensor and a column vector
c = a - b * \beta
......@@ -128,38 +73,11 @@ where a is a tensor and b is a colum vector
>> blockNum - number of matrics
>> beta - the scaling factor
*/
__global__
void KernelSubWithCol(DTYPE * a, DTYPE * b, DTYPE * c, int rowNum, int colNum, int blockSize, int blockNum, DTYPE beta,bool betaFired)
{
__shared__ DTYPE bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
int col = colIndex % colNum;
int block = colIndex / colNum;
if (row >= rowNum || block >= blockNum)
return;
if (threadIdx.x == 0)
bv[threadIdx.y] = b[row];
__syncthreads();
int offset = block * blockSize + row * colNum + col;
if (betaFired)
c[offset] = a[offset] - bv[threadIdx.y] * beta;
else
c[offset] = a[offset] - bv[threadIdx.y];
}
template <class T, bool betaFired>
__global__
void KernelSubWithColHalf(half * a, half * b, half * c, int rowNum, int colNum, int blockSize, int blockNum, half beta, bool betaFired)
void KernelSubWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, T beta)
{
__shared__ half bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
int row = blockDim.y * blockIdx.y + threadIdx.y;
......@@ -183,44 +101,6 @@ void KernelSubWithColHalf(half * a, half * b, half * c, int rowNum, int colNum,
c[offset] = a[offset] - bv[threadIdx.y];
}
//
//template <class T, bool betaFired>
//__global__
// void KernelSubWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, DTYPE beta)
//{
// __shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];
//
// int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
// int row = blockDim.y * blockIdx.y + threadIdx.y;
//
// int col = colIndex % colNum;
// int block = colIndex / colNum;
//
// if (row >= rowNum || block >= blockNum)
// return;
//
// if (threadIdx.x == 0)
// bv[threadIdx.y] = b[row];
//
// __syncthreads();
//
// T beta1;
//
// if (sizeof(T) - sizeof(half) == 0) {
// beta1 = __float2half(beta);
// }
// else {
// beta1 = beta;
// }
//
// int offset = block * blockSize + row * colNum + col;
//
// if (betaFired)
// c[offset] = a[offset] - bv[threadIdx.y] * beta1;
// else
// c[offset] = a[offset] - bv[threadIdx.y];
//}
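The body of the adopted templated KernelSubWithCol is collapsed in this view. Below is a minimal sketch of what it likely looks like, reconstructed from the float-only version and the visible signature (beta is passed already converted to T); it is not the verbatim repository code. MAX_CUDA_THREAD_NUM_PER_BLOCK is assumed from the project headers, and the half instantiation needs compute capability 5.3+ for half arithmetic.
template <class T, bool betaFired>
__global__
void KernelSubWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, T beta)
{
    __shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];

    int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
    int row = blockDim.y * blockIdx.y + threadIdx.y;

    int col = colIndex % colNum;
    int block = colIndex / colNum;

    if (row >= rowNum || block >= blockNum)
        return;

    /* one thread per row caches the column vector in shared memory */
    if (threadIdx.x == 0)
        bv[threadIdx.y] = b[row];

    __syncthreads();

    int offset = block * blockSize + row * colNum + col;

    /* betaFired is a template parameter, so the unused branch is removed at compile time */
    if (betaFired)
        c[offset] = a[offset] - bv[threadIdx.y] * beta;
    else
        c[offset] = a[offset] - bv[threadIdx.y];
}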
/*
tensor subtraction (cuda version)
......@@ -265,72 +145,28 @@ void _CudaSubDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE
if (stride > 1) {
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
if (beta == (DTYPE)1.0F)
KernelSubWithCol <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
KernelSubWithCol<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta,false);
blockSize, stride, blockSize * stride, blockNum, beta);
else
KernelSubWithCol <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
KernelSubWithCol<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta,true);
blockSize, stride, blockSize * stride, blockNum, beta);
}
else if (stride == 1) {
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
if (beta == (DTYPE)1.0F)
KernelSubWithRow <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
KernelSubWithRow<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, beta,false);
blockNum, blockSize, beta);
else
KernelSubWithRow<<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
KernelSubWithRow<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
blockNum, blockSize, beta,true);
}
else {
ShowNTErrors("Something is wrong!");
}
}
else if (a->dataType == X_FLOAT16) {
if (stride > 1) {
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
if (beta == (DTYPE)1.0F){
unsigned short temp = FloatToFloat16(beta);
half beta1 = *((half *)&temp);
KernelSubWithColHalf << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((half*)a->data, (half*)b->data, (half*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta1, false);
}
else {
unsigned short temp = FloatToFloat16(beta);
half beta1 = *((half *)&temp);
KernelSubWithColHalf << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((half*)a->data, (half*)b->data, (half*)c->data,
blockSize, stride, blockSize * stride, blockNum, beta1, true);
}
}
else if (stride == 1) {
GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
if (beta == (DTYPE)1.0F) {
unsigned short temp = FloatToFloat16(beta);
half beta1 = *((half *)&temp);
KernelSubWithRowHalf << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((half*)a->data, (half*)b->data, (half*)c->data,
blockNum, blockSize, beta1, false);
}
else{
unsigned short temp = FloatToFloat16(beta);
half beta1 = *((half *)&temp);
KernelSubWithRowHalf << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
((half*)a->data, (half*)b->data, (half*)c->data,
blockNum, blockSize, beta1, true);
}
blockNum, blockSize, beta);
}
else {
ShowNTErrors("Something is wrong!");
}
}
else {
ShowNTErrors("TODO!");
......
......@@ -38,6 +38,10 @@ void _SubDim(XTensor * a, const XTensor * b, int n, DTYPE beta = (DTYPE)1.0);
i.e., b is subtracted from a by broadcasting. We make a new tensor c to keep the result and return it */
XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta = (DTYPE)1.0);
/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
i.e., b is subtracted from a by broadcasting */
void SubDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta = (DTYPE)1.0, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor)
#endif // __SUBDIM_H__
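A hypothetical usage sketch of the new in-place SubDim overload; the shapes, the shortened include path, and the InitTensor2D / InitTensor1D / SetDataRand helpers are assumptions drawn from the library's usual API, not from this commit.
#include "SubDim.h"
using namespace nts;

void SubDimExample()
{
    XTensor a, b, c;
    InitTensor2D(&a, 4, 8, X_FLOAT);   /* a 4 x 8 matrix (assumed shape) */
    InitTensor1D(&b, 8, X_FLOAT);      /* a vector matching dimension 1 of a */
    a.SetDataRand(-1.0F, 1.0F);
    b.SetDataRand(-1.0F, 1.0F);

    /* c = a - b with b broadcast along dimension 1;
       requireLink = false keeps the operation out of the network */
    SubDim(a, b, c, 1, 1.0F, false);
}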
......@@ -21,7 +21,6 @@
#include "../../XTensor.h"
#include "../../XName.h"
#include "../getandset/ConvertDataType.h"
#include "../../XUtility.h"
#include "../movement/CopyValues.h"
#include "Sum.h"
......@@ -38,58 +37,15 @@ tensor summation c = a + b * \beta
>> c - where we put a+b*\beta. we save it in a if c is NULL
>> beta - the scaling factor
*/
void _MySum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == b->unitNum && a->unitNum == c->unitNum,
"Unmatched tensors in addition!");
if (beta == 0) {
_CopyValues(a, c);
return;
}
XTensor b1(b->order, b->dimSize, a->dataType, b->denseRatio, b->devID, b->mem);
b1.SetTMPFlag();
_ConvertDataType(b, &b1);
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
#ifdef USE_CUDA
if (a == c) {
int P2PAccesible = 0;
#ifdef CUDA_UVA
cudaDeviceCanAccessPeer(&P2PAccesible, a->devID, b->devID);
#endif
if ((a->devID < 0 && b->devID >= 0) ||
(a->devID >= 0 && b->devID < 0) ||
(a->devID >= 0 && b->devID >= 0 && a->devID != b->devID && !P2PAccesible))
{
ShowNTErrors("Cannot run this method on multiple devices simultaneously!");
}
else
_CudaSum(a, &b1, c, beta);
}
else
_CudaSum(a, &b1, c, beta);
#endif
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == b->unitNum && a->unitNum == c->unitNum,
"Unmatched tensors in addition!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Unmatched tensors in addition!");
if (beta == 0) {
if(beta == 0){
_CopyValues(a, c);
return;
}
......@@ -243,4 +199,46 @@ XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta)
return c;
}
/*
tensor summation c = a + b * \beta
>> a - a tensor
>> b - another tensor
>> c - where we put a+b*\beta
>> beta - the scaling factor
>> requireLink - whether to add the operation into the network for gradient computation
*/
void Sum(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
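/* GetSumDimIndex looks for a dimension of a that b can be broadcast along; -1 means none was found and the element-wise sum below is used */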
int n = GetSumDimIndex(a, b);
if (n == -1) {
/* call _Sum function */
_Sum(&a, &b, &c, beta);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUM);
XLink::AddParamToHead(&c, beta);
}
}
else if (n >= 0 && n < a.order) {
/* call _SumDim function */
_SumDim(&a, &b, &c, n, beta);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUMDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
}
}
else {
ShowNTErrors("Something is wrong!");
}
}
} // namespace nts(NiuTrans.Tensor)
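A hypothetical usage sketch of the in-place Sum; the shapes and helper calls are assumed as above, not part of this commit.
#include "Sum.h"
using namespace nts;

void SumExample()
{
    XTensor a, b, c;
    InitTensor2D(&a, 4, 8, X_FLOAT);
    InitTensor2D(&b, 4, 8, X_FLOAT);
    a.SetDataRand(-1.0F, 1.0F);
    b.SetDataRand(-1.0F, 1.0F);

    /* c = a + 0.5 * b; requireLink = true also records the operation
       in the network so that gradients can be back-propagated later */
    Sum(a, b, c, 0.5F, true);
}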
......@@ -23,7 +23,6 @@
#include "../../XUtility.h"
#include "Sum.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
......@@ -46,31 +45,6 @@ void KernelADD(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta)
c[i] = a[i] + b[i] * beta;
}
__global__
void KernelADDHalf(__half * a, __half * b, __half * c, int size, DTYPE beta)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
__half beta1 = __float2half(beta);
if (i < size)
c[i] = a[i] + b[i] * beta1;
#endif
}
__global__
void KernelADDInt(int * a, int * b, int * c, int size, DTYPE beta)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
c[i] = a[i] + b[i] * (int)beta;
}
/*
tensor summation c = a + b * \beta (cuda version)
>> a - a tensor
......@@ -126,36 +100,6 @@ void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
KernelADD << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, a->unitNum, beta);
}
}
else if (a->dataType == X_FLOAT16 &&
b->dataType == X_FLOAT16 &&
c->dataType == X_FLOAT16)
{
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
//KernelADD << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, a->unitNum, beta);
KernelADDHalf << <blocks, threads >> >((__half *)a->data, (__half *)b->data, (__half *)c->data, a->unitNum, beta);
}
else if (a->dataType == X_INT &&
b->dataType == X_INT &&
c->dataType == X_INT)
{
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
//KernelADD << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, a->unitNum, beta);
KernelADDInt << <blocks, threads >> >((int *)a->data, (int *)b->data, (int *)c->data, a->unitNum, beta);
}
else {
// TODO!!
ShowNTErrors("TODO!");
......
......@@ -27,8 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* tensor summation c = a + b * \beta */
void _MySum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
/*
......@@ -43,6 +41,9 @@ make a new tensor c to keep the result and return it
*/
XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0);
/* tensor summation c = a + b * \beta */
void Sum(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta = (DTYPE)1.0, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor)
#endif // __SUM_H__
......@@ -64,6 +64,20 @@ void _SumDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE bet
return;
}
/*int dims[MAX_TENSOR_DIM_NUM];
for(int i = 0; i < a->order; i++)
dims[i] = 1;
dims[n] = a->GetDim(n);
XTensor * b2 = NewTensor(a->order, dims, b->dataType, b->denseRatio, b->devID, b->mem);
_CopyValues(b, b2);
_SumBroadcast(a, b2, c, beta);
DelTensor(b2);
return;*/
if(a->devID >= 0 || b->devID >= 0 || c->devID >= 0){
#ifdef USE_CUDA
_CudaSumDim(a, b, c, n, beta);
......@@ -168,6 +182,37 @@ XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
}
/*
tensor summation
c = a + b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is summed with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a+b*\beta
>> n - the dimension index
>> beta - the scaling factor
>> requireLink - whether to add the operation into the network for gradient computation
*/
void SumDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
/* call _SumDim function */
_SumDim(&a, &b, &c, n, beta);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUMDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
}
}
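A hypothetical call of the in-place SumDim under the same assumed helpers: add a length-8 bias vector to every row of a 4 x 8 tensor.
XTensor a, b, c;
InitTensor2D(&a, 4, 8, X_FLOAT);
InitTensor1D(&b, 8, X_FLOAT);
a.SetDataRand(-1.0F, 1.0F);
b.SetDataRand(-1.0F, 1.0F);

SumDim(a, b, c, 1, 1.0F, false);   /* c = a + b with b broadcast along dimension 1 */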
/*
tensor broadcast summation c = a + b * \beta where some of dimensions of b can be of size 1
c = a + b * \beta
......@@ -308,4 +353,30 @@ XTensor SumBroadcast(const XTensor &a, const XTensor &b, DTYPE beta)
return c;
}
/*
tensor broadcast summation c = a + b * \beta where some of dimensions of b can be of size 1
c = a + b * \beta
>> a - a tensor
>> b - another tensor that would be broadcasted
>> c - the resulting tensor
>> beta - the scaling factor
>> requireLink - whether to add the operation into the network for gradient computation
*/
void SumBroadcast(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
/* call _SumBroadcast function */
_SumBroadcast(&a, &b, &c, beta);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUMBROADCAST);
XLink::AddParamToHead(&c, beta);
}
}
} // namespace nts(NiuTrans.Tensor)
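A hypothetical call of the in-place SumBroadcast under the same assumed helpers: b has size 1 on dimension 0 and is expanded across it.
XTensor a, b, c;
InitTensor2D(&a, 4, 8, X_FLOAT);
InitTensor2D(&b, 1, 8, X_FLOAT);
a.SetDataRand(-1.0F, 1.0F);
b.SetDataRand(-1.0F, 1.0F);

SumBroadcast(a, b, c, 1.0F, false);   /* c = a + b with b broadcast over dimension 0 */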