Commit 314f4370 by liyinqiao

Merge branch 'liyinqiao' into xiaotong-working

# Conflicts:
#	source/tensor/XDevice.cpp
#	source/tensor/XMem.cpp
#	source/tensor/XTensor.cpp

Parents: 7c17670d, 58181c8d
@@ -20,7 +20,9 @@
*/
#include "XBackwardLoss.h"
+#include "XNoder.h"
#include "../tensor/XName.h"
+#include "../tensor/function/FHeader.h"
#include "../tensor/core/getandset/SetData.h"
#include "../tensor/function/HardTanH.h"
#include "../tensor/function/Identity.h"
@@ -31,6 +33,60 @@
namespace nts{

+/* compute dE/dx of a node */
+void XLossGrad::MakeGrad(XTensor * node, bool isEfficient)
+{
+    XLink &income = node->income;
+    int operID = income.typeID;
+
+    CheckNTErrors(income.tailNum >= 1, "Wrong number of tensors for loss computation!");
+
+    XTensor * output = income.tails[0];
+    XTensor * gold = NULL;
+    XTensor * weight = NULL;
+    XTensor * padding = NULL;
+    int leadingDim;
+
+    XNoder::MakeGrad(output);
+    XTensor * dedy = output->grad;
+
+    if (income.tailNum == 1) {
+        if(dedy->dataType == X_FLOAT)
+            _SetDataFixedFloat(dedy, 1.0F);
+        else if(dedy->dataType == X_DOUBLE)
+            _SetDataFixedDouble(dedy, 1.0);
+        else if(dedy->dataType == X_INT)
+            _SetDataFixedInt(dedy, 1);
+        else
+            ShowNTErrors("TODO");
+
+        return;
+    }
+
+    gold = income.tails[1];
+
+    if(operID == LOSS_CROSSENTROPY) {
+        if (income.tailNum == 3)
+            padding = income.tails[2];
+        leadingDim = income.GetParamInt(0);
+        CheckNTErrors(leadingDim >= 0 && leadingDim < output->order, "Wrong leading dimension for the loss computation!");
+        _CrossEntropyBackward(dedy, output, gold, weight, padding, leadingDim);
+    }
+    else{
+        ShowNTErrors("Wrong loss function type!");
+    }
+
+    node->visitMark = NODE_FINISHED;
+}
+
+/* indicates whether the node is for a loss computation */
+bool XLossGrad::IsLossOP(XTensor * node)
+{
+    XLink &income = node->income;
+    return (income.typeID & LOSS_BASE) != 0;
+}
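
A minimal sketch of the operator-ID test used by IsLossOP: loss operators are assumed to live in a range flagged by the LOSS_BASE bit, so a single bitwise AND classifies a node. The constant values below are hypothetical, for illustration only.

    const int LOSS_BASE_          = 0x00010000;       // hypothetical flag bit for loss ops
    const int LOSS_CROSSENTROPY_  = LOSS_BASE_ + 1;   // a loss operator carries the flag
    const int MATH_MULTIPLY_      = 0x00000101;       // a math operator does not

    bool IsLossID(int typeID)
    {
        return (typeID & LOSS_BASE_) != 0;            // true only for the loss range
    }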
/*
compute dE/dx for a given function y = f(x)
>> gold - gold standard to measure error (or loss)
...

@@ -23,6 +23,7 @@
#include "../tensor/XTensor.h"
#include "../tensor/function/FHeader.h"
+#include "../tensor/loss/LHeader.h"

#ifndef __XBACKWARDLOSS_H__
#define __XBACKWARDLOSS_H__
@@ -34,6 +35,14 @@ namespace nts{
class XLossGrad
{
public:
+    /* compute dE/dx of a node */
+    static
+    void MakeGrad(XTensor * node, bool isEfficient);
+
+    /* indicates whether the node is for a loss computation */
+    static
+    bool IsLossOP(XTensor * node);
+
    /* compute dE/dx for a given function y = f(x) */
    void Compute(XTensor * gold, XTensor * y, XTensor * x,
                 XTensor * dedy, XTensor * dedx, XTensor * padding,
...

@@ -81,6 +81,12 @@ void XMathGrad::MakeGrad(XTensor * node, bool isEfficient)
        GradPower(node, isEfficient);
    else if(operID == MATH_SCALEANDSHIFT)
        GradScaleAndShift(node, isEfficient);
+    else if(operID == MATH_SCALE)
+        GradScale(node, isEfficient);
+    else if(operID == MATH_DESCALE)
+        GradDescale(node, isEfficient);
+    else if(operID == MATH_SHIFT)
+        GradShift(node, isEfficient);
    else if(operID == MATH_SUB)
        GradSub(node, isEfficient);
    else if(operID == MATH_SUBDIM)
@@ -719,12 +725,18 @@ void XMathGrad::GradMultiply(XTensor * node, bool isEfficient)
    XTensor * a = income.tails[0];
    XTensor * b = income.tails[1];

-    XNoder::MakeGrad(a);
-    XNoder::MakeGrad(b);
-
    CheckNTErrors(XTensor::IsSameShaped(a, b), "Wrong sized input tensors!");

+    if (!isEfficient || a->isGrad) {
+        XNoder::MakeGrad(a);
        _Multiply(node->grad, b, a->grad, 1.0F);
+    }
+
+    if (!isEfficient || b->isGrad) {
+        XNoder::MakeGrad(b);
        _Multiply(node->grad, a, b->grad, 1.0F);
+    }

    node->visitMark = NODE_FINISHED;
}
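
For c = a * b (element-wise), the product rule gives dE/da = dE/dc * b and dE/db = dE/dc * a; the isEfficient guard above simply skips either half when that input keeps no gradient. A quick numeric check in plain C++ (values hypothetical):

    float a[2]    = {2.0F, 3.0F};
    float b[2]    = {4.0F, 5.0F};
    float dedc[2] = {1.0F, 1.0F};     // upstream gradient dE/dc

    float deda[2], dedb[2];
    for (int i = 0; i < 2; i++) {
        deda[i] = dedc[i] * b[i];     // dE/da = dE/dc * b -> {4, 5}
        dedb[i] = dedc[i] * a[i];     // dE/db = dE/dc * a -> {2, 3}
    }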
@@ -888,88 +900,8 @@ gradient for normalize
*/
void XMathGrad::GradNormalize(XTensor * node, bool isEfficient)
{
-    ShowNTErrors("This is really a bad piece of code!!!");
+    ShowNTErrors("TODO!");
-
-    XLink &income = node->income;
-    CheckNTErrors(income.tailNum == 5, "Wrong input tensor number for NORMALIZE!");
-
-    XTensor * input = income.tails[0];
-    XTensor * mean = income.tails[1];
-    XTensor * var = income.tails[2];
-    XTensor * a = income.tails[3];
-    XTensor * b = income.tails[4];
-    XTensor * c = NewTensor(var);
-    XTensor * d = NewTensor(a);
-    XTensor * e = NewTensor(a);
-    XTensor * f = NewTensor(a);
-    XTensor * g = NewTensor(a);
-    XTensor * h = NewTensor(a);
-    XTensor * i = NewTensor(a);
-    XTensor * j = NewTensor(a);
-    XTensor * k = NewTensor(var);
-    XTensor * p = NewTensor(var);
-    XTensor * q = NewTensor(var);
-    XTensor * r = NewTensor(a);
-    XTensor * x = NewTensor(mean);
-    XTensor * y = NewTensor(mean);
-    XTensor * z = NewTensor(mean);
-    DTYPE epsilon = income.GetParam(1);
-    int dim = income.GetParamInt(0);
-    int n = a->GetDim(dim);
-
-    XNoder::MakeGrad(input);
-    XNoder::MakeGrad(mean);
-    XNoder::MakeGrad(var);
-    XNoder::MakeGrad(a);
-    XNoder::MakeGrad(b);
-
-    /* dEdinput */
-    _ScaleAndShift(var, c, 1.0F, epsilon);
-    _Unsqueeze(c, d, dim, n);
-    _Power(d, e, -0.5F);
-    _Multiply(a, e, f);
-    _Multiply(node->grad, f, input->grad, 1.0F);
-
-    /* dEdmean */
-    _ScaleAndShift(f, g, -1.0F);
-    _ReduceSum(g, x, dim);
-    _ReduceSum(node->grad, y, dim);
-    _Multiply(y, x, mean->grad, 1.0F);
-
-    /* dEdvar */
-    _Unsqueeze(mean, h, dim, n);
-    _Sub(input, h, i);
-    _Multiply(a, i, j);
-    _Power(var, k, -1.5F);
-    _ScaleAndShift(k, p, -0.5F);
-    _ReduceSum(j, z, dim);
-    _Multiply(z, p, q);
-    _Multiply(y, q, var->grad, 1.0F);
-
-    /* dEda */
-    _Multiply(i, e, r);
-    _Multiply(node->grad, r, a->grad, 1.0F);
-
-    /* dEdb */
-    _Sum(b->grad, node->grad, b->grad);
-
-    node->visitMark = NODE_FINISHED;
-
-    delete c;
-    delete d;
-    delete e;
-    delete f;
-    delete g;
-    delete h;
-    delete i;
-    delete j;
-    delete k;
-    delete p;
-    delete q;
-    delete r;
-    delete x;
-    delete y;
-    delete z;
}

/*
@@ -1030,6 +962,82 @@ void XMathGrad::GradScaleAndShift(XTensor * node, bool isEfficient)
}

/*
+gradient for Scale
+for
+c = a * scale
+we have
+dE/da = dE/dc * scale
+>> node - the node (c) for backward computation
+>> isEfficient - indicates whether the computation is in
+                 an efficient manner
+*/
+void XMathGrad::GradScale(XTensor * node, bool isEfficient)
+{
+    XLink &income = node->income;
+    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SCALE!");
+
+    XTensor * a = income.tails[0];
+    DTYPE scale = income.GetParam(0);
+
+    XNoder::MakeGrad(a);
+
+    _Sum(a->grad, node->grad, a->grad, scale);
+
+    node->visitMark = NODE_FINISHED;
+}
+
+/*
+gradient for Descale
+for
+c = a / descale
+we have
+dE/da = dE/dc / descale
+>> node - the node (c) for backward computation
+>> isEfficient - indicates whether the computation is in
+                 an efficient manner
+*/
+void XMathGrad::GradDescale(XTensor * node, bool isEfficient)
+{
+    XLink &income = node->income;
+    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for DESCALE!");
+
+    XTensor * a = income.tails[0];
+    DTYPE descale = income.GetParam(0);
+
+    XNoder::MakeGrad(a);
+
+    _Sum(a->grad, node->grad, a->grad, 1.0F / descale);
+
+    node->visitMark = NODE_FINISHED;
+}
+
+/*
+gradient for Shift
+for
+c = a + shift
+we have
+dE/da = dE/dc
+>> node - the node (c) for backward computation
+>> isEfficient - indicates whether the computation is in
+                 an efficient manner
+*/
+void XMathGrad::GradShift(XTensor * node, bool isEfficient)
+{
+    XLink &income = node->income;
+    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SHIFT!");
+
+    XTensor * a = income.tails[0];
+
+    XNoder::MakeGrad(a);
+
+    _Sum(a->grad, node->grad, a->grad);
+
+    node->visitMark = NODE_FINISHED;
+}
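
A quick numeric check of the three rules above, with hypothetical values; note that all three accumulate into a->grad via _Sum, so existing gradient content is added to rather than overwritten:

    float dedc = 1.0F;                  // upstream gradient dE/dc
    float dedaScale   = dedc * 2.0F;    // c = a * 2:  dE/da = dE/dc * scale   -> 2.0
    float dedaDescale = dedc / 4.0F;    // c = a / 4:  dE/da = dE/dc / descale -> 0.25
    float dedaShift   = dedc;           // c = a + 7:  dE/da = dE/dc           -> 1.0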
/*
gradient for minus
for
c = a - b * \beta
...

@@ -130,6 +130,18 @@ private:
    static
    void GradScaleAndShift(XTensor * node, bool isEfficient);

+    /* gradient for Scale */
+    static
+    void GradScale(XTensor * node, bool isEfficient);
+
+    /* gradient for Shift */
+    static
+    void GradShift(XTensor * node, bool isEfficient);
+
+    /* gradient for Descale */
+    static
+    void GradDescale(XTensor * node, bool isEfficient);
+
    /* gradient for Minus */
    static
    void GradSub(XTensor * node, bool isEfficient);
...

@@ -43,6 +43,8 @@ void XShapeGrad::MakeGrad(XTensor * node, bool isEfficent)
        GradCopyIndexed(node, isEfficent);
    else if(operID == MOVEMENT_GATHER)
        GradGather(node, isEfficent);
+    else if (operID == MOVEMENT_DROPOUTWITHINDEX)
+        GradDropoutWithIndex(node, isEfficent);
    else if(operID == SHAPE_MERGE)
        GradMerge(node, isEfficent);
    else if(operID == SHAPE_MERGE_LIST)
@@ -115,7 +117,7 @@ dE/da = spreadforgather(b)
void XShapeGrad::GradGather(XTensor * node, bool isEfficent)
{
    XLink &income = node->income;
-    CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for CopyIndexed!");
+    CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for Gather!");
    XTensor * input = income.tails[0];
    XTensor * index = income.tails[1];
@@ -127,6 +129,43 @@ void XShapeGrad::GradGather(XTensor * node, bool isEfficent)
}

/*
+gradient computation for the DropoutWithIndex function
+*/
+void XShapeGrad::GradDropoutWithIndex(XTensor * node, bool isEfficent)
+{
+    XLink &income = node->income;
+    CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for DropoutWithIndex!");
+
+    XTensor * input = income.tails[0];
+    XTensor * index = income.tails[1];
+    DTYPE scale = income.GetParam(0);
+
+    XNoder::MakeGrad(input);
+
+    //_Identity(node->grad, input->grad);
+    _CopyValues(node->grad, input->grad);
+
+    int order = node->grad->order;
+    int * dimSize = new int[order];
+    for (int i = 0; i < order; i++) {
+        dimSize[i] = node->grad->dimSize[i];
+    }
+
+    int order1 = 1;
+    int * dimSize1 = new int[order1];
+    dimSize1[0] = input->grad->unitNum;
+    input->grad->Reshape(order1, dimSize1);
+
+    _DropoutWithIndex(node->grad, index, input->grad);
+    _ScaleAndShiftMe(input->grad, scale);
+
+    input->grad->Reshape(order, dimSize);
+
+    node->visitMark = NODE_FINISHED;
+}
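
The Reshape pair above is the whole trick: the gradient is flattened to 1d so the flat dropout indices can address it directly, then restored to its original shape. A sketch of the same pattern in plain C++, assuming (for illustration only) that the index lists the positions dropout zeroed and that the remaining entries are rescaled:

    float dedx[6]  = {1, 2, 3, 4, 5, 6};  // logically [2, 3], addressed flat
    int   index[2] = {1, 4};              // flat positions (assumed dropped)
    float scale    = 2.0F;                // e.g. 1/(1-p)

    for (int i = 0; i < 2; i++)
        dedx[index[i]] = 0.0F;            // stand-in for _DropoutWithIndex
    for (int i = 0; i < 6; i++)
        dedx[i] *= scale;                 // _ScaleAndShiftMe(input->grad, scale)
    // the buffer is then viewed as [2, 3] again, as the final Reshape does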
/*
gradient for merge
for
c = merge(a_0, a_1, ...)
@@ -232,8 +271,8 @@ void XShapeGrad::GradMergeList(XTensor * node, bool isEfficient)
    CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for MERGE!");

    XTensor * last = NULL;
-    XList smalls(income.tailNum);
-    XList smallsGrad(income.tailNum);
+    TensorList smalls(income.tailNum);
+    TensorList smallsGrad(income.tailNum);
    bool mergeOnly = true;

    for(int i = 0; i < income.tailNum; i++){
        XTensor * tail = income.tails[i];
@@ -401,7 +440,7 @@ void XShapeGrad::GradSplitListPost(XTensor * node, bool isEfficient)
    /* we compute the gradient for the current node, rather than for
       the child node, i.e., we use the outgoing edge here */
    XLink &outgo = node->outgo;
-    XList splits(outgo.tailNum);
+    TensorList splits(outgo.tailNum);
    int whereToSplit = -1;
    int splitNum = 0;
...

@@ -54,6 +54,10 @@ private:
    static
    void GradGather(XTensor * node, bool isEfficent);

+    /* gradient computation for dropout with index: b = dropoutwithindex(a, index) */
+    static
+    void GradDropoutWithIndex(XTensor * node, bool isEfficent);
+
    /* gradient computation for merge: c = merge(a, b, ...) */
    static
    void GradMerge(XTensor * node, bool isEfficent);
...

@@ -55,7 +55,7 @@ void XNetClearAll()
XNet::XNet()
{
    nodes.Clear();
-    isGradEfficient = true;
+    isGradEfficient = false;
}

/* destructor */
@@ -79,13 +79,13 @@ backward propagation to obtain gradient
*/
void XNet::Backward(XTensor &root, LOSS_FUNCTION_NAME loss)
{
-    XList roots(1);
+    TensorList roots(1);
    roots.Add(&root);

-    XList golds(1);
+    TensorList golds(1);
    golds.Add(NULL);

-    XList paddings(1);
+    TensorList paddings(1);
    paddings.Add(NULL);

    Backward(roots, golds, paddings, loss);
@@ -99,13 +99,13 @@ backward propagation to obtain gradient wrt. the loss/error function
*/
void XNet::Backward(XTensor &root, XTensor &gold, LOSS_FUNCTION_NAME loss)
{
-    XList roots(1);
+    TensorList roots(1);
    roots.Add(&root);

-    XList golds(1);
+    TensorList golds(1);
    golds.Add(&gold);

-    XList paddings(1);
+    TensorList paddings(1);
    paddings.Add(NULL);

    Backward(roots, golds, paddings, loss);
@@ -120,13 +120,13 @@ backward propagation to obtain gradient wrt. the loss/error function
*/
void XNet::Backward(XTensor &root, XTensor &gold, XTensor &padding, LOSS_FUNCTION_NAME loss)
{
-    XList roots(1);
+    TensorList roots(1);
    roots.Add(&root);

-    XList golds(1);
+    TensorList golds(1);
    golds.Add(&gold);

-    XList paddings(1);
+    TensorList paddings(1);
    paddings.Add(&padding);

    Backward(roots, golds, paddings, loss);
@@ -138,10 +138,10 @@ with a number of root nodes
>> roots - a list of root nodes (output) of the network
>> loss - name of loss function
*/
-void XNet::Backward(XList &roots, LOSS_FUNCTION_NAME loss)
+void XNet::Backward(TensorList &roots, LOSS_FUNCTION_NAME loss)
{
-    XList golds(roots.count);
-    XList paddings(roots.count);
+    TensorList golds(roots.count);
+    TensorList paddings(roots.count);
    for (int i = 0; i < roots.count; i++) {
        golds.Add(NULL);
        paddings.Add(NULL);
@@ -157,9 +157,9 @@ with a number of root nodes
>> golds - a list of gold standard for the output
>> loss - name of loss function
*/
-void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
+void XNet::Backward(TensorList &roots, TensorList &golds, LOSS_FUNCTION_NAME loss)
{
-    XList paddings(roots.count);
+    TensorList paddings(roots.count);
    for (int i = 0; i < roots.count; i++)
        paddings.Add(NULL);
@@ -174,7 +174,7 @@ with a number of root nodes
>> paddings - specify a target value that is ignored
>> loss - name of loss function
*/
-void XNet::Backward(XList &roots, XList &golds, XList &paddings, LOSS_FUNCTION_NAME loss)
+void XNet::Backward(TensorList &roots, TensorList &golds, TensorList &paddings, LOSS_FUNCTION_NAME loss)
{
    Traverse(roots);
@@ -190,18 +190,18 @@ void XNet::Backward(XList &roots, XList &golds, XList &paddings, LOSS_FUNCTION_N
    XLossGrad lossGrad;

    /* we start with the gradient with respect to the loss for output layers */
-    for(int i = 0; i < roots.count; i++){
+    /*for(int i = 0; i < roots.count; i++){
        XTensor * root = (XTensor*)roots.Get(i);
        XTensor * gold = (XTensor*)golds.Get(i);
        XTensor * padding = (XTensor*)paddings.Get(i);

        XLink &income = root->income;
        int funcID = income.typeID;
-        void * params = income.params;
+        void * params = income.params;*/

        /* we compute dE/dx if the output is generated by an activation function y = f(x).
           Note that we do not need to obtain dE/dy here because it is of no use in the
           following process of back-propagation */
-        if(gold != NULL && income.tailNum == 1 && (funcID & FUNCTION_BASE)){
+        /*if(gold != NULL && income.tailNum == 1 && (funcID & FUNCTION_BASE)){
            if(funcID == FUNC_LOGSOFTMAX || funcID == FUNC_SOFTMAX) {
                XTensor * x = income.tails[0];
                XNoder::MakeGrad(x);
@@ -212,13 +212,13 @@ void XNet::Backward(XList &roots, XList &golds, XList &paddings, LOSS_FUNCTION_N
                XNoder::MakeGrad(root);
                lossGrad.Compute(gold, root, root->grad, padding, loss);
            }
-        }
+        }*/

        /* we compute dE/dy (y is the output) if no predefined activation function is used */
-        else{
+        /*else{
            XNoder::MakeGrad(root);
            lossGrad.Compute(gold, root, root->grad, NULL, loss);
        }
-    }
+    }*/

    /* back-propagation from output to input */
    for(int i = nodes.count - 1; i >= 0; i--){
@@ -266,6 +266,8 @@ void XNet::BackwardNode(XTensor * node, bool isEfficent)
        XFuncGrad::MakeGrad(node, isEfficent);
    else if(XShapeGrad::IsShapeOP(node))
        XShapeGrad::MakeGrad(node, isEfficent);
+    else if(XLossGrad::IsLossOP(node))
+        XLossGrad::MakeGrad(node, isEfficent);
    else{
        ShowNTErrors("Wrong node type!");
    }
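
With this branch in place the loss is just another graph node, so a training step can presumably be driven entirely from the loss tensor (a sketch using the CrossEntropy/Backward calls that appear elsewhere in this commit):

    XTensor output;                       // network output (probabilities)
    XTensor gold;                         // gold-standard answers
    XTensor lossTensor;
    /* ... forward pass fills output ... */
    lossTensor = CrossEntropy(output, gold);

    XNet net;
    net.Backward(lossTensor);             // XLossGrad::MakeGrad fires at the loss node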
@@ -300,7 +302,7 @@ depth-first search (Tarjan's algorithm)
*/
void XNet::Traverse(XTensor &root)
{
-    XList roots(1);
+    TensorList roots(1);
    roots.Add(&root);

    Traverse(roots);
@@ -311,7 +313,7 @@ traverse the net and find the topological order by
depth-first search (Tarjan's algorithm)
>> roots - a list of roots (or output nodes)
*/
-void XNet::Traverse(XList &roots)
+void XNet::Traverse(TensorList &roots)
{
    id = MakeNetID();
    nodes.Clear();
@@ -336,7 +338,7 @@ depth-first search given a node (Tarjan's algorithm for topological ordering)
>> orders - topological order of the nodes
>> code - code of the network
*/
-void XNet::TarjanVisit(XTensor * node, XList &orders, const unsigned int code)
+void XNet::TarjanVisit(XTensor * node, TensorList &orders, const unsigned int code)
{
    if(node == NULL)
        return;
@@ -444,7 +446,7 @@ show network topology
*/
void XNet::ShowNetwork(FILE * file, XTensor * node)
{
-    XList roots(1);
+    TensorList roots(1);
    roots.Add(node);

    Traverse(roots);
@@ -464,9 +466,9 @@ search for a node in a top-down manner by its name
>> top - the top most node
<< return - the node we found
*/
-XTensor * XNet::SearchNode(XTensor * top, const char * name)
-{
-    return XLink::SearchNode(top, name);
-}
+//XTensor * XNet::SearchNode(XTensor * top, const char * name)
+//{
+//    return XLink::SearchNode(top, name);
+//}

}

@@ -23,6 +23,7 @@
#include "../tensor/XTensor.h"
#include "../tensor/function/FHeader.h"
+#include "../tensor/loss/LHeader.h"

#ifndef __XNET_H__
#define __XNET_H__
@@ -36,16 +37,16 @@ struct XNet
    unsigned int id;

    /* tensor nodes of the network (in order) */
-    XList nodes;
+    TensorList nodes;

    /* tensor nodes to keep gradient for output (e.g., SGD) */
-    XList gradNodes;
+    TensorList gradNodes;

    /* output nodes of the network */
-    XList outputs;
+    TensorList outputs;

    /* input nodes of the network */
-    XList inputs;
+    TensorList inputs;

    /* indicates whether the network just keeps the gradient for parameter tensors */
    bool isGradEfficient;
@@ -70,15 +71,15 @@ struct XNet
    /* backward propagation to obtain gradient
       with a number of root nodes */
-    void Backward(XList &roots, LOSS_FUNCTION_NAME loss = NOLOSS);
+    void Backward(TensorList &roots, LOSS_FUNCTION_NAME loss = NOLOSS);

    /* backward propagation to obtain gradient
       with a number of root nodes */
-    void Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss = NOLOSS);
+    void Backward(TensorList &roots, TensorList &golds, LOSS_FUNCTION_NAME loss = NOLOSS);

    /* backward propagation to obtain gradient wrt. the loss/error function
       with a number of root nodes */
-    void Backward(XList &roots, XList &golds, XList &paddings, LOSS_FUNCTION_NAME loss = NOLOSS);
+    void Backward(TensorList &roots, TensorList &golds, TensorList &paddings, LOSS_FUNCTION_NAME loss = NOLOSS);

    /* backward computation for a given node */
    void BackwardNode(XTensor * node, bool isEfficent = false);
@@ -92,10 +93,10 @@ struct XNet
    /* traverse the net and find the topological order by
       depth-first search (Tarjan's algorithm) */
-    void Traverse(XList &roots);
+    void Traverse(TensorList &roots);

    /* depth-first search given a node (Tarjan's algorithm for topological ordering) */
-    void TarjanVisit(XTensor * node, XList &orders, const unsigned int code);
+    void TarjanVisit(XTensor * node, TensorList &orders, const unsigned int code);

    /* dump network information */
    void Dump(FILE * file);
@@ -113,8 +114,8 @@ struct XNet
    void ShowNetwork(FILE * file, XTensor * node);

    /* search a node in a top-down manner by its name */
-    static
-    XTensor * SearchNode(XTensor * top, const char * name);
+    //static
+    //XTensor * SearchNode(XTensor * top, const char * name);
};

/* we make a unique id for every tensor */
...

@@ -247,13 +247,13 @@ void Check(FNNModel &model)
/* make a hard copy of the fnn model */
void Copy(FNNModel &tgt, FNNModel &src)
{
-    InitTensor(&tgt.embeddingW, &src.embeddingW);
+    InitTensorV2(&tgt.embeddingW, &src.embeddingW);
    for(int i = 0; i < MAX_HIDDEN_NUM; i++){
-        InitTensor(&tgt.hiddenW[i], &src.hiddenW[i]);
-        InitTensor(&tgt.hiddenB[i], &src.hiddenB[i]);
+        InitTensorV2(&tgt.hiddenW[i], &src.hiddenW[i]);
+        InitTensorV2(&tgt.hiddenB[i], &src.hiddenB[i]);
    }
-    InitTensor(&tgt.outputW, &src.outputW);
-    InitTensor(&tgt.outputB, &src.outputB);
+    InitTensorV2(&tgt.outputW, &src.outputW);
+    InitTensorV2(&tgt.outputB, &src.outputB);

    tgt.n = src.n;
    tgt.eSize = src.eSize;
@@ -310,7 +310,7 @@ initialize a 1d tensor using the fnn model setting
*/
void InitModelTensor1D(XTensor &tensor, int num, FNNModel &model)
{
-    InitTensor1D(&tensor, num, X_FLOAT, model.devID, model.mem);
+    InitTensor1DV2(&tensor, num, X_FLOAT, model.devID);
}

/*
@@ -322,7 +322,7 @@ initialize a 2d tensor using the fnn model setting
*/
void InitModelTensor2D(XTensor &tensor, int rowNum, int colNum, FNNModel &model)
{
-    InitTensor2D(&tensor, rowNum, colNum, X_FLOAT, model.devID, model.mem);
+    InitTensor2DV2(&tensor, rowNum, colNum, X_FLOAT, model.devID);
}
@@ -449,6 +449,9 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
    /* the gold standard */
    XTensor gold;

+    /* the loss tensor */
+    XTensor lossTensor;
+
    /* make the input tensor for position i */
    for(int i = 0; i < model.n - 1; i++)
        MakeWordBatch(inputs[i], ngrams, ngramNum, i, model.vSize, model.devID, model.mem);
@@ -466,6 +469,8 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
    /* forward computation */
    Forward(inputs, output, model, net);

    /* backward computation to obtain gradients */
    Backward(inputs, output, gold, CROSSENTROPY, model, grad, net);
@@ -483,9 +488,11 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
    /* this is implemented by the multiply function */
    //ForwardAutoDiff(inputs, output, model);

+    lossTensor = CrossEntropy(output, gold);
+
    /* automatic differentiation */
-    autoDiffer.Backward(output, gold, CROSSENTROPY);
+    autoDiffer.Backward(lossTensor);
+    //autoDiffer.Backward(output, gold, CROSSENTROPY);

    /* update model parameters */
    Update(model, grad, learningRate, true);
@@ -494,7 +501,9 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
    /* get probabilities */
    float prob = GetProb(output, gold);
-    loss += -prob;
+    prob = ReduceSumAll(lossTensor);
+    loss += prob;

    wordCount += ngramNum;
    wordCountTotal += ngramNum;
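
The sign convention flips here: GetProb returned a log-probability (the higher the better), while the summed cross entropy is already a non-negative loss. A worked check with a hypothetical batch:

    float logP    = -2.3F;     // old: GetProb result (log-probability)
    float oldLoss = -logP;     // accumulated 2.3
    float ce      = 2.3F;      // new: ReduceSumAll(lossTensor)
    float newLoss = ce;        // same value, accumulated directly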
@@ -537,8 +546,8 @@ update the model parameters using the delta rule
*/
void Update(FNNModel &model, FNNModel &grad, float epsilon, bool isNodeGrad)
{
-    XList paraList(10);
-    XList gradList(10);
+    TensorList paraList(10);
+    TensorList gradList(10);

    paraList.Add(&model.outputW);
    paraList.Add(&model.outputB);
@@ -595,14 +604,14 @@ get prediction probabilities of the gold words
float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs)
{
    XTensor probs;
-    InitTensor(&probs, &output);
+    InitTensorV2(&probs, &output);

    /* probs[i,j] = output[i,j] * gold[i,j] */
    _Multiply(&output, &gold, &probs);

    /* probability of each word */
    XTensor wprobs;
-    InitTensor1D(&wprobs, output.GetDim(0), output.dataType, output.devID, output.mem);
+    InitTensor1DV2(&wprobs, output.GetDim(0), output.dataType, output.devID);
    _ReduceSum(&probs, &wprobs, 1);

    if(wordProbs != NULL)
        _CopyValues(&wprobs, wordProbs);
@@ -616,7 +625,7 @@ float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs)
    /* probability for the batch */
    XTensor result;
-    InitTensor1D(&result, 1, X_FLOAT, output.devID, output.mem);
+    InitTensor1DV2(&result, 1, X_FLOAT, output.devID);
    _ReduceSum(&probs, &result, 1);

    return result.Get1D(0);
@@ -718,7 +727,7 @@ The indexed cell is set to 1, and 0 otherwise.
void InitZeroOneTensor2D(XTensor &tensor, int rowNum, int colNum, int * rows, int * cols,
                         int itemNum, int devID, XMem * mem)
{
-    InitTensor2D(&tensor, rowNum, colNum, X_FLOAT, devID, mem);
+    InitTensor2DV2(&tensor, rowNum, colNum, X_FLOAT, devID);
    tensor.SetZeroAll();
@@ -765,7 +774,7 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
    int batchSize = -1;
    int n = model.n;
    int depth = model.hDepth;
-    XList eList(n - 1);
+    TensorList eList(n - 1);

    /* previous n - 1 words */
    for(int i = 0; i < n - 1; i++){
@@ -811,7 +820,7 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
    /* make a 2d tensor for the bias term */
    XTensor b2D;
-    InitTensor(&b2D, &s);
+    InitTensorV2(&b2D, &s);
    _Unsqueeze(&b, &b2D, 0, batchSize);

    /* introduce bias term:
@@ -843,7 +852,7 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
    _MatrixMul(&h_last, X_NOTRANS, &w, X_NOTRANS, &s);

    XTensor b2D;
-    InitTensor(&b2D, &s);
+    InitTensorV2(&b2D, &s);
    _Unsqueeze(&b, &b2D, 0, batchSize);
    _Sum(&s, &b2D, &s);
@@ -908,8 +917,8 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
    XTensor dedsHidden;
    XTensor dedxBottom;
    if (depth > 0)
-        InitTensor(&dedsHidden, &dedx);
-    InitTensor(&dedxBottom, &net.embeddingCat);
+        InitTensorV2(&dedsHidden, &dedx);
+    InitTensorV2(&dedxBottom, &net.embeddingCat);

    /* back-propagation from top to bottom in the stack of hidden layers
       for each layer, h = f(s) */
@@ -943,11 +952,11 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
        _CopyValues(&dedx, &gradPassed);
    }

-    XList eList(n - 1);
+    TensorList eList(n - 1);

    /* back-propagation for the embedding layer */
    for (int i = 0; i < n - 1; i++) {
-        XTensor * dedy = NewTensor2D(batchSize, model.eSize, X_FLOAT, model.devID, model.mem);
+        XTensor * dedy = NewTensor2DV2(batchSize, model.eSize, X_FLOAT, model.devID);
        eList.Add(dedy);
    }
@@ -999,7 +1008,7 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model)
        }
    }

-    InitTensor1D(&words, size, X_INT, model.devID, model.mem);
+    InitTensor1DV2(&words, size, X_INT, model.devID);
    words.SetData(index, size);

    embeddingBig = Gather(model.embeddingW, words);
@@ -1017,7 +1026,8 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model)
        hidden = HardTanH(MMul(hidden, model.hiddenW[i]) + model.hiddenB[i]);

    /* output layer */
-    output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
+    //output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
+    output = Softmax(MMul(hidden, model.outputW) + model.outputB, 1);
}
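
The switch to Softmax presumably pairs with the new CrossEntropy node, which takes probabilities and applies the log itself, where LogSoftmax fed log-probabilities to the old loss path. A tiny two-word-vocabulary check (plain C++, using <cmath>):

    float s[2] = {1.0F, 0.0F};               // output-layer scores
    float z    = expf(s[0]) + expf(s[1]);
    float y0   = expf(s[0]) / z;             // Softmax probability of word 0
    float ce   = -logf(y0);                  // what CrossEntropy computes for gold word 0
    // with LogSoftmax, the network itself would emit logf(y0),
    // and the loss would only negate it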
/*
@@ -1036,7 +1046,7 @@ void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model)
    XTensor hidden;
    XTensor b;

-    XList inputList(n - 1);
+    TensorList inputList(n - 1);
    for(int i = 0; i < n - 1; i++)
        inputList.Add(inputs + i);
@@ -1177,7 +1187,7 @@ void Test(const char * test, const char * result, FNNModel &model)
    /* prediction probabilities */
    XTensor probs;
-    InitTensor1D(&probs, ngramNum);
+    InitTensor1DV2(&probs, ngramNum);

    /* get probabilities */
    float prob = GetProb(output, gold, &probs);
...

@@ -127,7 +127,7 @@ XTensor T2TAttention::MakeBig(XTensor &kqv, XTensor &mask, bool isTraining)
    XTensor q2;
    XTensor v2;
    XTensor kqv2;
-    XList split;
+    TensorList split;

    kqv2 = MMul(kqv, wbig);
...

@@ -85,7 +85,7 @@ void T2TModel::InitModel(int argc, char ** argv)
    if(isMT)
        decoder->InitModel(argc, argv, true, 0, devID, mem);

-    XList params(10);
+    TensorList params(10);
    GetParams(params);

    for(int i = 0; i < params.count; i++){
@@ -403,7 +403,7 @@ void T2TModel::MakeMTMaskDec(XTensor &inputEnc, XTensor &inputDec,
get parameter matrices
>> list - the list that keeps the parameter matrices
*/
-void T2TModel::GetParams(XList &list)
+void T2TModel::GetParams(TensorList &list)
{
    list.Clear();
    list.Add(&outputLayer->w);
@@ -465,7 +465,7 @@ void T2TModel::Dump(const char * fn)
    FILE * file = fopen(fn, "wb");
    CheckNTErrors(file, "Cannot open the model file");

-    XList params(100);
+    TensorList params(100);
    GetParams(params);
@@ -489,7 +489,7 @@ void T2TModel::Read(const char * fn)
    FILE * file = fopen(fn, "rb");
    CheckNTErrors(file, "Cannot open the model file");

-    XList params(100);
+    TensorList params(100);
    GetParams(params);
...

@@ -98,7 +98,7 @@ public:
                   XTensor &maskDec, XTensor &maskEncDec);

    /* get parameter matrices */
-    void GetParams(XList &list);
+    void GetParams(TensorList &list);

    /* dump the parameters */
    void Dump(const char * fn);
...

@@ -93,9 +93,8 @@ void T2TOutput::Make(XTensor &input, XTensor &output)
{
    XTensor &x = input;

-    output = LogSoftmax(MMul(x, w), -1);
-    //output = Softmax(MMul(x, w), -1);
+    //output = LogSoftmax(MMul(x, w), -1);
+    output = Softmax(MMul(x, w), -1);
    output.SetName(OUTPUT_NAME);
}
...

@@ -174,12 +174,13 @@ void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
    _SetDataFixedInt(&first, startSymbol);

    /* add a new word into the input sequence of the decoder side */
    if (inputLast == NULL) {
        inputDec = Identity(first);
    }
    else{
        inputDec = GeneratePaths(s);
        inputDec.SetDevice(inputEnc->devID, inputEnc->mem);

        inputDec = Concatenate(first, inputDec, inputDec.order - 1);
    }
...
@@ -96,10 +96,10 @@ public:
    /* layers on the encoder side. We actually use the encoder output instead
       of all hidden layers. */
-    XList layersEnc;
+    TensorList layersEnc;

    /* layers on the decoder side */
-    XList layersDec;
+    TensorList layersDec;

    /* list of states */
    T2TState * states;
...

@@ -197,12 +197,13 @@ void T2TSearch::Score(T2TStateBundle * prev, T2TStateBundle * beam)
    prob.Reshape(prob.unitNum / outputSize, outputSize);
    score.Reshape(score.unitNum / outputSize, outputSize);
    probPath.Reshape(score.unitNum / outputSize, outputSize);
    probPathPrev.Reshape(probPathPrev.unitNum);

    /* the log-scale probability of the entire sequence */
    _SumDim(&prob, &probPathPrev, &probPath, 0);

    InitTensor(&len, &lenPrev);
    InitTensor(&lp, &lenPrev);
@@ -302,7 +303,7 @@ void T2TSearch::Generate(T2TStateBundle * beam)
    /* Then, we do something similar to "preID". For the top-k predictions, we need
       to know their indices in the vocabulary. We compute the offset of each prediction
       in the vocabulary by dividing it by the vocab size and taking the remainder. */
-    Mod(index, sizeVocab);
+    _ModMe(index, sizeVocab);

    score.Reshape(order, dims);
@@ -315,18 +316,19 @@ void T2TSearch::Generate(T2TStateBundle * beam)
    InitTensor(&indexCPU, index.order, index.dimSize, index.dataType, index.denseRatio, -1);
    CopyValues(index, indexCPU);

    for (int i = 0; i < indexCPU.unitNum; i++)
        indexCPU.SetInt(i * stride + indexCPU.GetInt(i), i);

    CheckNTErrors(XTensor::IsSameShaped(&prob, &probPath), "Wrong tensor shape!");

-    /* sequence probability and prediction probability of top-k candidates */
+    /* sequence probability of top-k candidates */
    XTensor probPathTopK;
    InitTensor(&probPathTopK, &scoreTopK);
+    XTensor probTopK;
+    InitTensor(&probTopK, &scoreTopK);

    for (int i = 0; i < probPath.order; i++) {
        dims[i] = probPath.GetDim(i);
        dimsTopK[i] = probPathTopK.GetDim(i);
    }
@@ -342,6 +344,7 @@ void T2TSearch::Generate(T2TStateBundle * beam)
    probPath.Reshape(order, dims);
    probPathTopK.Reshape(order, dimsTopK);
    prob.Reshape(order, dims);
+    probTopK.Reshape(order, dimsTopK);
@@ -396,7 +399,7 @@ void T2TSearch::Expand(T2TStateBundle * prev, T2TStateBundle * beam)
       modification of the states. An alternative is to do this on GPUs but
       it needs much more coding work and the speed-up is not obvious. */
    for(int i = 0; i < beam->stateNum; i += beamSize){
        for (int j = 0; j < beamSize; j++) {
            int k = i + j;
            T2TState & state = states[k];
@@ -413,7 +416,7 @@ void T2TSearch::Expand(T2TStateBundle * prev, T2TStateBundle * beam)
            state.nstep = 0;
            state.isCompleted = false;
        }
        else {
            state.last = last;
            state.pid = state.last->pid;
            state.nstep = last->nstep + 1;
@@ -517,7 +520,7 @@ void T2TSearch::Dump(XTensor * output)
    /* we track the state from the end to the beginning */
    while(state != NULL){
        if (!state->isCompleted)
            isCompleted = false;
        if (isCompleted)
            words[count++] = -1;
@@ -589,7 +592,7 @@ XTensor T2TSearch::MakeFirstMask(T2TStateBundle * beam)
    mask.SetZeroAll();
    for (int i = 0; i < mask.unitNum; i++) {
        if (i % beamSize != 0)
            mask.Set(-1e9, i);
    }
...
@@ -154,8 +154,8 @@ void T2TTester::Dump(FILE * file, XTensor * output)
{
    int seqLength = output->GetDim(-1);

    for (int i = 0; i < output->unitNum; i += seqLength) {
        for (int j = 0; j < seqLength; j++) {
            int w = output->GetInt(i + j);
            fprintf(file, "%d ", w);
            if (w < 0)
...

@@ -24,6 +24,7 @@
#include "T2TUtility.h"
#include "../../tensor/XUtility.h"
#include "../../tensor/core/CHeader.h"
+#include "../../tensor/loss/LHeader.h"
#include "../../network/XNoder.h"

#ifndef WIN32
@@ -209,13 +210,16 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
    labelOnehot = IndexToOnehot(label, vSizeTgt, labelSmoothingP);

    /* make paddings for the output */
-    if (output.GetDim(0) > 0)
-        PadOutput(&output, &labelOnehot, &paddingDec);
+    //if (output.GetDim(0) > 0)
+        //PadOutput(&output, &labelOnehot, &paddingDec);

    /* get probabilities */
-    float prob = GetProb(&output, &labelOnehot, NULL);
+    //float prob = GetProb(&output, &labelOnehot, NULL);
+    XTensor lossTensor;
+    lossTensor = CrossEntropy(output, labelOnehot, paddingDec);
+    float prob = ReduceSumAll(lossTensor);

-    DTYPE lossLocal = -prob / wc;
+    DTYPE lossLocal = prob / wc;
    bool doUpdate = (!IsNAN(lossLocal) && !IsINF(lossLocal) && lossLocal < 1e3F);

    //XTensor &g = labelSmoothingP > 0 ? goldSmoothed : gold;
@@ -223,14 +227,15 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
    if (doUpdate) {
        /* rescale the output for normalized loss */
-        RescaleOutput(&output, &labelOnehot, &paddingDec);
+        //RescaleOutput(&output, &labelOnehot, &paddingDec);

        /* back-propagation */
-        net.Backward(output, labelOnehot, paddingDec, CROSSENTROPY);
+        net.Backward(lossTensor);
+        //net.Backward(output, labelOnehot, paddingDec, CROSSENTROPY);
        //net.Backward(output, label, labelSmoothingP, CROSSENTROPY);

        gradStep += 1;
-        loss += -prob;
+        loss += prob;
        wordCount += wc;
        wordCountTotal += wc;
@@ -260,7 +265,7 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
    if (step % 100 == 0) {
        double elapsed = GetClockSec() - startT;
        XPRINT8(0, stderr, "[INFO] elapsed=%.1fs, step=%d, epoch=%d, tword=%d, sword=%d, loss=%.3f, ppl=%.3f, sppl=%.3f",
-                elapsed, step, epoch, wordCountTotal, wordCountBatch, loss/wordCount, exp(loss/wordCount), exp(-prob/wc));
+                elapsed, step, epoch, wordCountTotal, wordCountBatch, loss/wordCount, exp(loss/wordCount), exp(prob/wc));
        if (!doUpdate)
            XPRINT(0, stderr, " (no update)");
        XPRINT(0, stderr, "\n");
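
Since prob now holds a positive cross-entropy sum, the perplexity exponent drops its minus sign. A check with hypothetical numbers:

    double prob = 230.0;               // summed cross entropy over the batch
    double wc   = 100.0;               // target word count
    double sppl = exp(prob / wc);      // exp(2.3) ~= 9.97, sentence-level perplexity
    // old convention: prob was a log-probability (-230.0), hence exp(-prob / wc)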
@@ -491,7 +496,7 @@ where
*/
void T2TTrainer::Update(T2TModel * model, const float lr)
{
-    XList ws(100);
+    TensorList ws(100);

    model->GetParams(ws);
@@ -552,7 +557,7 @@ void T2TTrainer::PrepareModel(T2TModel * model)
    moments.Clear();
    moments2nd.Clear();

-    XList ws(100);
+    TensorList ws(100);

    model->GetParams(ws);
...

@@ -82,10 +82,10 @@ public:
    float adamBeta2T;

    /* list of the moments of the parameter matrices */
-    XList moments;
+    TensorList moments;

    /* list of the 2nd order moments of the parameter matrices */
-    XList moments2nd;
+    TensorList moments2nd;

    /* indicates whether the data file is shuffled for training */
    bool isShuffled;
...

@@ -30,6 +30,7 @@
#include "XDevice.h"
#include "./test/Test.h"
#include "./core/CHeader.h"
+#include "./loss/CrossEntropy.h"

//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
...

@@ -476,7 +476,7 @@ split a string
>> items - splitting result
<< return - how many items are there
*/
-int SplitALine(char * inputString, const char * seperator, XList * items)
+int SplitALine(char * inputString, const char * seperator, StrList * items)
{
    items->Clear();
@@ -530,7 +530,7 @@ get device ids for the given device information
*/
int XDevManager::GetDeviceIDs(char * devInfo, int * devIDs)
{
-    XList * terms = new XList(1);
+    StrList * terms = new StrList(1);

    SplitALine(devInfo, " ", terms);

    for(int i = 0; i < terms->count; i++){
...
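
For illustration, the round trip GetDeviceIDs performs on a device string, written here with plain C calls (strtok/atoi from <string.h>/<stdlib.h>) and a hypothetical input; the real code routes this through SplitALine and a StrList:

    char devInfo[] = "0 1 2";
    int  devIDs[8];
    int  n = 0;
    for (char * t = strtok(devInfo, " "); t != NULL; t = strtok(NULL, " "))
        devIDs[n++] = atoi(t);         // devIDs = {0, 1, 2}, n = 3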
@@ -49,7 +49,7 @@ namespace nts {
#ifdef DOUBELPRICSION
#define DTYPE double
-#define DTYPE_MIN (DTYPE)1.79E+308
+#define DTYPE_MIN (DTYPE)-1.79E+308
#else
#define DTYPE float
#define DTYPE_MIN (DTYPE)-3.40E+38
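
The added minus sign matters: DTYPE_MIN is the "smaller than anything" sentinel, e.g. for seeding a running maximum, and with the old positive value a double-precision build could never update that maximum. A sketch of the failure mode, assuming the macros above are in scope:

    DTYPE MaxScore(const DTYPE * score, int n)
    {
        DTYPE best = DTYPE_MIN;        // must start below any real value
        for (int i = 0; i < n; i++)
            if (score[i] > best)
                best = score[i];       // never fired when DTYPE_MIN was +1.79E+308
        return best;
    }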
...
...@@ -300,9 +300,9 @@ void XLink::MakeLink(const XTensor * t1, const XTensor * t2, XTensor * h, int id ...@@ -300,9 +300,9 @@ void XLink::MakeLink(const XTensor * t1, const XTensor * t2, XTensor * h, int id
if(h == NULL) if(h == NULL)
return; return;
XList list(2); TensorList list(2);
list.Add(t1); list.Add((XTensor*)t1);
list.Add(t2); list.Add((XTensor*)t2);
MakeLink(&list, h, id); MakeLink(&list, h, id);
} }
...@@ -320,10 +320,10 @@ void XLink::MakeLink(const XTensor * t1, const XTensor * t2, const XTensor * t3, ...@@ -320,10 +320,10 @@ void XLink::MakeLink(const XTensor * t1, const XTensor * t2, const XTensor * t3,
if (h == NULL) if (h == NULL)
return; return;
XList list(3); TensorList list(3);
list.Add(t1); list.Add((XTensor*)t1);
list.Add(t2); list.Add((XTensor*)t2);
list.Add(t3); list.Add((XTensor*)t3);
MakeLink(&list, h, id); MakeLink(&list, h, id);
} }
...@@ -334,7 +334,7 @@ create a hyper edge with a list of tensors and a output tensor ...@@ -334,7 +334,7 @@ create a hyper edge with a list of tensors and a output tensor
>> h - head tensor >> h - head tensor
>> id - id of the edge type >> id - id of the edge type
*/ */
void XLink::MakeLink(const XList * list, XTensor * h, int id) void XLink::MakeLink(const TensorList * list, XTensor * h, int id)
{ {
/* forward */ /* forward */
XLink &income = h->income; XLink &income = h->income;
...@@ -368,7 +368,7 @@ create a hyper edge with a input tensors and a list of output tensors ...@@ -368,7 +368,7 @@ create a hyper edge with a input tensors and a list of output tensors
>> list - a list of output tensors >> list - a list of output tensors
>> id - id of the edge type >> id - id of the edge type
*/ */
void XLink::MakeLink(XTensor * t, XList * list, int id) void XLink::MakeLink(XTensor * t, TensorList * list, int id)
{ {
/* forward */ /* forward */
for(int i = 0; i < list->count; i++){ for(int i = 0; i < list->count; i++){
...@@ -528,10 +528,90 @@ void XLink::Replace(const XTensor * oldOne, XTensor * newOne) ...@@ -528,10 +528,90 @@ void XLink::Replace(const XTensor * oldOne, XTensor * newOne)
CheckNTErrors(hit, "No proper node found in parent.income edge!"); CheckNTErrors(hit, "No proper node found in parent.income edge!");
} }
} }
strcpy(newOne->name, oldOne->name);
} }
/*
copy a node with another, i.e., we add the links to the new node
>> src - the node to be copied
>> tgt - the new node
*/
void XLink::Copy(const XTensor * reference, XTensor * target)
{
if (reference == NULL || target == NULL)
return;
XLink &newIncome = target->income;
XLink &newOutgo = target->outgo;
XLink::ClearOutgoing(target);
XLink::ClearIncoming(target);
/* incoming nodes */
if (reference->income.typeID != 0) {
if (newIncome.tailNum < reference->income.tailNum) {
delete[] newIncome.tails;
newIncome.tails = new XTensor*[reference->income.tailNum];
}
newIncome.SetType(reference->income.typeID);
newIncome.head = target;
newIncome.tailNum = reference->income.tailNum;
memcpy(newIncome.tails, reference->income.tails, sizeof(XTensor*) * newIncome.tailNum);
int paraArraySize = reference->income.paramNum * reference->income.paramSize;
newIncome.params = new char[paraArraySize];
memcpy(newIncome.params, reference->income.params, paraArraySize);
newIncome.paramNum = reference->income.paramNum;
/* update the link to each child node */
for (int i = 0; i < newIncome.tailNum; i++) {
XTensor * child = newIncome.tails[i];
XLink &childOutgo = child->outgo;
bool hit = false;
for (int j = 0; j < childOutgo.tailNum; j++) {
if (childOutgo.tails[j] == reference) {
//childOutgo.tails[j] = target;
childOutgo.AddTail(target);
hit = true;
break;
}
}
if (childOutgo.tailNum > 0) {
CheckNTErrors(hit, "No proper node found in child.outgo edge!");
}
}
}
if (newOutgo.tailNum < reference->outgo.tailNum) {
delete[] newOutgo.tails;
newOutgo.tails = new XTensor*[reference->outgo.tailNum];
}
/* outgoing nodes */
newOutgo.head = target;
newOutgo.tailNum = reference->outgo.tailNum;
memcpy(newOutgo.tails, reference->outgo.tails, sizeof(XTensor*) * newOutgo.tailNum);
/* update the link to each parent node */
for (int i = 0; i < newOutgo.tailNum; i++) {
XTensor * parent = newOutgo.tails[i];
XLink &parentIncome = parent->income;
bool hit = false;
for (int j = 0; j < parentIncome.tailNum; j++) {
if (parentIncome.tails[j] == reference) {
//parentIncome.tails[j] = target;
parentIncome.AddTail(target);
hit = true;
}
}
if (parentIncome.tailNum > 0) {
CheckNTErrors(hit, "No proper node found in parent.income edge!");
}
}
}
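A sketch of what Copy does to the graph; NewTensor2D, MatrixMul and NewTensor are used as declared elsewhere in this change, and the names and shapes are illustrative only:
XTensor * x = NewTensor2D(2, 3);
XTensor * w = NewTensor2D(3, 4);
XTensor y = MatrixMul(*x, *w);      /* y.income is a MATH_MATRIXMUL edge with tails {x, w} */
XTensor * y2 = NewTensor(&y);       /* same shape, no links yet */
XLink::Copy(&y, y2);                /* y2 mirrors y's incoming edge; x and w now also list
                                       y2 among their outgoing tails (links are added, not moved) */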
/* /*
copy incoming edges of a given node copy incoming edges of a given node
>> reference - the node we copy from >> reference - the node we copy from
...@@ -544,7 +624,7 @@ void XLink::CopyIncoming(const XTensor * reference, XTensor * target) ...@@ -544,7 +624,7 @@ void XLink::CopyIncoming(const XTensor * reference, XTensor * target)
ClearIncoming(target); ClearIncoming(target);
int tailNum = reference->income.tailNum; int tailNum = reference->income.tailNum;
XList tails(tailNum); TensorList tails(tailNum);
for(int i = 0; i < tailNum; i++){ for(int i = 0; i < tailNum; i++){
XTensor * tail = (XTensor*)reference->income.tails[i]; XTensor * tail = (XTensor*)reference->income.tails[i];
tails.Add(tail); tails.Add(tail);
......
...@@ -33,7 +33,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor) ...@@ -33,7 +33,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/* cross reference */ /* cross reference */
struct XTensor; struct XTensor;
#define MAX_OP_NAME_LENGTH 16 #define MAX_OP_NAME_LENGTH 64
#define PARAM_UNTI_SIZE 64 #define PARAM_UNTI_SIZE 64
/* /*
...@@ -144,11 +144,11 @@ struct XLink ...@@ -144,11 +144,11 @@ struct XLink
/* create a hyper edge with a list of input tensors and an output tensor */ /* create a hyper edge with a list of input tensors and an output tensor */
static static
void MakeLink(const XList * list, XTensor * h, int id); void MakeLink(const TensorList * list, XTensor * h, int id);
/* create a hyper edge with an input tensor and a list of output tensors */ /* create a hyper edge with an input tensor and a list of output tensors */
static static
void MakeLink(XTensor * h, XList * list, int id); void MakeLink(XTensor * h, TensorList * list, int id);
/* add a parameter */ /* add a parameter */
static static
...@@ -174,6 +174,10 @@ struct XLink ...@@ -174,6 +174,10 @@ struct XLink
static static
void Replace(const XTensor * oldOne, XTensor * newOne); void Replace(const XTensor * oldOne, XTensor * newOne);
/* copy the links of one node to another, i.e., the target node takes over the links of the reference node */
static
void Copy(const XTensor * reference, XTensor * target);
/* copy links of a given node */ /* copy links of a given node */
static static
void CopyIncoming(const XTensor * reference, XTensor * target); void CopyIncoming(const XTensor * reference, XTensor * target);
......
/* NiuTrans.Tensor - an open-source tensor library /* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University. * Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved. * All rights reserved.
* *
* Licensed under the Apache License, Version 2.0 (the "License"); * Licensed under the Apache License, Version 2.0 (the "License");
...@@ -15,32 +15,31 @@ ...@@ -15,32 +15,31 @@
* limitations under the License. * limitations under the License.
*/ */
/* /*
* *
* Implementation of list that keeps data items * Implementation of template list that keeps data items
* *
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-04-17 * $Created by: HU Chi (huchinlp@foxmail.com)
* The first coding job this year!
* *
*/ */
#ifndef __XLIST_H__
#define __XLIST_H__
#include "XMem.h" #include "XMem.h"
#include "XGlobal.h" #include "XGlobal.h"
/* the nts (NiuTrans.Tensor) namespace */ #ifndef __TensorList_H__
namespace nts{ #define __TensorList_H__
typedef int (* ListCompare)(const void * item1, const void * item2); /* the nts (NiuTrans.Tensor) namespace */
namespace nts {
/* the XList class */ /* the TensorListBase class */
class XList template <typename T>
{ struct TensorListBase {
public: public:
/* data items */ /* data items */
void ** items; T *items;
/* number of items */ /* number of items */
int count; int count;
...@@ -49,56 +48,88 @@ public: ...@@ -49,56 +48,88 @@ public:
int maxNum; int maxNum;
/* the memory pool for data array allocation */ /* the memory pool for data array allocation */
XMem * mem; XMem* mem;
/* indicates whether data items are integers */
bool isIntList;
public: public:
/* constructor */ /* constructor */
XList(); TensorListBase();
/* constructor */ /* constructor */
XList(int myMaxNum, bool isIntListOrNot = false); TensorListBase(int myMaxNum);
/* constructor */ /* constructor */
XList(int myMaxNum, XMem * myMem, bool isIntListOrNot = false); TensorListBase(int myMaxNum, XMem* myMem);
/* de-constructor */ /* de-constructor */
~XList(); ~TensorListBase();
/* utilities */ /* add an item into the list */
void Create(int myMaxNum, XMem * myMem); void Add(T&& item);
void Add(const void * item);
void Add(void ** inputItems, int inputItemCount); /* add an item into the list */
void AddList(XList * l); void Add(const T& item);
void AddInt(int i);
void Insert(int pos, void * item); /* add a number of items into the list */
void * GetItem(int i) const; void Add(T* inputItems, int inputItemCount);
int GetItemInt(int i);
void SetItem(int i, void * item); /* append a list to the current list */
void SetItemInt(int i, int item); void AddList(TensorListBase* l);
int FindFirst(void * item); /* insert an item to the given position of the list */
void Insert(int pos, const T& item);
/* insert an item to the given position of the list */
void Insert(int pos, T&& item);
/* get the item at position i */
T& GetItem(int i) const;
/* set the item at position i */
void SetItem(int i, const T& item);
/* set the item at position i */
void SetItem(int i, T&& item);
/* find the position of the first matched item */
int FindFirst(const T& item);
/* clear the data array */
void Clear(); void Clear();
void ClearStringList();
void Sort(int itemSize, ListCompare comp); /* sort the list */
void Sort(int itemSize);
/* reverse the list */
void Reverse(); void Reverse();
/* remove the item at position i */
void Remove(int i); void Remove(int i);
XList * Copy(XMem * myMem);
/* copy the list */
TensorListBase* Copy(XMem* myMem);
/* shuffle the list */
void Shuffle(int nround = 10, int beg = -1, int len = 0); void Shuffle(int nround = 10, int beg = -1, int len = 0);
/* short */ /* short */
_XINLINE_ void * Get(int i) {return GetItem(i);}; T& operator[] (int i) {
_XINLINE_ int GetInt(int i) {return GetItemInt(i);}; return GetItem(i);
_XINLINE_ void Set(int i, void * item) {SetItem(i, item);}; };
_XINLINE_ void SetInt(int i, int item) {SetItemInt(i, item);}; T& Get(int i) { return GetItem(i); };
void Set(int i, T item) { SetItem(i, item); };
}; };
extern XList NULLList; struct XTensor;
typedef TensorListBase<int> IntList;
typedef TensorListBase<char> CharList;
typedef TensorListBase<char*> StrList;
typedef TensorListBase<long> LongList;
typedef TensorListBase<float> FloatList;
typedef TensorListBase<short> ShortList;
typedef TensorListBase<void*> XList;
typedef TensorListBase<XTensor*> TensorList;
} } /* end of the nts (NiuTrans.Tensor) namespace */
/* end of the nts (NiuTrans.Tensor) namespace */
#endif #endif // __TensorList_H__
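A brief usage sketch of the template container and its typedefs (the tensors a, b and c are assumed to exist):
TensorList inputs(2);              /* a list of XTensor* with initial capacity 2 */
inputs.Add(&a);
inputs.Add(&b);
XLink::MakeLink(&inputs, &c, MATH_SUM);
IntList dims(2);                   /* the same container specialized for int */
dims.Add(32);
dims.Add(64);
int d0 = dims[0];                  /* operator[] is shorthand for GetItem */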
...@@ -34,6 +34,11 @@ namespace nts{ ...@@ -34,6 +34,11 @@ namespace nts{
int testxmemid = 0; int testxmemid = 0;
void * recordp = NULL; void * recordp = NULL;
/*
the global manager of memory pools
*/
XMemManager GMems;
XMem * GMem; XMem * GMem;
/* constructor */ /* constructor */
...@@ -1488,4 +1493,158 @@ cublasHandle_t * XMem::GetCublasHandle() ...@@ -1488,4 +1493,158 @@ cublasHandle_t * XMem::GetCublasHandle()
#endif #endif
/* constructor */
XMemManager::XMemManager()
{
Initialize();
}
/* de-constructor */
XMemManager::~XMemManager()
{
}
/* get the size of available CPU memory */
MTYPE XMemManager::GetAvailableMemory()
{
unsigned long freeMem = 0;
#ifndef WIN32
long pages = sysconf(_SC_AVPHYS_PAGES);
long page_size = sysconf(_SC_PAGE_SIZE);
freeMem = pages * page_size;
#else
MEMORYSTATUSEX memoryStatus;
memoryStatus.dwLength = sizeof(memoryStatus);
if (GlobalMemoryStatusEx(&memoryStatus)){
freeMem = memoryStatus.ullAvailPhys;
}
#endif
return (MTYPE)freeMem;
}
/* get GPU memory size */
MTYPE XMemManager::GetAvailableGPUMemory(int devID)
{
size_t freeMem = 0;
size_t totalMem = 0;
#ifdef USE_CUDA
cudaSetDevice(devID);
if (cudaMemGetInfo(&freeMem, &totalMem) != cudaSuccess){
XPRINT(0, stderr, "cannot get GPU memory information.");
exit(1);
}
#endif
return (MTYPE)freeMem;
}
/* get buffer size */
void XMemManager::GetBufferSize(MTYPE freeMem, MTYPE * myBufSize)
{
*myBufSize = 0;
if (freeMem >= MILLION * 128){
*myBufSize = MILLION * 32;
if (freeMem >= MILLION * 256){
*myBufSize = MILLION * 64;
if (freeMem >= MILLION * 512){
*myBufSize = MILLION * 128;
if (freeMem >= MILLION * 1024) {
*myBufSize = MILLION * 256;
if (freeMem >= MILLION * 2048)
*myBufSize = MILLION * 512;
}
}
}
}
}
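The tiers double the buffer size as free memory doubles, capped at 512 * MILLION. A quick sketch with illustrative values (GMems is assumed to be initialized):
MTYPE bufSize = 0;
GMems.GetBufferSize((MTYPE)MILLION * 300, &bufSize);    /* 300M free  -> 64M buffer        */
GMems.GetBufferSize((MTYPE)MILLION * 4096, &bufSize);   /* 4096M free -> 512M buffer (cap) */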
/* initialize it and set the global memory information */
void XMemManager::Initialize()
{
srand((unsigned int)time(NULL));
Free();
/* CPUs (we do not actually care how many CPUs are in use) */
nCPUMem = 1;
MTYPE freeMem = GetAvailableMemory();
MTYPE myBufSize = 0;
GetBufferSize(freeMem, &myBufSize);
CPUMems[0].Initialize(-1, UNI_FREE, MIN_BLOCK_SIZE_FOR_MEMPOOL, MIN_BLOCK_NUM_FOR_MEMPOOL, myBufSize);
/* GPUs */
nGPUMem = 0;
#ifdef USE_CUDA
if (cudaGetDeviceCount(&nGPUMem) != cudaSuccess) {
XPRINT(0, stderr, "cannot get GPU information.");
exit(1);
}
for (int i = 0; i < nGPUMem; i++) {
MTYPE freeMem = GetAvailableGPUMemory(i);
MTYPE myBufSize = 0;
GetBufferSize(freeMem, &myBufSize);
GPUMems[i].Initialize(i, UNI_FREE, MIN_BLOCK_SIZE_FOR_MEMPOOL, MIN_BLOCK_NUM_FOR_MEMPOOL, myBufSize);
}
#endif
}
/* free it */
void XMemManager::Free()
{
for (int i = 0; i < MAX_CPU_NUM; i++)
CPUMems[i].Free();
for (int i = 0; i < MAX_GPU_NUM; i++)
GPUMems[i].Free();
}
/* get global memory pool */
XMem * XMemManager::GetMem(const int devID)
{
XMem * mem = NULL;
if (devID < 0)
mem = CPUMems;
else{
if (devID < nGPUMem)
mem = GPUMems + devID;
else
XPRINT1(0, stderr, "Cannot get the memory (%d). Please check your device id!", devID);
}
return mem;
}
/* get the settings of a global memory pool */
int XMemManager::GetMemSize(const int devID, MTYPE * myBlockSize, int * myBlockNum, MTYPE * myBufSize)
{
XMem * mem = GetMem(devID);
int result = 0;
if (mem != NULL){
*myBlockSize = mem->maxBlockSize;
*myBlockNum = mem->blockNum;
*myBufSize = mem->bufSize;
result = 1;
}
return result;
}
/* show memory information */
void XMemManager::ShowMemInfo()
{
XPRINT(1, stderr, "Memory Information:\n");
MTYPE myBlockSize, myBufSize;
int myBlockNum;
for(int i = 0; i < nCPUMem; i++){
GetMemSize(-1, &myBlockSize, &myBlockNum, &myBufSize);
XPRINT3(1, stderr, " - id:-1 CPU, blockSize:%lld, blockNum:%d, bufSize:%lld\n", (long long)myBlockSize, myBlockNum, (long long)myBufSize);
}
for(int i = 0; i < nGPUMem; i++){
GetMemSize(i, &myBlockSize, &myBlockNum, &myBufSize);
XPRINT4(1, stderr, " - id:%2d GPU, blockSize:%lld, blockNum:%d, bufSize:%lld\n", i, (long long)myBlockSize, myBlockNum, (long long)myBufSize);
}
}
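A usage sketch of the manager; the global GMems instance is initialized on construction:
XMem * cpuMem = GMems.GetMem(-1);        /* the CPU pool (devID < 0) */
XMem * gpu0   = GMems.GetMem(0);         /* a GPU pool, or NULL plus an error message */
MTYPE blockSize, bufSize;
int blockNum;
if (GMems.GetMemSize(-1, &blockSize, &blockNum, &bufSize))
    GMems.ShowMemInfo();                 /* dump the pool settings to stderr */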
} /* end of the nts (NiuTrans.Tensor) namespace */ } /* end of the nts (NiuTrans.Tensor) namespace */
...@@ -39,6 +39,12 @@ ...@@ -39,6 +39,12 @@
#include <curand.h> #include <curand.h>
#endif #endif
#ifndef WIN32
#include <unistd.h>
#else
#include <windows.h>
#endif
/* the nts (NiuTrans.Tensor) namespace */ /* the nts (NiuTrans.Tensor) namespace */
namespace nts{ namespace nts{
...@@ -53,6 +59,8 @@ typedef long long INT_64; ...@@ -53,6 +59,8 @@ typedef long long INT_64;
#define BUF_PITCH 256 #define BUF_PITCH 256
#define MIN_BLOCK_SIZE_FOR_MEMPOOL 128 * 1024 * 1024 #define MIN_BLOCK_SIZE_FOR_MEMPOOL 128 * 1024 * 1024
#define MIN_BLOCK_NUM_FOR_MEMPOOL 1024 #define MIN_BLOCK_NUM_FOR_MEMPOOL 1024
#define MAX_CPU_NUM 16
#define MAX_GPU_NUM 16
/* /*
mode of running a memory pool mode of running a memory pool
...@@ -413,6 +421,61 @@ public: ...@@ -413,6 +421,61 @@ public:
}; };
/*
a class for the management of memory
*/
class XMemManager
{
public:
/* cpu memory pool information */
XMem CPUMems[MAX_CPU_NUM];
/* number of cpu memory pools */
int nCPUMem;
/* gpu memory pool information */
XMem GPUMems[MAX_GPU_NUM];
/* number of gpu memory pools */
int nGPUMem;
public:
/* constructor */
XMemManager();
/* de-constructor */
~XMemManager();
/* get the size of available CPU memory */
MTYPE GetAvailableMemory();
/* get GPU memory size */
MTYPE GetAvailableGPUMemory(int devID);
/* get buffer size */
void GetBufferSize(MTYPE freeMem, MTYPE * myBufSize);
/* initialize it and set the global memory information */
void Initialize();
/* free it */
void Free();
/* get global memory pool */
XMem * GetMem(const int devID);
/* get the settings of a global memory pool */
int GetMemSize(const int devID, MTYPE * myBlockSize, int * myBlockNum, MTYPE * myBufSize);
/* show memory information */
void ShowMemInfo();
};
/* the global manager of memory pools */
extern XMemManager GMems;
extern XMem * GMem; extern XMem * GMem;
extern int testxmemid; extern int testxmemid;
......
...@@ -77,6 +77,12 @@ const char * GetOPName(int type) ...@@ -77,6 +77,12 @@ const char * GetOPName(int type)
return "M_POWER"; return "M_POWER";
else if (type == MATH_SCALEANDSHIFT) else if (type == MATH_SCALEANDSHIFT)
return "M_SCALEANDSHIFT"; return "M_SCALEANDSHIFT";
else if (type == MATH_SCALE)
return "M_SCALE";
else if (type == MATH_DESCALE)
return "M_DESCALE";
else if (type == MATH_SHIFT)
return "M_SHIFT";
else if (type == MATH_MULANDSHIFT) else if (type == MATH_MULANDSHIFT)
return "M_OPERATION"; return "M_OPERATION";
else if (type == MATH_SIGN) else if (type == MATH_SIGN)
...@@ -111,6 +117,8 @@ const char * GetOPName(int type) ...@@ -111,6 +117,8 @@ const char * GetOPName(int type)
return "M_COPYVALUES"; return "M_COPYVALUES";
else if (type == MOVEMENT_GATHER) else if (type == MOVEMENT_GATHER)
return "M_GATHER"; return "M_GATHER";
else if (type == MOVEMENT_DROPOUTWITHINDEX)
return "M_DROPOUTWITHINDEX";
else if (type == SHAPE_CONCATENATE) else if (type == SHAPE_CONCATENATE)
return "S_CONCATENATE"; return "S_CONCATENATE";
else if (type == SHAPE_MERGE) else if (type == SHAPE_MERGE)
...@@ -152,6 +160,10 @@ const char * GetOPName(int type) ...@@ -152,6 +160,10 @@ const char * GetOPName(int type)
else if (type == FUNC_SOFTMAX) else if (type == FUNC_SOFTMAX)
return "F_SOFTMAX"; return "F_SOFTMAX";
} }
else if ((type & LOSS_BASE) != 0) {
if (type == LOSS_CROSSENTROPY)
return "L_CROSSENTROPY";
}
return "NULL"; return "NULL";
} }
......
...@@ -58,7 +58,11 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -58,7 +58,11 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_POWER MATH_NORMALIZE + 1 #define MATH_POWER MATH_NORMALIZE + 1
#define MATH_SCALEANDSHIFT MATH_POWER + 1 #define MATH_SCALEANDSHIFT MATH_POWER + 1
#define MATH_MULANDSHIFT MATH_SCALEANDSHIFT + 1 #define MATH_MULANDSHIFT MATH_SCALEANDSHIFT + 1
#define MATH_SIGN MATH_MULANDSHIFT + 1 #define MATH_SCALE MATH_MULANDSHIFT + 1
#define MATH_DESCALE MATH_SCALE + 1
#define MATH_SHIFT MATH_DESCALE + 1
#define MATH_MOD MATH_SHIFT + 1
#define MATH_SIGN MATH_MOD + 1
#define MATH_SUB MATH_SIGN + 1 #define MATH_SUB MATH_SIGN + 1
#define MATH_SUBDIM MATH_SUB + 1 #define MATH_SUBDIM MATH_SUB + 1
#define MATH_SUM MATH_SUBDIM + 1 #define MATH_SUM MATH_SUBDIM + 1
...@@ -81,8 +85,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -81,8 +85,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MOVEMENT_COPYINDEXED MOVEMENT + 1 #define MOVEMENT_COPYINDEXED MOVEMENT + 1
#define MOVEMENT_COPYVALUES MOVEMENT_COPYINDEXED + 1 #define MOVEMENT_COPYVALUES MOVEMENT_COPYINDEXED + 1
#define MOVEMENT_GATHER MOVEMENT_COPYVALUES + 1 #define MOVEMENT_GATHER MOVEMENT_COPYVALUES + 1
#define MOVEMENT_DROPOUTWITHINDEX MOVEMENT_GATHER + 1
#define SHAPE MOVEMENT_GATHER + 1 #define SHAPE MOVEMENT_DROPOUTWITHINDEX + 1
#define SHAPE_CONCATENATE SHAPE + 1 #define SHAPE_CONCATENATE SHAPE + 1
#define SHAPE_MERGE SHAPE_CONCATENATE + 1 #define SHAPE_MERGE SHAPE_CONCATENATE + 1
#define SHAPE_MERGE_LIST SHAPE_MERGE + 1 #define SHAPE_MERGE_LIST SHAPE_MERGE + 1
...@@ -108,6 +113,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -108,6 +113,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define FUNC_SIGMOID FUNC_RECTIFY + 1 #define FUNC_SIGMOID FUNC_RECTIFY + 1
#define FUNC_SOFTMAX FUNC_SIGMOID + 1 #define FUNC_SOFTMAX FUNC_SIGMOID + 1
#define LOSS_BASE FUNCTION_BASE * 2
#define LOSS_CROSSENTROPY LOSS_BASE + 1
/* get operator name */ /* get operator name */
const char * GetOPName(int type); const char * GetOPName(int type);
......
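Because each *_BASE constant starts a contiguous ID range, an operator family can be tested with a bitmask before the exact name is resolved. A minimal sketch, assuming the base constants are distinct powers of two as the dispatch in GetOPName implies:
int op = LOSS_CROSSENTROPY;
bool isLoss = (op & LOSS_BASE) != 0;   /* family test by bitmask */
const char * name = GetOPName(op);     /* yields "L_CROSSENTROPY" */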
...@@ -146,7 +146,7 @@ run a set of jobs in parallel ...@@ -146,7 +146,7 @@ run a set of jobs in parallel
>> jobArgs - the list of arguments for each job >> jobArgs - the list of arguments for each job
>> sleepTime - time to sleep (in ms) for each round >> sleepTime - time to sleep (in ms) for each round
*/ */
void XPRunner::Run(XList * jobFunctions, XList * jobArgs, float sleepTime) void XPRunner::Run(TensorList * jobFunctions, TensorList * jobArgs, float sleepTime)
{ {
if(threadNum <= 0){ if(threadNum <= 0){
XPRINT(1, stderr, "Error! No threads were created!\n"); XPRINT(1, stderr, "Error! No threads were created!\n");
...@@ -195,7 +195,7 @@ void XPRunner::Run(XList * jobFunctions, XList * jobArgs, float sleepTime) ...@@ -195,7 +195,7 @@ void XPRunner::Run(XList * jobFunctions, XList * jobArgs, float sleepTime)
TFunction function = (TFunction)jobFunctions->GetItem(jobArgs->count - c); TFunction function = (TFunction)jobFunctions->GetItem(jobArgs->count - c);
/* the arguments that are passed to the function */ /* the arguments that are passed to the function */
volatile XList * args = (XList*)jobArgs->GetItem(jobArgs->count - c); volatile TensorList * args = (TensorList*)jobArgs->GetItem(jobArgs->count - c);
/* thread */ /* thread */
XThread * thread = threads + availableThreads[i]; XThread * thread = threads + availableThreads[i];
......
...@@ -106,7 +106,7 @@ public: ...@@ -106,7 +106,7 @@ public:
void KillThreads(); void KillThreads();
/* run a set of jobs in parallel */ /* run a set of jobs in parallel */
void Run(XList * jobFunctions, XList * jobArgs, float sleepTime = 0); void Run(TensorList * jobFunctions, TensorList * jobArgs, float sleepTime = 0);
/* get the number of parallel jobs to run */ /* get the number of parallel jobs to run */
int GetJobNum(int size); int GetJobNum(int size);
......
...@@ -42,7 +42,7 @@ job item used in queues ...@@ -42,7 +42,7 @@ job item used in queues
JobQueueNode::JobQueueNode() JobQueueNode::JobQueueNode()
{ {
job = NULL; job = NULL;
args = new XList(1); args = new TensorList(1);
} }
/* de-constructor */ /* de-constructor */
...@@ -67,7 +67,7 @@ XQueue::XQueue(int mySize) ...@@ -67,7 +67,7 @@ XQueue::XQueue(int mySize)
head = 0; head = 0;
tail = 0; tail = 0;
isJobQueue = false; isJobQueue = false;
jobDequeuerArgs = new XList(1); jobDequeuerArgs = new TensorList(1);
jobDequeuerBreak = false; jobDequeuerBreak = false;
runningJobCount = 0; runningJobCount = 0;
jobStream = NULL; jobStream = NULL;
...@@ -188,8 +188,10 @@ void XQueue::RunJobConsumer(int jobDevID) ...@@ -188,8 +188,10 @@ void XQueue::RunJobConsumer(int jobDevID)
isJobQueue = true; isJobQueue = true;
jobDequeuerArgs->Clear(); jobDequeuerArgs->Clear();
jobDequeuerArgs->Add(this);
jobDequeuerArgs->Add(jobDevID >= 0 ? devids + jobDevID : &cpuid); // warning: the casts below smuggle non-tensor pointers through a TensorList and may cause unknown errors
jobDequeuerArgs->Add((XTensor*)this);
jobDequeuerArgs->Add(jobDevID >= 0 ? (XTensor*)(devids + jobDevID) : (XTensor*)&cpuid);
jobDequeuer.function = (TFunction)DequeueJobs; jobDequeuer.function = (TFunction)DequeueJobs;
jobDequeuer.argv = jobDequeuerArgs; jobDequeuer.argv = jobDequeuerArgs;
...@@ -211,7 +213,7 @@ void XQueue::StopJobConsumer() ...@@ -211,7 +213,7 @@ void XQueue::StopJobConsumer()
} }
/* add a job item to process */ /* add a job item to process */
void XQueue::EnqueueJob(void * job, XList * jobArgs) void XQueue::EnqueueJob(void * job, TensorList * jobArgs)
{ {
MUTEX_LOCK(jobQueueMutex); MUTEX_LOCK(jobQueueMutex);
runningJobCount++; runningJobCount++;
...@@ -225,7 +227,7 @@ void XQueue::EnqueueJob(void * job, XList * jobArgs) ...@@ -225,7 +227,7 @@ void XQueue::EnqueueJob(void * job, XList * jobArgs)
} }
/* job item consumer */ /* job item consumer */
void XQueue::DequeueJobs(XList * args) void XQueue::DequeueJobs(TensorList * args)
{ {
CheckNTErrors((args->count == 2), "Illegal arguments!"); CheckNTErrors((args->count == 2), "Illegal arguments!");
......
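A usage sketch of the queue with the new list type; MyJob is a hypothetical TFunction and x, y are illustrative tensors:
/* void MyJob(volatile TensorList * args) { ... } */
XQueue queue(64);
queue.RunJobConsumer(0);                 /* start the consumer on device 0 */
TensorList * jobArgs = new TensorList(2);
jobArgs->Add(&x);
jobArgs->Add(&y);
queue.EnqueueJob((void*)(TFunction)MyJob, jobArgs);
queue.StopJobConsumer();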
...@@ -52,7 +52,7 @@ public: ...@@ -52,7 +52,7 @@ public:
void * job; void * job;
/* arguments of the job */ /* arguments of the job */
XList * args; TensorList * args;
public: public:
/* constructor */ /* constructor */
...@@ -102,7 +102,7 @@ private: ...@@ -102,7 +102,7 @@ private:
XThread jobDequeuer; XThread jobDequeuer;
/* argument list of jobDequeuer */ /* argument list of jobDequeuer */
XList * jobDequeuerArgs; TensorList * jobDequeuerArgs;
/* indicates whether jobDequeuer stops */ /* indicates whether jobDequeuer stops */
bool jobDequeuerBreak; bool jobDequeuerBreak;
...@@ -141,11 +141,11 @@ public: ...@@ -141,11 +141,11 @@ public:
void StopJobConsumer(); void StopJobConsumer();
/* add a job item to process */ /* add a job item to process */
void EnqueueJob(void * job, XList * jobArgs); void EnqueueJob(void * job, TensorList * jobArgs);
/* job item consumer */ /* job item consumer */
static static
void DequeueJobs(XList * args); void DequeueJobs(TensorList * args);
/* get the break flag */ /* get the break flag */
bool GetJobBreak(); bool GetJobBreak();
......
...@@ -255,6 +255,10 @@ public: ...@@ -255,6 +255,10 @@ public:
static static
bool IsSameShaped(const XTensor * a, const XTensor * b, const XTensor * c); bool IsSameShaped(const XTensor * a, const XTensor * b, const XTensor * c);
/* judge whether b has the reduced shape of a along the given dimension */
static
bool IsReduceShaped(const XTensor * a, const XTensor * b, int dim);
/* set the size of each dimension */ /* set the size of each dimension */
void SetDim(int * myDimSize); void SetDim(int * myDimSize);
...@@ -447,29 +451,57 @@ void InitTensor(XTensor * tensor, ...@@ -447,29 +451,57 @@ void InitTensor(XTensor * tensor,
const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const float myDenseRatio = 1.0F, const int myDevID = -1, XMem * myMem = NULL); const float myDenseRatio = 1.0F, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense XTensor V2 */
void InitTensorV2(XTensor * tensor,
const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1);
/* initialize a dense vector */ /* initialize a dense vector */
void InitTensor1D(XTensor * tensor, const int num, void InitTensor1D(XTensor * tensor, const int num,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL); const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense vector V2 */
void InitTensor1DV2(XTensor * tensor, const int num,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
/* initialize a dense matrix */ /* initialize a dense matrix */
void InitTensor2D(XTensor * tensor, const int rowNum, const int colNum, void InitTensor2D(XTensor * tensor, const int rowNum, const int colNum,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL); const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense matrix V2 */
void InitTensor2DV2(XTensor * tensor, const int rowNum, const int colNum,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
/* initialize a dense 3d tensor */ /* initialize a dense 3d tensor */
void InitTensor3D(XTensor * tensor, const int d0, const int d1, const int d2, void InitTensor3D(XTensor * tensor, const int d0, const int d1, const int d2,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL); const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense 3d tensor V2 */
void InitTensor3DV2(XTensor * tensor, const int d0, const int d1, const int d2,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
/* initialize a dense 4d tensor */ /* initialize a dense 4d tensor */
void InitTensor4D(XTensor * tensor, const int d0, const int d1, const int d2, const int d3, void InitTensor4D(XTensor * tensor, const int d0, const int d1, const int d2, const int d3,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL); const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense 4d tensor V2 */
void InitTensor4DV2(XTensor * tensor, const int d0, const int d1, const int d2, const int d3,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
/* initialize a dense 5d tensor */ /* initialize a dense 5d tensor */
void InitTensor5D(XTensor * tensor, const int d0, const int d1, const int d2, const int d3, const int d4, void InitTensor5D(XTensor * tensor, const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL); const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense 5d tensor V2 */
void InitTensor5DV2(XTensor * tensor, const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
/* initialize a tensor with a reference tensor */ /* initialize a tensor with a reference tensor */
void InitTensor(XTensor * tensor, const XTensor * reference); void InitTensor(XTensor * tensor, const XTensor * reference);
/* initialize a tensor with a reference tensor */
void InitTensorV2(XTensor * tensor, const XTensor * reference);
/* initialize a tensor on the CPU with a reference tensor */ /* initialize a tensor on the CPU with a reference tensor */
void InitTensorOnCPU(XTensor * tensor, const XTensor * reference); void InitTensorOnCPU(XTensor * tensor, const XTensor * reference);
...@@ -480,38 +512,72 @@ XTensor * NewTensor(); ...@@ -480,38 +512,72 @@ XTensor * NewTensor();
XTensor * NewTensor(const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT, XTensor * NewTensor(const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const float myDenseRatio = 1.0F, const int myDevID = -1, XMem * myMem = NULL); const float myDenseRatio = 1.0F, const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense XTensor V2 */
XTensor * NewTensorV2(const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1);
/* generate a XTensor which allocates data on the buffer */ /* generate a XTensor which allocates data on the buffer */
XTensor * NewTensorBuf(const int myOrder, const int * myDimSize, XTensor * NewTensorBuf(const int myOrder, const int * myDimSize,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const float myDenseRatio = 1.0F, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const float myDenseRatio = 1.0F,
const int myDevID = -1, XMem * myMem = NULL); const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense XTensor which allocates data on the buffer V2 */
XTensor * NewTensorBufV2(const int myOrder, const int * myDimSize,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
/* generate a XTensor which allocates data on the buffer */ /* generate a XTensor which allocates data on the buffer */
XTensor * NewTensorBuf(const XTensor * reference, int devID, XMem * myMem); XTensor * NewTensorBuf(const XTensor * reference, int devID, XMem * myMem);
/* generate a XTensor which allocates data on the buffer V2 */
XTensor * NewTensorBufV2(const XTensor * reference, int devID);
/* generate a dense vector */ /* generate a dense vector */
XTensor * NewTensor1D(const int num, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XTensor * NewTensor1D(const int num, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1,
XMem * myMem = NULL); XMem * myMem = NULL);
/* generate a dense vector V2 */
XTensor * NewTensor1DV2(const int num, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
/* generate a dense matrix */ /* generate a dense matrix */
XTensor * NewTensor2D(const int rowNum, const int colNum, XTensor * NewTensor2D(const int rowNum, const int colNum,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, XMem * myMem = NULL); const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense matrix V2 */
XTensor * NewTensor2DV2(const int rowNum, const int colNum,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1);
/* generate a dense 3d tensor */ /* generate a dense 3d tensor */
XTensor * NewTensor3D(const int d0, const int d1, const int d2, XTensor * NewTensor3D(const int d0, const int d1, const int d2,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, XMem * myMem = NULL); const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense 3d tensor V2 */
XTensor * NewTensor3DV2(const int d0, const int d1, const int d2,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1);
/* generate a dense 4d tensor */ /* generate a dense 4d tensor */
XTensor * NewTensor4D(const int d0, const int d1, const int d2, const int d3, XTensor * NewTensor4D(const int d0, const int d1, const int d2, const int d3,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, XMem * myMem = NULL); const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense 4d tensor V2 */
XTensor * NewTensor4DV2(const int d0, const int d1, const int d2, const int d3,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1);
/* generate a dense 5d tensor */ /* generate a dense 5d tensor */
XTensor * NewTensor5D(const int d0, const int d1, const int d2, const int d3, const int d4, XTensor * NewTensor5D(const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, XMem * myMem = NULL); const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense 5d tensor V2 */
XTensor * NewTensor5DV2(const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1);
/* generate a copy of XTensor (with a reference to a given tensor) */ /* generate a copy of XTensor (with a reference to a given tensor) */
XTensor * NewTensor(const XTensor * a, bool isFilledData = true); XTensor * NewTensor(const XTensor * a, bool isFilledData = true);
......
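A brief sketch of the V2 creation interfaces, which drop the memory-pool argument of the original versions:
XTensor t;
InitTensor2DV2(&t, 16, 32);                           /* X_FLOAT on the CPU by default */
XTensor * g = NewTensor3DV2(8, 16, 32, X_FLOAT, 0);   /* a dense 3d tensor on GPU 0 */
delete g;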
...@@ -85,7 +85,7 @@ namespace nts{ ...@@ -85,7 +85,7 @@ namespace nts{
#endif #endif
typedef void (*TFunction) (volatile XList*); typedef void (*TFunction) (volatile TensorList*);
/* /*
This is a class that wraps the standard implementation of threading This is a class that wraps the standard implementation of threading
...@@ -133,7 +133,7 @@ public: ...@@ -133,7 +133,7 @@ public:
/* arguments (for the function to run) */ /* arguments (for the function to run) */
volatile volatile
XList * argv; TensorList * argv;
/* a flag to break */ /* a flag to break */
volatile volatile
......
...@@ -97,4 +97,5 @@ ...@@ -97,4 +97,5 @@
#include "utilities/XMatrixSegment.h" #include "utilities/XMatrixSegment.h"
#include "utilities/FlushToMem.h" #include "utilities/FlushToMem.h"
#include "../function/DropoutWithIndex.h"
#endif // __CHEADER_H__ #endif // __CHEADER_H__
...@@ -218,4 +218,55 @@ XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim) ...@@ -218,4 +218,55 @@ XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim)
return c; return c;
} }
/*
element-wise division of two tensors
c(i) = a(i)/b(i) + \alpha * c(i)
where i is the index of the element
>> a - tensor a
>> b - tensor b
>> c - result tensor
>> alpha - the coefficient
>> leadingDim - the dimension along which we perform broadcasting
>> requireLink - whether to add the operation to the network (for backward propagation)
*/
void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadingDim, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
int n = GetDivDimIndex(a, b);
if (n == -1) {
CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "Unmatched leading dimensions in element-wise division!");
/* call _Div function */
_Div(&a, &b, &c, 0, leadingDim);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIV);
XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHeadInt(&c, leadingDim);
}
}
else if (n >= 0 && n < a.order) {
/* call _DivDim function */
_DivDim(&a, &b, &c, n, alpha);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
}
}
else {
ShowNTErrors("Something is wrong!");
}
}
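A usage sketch of the in-place interface (shapes are illustrative and data initialization is omitted); c is (re)initialized inside Div if its shape does not match a:
XTensor a, b, c;
InitTensor2DV2(&a, 4, 8);
InitTensor2DV2(&b, 4, 8);
Div(a, b, c);                    /* c(i) = a(i)/b(i), no links recorded */
Div(a, b, c, 0.0F, 0, true);     /* the same computation, plus MATH_DIV links for backward */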
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
...@@ -49,6 +49,13 @@ where i is the index of the element ...@@ -49,6 +49,13 @@ where i is the index of the element
*/ */
XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha = 0.0, int leadingDim = 0); XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha = 0.0, int leadingDim = 0);
/*
element-wise division of two tensors:
c(i) = a(i)/b(i) + \alpha * c(i)
where i is the index of the element
*/
void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha = 0.0, int leadingDim = 0, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __DIV_H__ #endif // __DIV_H__
\ No newline at end of file
...@@ -171,4 +171,35 @@ XTensor DivDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha) ...@@ -171,4 +171,35 @@ XTensor DivDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha)
return c; return c;
} }
/*
tensor division
c = a / b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put the result
>> n - the dimension index
>> alpha - the scaling factor
>> requireLink - whether to add the operation to the network (for backward propagation)
*/
void DivDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE alpha, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
/* call _DivDim function */
_DivDim(&a, &b, &c, n, alpha);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
}
}
} }
...@@ -53,6 +53,14 @@ we make a new tensor c to keep the result and return it ...@@ -53,6 +53,14 @@ we make a new tensor c to keep the result and return it
*/ */
XTensor DivDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha = (DTYPE)0.0); XTensor DivDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha = (DTYPE)0.0);
/*
tensor division of two tensors:
c = a / b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
*/
void DivDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE alpha = (DTYPE)0.0, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __DIVDIM_H__ #endif // __DIVDIM_H__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2019-04-24
* I'll attend several conferences and workshops in the following weeks -
* busy days :(
*/
#include "../../XTensor.h"
#include "../../XName.h"
#include "../../XUtility.h"
#include "Mask.h"
#include "Mask.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
mask entries of a given tensor:
c(i) = a(i) if mask(i) is non-zero
c(i) = alpha if mask(i) = 0
where i is the index of the element
*/
void _Mask(const XTensor * a, const XTensor * mask, XTensor * c, DTYPE alpha)
{
CheckNTErrors(a && mask && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == mask->unitNum && a->unitNum == c->unitNum,
"Unmatched tensors in addition!");
CheckNTErrors(mask->dataType == X_INT, "The mask tensor must be in X_INT!")
//CheckNTErrors(a->dataType == mask->dataType && a->dataType == c->dataType,
// "Unmatched tensors in addition!");
if (a->devID >= 0 || mask->devID >= 0 || c->devID >= 0) {
#ifdef USE_CUDA
if (a == c) {
int P2PAccesible = 0;
#ifdef CUDA_UVA
cudaDeviceCanAccessPeer(&P2PAccesible, a->devID, mask->devID);
#endif
if ((a->devID < 0 && mask->devID >= 0) ||
(a->devID >= 0 && mask->devID < 0) ||
(a->devID >= 0 && mask->devID >= 0 && a->devID != mask->devID && !P2PAccesible))
{
ShowNTErrors("Cannot run this method on multiple devices simultaneously!");
}
else
_CudaMask(a, mask, c, alpha);
}
else
_CudaMask(a, mask, c, alpha);
#endif
}
else {
if (!a->isSparse && !mask->isSparse) {
CheckNTErrors(!c->isSparse, "Illegal use of sparse tensor in masking!");
if (a->dataType == DEFAULT_DTYPE &&
mask->dataType == X_INT &&
c->dataType == DEFAULT_DTYPE)
{
DTYPE * ap = (DTYPE*)a->data;
int * maskp = (int*)mask->data;
DTYPE * cp = (DTYPE*)c->data;
/* unrolling */
int num = a->unitNum;
if (num % 2 == 0) {
for (int i = 0; i < num; i += 2) {
if (maskp[i] == 0) {
cp[i] = alpha;
}
else {
cp[i] = ap[i];
}
if (maskp[i + 1] == 0) {
cp[i + 1] = alpha;
}
else {
cp[i + 1] = ap[i + 1];
}
}
}
else {
for (int i = 0; i < num; i++) {
if (maskp[i] == 0) {
cp[i] = alpha;
}
else {
cp[i] = ap[i];
}
}
}
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
}
/*
mask entries of a given tensor (on site):
a(i) = a(i) if mask(i) is non-zero
a(i) = alpha if mask(i) = 0
where i is the index of the element
*/
void _MaskMe(XTensor * a, const XTensor * mask, DTYPE alpha)
{
_Mask(a, mask, a, alpha);
}
/*
mask entries of a given tensor (return an XTensor structure):
a(i) = a(i) if mask(i) is non-zero
a(i) = alpha if mask(i) = 0
where i is the index of the element
*/
XTensor Mask(const XTensor &a, const XTensor &mask, DTYPE alpha)
{
XTensor c(&a);
c.SetTMPFlag();
/* call _Mask function */
_Mask(&a, &mask, &c, alpha);
/* tensor connections */
//XLink::MakeLink(&a, &mask, &c, MATH_SUM);
//XLink::AddParamToHead(&c, alpha);
// TODO!!
ShowNTErrors("TODO!");
return c;
}
}
\ No newline at end of file
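A usage sketch of the masking routines (data initialization is omitted); the mask must be an X_INT tensor of the same size:
XTensor a, mask, c;
InitTensor1DV2(&a, 6);
InitTensor1DV2(&mask, 6, X_INT);
InitTensor1DV2(&c, 6);
_Mask(&a, &mask, &c, 0.0F);      /* c(i) = a(i) where mask(i) != 0, otherwise 0.0 */
_MaskMe(&a, &mask, 0.0F);        /* the on-site variant writes the result back into a */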
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2019-04-24
* I'll attend several conferences and workshops in the following weeks -
* busy days :(
*/
#include "../../XDevice.h"
#include "../../XUtility.h"
#include "Sub.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
mask entries of a given tensor (CUDA Kernel)
c(i) = a(i) if mask(i) is non-zero
c(i) = alpha if mask(i) = 0
>> a - a tensor
>> mask - the mask tensor
>> c - where we put the masked a
>> size - the size of a/mask/c
>> alpha - the value written to masked-out entries
*/
__global__
void KernelMASK(DTYPE * a, int * mask, DTYPE * c, int size, DTYPE alpha)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
if (mask[i] == 0) {
c[i] = alpha;
}
else {
c[i] = a[i];
}
}
}
/*
mask entries of a given tensor (cuda version)
>> a - a tensor
>> mask - mask tensor
>> c - where we put masked a
>> alpha - the value written to masked-out entries
*/
void _CudaMask(const XTensor * a, const XTensor * mask, XTensor * c, DTYPE alpha)
{
CheckNTErrors(a && mask && c, "Empty tensor input!");
CheckNTErrors((a->unitNum == mask->unitNum && a->unitNum == c->unitNum),
"Unmatched tensors in addition!");
CheckNTErrors(mask->dataType == X_INT, "The mask tensor must be in X_INT!")
//CheckNTErrors((a->dataType == mask->dataType && a->dataType == c->dataType),
// "Unmatched tensors in addition!");
CheckNTErrors((a->devID == mask->devID && a->devID == c->devID),
"The tensors must be on the same!");
int devIDBackup = XDevice::GetGPUDevice();
XDevice::SetGPUDevice(a->devID);
if (!a->isSparse && !mask->isSparse) {
CheckNTErrors(!c->isSparse, "Illegal use of sparse matrix in masking!");
if (a->dataType == DEFAULT_DTYPE &&
mask->dataType == X_INT &&
c->dataType == DEFAULT_DTYPE)
{
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
KernelMASK<<<blocks, threads>>>((DTYPE*)a->data, (int*)mask->data, (DTYPE*)c->data, a->unitNum, alpha);
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
XDevice::SetGPUDevice(devIDBackup);
}
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2019-04-24
* I'll attend several conferences and workshops in the following weeks -
* busy days :(
*/
#ifndef __MASK_CUH__
#define __MASK_CUH__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* mask entries of a given tensor (cuda version) */
void _CudaMask(const XTensor * a, const XTensor * mask, XTensor * c = NULL, DTYPE alpha = (DTYPE)1.0);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif // __MASK_CUH__
\ No newline at end of file
...@@ -108,9 +108,9 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -108,9 +108,9 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
cBlockNum *= b->dimSizeRDI[i]; cBlockNum *= b->dimSizeRDI[i];
} }
XList * aList = new XList(10); TensorList * aList = new TensorList(10);
XList * bList = new XList(10); TensorList * bList = new TensorList(10);
XList * cList = new XList(10); TensorList * cList = new TensorList(10);
int aDimSize[2] = { -a->dimSizeRDI[1], a->dimSizeRDI[0] }; int aDimSize[2] = { -a->dimSizeRDI[1], a->dimSizeRDI[0] };
int bDimSize[2] = { -b->dimSizeRDI[1], b->dimSizeRDI[0] }; int bDimSize[2] = { -b->dimSizeRDI[1], b->dimSizeRDI[0] };
int cDimSize[2] = { -c->dimSizeRDI[1], c->dimSizeRDI[0] }; int cDimSize[2] = { -c->dimSizeRDI[1], c->dimSizeRDI[0] };
...@@ -202,6 +202,42 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -202,6 +202,42 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
delete cList; delete cList;
} }
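/*
check whether the shape of tensor c matches the result of trans(a) * trans(b);
the shape computation below mirrors the one performed in _MatrixMul
*/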
bool CheckMMulShape(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c)
{
if (!(a && b && c))
return false;
if(!(a->dataType == b->dataType && a->dataType == c->dataType))
return false;
if (!(a->order >= 2 && b->order >= 2 && c->order >= 2))
return false;
int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
int order = a->order + b->order - 2;
int sub = 0;
int * dimSize = new int[order];
for (int i = 2; i < a->order; i++)
dimSize[sub++] = a->dimSizeRDI[a->order + 1 - i];
for (int i = 2; i < b->order; i++)
dimSize[sub++] = b->dimSizeRDI[b->order + 1 - i];
dimSize[sub++] = an;
dimSize[sub++] = bm;
for (int i = 0; i < order; i++) {
if (dimSize[i] != c->dimSize[i]) {
delete[] dimSize;
return false;
}
}
delete[] dimSize;
return true;
}
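A minimal sketch of what the checker accepts, using 2-D inputs created with the V2 constructors introduced in this change:
XTensor * a = NewTensor2DV2(3, 4);
XTensor * b = NewTensor2DV2(4, 5);
XTensor * c = NewTensor2DV2(3, 5);
XTensor * d = NewTensor2DV2(3, 6);
bool ok  = CheckMMulShape(a, X_NOTRANS, b, X_NOTRANS, c);   /* true:  (3x4)(4x5) -> 3x5 */
bool bad = CheckMMulShape(a, X_NOTRANS, b, X_NOTRANS, d);   /* false: d has the wrong shape */
delete a; delete b; delete c; delete d;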
/* /*
matrix multiplication (return an XTensor structure) c = trans(a) * trans(b) * alpha matrix multiplication (return an XTensor structure) c = trans(a) * trans(b) * alpha
make a new tensor to keep the result and return it make a new tensor to keep the result and return it
...@@ -266,6 +302,53 @@ XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, ...@@ -266,6 +302,53 @@ XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
return c; return c;
} }
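/*
matrix multiplication (with the result tensor provided by the caller) c = trans(a) * trans(b) * alpha
>> c - where we put the result; it is (re)initialized if its shape does not match trans(a) * trans(b)
>> requireLink - whether to add the operation to the network (for backward propagation)
*/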
void MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
const XTensor &b, MATRIX_TRANS_TYPE transposedB, XTensor &c,
DTYPE alpha, XPRunner * parallelRunner, bool requireLink)
{
CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have an order >= 2!");
if (!c.isInit || !CheckMMulShape(&a, transposedA, &b, transposedB, &c)) {
int an = transposedA == X_TRANS ? a.dimSizeRDI[0] : a.dimSizeRDI[1];
int am = transposedA == X_TRANS ? a.dimSizeRDI[1] : a.dimSizeRDI[0];
int bn = transposedB == X_TRANS ? b.dimSizeRDI[0] : b.dimSizeRDI[1];
int bm = transposedB == X_TRANS ? b.dimSizeRDI[1] : b.dimSizeRDI[0];
CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
int order = a.order + b.order - 2;
int sub = 0;
int * dimSize = new int[order];
for (int i = 2; i < a.order; i++)
dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
for (int i = 2; i < b.order; i++)
dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
dimSize[sub++] = an;
dimSize[sub++] = bm;
float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
InitTensor(&c, order, dimSize, a.dataType, dr, a.devID, a.mem);
/* destroy variables */
delete[] dimSize;
}
/* call _MatrixMul function */
_MatrixMul(&a, transposedA, &b, transposedB, &c, alpha, 0, parallelRunner);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL);
XLink::AddParamToHeadTrans(&c, transposedA);
XLink::AddParamToHeadTrans(&c, transposedB);
XLink::AddParamToHead(&c, alpha);
}
}
/* /*
matrix multiplication with no transposition c = a * b * alpha matrix multiplication with no transposition c = a * b * alpha
>> a - tensor a >> a - tensor a
...@@ -316,6 +399,52 @@ XTensor MatrixMul(const XTensor &a, const XTensor &b, ...@@ -316,6 +399,52 @@ XTensor MatrixMul(const XTensor &a, const XTensor &b,
return c; return c;
} }
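/*
matrix multiplication with no transposition (with the result tensor provided by the caller) c = a * b * alpha
>> c - where we put the result; it is (re)initialized if its shape does not match a * b
>> requireLink - whether to add the operation to the network (for backward propagation)
*/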
void MatrixMul(const XTensor &a, const XTensor &b, XTensor &c,
DTYPE alpha, XPRunner * parallelRunner, bool requireLink)
{
CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have an order >= 2!");
if (!c.isInit || !CheckMMulShape(&a, X_NOTRANS, &b, X_NOTRANS, &c)) {
int an = a.dimSizeRDI[1];
int am = a.dimSizeRDI[0];
int bn = b.dimSizeRDI[1];
int bm = b.dimSizeRDI[0];
CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
int order = a.order + b.order - 2;
int sub = 0;
int * dimSize = new int[order];
for (int i = 2; i < a.order; i++)
dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
for (int i = 2; i < b.order; i++)
dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
dimSize[sub++] = an;
dimSize[sub++] = bm;
float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
InitTensor(&c, order, dimSize, a.dataType, dr, a.devID, a.mem);
/* destroy variables */
delete[] dimSize;
}
/* call _MatrixMul function */
_MatrixMul(&a, X_NOTRANS, &b, X_NOTRANS, &c, alpha, 0, parallelRunner);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL);
XLink::AddParamToHeadTrans(&c, X_NOTRANS);
XLink::AddParamToHeadTrans(&c, X_NOTRANS);
XLink::AddParamToHead(&c, alpha);
}
}
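A usage sketch of the two in-place overloads (shapes illustrative, data initialization omitted):
XTensor a, b, c;
InitTensor2DV2(&a, 3, 4);
InitTensor2DV2(&b, 4, 5);
MatrixMul(a, b, c);                                           /* c = a * b, no links recorded */
MatrixMul(a, X_NOTRANS, b, X_NOTRANS, c, 1.0F, NULL, true);   /* the same, plus MATH_MATRIXMUL links */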
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -59,10 +59,16 @@ Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x ...@@ -59,10 +59,16 @@ Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x
XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB, XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL); DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
void MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
XTensor &c, DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL, bool requireLink = false);
/* matrix multiplication with no transposition c = a * b * alpha*/ /* matrix multiplication with no transposition c = a * b * alpha*/
XTensor MatrixMul(const XTensor &a, const XTensor &b, XTensor MatrixMul(const XTensor &a, const XTensor &b,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL); DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
void MatrixMul(const XTensor &a, const XTensor &b, XTensor &c,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -38,17 +38,23 @@ argument5: matrix a ...@@ -38,17 +38,23 @@ argument5: matrix a
argument6: matrix b argument6: matrix b
argument7: matrix c (c=a*b*\alpha + c*beta) argument7: matrix c (c=a*b*\alpha + c*beta)
*/ */
void _MatrixMul2DMultiTheading(XList * args) void _MatrixMul2DMultiTheading(TensorList * args)
{ {
int x1 = *(int*)args->GetItem(0); CheckNTErrors(args->count == 2, "invalid argument number!");
int y1 = *(int*)args->GetItem(1); IntList * indexArgs = (IntList*)args->GetItem(0);
int x2 = *(int*)args->GetItem(2); TensorList * matrixArgs = (TensorList*)args->GetItem(1);
int y2 = *(int*)args->GetItem(3); CheckNTErrors(indexArgs->count == 4, "invalid argument number!");
XTensor * a = (XTensor*)args->GetItem(4); CheckNTErrors(matrixArgs->count == 5, "invalid argument number!");
XTensor * b = (XTensor*)args->GetItem(5);
XTensor * c = (XTensor*)args->GetItem(6); XTensor * a = matrixArgs->GetItem(0);
DTYPE alpha = *(DTYPE*)args->GetItem(7); XTensor * b = matrixArgs->GetItem(1);
DTYPE beta = *(DTYPE*)args->GetItem(8); XTensor * c = matrixArgs->GetItem(2);
DTYPE alpha = *(DTYPE*)(matrixArgs->GetItem(3));
DTYPE beta = *(DTYPE*)(matrixArgs->GetItem(4));
int x1 = indexArgs->GetItem(0);
int y1 = indexArgs->GetItem(1);
int x2 = indexArgs->GetItem(2);
int y2 = indexArgs->GetItem(3);
#ifdef FAST_MATRIX #ifdef FAST_MATRIX
int am = a->dimSize[1]; int am = a->dimSize[1];
......
...@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
matrix multiplication for a block (x1,y1) - (x2,y2) matrix multiplication for a block (x1,y1) - (x2,y2)
where (x1,y1) is the upper-left corner and (x2,y2) is the bottom-right corner where (x1,y1) is the upper-left corner and (x2,y2) is the bottom-right corner
*/ */
void _MatrixMul2DMultiTheading(XList * args); void _MatrixMul2DMultiTheading(TensorList * args);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
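A sketch of how a caller would pack the arguments under the new convention; x1..y2, a, b, c, alpha and beta are assumed to be in scope, and the casts mirror the ones the library itself uses to pass non-tensor pointers through a TensorList:
IntList * indexArgs = new IntList(4);        /* the (x1,y1)-(x2,y2) block corners */
indexArgs->Add(x1);
indexArgs->Add(y1);
indexArgs->Add(x2);
indexArgs->Add(y2);
TensorList * matrixArgs = new TensorList(5);
matrixArgs->Add(a);                          /* XTensor * entries */
matrixArgs->Add(b);
matrixArgs->Add(c);
matrixArgs->Add((XTensor*)&alpha);           /* scalars smuggled in by casting */
matrixArgs->Add((XTensor*)&beta);
TensorList * args = new TensorList(2);
args->Add((XTensor*)indexArgs);              /* the callee unpacks these by position */
args->Add((XTensor*)matrixArgs);
_MatrixMul2DMultiTheading(args);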
...@@ -227,9 +227,9 @@ c_i = trans(a_i) * trans(b_i) * \alpha + c_i * \beta for each i in [0,count-1] ...@@ -227,9 +227,9 @@ c_i = trans(a_i) * trans(b_i) * \alpha + c_i * \beta for each i in [0,count-1]
>> alpha - scalar >> alpha - scalar
>> beta - scalar >> beta - scalar
*/ */
void _MatrixMulBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA, void _MatrixMulBatchedCPU(const TensorList * a, MATRIX_TRANS_TYPE transposedA,
const XList * b, MATRIX_TRANS_TYPE transposedB, const TensorList * b, MATRIX_TRANS_TYPE transposedB,
XList * c, DTYPE alpha, DTYPE beta) TensorList * c, DTYPE alpha, DTYPE beta)
{ {
CheckNTErrors(a && b && c, "Empty input lists!"); CheckNTErrors(a && b && c, "Empty input lists!");
CheckNTErrors(a->count == b->count && a->count == c->count, "Input lists must be of the same size!"); CheckNTErrors(a->count == b->count && a->count == c->count, "Input lists must be of the same size!");
......
...@@ -58,8 +58,8 @@ void _MatrixMulBatchedCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA, cons ...@@ -58,8 +58,8 @@ void _MatrixMulBatchedCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA, cons
matrix multiplication of the two tensors c = trans(a) * trans(b) * alpha + c * beta (for list inputs) matrix multiplication of the two tensors c = trans(a) * trans(b) * alpha + c * beta (for list inputs)
optimized for GPU optimized for GPU
*/ */
void _MatrixMulBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA, const XList * b, MATRIX_TRANS_TYPE transposedB, void _MatrixMulBatchedCPU(const TensorList * a, MATRIX_TRANS_TYPE transposedA, const TensorList * b, MATRIX_TRANS_TYPE transposedB,
XList * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0); TensorList * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
/* /*
matrix multiplication of the two tensors (return an XTensor structure) c = trans(a) * trans(b) * alpha matrix multiplication of the two tensors (return an XTensor structure) c = trans(a) * trans(b) * alpha
......
...@@ -117,7 +117,6 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b, ...@@ -117,7 +117,6 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
ShowNTErrors("Something is wrong!"); ShowNTErrors("Something is wrong!");
} }
/* tensor connections */ /* tensor connections */
XLink::MakeLink(&x, &w, &b, &c, MATH_MULANDSHIFT); XLink::MakeLink(&x, &w, &b, &c, MATH_MULANDSHIFT);
XLink::AddParamToHeadInt(&c, n); XLink::AddParamToHeadInt(&c, n);
......
...@@ -219,4 +219,55 @@ XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim ...@@ -219,4 +219,55 @@ XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim
return c; return c;
} }
/*
element-wise product of two tensors
c(i) = a(i)*b(i) + \alpha * c(i)
where i is the index of the element
>> a - tensor a
>> b - tensor b
>> c - result tensor
>> alpha - the coefficient
>> leadingDim - the dimension along which we perform broadcasting
>> requireLink - whether to add the operation to the network (for backward propagation)
*/
void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadingDim, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
int n = GetMultiplyDimIndex(a, b);
if (n == -1) {
CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "Unmatched leading dimensions in element-wise multiplication!");
/* call _Multiply function */
_Multiply(&a, &b, &c, 0, leadingDim);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLY);
XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHeadInt(&c, leadingDim);
}
}
else if (n >= 0 && n < a.order) {
/* call _MultiplyDim function */
_MultiplyDim(&a, &b, &c, n, alpha);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
}
}
else {
ShowNTErrors("Something is wrong!");
}
}
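A minimal usage sketch of this overload (the shapes and the InitTensor2D/SetDataRand helpers are illustrative assumptions from the wider XTensor API, not part of this change):

    XTensor a, b, c;
    InitTensor2D(&a, 2, 3);              /* 2 x 3 operand */
    InitTensor2D(&b, 2, 3);              /* same shape, so the element-wise branch (n == -1) is taken */
    a.SetDataRand(-1.0F, 1.0F);
    b.SetDataRand(-1.0F, 1.0F);

    /* c is (re)initialized inside the call when its shape does not match a;
       requireLink = true records a MATH_MULTIPLY node for backward computation */
    Multiply(a, b, c, 0.0F, 0, true);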
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
...@@ -49,6 +49,13 @@ where i is the index of the element ...@@ -49,6 +49,13 @@ where i is the index of the element
*/ */
XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha = 0.0, int leadingDim = 0); XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha = 0.0, int leadingDim = 0);
/*
element-wise product of two tensors:
c(i) = a(i)*b(i) + \alpha * c(i)
where i is the index of the element
*/
void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha = 0.0, int leadingDim = 0, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __MULTIPLY_H__ #endif // __MULTIPLY_H__
\ No newline at end of file
...@@ -170,6 +170,36 @@ XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n) ...@@ -170,6 +170,36 @@ XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n)
} }
/* /*
tensor multiplication
c = a * b
where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a * b
>> n - the dimension index
>> requireLink - whether to add the operation to the network
*/
void MultiplyDim(const XTensor &a, const XTensor &b, XTensor &c, int n, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
/* call _MultiplyDim function */
_MultiplyDim(&a, &b, &c, n, 0);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, 0);
}
}
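For illustration, a hypothetical call that scales every row of a matrix entry-wise by a vector (InitTensor1D/InitTensor2D are assumed from the wider API):

    XTensor a, v, c;
    InitTensor2D(&a, 2, 3);
    InitTensor1D(&v, 3);                 /* size matches dimension 1 of a */
    a.SetDataRand(0.0F, 1.0F);
    v.SetDataRand(0.0F, 1.0F);

    MultiplyDim(a, v, c, 1, true);       /* c[i][j] = a[i][j] * v[j], recorded as MATH_MULTIPLYDIM */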
/*
tensor broadcast multiplication tensor broadcast multiplication
c = a * b + c * \beta c = a * b + c * \beta
where some dimensions of b can be of size 1 where some dimensions of b can be of size 1
...@@ -309,4 +339,30 @@ XTensor MultiplyBroadcast(const XTensor &a, const XTensor &b) ...@@ -309,4 +339,30 @@ XTensor MultiplyBroadcast(const XTensor &a, const XTensor &b)
return c; return c;
} }
/*
tensor broadcast multiplication
c = a * b + c * \beta
where some dimensions of b can be of size 1
>> a - a tensor
>> b - another tensor that would be broadcasted
>> c - the resulting tensor
>> requireLink - whether to add the operation to the network
*/
void MultiplyBroadcast(const XTensor &a, const XTensor &b, XTensor &c, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
/* call _MultiplyBroadcast function */
_MultiplyBroadcast(&a, &b, &c, 0);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYBROADCAST);
XLink::AddParamToHead(&c, 0);
}
}
} }
...@@ -38,6 +38,10 @@ void _MultiplyDimMe(XTensor * a, const XTensor * b, int n, DTYPE alpha = 0.0); ...@@ -38,6 +38,10 @@ void _MultiplyDimMe(XTensor * a, const XTensor * b, int n, DTYPE alpha = 0.0);
i.e., a is multiplied with b by broadcasting. We make a new tensor c to keep the result and return it */ i.e., a is multiplied with b by broadcasting. We make a new tensor c to keep the result and return it */
XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n); XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n);
/* tensor multiplication c = a * b + \alpha * c where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting */
void MultiplyDim(const XTensor &a, const XTensor &b, XTensor &c, int n, bool requireLink = false);
/* tensor broadcast multiplication c = a * b + c * \beta where some dimensions of b can be of size 1 */ /* tensor broadcast multiplication c = a * b + c * \beta where some dimensions of b can be of size 1 */
void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0); void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
...@@ -45,6 +49,9 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE ...@@ -45,6 +49,9 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
we return the resulting tensor here */ we return the resulting tensor here */
XTensor MultiplyBroadcast(const XTensor &a, const XTensor &b); XTensor MultiplyBroadcast(const XTensor &a, const XTensor &b);
/* tensor broadcast multiplication c = a * b + c * \beta where some dimensions of b can be of size 1 */
void MultiplyBroadcast(const XTensor &a, const XTensor &b, XTensor &c, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __MULTIPLYDIM_H__ #endif // __MULTIPLYDIM_H__
...@@ -79,4 +79,25 @@ XTensor Negate(const XTensor & a) ...@@ -79,4 +79,25 @@ XTensor Negate(const XTensor & a)
return b; return b;
} }
/*
set every entry to its negative value
>> a - input tensor we are processing
>> b - output tensor we are processing
>> requireLink - whether to add the operation to the network
*/
void Negate(const XTensor & a, XTensor & b, bool requireLink)
{
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {
InitTensor(&b, &a);
}
/* call _Negate function */
_Negate(&a, &b);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_NEGATE);
}
}
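A tiny sketch of the new interface (the InitTensor1D/SetDataRand helpers are assumptions from the wider API):

    XTensor a, b;
    InitTensor1D(&a, 4);
    a.SetDataRand(-1.0F, 1.0F);

    Negate(a, b);                        /* b = -a; no XLink node, since requireLink defaults to false */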
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
...@@ -41,6 +41,9 @@ make a new tensor to keep the result and return it ...@@ -41,6 +41,9 @@ make a new tensor to keep the result and return it
*/ */
XTensor Negate(const XTensor & a); XTensor Negate(const XTensor & a);
/* set every entry to its negative value */
void Negate(const XTensor & a, XTensor & b, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __NEGATE_H__ #endif // __NEGATE_H__
...@@ -84,4 +84,25 @@ XTensor Sign(const XTensor & a) ...@@ -84,4 +84,25 @@ XTensor Sign(const XTensor & a)
return b; return b;
} }
/*
set every entry to its sign value
>> a - input tensor we are processing
>> b - output tensor we are processing
>> requireLink - whether to add the operation to the network
*/
void Sign(const XTensor & a, XTensor & b, bool requireLink)
{
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {
InitTensor(&b, &a);
}
/* call _Sign function */
_Sign(&a, &b);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_SIGN);
}
}
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
...@@ -41,6 +41,9 @@ make a new tensor to keep the result and return it ...@@ -41,6 +41,9 @@ make a new tensor to keep the result and return it
*/ */
XTensor Sign(const XTensor & a); XTensor Sign(const XTensor & a);
/* set every entry to its sign value */
void Sign(const XTensor & a, XTensor & b, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __SIGN_H__ #endif // __SIGN_H__
...@@ -196,4 +196,47 @@ XTensor Sub(const XTensor &a, const XTensor &b, DTYPE beta) ...@@ -196,4 +196,47 @@ XTensor Sub(const XTensor &a, const XTensor &b, DTYPE beta)
return c; return c;
} }
/*
tensor subtraction c = a - b * \beta
>> a - a tensor
>> b - another tensor
>> c - where we put a - b * \beta
>> beta - the scaling factor
>> requireLink - whether to add the operation to the network
*/
void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
int n = GetSubDimIndex(a, b);
if (n == -1) {
/* call _Sub function */
_Sub(&a, &b, &c, beta);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUB);
XLink::AddParamToHead(&c, beta);
}
}
else if (n >= 0 && n < a.order) {
/* call _SubDim function */
_SubDim(&a, &b, &c, n, beta);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
}
}
else {
ShowNTErrors("Something is wrong!");
}
}
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
...@@ -42,6 +42,9 @@ make a new tensor c to keep the result and return it ...@@ -42,6 +42,9 @@ make a new tensor c to keep the result and return it
*/ */
XTensor Sub(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0); XTensor Sub(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0);
/* tensor subtraction c = a - b * \beta */
void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta = (DTYPE)1.0, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __SUB_H__ #endif // __SUB_H__
...@@ -171,4 +171,35 @@ XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta) ...@@ -171,4 +171,35 @@ XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
return c; return c;
} }
/*
tensor subtraction
c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a - b * \beta
>> n - the dimension index
>> beta - the scaling factor
>> requireLink - whether to add the operation to the network
*/
void SubDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
/* call _SubDim function */
_SubDim(&a, &b, &c, n, beta);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
}
}
} }
...@@ -38,6 +38,10 @@ void _SubDim(XTensor * a, const XTensor * b, int n, DTYPE beta = (DTYPE)1.0); ...@@ -38,6 +38,10 @@ void _SubDim(XTensor * a, const XTensor * b, int n, DTYPE beta = (DTYPE)1.0);
i.e., a is subtracted with b by broadcasting. We make a new tensor c to keep the result and return it */ i.e., a is subtracted with b by broadcasting. We make a new tensor c to keep the result and return it */
XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta = (DTYPE)1.0); XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta = (DTYPE)1.0);
/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting */
void SubDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta = (DTYPE)1.0, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __SUBDIM_H__ #endif // __SUBDIM_H__
...@@ -201,4 +201,46 @@ XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta) ...@@ -201,4 +201,46 @@ XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta)
return c; return c;
} }
/*
tensor summation c = a + b * \beta
>> a - a tensor
>> b - another tensor
>> c - where we put a + b * \beta
>> beta - the scaling factor
>> requireLink - whether to add the operation to the network
*/
void Sum(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
int n = GetSumDimIndex(a, b);
if (n == -1) {
/* call _Sum function */
_Sum(&a, &b, &c, beta);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUM);
XLink::AddParamToHead(&c, beta);
}
}
else if (n >= 0 && n < a.order) {
/* call _SumDim function */
_SumDim(&a, &b, &c, n, beta);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUMDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
}
}
else {
ShowNTErrors("Something is wrong!");
}
}
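This overload silently dispatches between _Sum and _SumDim depending on whether b matches one whole dimension of a. A hypothetical bias-add sketch (helpers and the dimension-matching behavior of GetSumDimIndex are assumptions):

    XTensor x, bias, y;
    InitTensor2D(&x, 8, 16);
    InitTensor1D(&bias, 16);             /* matches dimension 1 of x */
    x.SetDataRand(-1.0F, 1.0F);
    bias.SetDataRand(-1.0F, 1.0F);

    Sum(x, bias, y, 1.0F, true);         /* routed to _SumDim, i.e. y[i][j] = x[i][j] + bias[j] */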
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
...@@ -41,6 +41,9 @@ make a new tensor c to keep the result and return it ...@@ -41,6 +41,9 @@ make a new tensor c to keep the result and return it
*/ */
XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0); XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0);
/* tensor summation c = a + b * \beta */
void Sum(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta = (DTYPE)1.0, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __SUM_H__ #endif // __SUM_H__
...@@ -189,6 +189,37 @@ XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta) ...@@ -189,6 +189,37 @@ XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
} }
/* /*
tensor summation
c = a + b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is summed with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a + b * \beta
>> n - the dimension index
>> beta - the scaling factor
>> requireLink - whether to add the operation to the network
*/
void SumDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
/* call _SumDim function */
_SumDim(&a, &b, &c, n, beta);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUMDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
}
}
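Calling the broadcasting variant directly avoids the dimension search in Sum; a short sketch under the same assumed helpers:

    XTensor x, b, y;
    InitTensor2D(&x, 8, 16);
    InitTensor1D(&b, 16);                /* matches dimension 1 of x */
    x.SetDataRand(-1.0F, 1.0F);
    b.SetDataRand(-1.0F, 1.0F);

    SumDim(x, b, y, 1, 0.5F, false);     /* y[i][j] = x[i][j] + 0.5 * b[j], no autograd link */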
/*
tensor broadcast summation c = a + b * \beta where some dimensions of b can be of size 1 tensor broadcast summation c = a + b * \beta where some dimensions of b can be of size 1
c = a + b * \beta c = a + b * \beta
...@@ -329,4 +360,30 @@ XTensor SumBroadcast(const XTensor &a, const XTensor &b, DTYPE beta) ...@@ -329,4 +360,30 @@ XTensor SumBroadcast(const XTensor &a, const XTensor &b, DTYPE beta)
return c; return c;
} }
/*
tensor broadcast summation
c = a + b * \beta
where some dimensions of b can be of size 1
>> a - a tensor
>> b - another tensor that would be broadcasted
>> c - the resulting tensor
>> beta - the scaling factor
>> requireLink - whether to add the operation to the network
*/
void SumBroadcast(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
/* call _SumBroadcast function */
_SumBroadcast(&a, &b, &c, beta);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUMBROADCAST);
XLink::AddParamToHead(&c, beta);
}
}
} }
...@@ -42,6 +42,10 @@ void _SumDim(XTensor * a, const XTensor * b, int n, DTYPE beta = (DTYPE)1.0); ...@@ -42,6 +42,10 @@ void _SumDim(XTensor * a, const XTensor * b, int n, DTYPE beta = (DTYPE)1.0);
i.e., a is summed with b by broadcasting. We make a new tensor c to keep the result and return it */ i.e., a is summed with b by broadcasting. We make a new tensor c to keep the result and return it */
XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta = (DTYPE)1.0); XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta = (DTYPE)1.0);
/* tensor summation c = a + b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is summed with b by broadcasting */
void SumDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta = (DTYPE)1.0, bool requireLink = false);
/* tensor broadcast summation c = a + b * \beta where some dimensions of b can be of size 1 */ /* tensor broadcast summation c = a + b * \beta where some dimensions of b can be of size 1 */
void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0); void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
...@@ -49,6 +53,9 @@ void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta ...@@ -49,6 +53,9 @@ void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta
we return the resulting tensor here */ we return the resulting tensor here */
XTensor SumBroadcast(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0); XTensor SumBroadcast(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0);
/* tensor broadcast summation c = a + b * \beta where some dimensions of b can be of size 1 */
void SumBroadcast(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta = (DTYPE)1.0, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __SUMDIM_H__ #endif // __SUMDIM_H__
...@@ -201,9 +201,9 @@ void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle, ...@@ -201,9 +201,9 @@ void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
matrix multiplication via cuda version BLAS matrix multiplication via cuda version BLAS
*/ */
void _CudaBLASMatrixMULList(cublasHandle_t * handle, void _CudaBLASMatrixMULList(cublasHandle_t * handle,
const XList * a, MATRIX_TRANS_TYPE transposedA, const TensorList * a, MATRIX_TRANS_TYPE transposedA,
const XList * b, MATRIX_TRANS_TYPE transposedB, const TensorList * b, MATRIX_TRANS_TYPE transposedB,
XList * c, TensorList * c,
int count, DTYPE alpha, DTYPE beta) int count, DTYPE alpha, DTYPE beta)
{ {
CheckNTErrors((a && b && c), "Empty input lists!"); CheckNTErrors((a && b && c), "Empty input lists!");
......
...@@ -56,8 +56,8 @@ void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle, ...@@ -56,8 +56,8 @@ void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0); DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
/* matrix multiplication in batch mode via cuda version BLAS */ /* matrix multiplication in batch mode via cuda version BLAS */
void _CudaBLASMatrixMULList(cublasHandle_t * handle, const XList * a, MATRIX_TRANS_TYPE transposedA, void _CudaBLASMatrixMULList(cublasHandle_t * handle, const TensorList * a, MATRIX_TRANS_TYPE transposedA,
const XList * b, MATRIX_TRANS_TYPE transposedB, XList * c, const TensorList * b, MATRIX_TRANS_TYPE transposedB, TensorList * c,
int count, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0); int count, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
#endif #endif
......
...@@ -111,9 +111,10 @@ void _IndexToOnehot(XTensor * index, XTensor * onehot, int size, float labelSmoo ...@@ -111,9 +111,10 @@ void _IndexToOnehot(XTensor * index, XTensor * onehot, int size, float labelSmoo
onehot->SetZeroAll(); onehot->SetZeroAll();
#ifdef USE_CUDA
float confidence = 1 - labelSmoothingP; float confidence = 1 - labelSmoothingP;
float lowconfidence = labelSmoothingP / size; float lowconfidence = labelSmoothingP / size;
#ifdef USE_CUDA
if(onehot->devID >= 0 && index->devID >= 0) { if(onehot->devID >= 0 && index->devID >= 0) {
_CudaIndexToOnehot(index, onehot, size, confidence, lowconfidence); _CudaIndexToOnehot(index, onehot, size, confidence, lowconfidence);
return; return;
...@@ -129,8 +130,7 @@ void _IndexToOnehot(XTensor * index, XTensor * onehot, int size, float labelSmoo ...@@ -129,8 +130,7 @@ void _IndexToOnehot(XTensor * index, XTensor * onehot, int size, float labelSmoo
for (int i = 0; i < blockNum; i++) { for (int i = 0; i < blockNum; i++) {
int id = indexData[i]; int id = indexData[i];
DTYPE * od = onehotData + i * stride; DTYPE * od = onehotData + i * stride;
od[id] = 2; od[id] = 1;
//onehotData[i * stride + id] = 1;
} }
} }
......
...@@ -36,18 +36,36 @@ int cudascale(int x, int scale) ...@@ -36,18 +36,36 @@ int cudascale(int x, int scale)
} }
__device__ __device__
float cudascale(float x, float scale)
{
return x * scale;
}
__device__
int cudadescale(int x, int descale) int cudadescale(int x, int descale)
{ {
return x / descale; return x / descale;
} }
__device__ __device__
float cudadescale(float x, float descale)
{
return x / descale;
}
__device__
int cudashift(int x, int shift) int cudashift(int x, int shift)
{ {
return x + shift; return x + shift;
} }
__device__ __device__
float cudashift(float x, float shift)
{
return x + shift;
}
__device__
int cudamod(int x, int mod) int cudamod(int x, int mod)
{ {
return x % mod; return x % mod;
...@@ -92,9 +110,51 @@ void _Cuda##funcName(const XTensor * a, XTensor * b, int num) \ ...@@ -92,9 +110,51 @@ void _Cuda##funcName(const XTensor * a, XTensor * b, int num) \
BacktoCudaDev(a->devID, devIDBackup); \ BacktoCudaDev(a->devID, devIDBackup); \
} \ } \
#define SIMPLE_BINARY_FUNCTION_FLOAT_GPU(funcName, origFunc) \
__global__ \
void Kernel##funcName(float * a, float * b, int size, float num) \
{ \
int i = blockDim.x * blockIdx.x + threadIdx.x; \
\
if (i < size) \
b[i] = (float)origFunc(a[i], num); \
} \
\
\
void _Cuda##funcName(const XTensor * a, XTensor * b, float num) \
{ \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
              "Input tensors should have the same shape!"); \
CheckNTErrors((a->isSparse == false), "TODO!"); \
\
int gridSize[3]; \
int blockSize[3]; \
\
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize); \
\
dim3 blocks(gridSize[0]); \
dim3 threads(blockSize[0]); \
\
int devIDBackup; \
ProtectCudaDev(a->devID, devIDBackup); \
\
if (a->dataType == X_FLOAT) { \
Kernel##funcName<<<blocks, threads>>> \
((float*)a->data, (float*)b->data, a->unitNum, num);\
} \
else { \
ShowNTErrors("TODO!"); \
} \
\
BacktoCudaDev(a->devID, devIDBackup); \
}
SIMPLE_BINARY_FUNCTION_GPU(Scale, cudascale) SIMPLE_BINARY_FUNCTION_GPU(Scale, cudascale)
SIMPLE_BINARY_FUNCTION_FLOAT_GPU(ScaleFloat, cudascale)
SIMPLE_BINARY_FUNCTION_GPU(Descale, cudadescale) SIMPLE_BINARY_FUNCTION_GPU(Descale, cudadescale)
SIMPLE_BINARY_FUNCTION_FLOAT_GPU(DescaleFloat, cudadescale)
SIMPLE_BINARY_FUNCTION_GPU(Shift, cudashift) SIMPLE_BINARY_FUNCTION_GPU(Shift, cudashift)
SIMPLE_BINARY_FUNCTION_FLOAT_GPU(ShiftFloat, cudashift)
SIMPLE_BINARY_FUNCTION_GPU(Mod, cudamod) SIMPLE_BINARY_FUNCTION_GPU(Mod, cudamod)
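For reference, the new instantiation SIMPLE_BINARY_FUNCTION_FLOAT_GPU(ScaleFloat, cudascale) expands (sketched here with the continuation backslashes dropped) to:

    __global__
    void KernelScaleFloat(float * a, float * b, int size, float num)
    {
        int i = blockDim.x * blockIdx.x + threadIdx.x;
        if (i < size)
            b[i] = (float)cudascale(a[i], num);      /* i.e. a[i] * num */
    }

    void _CudaScaleFloat(const XTensor * a, XTensor * b, float num);     /* host wrapper generated by the same macro */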
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -32,20 +32,29 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -32,20 +32,29 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* scale each entry (CUDA Kernel) */ /* scale each entry (CUDA Kernel) */
__global__ __global__
void KernelScale(int * a, int * b, int size, int scale); void KernelScale(int * a, int * b, int size, int scale);
__global__
void KernelScaleFloat(float * a, float * b, int size, float scale);
/* scale each entry */ /* scale each entry */
void _CudaScale(const XTensor * a, XTensor * b, int scale); void _CudaScale(const XTensor * a, XTensor * b, int scale);
void _CudaScaleFloat(const XTensor * a, XTensor * b, float scale);
/* descale each entry (CUDA Kernel) */ /* descale each entry (CUDA Kernel) */
__global__ __global__
void KernelDescale(int * a, int * b, int size, int scale); void KernelDescale(int * a, int * b, int size, int scale);
__global__
void KernelDescaleFloat(float * a, float * b, int size, float scale);
/* descale each entry */ /* descale each entry */
void _CudaDescale(const XTensor * a, XTensor * b, int scale); void _CudaDescale(const XTensor * a, XTensor * b, int scale);
void _CudaDescaleFloat(const XTensor * a, XTensor * b, float scale);
/* shift each entry (CUDA Kernel) */ /* shift each entry (CUDA Kernel) */
__global__ __global__
void KernelShift(int * a, int * b, int size, int shift); void KernelShift(int * a, int * b, int size, int shift);
__global__
void KernelShiftFloat(float * a, float * b, int size, float shift);
/* shift each entry */ /* shift each entry */
void _CudaShift(const XTensor * a, XTensor * b, int shift); void _CudaShift(const XTensor * a, XTensor * b, int shift);
void _CudaShiftFloat(const XTensor * a, XTensor * b, float shift);
/* mod each entry (CUDA Kernel) */ /* mod each entry (CUDA Kernel) */
__global__ __global__
......
...@@ -37,51 +37,76 @@ void _Scale(const XTensor * a, XTensor * b, float scale); ...@@ -37,51 +37,76 @@ void _Scale(const XTensor * a, XTensor * b, float scale);
scale up tensor entries (on site) scale up tensor entries (on site)
b = a * scale b = a * scale
*/ */
void Scale(XTensor & a, int scale); void _ScaleMe(XTensor & a, int scale);
void Scale(XTensor & a, float scale); void _ScaleMe(XTensor & a, float scale);
/* /*
scale up tensor entries scale up tensor entries
b = a * scale b = a * scale
*/ */
void Scale(const XTensor & a, XTensor &b, int scale); void Scale(const XTensor & a, XTensor &b, int scale);
void Scale(const XTensor & a, XTensor &b, float scale); void Scale(const XTensor & a, XTensor &b, float scale, bool requireLink = false);
/*
scale up tensor entries (return an XTensor structure)
b = a * scale
*/
XTensor Scale(const XTensor & a, float scale);
/* /*
descale tensor entries descale tensor entries
b = a / scale b = a / scale
*/ */
void _Descale(const XTensor * a, XTensor * b, int scale); void _Descale(const XTensor * a, XTensor * b, int scale);
void _Descale(const XTensor * a, XTensor * b, float scale);
/* /*
descale tensor entries (on site) descale tensor entries (on site)
b = a / scale b = a / scale
*/ */
void Descale(XTensor & a, int scale); void _DescaleMe(XTensor & a, int scale);
void _DescaleMe(XTensor & a, float scale);
/* /*
descale tensor entries descale tensor entries
b = a / scale b = a / scale
*/ */
void Descale(const XTensor & a, XTensor & b, int scale); void Descale(const XTensor & a, XTensor & b, int scale);
void Descale(const XTensor & a, XTensor & b, float scale, bool requireLink = false);
/*
descale tensor entries (return an XTensor structure)
b = a / scale
*/
XTensor Descale(const XTensor & a, float scale);
/* /*
shift tensor entries shift tensor entries
b = a + shift b = a + shift
*/ */
void _Shift(const XTensor * a, XTensor * b, int shift); void _Shift(const XTensor * a, XTensor * b, int shift);
void _Shift(const XTensor * a, XTensor * b, float shift);
/* /*
shift tensor entries (on site) shift tensor entries (on site)
b = a + shift b = a + shift
*/ */
void Shift(XTensor & a, int shift); void _ShiftMe(XTensor & a, int shift);
void _ShiftMe(XTensor & a, float shift);
/* /*
shift tensor entries shift tensor entries
b = a + shift b = a + shift
*/ */
void Shift(const XTensor & a, XTensor & b, int shift); void Shift(const XTensor & a, XTensor & b, int shift);
void Shift(const XTensor & a, XTensor & b, float shift, bool requireLink = false);
/*
shift tensor entries (return an XTensor structure)
b = a + shift
*/
XTensor Shift(const XTensor & a, float shift);
/* /*
mod tensor entries mod tensor entries
...@@ -93,7 +118,7 @@ void _Mod(const XTensor * a, XTensor * b, int base); ...@@ -93,7 +118,7 @@ void _Mod(const XTensor * a, XTensor * b, int base);
mod tensor entries (on site) mod tensor entries (on site)
b = a % mod b = a % mod
*/ */
void Mod(XTensor & a, int base); void _ModMe(XTensor & a, int base);
/* /*
mod tensor entries mod tensor entries
......
...@@ -94,6 +94,23 @@ XTensor Clip(const XTensor & a, DTYPE lower, DTYPE upper) ...@@ -94,6 +94,23 @@ XTensor Clip(const XTensor & a, DTYPE lower, DTYPE upper)
return b; return b;
} }
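/*
clip every entry of a into the range [lower, upper]
b = min(max(a, lower), upper)
>> a - the input tensor
>> b - the output tensor
>> lower - the lower bound
>> upper - the upper bound
>> requireLink - whether to add the operation to the network
*/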
void Clip(const XTensor & a, XTensor & b, DTYPE lower, DTYPE upper, bool requireLink)
{
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {
InitTensor(&b, &a);
}
/* call _Clip function */
_Clip(&a, &b, lower, upper);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_CLIP);
XLink::AddParamToHead(&b, lower);
XLink::AddParamToHead(&b, upper);
}
}
/* /*
backward computation backward computation
......
...@@ -37,6 +37,8 @@ void _ClipMe(XTensor * a, DTYPE lower, DTYPE upper); ...@@ -37,6 +37,8 @@ void _ClipMe(XTensor * a, DTYPE lower, DTYPE upper);
make a new tensor to keep the result and return it */ make a new tensor to keep the result and return it */
XTensor Clip(const XTensor & a, DTYPE lower, DTYPE upper); XTensor Clip(const XTensor & a, DTYPE lower, DTYPE upper);
void Clip(const XTensor & a, XTensor & b, DTYPE lower, DTYPE upper, bool requireLink = false);
/* /*
backward of Clip function backward of Clip function
*/ */
......
...@@ -138,12 +138,12 @@ XTensor Normalize(const XTensor &input, int dim, const XTensor &mean, const XTen ...@@ -138,12 +138,12 @@ XTensor Normalize(const XTensor &input, int dim, const XTensor &mean, const XTen
_Normalize(&input, &output, dim, &mean, &var, &a, &b, epsilon); _Normalize(&input, &output, dim, &mean, &var, &a, &b, epsilon);
/* tensor connections */ /* tensor connections */
XList list(5); TensorList list(5);
list.Add(&input); list.Add((XTensor*)&input);
list.Add(&mean); list.Add((XTensor*)&mean);
list.Add(&var); list.Add((XTensor*)&var);
list.Add(&a); list.Add((XTensor*)&a);
list.Add(&b); list.Add((XTensor*)&b);
XLink::MakeLink(&list, &output, MATH_NORMALIZE); XLink::MakeLink(&list, &output, MATH_NORMALIZE);
XLink::AddParamToHeadInt(&output, dim); XLink::AddParamToHeadInt(&output, dim);
XLink::AddParamToHead(&output, epsilon); XLink::AddParamToHead(&output, epsilon);
......
...@@ -102,4 +102,27 @@ XTensor Power(const XTensor & a, DTYPE p) ...@@ -102,4 +102,27 @@ XTensor Power(const XTensor & a, DTYPE p)
return b; return b;
} }
/*
get the power of each entry, i.e., b = pow(a, p)
>> a - the input tensor
>> b - the output tensor
>> p - the exponent
>> requireLink - whether to add the operation to the network
*/
void Power(const XTensor & a, XTensor & b, DTYPE p, bool requireLink)
{
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {
InitTensor(&b, &a);
}
/* call _Power function */
_Power(&a, &b, p);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_POWER);
XLink::AddParamToHead(&b, p);
}
}
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
...@@ -41,6 +41,9 @@ make a new tensor to keep the result and return it ...@@ -41,6 +41,9 @@ make a new tensor to keep the result and return it
*/ */
XTensor Power(const XTensor & a, DTYPE p); XTensor Power(const XTensor & a, DTYPE p);
/* get the power of each entry, i.e., b = pow(a, p) */
void Power(const XTensor & a, XTensor & b, DTYPE p, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __POWER_H__ #endif // __POWER_H__
...@@ -118,4 +118,33 @@ XTensor ScaleAndShift(const XTensor &a, DTYPE scale, DTYPE shift) ...@@ -118,4 +118,33 @@ XTensor ScaleAndShift(const XTensor &a, DTYPE scale, DTYPE shift)
return b; return b;
} }
/*
scale and shift all tensor entries
b = a * scale + shift
>> a - the input tensor
>> b - the output tensor
>> scale - the scale factor
>> shift - the shift factor
>> requireLink - whether to add the operation to the network
*/
void ScaleAndShift(const XTensor & a, XTensor & b, DTYPE scale, DTYPE shift, bool requireLink)
{
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {
InitTensor(&b, &a);
}
/* call _ScaleAndShift function */
_ScaleAndShift(&a, &b, scale, shift);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_SCALEANDSHIFT);
XLink::AddParamToHead(&b, scale);
XLink::AddParamToHead(&b, shift);
}
}
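A short usage sketch mapping entries from [0, 1] to [-1, 1] (InitTensor2D/SetDataRand are assumed helpers from the wider API):

    XTensor a, b;
    InitTensor2D(&a, 3, 3);
    a.SetDataRand(0.0F, 1.0F);

    ScaleAndShift(a, b, 2.0F, -1.0F, true);   /* b = a * 2 - 1, recorded as MATH_SCALEANDSHIFT */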
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
...@@ -50,6 +50,12 @@ b = a * scale + shift ...@@ -50,6 +50,12 @@ b = a * scale + shift
*/ */
XTensor ScaleAndShift(const XTensor &a, DTYPE scale, DTYPE shift = 0); XTensor ScaleAndShift(const XTensor &a, DTYPE scale, DTYPE shift = 0);
/*
scale and shift all tensor entries
b = a * scale + shift
*/
void ScaleAndShift(const XTensor &a, XTensor &b, DTYPE scale, DTYPE shift = 0, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __SCALEANDSHIFT_H__ #endif // __SCALEANDSHIFT_H__
\ No newline at end of file
...@@ -82,58 +82,82 @@ XTensor funcName(const XTensor &a) \ ...@@ -82,58 +82,82 @@ XTensor funcName(const XTensor &a) \
return b; \ return b; \
} }
#define SIMPLE_UNARY_FUNCTION_VOID(funcName, _funcName, operationId) \
void funcName(const XTensor &a, XTensor &b, bool requireLink) \
{ \
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) { \
InitTensor(&b, &a); \
} \
_funcName(&a, &b); \
if (requireLink) { \
XLink::MakeLink(&a, NULL, &b, operationId); \
} \
}
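For instance, the instantiation SIMPLE_UNARY_FUNCTION_VOID(Absolute, _Absolute, MATH_ABSOLUTE) below expands to:

    void Absolute(const XTensor &a, XTensor &b, bool requireLink)
    {
        if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {
            InitTensor(&b, &a);
        }
        _Absolute(&a, &b);
        if (requireLink) {
            XLink::MakeLink(&a, NULL, &b, MATH_ABSOLUTE);
        }
    }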
_SIMPLE_UNARY_FUNCTION(_Absolute, _CudaAbsolute, fabs) _SIMPLE_UNARY_FUNCTION(_Absolute, _CudaAbsolute, fabs)
_SIMPLE_UNARY_FUNCTION_ME(_AbsoluteMe, _Absolute) _SIMPLE_UNARY_FUNCTION_ME(_AbsoluteMe, _Absolute)
SIMPLE_UNARY_FUNCTION(Absolute, _Absolute, MATH_ABSOLUTE) SIMPLE_UNARY_FUNCTION(Absolute, _Absolute, MATH_ABSOLUTE)
SIMPLE_UNARY_FUNCTION_VOID(Absolute, _Absolute, MATH_ABSOLUTE)
_SIMPLE_UNARY_FUNCTION(_Ceil, _CudaCeil, ceil) _SIMPLE_UNARY_FUNCTION(_Ceil, _CudaCeil, ceil)
_SIMPLE_UNARY_FUNCTION_ME(_CeilMe, _Ceil) _SIMPLE_UNARY_FUNCTION_ME(_CeilMe, _Ceil)
SIMPLE_UNARY_FUNCTION(Ceil, _Ceil, MATH_CEIL) SIMPLE_UNARY_FUNCTION(Ceil, _Ceil, MATH_CEIL)
SIMPLE_UNARY_FUNCTION_VOID(Ceil, _Ceil, MATH_CEIL)
_SIMPLE_UNARY_FUNCTION(_Exp, _CudaExp, exp) _SIMPLE_UNARY_FUNCTION(_Exp, _CudaExp, exp)
_SIMPLE_UNARY_FUNCTION_ME(_ExpMe, _Exp) _SIMPLE_UNARY_FUNCTION_ME(_ExpMe, _Exp)
SIMPLE_UNARY_FUNCTION(Exp, _Exp, MATH_EXP) SIMPLE_UNARY_FUNCTION(Exp, _Exp, MATH_EXP)
SIMPLE_UNARY_FUNCTION_VOID(Exp, _Exp, MATH_EXP)
_SIMPLE_UNARY_FUNCTION(_Floor, _CudaFloor, floor) _SIMPLE_UNARY_FUNCTION(_Floor, _CudaFloor, floor)
_SIMPLE_UNARY_FUNCTION_ME(_FloorMe, _Floor) _SIMPLE_UNARY_FUNCTION_ME(_FloorMe, _Floor)
SIMPLE_UNARY_FUNCTION(Floor, _Floor, MATH_FLOOR) SIMPLE_UNARY_FUNCTION(Floor, _Floor, MATH_FLOOR)
SIMPLE_UNARY_FUNCTION_VOID(Floor, _Floor, MATH_FLOOR)
_SIMPLE_UNARY_FUNCTION(_IsNonZero, _CudaIsNonZero, isnonzero) _SIMPLE_UNARY_FUNCTION(_IsNonZero, _CudaIsNonZero, isnonzero)
_SIMPLE_UNARY_FUNCTION_ME(_IsNonZeroMe, _IsNonZero) _SIMPLE_UNARY_FUNCTION_ME(_IsNonZeroMe, _IsNonZero)
SIMPLE_UNARY_FUNCTION(IsNonZero, _IsNonZero, MATH_ISNONZERO) SIMPLE_UNARY_FUNCTION(IsNonZero, _IsNonZero, MATH_ISNONZERO)
SIMPLE_UNARY_FUNCTION_VOID(IsNonZero, _IsNonZero, MATH_ISNONZERO)
_SIMPLE_UNARY_FUNCTION(_IsZero, _CudaIsZero, iszero) _SIMPLE_UNARY_FUNCTION(_IsZero, _CudaIsZero, iszero)
_SIMPLE_UNARY_FUNCTION_ME(_IsZeroMe, _IsZero) _SIMPLE_UNARY_FUNCTION_ME(_IsZeroMe, _IsZero)
SIMPLE_UNARY_FUNCTION(IsZero, _IsZero, MATH_ISZERO) SIMPLE_UNARY_FUNCTION(IsZero, _IsZero, MATH_ISZERO)
SIMPLE_UNARY_FUNCTION_VOID(IsZero, _IsZero, MATH_ISZERO)
_SIMPLE_UNARY_FUNCTION(_Log, _CudaLog, log) _SIMPLE_UNARY_FUNCTION(_Log, _CudaLog, log)
_SIMPLE_UNARY_FUNCTION_ME(_LogMe, _Log) _SIMPLE_UNARY_FUNCTION_ME(_LogMe, _Log)
SIMPLE_UNARY_FUNCTION(Log, _Log, MATH_LOG) SIMPLE_UNARY_FUNCTION(Log, _Log, MATH_LOG)
SIMPLE_UNARY_FUNCTION_VOID(Log, _Log, MATH_LOG)
_SIMPLE_UNARY_FUNCTION(_Round, _CudaRound, round) _SIMPLE_UNARY_FUNCTION(_Round, _CudaRound, round)
_SIMPLE_UNARY_FUNCTION_ME(_RoundMe, _Round) _SIMPLE_UNARY_FUNCTION_ME(_RoundMe, _Round)
SIMPLE_UNARY_FUNCTION(Round, _Round, MATH_ROUND) SIMPLE_UNARY_FUNCTION(Round, _Round, MATH_ROUND)
SIMPLE_UNARY_FUNCTION_VOID(Round, _Round, MATH_ROUND)
_SIMPLE_UNARY_FUNCTION(_Sqrt, _CudaSqrt, sqrt) _SIMPLE_UNARY_FUNCTION(_Sqrt, _CudaSqrt, sqrt)
_SIMPLE_UNARY_FUNCTION_ME(_SqrtMe, _Sqrt) _SIMPLE_UNARY_FUNCTION_ME(_SqrtMe, _Sqrt)
SIMPLE_UNARY_FUNCTION(Sqrt, _Sqrt, MATH_SQRT) SIMPLE_UNARY_FUNCTION(Sqrt, _Sqrt, MATH_SQRT)
SIMPLE_UNARY_FUNCTION_VOID(Sqrt, _Sqrt, MATH_SQRT)
_SIMPLE_UNARY_FUNCTION(_Square, _CudaSquare, square) _SIMPLE_UNARY_FUNCTION(_Square, _CudaSquare, square)
_SIMPLE_UNARY_FUNCTION_ME(_SquareMe, _Square) _SIMPLE_UNARY_FUNCTION_ME(_SquareMe, _Square)
SIMPLE_UNARY_FUNCTION(Square, _Square, MATH_SQUARE) SIMPLE_UNARY_FUNCTION(Square, _Square, MATH_SQUARE)
SIMPLE_UNARY_FUNCTION_VOID(Square, _Square, MATH_SQUARE)
_SIMPLE_UNARY_FUNCTION(_Sin, _CudaSin, sin) _SIMPLE_UNARY_FUNCTION(_Sin, _CudaSin, sin)
_SIMPLE_UNARY_FUNCTION_ME(_SinMe, _Sin) _SIMPLE_UNARY_FUNCTION_ME(_SinMe, _Sin)
SIMPLE_UNARY_FUNCTION(Sin, _Sin, MATH_SIN) SIMPLE_UNARY_FUNCTION(Sin, _Sin, MATH_SIN)
SIMPLE_UNARY_FUNCTION_VOID(Sin, _Sin, MATH_SIN)
_SIMPLE_UNARY_FUNCTION(_Cos, _CudaCos, cos) _SIMPLE_UNARY_FUNCTION(_Cos, _CudaCos, cos)
_SIMPLE_UNARY_FUNCTION_ME(_CosMe, _Cos) _SIMPLE_UNARY_FUNCTION_ME(_CosMe, _Cos)
SIMPLE_UNARY_FUNCTION(Cos, _Cos, MATH_COS) SIMPLE_UNARY_FUNCTION(Cos, _Cos, MATH_COS)
SIMPLE_UNARY_FUNCTION_VOID(Cos, _Cos, MATH_COS)
_SIMPLE_UNARY_FUNCTION(_Tan, _CudaTan, tan) _SIMPLE_UNARY_FUNCTION(_Tan, _CudaTan, tan)
_SIMPLE_UNARY_FUNCTION_ME(_TanMe, _Tan) _SIMPLE_UNARY_FUNCTION_ME(_TanMe, _Tan)
SIMPLE_UNARY_FUNCTION(Tan, _Tan, MATH_TAN) SIMPLE_UNARY_FUNCTION(Tan, _Tan, MATH_TAN)
SIMPLE_UNARY_FUNCTION_VOID(Tan, _Tan, MATH_TAN)
#else #else
/* define three marco separately, specify the respective function names (CPU mode) */ /* define three marco separately, specify the respective function names (CPU mode) */
...@@ -164,59 +188,82 @@ XTensor funcName(const XTensor &a) \ ...@@ -164,59 +188,82 @@ XTensor funcName(const XTensor &a) \
XLink::MakeLink(&a, NULL, &b, operationId); \ XLink::MakeLink(&a, NULL, &b, operationId); \
return b; \ return b; \
} }
#define SIMPLE_UNARY_FUNCTION_VOID(funcName, _funcName, operationId) \
void funcName(const XTensor &a, XTensor &b, bool requireLink) \
{ \
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) { \
InitTensor(&b, &a); \
} \
_funcName(&a, &b); \
if (requireLink) { \
XLink::MakeLink(&a, NULL, &b, operationId); \
} \
}
_SIMPLE_UNARY_FUNCTION(_Absolute, fabs) _SIMPLE_UNARY_FUNCTION(_Absolute, fabs)
_SIMPLE_UNARY_FUNCTION_ME(_AbsoluteMe, _Absolute) _SIMPLE_UNARY_FUNCTION_ME(_AbsoluteMe, _Absolute)
SIMPLE_UNARY_FUNCTION(Absolute, _Absolute, MATH_ABSOLUTE) SIMPLE_UNARY_FUNCTION(Absolute, _Absolute, MATH_ABSOLUTE)
SIMPLE_UNARY_FUNCTION_VOID(Absolute, _Absolute, MATH_ABSOLUTE)
_SIMPLE_UNARY_FUNCTION(_Ceil, ceil) _SIMPLE_UNARY_FUNCTION(_Ceil, ceil)
_SIMPLE_UNARY_FUNCTION_ME(_CeilMe, _Ceil) _SIMPLE_UNARY_FUNCTION_ME(_CeilMe, _Ceil)
SIMPLE_UNARY_FUNCTION(Ceil, _Ceil, MATH_CEIL) SIMPLE_UNARY_FUNCTION(Ceil, _Ceil, MATH_CEIL)
SIMPLE_UNARY_FUNCTION_VOID(Ceil, _Ceil, MATH_CEIL)
_SIMPLE_UNARY_FUNCTION(_Exp, exp) _SIMPLE_UNARY_FUNCTION(_Exp, exp)
_SIMPLE_UNARY_FUNCTION_ME(_ExpMe, _Exp) _SIMPLE_UNARY_FUNCTION_ME(_ExpMe, _Exp)
SIMPLE_UNARY_FUNCTION(Exp, _Exp, MATH_EXP) SIMPLE_UNARY_FUNCTION(Exp, _Exp, MATH_EXP)
SIMPLE_UNARY_FUNCTION_VOID(Exp, _Exp, MATH_EXP)
_SIMPLE_UNARY_FUNCTION(_Floor, floor) _SIMPLE_UNARY_FUNCTION(_Floor, floor)
_SIMPLE_UNARY_FUNCTION_ME(_FloorMe, _Floor) _SIMPLE_UNARY_FUNCTION_ME(_FloorMe, _Floor)
SIMPLE_UNARY_FUNCTION(Floor, _Floor, MATH_FLOOR) SIMPLE_UNARY_FUNCTION(Floor, _Floor, MATH_FLOOR)
SIMPLE_UNARY_FUNCTION_VOID(Floor, _Floor, MATH_FLOOR)
_SIMPLE_UNARY_FUNCTION(_IsNonZero, isnonzero) _SIMPLE_UNARY_FUNCTION(_IsNonZero, isnonzero)
_SIMPLE_UNARY_FUNCTION_ME(_IsNonZeroMe, _IsNonZero) _SIMPLE_UNARY_FUNCTION_ME(_IsNonZeroMe, _IsNonZero)
SIMPLE_UNARY_FUNCTION(IsNonZero, _IsNonZero, MATH_ISNONZERO) SIMPLE_UNARY_FUNCTION(IsNonZero, _IsNonZero, MATH_ISNONZERO)
SIMPLE_UNARY_FUNCTION_VOID(IsNonZero, _IsNonZero, MATH_ISNONZERO)
_SIMPLE_UNARY_FUNCTION(_IsZero, iszero) _SIMPLE_UNARY_FUNCTION(_IsZero, iszero)
_SIMPLE_UNARY_FUNCTION_ME(_IsZeroMe, _IsZero) _SIMPLE_UNARY_FUNCTION_ME(_IsZeroMe, _IsZero)
SIMPLE_UNARY_FUNCTION(IsZero, _IsZero, MATH_ISZERO) SIMPLE_UNARY_FUNCTION(IsZero, _IsZero, MATH_ISZERO)
SIMPLE_UNARY_FUNCTION_VOID(IsZero, _IsZero, MATH_ISZERO)
_SIMPLE_UNARY_FUNCTION(_Log, log) _SIMPLE_UNARY_FUNCTION(_Log, log)
_SIMPLE_UNARY_FUNCTION_ME(_LogMe, _Log) _SIMPLE_UNARY_FUNCTION_ME(_LogMe, _Log)
SIMPLE_UNARY_FUNCTION(Log, _Log, MATH_LOG) SIMPLE_UNARY_FUNCTION(Log, _Log, MATH_LOG)
SIMPLE_UNARY_FUNCTION_VOID(Log, _Log, MATH_LOG)
_SIMPLE_UNARY_FUNCTION(_Round, round) _SIMPLE_UNARY_FUNCTION(_Round, round)
_SIMPLE_UNARY_FUNCTION_ME(_RoundMe, _Round) _SIMPLE_UNARY_FUNCTION_ME(_RoundMe, _Round)
SIMPLE_UNARY_FUNCTION(Round, _Round, MATH_ROUND) SIMPLE_UNARY_FUNCTION(Round, _Round, MATH_ROUND)
SIMPLE_UNARY_FUNCTION_VOID(Round, _Round, MATH_ROUND)
_SIMPLE_UNARY_FUNCTION(_Sqrt, sqrt) _SIMPLE_UNARY_FUNCTION(_Sqrt, sqrt)
_SIMPLE_UNARY_FUNCTION_ME(_SqrtMe, _Sqrt) _SIMPLE_UNARY_FUNCTION_ME(_SqrtMe, _Sqrt)
SIMPLE_UNARY_FUNCTION(Sqrt, _Sqrt, MATH_SQRT) SIMPLE_UNARY_FUNCTION(Sqrt, _Sqrt, MATH_SQRT)
SIMPLE_UNARY_FUNCTION_VOID(Sqrt, _Sqrt, MATH_SQRT)
_SIMPLE_UNARY_FUNCTION(_Square, square) _SIMPLE_UNARY_FUNCTION(_Square, square)
_SIMPLE_UNARY_FUNCTION_ME(_SquareMe, _Square) _SIMPLE_UNARY_FUNCTION_ME(_SquareMe, _Square)
SIMPLE_UNARY_FUNCTION(Square, _Square, MATH_SQUARE) SIMPLE_UNARY_FUNCTION(Square, _Square, MATH_SQUARE)
SIMPLE_UNARY_FUNCTION_VOID(Square, _Square, MATH_SQUARE)
_SIMPLE_UNARY_FUNCTION(_Sin, sin) _SIMPLE_UNARY_FUNCTION(_Sin, sin)
_SIMPLE_UNARY_FUNCTION_ME(_SinMe, _Sin) _SIMPLE_UNARY_FUNCTION_ME(_SinMe, _Sin)
SIMPLE_UNARY_FUNCTION(Sin, _Sin, MATH_SIN) SIMPLE_UNARY_FUNCTION(Sin, _Sin, MATH_SIN)
SIMPLE_UNARY_FUNCTION_VOID(Sin, _Sin, MATH_SIN)
_SIMPLE_UNARY_FUNCTION(_Cos, cos) _SIMPLE_UNARY_FUNCTION(_Cos, cos)
_SIMPLE_UNARY_FUNCTION_ME(_CosMe, _Cos) _SIMPLE_UNARY_FUNCTION_ME(_CosMe, _Cos)
SIMPLE_UNARY_FUNCTION(Cos, _Cos, MATH_COS) SIMPLE_UNARY_FUNCTION(Cos, _Cos, MATH_COS)
SIMPLE_UNARY_FUNCTION_VOID(Cos, _Cos, MATH_COS)
_SIMPLE_UNARY_FUNCTION(_Tan, tan) _SIMPLE_UNARY_FUNCTION(_Tan, tan)
_SIMPLE_UNARY_FUNCTION_ME(_TanMe, _Tan) _SIMPLE_UNARY_FUNCTION_ME(_TanMe, _Tan)
SIMPLE_UNARY_FUNCTION(Tan, _Tan, MATH_TAN) SIMPLE_UNARY_FUNCTION(Tan, _Tan, MATH_TAN)
SIMPLE_UNARY_FUNCTION_VOID(Tan, _Tan, MATH_TAN)
/*_SIMPLE_UNARY_FUNCTION(_Round, round) /*_SIMPLE_UNARY_FUNCTION(_Round, round)
_SIMPLE_UNARY_FUNCTION_ME(_RoundMe, _Round) _SIMPLE_UNARY_FUNCTION_ME(_RoundMe, _Round)
......
...@@ -34,6 +34,8 @@ void _AbsoluteMe(XTensor * a); ...@@ -34,6 +34,8 @@ void _AbsoluteMe(XTensor * a);
/* set every entry to its absolute value (return an XTensor structure) /* set every entry to its absolute value (return an XTensor structure)
make a new tensor to keep the result and return it */ make a new tensor to keep the result and return it */
XTensor Absolute(const XTensor & a); XTensor Absolute(const XTensor & a);
/* set every entry to its absolute value */
void Absolute(const XTensor & a, XTensor & b, bool requireLink = false);
/* set every entry to its ceil value */ /* set every entry to its ceil value */
void _Ceil(const XTensor * a, XTensor * b); void _Ceil(const XTensor * a, XTensor * b);
...@@ -43,6 +45,8 @@ void _CeilMe(XTensor * a); ...@@ -43,6 +45,8 @@ void _CeilMe(XTensor * a);
/* set every entry to its ceil value (return an XTensor structure) /* set every entry to its ceil value (return an XTensor structure)
make a new tensor to keep the result and return it */ make a new tensor to keep the result and return it */
XTensor Ceil(const XTensor & a); XTensor Ceil(const XTensor & a);
/* set every entry to its ceil value */
void Ceil(const XTensor & a, XTensor & b, bool requireLink = false);
/* set every entry to its exponent value */ /* set every entry to its exponent value */
void _Exp(const XTensor * a, XTensor * b); void _Exp(const XTensor * a, XTensor * b);
...@@ -52,6 +56,8 @@ void _ExpMe(XTensor * a); ...@@ -52,6 +56,8 @@ void _ExpMe(XTensor * a);
/* set every entry to its exponent value (return an XTensor structure) /* set every entry to its exponent value (return an XTensor structure)
make a new tensor to keep the result and return it */ make a new tensor to keep the result and return it */
XTensor Exp(const XTensor & a); XTensor Exp(const XTensor & a);
/* set every entry to its exponent value */
void Exp(const XTensor & a, XTensor & b, bool requireLink = false);
/* set every entry to its floor value */ /* set every entry to its floor value */
void _Floor(const XTensor * a, XTensor * b); void _Floor(const XTensor * a, XTensor * b);
...@@ -61,6 +67,8 @@ void _FloorMe(XTensor * a); ...@@ -61,6 +67,8 @@ void _FloorMe(XTensor * a);
/* set every entry to its floor value (return an XTensor structure) /* set every entry to its floor value (return an XTensor structure)
make a new tensor to keep the result and return it */ make a new tensor to keep the result and return it */
XTensor Floor(const XTensor & a); XTensor Floor(const XTensor & a);
/* set every entry to its floor value */
void Floor(const XTensor & a, XTensor & b, bool requireLink = false);
/* if source entry is non-zero, set target entry to be one, otherwise zero */ /* if source entry is non-zero, set target entry to be one, otherwise zero */
void _IsNonZero(const XTensor *a, XTensor *b); void _IsNonZero(const XTensor *a, XTensor *b);
...@@ -70,6 +78,8 @@ void _IsNonZeroMe(XTensor *a); ...@@ -70,6 +78,8 @@ void _IsNonZeroMe(XTensor *a);
/* if source entry is non-zero, set target entry to be one, otherwise zero (return an XTensor structure) /* if source entry is non-zero, set target entry to be one, otherwise zero (return an XTensor structure)
make a new tensor to keep the result and return it */ make a new tensor to keep the result and return it */
XTensor IsNonZero(const XTensor &a); XTensor IsNonZero(const XTensor &a);
/* if source entry is non-zero, set target entry to be one, otherwise zero */
void IsNonZero(const XTensor &a, XTensor & b, bool requireLink = false);
/* if source entry is zero, set target entry to be one, otherwise zero */ /* if source entry is zero, set target entry to be one, otherwise zero */
void _IsZero(const XTensor *a, XTensor *b); void _IsZero(const XTensor *a, XTensor *b);
...@@ -79,6 +89,8 @@ void _IsZeroMe(XTensor *a); ...@@ -79,6 +89,8 @@ void _IsZeroMe(XTensor *a);
/* if source entry is zero, set target entry to be one, otherwise zero (return an XTensor structure) /* if source entry is zero, set target entry to be one, otherwise zero (return an XTensor structure)
make a new tensor to keep the result and return it */ make a new tensor to keep the result and return it */
XTensor IsZero(const XTensor &a); XTensor IsZero(const XTensor &a);
/* if source entry is zero, set target entry to be one, otherwise zero */
void IsZero(const XTensor &a, XTensor & b, bool requireLink = false);
/* set every entry to its logarithm value */ /* set every entry to its logarithm value */
void _Log(const XTensor * a, XTensor * b); void _Log(const XTensor * a, XTensor * b);
...@@ -88,6 +100,8 @@ void _LogMe(XTensor * a); ...@@ -88,6 +100,8 @@ void _LogMe(XTensor * a);
/* set every entry to its logarithm value (return an XTensor structure) /* set every entry to its logarithm value (return an XTensor structure)
make a new tensor to keep the result and return it */ make a new tensor to keep the result and return it */
XTensor Log(const XTensor & a); XTensor Log(const XTensor & a);
/* set every entry to its logarithm value */
void Log(const XTensor & a, XTensor & b, bool requireLink = false);
/* set every entry to its round value */ /* set every entry to its round value */
void _Round(const XTensor * a, XTensor * b); void _Round(const XTensor * a, XTensor * b);
...@@ -97,6 +111,8 @@ void _RoundMe(XTensor * a); ...@@ -97,6 +111,8 @@ void _RoundMe(XTensor * a);
/* set every entry to its round value (return an XTensor structure) /* set every entry to its round value (return an XTensor structure)
make a new tensor to keep the result and return it */ make a new tensor to keep the result and return it */
XTensor Round(const XTensor & a); XTensor Round(const XTensor & a);
/* set every entry to its round value */
void Round(const XTensor & a, XTensor & b, bool requireLink = false);
/* set every entry to its sqrt value */ /* set every entry to its sqrt value */
void _Sqrt(const XTensor * a, XTensor * b); void _Sqrt(const XTensor * a, XTensor * b);
...@@ -106,6 +122,8 @@ void _SqrtMe(XTensor * a); ...@@ -106,6 +122,8 @@ void _SqrtMe(XTensor * a);
/* set every entry to its sqrt value (return an XTensor structure) /* set every entry to its sqrt value (return an XTensor structure)
make a new tensor to keep the result and return it */ make a new tensor to keep the result and return it */
XTensor Sqrt(const XTensor & a); XTensor Sqrt(const XTensor & a);
/* set every entry to its sqrt value */
void Sqrt(const XTensor & a, XTensor & b, bool requireLink = false);
/* set every entry to its square value */ /* set every entry to its square value */
void _Square(const XTensor * a, XTensor * b); void _Square(const XTensor * a, XTensor * b);
...@@ -115,6 +133,8 @@ void _SquareMe(XTensor * a); ...@@ -115,6 +133,8 @@ void _SquareMe(XTensor * a);
/* set every entry to its square value (return an XTensor structure) /* set every entry to its square value (return an XTensor structure)
make a new tensor to keep the result and return it */ make a new tensor to keep the result and return it */
XTensor Square(const XTensor & a); XTensor Square(const XTensor & a);
/* set every entry to its square value */
void Square(const XTensor & a, XTensor & b, bool requireLink = false);
/* set every entry to its sine value */ /* set every entry to its sine value */
...@@ -125,6 +145,8 @@ void _SinMe(XTensor * a); ...@@ -125,6 +145,8 @@ void _SinMe(XTensor * a);
/* set every entry to its sine value (return an XTensor structure) /* set every entry to its sine value (return an XTensor structure)
make a new tensor to keep the result and return it */ make a new tensor to keep the result and return it */
XTensor Sin(const XTensor & a); XTensor Sin(const XTensor & a);
/* set every entry to its sine value */
void Sin(const XTensor & a, XTensor & b, bool requireLink = false);
/* set every entry to its cosine value */ /* set every entry to its cosine value */
void _Cos(const XTensor * a, XTensor * b); void _Cos(const XTensor * a, XTensor * b);
...@@ -134,6 +156,8 @@ void _CosMe(XTensor * a); ...@@ -134,6 +156,8 @@ void _CosMe(XTensor * a);
/* set every entry to its cosine value (return an XTensor structure) /* set every entry to its cosine value (return an XTensor structure)
make a new tensor to keep the result and return it */ make a new tensor to keep the result and return it */
XTensor Cos(const XTensor & a); XTensor Cos(const XTensor & a);
/* set every entry to its cosine value */
void Cos(const XTensor & a, XTensor & b, bool requireLink = false);
/* set every entry to its tangent value */ /* set every entry to its tangent value */
void _Tan(const XTensor * a, XTensor * b); void _Tan(const XTensor * a, XTensor * b);
...@@ -143,6 +167,8 @@ void _TanMe(XTensor * a); ...@@ -143,6 +167,8 @@ void _TanMe(XTensor * a);
/* set every entry to its tangent value (return an XTensor structure) /* set every entry to its tangent value (return an XTensor structure)
make a new tensor to keep the result and return it */ make a new tensor to keep the result and return it */
XTensor Tan(const XTensor & a); XTensor Tan(const XTensor & a);
/* set every entry to its tangent value */
void Tan(const XTensor & a, XTensor & b, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -229,10 +229,10 @@ XTensor CopyIndexed(const XTensor & s, int dim, ...@@ -229,10 +229,10 @@ XTensor CopyIndexed(const XTensor & s, int dim,
/* call _CopyIndexed function */ /* call _CopyIndexed function */
_CopyIndexed(&s, &t, dim, &srcIndex, &tgtIndex, copyNum); _CopyIndexed(&s, &t, dim, &srcIndex, &tgtIndex, copyNum);
XList list(3); TensorList list(3);
list.Add(&s); list.Add((XTensor*)&s);
list.Add(&srcIndex); list.Add((XTensor*)&srcIndex);
list.Add(&tgtIndex); list.Add((XTensor*)&tgtIndex);
/* tensor connection */ /* tensor connection */
XLink::MakeLink(&list, &t, MOVEMENT_COPYINDEXED); XLink::MakeLink(&list, &t, MOVEMENT_COPYINDEXED);
......
...@@ -131,4 +131,43 @@ XTensor ReduceMax(const XTensor &input, int dim) ...@@ -131,4 +131,43 @@ XTensor ReduceMax(const XTensor &input, int dim)
return output; return output;
} }
/*
get the max value of the items along a dimension of the tensor
>> input - the input tensor
>> output - the output tensor
>> dim - the dimension along which the reduction is performed
>> requireLink - whether to add the operation to the network
*/
void ReduceMax(const XTensor &input, XTensor &output, int dim, bool requireLink)
{
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
if (!output.isInit || !XTensor::IsReduceShaped(&input, &output, dim)) {
int order = input.order - 1;
int * dimSize = new int[order];
for (int i = 0; i < order; i++) {
if (i < dim)
dimSize[i] = input.dimSize[i];
else if (i >= dim)
dimSize[i] = input.dimSize[i + 1];
}
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
InitTensor(&output, order, dimSize, input.dataType, dr, input.devID, input.mem);
/* destroy variables */
delete[] dimSize;
}
/* call _ReduceMax function */
_ReduceMax(&input, &output, dim);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCEMAX);
XLink::AddParamToHeadInt(&output, dim);
}
}
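A sketch of the shape behavior (helpers assumed): reducing dimension 1 of a 4 x 5 tensor yields an order-1 tensor of size 4:

    XTensor x, m;
    InitTensor2D(&x, 4, 5);
    x.SetDataRand(-1.0F, 1.0F);

    ReduceMax(x, m, 1, true);            /* m[i] = max_j x[i][j]; m is initialized inside the call */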
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
...@@ -35,6 +35,9 @@ make a new tensor to keep the result and return it ...@@ -35,6 +35,9 @@ make a new tensor to keep the result and return it
*/ */
XTensor ReduceMax(const XTensor &input, int dim); XTensor ReduceMax(const XTensor &input, int dim);
/* get the max value of the items along a dimension of the tensor. */
void ReduceMax(const XTensor &input, XTensor &output, int dim, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __REDUCEMAX_H__ #endif // __REDUCEMAX_H__
...@@ -86,4 +86,45 @@ XTensor ReduceMean(const XTensor &input, int dim) ...@@ -86,4 +86,45 @@ XTensor ReduceMean(const XTensor &input, int dim)
return output; return output;
} }
/*
get the mean value along a dimension of the tensor
For a 1-dimensional data array a, mean = (1/n) * \sum_i a_i
>> input - the input tensor
>> output - the output tensor
>> dim - the dimension along which the reduction is performed
>> requireLink - whether to add the operation to the network
*/
void ReduceMean(const XTensor &input, XTensor &output, int dim, bool requireLink)
{
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
if (!output.isInit || !XTensor::IsReduceShaped(&input, &output, dim)) {
int order = input.order - 1;
int * dimSize = new int[order];
for (int i = 0; i < order; i++) {
if (i < dim)
dimSize[i] = input.dimSize[i];
else if (i >= dim)
dimSize[i] = input.dimSize[i + 1];
}
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
InitTensor(&output, order, dimSize, input.dataType, dr, input.devID, input.mem);
/* destroy variables */
delete[] dimSize;
}
/* call _ReduceMean function */
_ReduceMean(&input, &output, dim);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCEMEAN);
XLink::AddParamToHeadInt(&output, dim);
}
}
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
...@@ -39,6 +39,12 @@ For a 1-dimensional data array a, mean = (1/n) * sum_i input_i ...@@ -39,6 +39,12 @@ For a 1-dimensional data array a, mean = (1/n) * sum_i input_i
*/ */
XTensor ReduceMean(const XTensor &input, int dim); XTensor ReduceMean(const XTensor &input, int dim);
/*
get the mean value along a dimension of the tensor
For a 1-dimensional data array a, mean = (1/n) * \sum_i a_i
*/
void ReduceMean(const XTensor &input, XTensor &output, int dim, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __REDUCEMEAN_H__ #endif // __REDUCEMEAN_H__
...@@ -244,6 +244,39 @@ XTensor ReduceSum(const XTensor &input, int dim, const XTensor &shift, DTYPE pow ...@@ -244,6 +244,39 @@ XTensor ReduceSum(const XTensor &input, int dim, const XTensor &shift, DTYPE pow
return output; return output;
} }
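/*
sum the items along a dimension of the tensor
For a 1-dimensional data array a,
sum = \sum_i (a_i - shift)^power if isExp == false
sum = \sum_i exp((a_i - shift)^power) if isExp == true
>> input - the input tensor
>> output - the output tensor
>> dim - the dimension along which the reduction is performed
>> shift - shift the input
>> power - we perform pow(item_i, power) on each item in the array
>> isExp - specify whether exp() is performed
>> requireLink - whether to add the operation to the network
*/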
void ReduceSum(const XTensor &input, XTensor &output, int dim, const XTensor &shift, DTYPE power, bool isExp, bool requireLink)
{
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
if (!output.isInit || !XTensor::IsReduceShaped(&input, &output, dim)) {
int order = input.order - 1;
int * dimSize = new int[order];
for (int i = 0; i < order; i++) {
if (i < dim)
dimSize[i] = input.dimSize[i];
else if (i >= dim)
dimSize[i] = input.dimSize[i + 1];
}
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
InitTensor(&output, order, dimSize, input.dataType, dr, input.devID, input.mem);
/* destroy variables */
delete[] dimSize;
}
/* call _ReduceSum function */
_ReduceSum(&input, &output, dim, &shift, power, isExp);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&input, &shift, &output, REDUCE_REDUCESUM);
XLink::AddParamToHeadInt(&output, dim);
XLink::AddParamToHead(&output, power);
XLink::AddParamToHeadBool(&output, isExp);
}
}
/* /*
sum the items along a dimension of the tensor (return an XTensor structure) sum the items along a dimension of the tensor (return an XTensor structure)
make a new tensor to keep the result and return it make a new tensor to keep the result and return it
...@@ -290,4 +323,52 @@ XTensor ReduceSum(const XTensor &input, int dim, DTYPE power, bool isExp) ...@@ -290,4 +323,52 @@ XTensor ReduceSum(const XTensor &input, int dim, DTYPE power, bool isExp)
return output; return output;
} }
/*
sum the items along a dimension of the tensor
For a 1-dimensional data array a,
sum = \sum_i a_i^power if isExp == false
sum = \sum_i exp(a_i^power) if isExp == true
(this overload takes no shift input; it calls _ReduceSum with shift = NULL)
>> input - the input tensor
>> output - the output tensor
>> dim - the dimension along which the reduction is performed
>> power - we perform pow(item_i, power) on each item in the array
>> isExp - specify whether exp() is performed
>> requireLink - whether to add the operation to the network
*/
void ReduceSum(const XTensor &input, XTensor &output, int dim, DTYPE power, bool isExp, bool requireLink)
{
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
if (!output.isInit || !XTensor::IsReduceShaped(&input, &output, dim)) {
int order = input.order - 1;
int * dimSize = new int[order];
for (int i = 0; i < order; i++) {
if (i < dim)
dimSize[i] = input.dimSize[i];
else if (i >= dim)
dimSize[i] = input.dimSize[i + 1];
}
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
InitTensor(&output, order, dimSize, input.dataType, dr, input.devID, input.mem);
/* destroy variables */
delete[] dimSize;
}
/* call _ReduceSum function */
_ReduceSum(&input, &output, dim, NULL, power, isExp);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCESUM);
XLink::AddParamToHeadInt(&output, dim);
XLink::AddParamToHead(&output, power);
XLink::AddParamToHeadBool(&output, isExp);
}
}
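As a usage sketch, the isExp path gives the softmax denominator in one call (helpers assumed):

    XTensor x, z;
    InitTensor2D(&x, 4, 5);
    x.SetDataRand(-1.0F, 1.0F);

    /* z[i] = \sum_j exp(x[i][j]): power = 1 and isExp = true */
    ReduceSum(x, z, 1, (DTYPE)1.0, true, true);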
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)