Commit 314f4370 by liyinqiao

Merge branch 'liyinqiao' into xiaotong-working

# Conflicts:
#	source/tensor/XDevice.cpp
#	source/tensor/XMem.cpp
#	source/tensor/XTensor.cpp
parents 7c17670d 58181c8d
......@@ -20,7 +20,9 @@
*/
#include "XBackwardLoss.h"
#include "XNoder.h"
#include "../tensor/XName.h"
#include "../tensor/function/FHeader.h"
#include "../tensor/core/getandset/SetData.h"
#include "../tensor/function/HardTanH.h"
#include "../tensor/function/Identity.h"
......@@ -31,6 +33,60 @@
namespace nts{
/* compute dE/dx of a node */
void XLossGrad::MakeGrad(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
int operID = income.typeID;
CheckNTErrors(income.tailNum >= 1, "Wrong number of tensors for loss computation!");
XTensor * output = income.tails[0];
XTensor * gold = NULL;
XTensor * weight = NULL;
XTensor * padding = NULL;
int leadingDim;
XNoder::MakeGrad(output);
XTensor * dedy = output->grad;
if (income.tailNum == 1) {
if(dedy->dataType == X_FLOAT)
_SetDataFixedFloat(dedy, 1.0F);
else if(dedy->dataType == X_DOUBLE)
_SetDataFixedDouble(dedy, 1.0);
else if(dedy->dataType == X_INT)
_SetDataFixedInt(dedy, 1);
else
ShowNTErrors("TODO");
return;
}
gold = income.tails[1];
if(operID == LOSS_CROSSENTROPY) {
if (income.tailNum == 3)
padding = income.tails[2];
leadingDim = income.GetParamInt(0);
CheckNTErrors(leadingDim >= 0 && leadingDim < output->order, "Wrong leading dimension for cross entropy!");
_CrossEntropyBackward(dedy, output, gold, weight, padding, leadingDim);
}
else{
ShowNTErrors("Wrong activation function type!");
}
node->visitMark = NODE_FINISHED;
}
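For reference, the rule that _CrossEntropyBackward applies can be written out as a minimal scalar sketch; this assumes the plain cross-entropy loss E = -sum_i gold[i] * log(y[i]) and ignores the optional weight and padding arguments used above:
/* minimal sketch (not the library implementation) of the backward rule
   for E = -sum_i gold[i] * log(y[i]) */
void CrossEntropyBackwardSketch(float * dedy, const float * y,
                                const float * gold, int n)
{
    for (int i = 0; i < n; i++)
        dedy[i] = -gold[i] / y[i];   /* dE/dy[i] */
}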
/* indicates whether the node is for a loss computation */
bool XLossGrad::IsLossOP(XTensor * node)
{
XLink &income = node->income;
return (income.typeID & LOSS_BASE) != 0;
}
/*
compute dE/dx for a given function y = f(x)
>> gold - gold standard to measure error (or loss)
......
......@@ -23,6 +23,7 @@
#include "../tensor/XTensor.h"
#include "../tensor/function/FHeader.h"
#include "../tensor/loss/LHeader.h"
#ifndef __XBACKWARDLOSS_H__
#define __XBACKWARDLOSS_H__
......@@ -34,6 +35,14 @@ namespace nts{
class XLossGrad
{
public:
/* compute dE/dx of a node */
static
void MakeGrad(XTensor * node, bool isEfficient);
/* indicates whether the node is for a loss computation */
static
bool IsLossOP(XTensor * node);
/* compute dE/dx for a given function y = f(x) */
void Compute(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx, XTensor * padding,
......
......@@ -81,6 +81,12 @@ void XMathGrad::MakeGrad(XTensor * node, bool isEfficient)
GradPower(node, isEfficient);
else if(operID == MATH_SCALEANDSHIFT)
GradScaleAndShift(node, isEfficient);
else if(operID == MATH_SCALE)
GradScale(node, isEfficient);
else if(operID == MATH_DESCALE)
GradDescale(node, isEfficient);
else if(operID == MATH_SHIFT)
GradShift(node, isEfficient);
else if(operID == MATH_SUB)
GradSub(node, isEfficient);
else if(operID == MATH_SUBDIM)
......@@ -719,12 +725,18 @@ void XMathGrad::GradMultiply(XTensor * node, bool isEfficient)
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
CheckNTErrors(XTensor::IsSameShaped(a, b), "Wrong sized input tensors!");
if (!isEfficient || a->isGrad) {
XNoder::MakeGrad(a);
_Multiply(node->grad, b, a->grad, 1.0F);
}
if (!isEfficient || b->isGrad) {
XNoder::MakeGrad(b);
_Multiply(node->grad, a, b->grad, 1.0F);
}
node->visitMark = NODE_FINISHED;
}
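The element-wise product rule behind the code above, written as a minimal scalar sketch; the efficient mode simply skips an operand whose gradient is not required:
/* sketch of the product rule for c = a * b (element-wise):
       dE/da += dE/dc * b,    dE/db += dE/dc * a            */
void MultiplyBackwardSketch(float * aGrad, float * bGrad,
                            const float * cGrad,
                            const float * a, const float * b, int n)
{
    for (int i = 0; i < n; i++) {
        aGrad[i] += cGrad[i] * b[i];
        bGrad[i] += cGrad[i] * a[i];
    }
}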
......@@ -888,88 +900,8 @@ gradient for normalize
*/
void XMathGrad::GradNormalize(XTensor * node, bool isEfficient)
{
ShowNTErrors("This is really a bad piece of code!!!");
XLink &income = node->income;
CheckNTErrors(income.tailNum == 5, "Wrong input tensor number for NORMALIZE!");
XTensor * input = income.tails[0];
XTensor * mean = income.tails[1];
XTensor * var = income.tails[2];
XTensor * a = income.tails[3];
XTensor * b = income.tails[4];
XTensor * c = NewTensor(var);
XTensor * d = NewTensor(a);
XTensor * e = NewTensor(a);
XTensor * f = NewTensor(a);
XTensor * g = NewTensor(a);
XTensor * h = NewTensor(a);
XTensor * i = NewTensor(a);
XTensor * j = NewTensor(a);
XTensor * k = NewTensor(var);
XTensor * p = NewTensor(var);
XTensor * q = NewTensor(var);
XTensor * r = NewTensor(a);
XTensor * x = NewTensor(mean);
XTensor * y = NewTensor(mean);
XTensor * z = NewTensor(mean);
DTYPE epsilon = income.GetParam(1);
int dim = income.GetParamInt(0);
int n = a->GetDim(dim);
XNoder::MakeGrad(input);
XNoder::MakeGrad(mean);
XNoder::MakeGrad(var);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
/* dEdinput */
_ScaleAndShift(var, c, 1.0F, epsilon);
_Unsqueeze(c, d, dim, n);
_Power(d, e, -0.5F);
_Multiply(a, e, f);
_Multiply(node->grad, f, input->grad, 1.0F);
/* dEdmean */
_ScaleAndShift(f, g, -1.0F);
_ReduceSum(g, x, dim);
_ReduceSum(node->grad, y, dim);
_Multiply(y, x, mean->grad, 1.0F);
/* dEdvar */
_Unsqueeze(mean, h, dim, n);
_Sub(input, h, i);
_Multiply(a, i, j);
_Power(var, k, -1.5F);
_ScaleAndShift(k, p, -0.5F);
_ReduceSum(j, z, dim);
_Multiply(z, p, q);
_Multiply(y, q, var->grad, 1.0F);
/* dEda */
_Multiply(i, e, r);
_Multiply(node->grad, r, a->grad, 1.0F);
/* dEdb */
_Sum(b->grad, node->grad, b->grad);
node->visitMark = NODE_FINISHED;
ShowNTErrors("TODO!");
delete c;
delete d;
delete e;
delete f;
delete g;
delete h;
delete i;
delete j;
delete k;
delete p;
delete q;
delete r;
delete x;
delete y;
delete z;
}
/*
......@@ -1030,6 +962,82 @@ void XMathGrad::GradScaleAndShift(XTensor * node, bool isEfficient)
}
/*
gradient for Scale
for
c = a * scale
we have
dE/da = dE/dc * scale
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradScale(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SCALE!");
XTensor * a = income.tails[0];
DTYPE scale = income.GetParam(0);
XNoder::MakeGrad(a);
_Sum(a->grad, node->grad, a->grad, scale);
node->visitMark = NODE_FINISHED;
}
/*
gradient for Descale
for
c = a / descale
we have
dE/da = dE/dc / descale
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradDescale(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for DESCALE!");
XTensor * a = income.tails[0];
DTYPE descale = income.GetParam(0);
XNoder::MakeGrad(a);
_Sum(a->grad, node->grad, a->grad, 1.0F / descale);
node->visitMark = NODE_FINISHED;
}
/*
gradient for Shift
for
c = a + shift
we have
dE/da = dE/dc
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
an efficient manner
*/
void XMathGrad::GradShift(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SHIFT!");
XTensor * a = income.tails[0];
XNoder::MakeGrad(a);
_Sum(a->grad, node->grad, a->grad);
node->visitMark = NODE_FINISHED;
}
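The three rules above reduce to a simple accumulation into the input gradient; a scalar sketch, assuming one input a and one output c (illustrative only):
/* c = a * scale    ->  dE/da += dE/dc * scale
   c = a / descale  ->  dE/da += dE/dc / descale
   c = a + shift    ->  dE/da += dE/dc                   */
void ScaleBackwardSketch(float * aGrad, const float * cGrad, int n, float scale)
{
    /* the Scale case; Descale uses 1/descale and Shift uses 1 as the factor */
    for (int i = 0; i < n; i++)
        aGrad[i] += cGrad[i] * scale;
}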
/*
gradient for minus
for
c = a - b * \beta
......
......@@ -130,6 +130,18 @@ private:
static
void GradScaleAndShift(XTensor * node, bool isEfficient);
/* gradient for Scale */
static
void GradScale(XTensor * node, bool isEfficient);
/* gradient for Shift */
static
void GradShift(XTensor * node, bool isEfficient);
/* gradient for Descale */
static
void GradDescale(XTensor * node, bool isEfficient);
/* gradient for Minus */
static
void GradSub(XTensor * node, bool isEfficient);
......
......@@ -43,6 +43,8 @@ void XShapeGrad::MakeGrad(XTensor * node, bool isEfficent)
GradCopyIndexed(node, isEfficent);
else if(operID == MOVEMENT_GATHER)
GradGather(node, isEfficent);
else if (operID == MOVEMENT_DROPOUTWITHINDEX)
GradDropoutWithIndex(node, isEfficent);
else if(operID == SHAPE_MERGE)
GradMerge(node, isEfficent);
else if(operID == SHAPE_MERGE_LIST)
......@@ -115,7 +117,7 @@ dE/da = spreadforgather(b)
void XShapeGrad::GradGather(XTensor * node, bool isEfficent)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for CopyIndexed!");
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for Gather!");
XTensor * input = income.tails[0];
XTensor * index = income.tails[1];
......@@ -127,6 +129,43 @@ void XShapeGrad::GradGather(XTensor * node, bool isEfficent)
}
/*
gradient computation for DropoutWithIndex function
*/
void XShapeGrad::GradDropoutWithIndex(XTensor * node, bool isEfficent)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for DropoutWithIndex!");
XTensor * input = income.tails[0];
XTensor * index = income.tails[1];
DTYPE scale = income.GetParam(0);
XNoder::MakeGrad(input);
//_Identity(node->grad, input->grad);
_CopyValues(node->grad, input->grad);
int order = node->grad->order;
int * dimSize = new int[order];
for (int i = 0; i < order; i++) {
dimSize[i] = node->grad->dimSize[i];
}
int order1 = 1;
int * dimSize1 = new int[order1];
dimSize1[0] = input->grad->unitNum;
input->grad->Reshape(order1, dimSize1);
_DropoutWithIndex(node->grad, index, input->grad);
_ScaleAndShiftMe(input->grad, scale);
input->grad->Reshape(order, dimSize);
node->visitMark = NODE_FINISHED;
}
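A rough sketch of what the backward pass above computes, assuming index lists the positions dropped in the forward pass and scale is the usual 1/(1-p) factor (both assumptions); only the reshaping of input->grad differs in the real code:
/* dE/dx = scale * dE/dy, with the dropped positions zeroed out */
void DropoutWithIndexBackwardSketch(float * dedx, const float * dedy,
                                    const int * index, int indexNum,
                                    int n, float scale)
{
    for (int i = 0; i < n; i++)
        dedx[i] = dedy[i];              /* _CopyValues        */
    for (int k = 0; k < indexNum; k++)
        dedx[index[k]] = 0.0F;          /* _DropoutWithIndex  */
    for (int i = 0; i < n; i++)
        dedx[i] *= scale;               /* _ScaleAndShiftMe   */
}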
/*
gradient for merge
for
c = merge(a_0, a_1, ...)
......@@ -232,8 +271,8 @@ void XShapeGrad::GradMergeList(XTensor * node, bool isEfficient)
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for MERGE!");
XTensor * last = NULL;
XList smalls(income.tailNum);
XList smallsGrad(income.tailNum);
TensorList smalls(income.tailNum);
TensorList smallsGrad(income.tailNum);
bool mergeOnly = true;
for(int i = 0; i < income.tailNum; i++){
XTensor * tail = income.tails[i];
......@@ -401,7 +440,7 @@ void XShapeGrad::GradSplitListPost(XTensor * node, bool isEfficient)
/* we compute the gradient for current node, rather than for
child node, i.e., we use the outgoing edge here */
XLink &outgo = node->outgo;
XList splits(outgo.tailNum);
TensorList splits(outgo.tailNum);
int whereToSplit = -1;
int splitNum = 0;
......
......@@ -54,6 +54,10 @@ private:
static
void GradGather(XTensor * node, bool isEfficent);
/* gradient computation for dropout with index: b = dropoutwithindex(a, index) */
static
void GradDropoutWithIndex(XTensor * node, bool isEfficent);
/* gradient computation for merge: c = merge(a, b, ...) */
static
void GradMerge(XTensor * node, bool isEfficent);
......
......@@ -55,7 +55,7 @@ void XNetClearAll()
XNet::XNet()
{
nodes.Clear();
isGradEfficient = true;
isGradEfficient = false;
}
/* de-constructor */
......@@ -79,13 +79,13 @@ backward propagation to obtain gradient
*/
void XNet::Backward(XTensor &root, LOSS_FUNCTION_NAME loss)
{
XList roots(1);
TensorList roots(1);
roots.Add(&root);
XList golds(1);
TensorList golds(1);
golds.Add(NULL);
XList paddings(1);
TensorList paddings(1);
paddings.Add(NULL);
Backward(roots, golds, paddings, loss);
......@@ -99,13 +99,13 @@ backward propagation to obtain gradient wrt. the loss/error function
*/
void XNet::Backward(XTensor &root, XTensor &gold, LOSS_FUNCTION_NAME loss)
{
XList roots(1);
TensorList roots(1);
roots.Add(&root);
XList golds(1);
TensorList golds(1);
golds.Add(&gold);
XList paddings(1);
TensorList paddings(1);
paddings.Add(NULL);
Backward(roots, golds, paddings, loss);
......@@ -120,13 +120,13 @@ backward propagation to obtain gradient wrt. the loss/error function
*/
void XNet::Backward(XTensor &root, XTensor &gold, XTensor &padding, LOSS_FUNCTION_NAME loss)
{
XList roots(1);
TensorList roots(1);
roots.Add(&root);
XList golds(1);
TensorList golds(1);
golds.Add(&gold);
XList paddings(1);
TensorList paddings(1);
paddings.Add(&padding);
Backward(roots, golds, paddings, loss);
......@@ -138,10 +138,10 @@ with a number of root nodes
>> roots - a list of root nodes (output) of the network
>> loss - name of loss function
*/
void XNet::Backward(XList &roots, LOSS_FUNCTION_NAME loss)
void XNet::Backward(TensorList &roots, LOSS_FUNCTION_NAME loss)
{
XList golds(roots.count);
XList paddings(roots.count);
TensorList golds(roots.count);
TensorList paddings(roots.count);
for (int i = 0; i < roots.count; i++) {
golds.Add(NULL);
paddings.Add(NULL);
......@@ -157,9 +157,9 @@ with a number of root nodes
>> golds - a list of gold standard for the output
>> loss - name of loss function
*/
void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
void XNet::Backward(TensorList &roots, TensorList &golds, LOSS_FUNCTION_NAME loss)
{
XList paddings(roots.count);
TensorList paddings(roots.count);
for (int i = 0; i < roots.count; i++)
paddings.Add(NULL);
......@@ -174,7 +174,7 @@ with a number of root nodes
>> paddings - specify a target value that is ignored
>> loss - name of loss function
*/
void XNet::Backward(XList &roots, XList &golds, XList &paddings, LOSS_FUNCTION_NAME loss)
void XNet::Backward(TensorList &roots, TensorList &golds, TensorList &paddings, LOSS_FUNCTION_NAME loss)
{
Traverse(roots);
......@@ -190,18 +190,18 @@ void XNet::Backward(XList &roots, XList &golds, XList &paddings, LOSS_FUNCTION_N
XLossGrad lossGrad;
/* we start with the gradient with respect to the loss for output layers */
for(int i = 0; i < roots.count; i++){
/*for(int i = 0; i < roots.count; i++){
XTensor * root = (XTensor*)roots.Get(i);
XTensor * gold = (XTensor*)golds.Get(i);
XTensor * padding = (XTensor*)paddings.Get(i);
XLink &income = root->income;
int funcID = income.typeID;
void * params = income.params;
void * params = income.params;*/
/* we compute dE/dx if the output is generated by an activation function y = f(x).
Note that we do not need to obtain dE/dy here because it is of no use in the
following process of back-propagation */
if(gold != NULL && income.tailNum == 1 && (funcID & FUNCTION_BASE)){
/*if(gold != NULL && income.tailNum == 1 && (funcID & FUNCTION_BASE)){
if(funcID == FUNC_LOGSOFTMAX || funcID == FUNC_SOFTMAX) {
XTensor * x = income.tails[0];
XNoder::MakeGrad(x);
......@@ -212,13 +212,13 @@ void XNet::Backward(XList &roots, XList &golds, XList &paddings, LOSS_FUNCTION_N
XNoder::MakeGrad(root);
lossGrad.Compute(gold, root, root->grad, padding, loss);
}
}
}*/
/* we compute dE/dy (y is the output) if no predefined activation function is used */
else{
/*else{
XNoder::MakeGrad(root);
lossGrad.Compute(gold, root, root->grad, NULL, loss);
}
}
}*/
/* back-propagation from output to input */
for(int i = nodes.count - 1; i >= 0; i--){
......@@ -266,6 +266,8 @@ void XNet::BackwardNode(XTensor * node, bool isEfficent)
XFuncGrad::MakeGrad(node, isEfficent);
else if(XShapeGrad::IsShapeOP(node))
XShapeGrad::MakeGrad(node, isEfficent);
else if(XLossGrad::IsLossOP(node))
XLossGrad::MakeGrad(node, isEfficent);
else{
ShowNTErrors("Wrong node type!");
}
......@@ -300,7 +302,7 @@ depth-first search (Tarjan's algorithm)
*/
void XNet::Traverse(XTensor &root)
{
XList roots(1);
TensorList roots(1);
roots.Add(&root);
Traverse(roots);
......@@ -311,7 +313,7 @@ traverse the net and find the topological order by
depth-first search (Tarjan's algorithm)
>> roots - a list of roots (or output nodes)
*/
void XNet::Traverse(XList &roots)
void XNet::Traverse(TensorList &roots)
{
id = MakeNetID();
nodes.Clear();
......@@ -336,7 +338,7 @@ depth-first search given a node (Tarjan's algorithm for topological ordering)
>> orders - topological order of the nodes
>> code - code of the network
*/
void XNet::TarjanVisit(XTensor * node, XList &orders, const unsigned int code)
void XNet::TarjanVisit(XTensor * node, TensorList &orders, const unsigned int code)
{
if(node == NULL)
return;
......@@ -444,7 +446,7 @@ show network topology
*/
void XNet::ShowNetwork(FILE * file, XTensor * node)
{
XList roots(1);
TensorList roots(1);
roots.Add(node);
Traverse(roots);
......@@ -464,9 +466,9 @@ search for a node in a top-down manner by its name
>> top - the top most node
<< return - the node we found
*/
XTensor * XNet::SearchNode(XTensor * top, const char * name)
{
return XLink::SearchNode(top, name);
}
//XTensor * XNet::SearchNode(XTensor * top, const char * name)
//{
//return XLink::SearchNode(top, name);
//}
}
......@@ -23,6 +23,7 @@
#include "../tensor/XTensor.h"
#include "../tensor/function/FHeader.h"
#include "../tensor/loss/LHeader.h"
#ifndef __XNET_H__
#define __XNET_H__
......@@ -36,16 +37,16 @@ struct XNet
unsigned int id;
/* tensor nodes of the network (in order) */
XList nodes;
TensorList nodes;
/* tensor nodes to keep gradient for output (e.g., SGD)*/
XList gradNodes;
TensorList gradNodes;
/* output nodes of the network */
XList outputs;
TensorList outputs;
/* input nodes of the network */
XList inputs;
TensorList inputs;
/* indicates whether the network just keeps the gradient for parameter tensors */
bool isGradEfficient;
......@@ -70,15 +71,15 @@ struct XNet
/* backward propagation to obtain gradient
with a number of root nodes */
void Backward(XList &roots, LOSS_FUNCTION_NAME loss = NOLOSS);
void Backward(TensorList &roots, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward propagation to obtain gradient
with a number of root nodes */
void Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss = NOLOSS);
void Backward(TensorList &roots, TensorList &golds, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward propagation to obtain gradient wrt. the loss/error function
with a number of root nodes */
void Backward(XList &roots, XList &golds, XList &paddings, LOSS_FUNCTION_NAME loss = NOLOSS);
void Backward(TensorList &roots, TensorList &golds, TensorList &paddings, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward computation for a given node */
void BackwardNode(XTensor * node, bool isEfficent = false);
......@@ -92,10 +93,10 @@ struct XNet
/* traverse the net and find the topological order by
depth-first search (Tarjan's algorithm) */
void Traverse(XList &roots);
void Traverse(TensorList &roots);
/* depth-first search given a node (Tarjan's algorithm for topological ordering) */
void TarjanVisit(XTensor * node, XList &orders, const unsigned int code);
void TarjanVisit(XTensor * node, TensorList &orders, const unsigned int code);
/* dump network information */
void Dump(FILE * file);
......@@ -113,8 +114,8 @@ struct XNet
void ShowNetwork(FILE * file, XTensor * node);
/* search a node in a top-down manner by its name */
static
XTensor * SearchNode(XTensor * top, const char * name);
//static
//XTensor * SearchNode(XTensor * top, const char * name);
};
/* we make a unique id for every tensor */
......
......@@ -247,13 +247,13 @@ void Check(FNNModel &model)
/* make a hard copy of the fnn model */
void Copy(FNNModel &tgt, FNNModel &src)
{
InitTensor(&tgt.embeddingW, &src.embeddingW);
InitTensorV2(&tgt.embeddingW, &src.embeddingW);
for(int i = 0; i < MAX_HIDDEN_NUM; i++){
InitTensor(&tgt.hiddenW[i], &src.hiddenW[i]);
InitTensor(&tgt.hiddenB[i], &src.hiddenB[i]);
InitTensorV2(&tgt.hiddenW[i], &src.hiddenW[i]);
InitTensorV2(&tgt.hiddenB[i], &src.hiddenB[i]);
}
InitTensor(&tgt.outputW, &src.outputW);
InitTensor(&tgt.outputB, &src.outputB);
InitTensorV2(&tgt.outputW, &src.outputW);
InitTensorV2(&tgt.outputB, &src.outputB);
tgt.n = src.n;
tgt.eSize = src.eSize;
......@@ -310,7 +310,7 @@ initialize a 1d tensor using the fnn model setting
*/
void InitModelTensor1D(XTensor &tensor, int num, FNNModel &model)
{
InitTensor1D(&tensor, num, X_FLOAT, model.devID, model.mem);
InitTensor1DV2(&tensor, num, X_FLOAT, model.devID);
}
/*
......@@ -322,7 +322,7 @@ initialize a 2d tensor using the fnn model setting
*/
void InitModelTensor2D(XTensor &tensor, int rowNum, int colNum, FNNModel &model)
{
InitTensor2D(&tensor, rowNum, colNum, X_FLOAT, model.devID, model.mem);
InitTensor2DV2(&tensor, rowNum, colNum, X_FLOAT, model.devID);
}
......@@ -449,6 +449,9 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
/* the gold standard */
XTensor gold;
/* the loss tensor */
XTensor lossTensor;
/* make the input tensor for position i */
for(int i = 0; i < model.n - 1; i++)
MakeWordBatch(inputs[i], ngrams, ngramNum, i, model.vSize, model.devID, model.mem);
......@@ -466,6 +469,8 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
/* forward computation */
Forward(inputs, output, model, net);
/* backward computation to obtain gradients */
Backward(inputs, output, gold, CROSSENTROPY, model, grad, net);
......@@ -483,9 +488,11 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
/* this is implemented by multiply function */
//ForwardAutoDiff(inputs, output, model);
lossTensor = CrossEntropy(output, gold);
/* automatic differentiation */
autoDiffer.Backward(output, gold, CROSSENTROPY);
autoDiffer.Backward(lossTensor);
//autoDiffer.Backward(output, gold, CROSSENTROPY);
/* update model parameters */
Update(model, grad, learningRate, true);
......@@ -494,7 +501,9 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
/* get probabilities */
float prob = GetProb(output, gold);
loss += -prob;
prob = ReduceSumAll(lossTensor);
loss += prob;
wordCount += ngramNum;
wordCountTotal += ngramNum;
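The training step now computes an explicit loss tensor and backpropagates from it; a consolidated sketch of the pattern, reusing the variables of this function (names and ordering are illustrative, not a verbatim copy of the file):
/* forward loss, backward pass, and loss accumulation via a loss tensor */
XTensor lossTensor = CrossEntropy(output, gold);   /* per-position losses     */
autoDiffer.Backward(lossTensor);                   /* grads start at the loss */
float batchLoss = ReduceSumAll(lossTensor);        /* scalar loss             */
loss += batchLoss;                                 /* accumulated as positive */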
......@@ -537,8 +546,8 @@ update the model parameters using the delta rule
*/
void Update(FNNModel &model, FNNModel &grad, float epsilon, bool isNodeGrad)
{
XList paraList(10);
XList gradList(10);
TensorList paraList(10);
TensorList gradList(10);
paraList.Add(&model.outputW);
paraList.Add(&model.outputB);
......@@ -595,14 +604,14 @@ get prediction probabilites of the gold words
float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs)
{
XTensor probs;
InitTensor(&probs, &output);
InitTensorV2(&probs, &output);
/* probs[i,j] = output[i,j] * gold[i,j] */
_Multiply(&output, &gold, &probs);
/* probability of each word */
XTensor wprobs;
InitTensor1D(&wprobs, output.GetDim(0), output.dataType, output.devID, output.mem);
InitTensor1DV2(&wprobs, output.GetDim(0), output.dataType, output.devID);
_ReduceSum(&probs, &wprobs, 1);
if(wordProbs != NULL)
_CopyValues(&wprobs, wordProbs);
......@@ -616,7 +625,7 @@ float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs)
/* probability for the batch */
XTensor result;
InitTensor1D(&result, 1, X_FLOAT, output.devID, output.mem);
InitTensor1DV2(&result, 1, X_FLOAT, output.devID);
_ReduceSum(&probs, &result, 1);
return result.Get1D(0);
......@@ -718,7 +727,7 @@ The indexed cell is set to 1, and 0 otherwise.
void InitZeroOneTensor2D(XTensor &tensor, int rowNum, int colNum, int * rows, int * cols,
int itemNum, int devID, XMem * mem)
{
InitTensor2D(&tensor, rowNum, colNum, X_FLOAT, devID, mem);
InitTensor2DV2(&tensor, rowNum, colNum, X_FLOAT, devID);
tensor.SetZeroAll();
......@@ -765,7 +774,7 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
int batchSize = -1;
int n = model.n;
int depth = model.hDepth;
XList eList(n - 1);
TensorList eList(n - 1);
/* previous n - 1 words */
for(int i = 0; i < n - 1; i++){
......@@ -811,7 +820,7 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
/* make a 2d tensor for the bias term */
XTensor b2D;
InitTensor(&b2D, &s);
InitTensorV2(&b2D, &s);
_Unsqueeze(&b, &b2D, 0, batchSize);
/* introduce bias term:
......@@ -843,7 +852,7 @@ void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net)
_MatrixMul(&h_last, X_NOTRANS, &w, X_NOTRANS, &s);
XTensor b2D;
InitTensor(&b2D, &s);
InitTensorV2(&b2D, &s);
_Unsqueeze(&b, &b2D, 0, batchSize);
_Sum(&s, &b2D, &s);
......@@ -908,8 +917,8 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
XTensor dedsHidden;
XTensor dedxBottom;
if (depth > 0)
InitTensor(&dedsHidden, &dedx);
InitTensor(&dedxBottom, &net.embeddingCat);
InitTensorV2(&dedsHidden, &dedx);
InitTensorV2(&dedxBottom, &net.embeddingCat);
/* back-propagation from top to bottom in the stack of hidden layers
for each layer, h = f(s)
......@@ -943,11 +952,11 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
_CopyValues(&dedx, &gradPassed);
}
XList eList(n - 1);
TensorList eList(n - 1);
/* back-propagation for the embedding layer */
for (int i = 0; i < n - 1; i++) {
XTensor * dedy = NewTensor2D(batchSize, model.eSize, X_FLOAT, model.devID, model.mem);
XTensor * dedy = NewTensor2DV2(batchSize, model.eSize, X_FLOAT, model.devID);
eList.Add(dedy);
}
......@@ -999,7 +1008,7 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model
}
}
InitTensor1D(&words, size, X_INT, model.devID, model.mem);
InitTensor1DV2(&words, size, X_INT, model.devID);
words.SetData(index, size);
embeddingBig = Gather(model.embeddingW, words);
......@@ -1017,7 +1026,8 @@ void ForwardAutoDiff(NGram * ngrams, int batch, XTensor &output, FNNModel &model
hidden = HardTanH(MMul(hidden, model.hiddenW[i]) + model.hiddenB[i]);
/* output layer */
output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
//output = LogSoftmax(MMul(hidden, model.outputW) + model.outputB, 1);
output = Softmax(MMul(hidden, model.outputW) + model.outputB, 1);
}
/*
......@@ -1036,7 +1046,7 @@ void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model)
XTensor hidden;
XTensor b;
XList inputList(n - 1);
TensorList inputList(n - 1);
for(int i = 0; i < n - 1; i++)
inputList.Add(inputs + i);
......@@ -1177,7 +1187,7 @@ void Test(const char * test, const char * result, FNNModel &model)
/* prediction probabilities */
XTensor probs;
InitTensor1D(&probs, ngramNum);
InitTensor1DV2(&probs, ngramNum);
/* get probabilities */
float prob = GetProb(output, gold, &probs);
......
......@@ -127,7 +127,7 @@ XTensor T2TAttention::MakeBig(XTensor &kqv, XTensor &mask, bool isTraining)
XTensor q2;
XTensor v2;
XTensor kqv2;
XList split;
TensorList split;
kqv2 = MMul(kqv, wbig);
......
......@@ -85,7 +85,7 @@ void T2TModel::InitModel(int argc, char ** argv)
if(isMT)
decoder->InitModel(argc, argv, true, 0, devID, mem);
XList params(10);
TensorList params(10);
GetParams(params);
for(int i = 0; i < params.count; i++){
......@@ -403,7 +403,7 @@ void T2TModel::MakeMTMaskDec(XTensor &inputEnc, XTensor &inputDec,
get parameter matrices
>> list - the list that keeps the parameter matrices
*/
void T2TModel::GetParams(XList &list)
void T2TModel::GetParams(TensorList &list)
{
list.Clear();
list.Add(&outputLayer->w);
......@@ -465,7 +465,7 @@ void T2TModel::Dump(const char * fn)
FILE * file = fopen(fn, "wb");
CheckNTErrors(file, "Cannot open the model file");
XList params(100);
TensorList params(100);
GetParams(params);
......@@ -489,7 +489,7 @@ void T2TModel::Read(const char * fn)
FILE * file = fopen(fn, "rb");
CheckNTErrors(file, "Cannot open the model file");
XList params(100);
TensorList params(100);
GetParams(params);
......
......@@ -98,7 +98,7 @@ public:
XTensor &maskDec, XTensor &maskEncDec);
/* get parameter matrices */
void GetParams(XList &list);
void GetParams(TensorList &list);
/* dump the parameters */
void Dump(const char * fn);
......
......@@ -93,9 +93,8 @@ void T2TOutput::Make(XTensor &input, XTensor &output)
{
XTensor &x = input;
output = LogSoftmax(MMul(x, w), -1);
//output = Softmax(MMul(x, w), -1);
//output = LogSoftmax(MMul(x, w), -1);
output = Softmax(MMul(x, w), -1);
output.SetName(OUTPUT_NAME);
}
......
......@@ -174,12 +174,13 @@ void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
_SetDataFixedInt(&first, startSymbol);
/* add a new word into the input sequence of the decoder side */
if(inputLast == NULL){
if (inputLast == NULL) {
inputDec = Identity(first);
}
else{
inputDec = GeneratePaths(s);
inputDec.SetDevice(inputEnc->devID, inputEnc->mem);
inputDec = Concatenate(first, inputDec, inputDec.order - 1);
}
......
......@@ -96,10 +96,10 @@ public:
/* layers on the encoder side. We actually use the encoder output instead
of all hidden layers. */
XList layersEnc;
TensorList layersEnc;
/* layers on the decoder side */
XList layersDec;
TensorList layersDec;
/* list of states */
T2TState * states;
......
......@@ -197,12 +197,13 @@ void T2TSearch::Score(T2TStateBundle * prev, T2TStateBundle * beam)
prob.Reshape(prob.unitNum/outputSize, outputSize);
score.Reshape(score.unitNum/outputSize, outputSize);
probPath.Reshape(score.unitNum/outputSize, outputSize);
probPath.Reshape(score.unitNum / outputSize, outputSize);
probPathPrev.Reshape(probPathPrev.unitNum);
/* the log-scale probability of the entire sequence */
_SumDim(&prob, &probPathPrev, &probPath, 0);
InitTensor(&len, &lenPrev);
InitTensor(&lp, &lenPrev);
......@@ -302,7 +303,7 @@ void T2TSearch::Generate(T2TStateBundle * beam)
/* Then, we do something similar to "preID". For the top-k predictions, we need
to know their indices in the vocabulary. We compute the offset of each prediction
in the vocabulary by dividing it by the vocabulary size and taking the remainder. */
Mod(index, sizeVocab);
_ModMe(index, sizeVocab);
score.Reshape(order, dims);
......@@ -315,18 +316,19 @@ void T2TSearch::Generate(T2TStateBundle * beam)
InitTensor(&indexCPU, index.order, index.dimSize, index.dataType, index.denseRatio, -1);
CopyValues(index, indexCPU);
for(int i = 0; i < indexCPU.unitNum; i++)
for (int i = 0; i < indexCPU.unitNum; i++)
indexCPU.SetInt(i * stride + indexCPU.GetInt(i), i);
CheckNTErrors(XTensor::IsSameShaped(&prob, &probPath), "Wrong tensor shape!");
/* sequence probability and prediction probability of top-k candidates */
/* sequence probability of top-k candidates */
XTensor probPathTopK;
InitTensor(&probPathTopK, &scoreTopK);
XTensor probTopK;
InitTensor(&probTopK, &scoreTopK);
for(int i = 0; i < probPath.order; i++){
for (int i = 0; i < probPath.order; i++) {
dims[i] = probPath.GetDim(i);
dimsTopK[i] = probPathTopK.GetDim(i);
}
......@@ -342,6 +344,7 @@ void T2TSearch::Generate(T2TStateBundle * beam)
probPath.Reshape(order, dims);
probPathTopK.Reshape(order, dimsTopK);
prob.Reshape(order, dims);
probTopK.Reshape(order, dimsTopK);
......@@ -396,7 +399,7 @@ void T2TSearch::Expand(T2TStateBundle * prev, T2TStateBundle * beam)
modification of the states. An alternative is to do this on GPUs but
it needs much more coding work and the speed-up is not obvious. */
for(int i = 0; i < beam->stateNum; i += beamSize){
for(int j = 0; j < beamSize; j++){
for (int j = 0; j < beamSize; j++) {
int k = i + j;
T2TState & state = states[k];
......@@ -413,7 +416,7 @@ void T2TSearch::Expand(T2TStateBundle * prev, T2TStateBundle * beam)
state.nstep = 0;
state.isCompleted = false;
}
else{
else {
state.last = last;
state.pid = state.last->pid;
state.nstep = last->nstep + 1;
......@@ -517,7 +520,7 @@ void T2TSearch::Dump(XTensor * output)
/* we track the state from the end to the beginning */
while(state != NULL){
if(!state->isCompleted)
if (!state->isCompleted)
isCompleted = false;
if (isCompleted)
words[count++] = -1;
......@@ -589,7 +592,7 @@ XTensor T2TSearch::MakeFirstMask(T2TStateBundle * beam)
mask.SetZeroAll();
for (int i = 0; i < mask.unitNum; i++) {
if(i % beamSize != 0)
if (i % beamSize != 0)
mask.Set(-1e9, i);
}
......
......@@ -154,8 +154,8 @@ void T2TTester::Dump(FILE * file, XTensor * output)
{
int seqLength = output->GetDim(-1);
for(int i = 0; i < output->unitNum; i += seqLength){
for(int j = 0; j < seqLength; j++){
for (int i = 0; i < output->unitNum; i += seqLength) {
for (int j = 0; j < seqLength; j++) {
int w = output->GetInt(i + j);
fprintf(file, "%d ", w);
if (w < 0)
......
......@@ -24,6 +24,7 @@
#include "T2TUtility.h"
#include "../../tensor/XUtility.h"
#include "../../tensor/core/CHeader.h"
#include "../../tensor/loss/LHeader.h"
#include "../../network/XNoder.h"
#ifndef WIN32
......@@ -209,13 +210,16 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
labelOnehot = IndexToOnehot(label, vSizeTgt, labelSmoothingP);
/* make paddings for the output */
if (output.GetDim(0) > 0)
PadOutput(&output, &labelOnehot, &paddingDec);
//if (output.GetDim(0) > 0)
//PadOutput(&output, &labelOnehot, &paddingDec);
/* get probabilities */
float prob = GetProb(&output, &labelOnehot, NULL);
//float prob = GetProb(&output, &labelOnehot, NULL);
XTensor lossTensor;
lossTensor = CrossEntropy(output, labelOnehot, paddingDec);
float prob = ReduceSumAll(lossTensor);
DTYPE lossLocal = -prob / wc;
DTYPE lossLocal = prob / wc;
bool doUpdate = (!IsNAN(lossLocal) && !IsINF(lossLocal) && lossLocal < 1e3F);
//XTensor &g = labelSmoothingP > 0 ? goldSmoothed : gold;
......@@ -223,14 +227,15 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
if (doUpdate) {
/* rescale the output for normalized loss */
RescaleOutput(&output, &labelOnehot, &paddingDec);
//RescaleOutput(&output, &labelOnehot, &paddingDec);
/* back-propagation */
net.Backward(output, labelOnehot, paddingDec, CROSSENTROPY);
net.Backward(lossTensor);
//net.Backward(output, labelOnehot, paddingDec, CROSSENTROPY);
//net.Backward(output, label, labelSmoothingP, CROSSENTROPY);
gradStep += 1;
loss += -prob;
loss += prob;
wordCount += wc;
wordCountTotal += wc;
......@@ -260,7 +265,7 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
if (step % 100 == 0) {
double elapsed = GetClockSec() - startT;
XPRINT8(0, stderr, "[INFO] elapsed=%.1fs, step=%d, epoch=%d, tword=%d, sword=%d, loss=%.3f, ppl=%.3f, sppl=%.3f",
elapsed, step, epoch, wordCountTotal, wordCountBatch, loss/wordCount, exp(loss/wordCount), exp(-prob/wc));
elapsed, step, epoch, wordCountTotal, wordCountBatch, loss/wordCount, exp(loss/wordCount), exp(prob/wc));
if (!doUpdate)
XPRINT(0, stderr, " (no update)");
XPRINT(0, stderr, "\n");
......@@ -491,7 +496,7 @@ where
*/
void T2TTrainer::Update(T2TModel * model, const float lr)
{
XList ws(100);
TensorList ws(100);
model->GetParams(ws);
......@@ -552,7 +557,7 @@ void T2TTrainer::PrepareModel(T2TModel * model)
moments.Clear();
moments2nd.Clear();
XList ws(100);
TensorList ws(100);
model->GetParams(ws);
......
......@@ -82,10 +82,10 @@ public:
float adamBeta2T;
/* list of the moment of the parameter matrices */
XList moments;
TensorList moments;
/* list of the 2nd order moment of the parameter matrices */
XList moments2nd;
TensorList moments2nd;
/* indicates whether the data file is shuffled for training */
bool isShuffled;
......
......@@ -30,6 +30,7 @@
#include "XDevice.h"
#include "./test/Test.h"
#include "./core/CHeader.h"
#include "./loss/CrossEntropy.h"
//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
......
......@@ -476,7 +476,7 @@ split a string
>> items - splitting result
<< return - how many items are there
*/
int SplitALine(char * inputString, const char * seperator, XList * items)
int SplitALine(char * inputString, const char * seperator, StrList* items)
{
items->Clear();
......@@ -530,7 +530,7 @@ get device ids for the given device information
*/
int XDevManager::GetDeviceIDs(char * devInfo, int * devIDs)
{
XList * terms = new XList(1);
StrList* terms = new StrList(1);
SplitALine(devInfo, " ", terms);
for(int i = 0; i < terms->count; i++){
......
......@@ -49,7 +49,7 @@ namespace nts {
#ifdef DOUBELPRICSION
#define DTYPE double
#define DTYPE_MIN (DTYPE)1.79E+308
#define DTYPE_MIN (DTYPE)-1.79E+308
#else
#define DTYPE float
#define DTYPE_MIN (DTYPE)-3.40E+38
......
......@@ -300,9 +300,9 @@ void XLink::MakeLink(const XTensor * t1, const XTensor * t2, XTensor * h, int id
if(h == NULL)
return;
XList list(2);
list.Add(t1);
list.Add(t2);
TensorList list(2);
list.Add((XTensor*)t1);
list.Add((XTensor*)t2);
MakeLink(&list, h, id);
}
......@@ -320,10 +320,10 @@ void XLink::MakeLink(const XTensor * t1, const XTensor * t2, const XTensor * t3,
if (h == NULL)
return;
XList list(3);
list.Add(t1);
list.Add(t2);
list.Add(t3);
TensorList list(3);
list.Add((XTensor*)t1);
list.Add((XTensor*)t2);
list.Add((XTensor*)t3);
MakeLink(&list, h, id);
}
......@@ -334,7 +334,7 @@ create a hyper edge with a list of tensors and a output tensor
>> h - head tensor
>> id - id of the edge type
*/
void XLink::MakeLink(const XList * list, XTensor * h, int id)
void XLink::MakeLink(const TensorList * list, XTensor * h, int id)
{
/* forward */
XLink &income = h->income;
......@@ -368,7 +368,7 @@ create a hyper edge with a input tensors and a list of output tensors
>> list - a list of output tensors
>> id - id of the edge type
*/
void XLink::MakeLink(XTensor * t, XList * list, int id)
void XLink::MakeLink(XTensor * t, TensorList * list, int id)
{
/* forward */
for(int i = 0; i < list->count; i++){
......@@ -528,10 +528,90 @@ void XLink::Replace(const XTensor * oldOne, XTensor * newOne)
CheckNTErrors(hit, "No proper node found in parent.income edge!");
}
}
strcpy(newOne->name, oldOne->name);
}
/*
copy a node with another, i.e., we add the links to the new node
>> src - the node to be copied
>> tgt - the new node
*/
void XLink::Copy(const XTensor * reference, XTensor * target)
{
if (reference == NULL || target == NULL)
return;
XLink &newIncome = target->income;
XLink &newOutgo = target->outgo;
XLink::ClearOutgoing(target);
XLink::ClearIncoming(target);
/* incoming nodes */
if (reference->income.typeID != 0) {
if (newIncome.tailNum < reference->income.tailNum) {
delete[] newIncome.tails;
newIncome.tails = new XTensor*[reference->income.tailNum];
}
newIncome.SetType(reference->income.typeID);
newIncome.head = target;
newIncome.tailNum = reference->income.tailNum;
memcpy(newIncome.tails, reference->income.tails, sizeof(XTensor*) * newIncome.tailNum);
int paraArraySize = reference->income.paramNum * reference->income.paramSize;
newIncome.params = new char[paraArraySize];
memcpy(newIncome.params, reference->income.params, paraArraySize);
newIncome.paramNum = reference->income.paramNum;
/* update the link to each child node */
for (int i = 0; i < newIncome.tailNum; i++) {
XTensor * child = newIncome.tails[i];
XLink &childOutgo = child->outgo;
bool hit = false;
for (int j = 0; j < childOutgo.tailNum; j++) {
if (childOutgo.tails[j] == reference) {
//childOutgo.tails[j] = target;
childOutgo.AddTail(target);
hit = true;
break;
}
}
if (childOutgo.tailNum > 0) {
CheckNTErrors(hit, "No proper node found in child.outgo edge!");
}
}
}
if (newOutgo.tailNum < reference->outgo.tailNum) {
delete[] newOutgo.tails;
newOutgo.tails = new XTensor*[reference->outgo.tailNum];
}
/* outgoing nodes */
newOutgo.head = target;
newOutgo.tailNum = reference->outgo.tailNum;
memcpy(newOutgo.tails, reference->outgo.tails, sizeof(XTensor*) * newOutgo.tailNum);
/* update the link to each parent node */
for (int i = 0; i < newOutgo.tailNum; i++) {
XTensor * parent = newOutgo.tails[i];
XLink &parentIncome = parent->income;
bool hit = false;
for (int j = 0; j < parentIncome.tailNum; j++) {
if (parentIncome.tails[j] == reference) {
//parentIncome.tails[j] = target;
parentIncome.AddTail(target);
hit = true;
}
}
if (parentIncome.tailNum > 0) {
CheckNTErrors(hit, "No proper node found in parent.income edge!");
}
}
}
/*
copy incoming edges of a given node
>> reference - the node we copy from
......@@ -544,7 +624,7 @@ void XLink::CopyIncoming(const XTensor * reference, XTensor * target)
ClearIncoming(target);
int tailNum = reference->income.tailNum;
XList tails(tailNum);
TensorList tails(tailNum);
for(int i = 0; i < tailNum; i++){
XTensor * tail = (XTensor*)reference->income.tails[i];
tails.Add(tail);
......
......@@ -33,7 +33,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/* cross reference */
struct XTensor;
#define MAX_OP_NAME_LENGTH 16
#define MAX_OP_NAME_LENGTH 64
#define PARAM_UNTI_SIZE 64
/*
......@@ -144,11 +144,11 @@ struct XLink
/* create a hyper edge with a list of input tensors and a output tensor */
static
void MakeLink(const XList * list, XTensor * h, int id);
void MakeLink(const TensorList * list, XTensor * h, int id);
/* create a hyper edge with a input tensors and a list of output tensors */
static
void MakeLink(XTensor * h, XList * list, int id);
void MakeLink(XTensor * h, TensorList * list, int id);
/* add a parameter */
static
......@@ -174,6 +174,10 @@ struct XLink
static
void Replace(const XTensor * oldOne, XTensor * newOne);
/* copy a node with another, i.e., we add the links to the new node */
static
void Copy(const XTensor * reference, XTensor * target);
/* copy links of a given node */
static
void CopyIncoming(const XTensor * reference, XTensor * target);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -15,43 +15,34 @@
* limitations under the License.
*/
/*
/*
*
* Implementation of list that keeps data items
* Implementation of template list that keeps data items
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-04-17
* $Created by: HU Chi (huchinlp@foxmail.com)
*
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "XList.h"
#include "XMem.h"
#include "XGlobal.h"
#include "wchar.h"
#include "locale.h"
#if !defined( WIN32 ) && !defined( _WIN32 )
#include "sys/time.h"
#include "time.h"
#include "iconv.h"
#else
#include "time.h"
#endif
#include <ctime>
#include <utility>
#include <algorithm>
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
XList NULLList;
/* the nts (NiuTrans.Tensor) namespace */
namespace nts {
/* constructor */
XList::XList()
template <typename T>
TensorListBase<T>::TensorListBase()
{
mem = NULL;
maxNum = 8;
maxNum = 0;
count = 0;
items = new void*[8];
isIntList = false;
items = NULL;
}
/*
......@@ -59,13 +50,13 @@ constructor
>> myMaxNum - maximum number of items to keep
>> isIntListOrNot - specify if the list keeps int items
*/
XList::XList(int myMaxNum, bool isIntListOrNot)
template <typename T>
TensorListBase<T>::TensorListBase(int myMaxNum)
{
mem = NULL;
maxNum = myMaxNum;
count = 0;
items = new void*[myMaxNum];
isIntList = isIntListOrNot;
items = new T[myMaxNum];
}
/*
......@@ -74,63 +65,64 @@ constructor
>> myMem - the memory pool used for data allocation
>> isIntListOrNot - specify if the list keeps int items
*/
XList::XList(int myMaxNum, XMem * myMem, bool isIntListOrNot)
template <typename T>
TensorListBase<T>::TensorListBase(int myMaxNum, XMem* myMem)
{
mem = myMem;
maxNum = myMaxNum;
count = 0;
items = (void**)mem->Alloc(mem->devID, sizeof(void*) * maxNum);
isIntList = isIntListOrNot;
items = (T*)mem->Alloc(mem->devID, sizeof(T) * maxNum);
}
/* de-constructor */
XList::~XList()
template <typename T>
TensorListBase<T>::~TensorListBase()
{
if(isIntList){
for(int i = 0; i < count; i++){
int * p = (int*)items[i];
delete[] p;
}
}
if(mem == NULL)
delete[] items;
}
/*
allocate the data array for the list
>> myMaxNum - maximum number of items to keep
>> isIntListOrNot - specify if the list keeps int items
add an item into the list
>> item - a right value
*/
void XList::Create(int myMaxNum, XMem * myMem)
template <typename T>
void TensorListBase<T>::Add(T&& item)
{
mem = myMem;
maxNum = myMaxNum;
count = 0;
items = (void**)mem->Alloc(mem->devID, sizeof(void*) * maxNum);
if (count == maxNum) {
T* newItems;
if (mem == NULL)
newItems = new T[maxNum * 2 + 1];
else
newItems = (T*)mem->Alloc(mem->devID, sizeof(T) * (maxNum * 2 + 1));
memcpy(newItems, items, sizeof(T) * maxNum);
items = newItems;
maxNum = maxNum * 2 + 1;
}
items[count++] = item;
}
/*
add an item into the list
>> item - pointer to the item
>> item - a const reference to the item
*/
void XList::Add(const void * item)
template <typename T>
void TensorListBase<T>::Add(const T& item)
{
if( count == maxNum ){
void ** newItems;
if( mem == NULL )
newItems = new void*[maxNum * 2 + 1];
if (count == maxNum) {
T* newItems;
if (mem == NULL)
newItems = new T[maxNum * 2 + 1];
else
newItems = (void**)mem->Alloc(mem->devID, sizeof(void*) * (maxNum * 2 + 1));
memcpy(newItems, items, sizeof(void*) * maxNum);
if( mem == NULL )
delete[] items;
newItems = (T*)mem->Alloc(mem->devID, sizeof(T) * (maxNum * 2 + 1));
memcpy(newItems, items, sizeof(T) * maxNum);
items = newItems;
maxNum = maxNum * 2 + 1;
}
MTYPE p = (MTYPE)item;
items[count++] = (MTYPE*)p;
items[count++] = item;
}
/*
......@@ -138,22 +130,21 @@ add a number of items into the list
>> inputItems - pointer to the array of items
>> inputItemCount - number of input items
*/
void XList::Add(void ** inputItems, int inputItemCount)
template <typename T>
void TensorListBase<T>::Add(T* inputItems, int inputItemCount)
{
if( count + inputItemCount >= maxNum ){
if (count + inputItemCount >= maxNum) {
int newMaxNum = (count + inputItemCount) * 2 + 1;
void ** newItems;
if( mem == NULL )
newItems = new void*[newMaxNum];
T* newItems;
if (mem == NULL)
newItems = new T[newMaxNum];
else
newItems = (void**)mem->Alloc(mem->devID, sizeof(void*) * newMaxNum);
memcpy(newItems, items, sizeof(void*) * maxNum);
if( mem == NULL )
delete[] items;
newItems = (T*)mem->Alloc(mem->devID, sizeof(T) * newMaxNum);
memcpy(newItems, items, sizeof(T) * maxNum);
items = newItems;
maxNum = newMaxNum;
}
memcpy(items + count, inputItems, sizeof(void*) * inputItemCount);
memcpy(items + count, inputItems, sizeof(T) * inputItemCount);
count += inputItemCount;
}
......@@ -161,83 +152,82 @@ void XList::Add(void ** inputItems, int inputItemCount)
append a list to the current list
>> l - the list we use to append
*/
void XList::AddList(XList * l)
template <typename T>
void TensorListBase<T>::AddList(TensorListBase* l)
{
Add(l->items, l->count);
}
/*
add an integer-typed item into the list
>> item - pointer to the item
insert an item to the given position of the list
>> pos - the position
>> item - the item for insertion
*/
void XList::AddInt(int i)
template <typename T>
void TensorListBase<T>::Insert(int pos, const T& item)
{
CheckNTErrors(isIntList, "An int list is required!");
if (count == maxNum) {
T* newItems;
if (mem == NULL)
newItems = new T[maxNum * 2 + 1];
else
newItems = (T*)mem->Alloc(mem->devID, sizeof(T) * (maxNum * 2 + 1));
memcpy(newItems, items, sizeof(T) * maxNum);
items = newItems;
maxNum = maxNum * 2 + 1;
}
int * a = new int[1];
*a = i;
Add(a);
for (int i = count - 1; i >= pos; i--)
items[i + 1] = items[i];
items[pos] = item;
count++;
}
/*
insert an item to the given position of the list
>> pos - the position
>> item - the item for insertion
*/
void XList::Insert(int pos, void * item)
template<typename T>
void TensorListBase<T>::Insert(int pos, T&& item)
{
if( count == maxNum ){
void ** newItems;
if( mem == NULL )
newItems = new void*[maxNum * 2 + 1];
if (count == maxNum) {
T* newItems;
if (mem == NULL)
newItems = new T[maxNum * 2 + 1];
else
newItems = (void**)mem->Alloc(mem->devID, sizeof(void*) * (maxNum * 2 + 1));
memcpy(newItems, items, sizeof(void*) * maxNum);
if( mem == NULL )
delete[] items;
newItems = (T*)mem->Alloc(mem->devID, sizeof(T) * (maxNum * 2 + 1));
memcpy(newItems, items, sizeof(T) * maxNum);
items = newItems;
maxNum = maxNum * 2 + 1;
}
for(int i = count - 1; i >= pos; i--)
for (int i = count - 1; i >= pos; i--)
items[i + 1] = items[i];
items[pos] = item;
count++;
}
/* get the item at position i */
void * XList::GetItem(int i) const
template <typename T>
T& TensorListBase<T>::GetItem(int i) const
{
CheckNTErrors(i >= -1 && i < count, "Index of a list item is out of scope!");
CheckNTErrors(count > 0, "Cannt index the item in an empty list!");
if(i == -1)
if (i == -1)
return items[count - 1];
else
return items[i];
}
/* get the integer-typed item at position i */
int XList::GetItemInt(int i)
{
CheckNTErrors(isIntList, "An int list is required!");
CheckNTErrors(i >= 0 && i < count, "Index of a list item is out of scope!");
return *(int*)(items[i]);
}
/* set the item at position i */
void XList::SetItem(int i, void * item)
template <typename T>
inline void TensorListBase<T>::SetItem(int i, const T& item)
{
if( i >= 0 && i < count )
if (i >= 0 && i < count)
items[i] = item;
}
/* set the item at position i */
void XList::SetItemInt(int i, int item)
template<typename T>
inline void TensorListBase<T>::SetItem(int i, T&& item)
{
CheckNTErrors(isIntList, "An int list is required!");
if( i >= 0 && i < count )
*(int*)(items[i]) = item;
if (i >= 0 && i < count)
items[i] = std::move(item);
}
/*
......@@ -246,37 +236,28 @@ find the position of the first matched item
<< the position where we hit the item (if any)
*/
int XList::FindFirst(void * item)
template <typename T>
inline int TensorListBase<T>::FindFirst(const T& item)
{
for(int i = 0;i < count; i++){
if(item == items[i])
for (int i = 0; i < count; i++) {
if (item == items[i])
return i;
}
return -1;
}
/* clear the data array */
void XList::Clear()
template <typename T>
void TensorListBase<T>::Clear()
{
if(isIntList){
for(int i = 0; i < count; i++){
delete[] (int*)items[i];
}
count = 0;
}
else
count = 0;
}
/* delete the data array as well as the string arrays kept in it */
void XList::ClearStringList()
{
if(mem == NULL){
for(int i = 0; i < count; i++){
delete[] (char*)items[i];
}
}
count = 0;
/*
compare function for two elements
*/
int Compare(const void* a, const void* b) {
return (*(int*)(a)-*(int*)(b));
}
/*
......@@ -284,29 +265,32 @@ sort the list
>> itemSize - size of an item
>> comp - the comparison function used in sorting
*/
void XList::Sort(int itemSize, ListCompare comp)
template <typename T>
void TensorListBase<T>::Sort(int itemSize)
{
qsort(items, count, itemSize, comp);
qsort((void*)items, count, itemSize, Compare);
}
/* reverse the list */
void XList::Reverse()
template <typename T>
inline void TensorListBase<T>::Reverse()
{
int half = count/2;
for(int i = 0; i < half; i++){
void * tmp = items[i];
int half = count / 2;
for (int i = 0; i < half; i++) {
T tmp(items[i]);
items[i] = items[count - i - 1];
items[count - i - 1] = tmp;
}
}
/* remove the item at position i */
void XList::Remove(int i)
template <typename T>
void TensorListBase<T>::Remove(int i)
{
if(i >= count || i < 0)
if (i >= count || i < 0)
return;
memcpy(items + i, items + i + 1, sizeof(void*) * (count - i - 1));
memcpy(items + i, items + i + 1, sizeof(T*) * (count - i - 1));
count--;
}
......@@ -316,10 +300,11 @@ copy the list
>> myMem - memory pool used for allocating the data in the new list
<< hard copy of the list
*/
XList * XList::Copy(XMem * myMem)
template <typename T>
TensorListBase<T>* TensorListBase<T>::Copy(XMem* myMem)
{
XList * newList = new XList(maxNum, myMem);
for(int i = 0; i < count; i++){
TensorListBase<T>* newList = new TensorListBase<T>(maxNum, myMem);
for (int i = 0; i < count; i++) {
newList->Add(GetItem(i));
}
return newList;
......@@ -331,29 +316,39 @@ shuffle the list
>> beg - where we start
>> len - how many items are used in shuffling
*/
void XList::Shuffle(int nround, int beg, int len)
template <typename T>
void TensorListBase<T>::Shuffle(int nround, int beg, int len)
{
if(beg < 0){
if (beg < 0) {
beg = 0;
len = count;
}
if(beg + len > count)
if (beg + len > count)
return;
srand((unsigned int)time(NULL));
for(int k = 0; k < nround; k++){
/* Fisher-Yates shuffle */
for(int i = 0; i < len; i++){
float a = (float)rand()/RAND_MAX;
size_t j = (unsigned int) (a*(i+1));
void* t = items[beg + j];
for (int k = 0; k < nround; k++) {
/* Fisher-Yates shuffle */
for (int i = 0; i < len; i++) {
float a = (float)rand() / RAND_MAX;
size_t j = (unsigned int)(a * (i + 1));
T t = items[beg + j];
items[beg + j] = items[beg + i];
items[beg + i] = t;
}
}
}
}
/* end of the nts (NiuTrans.Tensor) namespace */
/* specializations and typedef of list */
template struct TensorListBase<int>;
template struct TensorListBase<char>;
template struct TensorListBase<char*>;
template struct TensorListBase<long>;
template struct TensorListBase<float>;
template struct TensorListBase<short>;
template struct TensorListBase<XTensor*>;
template struct TensorListBase<void*>;
} /* end of the nts (NiuTrans.Tensor) namespace */
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -15,32 +15,31 @@
* limitations under the License.
*/
/*
/*
*
* Implementation of list that keeps data items
* Implementation of template list that keeps data items
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-04-17
* The first coding job this year!
* $Created by: HU Chi (huchinlp@foxmail.com)
*
*/
#ifndef __XLIST_H__
#define __XLIST_H__
#include "XMem.h"
#include "XGlobal.h"
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
#ifndef __TensorList_H__
#define __TensorList_H__
typedef int (* ListCompare)(const void * item1, const void * item2);
/* the nts (NiuTrans.Tensor) namespace */
namespace nts {
/* the XList class */
class XList
{
/* the TensorListBase class */
template <typename T>
struct TensorListBase {
public:
/* data items */
void ** items;
T *items;
/* number of items */
int count;
......@@ -49,56 +48,88 @@ public:
int maxNum;
/* the memory pool for data array allocation */
XMem * mem;
/* indicates whether data items are integers */
bool isIntList;
XMem* mem;
public:
/* constructor */
XList();
TensorListBase();
/* constructor */
XList(int myMaxNum, bool isIntListOrNot = false);
TensorListBase(int myMaxNum);
/* constructor */
XList(int myMaxNum, XMem * myMem, bool isIntListOrNot = false);
TensorListBase(int myMaxNum, XMem* myMem);
/* de-constructor */
~XList();
/* utilities */
void Create(int myMaxNum, XMem * myMem);
void Add(const void * item);
void Add(void ** inputItems, int inputItemCount);
void AddList(XList * l);
void AddInt(int i);
void Insert(int pos, void * item);
void * GetItem(int i) const;
int GetItemInt(int i);
void SetItem(int i, void * item);
void SetItemInt(int i, int item);
int FindFirst(void * item);
~TensorListBase();
/* add an item into the list */
void Add(T&& item);
/* add an item into the list */
void Add(const T& item);
/* add a number of items into the list */
void Add(T* inputItems, int inputItemCount);
/* append a list to the current list */
void AddList(TensorListBase* l);
/* insert an item to the given position of the list */
void Insert(int pos, const T& item);
/* insert an item to the given position of the list */
void Insert(int pos, T&& item);
/* get the item at position i */
T& GetItem(int i) const;
/* set the item at position i */
void SetItem(int i, const T& item);
/* set the item at position i */
void SetItem(int i, T&& item);
/* find the position of the first matched item */
int FindFirst(const T& item);
/* clear the data array */
void Clear();
void ClearStringList();
void Sort(int itemSize, ListCompare comp);
/* sort the list */
void Sort(int itemSize);
/* reverse the list */
void Reverse();
/* remove the item at position i */
void Remove(int i);
XList * Copy(XMem * myMem);
/* copy the list */
TensorListBase* Copy(XMem* myMem);
/* shuffle the list */
void Shuffle(int nround = 10, int beg = -1, int len = 0);
/* short */
_XINLINE_ void * Get(int i) {return GetItem(i);};
_XINLINE_ int GetInt(int i) {return GetItemInt(i);};
_XINLINE_ void Set(int i, void * item) {SetItem(i, item);};
_XINLINE_ void SetInt(int i, int item) {SetItemInt(i, item);};
T& operator[] (int i) {
return GetItem(i);
};
T& Get(int i) { return GetItem(i); };
void Set(int i, T item) { SetItem(i, item); };
};
extern XList NULLList;
struct XTensor;
typedef TensorListBase<int> IntList;
typedef TensorListBase<char> CharList;
typedef TensorListBase<char*> StrList;
typedef TensorListBase<long> LongList;
typedef TensorListBase<float> FloatList;
typedef TensorListBase<short> ShortList;
typedef TensorListBase<void*> XList;
typedef TensorListBase<XTensor*> TensorList;
}
/* end of the nts (NiuTrans.Tensor) namespace */
} /* end of the nts (NiuTrans.Tensor) namespace */
#endif
#endif // __TensorList_H__
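A minimal usage sketch of the new typed lists (illustrative only; the tensors a and b are assumed to exist elsewhere):
#include "XList.h"
using namespace nts;

void ListDemo(XTensor * a, XTensor * b)
{
    TensorList tensors(2);          /* typed replacement for the untyped XList */
    tensors.Add(a);
    tensors.Add(b);
    XTensor * first = tensors[0];   /* operator[] replaces Get(i) plus casts   */

    IntList ids;                    /* replaces the old isIntList mode         */
    ids.Add(1);
    ids.Add(2);
    int lastId = ids.Get(ids.count - 1);
    (void)first; (void)lastId;
}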
......@@ -34,6 +34,11 @@ namespace nts{
int testxmemid = 0;
void * recordp = NULL;
/*
the global manager of memory pools
*/
XMemManager GMems;
XMem * GMem;
/* constructor */
......@@ -1488,4 +1493,158 @@ cublasHandle_t * XMem::GetCublasHandle()
#endif
/* constructor */
XMemManager::XMemManager()
{
Initialize();
}
/* de-constructor */
XMemManager::~XMemManager()
{
}
/* get memory size */
MTYPE XMemManager::GetAvailableMemory()
{
unsigned long freeMem = 0;
#ifndef WIN32
long pages = sysconf(_SC_AVPHYS_PAGES);
long page_size = sysconf(_SC_PAGE_SIZE);
freeMem = pages * page_size;
#else
MEMORYSTATUSEX memoryStatus;
memoryStatus.dwLength = sizeof(memoryStatus);
if (GlobalMemoryStatusEx(&memoryStatus)){
freeMem = memoryStatus.ullAvailPhys;
}
#endif
return (MTYPE)freeMem;
}
/* get GPU memory size */
MTYPE XMemManager::GetAvailableGPUMemory(int devID)
{
size_t freeMem = 0;
size_t totalMem = 0;
#ifdef USE_CUDA
cudaSetDevice(devID);
if (cudaMemGetInfo(&freeMem, &totalMem) != cudaSuccess){
XPRINT(0, stderr, "cannot get GPU memory information.");
exit(1);
}
#endif
return (MTYPE)freeMem;
}
/* get buffer size */
void XMemManager::GetBufferSize(MTYPE freeMem, MTYPE * myBufSize)
{
*myBufSize = 0;
if (freeMem >= MILLION * 128){
*myBufSize = MILLION * 32;
if (freeMem >= MILLION * 256){
*myBufSize = MILLION * 64;
if (freeMem >= MILLION * 512){
*myBufSize = MILLION * 128;
if (freeMem >= MILLION * 1024) {
*myBufSize = MILLION * 256;
if (freeMem >= MILLION * 2048)
*myBufSize = MILLION * 512;
}
}
}
}
}
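/* an illustrative sketch (not part of the library): a free-memory value of
600 * MILLION falls in the [512 * MILLION, 1024 * MILLION) band, so the
tiering above settles on a 128 * MILLION buffer; the helper name is made up */
void ShowBufferTierExample()
{
    MTYPE freeMemExample = (MTYPE)MILLION * 600;
    MTYPE bufSizeExample = 0;
    GMems.GetBufferSize(freeMemExample, &bufSizeExample);   /* bufSizeExample == MILLION * 128 */
}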
/* initialize it and set the global memory information */
void XMemManager::Initialize()
{
srand((unsigned int)time(NULL));
Free();
/* CPUs (we do not care how many CPUs are actually in use here) */
nCPUMem = 1;
MTYPE freeMem = GetAvailableMemory();
MTYPE myBufSize = 0;
GetBufferSize(freeMem, &myBufSize);
CPUMems[0].Initialize(-1, UNI_FREE, MIN_BLOCK_SIZE_FOR_MEMPOOL, MIN_BLOCK_NUM_FOR_MEMPOOL, myBufSize);
/* GPUs */
nGPUMem = 0;
#ifdef USE_CUDA
if (cudaGetDeviceCount(&nGPUMem) != cudaSuccess) {
XPRINT(0, stderr, "cannot get GPU information.");
exit(1);
}
for (int i = 0; i < nGPUMem; i++) {
MTYPE freeMem = GetAvailableGPUMemory(i);
MTYPE myBufSize = 0;
GetBufferSize(freeMem, &myBufSize);
GPUMems[i].Initialize(i, UNI_FREE, MIN_BLOCK_SIZE_FOR_MEMPOOL, MIN_BLOCK_NUM_FOR_MEMPOOL, myBufSize);
}
#endif
}
/* free it */
void XMemManager::Free()
{
for (int i = 0; i < MAX_CPU_NUM; i++)
CPUMems[i].Free();
for (int i = 0; i < MAX_GPU_NUM; i++)
GPUMems[i].Free();
}
/* get global memory pool */
XMem * XMemManager::GetMem(const int devID)
{
XMem * mem = NULL;
if (devID < 0)
mem = CPUMems;
else{
if (devID < nGPUMem)
mem = GPUMems + devID;
else
XPRINT1(0, stderr, "Cannot get the memory (%d). Please check your device id!", devID);
}
return mem;
}
/* get global memory size */
int XMemManager::GetMemSize(const int devID, MTYPE * myBlockSize, int * myBlockNum, MTYPE * myBufSize)
{
XMem * mem = GetMem(devID);
int result = 0;
if (mem != NULL){
*myBlockSize = mem->maxBlockSize;
*myBlockNum = mem->blockNum;
*myBufSize = mem->bufSize;
result = 1;
}
return result;
}
/* show memory information */
void XMemManager::ShowMemInfo()
{
XPRINT(1, stderr, "Memory Information:\n");
MTYPE myBlockSize, myBufSize;
int myBlockNum;
for(int i = 0; i < nCPUMem; i++){
GetMemSize(-1, &myBlockSize, &myBlockNum, &myBufSize);
XPRINT3(1, stderr, " - id:-1 CPU, blockSize:%d, blockNum:%d, bufSize:%d\n", myBlockSize, myBlockNum, myBufSize);
}
for(int i = 0; i < nGPUMem; i++){
GetMemSize(i, &myBlockSize, &myBlockNum, &myBufSize);
XPRINT4(1, stderr, " - id:%2d GPU, blockSize:%d, blockNum:%d, bufSize:%d\n", i, myBlockSize, myBlockNum, myBufSize);
}
}
} /* end of the nts (NiuTrans.Tensor) namespace */
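/* a minimal usage sketch (illustrative only, not part of the library): the global
manager GMems hands out one memory pool per device; devID < 0 selects the CPU pool,
0 .. nGPUMem-1 a GPU pool; the helper name below is made up for the example */
void ShowHowToFetchAPool()
{
    nts::XMem * pool = nts::GMems.GetMem(-1);
    if (pool != NULL) {
        /* the pool can be passed to tensor constructors that accept an XMem* */
    }
}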
......@@ -39,6 +39,12 @@
#include <curand.h>
#endif
#ifndef WIN32
#include <unistd.h>
#else
#include <windows.h>
#endif
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
......@@ -53,6 +59,8 @@ typedef long long INT_64;
#define BUF_PITCH 256
#define MIN_BLOCK_SIZE_FOR_MEMPOOL 128 * 1024 * 1024
#define MIN_BLOCK_NUM_FOR_MEMPOOL 1024
#define MAX_CPU_NUM 16
#define MAX_GPU_NUM 16
/*
mode of running a memory pool
......@@ -413,6 +421,61 @@ public:
};
/*
a class for the management of memory
*/
class XMemManager
{
public:
/* cpu memory pool information */
XMem CPUMems[MAX_CPU_NUM];
/* number of cpu memory pools */
int nCPUMem;
/* gpu memory pool information */
XMem GPUMems[MAX_GPU_NUM];
/* number of gpu memory pools */
int nGPUMem;
public:
/* constructor */
XMemManager();
/* de-constructor */
~XMemManager();
/* get memory size */
MTYPE GetAvailableMemory();
/* get GPU memory size */
MTYPE GetAvailableGPUMemory(int devID);
/* get buffer size */
void GetBufferSize(MTYPE freeMem, MTYPE * myBufSize);
/* initialize it and set the global memory information */
void Initialize();
/* free it */
void Free();
/* get global memory pool */
XMem * GetMem(const int devID);
/* get global memory size */
int GetMemSize(const int devID, MTYPE * myBlockSize, int * myBlockNum, MTYPE * myBufSize);
/* show memory information */
void ShowMemInfo();
};
/* managing the memories */
extern XMemManager GMems;
extern XMem * GMem;
extern int testxmemid;
......
......@@ -77,6 +77,12 @@ const char * GetOPName(int type)
return "M_POWER";
else if (type == MATH_SCALEANDSHIFT)
return "M_SCALEANDSHIFT";
else if (type == MATH_SCALE)
return "M_SCALE";
else if (type == MATH_DESCALE)
return "M_DESCALE";
else if (type == MATH_SHIFT)
return "M_SHIFT";
else if (type == MATH_MULANDSHIFT)
return "M_OPERATION";
else if (type == MATH_SIGN)
......@@ -111,6 +117,8 @@ const char * GetOPName(int type)
return "M_COPYVALUES";
else if (type == MOVEMENT_GATHER)
return "M_GATHER";
else if (type == MOVEMENT_DROPOUTWITHINDEX)
return "M_DROPOUTWITHINDEX";
else if (type == SHAPE_CONCATENATE)
return "S_CONCATENATE";
else if (type == SHAPE_MERGE)
......@@ -152,6 +160,10 @@ const char * GetOPName(int type)
else if (type == FUNC_SOFTMAX)
return "F_SOFTMAX";
}
else if ((type & LOSS_BASE) != 0) {
if (type == LOSS_CROSSENTROPY)
return "L_CROSSENTROPY";
}
return "NULL";
}
......
......@@ -58,7 +58,11 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_POWER MATH_NORMALIZE + 1
#define MATH_SCALEANDSHIFT MATH_POWER + 1
#define MATH_MULANDSHIFT MATH_SCALEANDSHIFT + 1
#define MATH_SIGN MATH_MULANDSHIFT + 1
#define MATH_SCALE MATH_MULANDSHIFT + 1
#define MATH_DESCALE MATH_SCALE + 1
#define MATH_SHIFT MATH_DESCALE + 1
#define MATH_MOD MATH_SHIFT + 1
#define MATH_SIGN MATH_MOD + 1
#define MATH_SUB MATH_SIGN + 1
#define MATH_SUBDIM MATH_SUB + 1
#define MATH_SUM MATH_SUBDIM + 1
......@@ -81,8 +85,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MOVEMENT_COPYINDEXED MOVEMENT + 1
#define MOVEMENT_COPYVALUES MOVEMENT_COPYINDEXED + 1
#define MOVEMENT_GATHER MOVEMENT_COPYVALUES + 1
#define MOVEMENT_DROPOUTWITHINDEX MOVEMENT_GATHER + 1
#define SHAPE MOVEMENT_GATHER + 1
#define SHAPE MOVEMENT_DROPOUTWITHINDEX + 1
#define SHAPE_CONCATENATE SHAPE + 1
#define SHAPE_MERGE SHAPE_CONCATENATE + 1
#define SHAPE_MERGE_LIST SHAPE_MERGE + 1
......@@ -108,6 +113,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define FUNC_SIGMOID FUNC_RECTIFY + 1
#define FUNC_SOFTMAX FUNC_SIGMOID + 1
#define LOSS_BASE FUNCTION_BASE * 2
#define LOSS_CROSSENTROPY LOSS_BASE + 1
/* get operator name */
const char * GetOPName(int type);
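/* a quick sketch (illustrative only): the new ids resolve to readable names,
which helps when tracing the computation graph; the helper name is made up */
void ShowNewOPNames()
{
    const char * scaleName = GetOPName(MATH_SCALE);        /* "M_SCALE" */
    const char * lossName  = GetOPName(LOSS_CROSSENTROPY); /* "L_CROSSENTROPY" */
}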
......
......@@ -146,7 +146,7 @@ run a set of jobs in parallel
>> jobArgs - the list of arguments for each job
>> sleepTime - time to sleep (in ms) for each round
*/
void XPRunner::Run(XList * jobFunctions, XList * jobArgs, float sleepTime)
void XPRunner::Run(TensorList * jobFunctions, TensorList * jobArgs, float sleepTime)
{
if(threadNum <= 0){
XPRINT(1, stderr, "Error! No threads were created!\n");
......@@ -195,7 +195,7 @@ void XPRunner::Run(XList * jobFunctions, XList * jobArgs, float sleepTime)
TFunction function = (TFunction)jobFunctions->GetItem(jobArgs->count - c);
/* the arguments that are passed to the function */
volatile XList * args = (XList*)jobArgs->GetItem(jobArgs->count - c);
volatile TensorList * args = (TensorList*)jobArgs->GetItem(jobArgs->count - c);
/* thread */
XThread * thread = threads + availableThreads[i];
......
......@@ -106,7 +106,7 @@ public:
void KillThreads();
/* run a set of jobs in parallel */
void Run(XList * jobFunctions, XList * jobArgs, float sleepTime = 0);
void Run(TensorList * jobFunctions, TensorList * jobArgs, float sleepTime = 0);
/* get the number of parallel jobs to run */
int GetJobNum(int size);
......
......@@ -42,7 +42,7 @@ job item used in queues
JobQueueNode::JobQueueNode()
{
job = NULL;
args = new XList(1);
args = new TensorList(1);
}
/* de-constructor */
......@@ -67,7 +67,7 @@ XQueue::XQueue(int mySize)
head = 0;
tail = 0;
isJobQueue = false;
jobDequeuerArgs = new XList(1);
jobDequeuerArgs = new TensorList(1);
jobDequeuerBreak = false;
runningJobCount = 0;
jobStream = NULL;
......@@ -188,8 +188,10 @@ void XQueue::RunJobConsumer(int jobDevID)
isJobQueue = true;
jobDequeuerArgs->Clear();
jobDequeuerArgs->Add(this);
jobDequeuerArgs->Add(jobDevID >= 0 ? devids + jobDevID : &cpuid);
// warning: these casts force non-tensor pointers into a TensorList and may lead to undefined behavior
jobDequeuerArgs->Add((XTensor*)this);
jobDequeuerArgs->Add(jobDevID >= 0 ? (XTensor*)(devids + jobDevID) : (XTensor*)&cpuid);
jobDequeuer.function = (TFunction)DequeueJobs;
jobDequeuer.argv = jobDequeuerArgs;
......@@ -211,7 +213,7 @@ void XQueue::StopJobConsumer()
}
/* add a job item to process */
void XQueue::EnqueueJob(void * job, XList * jobArgs)
void XQueue::EnqueueJob(void * job, TensorList * jobArgs)
{
MUTEX_LOCK(jobQueueMutex);
runningJobCount++;
......@@ -225,7 +227,7 @@ void XQueue::EnqueueJob(void * job, XList * jobArgs)
}
/* job item consumer */
void XQueue::DequeueJobs(XList * args)
void XQueue::DequeueJobs(TensorList * args)
{
CheckNTErrors((args->count == 2), "Illegal arguments!");
......
......@@ -52,7 +52,7 @@ public:
void * job;
/* arguments of the job */
XList * args;
TensorList * args;
public:
/* constructor */
......@@ -102,7 +102,7 @@ private:
XThread jobDequeuer;
/* argument list of jobDequeuer */
XList * jobDequeuerArgs;
TensorList * jobDequeuerArgs;
/* indicates whether jobDequeuer stops */
bool jobDequeuerBreak;
......@@ -141,11 +141,11 @@ public:
void StopJobConsumer();
/* add a job item to process */
void EnqueueJob(void * job, XList * jobArgs);
void EnqueueJob(void * job, TensorList * jobArgs);
/* job item consumer */
static
void DequeueJobs(XList * args);
void DequeueJobs(TensorList * args);
/* get the break flag */
bool GetJobBreak();
......
......@@ -564,6 +564,37 @@ bool XTensor::IsSameShaped(const XTensor * a, const XTensor * b, const XTensor *
return IsSameShaped(a, b) && IsSameShaped(a, c);
}
bool XTensor::IsReduceShaped(const XTensor * a, const XTensor * b, int dim)
{
if (a == NULL || b == NULL)
return false;
if ((a->order - 1) != b->order)
return false;
for (int i = 0; i < b->order; i++) {
if (i < dim) {
if (a->dimSize[i] != b->dimSize[i])
return false;
}
else if (i >= dim) {
if (a->dimSize[i+1] != b->dimSize[i])
return false;
}
}
if(a->dataType != b->dataType)
return false;
if(a->denseRatio != b->denseRatio)
return false;
if(a->isSparse != b->isSparse)
return false;
return true;
}
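/* example (sketch): if a has shape (4, 5, 6) and b has shape (4, 6),
then IsReduceShaped(a, b, 1) holds, i.e., b is a with dimension 1 reduced */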
/*
set the size of each dimension
>> myDimSize - size of each dimension
......@@ -644,21 +675,21 @@ reshape the tensor by merging two consecutive dimensions
*/
void XTensor::ReshapeMerged(const int i, const int j)
{
if(i < 0)
if (i < 0)
return;
int di = i;
int dj = j < 0 ? i + 1: j;
int dj = j < 0 ? i + 1 : j;
CheckNTErrors(di < order, "Wrong dimension index!");
int dims[MAX_TENSOR_DIM_NUM];
for(int k = 0; k < di; k++)
for (int k = 0; k < di; k++)
dims[k] = dimSize[k];
dims[di] = dimSize[di] * dimSize[dj];
for(int k = dj + 1; k < order; k++)
for (int k = dj + 1; k < order; k++)
dims[k - 1] = dimSize[k];
Reshape(order - 1, dims);
......@@ -1983,7 +2014,7 @@ void XTensor::FlushToMem(XMem * targetMem)
if (targetMem->devID >= 0) {
#ifdef USE_CUDA
if (devID < 0) {
XList l(1);
TensorList l(1);
l.Add(this);
CudaCPUToGPUFlush(&l, targetMem->devID, targetMem);
}
......@@ -2123,6 +2154,48 @@ void InitTensor(XTensor * tensor,
}
/*
initialize a dense tensor V2
>> tensor - the tensor we intend to initialize
>> myOrder - order of the tensor
>> myDimSize - the size of each dimension
>> myDataType - unit size (e.g., int, float, and double)
>> myDevID - the device on which we allocate the data
*/
void InitTensorV2(XTensor * tensor,
const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType,
const int myDevID)
{
if(tensor->mem != NULL){
tensor->Resize(myOrder, myDimSize, myDataType, 1.0F);
}
else{
int dims[MAX_TENSOR_DIM_NUM];
memcpy(dims, myDimSize, sizeof(int) * myOrder);
bool allocated = true;
for (int i = 0; i < myOrder; i++) {
if (dims[i] < 0)
allocated = false;
}
/* keep the first dimension negative so that no data is allocated during Resize();
AllocateData() below does the allocation when all input dimensions are positive */
dims[0] = -abs(dims[0]);
if (myDevID == CURRENT_GPU)
tensor->devID = XDevice::GetGPUDevice();
else
tensor->devID = myDevID;
tensor->Resize(myOrder, dims, myDataType, 1.0F);
if(allocated)
XTensor::AllocateData(tensor);
}
}
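/* usage sketch (illustrative, not part of the library): the V2 initializer drops
the XMem* argument, so the device id alone decides where the data lives */
void InitExampleV2()
{
    XTensor t;
    int dims[2] = {3, 4};
    InitTensorV2(&t, 2, dims, X_FLOAT, -1);   /* a 3 x 4 float tensor on the CPU */
}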
/*
initialize a dense tensor
>> tensor - the tensor we intend to initialize
>> num - number of elements
......@@ -2144,6 +2217,24 @@ void InitTensor1D(XTensor * tensor, const int num,
}
/*
initialize a dense tensor V2
>> tensor - the tensor we intend to initialize
>> num - number of elements
>> myDataType - unit size (e.g., int, float, and double)
>> myDevID - the device on which we allocate the data
*/
void InitTensor1DV2(XTensor * tensor, const int num,
const TENSOR_DATA_TYPE myDataType, const int myDevID)
{
int dims[1];
dims[0] = num;
InitTensorV2(tensor, 1, dims, myDataType, myDevID);
}
/*
initialize a dense matrix
>> tensor - the tensor we intend to initialize
>> rowNum - number of rows
......@@ -2167,6 +2258,26 @@ void InitTensor2D(XTensor * tensor, const int rowNum, const int colNum,
}
/*
initialize a dense matrix V2
>> tensor - the tensor we intend to initialize
>> rowNum - number of rows
>> colNum - number of columns
>> myDataType - unit size (e.g., int, float, and double)
>> myDevID - the device on which we allocate the data
*/
void InitTensor2DV2(XTensor * tensor, const int rowNum, const int colNum,
const TENSOR_DATA_TYPE myDataType, const int myDevID)
{
int dims[2];
dims[0] = rowNum;
dims[1] = colNum;
InitTensorV2(tensor, 2, dims, myDataType, myDevID);
}
/*
initialize a dense 3d tensor
>> tensor - the tensor we intend to initialize
>> d0 - size of dimension 0
......@@ -2192,6 +2303,28 @@ void InitTensor3D(XTensor * tensor, const int d0, const int d1, const int d2,
}
/*
initialize a dense 3d tensor V2
>> tensor - the tensor we intend to initialize
>> d0 - size of dimension 0
>> d1 - size of dimension 1
>> d2 - size of dimension 2
>> myDataType - unit size (e.g., int, float, and double)
>> myDevID - the device on which we allocate the data
*/
void InitTensor3DV2(XTensor * tensor, const int d0, const int d1, const int d2,
const TENSOR_DATA_TYPE myDataType, const int myDevID)
{
int dims[3];
dims[0] = d0;
dims[1] = d1;
dims[2] = d2;
InitTensorV2(tensor, 3, dims, myDataType, myDevID);
}
/*
initialize a dense 4d tensor
>> tensor - the tensor we intend to initialize
>> d0 - size of dimension 0
......@@ -2219,6 +2352,30 @@ void InitTensor4D(XTensor * tensor, const int d0, const int d1, const int d2, co
}
/*
initialize a dense 4d tensor V2
>> tensor - the tensor we intend to initialize
>> d0 - size of dimension 0
>> d1 - size of dimension 1
>> d2 - size of dimension 2
>> d3 - size of dimension 3
>> myDataType - unit size (e.g., int, float, and double)
>> myDevID - the device on which we allocate the data
*/
void InitTensor4DV2(XTensor * tensor, const int d0, const int d1, const int d2, const int d3,
const TENSOR_DATA_TYPE myDataType, const int myDevID)
{
int dims[4];
dims[0] = d0;
dims[1] = d1;
dims[2] = d2;
dims[3] = d3;
InitTensorV2(tensor, 4, dims, myDataType, myDevID);
}
/*
initialize a dense 5d tensor
>> tensor - the tensor we intend to initialize
>> d0 - size of dimension 0
......@@ -2248,6 +2405,32 @@ void InitTensor5D(XTensor * tensor, const int d0, const int d1, const int d2, co
}
/*
initialize a dense 5d tensor V2
>> tensor - the tensor we intend to initialize
>> d0 - size of dimension 0
>> d1 - size of dimension 1
>> d2 - size of dimension 2
>> d3 - size of dimension 3
>> d4 - size of dimension 4
>> myDataType - unit size (e.g., int, float, and double)
>> myDevID - the device on which we allocate the data
*/
void InitTensor5DV2(XTensor * tensor, const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType, const int myDevID)
{
int dims[5];
dims[0] = d0;
dims[1] = d1;
dims[2] = d2;
dims[3] = d3;
dims[4] = d4;
InitTensorV2(tensor, 5, dims, myDataType, myDevID);
}
/*
initialize a tensor with a reference tensor
>> tensor - the tensor we intend to initialize
>> reference - the reference tensor
......@@ -2263,6 +2446,20 @@ void InitTensor(XTensor * tensor, const XTensor * reference)
}
/*
initialize a tensor with a reference tensor V2
>> tensor - the tensor we intend to initialize
>> reference - the reference tensor
*/
void InitTensorV2(XTensor * tensor, const XTensor * reference)
{
if(reference->order < 0)
return;
InitTensorV2(tensor, reference->order, reference->dimSize,
reference->dataType, reference->devID);
}
/*
initialize a tensor on the CPU with a reference tensor
>> tensor - the tensor we intend to initialize
>> reference - the reference tensor
......@@ -2310,6 +2507,23 @@ XTensor * NewTensor(const int myOrder, const int * myDimSize, const TENSOR_DATA_
}
/*
generate a dense XTensor V2
>> myOrder - order of the tensor
>> myDimSize - the size of each dimension
>> myDataType - unit size (e.g., int, float, and double)
>> myDevID - the device on which we allocate the data
*/
XTensor * NewTensorV2(const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType,
const int myDevID)
{
XMem * myMem = GMems.GetMem(myDevID);
return new XTensor(myOrder, myDimSize, myDataType, 1.0F, myDevID, myMem);
}
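/* usage sketch (illustrative, not part of the library): NewTensorV2 binds the new
tensor to the pool kept by the global manager for the given device; the caller still
owns the returned object */
void NewExampleV2()
{
    int dims[2] = {2, 5};
    XTensor * m = NewTensorV2(2, dims, X_FLOAT, -1);   /* a 2 x 5 float tensor on the CPU */
    delete m;
}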
/*
generate a XTensor which allocates data on the buffer
>> myOrder - order of the tensor
>> myDimSize - the size of each dimension
......@@ -2344,6 +2558,35 @@ XTensor * NewTensorBuf(const int myOrder, const int * myDimSize,
}
/*
generate a dense XTensor which allocates data on the buffer V2
>> myOrder - order of the tensor
>> myDimSize - the size of each dimension
>> myDataType - unit size (e.g., int, float, and double)
>> devID - device id
*/
XTensor * NewTensorBufV2(const int myOrder, const int * myDimSize,
const TENSOR_DATA_TYPE myDataType, const int devID)
{
int dims[MAX_TENSOR_DIM_NUM];
memcpy(dims, myDimSize, sizeof(int) * myOrder);
dims[0] = -abs(dims[0]);
XTensor * tensor = NewTensor(myOrder, dims, myDataType, 1.0F, devID);
/* dump the tensor when its total size hits this particular value (a debugging check) */
if (tensor->unitNum * tensor->unitSize == 176657664) {
tensor->Dump(stderr, "", 200);
}
XMem * myMem = GMems.GetMem(devID);
tensor->data = myMem->AllocBuf(myMem->devID, tensor->unitNum * tensor->unitSize);
return tensor;
}
/*
generate a XTensor which allocates data on the buffer
>> reference - reference tensor
>> devID - device id
......@@ -2359,6 +2602,17 @@ XTensor * NewTensorBuf(const XTensor * reference, int devID, XMem * myMem)
}
/*
generate a XTensor which allocates data on the buffer V2
>> reference - reference tensor
>> devID - device id
*/
XTensor * NewTensorBufV2(const XTensor * reference, int devID)
{
return NewTensorBufV2(reference->order, reference->dimSize,
reference->dataType, devID);
}
/*
generate a dense vector
>> num - number of entries
>> myDataType - unit size (e.g., int, float, and double)
......@@ -2379,6 +2633,23 @@ XTensor * NewTensor1D(const int num,
}
/*
generate a dense vector V2
>> num - number of entries
>> myDataType - unit size (e.g., int, float, and double)
>> myDevID - the device on which we allocate the data
*/
XTensor * NewTensor1DV2(const int num,
const TENSOR_DATA_TYPE myDataType, const int myDevID)
{
int dims[1];
dims[0] = num;
return NewTensorV2(1, dims, myDataType, myDevID);
}
/*
generate a dense matrix
>> rowNum - number of rows
>> colNum - number of columns
......@@ -2401,6 +2672,25 @@ XTensor * NewTensor2D(const int rowNum, const int colNum,
}
/*
generate a dense matrix V2
>> rowNum - number of rows
>> colNum - number of columns
>> myDataType - unit size (e.g., int, float, and double)
>> myDevID - the device on which we allocate the data
*/
XTensor * NewTensor2DV2(const int rowNum, const int colNum,
const TENSOR_DATA_TYPE myDataType, const int myDevID)
{
int dims[2];
dims[0] = rowNum;
dims[1] = colNum;
return NewTensorV2(2, dims, myDataType, myDevID);
}
/*
generate a dense 3d tensor
>> d0 - size of dimension 0
>> d1 - size of dimension 1
......@@ -2425,6 +2715,27 @@ XTensor * NewTensor3D(const int d0, const int d1, const int d2,
}
/*
generate a dense 3d tensor V2
>> d0 - size of dimension 0
>> d1 - size of dimension 1
>> d2 - size of dimension 2
>> myDataType - unit size (e.g., int, float, and double)
>> myDevID - the device on which we allocate the data
*/
XTensor * NewTensor3DV2(const int d0, const int d1, const int d2,
const TENSOR_DATA_TYPE myDataType, const int myDevID)
{
int dims[3];
dims[0] = d0;
dims[1] = d1;
dims[2] = d2;
return NewTensorV2(3, dims, myDataType, myDevID);
}
/*
generate a dense 4d tensor
>> d0 - size of dimension 0
>> d1 - size of dimension 1
......@@ -2451,6 +2762,29 @@ XTensor * NewTensor4D(const int d0, const int d1, const int d2, const int d3,
}
/*
generate a dense 4d tensor V2
>> d0 - size of dimension 0
>> d1 - size of dimension 1
>> d2 - size of dimension 2
>> d3 - size of dimension 3
>> myDataType - unit size (e.g., int, float, and double)
>> myDevID - the device on which we allocate the data
*/
XTensor * NewTensor4DV2(const int d0, const int d1, const int d2, const int d3,
const TENSOR_DATA_TYPE myDataType, const int myDevID)
{
int dims[4];
dims[0] = d0;
dims[1] = d1;
dims[2] = d2;
dims[3] = d3;
return NewTensorV2(4, dims, myDataType, myDevID);
}
/*
generate a dense 5d tensor
>> d0 - size of dimension 0
>> d1 - size of dimension 1
......@@ -2479,6 +2813,31 @@ XTensor * NewTensor5D(const int d0, const int d1, const int d2, const int d3, co
}
/*
generate a dense 5d tensor V2
>> d0 - size of dimension 0
>> d1 - size of dimension 1
>> d2 - size of dimension 2
>> d3 - size of dimension 3
>> d4 - size of dimension 4
>> myDataType - unit size (e.g., int, float, and double)
>> myDevID - the device on which we allocate the data
*/
XTensor * NewTensor5DV2(const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType, const int myDevID)
{
int dims[5];
dims[0] = d0;
dims[1] = d1;
dims[2] = d2;
dims[3] = d3;
dims[4] = d4;
return NewTensorV2(5, dims, myDataType, myDevID);
}
/*
generate a copy of XTensor
>> a - the tensor we copy from
>> isFilledData - indicates whether we allocate the data for
......
......@@ -255,6 +255,10 @@ public:
static
bool IsSameShaped(const XTensor * a, const XTensor * b, const XTensor * c);
/* judge whether b has the shape of a reduced along the given dimension */
static
bool IsReduceShaped(const XTensor * a, const XTensor * b, int dim);
/* set the size of each dimension */
void SetDim(int * myDimSize);
......@@ -447,29 +451,57 @@ void InitTensor(XTensor * tensor,
const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const float myDenseRatio = 1.0F, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense XTensor V2 */
void InitTensorV2(XTensor * tensor,
const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1);
/* initialize a dense vector */
void InitTensor1D(XTensor * tensor, const int num,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense vector V2 */
void InitTensor1DV2(XTensor * tensor, const int num,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
/* initialize a dense matrix */
void InitTensor2D(XTensor * tensor, const int rowNum, const int colNum,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense matrix V2 */
void InitTensor2DV2(XTensor * tensor, const int rowNum, const int colNum,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
/* initialize a dense 3d tensor */
void InitTensor3D(XTensor * tensor, const int d0, const int d1, const int d2,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense 3d tensor V2 */
void InitTensor3DV2(XTensor * tensor, const int d0, const int d1, const int d2,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
/* initialize a dense 4d tensor */
void InitTensor4D(XTensor * tensor, const int d0, const int d1, const int d2, const int d3,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense 4d tensor V2 */
void InitTensor4DV2(XTensor * tensor, const int d0, const int d1, const int d2, const int d3,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
/* initialize a dense 5d tensor */
void InitTensor5D(XTensor * tensor, const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1, XMem * myMem = NULL);
/* initialize a dense 5d tensor V2 */
void InitTensor5DV2(XTensor * tensor, const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
/* initialize a tensor with a reference tensor */
void InitTensor(XTensor * tensor, const XTensor * reference);
/* initialize a tensor with a reference tensor */
void InitTensorV2(XTensor * tensor, const XTensor * reference);
/* initialize a tensor on the CPU with a reference tensor */
void InitTensorOnCPU(XTensor * tensor, const XTensor * reference);
......@@ -480,38 +512,72 @@ XTensor * NewTensor();
XTensor * NewTensor(const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const float myDenseRatio = 1.0F, const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense XTensor V2 */
XTensor * NewTensorV2(const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1);
/* generate a XTensor which allocates data on the buffer */
XTensor * NewTensorBuf(const int myOrder, const int * myDimSize,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const float myDenseRatio = 1.0F,
const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense XTensor which allocates data on the buffer V2 */
XTensor * NewTensorBufV2(const int myOrder, const int * myDimSize,
const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
/* generate a XTensor which allocates data on the buffer */
XTensor * NewTensorBuf(const XTensor * reference, int devID, XMem * myMem);
/* generate a XTensor which allocates data on the buffer V2 */
XTensor * NewTensorBufV2(const XTensor * reference, int devID);
/* generate a dense vector */
XTensor * NewTensor1D(const int num, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1,
XMem * myMem = NULL);
/* generate a dense vector V2 */
XTensor * NewTensor1DV2(const int num, const TENSOR_DATA_TYPE myDataType = X_FLOAT, const int myDevID = -1);
/* generate a dense matrix */
XTensor * NewTensor2D(const int rowNum, const int colNum,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense matrix V2 */
XTensor * NewTensor2DV2(const int rowNum, const int colNum,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1);
/* generate a dense 3d tensor */
XTensor * NewTensor3D(const int d0, const int d1, const int d2,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense 3d tensor V2 */
XTensor * NewTensor3DV2(const int d0, const int d1, const int d2,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1);
/* generate a dense 4d tensor */
XTensor * NewTensor4D(const int d0, const int d1, const int d2, const int d3,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense 4d tensor V2 */
XTensor * NewTensor4DV2(const int d0, const int d1, const int d2, const int d3,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1);
/* generate a dense 5d tensor */
XTensor * NewTensor5D(const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1, XMem * myMem = NULL);
/* generate a dense 5d tensor V2 */
XTensor * NewTensor5DV2(const int d0, const int d1, const int d2, const int d3, const int d4,
const TENSOR_DATA_TYPE myDataType = X_FLOAT,
const int myDevID = -1);
/* generate a copy of XTensor (with a reference to a given tensor) */
XTensor * NewTensor(const XTensor * a, bool isFilledData = true);
......
......@@ -85,7 +85,7 @@ namespace nts{
#endif
typedef void (*TFunction) (volatile XList*);
typedef void (*TFunction) (volatile TensorList*);
/*
This is a class that wraps the standard implementation of threading
......@@ -133,7 +133,7 @@ public:
/* arguments (for the function to run) */
volatile
XList * argv;
TensorList * argv;
/* a flag to break */
volatile
......
......@@ -97,4 +97,5 @@
#include "utilities/XMatrixSegment.h"
#include "utilities/FlushToMem.h"
#include "../function/DropoutWithIndex.h"
#endif // __CHEADER_H__
......@@ -218,4 +218,55 @@ XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim)
return c;
}
/*
element-wise division of two tensors
c(i) = a(i)/b(i) + \alpha * c(i)
where i is the index of the item
>> a - tensor a
>> b - tensor b
>> c - result tensor
>> alpha - the coefficient
>> leadingDim - the dimension along which we perform broadcasting
>> requireLink - whether to add the operation to the network
*/
void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadingDim, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
int n = GetDivDimIndex(a, b);
if (n == -1) {
CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
/* call _Div function */
_Div(&a, &b, &c, 0, leadingDim);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIV);
XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHeadInt(&c, leadingDim);
}
}
else if (n >= 0 && n < a.order) {
/* call _DivDim function */
_DivDim(&a, &b, &c, n, alpha);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
}
}
else {
ShowNTErrors("Something is wrong!");
}
}
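/* usage sketch (illustrative, not part of the library): the void-return overload
writes into a caller-side tensor and only records the operation in the network
when requireLink is true; the helper name is made up for the example */
void DivExample()
{
    XTensor a, b, c;
    InitTensor2DV2(&a, 2, 3);
    InitTensor2DV2(&b, 2, 3);
    /* ... fill a and b with data ... */
    Div(a, b, c);                   /* c is (re)initialized inside if its shape does not match a */
    Div(a, b, c, 0.0F, 0, true);    /* the same, but the op is linked for back-propagation */
}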
} // namespace nts(NiuTrans.Tensor)
......@@ -49,6 +49,13 @@ where i is the index of the element
*/
XTensor Div(const XTensor &a, const XTensor &b, DTYPE alpha = 0.0, int leadingDim = 0);
/*
element-wise division of two tensors:
c(i) = a(i)/b(i) + \alpha * c(i)
where i is the index of the element
*/
void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha = 0.0, int leadingDim = 0, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor)
#endif // __DIV_H__
\ No newline at end of file
......@@ -171,4 +171,35 @@ XTensor DivDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha)
return c;
}
/*
tensor division
c = a / b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put the result
>> n - the dimension index
>> alpha - the scaling factor
>> requireLink - whether to add the operation to the network
*/
void DivDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE alpha, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
/* call _DivDim function */
_DivDim(&a, &b, &c, n, alpha);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
}
}
}
......@@ -53,6 +53,14 @@ we make a new tensor c to keep the result and return it
*/
XTensor DivDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha = (DTYPE)0.0);
/*
tensor division of two tensors:
c = a / b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
*/
void DivDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE alpha = (DTYPE)0.0, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor)
#endif // __DIVDIM_H__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2019-04-24
* I'll attend several conferences and workshops in the following weeks -
* busy days :(
*/
#include "../../XTensor.h"
#include "../../XName.h"
#include "../../XUtility.h"
#include "Mask.h"
#include "Mask.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
mask entries of a given tensor:
c(i) = a(i) if mask(i) is non-zero
c(i) = alpha if mask(i) = 0
where i is the index of the element
*/
void _Mask(const XTensor * a, const XTensor * mask, XTensor * c, DTYPE alpha)
{
CheckNTErrors(a && mask && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == mask->unitNum && a->unitNum == c->unitNum,
"Unmatched tensors in addition!");
CheckNTErrors(mask->dataType == X_INT, "The mask tensor must be in X_INT!")
//CheckNTErrors(a->dataType == mask->dataType && a->dataType == c->dataType,
// "Unmatched tensors in addition!");
if (a->devID >= 0 || mask->devID >= 0 || c->devID >= 0) {
#ifdef USE_CUDA
if (a == c) {
int P2PAccesible = 0;
#ifdef CUDA_UVA
cudaDeviceCanAccessPeer(&P2PAccesible, a->devID, mask->devID);
#endif
if ((a->devID < 0 && mask->devID >= 0) ||
(a->devID >= 0 && mask->devID < 0) ||
(a->devID >= 0 && mask->devID >= 0 && a->devID != mask->devID && !P2PAccesible))
{
ShowNTErrors("Cannot run this method on multiple devices simultaneously!");
}
else
_CudaMask(a, mask, c, alpha);
}
else
_CudaMask(a, mask, c, alpha);
#endif
}
else {
if (!a->isSparse && !mask->isSparse) {
CheckNTErrors(!c->isSparse, "Illegal use of sparse tensor in addition!");
if (a->dataType == DEFAULT_DTYPE &&
mask->dataType == X_INT &&
c->dataType == DEFAULT_DTYPE)
{
DTYPE * ap = (DTYPE*)a->data;
int * maskp = (int*)mask->data;
DTYPE * cp = (DTYPE*)c->data;
/* unrolling */
int num = a->unitNum;
if (num % 2 == 0) {
for (int i = 0; i < num; i += 2) {
if (maskp[i] == 0) {
cp[i] = alpha;
}
else {
cp[i] = ap[i];
}
if (maskp[i + 1] == 0) {
cp[i + 1] = alpha;
}
else {
cp[i + 1] = ap[i + 1];
}
}
}
else {
for (int i = 0; i < num; i++) {
if (maskp[i] == 0) {
cp[i] = alpha;
}
else {
cp[i] = ap[i];
}
}
}
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
}
/*
mask entries of a given tensor (on site):
a(i) = a(i) if mask(i) is non-zero
a(i) = alpha if mask(i) = 0
where i is the index of the element
*/
void _MaskMe(XTensor * a, const XTensor * mask, DTYPE alpha)
{
_Mask(a, mask, a, alpha);
}
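/* usage sketch (illustrative, not part of the library): keep entries of a where the
integer mask is non-zero and overwrite the rest with alpha (e.g., a large negative
value, as often used for attention masks) */
void MaskExample()
{
    XTensor a, mask, c;
    InitTensor2DV2(&a, 2, 3, X_FLOAT, -1);
    InitTensor2DV2(&mask, 2, 3, X_INT, -1);
    InitTensor2DV2(&c, 2, 3, X_FLOAT, -1);
    /* ... fill a and mask ... */
    _Mask(&a, &mask, &c, (DTYPE)-1e9F);
}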
/*
mask entries of a given tensor (return an XTensor structure):
a(i) = a(i) if mask(i) is non-zero
a(i) = alpha if mask(i) = 0
where i is the index of the element
*/
XTensor Mask(const XTensor &a, const XTensor &mask, DTYPE alpha)
{
XTensor c(&a);
c.SetTMPFlag();
/* call _Mask function */
_Mask(&a, &mask, &c, alpha);
/* tensor connections */
//XLink::MakeLink(&a, &mask, &c, MATH_SUM);
//XLink::AddParamToHead(&c, alpha);
// TODO!!
ShowNTErrors("TODO!");
return c;
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2019-04-24
* I'll attend several conferences and workshops in the following weeks -
* busy days :(
*/
#include "../../XDevice.h"
#include "../../XUtility.h"
#include "Sub.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
mask entries of a given tensor (CUDA Kernel)
c(i) = a(i) if mask(i) is non-zero
c(i) = alpha if mask(i) = 0
>> a - a matrix
>> mask - mask matrix
>> c - where we put masked a
>> size - the size of a/mask/c
>> alpha - the value written to masked-out entries
*/
__global__
void KernelMASK(DTYPE * a, int * mask, DTYPE * c, int size, DTYPE alpha)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
if (mask[i] == 0) {
c[i] = alpha;
}
else {
c[i] = a[i];
}
}
}
/*
mask entries of a given tensor (cuda version)
>> a - a tensor
>> mask - mask tensor
>> c - where we put masked a
>> alpha - value
*/
void _CudaMask(const XTensor * a, const XTensor * mask, XTensor * c, DTYPE alpha)
{
CheckNTErrors(a && mask && c, "Empty tensor input!");
CheckNTErrors((a->unitNum == mask->unitNum && a->unitNum == c->unitNum),
"Unmatched tensors in addition!");
CheckNTErrors(mask->dataType == X_INT, "The mask tensor must be in X_INT!")
//CheckNTErrors((a->dataType == mask->dataType && a->dataType == c->dataType),
// "Unmatched tensors in addition!");
CheckNTErrors((a->devID == mask->devID && a->devID == c->devID),
"The tensors must be on the same!");
int devIDBackup = XDevice::GetGPUDevice();
XDevice::SetGPUDevice(a->devID);
if (!a->isSparse && !mask->isSparse) {
CheckNTErrors(!c->isSparse, "Illegal use of sparse matrix in addition!");
if (a->dataType == DEFAULT_DTYPE &&
mask->dataType == X_INT &&
c->dataType == DEFAULT_DTYPE)
{
int gridSize[3], blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
KernelMASK << <blocks, threads >> >((DTYPE*)a->data, (int *)mask->data, (DTYPE*)c->data, a->unitNum, alpha);
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
}
else {
// TODO!!
ShowNTErrors("TODO!");
}
XDevice::SetGPUDevice(devIDBackup);
}
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2019-04-24
* I'll attend several conferences and workshops in the following weeks -
* busy days :(
*/
#ifndef __MASK_CUH__
#define __MASK_CUH__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* mask entries of a given tensor (cuda version) */
void _CudaMask(const XTensor * a, const XTensor * mask, XTensor * c = NULL, DTYPE alpha = (DTYPE)1.0);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif // __MASK_CUH__
\ No newline at end of file
......@@ -108,9 +108,9 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
cBlockNum *= b->dimSizeRDI[i];
}
XList * aList = new XList(10);
XList * bList = new XList(10);
XList * cList = new XList(10);
TensorList * aList = new TensorList(10);
TensorList * bList = new TensorList(10);
TensorList * cList = new TensorList(10);
int aDimSize[2] = { -a->dimSizeRDI[1], a->dimSizeRDI[0] };
int bDimSize[2] = { -b->dimSizeRDI[1], b->dimSizeRDI[0] };
int cDimSize[2] = { -c->dimSizeRDI[1], c->dimSizeRDI[0] };
......@@ -202,6 +202,42 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
delete cList;
}
bool CheckMMulShape(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c)
{
if (!(a && b && c))
return false;
if(!(a->dataType == b->dataType && a->dataType == c->dataType))
return false;
if (!(a->order >= 2 && b->order >= 2 && c->order >= 2))
return false;
int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
int order = a->order + b->order - 2;
int sub = 0;
int * dimSize = new int[order];
for (int i = 2; i < a->order; i++)
dimSize[sub++] = a->dimSizeRDI[a->order + 1 - i];
for (int i = 2; i < b->order; i++)
dimSize[sub++] = b->dimSizeRDI[b->order + 1 - i];
dimSize[sub++] = an;
dimSize[sub++] = bm;
for (int i = 0; i < order; i++) {
if (dimSize[i] != c->dimSize[i]) {
delete[] dimSize;
return false;
}
}
delete[] dimSize;
return true;
}
/*
matrix multiplication (return an XTensor structure) c = trans(a) * trans(b) * alpha
make a new tensor to keep the result and return it
......@@ -266,6 +302,53 @@ XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
return c;
}
void MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
const XTensor &b, MATRIX_TRANS_TYPE transposedB, XTensor &c,
DTYPE alpha, XPRunner * parallelRunner, bool requireLink)
{
CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have an order >= 2!");
if (!c.isInit || !CheckMMulShape(&a, transposedA, &b, transposedB, &c)) {
int an = transposedA == X_TRANS ? a.dimSizeRDI[0] : a.dimSizeRDI[1];
int am = transposedA == X_TRANS ? a.dimSizeRDI[1] : a.dimSizeRDI[0];
int bn = transposedB == X_TRANS ? b.dimSizeRDI[0] : b.dimSizeRDI[1];
int bm = transposedB == X_TRANS ? b.dimSizeRDI[1] : b.dimSizeRDI[0];
CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
int order = a.order + b.order - 2;
int sub = 0;
int * dimSize = new int[order];
for (int i = 2; i < a.order; i++)
dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
for (int i = 2; i < b.order; i++)
dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
dimSize[sub++] = an;
dimSize[sub++] = bm;
float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
InitTensor(&c, order, dimSize, a.dataType, dr, a.devID, a.mem);
/* destroy variables */
delete[] dimSize;
}
/* call _MatrixMul function */
_MatrixMul(&a, transposedA, &b, transposedB, &c, alpha, 0, parallelRunner);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL);
XLink::AddParamToHeadTrans(&c, transposedA);
XLink::AddParamToHeadTrans(&c, transposedB);
XLink::AddParamToHead(&c, alpha);
}
}
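/* usage sketch (illustrative, not part of the library): the 2-D case c = a * b with
a: (2, 3) and b: (3, 4) produces c: (2, 4); c is created or resized inside when its
shape does not match */
void MatrixMulExample()
{
    XTensor a, b, c;
    InitTensor2DV2(&a, 2, 3);
    InitTensor2DV2(&b, 3, 4);
    /* ... fill a and b ... */
    MatrixMul(a, X_NOTRANS, b, X_NOTRANS, c);
}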
/*
matrix multiplication with no transposition c = a * b * alpha
>> a - tensor a
......@@ -316,6 +399,52 @@ XTensor MatrixMul(const XTensor &a, const XTensor &b,
return c;
}
void MatrixMul(const XTensor &a, const XTensor &b, XTensor &c,
DTYPE alpha, XPRunner * parallelRunner, bool requireLink)
{
CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have an order >= 2!");
if (!c.isInit || !CheckMMulShape(&a, X_NOTRANS, &b, X_NOTRANS, &c)) {
int an = a.dimSizeRDI[1];
int am = a.dimSizeRDI[0];
int bn = b.dimSizeRDI[1];
int bm = b.dimSizeRDI[0];
CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
int order = a.order + b.order - 2;
int sub = 0;
int * dimSize = new int[order];
for (int i = 2; i < a.order; i++)
dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
for (int i = 2; i < b.order; i++)
dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
dimSize[sub++] = an;
dimSize[sub++] = bm;
float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
InitTensor(&c, order, dimSize, a.dataType, dr, a.devID, a.mem);
/* destroy variables */
delete[] dimSize;
}
/* call _MatrixMul function */
_MatrixMul(&a, X_NOTRANS, &b, X_NOTRANS, &c, alpha, 0, parallelRunner);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL);
XLink::AddParamToHeadTrans(&c, X_NOTRANS);
XLink::AddParamToHeadTrans(&c, X_NOTRANS);
XLink::AddParamToHead(&c, alpha);
}
}
} // namespace nts(NiuTrans.Tensor)
......
......@@ -59,10 +59,16 @@ Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x
XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
void MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
XTensor &c, DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL, bool requireLink = false);
/* matrix multiplication with no transposition c = a * b * alpha*/
XTensor MatrixMul(const XTensor &a, const XTensor &b,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
void MatrixMul(const XTensor &a, const XTensor &b, XTensor &c,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -38,17 +38,23 @@ argument5: matrix a
argument6: matrix b
argument7: matrix c (c=a*b*\alpha + c*beta)
*/
void _MatrixMul2DMultiTheading(XList * args)
void _MatrixMul2DMultiTheading(TensorList * args)
{
int x1 = *(int*)args->GetItem(0);
int y1 = *(int*)args->GetItem(1);
int x2 = *(int*)args->GetItem(2);
int y2 = *(int*)args->GetItem(3);
XTensor * a = (XTensor*)args->GetItem(4);
XTensor * b = (XTensor*)args->GetItem(5);
XTensor * c = (XTensor*)args->GetItem(6);
DTYPE alpha = *(DTYPE*)args->GetItem(7);
DTYPE beta = *(DTYPE*)args->GetItem(8);
CheckNTErrors(args->count == 2, "invalid argument number!");
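/* the job arguments are packed as two sub-lists:
args[0] - an IntList holding the block corners {x1, y1, x2, y2}
args[1] - a TensorList holding {a, b, c, &alpha, &beta} */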
IntList * indexArgs = (IntList*)args->GetItem(0);
TensorList * matrixArgs = (TensorList*)args->GetItem(1);
CheckNTErrors(indexArgs->count == 4, "invalid argument number!");
CheckNTErrors(matrixArgs->count == 5, "invalid argument number!");
XTensor * a = matrixArgs->GetItem(0);
XTensor * b = matrixArgs->GetItem(1);
XTensor * c = matrixArgs->GetItem(2);
DTYPE alpha = *(DTYPE*)(matrixArgs->GetItem(3));
DTYPE beta = *(DTYPE*)(matrixArgs->GetItem(4));
int x1 = indexArgs->GetItem(0);
int y1 = indexArgs->GetItem(1);
int x2 = indexArgs->GetItem(2);
int y2 = indexArgs->GetItem(3);
#ifdef FAST_MATRIX
int am = a->dimSize[1];
......
......@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
matrix multiplication for a block (x1,y1) - (x2,y2)
where (x1,y1) is the upper-left corner and (x2,y2) is the bottom-right corner
*/
void _MatrixMul2DMultiTheading(XList * args);
void _MatrixMul2DMultiTheading(TensorList * args);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -227,9 +227,9 @@ c_i = trans(a_i) * trans(b_i) * \alpha + c_i * \beta for each i in [0,count-1]
>> alpha - scalar
>> beta - scalar
*/
void _MatrixMulBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA,
const XList * b, MATRIX_TRANS_TYPE transposedB,
XList * c, DTYPE alpha, DTYPE beta)
void _MatrixMulBatchedCPU(const TensorList * a, MATRIX_TRANS_TYPE transposedA,
const TensorList * b, MATRIX_TRANS_TYPE transposedB,
TensorList * c, DTYPE alpha, DTYPE beta)
{
CheckNTErrors(a && b && c, "Empty input lists!");
CheckNTErrors(a->count == b->count && a->count == c->count, "Input lists must be of the same size!");
......
......@@ -58,8 +58,8 @@ void _MatrixMulBatchedCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA, cons
matrix multiplication of the two tensors c = trans(a) * trans(b) * alpha + c * beta (for list inputs)
optimized for GPU
*/
void _MatrixMulBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA, const XList * b, MATRIX_TRANS_TYPE transposedB,
XList * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
void _MatrixMulBatchedCPU(const TensorList * a, MATRIX_TRANS_TYPE transposedA, const TensorList * b, MATRIX_TRANS_TYPE transposedB,
TensorList * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
/*
matrix multiplication of the two tensors (return an XTensor structure) c = trans(a) * trans(b) * alpha
......
......@@ -117,7 +117,6 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
ShowNTErrors("Something is wrong!");
}
/* tensor connections */
XLink::MakeLink(&x, &w, &b, &c, MATH_MULANDSHIFT);
XLink::AddParamToHeadInt(&c, n);
......
......@@ -219,4 +219,55 @@ XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim
return c;
}
/*
element-wise product of two tensors
c(i) = a(i)*b(i) + \alpha * c(i)
where i is the index of the item
>> a - tensor a
>> b - tensor b
>> c - result tensor
>> alpha - the coefficient
>> leadingDim - the dimension along which we perform broadcasting
>> requireLink - whether to add the operation to the network
*/
void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadingDim, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
int n = GetMultiplyDimIndex(a, b);
if (n == -1) {
CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
/* call _Multiply function */
_Multiply(&a, &b, &c, 0, leadingDim);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLY);
XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHeadInt(&c, leadingDim);
}
}
else if (n >= 0 && n < a.order) {
/* call _MultiplyDim function */
_MultiplyDim(&a, &b, &c, n, alpha);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, alpha);
}
}
else {
ShowNTErrors("Something is wrong!");
}
}
} // namespace nts(NiuTrans.Tensor)
......@@ -49,6 +49,13 @@ where i is the index of the element
*/
XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha = 0.0, int leadingDim = 0);
/*
element-wise product of two tensors:
c(i) = a(i)*b(i) + \alpha * c(i)
where i is the index of the element
*/
void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha = 0.0, int leadingDim = 0, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor)
#endif // __MULTIPLY_H__
\ No newline at end of file
......@@ -170,6 +170,36 @@ XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n)
}
/*
tensor multiplication
c = a * b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a * b + \alpha * c
>> n - the dimension index
>> requireLink - whether to add the operation to the network
*/
void MultiplyDim(const XTensor &a, const XTensor &b, XTensor &c, int n, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
/* call _MultiplyDim function */
_MultiplyDim(&a, &b, &c, n, 0);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, 0);
}
}
/*
tensor broadcast multiplication
c = a * b + c * \beta
where some of dimensions of b can be of size 1
......@@ -309,4 +339,30 @@ XTensor MultiplyBroadcast(const XTensor &a, const XTensor &b)
return c;
}
/*
tensor broadcast multiplication
c = a * b + c * \beta
where some of dimensions of b can be of size 1
>> a - a tensor
>> b - another tensor that would be broadcasted
>> c - the resulting tensor
>> requireLink - whether to add the operation to the network
*/
void MultiplyBroadcast(const XTensor &a, const XTensor &b, XTensor &c, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
/* call _MultiplyBroadcast function */
_MultiplyBroadcast(&a, &b, &c, 0);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYBROADCAST);
XLink::AddParamToHead(&c, 0);
}
}
}
......@@ -38,6 +38,10 @@ void _MultiplyDimMe(XTensor * a, const XTensor * b, int n, DTYPE alpha = 0.0);
i.e., a is multiplied with b by broadcasting. We make a new tensor c to keep the result and return it */
XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n);
/* tensor multiplication c = a * b + \alpha * c where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting */
void MultiplyDim(const XTensor &a, const XTensor &b, XTensor &c, int n, bool requireLink = false);
/* tensor multiplication summation c = a * b + c * \beta where some of dimensions of b can be of size 1 */
void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
......@@ -45,6 +49,9 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
we return the resulting tensor here */
XTensor MultiplyBroadcast(const XTensor &a, const XTensor &b);
/* tensor multiplication summation c = a * b + c * \beta where some of dimensions of b can be of size 1 */
void MultiplyBroadcast(const XTensor &a, const XTensor &b, XTensor &c, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor)
#endif // __MULTIPLYDIM_H__
......@@ -79,4 +79,25 @@ XTensor Negate(const XTensor & a)
return b;
}
/*
set every entry to its minus value
>> a - input tensor we are processing
>> b - output tensor we are processing
>> requireLink - whether to add the operation to the network
*/
void Negate(const XTensor & a, XTensor & b, bool requireLink)
{
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {
InitTensor(&b, &a);
}
/* call _Negate function */
_Negate(&a, &b);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_NEGATE);
}
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -41,6 +41,9 @@ make a new tensor to keep the result and return it
*/
XTensor Negate(const XTensor & a);
/* set every entry to its minus value */
void Negate(const XTensor & a, XTensor & b, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor)
#endif // __NEGATE_H__
......@@ -84,4 +84,25 @@ XTensor Sign(const XTensor & a)
return b;
}
/*
set every entry to its sign value
>> a - input tensor we are processing
>> b - output tensor we are processing
>> requireLink - whether to add the operation to the network
*/
void Sign(const XTensor & a, XTensor & b, bool requireLink)
{
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {
InitTensor(&b, &a);
}
/* call _Sign function */
_Sign(&a, &b);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_SIGN);
}
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -41,6 +41,9 @@ make a new tensor to keep the result and return it
*/
XTensor Sign(const XTensor & a);
/* set every entry to its sign value */
void Sign(const XTensor & a, XTensor & b, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor)
#endif // __SIGN_H__
......@@ -196,4 +196,47 @@ XTensor Sub(const XTensor &a, const XTensor &b, DTYPE beta)
return c;
}
/*
tensor subtraction c = a - b * \beta
>> a - a tensor
>> b - another tensor
>> c - where we put a-b*\beta
>> beta - the scaling factor
>> requireLink - whether to add the operation to the network
*/
void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
int n = GetSubDimIndex(a, b);
if (n == -1) {
/* call _Sub function */
_Sub(&a, &b, &c, beta);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUB);
XLink::AddParamToHead(&c, beta);
}
}
else if (n >= 0 && n < a.order) {
/* call _SubDim function */
_SubDim(&a, &b, &c, n, beta);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
}
}
else {
ShowNTErrors("Something is wrong!");
}
}
} // namespace nts(NiuTrans.Tensor)
......@@ -42,6 +42,9 @@ make a new tensor c to keep the result and return it
*/
XTensor Sub(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0);
/* tensor subtraction c = a - b * \beta */
void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta = (DTYPE)1.0, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor)
#endif // __SUB_H__
......@@ -171,4 +171,35 @@ XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
return c;
}
/*
tensor subtraction
c = a - b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a-b*\beta
>> n - the dimension index
>> beta - the scaling factor
>> requireLink - whether to add the operation to the network
*/
void SubDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
/* call _Sub function */
_SubDim(&a, &b, &c, n, beta);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
}
}
}
......@@ -38,6 +38,10 @@ void _SubDim(XTensor * a, const XTensor * b, int n, DTYPE beta = (DTYPE)1.0);
i.e., a is subtracted with b by broadcasting. We make a new tensor c to keep the result and return it */
XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta = (DTYPE)1.0);
/* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is subtracted with b by broadcasting*/
void SubDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta = (DTYPE)1.0, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor)
#endif // __SUBDIM_H__
......@@ -201,4 +201,46 @@ XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta)
return c;
}
/*
tensor summation c = a + b * \beta
>> a - a tensor
>> b - another tensor
>> c - where we put a+b*\beta
>> beta - the scaling factor
>> requireLink - whether to add the operation to the network
*/
void Sum(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
int n = GetSumDimIndex(a, b);
if (n == -1) {
/* call _Sum function */
_Sum(&a, &b, &c, beta);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUM);
XLink::AddParamToHead(&c, beta);
}
}
else if (n >= 0 && n < a.order) {
/* call _SumDim function */
_SumDim(&a, &b, &c, n, beta);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUMDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
}
}
else {
ShowNTErrors("Something is wrong!");
}
}
} // namespace nts(NiuTrans.Tensor)
......@@ -41,6 +41,9 @@ make a new tensor c to keep the result and return it
*/
XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0);
/* tensor summation c = a + b * \beta */
void Sum(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta = (DTYPE)1.0, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor)
#endif // __SUM_H__
......@@ -189,6 +189,37 @@ XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
}
/*
tensor summation
c = a + b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is summed with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a+b*\beta
>> n - the dimension index
>> beta - the scaling factor
>> requireLink - whether to add the operation to the network
*/
void SumDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
/* call _SumDim function */
_SumDim(&a, &b, &c, n, beta);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUMDIM);
XLink::AddParamToHeadInt(&c, n);
XLink::AddParamToHead(&c, beta);
}
}
/*
tensor broadcast summation c = a + b * \beta where some of dimensions of b can be of size 1
c = a + b * \beta
......@@ -329,4 +360,30 @@ XTensor SumBroadcast(const XTensor &a, const XTensor &b, DTYPE beta)
return c;
}
/*
tensor broadcast summation c = a + b * \beta where some of dimensions of b can be of size 1
c = a + b * \beta
>> a - a tensor
>> b - another tensor that would be broadcasted
>> c - the resulting tensor
>> beta - the scaling factor
>> requireLink - whether to add the operation to the network
*/
void SumBroadcast(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta, bool requireLink)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
/* call _SumBroadcast function */
_SumBroadcast(&a, &b, &c, beta);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_SUMBROADCAST);
XLink::AddParamToHead(&c, beta);
}
}
}
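A sketch contrasting the two broadcast-style summations here (shapes are assumptions for illustration): SumDim expects b to match exactly the n-th dimension of a, while SumBroadcast accepts a b whose size-1 dimensions are broadcast:
void SumBroadcastExample(const XTensor &a /* e.g. 4 x 8 */, const XTensor &bias /* e.g. 8 */,
                         const XTensor &mask /* e.g. 4 x 1 */, XTensor &c)
{
    SumDim(a, bias, c, 1);       /* add bias to every row of a along dimension 1 */
    SumBroadcast(a, mask, c);    /* mask's size-1 dimension is broadcast over a */
}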
......@@ -42,6 +42,10 @@ void _SumDim(XTensor * a, const XTensor * b, int n, DTYPE beta = (DTYPE)1.0);
i.e., a is summed with b by broadcasting. We make a new tensor c to keep the result and return it */
XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta = (DTYPE)1.0);
/* tensor summation c = a + b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is summed with b by broadcasting */
void SumDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta = (DTYPE)1.0, bool requireLink = false);
/* tensor broadcast summation c = a + b * \beta where some of dimensions of b can be of size 1 */
void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
......@@ -49,6 +53,9 @@ void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta
we return the resulting tensor here */
XTensor SumBroadcast(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0);
/* tensor broadcast summation c = a + b * \beta where some of dimensions of b can be of size 1 */
void SumBroadcast(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta = (DTYPE)1.0, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor)
#endif // __SUMDIM_H__
......@@ -201,9 +201,9 @@ void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
matrix multiplication via cuda version BLAS
*/
void _CudaBLASMatrixMULList(cublasHandle_t * handle,
const XList * a, MATRIX_TRANS_TYPE transposedA,
const XList * b, MATRIX_TRANS_TYPE transposedB,
XList * c,
const TensorList * a, MATRIX_TRANS_TYPE transposedA,
const TensorList * b, MATRIX_TRANS_TYPE transposedB,
TensorList * c,
int count, DTYPE alpha, DTYPE beta)
{
CheckNTErrors((a && b && c), "Empty input lists!");
......
......@@ -56,8 +56,8 @@ void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
/* matrix multiplication in batch mode via cuda version BLAS */
void _CudaBLASMatrixMULList(cublasHandle_t * handle, const XList * a, MATRIX_TRANS_TYPE transposedA,
const XList * b, MATRIX_TRANS_TYPE transposedB, XList * c,
void _CudaBLASMatrixMULList(cublasHandle_t * handle, const TensorList * a, MATRIX_TRANS_TYPE transposedA,
const TensorList * b, MATRIX_TRANS_TYPE transposedB, TensorList * c,
int count, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
#endif
......
......@@ -111,9 +111,10 @@ void _IndexToOnehot(XTensor * index, XTensor * onehot, int size, float labelSmoo
onehot->SetZeroAll();
#ifdef USE_CUDA
float confidence = 1 - labelSmoothingP;
float lowconfidence = labelSmoothingP / size;
#ifdef USE_CUDA
if(onehot->devID >= 0 && index->devID >= 0) {
_CudaIndexToOnehot(index, onehot, size, confidence, lowconfidence);
return;
......@@ -129,8 +130,7 @@ void _IndexToOnehot(XTensor * index, XTensor * onehot, int size, float labelSmoo
for (int i = 0; i < blockNum; i++) {
int id = indexData[i];
DTYPE * od = onehotData + i * stride;
od[id] = 2;
//onehotData[i * stride + id] = 1;
od[id] = 1;
}
}
......
......@@ -31,16 +31,31 @@ int scale(int x, int scale)
return x * scale;
}
float scale(float x, float scale)
{
return x * scale;
}
int descale(int x, int descale)
{
return x / descale;
}
float descale(float x, float descale)
{
return x / descale;
}
int shift(int x, int shift)
{
return x + shift;
}
float shift(float x, float shift)
{
return x + shift;
}
int mod(int x, int mod)
{
return x % mod;
......@@ -48,7 +63,7 @@ int mod(int x, int mod)
#ifdef USE_CUDA
/* define three macros separately, specifying the respective function names (GPU mode) */
#define _SIMPLE_BINARY_FUNCTION(_funcName, _cudaFuncName, origFunc) \
#define _SIMPLE_BINARY_FUNCTION_INT(_funcName, _cudaFuncName, origFunc) \
void _funcName(const XTensor * a, XTensor * b, int num) \
{ \
/* run it on GPUs */ \
......@@ -58,82 +73,188 @@ void _funcName(const XTensor * a, XTensor * b, int num) \
} \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same data type!"); \
CheckNTErrors((a->dataType == X_INT), "TODO!"); \
CheckNTErrors((a->dataType == X_INT&&b->dataType == X_INT), "TODO!"); \
int * d = (int*)a->data; \
int * db = (int*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (int)origFunc(d[i], num); \
} \
#define _SIMPLE_BINARY_FUNCTION(_funcName, _cudaFuncName, origFunc) \
void _funcName(const XTensor * a, XTensor * b, float num) \
{ \
/* run it on GPUs */ \
if (a->devID >= 0) { \
_cudaFuncName(a, b, num); \
return; \
} \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same data type!"); \
CheckNTErrors((a->dataType == X_FLOAT&&b->dataType == X_FLOAT), "TODO!");\
float * d = (float*)a->data; \
float * db = (float*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (float)origFunc(d[i], num); \
}
#define SIMPLE_BINARY_FUNCTION_ME(funcName, _funcName) \
#define SIMPLE_BINARY_FUNCTION_ME_INT(funcName, _funcName) \
void funcName(XTensor &a, int num) \
{ \
_funcName(&a, &a, num); \
}
} \
#define SIMPLE_BINARY_FUNCTION(funcName, _funcName) \
#define SIMPLE_BINARY_FUNCTION_ME(funcName, _funcName) \
void funcName(XTensor &a, float num) \
{ \
_funcName(&a, &a, num); \
} \
#define SIMPLE_BINARY_FUNCTION_INT(funcName, _funcName) \
void funcName(const XTensor &a, XTensor &b, int num) \
{ \
_funcName(&a, &b, num); \
}
} \
_SIMPLE_BINARY_FUNCTION(_Scale, _CudaScale, scale)
SIMPLE_BINARY_FUNCTION_ME(Scale, _Scale)
SIMPLE_BINARY_FUNCTION(Scale, _Scale)
#define SIMPLE_BINARY_FUNCTION(funcName, _funcName, operationId) \
XTensor funcName(const XTensor &a, float num) \
{ \
XTensor b(&a); \
b.SetTMPFlag(); \
_funcName(&a, &b, num); \
XLink::MakeLink(&a, NULL, &b, operationId); \
return b; \
} \
_SIMPLE_BINARY_FUNCTION(_Descale, _CudaDescale, descale)
SIMPLE_BINARY_FUNCTION_ME(Descale, _Descale)
SIMPLE_BINARY_FUNCTION(Descale, _Descale)
#define SIMPLE_BINARY_FUNCTION_VOID(funcName, _funcName, operationId) \
void funcName(const XTensor &a, XTensor &b, float num, bool requireLink) \
{ \
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) { \
InitTensor(&b, &a); \
} \
_funcName(&a, &b, num); \
if (requireLink) { \
XLink::MakeLink(&a, NULL, &b, operationId); \
} \
} \
_SIMPLE_BINARY_FUNCTION(_Shift, _CudaShift, shift)
SIMPLE_BINARY_FUNCTION_ME(Shift, _Shift)
SIMPLE_BINARY_FUNCTION(Shift, _Shift)
_SIMPLE_BINARY_FUNCTION_INT(_Scale, _CudaScale, scale)
SIMPLE_BINARY_FUNCTION_ME_INT(_ScaleMe, _Scale)
SIMPLE_BINARY_FUNCTION_INT(Scale, _Scale)
_SIMPLE_BINARY_FUNCTION(_Scale, _CudaScaleFloat, scale)
SIMPLE_BINARY_FUNCTION_ME(_ScaleMe, _Scale)
SIMPLE_BINARY_FUNCTION(Scale, _Scale, MATH_SCALE)
SIMPLE_BINARY_FUNCTION_VOID(Scale, _Scale, MATH_SCALE)
_SIMPLE_BINARY_FUNCTION_INT(_Descale, _CudaDescale, descale)
SIMPLE_BINARY_FUNCTION_ME_INT(_DescaleMe, _Descale)
SIMPLE_BINARY_FUNCTION_INT(Descale, _Descale)
_SIMPLE_BINARY_FUNCTION(_Descale, _CudaDescaleFloat, descale)
SIMPLE_BINARY_FUNCTION_ME(_DescaleMe, _Descale)
SIMPLE_BINARY_FUNCTION(Descale, _Descale, MATH_DESCALE)
SIMPLE_BINARY_FUNCTION_VOID(Descale, _Descale, MATH_DESCALE)
_SIMPLE_BINARY_FUNCTION_INT(_Shift, _CudaShift, shift)
SIMPLE_BINARY_FUNCTION_ME_INT(_ShiftMe, _Shift)
SIMPLE_BINARY_FUNCTION_INT(Shift, _Shift)
_SIMPLE_BINARY_FUNCTION(_Shift, _CudaShiftFloat, shift)
SIMPLE_BINARY_FUNCTION_ME(_ShiftMe, _Shift)
SIMPLE_BINARY_FUNCTION(Shift, _Shift, MATH_SHIFT)
SIMPLE_BINARY_FUNCTION_VOID(Shift, _Shift, MATH_SHIFT)
_SIMPLE_BINARY_FUNCTION(_Mod, _CudaMod, mod)
SIMPLE_BINARY_FUNCTION_ME(Mod, _Mod)
SIMPLE_BINARY_FUNCTION(Mod, _Mod)
_SIMPLE_BINARY_FUNCTION_INT(_Mod, _CudaMod, mod)
SIMPLE_BINARY_FUNCTION_ME_INT(_ModMe, _Mod)
SIMPLE_BINARY_FUNCTION_INT(Mod, _Mod)
#else
/* define three macros separately, specifying the respective function names (CPU mode) */
#define _SIMPLE_BINARY_FUNCTION(_funcName, origFunc) \
#define _SIMPLE_BINARY_FUNCTION_INT(_funcName, _cudaFuncName, origFunc) \
void _funcName(const XTensor * a, XTensor * b, int num) \
{ \
/* run it on GPUs */ \
if (a->devID >= 0) { \
_cudaFuncName(a, b, num); \
return; \
} \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same data type!"); \
CheckNTErrors((a->dataType == X_INT), "TODO!"); \
CheckNTErrors((a->dataType == X_INT&&b->dataType == X_INT), "TODO!"); \
int * d = (int*)a->data; \
int * db = (int*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (int)origFunc(d[i], num); \
} \
#define _SIMPLE_BINARY_FUNCTION(_funcName, _cudaFuncName, origFunc) \
void _funcName(const XTensor * a, XTensor * b, float num) \
{ \
/* run it on GPUs */ \
if (a->devID >= 0) { \
_cudaFuncName(a, b, num); \
return; \
} \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same data type!"); \
CheckNTErrors((a->dataType == X_FLOAT&&b->dataType == X_FLOAT), "TODO!");\
float * d = (float*)a->data; \
float * db = (float*)b->data; \
for (int i = 0; i < a->unitNum; i++) \
db[i] = (float)origFunc(d[i], num); \
}
#define SIMPLE_BINARY_FUNCTION_ME_INT(funcName, _funcName) \
void funcName(XTensor &a, int num) \
{ \
_funcName(&a, &a, num); \
} \
#define SIMPLE_BINARY_FUNCTION_ME(funcName, _funcName) \
void funcName(XTensor & a, int num) \
void funcName(XTensor &a, float num) \
{ \
_funcName(&a, &a, num); \
}
} \
#define SIMPLE_BINARY_FUNCTION_INT(funcName, _funcName) \
void funcName(const XTensor &a, XTensor &b, int num) \
{ \
_funcName(&a, &b, num); \
} \
#define SIMPLE_BINARY_FUNCTION(funcName, _funcName) \
void funcName(const XTensor & a, XTensor &b, int num) \
void funcName(const XTensor &a, XTensor &b, float num) \
{ \
_funcName(&a, &b, num); \
}
} \
_SIMPLE_BINARY_FUNCTION_INT(_Scale, _CudaScale, scale)
SIMPLE_BINARY_FUNCTION_ME_INT(Scale, _Scale)
SIMPLE_BINARY_FUNCTION_INT(Scale, _Scale)
_SIMPLE_BINARY_FUNCTION(_Scale, scale)
_SIMPLE_BINARY_FUNCTION(_Scale, _CudaScaleFloat, scale)
SIMPLE_BINARY_FUNCTION_ME(Scale, _Scale)
SIMPLE_BINARY_FUNCTION(Scale, _Scale)
_SIMPLE_BINARY_FUNCTION(_Descale, descale)
_SIMPLE_BINARY_FUNCTION_INT(_Descale, _CudaDescale, descale)
SIMPLE_BINARY_FUNCTION_ME_INT(Descale, _Descale)
SIMPLE_BINARY_FUNCTION_INT(Descale, _Descale)
_SIMPLE_BINARY_FUNCTION(_Descale, _CudaDescaleFloat, descale)
SIMPLE_BINARY_FUNCTION_ME(Descale, _Descale)
SIMPLE_BINARY_FUNCTION(Descale, _Descale)
_SIMPLE_BINARY_FUNCTION(_Shift, shift)
_SIMPLE_BINARY_FUNCTION_INT(_Shift, _CudaShift, shift)
SIMPLE_BINARY_FUNCTION_ME_INT(Shift, _Shift)
SIMPLE_BINARY_FUNCTION_INT(Shift, _Shift)
_SIMPLE_BINARY_FUNCTION(_Shift, _CudaShiftFloat, shift)
SIMPLE_BINARY_FUNCTION_ME(Shift, _Shift)
SIMPLE_BINARY_FUNCTION(Shift, _Shift)
_SIMPLE_BINARY_FUNCTION(_Mod, mod)
SIMPLE_BINARY_FUNCTION_ME(Mod, _Mod)
SIMPLE_BINARY_FUNCTION(Mod, _Mod)
_SIMPLE_BINARY_FUNCTION_INT(_Mod, _CudaMod, mod)
SIMPLE_BINARY_FUNCTION_ME_INT(Mod, _Mod)
SIMPLE_BINARY_FUNCTION_INT(Mod, _Mod)
#endif
......
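A sketch of the split int/float entry points defined by these macros (tensor data types are assumptions for illustration); the integer overloads keep the original behaviour, the float overloads are the new ones, and the _...Me forms work in place:
void ScaleFamilyExample(const XTensor &xInt /* X_INT */, XTensor &xFloat /* X_FLOAT */, XTensor &y)
{
    Scale(xInt, y, 2);         /* integer path: y_i = x_i * 2 */
    Scale(xFloat, y, 2.0F);    /* float path: y_i = x_i * 2.0 */
    _ScaleMe(xFloat, 0.5F);    /* in place: x_i = x_i * 0.5 */
    Mod(xInt, y, 3);           /* mod is defined on the integer path only */
}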
......@@ -36,18 +36,36 @@ int cudascale(int x, int scale)
}
__device__
float cudascale(float x, float scale)
{
return x * scale;
}
__device__
int cudadescale(int x, int descale)
{
return x / descale;
}
__device__
float cudadescale(float x, float descale)
{
return x / descale;
}
__device__
int cudashift(int x, int shift)
{
return x + shift;
}
__device__
float cudashift(float x, float shift)
{
return x + shift;
}
__device__
int cudamod(int x, int mod)
{
return x % mod;
......@@ -92,9 +110,51 @@ void _Cuda##funcName(const XTensor * a, XTensor * b, int num) \
BacktoCudaDev(a->devID, devIDBackup); \
} \
#define SIMPLE_BINARY_FUNCTION_FLOAT_GPU(funcName, origFunc) \
__global__ \
void Kernel##funcName(float * a, float * b, int size, float num) \
{ \
int i = blockDim.x * blockIdx.x + threadIdx.x; \
\
if (i < size) \
b[i] = (float)origFunc(a[i], num); \
} \
\
\
void _Cuda##funcName(const XTensor * a, XTensor * b, float num) \
{ \
CheckNTErrors((XTensor::IsSameShaped(a, b)), \
"Input tensors should have the same type!"); \
CheckNTErrors((a->isSparse == false), "TODO!"); \
\
int gridSize[3]; \
int blockSize[3]; \
\
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize); \
\
dim3 blocks(gridSize[0]); \
dim3 threads(blockSize[0]); \
\
int devIDBackup; \
ProtectCudaDev(a->devID, devIDBackup); \
\
if (a->dataType == X_FLOAT) { \
Kernel##funcName<<<blocks, threads>>> \
((float*)a->data, (float*)b->data, a->unitNum, num);\
} \
else { \
ShowNTErrors("TODO!"); \
} \
\
BacktoCudaDev(a->devID, devIDBackup); \
}
SIMPLE_BINARY_FUNCTION_GPU(Scale, cudascale)
SIMPLE_BINARY_FUNCTION_FLOAT_GPU(ScaleFloat, cudascale)
SIMPLE_BINARY_FUNCTION_GPU(Descale, cudadescale)
SIMPLE_BINARY_FUNCTION_FLOAT_GPU(DescaleFloat, cudadescale)
SIMPLE_BINARY_FUNCTION_GPU(Shift, cudashift)
SIMPLE_BINARY_FUNCTION_FLOAT_GPU(ShiftFloat, cudashift)
SIMPLE_BINARY_FUNCTION_GPU(Mod, cudamod)
#endif // USE_CUDA
......
......@@ -32,20 +32,29 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* scale each entry (CUDA Kernel) */
__global__
void KernelScale(int * a, int * b, int size, int scale);
__global__
void KernelScale(int * a, int * b, int size, float scale);
/* scale each entry */
void _CudaScale(const XTensor * a, XTensor * b, int scale);
void _CudaScaleFloat(const XTensor * a, XTensor * b, float scale);
/* descale each entry (CUDA Kernel) */
__global__
void KernelDescale(int * a, int * b, int size, int scale);
__global__
void KernelDescale(int * a, int * b, int size, float scale);
/* descale each entry */
void _CudaDescale(const XTensor * a, XTensor * b, int scale);
void _CudaDescaleFloat(const XTensor * a, XTensor * b, float scale);
/* shift each entry (CUDA Kernel) */
__global__
void KernelShift(int * a, int * b, int size, int shift);
__global__
void KernelShift(int * a, int * b, int size, float shift);
/* shift each entry */
void _CudaShift(const XTensor * a, XTensor * b, int shift);
void _CudaShiftFloat(const XTensor * a, XTensor * b, float shift);
/* mod each entry (CUDA Kernel) */
__global__
......
......@@ -37,51 +37,76 @@ void _Scale(const XTensor * a, XTensor * b, float scale);
scale up tensor entries (on site)
b = a * scale
*/
void Scale(XTensor & a, int scale);
void Scale(XTensor & a, float scale);
void _ScaleMe(XTensor & a, int scale);
void _ScaleMe(XTensor & a, float scale);
/*
scale up tensor entries
b = a * scale
*/
void Scale(const XTensor & a, XTensor &b, int scale);
void Scale(const XTensor & a, XTensor &b, float scale);
void Scale(const XTensor & a, XTensor &b, float scale, bool requireLink = false);
/*
scale up tensor entries (return an XTensor structure)
b = a * scale
*/
XTensor Scale(const XTensor & a, float scale);
/*
descale tensor entries
b = a / scale
*/
void _Descale(const XTensor * a, XTensor * b, int scale);
void _Descale(const XTensor * a, XTensor * b, float scale);
/*
descale tensor entries (on site)
b = a / scale
*/
void Descale(XTensor & a, int scale);
void _DescaleMe(XTensor & a, int scale);
void _DescaleMe(XTensor & a, float scale);
/*
descale tensor entries
b = a / scale
*/
void Descale(const XTensor & a, XTensor & b, int scale);
void Descale(const XTensor & a, XTensor & b, float scale, bool requireLink = false);
/*
descale tensor entries (return an XTensor structure)
b = a / scale
*/
XTensor Descale(const XTensor & a, float scale);
/*
shift tensor entries
b = a + shift
*/
void _Shift(const XTensor * a, XTensor * b, int shift);
void _Shift(const XTensor * a, XTensor * b, float shift);
/*
shift tensor entries (on site)
b = a + shift
*/
void Shift(XTensor & a, int shift);
void _ShiftMe(XTensor & a, int shift);
void _ShiftMe(XTensor & a, float shift);
/*
shift tensor entries
b = a + shift
*/
void Shift(const XTensor & a, XTensor & b, int shift);
void Shift(const XTensor & a, XTensor & b, float shift, bool requireLink = false);
/*
shift tensor entries (return an XTensor structure)
b = a + shift
*/
XTensor Shift(const XTensor & a, float shift);
/*
mod tensor entries
......@@ -93,7 +118,7 @@ void _Mod(const XTensor * a, XTensor * b, int base);
mod tensor entries (on site)
b = a % mod
*/
void Mod(XTensor & a, int base);
void _ModMe(XTensor & a, int base);
/*
mod tensor entries
......
......@@ -94,6 +94,23 @@ XTensor Clip(const XTensor & a, DTYPE lower, DTYPE upper)
return b;
}
void Clip(const XTensor & a, XTensor & b, DTYPE lower, DTYPE upper, bool requireLink)
{
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {
InitTensor(&b, &a);
}
/* call _Clip function */
_Clip(&a, &b, lower, upper);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_CLIP);
XLink::AddParamToHead(&b, lower);
XLink::AddParamToHead(&b, upper);
}
}
/*
backward computation
......
......@@ -37,6 +37,8 @@ void _ClipMe(XTensor * a, DTYPE lower, DTYPE upper);
make a new tensor to keep the result and return it */
XTensor Clip(const XTensor & a, DTYPE lower, DTYPE upper);
void Clip(const XTensor & a, XTensor & b, DTYPE lower, DTYPE upper, bool requireLink = false);
/*
backward of Clip function
*/
......
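A usage sketch of the new Clip overload (an illustration only), clamping every entry into [lower, upper]:
void ClipExample(const XTensor &a, XTensor &b)
{
    Clip(a, b, (DTYPE)-1.0, (DTYPE)1.0);        /* b_i = min(max(a_i, -1), 1) */
    Clip(a, b, (DTYPE)-1.0, (DTYPE)1.0, true);  /* the same, with the MATH_CLIP link recorded */
}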
......@@ -138,12 +138,12 @@ XTensor Normalize(const XTensor &input, int dim, const XTensor &mean, const XTen
_Normalize(&input, &output, dim, &mean, &var, &a, &b, epsilon);
/* tensor connections */
XList list(5);
list.Add(&input);
list.Add(&mean);
list.Add(&var);
list.Add(&a);
list.Add(&b);
TensorList list(5);
list.Add((XTensor*)&input);
list.Add((XTensor*)&mean);
list.Add((XTensor*)&var);
list.Add((XTensor*)&a);
list.Add((XTensor*)&b);
XLink::MakeLink(&list, &output, MATH_NORMALIZE);
XLink::AddParamToHeadInt(&output, dim);
XLink::AddParamToHead(&output, epsilon);
......
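For reference, a sketch of building the renamed TensorList used above (an illustration; items are stored as XTensor*, which is why const arguments are cast in the code):
void ListExample(const XTensor &input, const XTensor &mean, const XTensor &var)
{
    TensorList list(3);
    list.Add((XTensor*)&input);
    list.Add((XTensor*)&mean);
    list.Add((XTensor*)&var);
}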
......@@ -102,4 +102,27 @@ XTensor Power(const XTensor & a, DTYPE p)
return b;
}
/*
get the power(a, p)
>> a - input tensor
>> b - output tensor
>> p - the exponent
>> requireLink - whether to add the operation to the network
*/
void Power(const XTensor & a, XTensor & b, DTYPE p, bool requireLink)
{
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {
InitTensor(&b, &a);
}
/* call _Power function */
_Power(&a, &b, p);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_POWER);
XLink::AddParamToHead(&b, p);
}
}
} // namespace nts(NiuTrans.Tensor)
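A usage sketch of the new Power overload (an illustration only):
void PowerExample(const XTensor &a, XTensor &b)
{
    Power(a, b, (DTYPE)2.0);         /* b_i = a_i ^ 2 */
    Power(a, b, (DTYPE)0.5, true);   /* b_i = sqrt(a_i), recorded as MATH_POWER */
}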
......@@ -41,6 +41,9 @@ make a new tensor to keep the result and return it
*/
XTensor Power(const XTensor & a, DTYPE p);
/* get the power(x, y) */
void Power(const XTensor & a, XTensor & b, DTYPE p, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor)
#endif // __POWER_H__
......@@ -118,4 +118,33 @@ XTensor ScaleAndShift(const XTensor &a, DTYPE scale, DTYPE shift)
return b;
}
/*
scale and shift all tensor entries
b = a * scale + shift
>> a - the input tensor
>> b - the output tensor
>> scale - the scaling factor
>> shift - the shift factor
>> requireLink - whether to add the operation to the network
*/
void ScaleAndShift(const XTensor & a, XTensor & b, DTYPE scale, DTYPE shift, bool requireLink)
{
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {
InitTensor(&b, &a);
}
/* call _ScaleAndShift function */
_ScaleAndShift(&a, &b, scale, shift);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_SCALEANDSHIFT);
XLink::AddParamToHead(&b, scale);
XLink::AddParamToHead(&b, shift);
}
}
} // namespace nts(NiuTrans.Tensor)
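A usage sketch of the new ScaleAndShift overload (an illustration only); shift defaults to 0 when omitted:
void ScaleAndShiftExample(const XTensor &a, XTensor &b)
{
    ScaleAndShift(a, b, (DTYPE)0.1, (DTYPE)1.0);   /* b = a * 0.1 + 1.0 */
    ScaleAndShift(a, b, (DTYPE)2.0);               /* b = a * 2.0 */
}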
......@@ -50,6 +50,12 @@ b = a * scale + shift
*/
XTensor ScaleAndShift(const XTensor &a, DTYPE scale, DTYPE shift = 0);
/*
scale and shift all tensor entries
b = a * scale + shift
*/
void ScaleAndShift(const XTensor &a, XTensor &b, DTYPE scale, DTYPE shift = 0, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor)
#endif // __SCALEANDSHIFT_H__
\ No newline at end of file
......@@ -82,58 +82,82 @@ XTensor funcName(const XTensor &a) \
return b; \
}
#define SIMPLE_UNARY_FUNCTION_VOID(funcName, _funcName, operationId) \
void funcName(const XTensor &a, XTensor &b, bool requireLink) \
{ \
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) { \
InitTensor(&b, &a); \
} \
_funcName(&a, &b); \
if (requireLink) { \
XLink::MakeLink(&a, NULL, &b, operationId); \
} \
}
_SIMPLE_UNARY_FUNCTION(_Absolute, _CudaAbsolute, fabs)
_SIMPLE_UNARY_FUNCTION_ME(_AbsoluteMe, _Absolute)
SIMPLE_UNARY_FUNCTION(Absolute, _Absolute, MATH_ABSOLUTE)
SIMPLE_UNARY_FUNCTION_VOID(Absolute, _Absolute, MATH_ABSOLUTE)
_SIMPLE_UNARY_FUNCTION(_Ceil, _CudaCeil, ceil)
_SIMPLE_UNARY_FUNCTION_ME(_CeilMe, _Ceil)
SIMPLE_UNARY_FUNCTION(Ceil, _Ceil, MATH_CEIL)
SIMPLE_UNARY_FUNCTION_VOID(Ceil, _Ceil, MATH_CEIL)
_SIMPLE_UNARY_FUNCTION(_Exp, _CudaExp, exp)
_SIMPLE_UNARY_FUNCTION_ME(_ExpMe, _Exp)
SIMPLE_UNARY_FUNCTION(Exp, _Exp, MATH_EXP)
SIMPLE_UNARY_FUNCTION_VOID(Exp, _Exp, MATH_EXP)
_SIMPLE_UNARY_FUNCTION(_Floor, _CudaFloor, floor)
_SIMPLE_UNARY_FUNCTION_ME(_FloorMe, _Floor)
SIMPLE_UNARY_FUNCTION(Floor, _Floor, MATH_FLOOR)
SIMPLE_UNARY_FUNCTION_VOID(Floor, _Floor, MATH_FLOOR)
_SIMPLE_UNARY_FUNCTION(_IsNonZero, _CudaIsNonZero, isnonzero)
_SIMPLE_UNARY_FUNCTION_ME(_IsNonZeroMe, _IsNonZero)
SIMPLE_UNARY_FUNCTION(IsNonZero, _IsNonZero, MATH_ISNONZERO)
SIMPLE_UNARY_FUNCTION_VOID(IsNonZero, _IsNonZero, MATH_ISNONZERO)
_SIMPLE_UNARY_FUNCTION(_IsZero, _CudaIsZero, iszero)
_SIMPLE_UNARY_FUNCTION_ME(_IsZeroMe, _IsZero)
SIMPLE_UNARY_FUNCTION(IsZero, _IsZero, MATH_ISZERO)
SIMPLE_UNARY_FUNCTION_VOID(IsZero, _IsZero, MATH_ISZERO)
_SIMPLE_UNARY_FUNCTION(_Log, _CudaLog, log)
_SIMPLE_UNARY_FUNCTION_ME(_LogMe, _Log)
SIMPLE_UNARY_FUNCTION(Log, _Log, MATH_LOG)
SIMPLE_UNARY_FUNCTION_VOID(Log, _Log, MATH_LOG)
_SIMPLE_UNARY_FUNCTION(_Round, _CudaRound, round)
_SIMPLE_UNARY_FUNCTION_ME(_RoundMe, _Round)
SIMPLE_UNARY_FUNCTION(Round, _Round, MATH_ROUND)
SIMPLE_UNARY_FUNCTION_VOID(Round, _Round, MATH_ROUND)
_SIMPLE_UNARY_FUNCTION(_Sqrt, _CudaSqrt, sqrt)
_SIMPLE_UNARY_FUNCTION_ME(_SqrtMe, _Sqrt)
SIMPLE_UNARY_FUNCTION(Sqrt, _Sqrt, MATH_SQRT)
SIMPLE_UNARY_FUNCTION_VOID(Sqrt, _Sqrt, MATH_SQRT)
_SIMPLE_UNARY_FUNCTION(_Square, _CudaSquare, square)
_SIMPLE_UNARY_FUNCTION_ME(_SquareMe, _Square)
SIMPLE_UNARY_FUNCTION(Square, _Square, MATH_SQUARE)
SIMPLE_UNARY_FUNCTION_VOID(Square, _Square, MATH_SQUARE)
_SIMPLE_UNARY_FUNCTION(_Sin, _CudaSin, sin)
_SIMPLE_UNARY_FUNCTION_ME(_SinMe, _Sin)
SIMPLE_UNARY_FUNCTION(Sin, _Sin, MATH_SIN)
SIMPLE_UNARY_FUNCTION_VOID(Sin, _Sin, MATH_SIN)
_SIMPLE_UNARY_FUNCTION(_Cos, _CudaCos, cos)
_SIMPLE_UNARY_FUNCTION_ME(_CosMe, _Cos)
SIMPLE_UNARY_FUNCTION(Cos, _Cos, MATH_COS)
SIMPLE_UNARY_FUNCTION_VOID(Cos, _Cos, MATH_COS)
_SIMPLE_UNARY_FUNCTION(_Tan, _CudaTan, tan)
_SIMPLE_UNARY_FUNCTION_ME(_TanMe, _Tan)
SIMPLE_UNARY_FUNCTION(Tan, _Tan, MATH_TAN)
SIMPLE_UNARY_FUNCTION_VOID(Tan, _Tan, MATH_TAN)
#else
/* define three macros separately, specifying the respective function names (CPU mode) */
......@@ -164,59 +188,82 @@ XTensor funcName(const XTensor &a) \
XLink::MakeLink(&a, NULL, &b, operationId); \
return b; \
}
#define SIMPLE_UNARY_FUNCTION_VOID(funcName, _funcName, operationId) \
void funcName(const XTensor &a, XTensor &b, bool requireLink) \
{ \
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) { \
InitTensor(&b, &a); \
} \
_funcName(&a, &b); \
if (requireLink) { \
XLink::MakeLink(&a, NULL, &b, operationId); \
} \
}
_SIMPLE_UNARY_FUNCTION(_Absolute, fabs)
_SIMPLE_UNARY_FUNCTION_ME(_AbsoluteMe, _Absolute)
SIMPLE_UNARY_FUNCTION(Absolute, _Absolute, MATH_ABSOLUTE)
SIMPLE_UNARY_FUNCTION_VOID(Absolute, _Absolute, MATH_ABSOLUTE)
_SIMPLE_UNARY_FUNCTION(_Ceil, ceil)
_SIMPLE_UNARY_FUNCTION_ME(_CeilMe, _Ceil)
SIMPLE_UNARY_FUNCTION(Ceil, _Ceil, MATH_CEIL)
SIMPLE_UNARY_FUNCTION_VOID(Ceil, _Ceil, MATH_CEIL)
_SIMPLE_UNARY_FUNCTION(_Exp, exp)
_SIMPLE_UNARY_FUNCTION_ME(_ExpMe, _Exp)
SIMPLE_UNARY_FUNCTION(Exp, _Exp, MATH_EXP)
SIMPLE_UNARY_FUNCTION_VOID(Exp, _Exp, MATH_EXP)
_SIMPLE_UNARY_FUNCTION(_Floor, floor)
_SIMPLE_UNARY_FUNCTION_ME(_FloorMe, _Floor)
SIMPLE_UNARY_FUNCTION(Floor, _Floor, MATH_FLOOR)
SIMPLE_UNARY_FUNCTION_VOID(Floor, _Floor, MATH_FLOOR)
_SIMPLE_UNARY_FUNCTION(_IsNonZero, isnonzero)
_SIMPLE_UNARY_FUNCTION_ME(_IsNonZeroMe, _IsNonZero)
SIMPLE_UNARY_FUNCTION(IsNonZero, _IsNonZero, MATH_ISNONZERO)
SIMPLE_UNARY_FUNCTION_VOID(IsNonZero, _IsNonZero, MATH_ISNONZERO)
_SIMPLE_UNARY_FUNCTION(_IsZero, iszero)
_SIMPLE_UNARY_FUNCTION_ME(_IsZeroMe, _IsZero)
SIMPLE_UNARY_FUNCTION(IsZero, _IsZero, MATH_ISZERO)
SIMPLE_UNARY_FUNCTION_VOID(IsZero, _IsZero, MATH_ISZERO)
_SIMPLE_UNARY_FUNCTION(_Log, log)
_SIMPLE_UNARY_FUNCTION_ME(_LogMe, _Log)
SIMPLE_UNARY_FUNCTION(Log, _Log, MATH_LOG)
SIMPLE_UNARY_FUNCTION_VOID(Log, _Log, MATH_LOG)
_SIMPLE_UNARY_FUNCTION(_Round, round)
_SIMPLE_UNARY_FUNCTION_ME(_RoundMe, _Round)
SIMPLE_UNARY_FUNCTION(Round, _Round, MATH_ROUND)
SIMPLE_UNARY_FUNCTION_VOID(Round, _Round, MATH_ROUND)
_SIMPLE_UNARY_FUNCTION(_Sqrt, sqrt)
_SIMPLE_UNARY_FUNCTION_ME(_SqrtMe, _Sqrt)
SIMPLE_UNARY_FUNCTION(Sqrt, _Sqrt, MATH_SQRT)
SIMPLE_UNARY_FUNCTION_VOID(Sqrt, _Sqrt, MATH_SQRT)
_SIMPLE_UNARY_FUNCTION(_Square, square)
_SIMPLE_UNARY_FUNCTION_ME(_SquareMe, _Square)
SIMPLE_UNARY_FUNCTION(Square, _Square, MATH_SQUARE)
SIMPLE_UNARY_FUNCTION_VOID(Square, _Square, MATH_SQUARE)
_SIMPLE_UNARY_FUNCTION(_Sin, sin)
_SIMPLE_UNARY_FUNCTION_ME(_SinMe, _Sin)
SIMPLE_UNARY_FUNCTION(Sin, _Sin, MATH_SIN)
SIMPLE_UNARY_FUNCTION_VOID(Sin, _Sin, MATH_SIN)
_SIMPLE_UNARY_FUNCTION(_Cos, cos)
_SIMPLE_UNARY_FUNCTION_ME(_CosMe, _Cos)
SIMPLE_UNARY_FUNCTION(Cos, _Cos, MATH_COS)
SIMPLE_UNARY_FUNCTION_VOID(Cos, _Cos, MATH_COS)
_SIMPLE_UNARY_FUNCTION(_Tan, tan)
_SIMPLE_UNARY_FUNCTION_ME(_TanMe, _Tan)
SIMPLE_UNARY_FUNCTION(Tan, _Tan, MATH_TAN)
SIMPLE_UNARY_FUNCTION_VOID(Tan, _Tan, MATH_TAN)
/*_SIMPLE_UNARY_FUNCTION(_Round, round)
_SIMPLE_UNARY_FUNCTION_ME(_RoundMe, _Round)
......
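The same pattern applies to every unary operator listed above; a brief sketch (an illustration only):
void UnaryVoidExample(const XTensor &a, XTensor &b)
{
    Exp(a, b);          /* b_i = exp(a_i), no link */
    Log(a, b, true);    /* b_i = log(a_i), MATH_LOG is linked into the network */
    _SquareMe(&b);      /* in place: b_i = b_i * b_i */
}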
......@@ -34,6 +34,8 @@ void _AbsoluteMe(XTensor * a);
/* set every entry to its absolute value (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor Absolute(const XTensor & a);
/* set every entry to its absolute value */
void Absolute(const XTensor & a, XTensor & b, bool requireLink = false);
/* set every entry to its ceil value */
void _Ceil(const XTensor * a, XTensor * b);
......@@ -43,6 +45,8 @@ void _CeilMe(XTensor * a);
/* set every entry to its ceil value (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor Ceil(const XTensor & a);
/* set every entry to its ceil value */
void Ceil(const XTensor & a, XTensor & b, bool requireLink = false);
/* set every entry to its exponent value */
void _Exp(const XTensor * a, XTensor * b);
......@@ -52,6 +56,8 @@ void _ExpMe(XTensor * a);
/* set every entry to its exponent value (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor Exp(const XTensor & a);
/* set every entry to its exponent value */
void Exp(const XTensor & a, XTensor & b, bool requireLink = false);
/* set every entry to its floor value */
void _Floor(const XTensor * a, XTensor * b);
......@@ -61,6 +67,8 @@ void _FloorMe(XTensor * a);
/* set every entry to its floor value (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor Floor(const XTensor & a);
/* set every entry to its floor value */
void Floor(const XTensor & a, XTensor & b, bool requireLink = false);
/* if source entry is non-zero, set target entry to be one, otherwise zero */
void _IsNonZero(const XTensor *a, XTensor *b);
......@@ -70,6 +78,8 @@ void _IsNonZeroMe(XTensor *a);
/* if source entry is non-zero, set target entry to be one, otherwise zero (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor IsNonZero(const XTensor &a);
/* if source entry is non-zero, set target entry to be one, otherwise zero */
void IsNonZero(const XTensor &a, XTensor & b, bool requireLink = false);
/* if source entry is zero, set target entry to be one, otherwise zero */
void _IsZero(const XTensor *a, XTensor *b);
......@@ -79,6 +89,8 @@ void _IsZeroMe(XTensor *a);
/* if source entry is zero, set target entry to be one, otherwise zero (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor IsZero(const XTensor &a);
/* if source entry is zero, set target entry to be one, otherwise zero */
void IsZero(const XTensor &a, XTensor & b, bool requireLink = false);
/* set every entry to its logarithm value */
void _Log(const XTensor * a, XTensor * b);
......@@ -88,6 +100,8 @@ void _LogMe(XTensor * a);
/* set every entry to its logarithm value (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor Log(const XTensor & a);
/* set every entry to its logarithm value */
void Log(const XTensor & a, XTensor & b, bool requireLink = false);
/* set every entry to its round value */
void _Round(const XTensor * a, XTensor * b);
......@@ -97,6 +111,8 @@ void _RoundMe(XTensor * a);
/* set every entry to its round value (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor Round(const XTensor & a);
/* set every entry to its round value */
void Round(const XTensor & a, XTensor & b, bool requireLink = false);
/* set every entry to its sqrt value */
void _Sqrt(const XTensor * a, XTensor * b);
......@@ -106,6 +122,8 @@ void _SqrtMe(XTensor * a);
/* set every entry to its sqrt value (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor Sqrt(const XTensor & a);
/* set every entry to its sqrt value */
void Sqrt(const XTensor & a, XTensor & b, bool requireLink = false);
/* set every entry to its square value */
void _Square(const XTensor * a, XTensor * b);
......@@ -115,6 +133,8 @@ void _SquareMe(XTensor * a);
/* set every entry to its square value (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor Square(const XTensor & a);
/* set every entry to its square value */
void Square(const XTensor & a, XTensor & b, bool requireLink = false);
/* set every entry to its sine value */
......@@ -125,6 +145,8 @@ void _SinMe(XTensor * a);
/* set every entry to its sine value (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor Sin(const XTensor & a);
/* set every entry to its sine value */
void Sin(const XTensor & a, XTensor & b, bool requireLink = false);
/* set every entry to its cosine value */
void _Cos(const XTensor * a, XTensor * b);
......@@ -134,6 +156,8 @@ void _CosMe(XTensor * a);
/* set every entry to its cosine value (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor Cos(const XTensor & a);
/* set every entry to its cosine value */
void Cos(const XTensor & a, XTensor & b, bool requireLink = false);
/* set every entry to its tangent value */
void _Tan(const XTensor * a, XTensor * b);
......@@ -143,6 +167,8 @@ void _TanMe(XTensor * a);
/* set every entry to its tangent value (return an XTensor structure)
make a new tensor to keep the result and return it */
XTensor Tan(const XTensor & a);
/* set every entry to its tangent value */
void Tan(const XTensor & a, XTensor & b, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -229,10 +229,10 @@ XTensor CopyIndexed(const XTensor & s, int dim,
/* call _CopyIndexed function */
_CopyIndexed(&s, &t, dim, &srcIndex, &tgtIndex, copyNum);
XList list(3);
list.Add(&s);
list.Add(&srcIndex);
list.Add(&tgtIndex);
TensorList list(3);
list.Add((XTensor*)&s);
list.Add((XTensor*)&srcIndex);
list.Add((XTensor*)&tgtIndex);
/* tensor connection */
XLink::MakeLink(&list, &t, MOVEMENT_COPYINDEXED);
......
......@@ -131,4 +131,43 @@ XTensor ReduceMax(const XTensor &input, int dim)
return output;
}
/*
get the max value of the items along a dimension of the tensor
>> input - the input tensor
>> output - the output tensor
>> dim - the dimension along which the reduction is performed
>> requireLink - whether to add the operation to the network
*/
void ReduceMax(const XTensor &input, XTensor &output, int dim, bool requireLink)
{
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
if (!output.isInit || !XTensor::IsReduceShaped(&input, &output, dim)) {
int order = input.order - 1;
int * dimSize = new int[order];
for (int i = 0; i < order; i++) {
if (i < dim)
dimSize[i] = input.dimSize[i];
else if (i >= dim)
dimSize[i] = input.dimSize[i + 1];
}
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
InitTensor(&output, order, dimSize, input.dataType, dr, input.devID, input.mem);
/* destroy variables */
delete[] dimSize;
}
/* call _ReduceMax function */
_ReduceMax(&input, &output, dim);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCEMAX);
XLink::AddParamToHeadInt(&output, dim);
}
}
} // namespace nts(NiuTrans.Tensor)
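A usage sketch of the new ReduceMax overload (the shape is an assumption for illustration); output is (re)initialized to the reduced shape when it does not already match:
void ReduceMaxExample(const XTensor &input /* e.g. 16 x 32 */, XTensor &maxima)
{
    ReduceMax(input, maxima, 1);         /* maxima becomes a tensor of size 16 */
    ReduceMax(input, maxima, 1, true);   /* the same, with REDUCE_REDUCEMAX linked */
}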
......@@ -35,6 +35,9 @@ make a new tensor to keep the result and return it
*/
XTensor ReduceMax(const XTensor &input, int dim);
/* get the max value of the items along a dimension of the tensor. */
void ReduceMax(const XTensor &input, XTensor &output, int dim, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor)
#endif // __REDUCEMAX_H__
......@@ -86,4 +86,45 @@ XTensor ReduceMean(const XTensor &input, int dim)
return output;
}
/*
get the mean value along a dimension of the tensor
For a 1-dimensional data array a, mean = (1/n) * sum_i input_i
>> input - the input tensor
>> output - the output tensor
>> dim - the dimension along which the reduction is performed
>> requireLink - whether to add the operation to the network
*/
void ReduceMean(const XTensor &input, XTensor &output, int dim, bool requireLink)
{
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
if (!output.isInit || !XTensor::IsReduceShaped(&input, &output, dim)) {
int order = input.order - 1;
int * dimSize = new int[order];
for (int i = 0; i < order; i++) {
if (i < dim)
dimSize[i] = input.dimSize[i];
else if (i >= dim)
dimSize[i] = input.dimSize[i + 1];
}
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
InitTensor(&output, order, dimSize, input.dataType, dr, input.devID, input.mem);
/* destroy variables */
delete[] dimSize;
}
/* call _ReduceMean function */
_ReduceMean(&input, &output, dim);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCEMEAN);
XLink::AddParamToHeadInt(&output, dim);
}
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -39,6 +39,12 @@ For a 1-dimensional data array a, mean = (1/n) * sum_i input_i
*/
XTensor ReduceMean(const XTensor &input, int dim);
/*
get the mean value along a dimension of the tensor
For a 1-dimensional data array a, mean = (1/n) * sum_i input_i
*/
void ReduceMean(const XTensor &input, XTensor &output, int dim, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor)
#endif // __REDUCEMEAN_H__
......@@ -244,6 +244,39 @@ XTensor ReduceSum(const XTensor &input, int dim, const XTensor &shift, DTYPE pow
return output;
}
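/*
sum the items along a dimension of the tensor
For a 1-dimensional data array a,
sum = \sum_i (a_i - shift)^power if isExp == false
sum = \sum_i exp((a_i - shift)^power) if isExp == true
>> input - the input tensor
>> output - the output tensor
>> dim - the dimension along which the reduction is performed
>> shift - shift the input
>> power - we perform pow(item_i, power) on each item in the array
>> isExp - specify whether exp() is performed
>> requireLink - whether to add the operation to the network
*/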
void ReduceSum(const XTensor &input, XTensor &output, int dim, const XTensor &shift, DTYPE power, bool isExp, bool requireLink)
{
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
if (!output.isInit || !XTensor::IsReduceShaped(&input, &output, dim)) {
int order = input.order - 1;
int * dimSize = new int[order];
for (int i = 0; i < order; i++) {
if (i < dim)
dimSize[i] = input.dimSize[i];
else if (i >= dim)
dimSize[i] = input.dimSize[i + 1];
}
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
InitTensor(&output, order, dimSize, input.dataType, dr, input.devID, input.mem);
/* destroy variables */
delete[] dimSize;
}
/* call _ReduceSum function */
_ReduceSum(&input, &output, dim, &shift, power, isExp);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&input, &shift, &output, REDUCE_REDUCESUM);
XLink::AddParamToHeadInt(&output, dim);
XLink::AddParamToHead(&output, power);
XLink::AddParamToHeadBool(&output, isExp);
}
}
/*
sum the items along a dimension of the tensor (return an XTensor structure)
make a new tensor to keep the result and return it
......@@ -290,4 +323,52 @@ XTensor ReduceSum(const XTensor &input, int dim, DTYPE power, bool isExp)
return output;
}
/*
sum the items along a dimension of the tensor
For a 1-dimensional data array a,
sum = \sum_i (a_i)^power if isExp == false
sum = \sum_i exp((a_i)^power) if isExp == true
>> input - the input tensor
>> output - the output tensor
>> dim - the dimension along which the reduction is performed
>> power - we perform pow(item_i, power) on each item in the array
>> isExp - specify whether exp() is performed
>> requireLink - whether to add the operation to the network
*/
void ReduceSum(const XTensor &input, XTensor &output, int dim, DTYPE power, bool isExp, bool requireLink)
{
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
if (!output.isInit || !XTensor::IsReduceShaped(&input, &output, dim)) {
int order = input.order - 1;
int * dimSize = new int[order];
for (int i = 0; i < order; i++) {
if (i < dim)
dimSize[i] = input.dimSize[i];
else if (i >= dim)
dimSize[i] = input.dimSize[i + 1];
}
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
InitTensor(&output, order, dimSize, input.dataType, dr, input.devID, input.mem);
/* destroy variables */
delete[] dimSize;
}
/* call _ReduceSum function */
_ReduceSum(&input, &output, dim, NULL, power, isExp);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCESUM);
XLink::AddParamToHeadInt(&output, dim);
XLink::AddParamToHead(&output, power);
XLink::AddParamToHeadBool(&output, isExp);
}
}
} // namespace nts(NiuTrans.Tensor)
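A usage sketch of the new ReduceSum overloads (the shape is an assumption for illustration):
void ReduceSumExample(const XTensor &input /* e.g. 16 x 32 */, XTensor &sums)
{
    ReduceSum(input, sums, 1);                            /* plain sum along dimension 1 */
    ReduceSum(input, sums, 1, (DTYPE)2.0, false, true);   /* sum of squares, with the link recorded */
}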
......@@ -44,6 +44,8 @@ sum = \sum_i exp(a_i - shift) if isExp == true
*/
XTensor ReduceSum(const XTensor &input, int dim, const XTensor &shift, DTYPE power = (DTYPE)1.0F, bool isExp = false);
void ReduceSum(const XTensor &input, XTensor &output, int dim, const XTensor &shift, DTYPE power = (DTYPE)1.0F, bool isExp = false, bool requireLink = false);
/*
sum the items along a dimension of the tensor (return an XTensor structure)
make a new tensor to keep the result and return it
......@@ -53,6 +55,14 @@ sum = \sum_i exp(a_i) if isExp == true
*/
XTensor ReduceSum(const XTensor &input, int dim, DTYPE power = (DTYPE)1.0F, bool isExp = false);
/*
sum the items along a dimension of the tensor
For a 1-dimensional data array a,
sum = \sum_i (a_i) if isExp == false
sum = \sum_i exp(a_i) if isExp == true
*/
void ReduceSum(const XTensor &input, XTensor &output, int dim, DTYPE power = (DTYPE)1.0F, bool isExp = false, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor)
#endif // __REDUCESUM_H__
......@@ -82,4 +82,46 @@ XTensor ReduceSumSquared(const XTensor &input, int dim, const XTensor &shift)
return output;
}
/*
squared sum of the items along a dimension of the tensor
For a 1-dimensional data array a, sum = \sum_i (a_i - shift)^2
>> input - the input tensor
>> output - the output tensor
>> dim - the dimension along which the reduction is performed
>> shift - bias on the input
>> requireLink - whether to add the operation to the network
*/
void ReduceSumSquared(const XTensor &input, XTensor &output, int dim, const XTensor &shift, bool requireLink)
{
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
if (!output.isInit || !XTensor::IsReduceShaped(&input, &output, dim)) {
int order = input.order - 1;
int * dimSize = new int[order];
for (int i = 0; i < order; i++) {
if (i < dim)
dimSize[i] = input.dimSize[i];
else if (i >= dim)
dimSize[i] = input.dimSize[i + 1];
}
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
InitTensor(&output, order, dimSize, input.dataType, dr, input.devID, input.mem);
/* destroy variables */
delete[] dimSize;
}
/* call _ReduceSumSquared function */
_ReduceSumSquared(&input, &output, dim, &shift);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&input, &shift, &output, REDUCE_REDUCESUMSQUARED);
XLink::AddParamToHeadInt(&output, dim);
}
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -40,6 +40,13 @@ For a 1-dimensional data array a, sum = \sum_i (a_i - shift)^2
*/
XTensor ReduceSumSquared(const XTensor &input, int dim, const XTensor &shift);
/*
squared sum of the items along a dimension of the tensor
For a 1-dimensional data array a,
sum = \sum_i (a_i - shift)^2
*/
void ReduceSumSquared(const XTensor &input, XTensor &output, int dim, const XTensor &shift, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor)
#endif // __REDUCESUMSQUARED_H__
......
......@@ -84,4 +84,47 @@ XTensor ReduceVariance(const XTensor &input, int dim, const XTensor &mean)
return output;
}
/*
variance of the items along a dimension of the tensor
For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2
>> input - the input tensor
>> output - the output tensor
>> dim - the dimension along which the reduction is performed
>> mean - the mean value
>> requireLink - whether to add the operation to the network
*/
void ReduceVariance(const XTensor &input, XTensor &output, int dim, const XTensor &mean, bool requireLink)
{
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
if (!output.isInit || !XTensor::IsReduceShaped(&input, &output, dim)) {
int order = input.order - 1;
int * dimSize = new int[order];
for (int i = 0; i < order; i++) {
if (i < dim)
dimSize[i] = input.dimSize[i];
else if (i >= dim)
dimSize[i] = input.dimSize[i + 1];
}
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
InitTensor(&output, order, dimSize, input.dataType, dr, input.devID, input.mem);
/* destroy variables */
delete[] dimSize;
}
/* call _ReduceVariance function */
_ReduceVariance(&input, &output, dim, &mean);
if (requireLink) {
/* tensor connection */
XLink::MakeLink(&input, &mean, &output, REDUCE_REDUCEVARIANCE);
XLink::AddParamToHeadInt(&output, dim);
}
}
} // namespace nts(NiuTrans.Tensor)
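A usage sketch chaining the new reduction overloads (the shape is an assumption for illustration):
void MomentsExample(const XTensor &input /* e.g. 16 x 32 */, XTensor &mean, XTensor &var)
{
    ReduceMean(input, mean, 1);            /* per-row mean, a tensor of size 16 */
    ReduceVariance(input, var, 1, mean);   /* per-row variance around that mean */
}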
......@@ -39,6 +39,12 @@ For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2
*/
XTensor ReduceVariance(const XTensor &input, int dim, const XTensor &mean);
/*
variance of the items along a dimension of the tensor
For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2
*/
void ReduceVariance(const XTensor &input, XTensor &output, int dim, const XTensor &mean, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor)
#endif // __REDUCEVARIANCE_H__
......@@ -37,7 +37,7 @@ or "Merge" by means of the tensor shapes
>> big - the resulting tensor
>> dim - which dimension we perform the concatenation
*/
void _Concatenate(const XList * smalls, XTensor * big, int dim)
void _Concatenate(const TensorList * smalls, XTensor * big, int dim)
{
bool uniform = true;
for (int i = 1; i < smalls->count; i++) {
......@@ -66,7 +66,7 @@ or "Merge" by means of the tensor shapes
>> dim - which dimension we perform the concatenation
<< return - the tensor of concatenating a list of tensors along a given dimension
*/
XTensor Concatenate(const XList &smalls, int dim)
XTensor Concatenate(const TensorList &smalls, int dim)
{
CheckNTErrors(smalls.count > 0, "Empty list!");
CheckNTErrors(dim >= 0, "Illegal dimension to concatenate!");
......@@ -147,9 +147,9 @@ concatenate two tensors along a given dimension
*/
void _Concatenate(const XTensor * smallA, const XTensor * smallB, XTensor * big, int dim)
{
XList smalls(2);
smalls.Add(smallA);
smalls.Add(smallB);
TensorList smalls(2);
smalls.Add((XTensor*)smallA);
smalls.Add((XTensor*)smallB);
_Concatenate(&smalls, big, dim);
}
......@@ -168,9 +168,9 @@ XTensor Concatenate(const XTensor &smallA, const XTensor &smallB, int dim)
{
CheckNTErrors(dim >= 0, "Illegal dimension to concatenate!");
XList smalls(2);
smalls.Add(&smallA);
smalls.Add(&smallB);
TensorList smalls(2);
smalls.Add((XTensor*)&smallA);
smalls.Add((XTensor*)&smallB);
bool uniform = true;
for (int i = 1; i < smalls.count; i++) {
......
......@@ -31,7 +31,7 @@ concatenate a list of tensors along a given dimension
Note that this is actually a wrapper that selects
"ConcatenateSolely" or "Merge" by means of the tensor shapes
*/
void _Concatenate(const XList * smalls, XTensor * big, int dim);
void _Concatenate(const TensorList * smalls, XTensor * big, int dim);
/*
concatenate a list of tensors along a given dimension (return an XTensor structure)
......@@ -39,7 +39,7 @@ make a new tensor to keep the result and return it
Note that this is actually a wrapper that selects
"ConcatenateSolely" or "Merge" by means of the tensor shapes
*/
XTensor Concatenate(const XList &smalls, int dim);
XTensor Concatenate(const TensorList &smalls, int dim);
/* concatenate two tensors along a given dimension */
void _Concatenate(const XTensor * smallA, const XTensor * smallB, XTensor * big, int dim);
......
......@@ -34,7 +34,7 @@ concatenate a list of tensors along a given dimension
>> big - the resulting tensor
>> dim - which dimension we perform the concatenation
*/
void _ConcatenateSolely(const XList * smalls, XTensor * big, int dim)
void _ConcatenateSolely(const TensorList * smalls, XTensor * big, int dim)
{
CheckNTErrors(big->order > dim && dim >= 0, "Illegal dimension to concatenate!");
......@@ -85,12 +85,12 @@ void _ConcatenateSolely(const XList * smalls, XTensor * big, int dim)
}
}
else {
XList * sourceArrays = new XList(smalls->count);
StrList* sourceArrays = new StrList(smalls->count);
int * blockSizes = new int[smalls->count];
for (int i = 0; i < smalls->count; i++) {
XTensor * tensor = (XTensor*)smalls->GetItem(i);
blockSizes[i] = stride * tensor->dimSizeRDI[dimRDI] * tensor->unitSize;
sourceArrays->Add(tensor->data);
sourceArrays->Add((char*)tensor->data);
}
_MergeBlockLists(sourceArrays, blockSizes, blockNum, big->data, big->mem);
......
......@@ -27,7 +27,7 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* concatenate a list of tensors along a given dimension */
void _ConcatenateSolely(const XList * smalls, XTensor * big, int dim);
void _ConcatenateSolely(const TensorList * smalls, XTensor * big, int dim);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -148,6 +148,39 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
}
}
bool CheckMergeSize(const XTensor * s, const XTensor * t, int whereToMerge, int leadingDim)
{
if (!(s && t))
return false;
if (!(s->dataType == t->dataType))
return false;
if (leadingDim < 0)
leadingDim = 0;
int order = s->order - 1;
int * dimSize = new int[order];
for (int i = 0; i < s->order; i++) {
if (i < leadingDim)
dimSize[i] = s->dimSize[i];
else if (i > leadingDim) {
if (i != whereToMerge)
dimSize[i - 1] = s->dimSize[i];
else
dimSize[i - 1] = s->dimSize[i] * s->dimSize[leadingDim];
}
}
for (int i = 0; i < order; i++) {
if (dimSize[i] != t->dimSize[i])
return false;
}
return true;
}
/*
transform a tensor by merging it along with a dimension (return an XTensor structure)
make a new tensor to keep the result and return it
......@@ -199,6 +232,43 @@ XTensor Merge(const XTensor &s, int whereToMerge, int leadingDim)
return t;
}
void Merge(const XTensor &s, XTensor &t, int whereToMerge, int leadingDim, bool requireLink)
{
if (!t.isInit || !CheckMergeSize(&s, &t, whereToMerge, leadingDim)) {
if (leadingDim < 0)
leadingDim = 0;
int order = s.order - 1;
int * dimSize = new int[order];
for (int i = 0; i < s.order; i++) {
if (i < leadingDim)
dimSize[i] = s.dimSize[i];
else if (i > leadingDim) {
if (i != whereToMerge)
dimSize[i - 1] = s.dimSize[i];
else
dimSize[i - 1] = s.dimSize[i] * s.dimSize[leadingDim];
}
}
float dr = (!s.isSparse) ? 1.0F : s.denseRatio;
InitTensor(&t, order, dimSize, s.dataType, dr, s.devID, s.mem);
/* destroy variables */
delete[] dimSize;
}
/* call _Merge function */
_Merge(&s, &t, whereToMerge, leadingDim);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&s, NULL, &t, SHAPE_MERGE);
XLink::AddParamToHeadInt(&t, whereToMerge);
XLink::AddParamToHeadInt(&t, leadingDim);
}
}
/*
merge small tensors into a big tensor
......@@ -206,7 +276,7 @@ merge small tensors into a big tensor
>> big - the merged tensor (for return)
>> whereToMerge - the merging operation is along with which dimension
*/
void _Merge(const XList * smalls, XTensor * big, int whereToMerge)
void _Merge(const TensorList * smalls, XTensor * big, int whereToMerge)
{
whereToMerge = (whereToMerge < 0 ? big->order - 1 : whereToMerge);
......@@ -217,15 +287,15 @@ void _Merge(const XList * smalls, XTensor * big, int whereToMerge)
bool uniform = true;
int mergeNum = smalls->count;
XTensor* smallsItem0 = (XTensor*)(smalls->GetItem(0));
XTensor* smallsItem0 = smalls->GetItem(0);
int itemSize = smallsItem0->unitNum * smallsItem0->unitSize;
for (int i = 0; i < smalls->count; i++) {
XTensor* smallsItem = (XTensor*)smalls->GetItem(i);
XTensor* smallsItem = smalls->GetItem(i);
CheckNTErrors((big->unitNum == smallsItem->unitNum * mergeNum), "Unmatched tensors!");
if (i > 0) {
XTensor * preItem = (XTensor*)smalls->GetItem(i - 1);
XTensor * preItem = smalls->GetItem(i - 1);
if (smallsItem->unitNum * smallsItem->unitSize != (char*)smallsItem->data - (char*)preItem->data)
uniform = false;
}
......@@ -237,7 +307,7 @@ void _Merge(const XList * smalls, XTensor * big, int whereToMerge)
int gridNum = 1;
int mergedNum = smalls->count;
XTensor * s0 = (XTensor*)smalls->GetItem(0);
XTensor * s0 = smalls->GetItem(0);
int whereToMergeRDI = s0->order - whereToMerge - 1;
for (int i = 0; i < s0->order; i++) {
if (i <= whereToMergeRDI)
......@@ -263,7 +333,7 @@ void _Merge(const XList * smalls, XTensor * big, int whereToMerge)
for (int g = 0; g < gridNum; g++) {
char * tData = (char*)big->data + g * blockSize * blockNum * big->unitSize;
for (int k = 0; k < mergedNum; k++) {
XTensor * s = (XTensor*)smalls->GetItem(k);
XTensor * s = smalls->GetItem(k);
char * sData = (char*)s->data + g * blockSize * blockNum * s->unitSize;
XMemCopy2D(tData + k * tStep, tPtich, big->devID,
sData + k * sStep, sPitch, s->devID,
......@@ -295,7 +365,7 @@ void _Merge(const XList * smalls, XTensor * big, int whereToMerge)
/* copy from source to tmp */
if (!uniform) {
for (int i = 0; i < mergeNum; i++) {
XTensor* smallsItem = (XTensor*)smalls->GetItem(i);
XTensor* smallsItem = smalls->GetItem(i);
XMemCopy((char*)(tensorTMP->data) + (itemSize * i), tensorTMP->devID, smallsItem->data, smallsItem->devID, itemSize);
}
}
......@@ -322,9 +392,9 @@ make a new tensor to keep the result and return it
>> whereToMerge - the merging operation is along with which dimension
<< return - the big tensor merged by small tensors
*/
XTensor Merge(const XList &smalls, int whereToMerge)
XTensor Merge(const TensorList &smalls, int whereToMerge)
{
XTensor * tensor = (XTensor*)smalls.GetItem(0);
XTensor * tensor = smalls.GetItem(0);
int order = tensor->order;
int * dimSize = new int[order];
for (int i = 0; i < tensor->order; i++) {
......@@ -375,9 +445,9 @@ XTensor Merge(const XTensor &smallA, const XTensor &smallB, int whereToMerge)
XTensor big(order, dimSize, smallA.dataType, dr, smallA.devID, smallA.mem);
big.SetTMPFlag();
XList smalls(2);
smalls.Add(&smallA);
smalls.Add(&smallB);
TensorList smalls(2);
smalls.Add((XTensor*)&smallA);
smalls.Add((XTensor*)&smallB);
/* call _Merge function */
_Merge(&smalls, &big, whereToMerge);
......
......@@ -33,15 +33,21 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim = -
e.g., (M, N/3, 3) -> (M, N) */
XTensor Merge(const XTensor &s, int whereToMerge, int leadingDim = -1);
void Merge(const XTensor &s, XTensor &t, int whereToMerge, int leadingDim = -1, bool requireLink = false);
/* merge small tensors into a big tensor */
void _Merge(const XList * smalls, XTensor * big, int whereToMerge);
void _Merge(const TensorList * smalls, XTensor * big, int whereToMerge);
/* merge small tensors into a big tensor (return an XTensor structure) */
XTensor Merge(const XList &smalls, int whereToMerge);
XTensor Merge(const TensorList &smalls, int whereToMerge);
void Merge(const TensorList &smalls, XTensor &t, int whereToMerge);
/* merge two tensors into a big tensor (return an XTensor structure) */
XTensor Merge(const XTensor &smallA, const XTensor &smallB, int whereToMerge);
void Merge(const XTensor &smallA, const XTensor &smallB, XTensor &t, int whereToMerge);
} // namespace nts(NiuTrans.Tensor)
#endif // __MERGE_H__
\ No newline at end of file
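A sketch of the new Split/Merge overloads round-tripping a tensor (the shapes are assumptions for illustration, taken from the shape checks added above): Split moves the split axis to a new leading dimension, and Merge folds a leading dimension back into another one:
void SplitMergeExample(const XTensor &s /* e.g. 4 x 6 */, XTensor &pieces, XTensor &restored)
{
    Split(s, pieces, 1, 3);          /* (4, 6) -> (3, 4, 2): three slices along dimension 1 */
    Merge(pieces, restored, 2, 0);   /* (3, 4, 2) -> (4, 6): the inverse of the split above */
}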
......@@ -34,7 +34,7 @@ merge data by blocks
>> target - target data array
>> myMem - memory pool
*/
void _MergeBlockLists(const XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem)
void _MergeBlockLists(const StrList* sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem)
{
if (myMem != NULL && myMem->devID >= 0) {
#ifdef USE_CUDA
......
......@@ -71,7 +71,7 @@ merge data by blocks (cuda version)
>> target - target data array
>> myMem - the memory pool
*/
void _CudaMergeBlockLists(const XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem)
void _CudaMergeBlockLists(const StrList* sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem)
{
CheckNTErrors((myMem != NULL), "No memory pool!");
CheckNTErrors((myMem->devID >= 0), "Wrong device to run!");
......
......@@ -33,7 +33,7 @@ __global__
void KernelCopyBlockLists(DTYPE ** sourceList, int * sourceBlockSizes, int sourceBlockNum, DTYPE ** targetList);
/* merge data by blocks (cuda version) */
void _CudaMergeBlockLists(const XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem);
void _CudaMergeBlockLists(const StrList* sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem);
#endif // USE_CUDA
......
......@@ -27,7 +27,7 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* merge data by blocks */
void _MergeBlockLists(const XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem);
void _MergeBlockLists(const StrList* sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -48,4 +48,19 @@ XTensor Reshape(XTensor &s, int order, int * dimSize)
return t;
}
void Reshape(XTensor &s, XTensor &t, int order, int * dimSize, bool requireLink)
{
if (!t.isInit || !XTensor::IsSameShaped(&t, &s)) {
InitTensor(&t, &s);
}
/* call Reshape function */
t.Reshape(order, dimSize);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&s, NULL, &t, SHAPE_RESHAPE);
}
}
} // namespace nts(NiuTrans.Tensor)
......@@ -29,5 +29,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* reshape the tensor */
XTensor Reshape(XTensor &s, int order, int * dimSize);
void Reshape(XTensor &s, XTensor &t, int order, int * dimSize, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor)
#endif // __RESHAPE_H__
......@@ -156,6 +156,33 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
}
}
/* check whether the target tensor matches the shape produced by splitting the source */
bool CheckSplitSize(const XTensor * s, const XTensor * t, int whereToSplit, int splitNum)
{
if (!(s && t))
return false;
if (!(s->dataType == t->dataType))
return false;
int order = s->order + 1;
int * dimSize = new int[order];
dimSize[0] = splitNum;
for (int i = 0; i < s->order; i++) {
if (i == whereToSplit)
dimSize[i + 1] = s->dimSize[i] / splitNum;
else
dimSize[i + 1] = s->dimSize[i];
}
for (int i = 0; i < order; i++) {
if (dimSize[i] != t->dimSize[i]) {
delete[] dimSize;
return false;
}
}
delete[] dimSize;
return true;
}
/*
transform a tensor by splitting it, e.g., (N, M) -> (N/3, M, 3) (return an XTensor structure)
make a new tensor to keep the result and return it
......@@ -200,6 +227,38 @@ XTensor Split(const XTensor &s, int whereToSplit, int splitNum)
return t;
}
void Split(const XTensor &s, XTensor &t, int whereToSplit, int splitNum, bool requireLink)
{
if (!t.isInit || !CheckSplitSize(&s, &t, whereToSplit, splitNum)) {
int order = s.order + 1;
int * dimSize = new int[order];
dimSize[0] = splitNum;
for (int i = 0; i < s.order; i++) {
if (i == whereToSplit)
dimSize[i + 1] = s.dimSize[i] / splitNum;
else
dimSize[i + 1] = s.dimSize[i];
}
float dr = (!s.isSparse) ? 1.0F : s.denseRatio;
InitTensor(&t, order, dimSize, s.dataType, dr, s.devID, s.mem);
/* destroy variables */
delete[] dimSize;
}
/* call _Split function */
_Split(&s, &t, whereToSplit, splitNum);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&s, NULL, &t, SHAPE_SPLIT);
XLink::AddParamToHeadInt(&t, whereToSplit);
XLink::AddParamToHeadInt(&t, splitNum);
}
}
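As a quick reference, a hedged usage sketch of the new in-place Split wrapper (illustrative only, not part of the patch; it assumes a CPU tensor and the usual InitTensor2D / SetDataRand helpers):

/* illustrative sketch: (6, 8) -> (2, 3, 8); t is (re)initialized only if CheckSplitSize fails */
XTensor s;
XTensor t;
InitTensor2D(&s, 6, 8, X_FLOAT);
s.SetDataRand(-1.0F, 1.0F);
Split(s, t, 0, 2);
Split(s, t, 0, 2, true);   /* the same call, but it also records the SHAPE_SPLIT link */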
/*
split a big tensor into small tensors
......@@ -209,7 +268,7 @@ split a big tensor into small tensors
>> whereToSplit - which dimension of the tensor is to split
>> splitNum - how many splits
*/
void _Split(const XTensor * big, XList * smalls, int whereToSplit, int splitNum)
void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int splitNum)
{
CheckNTErrors((smalls != NULL), "Invalid list!");
CheckNTErrors((smalls->count == splitNum), "Unmatched tensors!");
......@@ -340,7 +399,7 @@ split a big tensor into small tensors
>> whereToSplit - which dimension of the tensor is to split
>> splitNum - how many splits
*/
void Split(const XTensor &big, XList &smalls, int whereToSplit, int splitNum)
void Split(const XTensor &big, TensorList &smalls, int whereToSplit, int splitNum)
{
CheckNTErrors(big.GetDim(whereToSplit) % splitNum == 0, "Wrong splitNum!");
......
......@@ -41,14 +41,16 @@ e.g., (M, N) -> (M, N/3, 3)
*/
XTensor Split(const XTensor &s, int whereToSplit, int splitNum);
void Split(const XTensor &s, XTensor &t, int whereToSplit, int splitNum, bool requireLink = false);
/* split a big tensor into small tensors */
void _Split(const XTensor * big, XList * smalls, int whereToSplit, int splitNum);
void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int splitNum);
/*
split a big tensor into small tensors (return a XList structure)
split a big tensor into small tensors (return a TensorList structure)
make a new list to keep the result and return it
*/
void Split(const XTensor &big, XList &smalls, int whereToSplit, int splitNum);
void Split(const XTensor &big, TensorList &smalls, int whereToSplit, int splitNum);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -112,4 +112,19 @@ XTensor Squeeze(XTensor & source, int leadingDim)
return target;
}
void Squeeze(XTensor & source, XTensor & target, int leadingDim, bool requireLink)
{
if (!target.isInit || !XTensor::IsSameShaped(&source, &target)) {
InitTensor(&target, &source);
}
/* call _Squeeze function */
_Squeeze(&source, &target, leadingDim);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&source, NULL, &target, SHAPE_SQUEEZE);
}
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -37,6 +37,8 @@ void _SqueezeMe(XTensor * source, int leadingDim = -1);
make a new tensor to keep the result and return it */
XTensor Squeeze(XTensor & source, int leadingDim = -1);
void Squeeze(XTensor & source, XTensor & target, int leadingDim = -1, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor)
#endif // __SQUEEZE_H__
\ No newline at end of file
......@@ -78,7 +78,7 @@ void _Unsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
#endif
}
else {
XList * sourceArrays = new XList(blockNumB);
StrList * sourceArrays = new StrList(blockNumB);
int * blockSizes = new int[blockNumB];
for (int i = 0; i < blockNumA; i++) {
......@@ -96,6 +96,34 @@ void _Unsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
}
}
/* check whether the target tensor matches the shape produced by unsqueezing the source */
bool CheckUnsqueezeSize(const XTensor * a, const XTensor * b, int dim, int dSize)
{
if (!(a && b))
return false;
if (!(a->dataType == b->dataType))
return false;
int order = a->order + 1;
int * dimSize = new int[order];
for (int i = 0; i < order; i++) {
if (i < dim)
dimSize[i] = a->dimSize[i];
else if (i == dim)
dimSize[i] = dSize;
else
dimSize[i] = a->dimSize[i - 1];
}
for (int i = 0; i < order; i++) {
if (dimSize[i] != b->dimSize[i]) {
delete[] dimSize;
return false;
}
}
delete[] dimSize;
return true;
}
/*
insert a dimension by copying the blocks for x times
(where x is the size of the inserted dimension) (return an XTensor structure)
......@@ -138,4 +166,37 @@ XTensor Unsqueeze(const XTensor &a, int dim, int dSize)
return b;
}
void Unsqueeze(const XTensor &a, XTensor &b, int dim, int dSize, bool requireLink)
{
if (!b.isInit || !CheckUnsqueezeSize(&a, &b, dim, dSize)) {
int order = a.order + 1;
int * dimSize = new int[order];
for (int i = 0; i < order; i++) {
if (i < dim)
dimSize[i] = a.dimSize[i];
else if (i == dim)
dimSize[i] = dSize;
else
dimSize[i] = a.dimSize[i - 1];
}
float dr = (!a.isSparse) ? 1.0F : a.denseRatio;
InitTensor(&b, order, dimSize, a.dataType, dr, a.devID, a.mem);
/* destroy variables */
delete[] dimSize;
}
/* call _Unsqueeze function */
_Unsqueeze(&a, &b, dim, dSize);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, SHAPE_UNSQUEEZE);
XLink::AddParamToHeadInt(&b, dim);
XLink::AddParamToHeadInt(&b, dSize);
}
}
} // namespace nts(NiuTrans.Tensor)
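Similarly, a minimal sketch of the in-place Unsqueeze wrapper (illustrative only, under the same assumptions as the Split sketch above):

/* illustrative sketch: (3, 5) -> (3, 4, 5); b is reshaped only when CheckUnsqueezeSize fails */
XTensor a;
XTensor b;
InitTensor2D(&a, 3, 5, X_FLOAT);
a.SetDataRand(-1.0F, 1.0F);
Unsqueeze(a, b, 1, 4);
Unsqueeze(a, b, 1, 4, true);   /* also records the SHAPE_UNSQUEEZE link and its parameters */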
......@@ -35,6 +35,8 @@ void _Unsqueeze(const XTensor * a, XTensor * b, int dim, int dSize);
make a new tensor to keep the result and return it */
XTensor Unsqueeze(const XTensor &a, int dim, int dSize);
void Unsqueeze(const XTensor &a, XTensor &b, int dim, int dSize, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor)
#endif // __UNSQUEEZE_H__
......@@ -114,12 +114,12 @@ void Sort(XTensor & a, XTensor & b, XTensor & index, int dim)
_Sort(&a, &b, &index, dim);
/* tensor connections */
XList list(2);
list.Add(&b);
list.Add(&index);
XLink::MakeLink(&a, &list, SORT_SORT);
XLink::AddParamToHeadInt(&b, dim);
XLink::AddParamToHeadInt(&index, dim);
//TensorList list(2);
//list.Add(&b);
//list.Add(&index);
// XLink::MakeLink(&a, &list, SORT_SORT);
// XLink::AddParamToHeadInt(&b, dim);
// XLink::AddParamToHeadInt(&index, dim);
}
} // namespace nts(NiuTrans.Tensor)
......@@ -128,14 +128,14 @@ void TopK(XTensor &a, XTensor &b, XTensor &index, int dim, int k)
_TopK(&a, &b, &index, dim, k);
/* tensor connection */
XList list(2);
list.Add(&b);
list.Add(&index);
XLink::MakeLink(&a, &list, SORT_TOPK);
XLink::AddParamToHeadInt(&b, dim);
XLink::AddParamToHeadInt(&index, k);
XLink::AddParamToHeadInt(&b, dim);
XLink::AddParamToHeadInt(&index, k);
//TensorList list(2);
//list.Add(&b);
//list.Add(&index);
//XLink::MakeLink(&a, &list, SORT_TOPK);
//XLink::AddParamToHeadInt(&b, dim);
//XLink::AddParamToHeadInt(&index, k);
//XLink::AddParamToHeadInt(&b, dim);
//XLink::AddParamToHeadInt(&index, k);
}
......
......@@ -31,7 +31,7 @@ flush a list of XTensor to GPU memory
>> devID - target GPU id
>> GPUMem - memory pool for the GPU
*/
void CPUToGPUFlush(XList * mList, int devID, XMem * GPUMem)
void CPUToGPUFlush(TensorList * mList, int devID, XMem * GPUMem)
{
#ifdef USE_CUDA
CudaCPUToGPUFlush(mList, devID, GPUMem);
......
......@@ -32,7 +32,7 @@ flush a list of XTensor to GPU memory
>> devID - target GPU id
>> GPUMem - memory pool for the GPU
*/
void CudaCPUToGPUFlush(XList * mList, int devID, XMem * GPUMem)
void CudaCPUToGPUFlush(TensorList * mList, int devID, XMem * GPUMem)
{
if (mList == NULL || mList->count == 0)
return;
......
......@@ -29,7 +29,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* flush a list of XTensor to GPU memory */
void CudaCPUToGPUFlush(XList * mList, int devID, XMem * GPUMem);
void CudaCPUToGPUFlush(TensorList * mList, int devID, XMem * GPUMem);
/* copy the data from GPU memory to CPU memory */
void CudaGPUToCPUFlush(XTensor * tensor);
......
......@@ -27,7 +27,7 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* flush a list of XTensor to GPU memory */
void CPUToGPUFlush(XList * mList, int devID, XMem * GPUMem);
void CPUToGPUFlush(TensorList * mList, int devID, XMem * GPUMem);
/* copy the data from GPU memory to CPU memory */
void GPUToCPUFlush(XTensor * tensor);
......
......@@ -51,19 +51,19 @@ void RunParallel2D(XPRunner * parallelRunner, void * job,
CheckNTErrors(jobNum != 0, "TODO!");
/* argument list of the jobs */
XList * jobArgList = new XList(4);
TensorList * jobArgList = new TensorList(argNum);
va_list ap;
va_start(ap, argNum);
for (int i = 0; i < argNum; i++) {
void * p = va_arg(ap, void*);
XTensor* p = va_arg(ap, XTensor*);
jobArgList->Add(p);
}
va_end(ap);
/* prepare the neccesary argument list for parallel processing */
XList * jobs = new XList(jobNum);
XList * args = new XList(jobNum);
TensorList * jobs = new TensorList(jobNum);
TensorList * args = new TensorList(jobNum);
int * indexList = new int[jobNum * 4 * 4];
......@@ -77,27 +77,30 @@ void RunParallel2D(XPRunner * parallelRunner, void * job,
2. other arguments
*/
for (int i = 0; i < jobNum; i++) {
XList * blockArgs = new XList(argNum + 4);
IntList* indexArgs = new IntList(4);
TensorList * blockArgs = new TensorList(argNum);
int * blockIndex = indexList + i * 4;
blockArgs->Add(blockIndex);
blockArgs->Add(blockIndex + 1);
blockArgs->Add(blockIndex + 2);
blockArgs->Add(blockIndex + 3);
indexArgs->Add(blockIndex[0]);
indexArgs->Add(blockIndex[1]);
indexArgs->Add(blockIndex[2]);
indexArgs->Add(blockIndex[3]);
for (int j = 0; j < argNum; j++)
blockArgs->Add(jobArgList->GetItem(j));
args->Add(blockArgs);
jobs->Add((void*)job);
args->Add((XTensor*)indexArgs);
args->Add((XTensor*)blockArgs);
jobs->Add((XTensor*)job);
}
args->count = nblock;
args->count = jobNum * 2;
jobs->count = nblock;
/* single job */
if (jobNum == 1)
((TFunction)job)((XList*)args->GetItem(0));
((TFunction)job)(args);
/* multiple jobs */
else
parallelRunner->Run(jobs, args);
......@@ -105,7 +108,7 @@ void RunParallel2D(XPRunner * parallelRunner, void * job,
/* free the memory */
delete[] indexList;
for (int i = 0; i < args->count; i++) {
XList * blockArgs = (XList*)args->GetItem(i);
TensorList * blockArgs = (TensorList*)args->GetItem(i);
delete blockArgs;
}
delete args;
......
......@@ -21,12 +21,14 @@
#include "../XName.h"
#include <time.h>
#include <math.h>
#include "Dropout.h"
#include "Dropout.cuh"
#include "../core/arithmetic/Multiply.h"
#include "../core/arithmetic/MultiplyDim.h"
#include "../core/math/ScaleAndShift.h"
#include "../core/getandset/SetData.h"
#include "DropoutWithIndex.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......@@ -147,6 +149,7 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim, int leadingDim
CheckNTErrors(dropProb >= 0.0 && dropProb <= 1.0, "The probability must be 0-1!");
XTensor mask;
int * maskArrayInt = NULL;
DTYPE * maskArray = NULL;
DTYPE scaleFactor = (DTYPE)1.0 / ((DTYPE)1.0 - dropProb);
......@@ -157,6 +160,23 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim, int leadingDim
_SetDataRandP(&mask, 0, 1.0F, dropProb, scaleFactor);
return Multiply(x, mask);
/* dropout with index */
/*int unitNum = floor(x.unitNum*dropProb);
maskArrayInt = new int[unitNum];
for (int i = 0; i < unitNum; i++)
maskArrayInt[i] = rand() % x.unitNum;
XTensor maskindex;
InitTensor1D(&maskindex, unitNum, X_INT, x.devID, x.mem);
maskindex.SetData(maskArrayInt, unitNum);
delete[] maskArrayInt;
return DropoutWithIndex(x, maskindex, scaleFactor);*/
}
else if(leadingDim2 < 0){
int n = leadingDim;
......@@ -209,7 +229,6 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim, int leadingDim
return MultiplyBroadcast(x, mask);
}
}
/*
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Jiang Yufan (email: jiangyufan2018@outlook.com) 2019-03-20
*/
#include "DropoutWithIndex.h"
#include "DropoutWithIndex.cuh"
#include "../core/CHeader.h"
#include "../XName.h"
#include "Identity.h"
namespace nts {
/*
This is a special implementation of "dropout" to reduce memory with maskIndex.
>> x - input tensor
>> maskIndex - mask index tensor
>> c - output tensor
*/
void _DropoutWithIndex(const XTensor * x, XTensor * maskIndex, XTensor * c)
{
CheckNTErrors(maskIndex->order == 1, "Illegal tensor order!");
#ifdef USE_CUDA
if (maskIndex->devID >= 0 || x->devID >= 0 || c->devID >= 0) {
_CudaDropoutWithIndex(x, maskIndex, c);
return;
}
#endif
// TODO!!
ShowNTErrors("TODO!");
}
/*
This is a special implementation of "dropout" to reduce memory with maskIndex.
>> x - input tensor
>> maskIndex - mask index tensor
>> scale - scale factor
<< return - the output tensor
*/
XTensor DropoutWithIndex(const XTensor &x, XTensor &maskIndex, DTYPE scale)
{
XTensor c;
int order = x.order;
int * dimSize = new int[order];
for (int i = 0; i < order; i++) {
dimSize[i] = x.dimSize[i];
}
InitTensor1D(&c, x.unitNum, x.dataType, x.devID, x.mem);
_SetDataFixedFloat(&c, 1.0F);
_DropoutWithIndex(&x, &maskIndex, &c);
c.Reshape(order, dimSize);
delete[] dimSize;
_MultiplyMe(&c, &x);
_ScaleAndShiftMe(&c, scale);
/* tensor connections */
XLink::MakeLink(&x, &maskIndex, &c, MOVEMENT_DROPOUTWITHINDEX);
XLink::AddParamToHead(&c, scale);
return c;
}
}// namespace nts(NiuTrans.Tensor)
\ No newline at end of file
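A rough caller-side sketch of DropoutWithIndex (not part of the patch; it mirrors the commented-out index-based path in Dropout above and assumes a float input tensor x):

/* illustrative sketch: zero out about dropProb of the units of x by index */
DTYPE dropProb = 0.1F;
int dropNum = (int)(x.unitNum * dropProb);
int * indexArray = new int[dropNum];
for (int i = 0; i < dropNum; i++)
indexArray[i] = rand() % x.unitNum;
XTensor maskIndex;
InitTensor1D(&maskIndex, dropNum, X_INT, x.devID, x.mem);
maskIndex.SetData(indexArray, dropNum);
delete[] indexArray;
XTensor y;
y = DropoutWithIndex(x, maskIndex, (DTYPE)1.0 / ((DTYPE)1.0 - dropProb));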
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Jiang Yufan (email: jiangyufan2018@outlook.com) 2019-03-20
*/
#include "DropoutWithIndex.cuh"
#include "../XDevice.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
This is a special implementation of "dropout" to reduce memory with maskIndex.
>> tData - the data pointer of the target tensor
>> sIndex - mask index
>> size - the size of sIndex
*/
__global__
void KernelDropoutWithIndex1D(DTYPE * tData, int * sIndex, int size)
{
/* block id */
int i = blockDim.x * blockIdx.x + threadIdx.x;
DTYPE * t = tData;
if (i < size) {
int id = sIndex[i];
t[id] = DTYPE(0.0F);
}
}
/*
This is a special implementation of "dropout" to reduce memory with maskIndex.
>> x - input tensor
>> maskIndex - mask index tensor
>> c - output tensor
*/
void _CudaDropoutWithIndex(const XTensor * x, XTensor * maskIndex, XTensor * c)
{
int devID = c->devID;
int blockNum = maskIndex->unitNum;
int cudaGrids[3];
int cudaBlocks[3];
int devIDBackup;
ProtectCudaDev(devID, devIDBackup);
GDevs.GetCudaThread(devID, blockNum, cudaGrids, cudaBlocks);
dim3 blocks(cudaGrids[0]);
dim3 threads(cudaBlocks[0]);
DTYPE * tData = (DTYPE*)c->data;
int * sIndex = NULL;
sIndex = (int *)maskIndex->data;
KernelDropoutWithIndex1D <<<blocks, threads >>>(tData, sIndex, blockNum);
BacktoCudaDev(devID, devIDBackup);
}
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Jiang Yufan (email: jiangyufan2018@outlook.com) 2019-03-20
*/
#ifndef __DROPOUTWITHINDEX_CUH__
#define __DROPOUTWITHINDEX_CUH__
#include "../XTensor.h"
#include "DropoutWithIndex.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* dropout with index (cuda version) */
void _CudaDropoutWithIndex(const XTensor * x, XTensor * maskIndex, XTensor * c);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif // __DROPOUTWITHINDEX_CUH__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Jiang Yufan (email: jiangyufan2018@outlook.com) 2019-03-20
*/
#ifndef __DROPOUTWITHINDEX_H__
#define __DROPOUTWITHINDEX_H__
#include "../XTensor.h"
namespace nts {
void _DropoutWithIndex(const XTensor * x, XTensor * maskIndex, XTensor * c);
XTensor DropoutWithIndex(const XTensor &x, XTensor &mask, DTYPE scale);
} // namespace nts(NiuTrans.Tensor)
#endif // !__DROPOUTWITHINDEX_H__
......@@ -26,7 +26,6 @@
#include "../XTensor.h"
#include "CrossEntropy.h"
#include "Dropout.h"
#include "HardTanH.h"
#include "Identity.h"
......
......@@ -23,7 +23,7 @@
#include "../XName.h"
#include "HardTanH.h"
#include "HardTanH.cuh"
#include "CrossEntropy.h"
#include "../loss/LHeader.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......@@ -84,6 +84,21 @@ XTensor HardTanH(const XTensor &x)
return y;
}
void HardTanH(const XTensor &x, XTensor &y, bool requireLink)
{
if (!y.isInit || !XTensor::IsSameShaped(&y, &x)) {
InitTensor(&y, &x);
}
/* call _HardTanH function */
_HardTanH(&x, &y);
if (requireLink) {
/* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_HARDTANH);
}
}
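The same calling pattern applies to the other new activation wrappers (Identity, Rectify, Sigmoid); a minimal sketch, assuming a CPU tensor and the usual InitTensor2D / SetDataRand helpers:

/* illustrative sketch: in-place activation with an optional autodiff link */
XTensor x;
XTensor y;
InitTensor2D(&x, 2, 3, X_FLOAT);
x.SetDataRand(-2.0F, 2.0F);
HardTanH(x, y);          /* forward only */
HardTanH(x, y, true);    /* forward, plus the FUNC_HARDTANH link for backward */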
/*
backward computation
......
......@@ -22,7 +22,7 @@
#include "HardTanH.h"
#include "HardTanH.cuh"
#include "Loss.cuh"
#include "CrossEntropy.cuh"
#include "../loss/CrossEntropy.cuh"
#include "../XDevice.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......
......@@ -40,6 +40,8 @@ void _HardTanH(const XTensor * x, XTensor * y);
/* hard tanh function (return an XTensor structure) */
XTensor HardTanH(const XTensor &x);
void HardTanH(const XTensor &x, XTensor &y, bool requireLink = false);
/* de/dx */
void _HardTanHBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
......
......@@ -21,7 +21,7 @@
#include "../XName.h"
#include "Identity.h"
#include "CrossEntropy.h"
#include "../loss/LHeader.h"
#include "../XUtility.h"
#include "../core/movement/CopyValues.h"
......@@ -57,6 +57,22 @@ XTensor Identity(const XTensor &x)
return y;
}
void Identity(const XTensor &x, XTensor &y, bool requireLink)
{
if (!y.isInit || !XTensor::IsSameShaped(&y, &x)) {
InitTensor(&y, &x);
}
/* call _Identity function */
_Identity(&x, &y);
if (requireLink) {
/* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_IDENTITY);
}
}
/*
backward computation for identity function y = x
......
......@@ -33,6 +33,8 @@ void _Identity(const XTensor * x, XTensor * y);
/* identity function y = x (return an XTensor structure) */
XTensor Identity(const XTensor &x);
void Identity(const XTensor &x, XTensor &y, bool requireLink = false);
/* de/dx */
void _IdentityBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
......
......@@ -194,6 +194,25 @@ XTensor LogSoftmax(const XTensor &x, int leadDim)
return y;
}
void LogSoftmax(const XTensor &x, XTensor &y, int leadDim, bool requireLink)
{
int ld = leadDim;
if (ld < 0)
ld = x.order - 1;
if (!y.isInit || !XTensor::IsSameShaped(&y, &x)) {
InitTensor(&y, &x);
}
/* call _LogSoftmax function */
_LogSoftmax(&x, &y, ld);
if (requireLink) {
/* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_LOGSOFTMAX);
XLink::AddParamToHeadInt(&y, ld);
}
}
/*
log scale softmax y = log(e^x / \sum_{i} e^{x_i})
make a new tensor to keep the result and return it
......
......@@ -33,6 +33,8 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim);
/* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (return an XTensor structure) */
XTensor LogSoftmax(const XTensor &x, int leadDim);
void LogSoftmax(const XTensor &x, XTensor &y, int leadDim, bool requireLink = false);
/* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (with both argument of x and y) */
void LogSoftmax(const XTensor &x, XTensor &y, int leadDim);
......
......@@ -22,7 +22,7 @@
#include "../XName.h"
#include "Rectify.h"
#include "Rectify.cuh"
#include "CrossEntropy.h"
#include "../loss/LHeader.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......@@ -77,6 +77,20 @@ XTensor Rectify(const XTensor &x)
return y;
}
void Rectify(const XTensor &x, XTensor &y, bool requireLink)
{
if (!y.isInit || !XTensor::IsSameShaped(&y, &x)) {
InitTensor(&y, &x);
}
/* call _Rectify function */
_Rectify(&x, &y);
if (requireLink) {
/* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_RECTIFY);
}
}
/*
backward computation
......
......@@ -22,7 +22,7 @@
#include "Rectify.h"
#include "Rectify.cuh"
#include "Loss.cuh"
#include "CrossEntropy.cuh"
#include "../loss/CrossEntropy.cuh"
#include "../XDevice.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......
......@@ -33,6 +33,8 @@ void _Rectify(const XTensor * x, XTensor * y);
/* rectify function y = max(0, x) (return an XTensor structure) */
XTensor Rectify(const XTensor &x);
void Rectify(const XTensor &x, XTensor &y, bool requireLink = false);
/* de/dx */
void _RectifyBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
......
......@@ -23,7 +23,7 @@
#include <math.h>
#include "Sigmoid.h"
#include "Sigmoid.cuh"
#include "CrossEntropy.h"
#include "../loss/LHeader.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
......@@ -75,6 +75,21 @@ XTensor Sigmoid(const XTensor &x)
return y;
}
void Sigmoid(const XTensor &x, XTensor &y, bool requireLink)
{
if (!y.isInit || !XTensor::IsSameShaped(&y, &x)) {
InitTensor(&y, &x);
}
/* call _Sigmoid function */
_Sigmoid(&x, &y);
if (requireLink) {
/* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_SIGMOID);
}
}
/*
backward computation
......
......@@ -22,7 +22,7 @@
#include "Sigmoid.h"
#include "Sigmoid.cuh"
#include "Loss.cuh"
#include "CrossEntropy.cuh"
#include "../loss/CrossEntropy.cuh"
#include "../XDevice.h"
#ifdef USE_CUDA
......
......@@ -33,6 +33,8 @@ void _Sigmoid(const XTensor * x, XTensor * y);
/* sigmoid function y = 1/(1+exp(-x)) (return an XTensor structure) */
XTensor Sigmoid(const XTensor &x);
void Sigmoid(const XTensor &x, XTensor &y, bool requireLink = false);
/* de/dx */
void _SigmoidBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
......
......@@ -148,6 +148,26 @@ XTensor Softmax(const XTensor &x, int leadDim)
return y;
}
void Softmax(const XTensor &x, XTensor &y, int leadDim, bool requireLink)
{
int ld = leadDim;
if (ld < 0)
ld = x.order - 1;
if (!y.isInit || !XTensor::IsSameShaped(&y, &x)) {
InitTensor(&y, &x);
}
/* call _Softmax function */
_Softmax(&x, &y, ld);
if (requireLink) {
/* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_SOFTMAX);
XLink::AddParamToHeadInt(&y, ld);
}
}
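Softmax and LogSoftmax follow the same pattern but take a leading dimension; a hedged sketch under the same assumptions:

/* illustrative sketch: softmax over the last dimension of a (2, 5) tensor */
XTensor x;
XTensor y;
InitTensor2D(&x, 2, 5, X_FLOAT);
x.SetDataRand(-1.0F, 1.0F);
Softmax(x, y, -1);          /* -1 selects the last dimension */
Softmax(x, y, -1, true);    /* also records the FUNC_SOFTMAX link and the dimension */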
/*
backward computation for dense tensors
......
......@@ -372,27 +372,16 @@ void _CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
int * dimSize = new int[y->order];
for(int i = 0; i < y->order; i++){
if(i < leadDim)
dimSize[i] = -y->dimSize[i];
dimSize[i] = y->dimSize[i];
else if(i > leadDim)
dimSize[i - 1] = -y->dimSize[i];
dimSize[i - 1] = y->dimSize[i];
}
XMem * mem = y->mem;
/* make a matrix of the same size as the y (i.e., y) */
XTensor * ytmp = NewTensor(y, false);
XTensor * ytmp = NewTensor(y);
/* make a matrix to keep \beta */
XTensor * beta = new XTensor(y->order - 1, dimSize, y->dataType, y->denseRatio, y->devID, mem);
if(mem != NULL){
ytmp->data = mem->AllocBuf(mem->devID, y->unitNum * y->unitSize);
beta->data = mem->AllocBuf(mem->devID, beta->unitNum * beta->unitSize);
}
else{
ytmp->data = XMemAlloc(y->devID, y->unitNum * y->unitSize);
beta->data = XMemAlloc(y->devID, beta->unitNum * beta->unitSize);
}
XTensor * beta = NewTensor(y->order - 1, dimSize, y->dataType, y->denseRatio, y->devID, y->mem);
/* \beta = \sum_i (dE/dy_i * y_i) */
_Multiply(dedy, y, ytmp, 0, 0);
......@@ -405,19 +394,6 @@ void _CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
/* dE/ds_j = y_j * ytmp = y_j * (dE/dy_j - \beta) */
_Multiply(y, ytmp, dedx, 0, 0);
if(mem != NULL){
mem->ReleaseBuf(mem->devID, y->unitNum * y->unitSize);
mem->ReleaseBuf(mem->devID, beta->unitNum * beta->unitSize);
}
else{
XMemFree(y->devID, ytmp->data);
XMemFree(y->devID, beta->data);
}
ytmp->data = NULL;
beta->data = NULL;
delete[] dimSize;
delete ytmp;
delete beta;
......
......@@ -33,6 +33,8 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim);
/* softmax y = e^x / \sum_{i} e^{x_i} (return an XTensor structure) */
XTensor Softmax(const XTensor &x, int leadDim);
void Softmax(const XTensor &x, XTensor &y, int leadDim, bool requireLink = false);
/* de/dx */
void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
......
......@@ -22,6 +22,8 @@
#include <math.h>
#include "CrossEntropy.h"
#include "CrossEntropy.cuh"
#include "../XTensor.h"
#include "../XName.h"
#include "../core/arithmetic/MultiplyDim.h"
#include "../core/arithmetic/Multiply.h"
#include "../core/math/Unary.h"
......@@ -61,7 +63,7 @@ void _CrossEntropy(const XTensor * output, const XTensor * gold,
CheckNTErrors(loss->order == output->order - 1, "Wrong loss dimension!");
CheckNTErrors(gold->dataType == DEFAULT_DTYPE && output->dataType == DEFAULT_DTYPE, "TODO!");
XTensor * interBuf1 = NewTensorBuf(output, output->devID, output->mem);
/*XTensor * interBuf1 = NewTensorBuf(output, output->devID, output->mem);
XTensor * interBuf2 = NewTensorBuf(output, output->devID, output->mem);
_Log(output, interBuf1);
......@@ -76,7 +78,23 @@ void _CrossEntropy(const XTensor * output, const XTensor * gold,
_MultiplyMe(loss, padding);
DelTensorBuf(interBuf2);
DelTensorBuf(interBuf1);
DelTensorBuf(interBuf1);*/
XTensor * inter = NewTensor(output);
_Log(output, inter);
_MultiplyMe(inter, gold);
if(weight != NULL)
_MultiplyDimMe(inter, weight, n);
_NegateMe(inter);
_ReduceSum(inter, loss, n);
if(padding != NULL)
_MultiplyMe(loss, padding);
DelTensor(inter);
}
/*
......@@ -223,6 +241,93 @@ void _CrossEntropyFast(const XTensor * output, const XTensor * gold,
}
/*
make a tensor whose shape is that of the input with the given dimension removed
(it keeps the reduced result, e.g., the per-sample loss, and is marked as a temporary tensor)
>> input - the input tensor
>> dim - the dimension to drop
<< return - the reduced tensor
*/
XTensor GetReduceTensor(const XTensor & input, int dim)
{
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
int order = input.order - 1;
int * dimSize = new int[order];
for(int i = 0; i < order; i++){
if(i < dim)
dimSize[i] = input.dimSize[i];
else
dimSize[i] = input.dimSize[i + 1];
}
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
XTensor output(order, dimSize, input.dataType, dr, input.devID, input.mem);
output.SetTMPFlag();
delete[] dimSize;
return output;
}
/*
compute the cross entropy loss (return an XTensor structure)
make a new tensor to keep the result and return it
loss = sum_{i} (-gold_i * log(output_i))
where gold and output are distributions
>> output - model prediction
>> gold - gold standard
>> leadingDim - the leading dimension for the output
<< return - the loss tensor
*/
XTensor CrossEntropy(const XTensor & output, const XTensor & gold,
int leadingDim)
{
int dim = leadingDim < 0 ? output.order - 1 : leadingDim;
XTensor loss;
loss = GetReduceTensor(output, dim);
XTensor * weight = NULL;
XTensor * padding = NULL;
/* call _CrossEntropy function */
_CrossEntropy(&output, &gold, &loss, weight, padding, dim);
/* tensor connection */
TensorList tails(4);
tails.Add((XTensor*)&output);
tails.Add((XTensor*)&gold);
tails.Add(weight);
tails.Add(padding);
XLink::MakeLink(&tails, &loss, LOSS_CROSSENTROPY);
XLink::AddParamToHeadInt(&loss, dim);
return loss;
}
XTensor CrossEntropy(const XTensor & output, const XTensor & gold,
const XTensor & padding,
int leadingDim)
{
int dim = leadingDim < 0 ? output.order - 1 : leadingDim;
XTensor loss;
loss = GetReduceTensor(output, dim);
XTensor * weight = NULL;
/* call _CrossEntropy function */
_CrossEntropy(&output, &gold, &loss, weight, &padding, dim);
/* tensor connection */
TensorList tails(4);
tails.Add((XTensor*)&output);
tails.Add((XTensor*)&gold);
tails.Add(weight);
tails.Add((XTensor*)&padding);
XLink::MakeLink(&tails, &loss, LOSS_CROSSENTROPY);
XLink::AddParamToHeadInt(&loss, dim);
return loss;
}
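A hedged usage sketch of the XTensor-level cross entropy (illustrative only; output and gold are assumed to hold probability distributions over the last dimension, and the InitTensor2D helper is used for setup):

/* illustrative sketch: per-row cross entropy of a (4, 10) prediction */
XTensor output;
XTensor gold;
InitTensor2D(&output, 4, 10, X_FLOAT);
InitTensor2D(&gold, 4, 10, X_FLOAT);
/* ... fill output with predicted distributions and gold with one-hot rows ... */
XTensor loss;
loss = CrossEntropy(output, gold);   /* loss is a length-4 vector, one value per row */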
/*
compute the cross entropy loss
loss = sum_{i} (-gold_i * log(output_i))
where gold and output are distributions
......@@ -579,16 +684,16 @@ void _CrossEntropyBackward(XTensor * dedy, const XTensor * output,
}
}
//if(padding != NULL) {
// XTensor * tmp = NewTensor(padding);
// _IsNonZero(padding, tmp);
// int nonZeroNum = (int)_ReduceSumAll(tmp);
// _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum);
// delete tmp;
//}
//else {
// _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)blockNum);
//}
if(padding != NULL) {
XTensor * tmp = NewTensor(padding);
_IsNonZero(padding, tmp);
int nonZeroNum = (int)_ReduceSumAll(tmp);
_ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum);
delete tmp;
}
else {
_ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)blockNum);
}
}
} // namespace nts(NiuTrans.Tensor)
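In effect, this normalization scales the loss gradient dE/dy by 1/N, where N is the number of non-zero padding entries when a padding tensor is given, and the block count otherwise, so the gradient is averaged over the effective positions rather than summed.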
......@@ -196,16 +196,17 @@ void _CudaCrossEntropyBackward(XTensor * dedy, const XTensor * output,
delete[] dims;
}
//if(padding != NULL) {
// XTensor * tmp = NewTensor(padding);
// _IsNonZero(padding, tmp);
// int nonZeroNum = (int)_ReduceSumAll(tmp);
// _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum);
// delete tmp;
//}
//else {
// _ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)blockNum);
//}
if(padding != NULL) {
XTensor * tmp = NewTensor(padding);
_IsNonZero(padding, tmp);
int nonZeroNum = (int)_ReduceSumAll(tmp);
_ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)nonZeroNum);
delete tmp;
}
else {
int num = dedy->unitNum / dedy->GetDim(n);
_ScaleAndShiftMe(dedy, (DTYPE)1.0/(DTYPE)num);
}
}
......
......@@ -41,6 +41,25 @@ void _CrossEntropyFast(const XTensor * output, const XTensor * gold,
XTensor * loss, const XTensor * weight = NULL,
const XTensor * padding = NULL, int leadingDim = -1);
/* compute the cross entropy loss */
XTensor CrossEntropy(const XTensor & output, const XTensor & gold,
int leadingDim = -1);
/* compute the cross entropy loss with padding */
XTensor CrossEntropy(const XTensor & output, const XTensor & gold,
const XTensor & padding,
int leadingDim = -1);
/* compute the cross entropy loss with weight */
XTensor CrossEntropyWeight(const XTensor & output, const XTensor & gold,
const XTensor & weight,
int leadingDim = -1);
/* compute the cross entropy loss with weight and padding */
XTensor CrossEntropyWeight(const XTensor & output, const XTensor & gold,
const XTensor & padding, const XTensor & weight,
int leadingDim = -1);
/* compute the cross entropy loss (return the loss) */
DTYPE _CrossEntropy(const XTensor * output, const XTensor * gold,
LOSS_COMPUTE_WAY reduceWay, const XTensor * weight = NULL,
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2019-4-22
*/
/* this is a header to include all loss computations in the "loss" workspace */
#ifndef __LHEADER_H__
#define __LHEADER_H__
#include "CrossEntropy.h"
#endif // __LHEADER_H__
\ No newline at end of file
......@@ -30,7 +30,7 @@ In this case, 2 * (2, 1) -> (2, 2), dim=1.
bool TestConcatenate1()
{
/* create list */
XList * sList = new XList();
TensorList * sList = new TensorList();
/* a source tensor of size (2, 1) */
int sOrder1 = 2;
......@@ -157,7 +157,7 @@ In this case, 2 * (2, 1) -> (4, 1), dim=0.
bool TestConcatenate2()
{
/* create list */
XList * sList = new XList();
TensorList * sList = new TensorList();
/* a source tensor of size (2, 1) */
int sOrder1 = 2;
......@@ -286,7 +286,7 @@ In this case, (2, 1) + (2, 2) -> (2, 3), dim=1.
bool TestConcatenate3()
{
/* create list */
XList * sList = new XList();
TensorList * sList = new TensorList();
/* a source tensor of size (2, 1) */
int sOrder1 = 2;
......
......@@ -31,7 +31,7 @@ In this case, 2 * (2, 1) -> (2, 2), dim=1.
bool TestConcatenateSolely1()
{
/* create list */
XList * sList = new XList();
TensorList * sList = new TensorList();
/* a source tensor of size (2, 1) */
int sOrder1 = 2;
......@@ -154,7 +154,7 @@ In this case, 2 * (2, 1) -> (4, 1), dim=0.
bool TestConcatenateSolely2()
{
/* create list */
XList * sList = new XList();
TensorList * sList = new TensorList();
/* a source tensor of size (2, 1) */
int sOrder1 = 2;
......@@ -279,7 +279,7 @@ In this case, (2, 1) + (2, 2) -> (2, 3), dim=1.
bool TestConcatenateSolely3()
{
/* create list */
XList * sList = new XList();
TensorList * sList = new TensorList();
/* a source tensor of size (2, 1) */
int sOrder1 = 2;
......
......@@ -22,7 +22,7 @@
#ifndef __TEST_CROSSENTROPY_H__
#define __TEST_CROSSENTROPY_H__
#include "../function/CrossEntropy.h"
#include "../loss/CrossEntropy.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......
......@@ -246,7 +246,7 @@ In this case, 2 * (2, 4) -> (4, 4), whereToMerge=0.
bool TestMerge3()
{
/* create list */
XList * smallList = new XList();
TensorList * smallList = new TensorList();
/* a small tensor of size (2, 4) */
int sOrder = 2;
......@@ -364,7 +364,7 @@ In this case, 2 * (2, 4) -> (2, 8), whereToMerge=1.
bool TestMerge4()
{
/* create list */
XList * smallList = new XList();
TensorList * smallList = new TensorList();
/* a small tensor of size (2, 4) */
int sOrder = 2;
......
......@@ -125,10 +125,10 @@ bool TestRectify2()
{1.0F, 1.0F, 1.0F} };
DTYPE yAnswer[2][3] = { {1.0F, 1.0F, 2.0F},
{2.0F, 4.0F, 5.0F} };
DTYPE dedyAnswer[2][3] = { {-1.0F, -1.0F, -0.5F},
{-0.5F, -0.25F, -0.2F} };
DTYPE dedxAnswer[2][3] = { {-1.0F, -1.0F, -0.5F},
{-0.5F, -0.25F, -0.2F} };
DTYPE dedyAnswer[2][3] = { {-0.5F, -0.5F, -0.25F},
{-0.25F, -0.125F, -0.1F} };
DTYPE dedxAnswer[2][3] = { {-0.5F, -0.5F, -0.25F},
{-0.25F, -0.125F, -0.1F} };
/* CPU test */
bool cpuTest = true;
......
......@@ -222,8 +222,8 @@ In this case, (3, 4) -> 2 * (3, 2) , whereToSplit=1, splitNum=2.
bool TestSplit3()
{
/* create list */
XList * tList = new XList();
XList tUserList;
TensorList * tList = new TensorList();
TensorList tUserList;
/* a source tensor of size (3, 4) */
int sOrder = 2;
......
......@@ -35,7 +35,7 @@ bool Test()
wrong = !TestConcatenate() || wrong;
wrong = !TestConcatenateSolely() || wrong;
wrong = !TestCos() || wrong;
wrong = !TestConvertDataType() || wrong;
//wrong = !TestConvertDataType() || wrong;
wrong = !TestCopyIndexed() || wrong;
wrong = !TestCopyValues() || wrong;
wrong = !TestDiv() || wrong;
......