Commit 39fa0d25 by linye

1. merge with the liyinqiao branch 2. support reading the float16 data type

parent 86adc288
......@@ -43,18 +43,18 @@ void XFuncGrad::MakeGrad(XTensor * node, bool isEfficient)
XNoder::MakeGrad(input);
if(operID == FUNC_HARDTANH)
_HardTanHBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
_HardTanHBackward(output, input, output->grad, input->grad);
else if(operID == FUNC_IDENTITY)
_IdentityBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
_IdentityBackward(output, input, output->grad, input->grad);
else if(operID == FUNC_LOGSOFTMAX){
int leadDim = income.GetParamInt(0);
CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in logsoftmax!");
_LogSoftmaxBackward(NULL, output, input, output->grad, input->grad, NULL, leadDim, NOLOSS);
}
else if(operID == FUNC_RECTIFY)
_RectifyBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
_RectifyBackward(output, input, output->grad, input->grad);
else if(operID == FUNC_SIGMOID)
_SigmoidBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
_SigmoidBackward(output, input, output->grad, input->grad);
else if(operID == FUNC_SOFTMAX){
int leadDim = income.GetParamInt(0);
CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in softmax!");
......
......@@ -61,7 +61,7 @@ void XLossGrad::MakeGrad(XTensor * node, bool isEfficient)
if(operID == LOSS_CROSSENTROPY) {
if (income.tailNum == 3)
padding = income.tails[2];
leadingDim = income.GetParamInt(0);
CheckNTErrors(leadingDim >= 0 && leadingDim < output->order, "wrong leading dimension in crossentropy!");
_CrossEntropyBackward(dedy, output, gold, weight, padding, leadingDim);
}
......@@ -90,39 +90,39 @@ compute dE/dx for a given function y = f(x)
>> params - parameters of the function
>> lossName - name of the loss, e.g., cross entropy
*/
void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx, XTensor * padding,
int funcID, void * params,
LOSS_FUNCTION_NAME lossName)
{
CheckNTErrors(gold && y && x, "Empty input tensors!");
CheckNTErrors(dedx, "Empty gradient tensors!");
CheckNTErrors((funcID & FUNCTION_BASE) != 0, "Illegal function id");
if(funcID == FUNC_HARDTANH){
_HardTanHBackward(gold, y, x, dedy, dedx, lossName);
}
else if(funcID == FUNC_IDENTITY){
_IdentityBackward(gold, y, x, dedy, dedx, lossName);
}
else if(funcID == FUNC_LOGSOFTMAX){
int leadDim = *(int*)params;
_LogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
}
else if(funcID == FUNC_RECTIFY){
_RectifyBackward(gold, y, x, dedy, dedx, lossName);
}
else if(funcID == FUNC_SIGMOID){
_SigmoidBackward(gold, y, x, dedy, dedx, lossName);
}else if(funcID == FUNC_SOFTMAX){
int leadDim = *(int*)params;
_SoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
}
else{
ShowNTErrors("wrong function found when calling the backward process!");
}
}
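/* a minimal call sketch (illustrative; it assumes gold, y, x, dedy, dedx and
   padding are initialized XTensor pointers of compatible shapes): computing
   dE/dx for a softmax output, where the leading dimension is passed through
   the generic "params" pointer */
//XLossGrad lossGrad;
//int leadDim = 0;
//lossGrad.Compute(gold, y, x, dedy, dedx, padding, FUNC_SOFTMAX, &leadDim, CROSSENTROPY);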
//void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
// XTensor * dedy, XTensor * dedx, XTensor * padding,
// int funcID, void * params,
// LOSS_FUNCTION_NAME lossName)
//{
// CheckNTErrors(gold && y && x, "Empty input tensors!");
// CheckNTErrors(dedx, "Empty gradient tensors!");
// CheckNTErrors((funcID & FUNCTION_BASE) != 0, "Illegal function id");
//
// if(funcID == FUNC_HARDTANH){
// _HardTanHBackward(gold, y, x, dedy, dedx, lossName);
// }
// else if(funcID == FUNC_IDENTITY){
// _IdentityBackward(gold, y, x, dedy, dedx, lossName);
// }
// else if(funcID == FUNC_LOGSOFTMAX){
// int leadDim = *(int*)params;
// _LogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
// }
// else if(funcID == FUNC_RECTIFY){
// _RectifyBackward(gold, y, x, dedy, dedx, lossName);
// }
// else if(funcID == FUNC_SIGMOID){
// _SigmoidBackward(gold, y, x, dedy, dedx, lossName);
// }else if(funcID == FUNC_SOFTMAX){
// int leadDim = *(int*)params;
// _SoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
// }
// else{
// ShowNTErrors("wrong function found when calling the backward process!");
// }
//
//}
/*
compute dE/dy for variable y and error(loss) function E
......@@ -131,19 +131,27 @@ compute dE/dy for variable y and error(loss) function E
>> dedy - dE/dy
>> lossName - name of the loss, e.g., cross entropy
*/
void XLossGrad::Compute(XTensor * gold, XTensor * y,
XTensor * dedy, XTensor * padding,
LOSS_FUNCTION_NAME lossName)
{
if(gold == NULL){
_SetDataFixed(dedy, 1.0F);
return;
}
//_LossBackward(dedy, gold, y, lossName);
if(lossName == CROSSENTROPY)
_CrossEntropyBackward(dedy, y, gold, NULL, padding);
}
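/* a minimal call sketch (illustrative; it assumes dedy has been initialized
   to the shape of y): for cross entropy this fills dedy with dE/dy, and when
   gold is NULL it simply seeds dedy with 1 as the gradient of the network output */
//XLossGrad lossGrad;
//lossGrad.Compute(gold, y, dedy, padding, CROSSENTROPY);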
//void XLossGrad::Compute(XTensor * gold, XTensor * y,
// XTensor * dedy, XTensor * padding,
// LOSS_FUNCTION_NAME lossName)
//{
// if(gold == NULL){
// if(dedy->dataType == X_FLOAT)
// _SetDataFixedFloat(dedy, 1.0F);
// else if(dedy->dataType == X_DOUBLE)
// _SetDataFixedDouble(dedy, 1.0);
// else if(dedy->dataType == X_INT)
// _SetDataFixedInt(dedy, 1);
// else{
// ShowNTErrors("TODO");
// }
// return;
// }
//
// //_LossBackward(dedy, gold, y, lossName);
// if(lossName == CROSSENTROPY)
// _CrossEntropyBackward(dedy, y, gold, NULL, padding);
//
//}
}
\ No newline at end of file
......@@ -43,11 +43,11 @@ public:
static
bool IsLossOP(XTensor * node);
/* compute dE/dx for a given function y = f(x) */
void Compute(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx, XTensor * padding,
int funcID, void * params,
LOSS_FUNCTION_NAME lossName);
///* compute dE/dx for a given function y = f(x) */
//void Compute(XTensor * gold, XTensor * y, XTensor * x,
// XTensor * dedy, XTensor * dedx, XTensor * padding,
// int funcID, void * params,
// LOSS_FUNCTION_NAME lossName);
/* compute dE/dy for variable y and error(loss) function E */
void Compute(XTensor * gold, XTensor * y,
......
......@@ -530,7 +530,7 @@ void XMathGrad::GradMatrixMul(XTensor * node, bool isEfficient)
XTensor * dedc = node->grad;
XTensor * deda = a->grad;
XTensor * dedb = b->grad;
if(a->order == 2 && b->order == 2)
GradMatrixMul(a, deda, transA, b, dedb, transB, dedc, alpha, isEfficient);
else if(transA == X_NOTRANS && a->order > 2 && b->order == 2){
......@@ -735,7 +735,7 @@ void XMathGrad::GradMultiply(XTensor * node, bool isEfficient)
if (!isEfficient || b->isGrad) {
XNoder::MakeGrad(b);
_Multiply(node->grad, a, b->grad, 1.0F);;
_Multiply(node->grad, a, b->grad, 1.0F);
}
node->visitMark = NODE_FINISHED;
......@@ -765,15 +765,15 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
/* dE/da */
_MultiplyDim(node->grad, b, a->grad, n, 1.0F);
/* dE/db */
int order = a->order;
int dimSize[MAX_TENSOR_DIM_NUM];
memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
XTensor * bGradTMP = NewTensorBuf(node->grad, node->devID, node->mem);
_Multiply(node->grad, a, bGradTMP);
if(n == order - 1){
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = a->unitNum/dimSize[order - 1];
......@@ -855,7 +855,6 @@ void XMathGrad::GradMultiplyBroadcast(XTensor * node, bool isEfficient)
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
DTYPE beta = income.GetParam(0);
XNoder::MakeGrad(a);
_MultiplyBroadcast(node->grad, b, a->grad, 1.0F);
......@@ -1079,91 +1078,91 @@ dE/db = - dE/dc * b.reduce(0,...,n-1,n+1,...) * \beta
*/
void XMathGrad::GradSubDim(XTensor * node, bool isEfficient)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for SUBDIM!");
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
int n = income.GetParamInt(0);
DTYPE beta = income.GetParam(1);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
_Sum(a->grad, node->grad, a->grad);
int order = a->order;
int dimSize[MAX_TENSOR_DIM_NUM];
memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
if(n == order - 1){
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = a->unitNum / dimSize[order - 1];
reshapedSize[1] = dimSize[order - 1];
/* we reshape dE/dc to a matrix whose column number is equal to the
size of b. Then we can reduce the matrix into a row vector. */
node->grad->Reshape(2, reshapedSize);
//if(b->outgo.tailNum > 1){
XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(node->grad, bGradTMP, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(bGradTMP, beta);
_Sub(b->grad, bGradTMP, b->grad);
DelTensorBuf(bGradTMP);
/*}
else{
_ReduceSum(node->grad, b->grad, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(b->grad, beta);
_ScaleAndShiftMe(b->grad, -1.0F);
}*/
node->grad->Reshape(order, dimSize);
}
else{
int reshapedSize[MAX_TENSOR_DIM_NUM];
reshapedSize[0] = 1;
reshapedSize[1] = dimSize[n];
reshapedSize[2] = 1;
for(int i = 0; i < order; i++){
if(i < n)
reshapedSize[0] *= dimSize[i];
}
reshapedSize[2] = a->unitNum / (reshapedSize[0] * reshapedSize[1]);
/* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|.
Then reduce along with z and x to obtain dE/db. */
node->grad->Reshape(3, reshapedSize);
XTensor * interGrad = NewTensorBuf(2, reshapedSize, b->dataType, b->denseRatio, b->devID, b->mem);
_ReduceSum(node->grad, interGrad, 2);
//if(b->outgo.tailNum > 1){
XTensor * bGradTMP = NewTensorBuf(b->grad, b->devID, b->mem);
_ReduceSum(interGrad, bGradTMP, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(bGradTMP, beta);
_Sub(b->grad, bGradTMP, b->grad);
DelTensorBuf(bGradTMP);
/*}
else{
_ReduceSum(interGrad, b->grad, 0);
if(beta != 1.0F)
_ScaleAndShiftMe(b->grad, beta);
_ScaleAndShiftMe(b->grad, -1.0F);
}*/
node->grad->Reshape(order, dimSize);
DelTensorBuf(interGrad);
}
node->visitMark = NODE_FINISHED;
}
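/* a worked example of the reduction above (illustrative): for a with
   dimSize = {4, 3, 5} and n = 1, dE/dc is reshaped to (x, y, z) = (4, 3, 5),
   reduced along z to a (4, 3) matrix, and then reduced along x to obtain
   dE/db of size |b| = 3; when n == order - 1, the matrix branch applies
   instead, e.g., dimSize = {4, 3} and n = 1 give a (4, 3) matrix that is
   reduced along dimension 0 */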
/*
......@@ -1319,7 +1318,7 @@ void XMathGrad::GradSumBroadcast(XTensor * node, bool isEfficient)
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
DTYPE beta = income.GetParam(0);
//DTYPE beta = income.GetParam(0);
XNoder::MakeGrad(a);
_Sum(a->grad, node->grad, a->grad);
......
......@@ -146,10 +146,10 @@ private:
static
void GradSub(XTensor * node, bool isEfficient);
/* gradient for sub with one dimension: c = a - b * \beta
where the size of b is equal to that of one dimension of a */
static
void GradSubDim(XTensor * node, bool isEfficient);
/* gradient for sum: c = a + b * \beta */
static
......
......@@ -271,8 +271,8 @@ void XShapeGrad::GradMergeList(XTensor * node, bool isEfficient)
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for MERGE!");
XTensor * last = NULL;
XList smalls(income.tailNum);
XList smallsGrad(income.tailNum);
TensorList smalls(income.tailNum);
TensorList smallsGrad(income.tailNum);
bool mergeOnly = true;
for(int i = 0; i < income.tailNum; i++){
XTensor * tail = income.tails[i];
......@@ -440,7 +440,7 @@ void XShapeGrad::GradSplitListPost(XTensor * node, bool isEfficient)
/* we compute the gradient for current node, rather than for
child node, i.e., we use the outgoing edge here */
XLink &outgo = node->outgo;
XList splits(outgo.tailNum);
TensorList splits(outgo.tailNum);
int whereToSplit = -1;
int splitNum = 0;
......@@ -450,7 +450,7 @@ void XShapeGrad::GradSplitListPost(XTensor * node, bool isEfficient)
if(income.typeID == SHAPE_SPLIT_LIST){
int w = income.GetParamInt(0);
int splitID = income.GetParamInt(1);
if(whereToSplit < 0)
whereToSplit = w;
splitNum++;
......
......@@ -54,7 +54,7 @@ private:
static
void GradGather(XTensor * node, bool isEfficent);
/* gradient computation for dropout with indexs */
/* gradient computation for dropout with index: b = dropoutwithindex(a, index) */
static
void GradDropoutWithIndex(XTensor * node, bool isEfficent);
......
......@@ -37,16 +37,16 @@ struct XNet
unsigned int id;
/* tensor nodes of the network (in order) */
XList nodes;
TensorList nodes;
/* tensor nodes to keep gradient for output (e.g., SGD)*/
XList gradNodes;
TensorList gradNodes;
/* output nodes of the network */
XList outputs;
TensorList outputs;
/* input nodes of the network */
XList inputs;
TensorList inputs;
/* indicates whether the network just keeps the gradient for parameter tensors */
bool isGradEfficient;
......@@ -71,15 +71,15 @@ struct XNet
/* backward propagation to obtain gradient
with a number of root nodes */
void Backward(XList &roots, LOSS_FUNCTION_NAME loss = NOLOSS);
void Backward(TensorList &roots, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward propagation to obtain gradient
with a number of root nodes */
void Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss = NOLOSS);
void Backward(TensorList &roots, TensorList &golds, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward propagation to obtain gradient wrt. the loss/error function
with a number of root nodes */
void Backward(XList &roots, XList &golds, XList &paddings, LOSS_FUNCTION_NAME loss = NOLOSS);
void Backward(TensorList &roots, TensorList &golds, TensorList &paddings, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward computation for a given node */
void BackwardNode(XTensor * node, bool isEfficent = false);
......@@ -93,10 +93,10 @@ struct XNet
/* traverse the net and find the topological order by
depth-first search (Tarjan's algorithm) */
void Traverse(XList &roots);
void Traverse(TensorList &roots);
/* depth-first search given a node (Tarjan's algorithm for topological ordering) */
void TarjanVisit(XTensor * node, XList &orders, const unsigned int code);
void TarjanVisit(XTensor * node, TensorList &orders, const unsigned int code);
/* dump network information */
void Dump(FILE * file);
......
......@@ -51,14 +51,12 @@ initialize the model
>> myIgnored - number of positions ignored in attention (from the beginning)
>> myIsMasked - indicates whether the attention is with a mask
>> myDevID - device id
>> myMem - the memory pool
*/
void T2TAttention::InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID, XMem * myMem)
int myDevID)
{
devID = myDevID;
mem = myMem;
isMasked = myIsMasked;
ignored = myIgnored;
......@@ -71,11 +69,11 @@ void T2TAttention::InitModel(int argc, char ** argv,
LoadParamFloat(argc, argv, "attminmax", &minmax, 0.1F);
LoadParamFloat(argc, argv, "dropoutatt", &dropoutP, 0);
InitTensor2D(&wk, d, dk, X_FLOAT, devID, mem);
InitTensor2D(&wq, d, dk, X_FLOAT, devID, mem);
InitTensor2D(&wv, d, dv, X_FLOAT, devID, mem);
InitTensor2D(&wa, d, d, X_FLOAT, devID, mem);
InitTensor2D(&wbig, d, 3 * d, X_FLOAT, devID, mem);
InitTensor2DV2(&wk, d, dk, X_FLOAT, devID);
InitTensor2DV2(&wq, d, dk, X_FLOAT, devID);
InitTensor2DV2(&wv, d, dv, X_FLOAT, devID);
InitTensor2DV2(&wa, d, d, X_FLOAT, devID);
InitTensor2DV2(&wbig, d, 3 * d, X_FLOAT, devID);
float scale = 1.0F;
float finfoutk = (float)sqrt(6.0F * scale/(d + dk));
......@@ -127,7 +125,7 @@ XTensor T2TAttention::MakeBig(XTensor &kqv, XTensor &mask, bool isTraining)
XTensor q2;
XTensor v2;
XTensor kqv2;
XList split;
TensorList split;
kqv2 = MMul(kqv, wbig);
......@@ -135,9 +133,9 @@ XTensor T2TAttention::MakeBig(XTensor &kqv, XTensor &mask, bool isTraining)
int d2 = kqv2.GetDim(1);
int d3 = kqv2.GetDim(2) / 3;
InitTensor3D(&k2, d1, d2, d3, X_FLOAT, devID, mem);
InitTensor3D(&q2, d1, d2, d3, X_FLOAT, devID, mem);
InitTensor3D(&v2, d1, d2, d3, X_FLOAT, devID, mem);
InitTensor3DV2(&k2, d1, d2, d3, X_FLOAT, devID);
InitTensor3DV2(&q2, d1, d2, d3, X_FLOAT, devID);
InitTensor3DV2(&v2, d1, d2, d3, X_FLOAT, devID);
split.Add(&q2);
split.Add(&k2);
......
......@@ -42,9 +42,6 @@ public:
/* device id */
int devID;
/* memory pool */
XMem * mem;
/* head number */
int nhead;
......@@ -61,7 +58,7 @@ public:
XTensor wa;
XTensor wbig;
/* size of transformed Q and K */
int dk;
......@@ -94,7 +91,7 @@ public:
/* initialize the model */
void InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID = -1, XMem * myMem = NULL);
int myDevID = -1);
/* make the network */
XTensor Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-25
* it is cold today but i'll move to a warm place tomorrow :)
*/
#ifndef __T2TBATCHLOADER_H__
#define __T2TBATCHLOADER_H__
#include "../../network/XNet.h"
using namespace nts;
namespace transformer
{
#define MAX_SEQUENCE_LENGTH 1024 * 4
/* node to keep batch information */
struct BatchNode
{
/* beginning position */
int beg;
/* end position */
int end;
/* maximum word number on the encoder side */
int maxEnc;
/* maximum word number on the decoder side */
int maxDec;
/* a key for sorting */
int key;
};
class T2TBatchLoader
{
public:
/* buffer for loading words */
int * buf;
/* another buffer */
int * buf2;
/* batch buf */
BatchNode * bufBatch;
/* buffer size */
int bufSize;
/* size of batch buffer */
int bufBatchSize;
/* length of each sequence */
int * seqLen;
/* another array */
int * seqLen2;
/* offset of the first word for each sequence */
int * seqOffset;
/* number of sequences in the buffer */
int nseqBuf;
/* offset for next sequence in the buffer */
int nextSeq;
/* offset for next batch */
int nextBatch;
/* indicates whether we double the </s> symbol for the output of lms */
bool isDoubledEnd;
/* indicates whether we use batchsize = max * sc
rather than batchsize = word-number, where max is the maximum
length and sc is the sentence number */
bool isSmallBatch;
/* counterpart of "isSmallBatch" */
bool isBigBatch;
/* randomize batches */
bool isRandomBatch;
/* bucket size */
int bucketSize;
public:
/* constructor */
T2TBatchLoader();
/* de-constructor */
~T2TBatchLoader();
/* initialization */
void Init(int argc, char ** argv);
/* load data to buffer */
int LoadBuf(FILE * file, bool isSorted, int step);
/* clear data buffer */
void ClearBuf();
/* set the random batch flag */
void SetRandomBatch(bool flag = true);
/* load a batch of sequences */
int LoadBatch(FILE * file, bool isLM,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold, XTensor * label,
int * seqs,
int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &ws, int &wCount,
int devID, bool isTraining);
/* load a batch of sequences (for language modeling) */
int LoadBatchLM(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold, XTensor * label,
int * seqs, int vs, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, bool isTraining);
/* load a batch of sequences (for machine translation) */
int LoadBatchMT(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold, XTensor * label,
int * seqs, int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &ws, int &wCount,
int devID, bool isTraining);
/* shuffle the data file */
void Shuffle(const char * srcFile, const char * tgtFile);
};
}
#endif
\ No newline at end of file
......@@ -31,6 +31,10 @@ namespace transformer
/* constructor */
AttDecoder::AttDecoder()
{
attentions = NULL;
fnns = NULL;
attLayerNorms = NULL;
fnnLayerNorms = NULL;
attentionsEnde = NULL;
attEndeLayerNorms = NULL;
}
......@@ -38,6 +42,10 @@ AttDecoder::AttDecoder()
/* de-constructor */
AttDecoder::~AttDecoder()
{
delete[] attentions;
delete[] fnns;
delete[] attLayerNorms;
delete[] fnnLayerNorms;
delete[] attentionsEnde;
delete[] attEndeLayerNorms;
}
......@@ -49,16 +57,14 @@ initialize the model
>> myIsMasked - indicates whether the masked attention is employed
>> myIgnored - number of positions ignored in attention (from the start)
>> myDevID - device id
>> myMem - the memory pool
*/
void AttDecoder::InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID, XMem * myMem)
int myDevID)
{
//AttEncoder::InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
//AttEncoder::InitModel(argc, argv, myIsMasked, myIgnored, myDevID);
devID = myDevID;
mem = myMem;
ignored = myIgnored;
LoadParamInt(argc, argv, "nlayer", &nlayer, 6);
......@@ -68,10 +74,10 @@ void AttDecoder::InitModel(int argc, char ** argv,
LoadParamFloat(argc, argv, "dropout", &dropoutP, 0);
CheckNTErrors(nlayer >= 1, "We have one encoding layer at least!");
CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsize\"");
CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsizetgt\"");
/* embedding model */
embedder.InitModel(argc, argv, devID, mem, false);
embedder.InitModel(argc, argv, devID, false);
attentions = new T2TAttention[nlayer];
fnns = new T2TFNN[nlayer];
......@@ -82,12 +88,12 @@ void AttDecoder::InitModel(int argc, char ** argv,
/* initialize the stacked layers */
for (int i = 0; i < nlayer; i++) {
attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
fnns[i].InitModel(argc, argv, myDevID, myMem);
attLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
fnnLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
attentionsEnde[i].InitModel(argc, argv, true, myIgnored, myDevID, myMem);
attEndeLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID);
fnns[i].InitModel(argc, argv, myDevID);
attLayerNorms[i].InitModel(argc, argv, myDevID);
fnnLayerNorms[i].InitModel(argc, argv, myDevID);
attentionsEnde[i].InitModel(argc, argv, true, myIgnored, myDevID);
attEndeLayerNorms[i].InitModel(argc, argv, myDevID);
}
}
......@@ -159,6 +165,8 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
/* layer normalization */
x = fnnLayerNorms[i].Make(res);
}
x.SetName(DECODING_NAME);
return x;
}
......
......@@ -26,6 +26,9 @@
namespace transformer
{
#define DECODING_NAME "decoding"
#define DECODING_INPUT_NAME "decoding_input"
class AttDecoder
{
......@@ -34,9 +37,6 @@ public:
/* device id */
int devID;
/* memory pool */
XMem * mem;
/* layer number */
int nlayer;
......@@ -92,7 +92,7 @@ public:
/* initialize the model */
void InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID = -1, XMem * myMem = NULL);
int myDevID = -1);
/* make the decoding network */
XTensor Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, XTensor &maskEncDec, bool isTraining);
......
......@@ -31,7 +31,6 @@ namespace transformer
T2TEmbedder::T2TEmbedder()
{
devID = -1;
mem = NULL;
vSize = -1;
maxLength = -1;
}
......@@ -46,12 +45,10 @@ initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myDevID - device id
>> myMem - the memory pool
*/
void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem, bool isEnc)
void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, bool isEnc)
{
devID = myDevID;
mem = myMem;
if(isEnc){
LoadParamInt(argc, argv, "vsize", &vSize, -1);
......@@ -64,7 +61,7 @@ void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem, b
LoadParamInt(argc, argv, "d", &eSize, DEFAULT_EMBEDDING_SIZE);
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
InitTensor2D(&w, vSize, eSize, X_FLOAT, devID, mem);
InitTensor2DV2(&w, vSize, eSize, X_FLOAT, devID);
DTYPE v = 1.0F/(float)sqrt((float)eSize);
w.SetDataRandn(0, v);
......@@ -81,7 +78,7 @@ make positional embeddings (of size eSize * length)
*/
void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length)
{
InitTensor2D(&posEmbeddingBase, length, eSize, X_FLOAT, devID, mem);
InitTensor2DV2(&posEmbeddingBase, length, eSize, X_FLOAT, devID);
float * data = new float[posEmbeddingBase.unitNum];
......@@ -145,9 +142,9 @@ XTensor T2TEmbedder::Make(XTensor &input)
/* we make positional embeddings first */
//if(!match){
if(true){
InitTensor(&posEmbedding, input.order + 1, dims, X_FLOAT, 1.0F, devID, mem);
InitTensorV2(&posEmbedding, input.order + 1, dims, X_FLOAT, devID);
XTensor * posTMP = NewTensorBuf(2, dims + 1, X_FLOAT, 1.0F, devID, mem);
XTensor * posTMP = NewTensorBufV2(2, dims + 1, X_FLOAT, devID);
_CopyValues(&posEmbeddingBase, 0, posTMP->unitNum, posTMP, 0);
_Unsqueeze(posTMP, &posEmbedding, 0, dims[0]);
......
......@@ -41,9 +41,6 @@ public:
/* device id */
int devID;
/* memory pool */
XMem * mem;
/* vocabulary size */
int vSize;
......@@ -71,7 +68,7 @@ public:
~T2TEmbedder();
/* initialize the model */
void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL, bool isEnc = true);
void InitModel(int argc, char ** argv, int myDevID = -1, bool isEnc = true);
/* make positional embeddings */
void MakePosEmbedding(int eSize, int d, int length);
......
......@@ -52,15 +52,12 @@ initialize the model
>> argv - list of pointers to the arguments
>> myIsMasked - indicates whether the masked attention is employed
>> myIgnored - number of positions ignored in attention (from the start)
>> myDevID - device id
>> myMem - the memory pool
*/
>> myDevID - device id
*/
void AttEncoder::InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID, XMem * myMem)
int myDevID)
{
devID = myDevID;
mem = myMem;
ignored = myIgnored;
LoadParamInt(argc, argv, "nlayer", &nlayer, 6);
......@@ -73,7 +70,7 @@ void AttEncoder::InitModel(int argc, char ** argv,
CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsize\"");
/* embedding model */
embedder.InitModel(argc, argv, devID, mem);
embedder.InitModel(argc, argv, devID);
attentions = new T2TAttention[nlayer];
fnns = new T2TFNN[nlayer];
......@@ -82,10 +79,10 @@ void AttEncoder::InitModel(int argc, char ** argv,
/* initialize the stacked layers */
for(int i = 0; i < nlayer; i++){
attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
fnns[i].InitModel(argc, argv, myDevID, myMem);
attLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
fnnLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID);
fnns[i].InitModel(argc, argv, myDevID);
attLayerNorms[i].InitModel(argc, argv, myDevID);
fnnLayerNorms[i].InitModel(argc, argv, myDevID);
}
}
......@@ -139,6 +136,9 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
/* layer normalization */
x = fnnLayerNorms[i].Make(res);
}
x.SetName(ENCODING_NAME);
input.SetName(ENCODING_INPUT_NAME);
return x;
}
......
......@@ -32,6 +32,9 @@ using namespace nts;
namespace transformer
{
#define ENCODING_NAME "encoding"
#define ENCODING_INPUT_NAME "encoding_input"
/*
base class of the encoder
......@@ -62,9 +65,6 @@ public:
/* device id */
int devID;
/* memory pool */
XMem * mem;
/* layer number */
int nlayer;
......@@ -115,7 +115,7 @@ public:
/* initialize the model */
void InitModel(int argc, char ** argv,
bool myIsMasked, int myIgnored,
int myDevID = -1, XMem * myMem = NULL);
int myDevID = -1);
/* make the encoding network */
XTensor Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, bool isTraining);
......
......@@ -47,12 +47,10 @@ initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myDevID - device id
>> myMem - the memory pool
*/
void T2TFNN::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
void T2TFNN::InitModel(int argc, char ** argv, int myDevID)
{
devID = myDevID;
mem = myMem;
float minmax = 0;
......@@ -62,11 +60,11 @@ void T2TFNN::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
LoadParamFloat(argc, argv, "fnnminmax", &minmax, 0.1F);
LoadParamFloat(argc, argv, "dropoutfnn", &dropoutP, 0);
InitTensor2D(&w1, inSize, hSize, X_FLOAT, devID, mem);
InitTensor1D(&b1, hSize, X_FLOAT, devID, mem);
InitTensor2DV2(&w1, inSize, hSize, X_FLOAT, devID);
InitTensor1DV2(&b1, hSize, X_FLOAT, devID);
InitTensor2D(&w2, hSize, outSize, X_FLOAT, devID, mem);
InitTensor1D(&b2, outSize, X_FLOAT, devID, mem);
InitTensor2DV2(&w2, hSize, outSize, X_FLOAT, devID);
InitTensor1DV2(&b2, outSize, X_FLOAT, devID);
float scale = 1.0F;
float finfout1 = (float)sqrt(6.0F * scale/(inSize + hSize));
......
......@@ -36,9 +36,6 @@ public:
/* device id */
int devID;
/* memory pool */
XMem * mem;
/* size of input vector */
int inSize;
......@@ -72,7 +69,7 @@ public:
~T2TFNN();
/* initialize the model */
void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL);
void InitModel(int argc, char ** argv, int myDevID = -1);
/* make the network */
XTensor Make(XTensor &input, bool isTraining);
......
......@@ -32,7 +32,6 @@ namespace transformer
T2TLN::T2TLN()
{
devID = -1;
mem = NULL;
d = 0;
}
......@@ -46,18 +45,16 @@ initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myDevID - device id
>> myMem - the memory pool
*/
void T2TLN::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
void T2TLN::InitModel(int argc, char ** argv, int myDevID)
{
devID = myDevID;
mem = myMem;
d = 0;
LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);
InitTensor1D(&w, d, X_FLOAT, devID, mem);
InitTensor1D(&b, d, X_FLOAT, devID, mem);
InitTensor1DV2(&w, d, X_FLOAT, devID);
InitTensor1DV2(&b, d, X_FLOAT, devID);
w.SetDataRand(1.0F, 1.0F);
b.SetZeroAll();
......
......@@ -36,9 +36,6 @@ class T2TLN
public:
/* device id */
int devID;
/* memory pool */
XMem * mem;
/* the transformation matrix w */
XTensor w;
......@@ -57,7 +54,7 @@ public:
~T2TLN();
/* initialize the model */
void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL);
void InitModel(int argc, char ** argv, int myDevID = -1);
/* make the network */
XTensor Make(XTensor &input);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "../../tensor/core/CHeader.h"
#include "T2TLengthPenalty.h"
using namespace nts;
namespace transformer
{
/*
GNMT-like length penalty: pl = ((5 + n)/(5 + 1))^\alpha
where n = length of the sequence
>> length - length of the sequence (for each entry)
>> alpha - the parameter controls the length preference
<< return - length penalty of the sequence (for each entry)
*/
XTensor T2TLengthPenalizer::GNMT(const XTensor & length, float alpha)
{
XTensor base;
XTensor lp;
//base = ScaleAndShift(ScaleAndShift(length, 0, 5.0F), 1.0F/(5 + 1));
base = (length + 5)/(1 + 5);
lp = Power(base, alpha);
return lp;
}
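/* a worked example (illustrative): for a sequence of length n = 6 and
   alpha = 0.6, pl = ((5 + 6)/(5 + 1))^0.6 = (11/6)^0.6, roughly 1.44, so
   longer sequences receive larger penalties and short hypotheses no longer
   win simply because they multiply fewer probability-like terms */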
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-04-08
* Start of a new week - I just finished several documents.
* Writing documents is harder than writing code :)
*/
#ifndef __SUMBYCOLUMNTV_H__
#define __SUMBYCOLUMNTV_H__
#ifndef __T2TLENGTHPENALTY_H__
#define __T2TLENGTHPENALTY_H__
#include "../../XTensor.h"
#include "../../tensor/XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
using namespace nts;
/* sum of a tensor and a (column) vector */
void _SumByColumnTV(const XTensor * a, const XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
namespace transformer
{
} // namespace nts(NiuTrans.Tensor)
/* We intend to penalize short sequences because they have higher scores
in a product of probability-like terms and thus have more chances
to beat others in search. */
class T2TLengthPenalizer
{
public:
/* GNMT-like length penalty: pl = ((5 + n)/(5 + 1))^\alpha
where n = length of the sequence */
static
XTensor GNMT(const XTensor & length, float alpha);
};
#endif // __SUMBYCOLUMNTV_H__
}
#endif
......@@ -40,9 +40,6 @@ public:
/* device id */
int devID;
/* memory pool */
XMem * mem;
/* the encoder */
AttEncoder * encoder;
......@@ -98,7 +95,7 @@ public:
XTensor &maskDec, XTensor &maskEncDec);
/* get parameter matrices */
void GetParams(XList &list);
void GetParams(TensorList &list);
/* dump the parameters */
void Dump(const char * fn);
......
......@@ -31,7 +31,6 @@ namespace transformer
T2TOutput::T2TOutput()
{
devID = -1;
mem = NULL;
vSize = -1;
inSize = -1;
hSize = -1;
......@@ -47,12 +46,10 @@ initialize the model
>> argc - number of arguments
>> argv - list of pointers to the arguments
>> myDevID - device id
>> myMem - the memory pool
*/
void T2TOutput::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
void T2TOutput::InitModel(int argc, char ** argv, int myDevID)
{
devID = myDevID;
mem = myMem;
float minmax = 0;
......@@ -61,7 +58,7 @@ void T2TOutput::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
LoadParamInt(argc, argv, "d", &hSize, DEFAULT_EMBEDDING_SIZE);
LoadParamFloat(argc, argv, "outputminmax", &minmax, 0.08F);
InitTensor2D(&w, hSize, vSize, X_FLOAT, devID, mem);
InitTensor2DV2(&w, hSize, vSize, X_FLOAT, devID);
float scale = 1.0F;
float finfout = (float)sqrt(6.0F * scale/(hSize + vSize));
......@@ -95,6 +92,7 @@ void T2TOutput::Make(XTensor &input, XTensor &output)
//output = LogSoftmax(MMul(x, w), -1);
output = Softmax(MMul(x, w), -1);
output.SetName(OUTPUT_NAME);
}
}
......@@ -28,6 +28,8 @@ using namespace nts;
namespace transformer
{
#define OUTPUT_NAME "output"
/* output layer */
class T2TOutput
......@@ -36,9 +38,6 @@ public:
/* device id */
int devID;
/* memory pool */
XMem * mem;
/* vocabulary size */
int vSize;
......@@ -59,7 +58,7 @@ public:
~T2TOutput();
/* initialize the model */
void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL);
void InitModel(int argc, char ** argv, int myDevID = -1);
/* make the network */
XTensor Make(XTensor &input);
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-13
*/
#include "T2TPredictor.h"
#include "../../tensor/core/CHeader.h"
using namespace nts;
namespace transformer
{
/* constructor */
T2TStateBundle::T2TStateBundle()
{
states = NULL;
isStart = false;
}
/* de-constructor */
T2TStateBundle::~T2TStateBundle()
{
if(states != NULL)
delete[] states;
}
/*
create states
>> num - number of states
*/
void T2TStateBundle::MakeStates(int num)
{
CheckNTErrors(num > 0, "invalid number");
if(states != NULL)
delete[] states;
states = new T2TState[num];
for(int i = 0; i < num; i++){
states[i].prediction = -1;
states[i].pid = T2T_PID_EMPTY;
states[i].isEnd = false;
states[i].isStart = false;
states[i].isCompleted = false;
states[i].prob = 0;
states[i].probPath = 0;
states[i].modelScore = 0;
states[i].nstep = 0;
states[i].last = NULL;
}
stateNum = num;
}
/* constructor */
T2TPredictor::T2TPredictor()
{
startSymbol = -1;
}
/* de-constructor */
T2TPredictor::~T2TPredictor()
{
}
/*
create an initial state
>> model - the t2t model
>> top - the top-most layer of the network
>> input - input of the network
>> beamSize - beam size
>> state - the state to be initialized
*/
void T2TPredictor::Create(T2TModel * model, XTensor * top, const XTensor * input, int beamSize, T2TStateBundle * state)
{
state->layersEnc.Clear();
state->layersDec.Clear();
XTensor * encoding = XLink::SearchNode(top, ENCODING_NAME);
CheckNTErrors(encoding != NULL, "No encoding layers found!");
state->layersEnc.Add(encoding);
state->layersDec.Add(NULL);
int dims[MAX_TENSOR_DIM_NUM];
for (int i = 0; i < input->order - 1; i++)
dims[i] = input->GetDim(i);
dims[input->order - 1] = beamSize;
InitTensorV2(&state->probPath, input->order, dims, X_FLOAT, input->devID);
InitTensorV2(&state->nstep, input->order, dims, X_FLOAT, input->devID);
InitTensorV2(&state->endMark, input->order, dims, X_INT, input->devID);
state->probPath.SetZeroAll();
state->nstep.SetZeroAll();
state->endMark.SetZeroAll();
state->stateNum = 0;
}
/*
set start symbol
>> symbol - the symbol (in integer)
*/
void T2TPredictor::SetStartSymbol(int symbol)
{
startSymbol = symbol;
}
/*
read a state
>> model - the t2t model that keeps the network created so far
>> state - a set of states. It keeps
1) hypotheses (states)
2) probabilities of hypotheses
3) parts of the network for expanding toward the next state
*/
void T2TPredictor::Read(T2TModel * model, T2TStateBundle * state)
{
m = model;
s = state;
}
/*
predict the next state
>> next - next states (assuming that the current state has been read)
>> encoding - encoder output
>> inputEnc - input of the encoder
>> paddingEnc - padding of the encoder
*/
void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
XTensor * inputEnc, XTensor * paddingEnc)
{
int dims[MAX_TENSOR_DIM_NUM];
next->layersEnc.Clear();
next->layersDec.Clear();
AttDecoder &decoder = *m->decoder;
/* word indices of previous positions */
XTensor * inputLast = (XTensor*)s->layersDec.GetItem(0);
/* word indices of positions up to next state */
XTensor inputDec;
/* the first token */
XTensor first;
CheckNTErrors(inputEnc->order >= 2, "Wrong order of the tensor!");
for(int i = 0; i < inputEnc->order - 1; i++)
dims[i] = inputEnc->GetDim(i);
dims[inputEnc->order - 1] = 1;
InitTensorV2(&first, inputEnc->order, dims, X_INT, inputEnc->devID);
_SetDataFixed(&first, startSymbol);
/* add a new word into the input sequence of the decoder side */
if (inputLast == NULL) {
inputDec = Identity(first);
}
else{
inputDec = GeneratePaths(s);
inputDec.SetDevice(inputEnc->devID);
inputDec = Concatenate(first, inputDec, inputDec.order - 1);
}
/* prediction probabilities */
XTensor &output = next->prob;
XTensor decoding;
XTensor decodingStep;
for(int i = 0; i < inputDec.order - 1; i++)
dims[i] = inputDec.GetDim(i);
dims[inputDec.order - 1] = inputDec.GetDim(-1);
XTensor paddingDec;
InitTensorV2(&paddingDec, inputDec.order, dims, X_INT, paddingEnc->devID);
SetDataFixed(paddingDec, 1);
XTensor maskDec;
XTensor maskEncDec;
/* decoder mask */
m->MakeMTMaskDec(*inputEnc, inputDec, *paddingEnc, paddingDec, maskDec, maskEncDec);
/* make the decoding network */
decoding = decoder.Make(inputDec, *encoding, maskDec, maskEncDec, false);
XTensor selectSrc;
XTensor selectTgt;
CheckNTErrors(decoding.order >= 2, "The tensor must be of order 2 or larger!");
int stride = decoding.GetDim(decoding.order - 2);
InitTensor1DV2(&selectSrc, 1, X_INT);
InitTensor1DV2(&selectTgt, 1, X_INT);
selectSrc.SetInt(stride - 1, 0);
selectTgt.SetInt(0, 0);
selectSrc.SetDevice(decoding.devID);
selectTgt.SetDevice(decoding.devID);
/* the decoder output of the last position */
decodingStep = CopyIndexed(decoding, decoding.order - 2, selectSrc, selectTgt);
/* generate the output probabilities */
m->outputLayer->Make(decodingStep, output);
next->layersEnc.AddList(&s->layersEnc);
next->layersDec.Add(&inputDec);
next->layersDec.Add(&output);
}
/*
generate paths up to the states of the current step
>> state - state bundle of the current step
*/
XTensor T2TPredictor::GeneratePaths(T2TStateBundle * state)
{
CheckNTErrors(state->stateNum >= 0, "Illegal state!");
int distance = -1;
for(int i = 0; i < state->stateNum; i++){
T2TState * cur = state->states + i;
int nsteps = 0;
while(cur != NULL){
nsteps++;
cur = cur->last;
}
if(nsteps > distance)
distance = nsteps;
}
XTensor path;
InitTensor2DV2(&path, state->stateNum, distance, X_INT);
path.SetZeroAll();
for(int i = 0; i < state->stateNum; i++){
T2TState * cur = state->states + i;
int nsteps = 0;
while(cur != NULL){
nsteps++;
path.Set2DInt(cur->prediction, i, distance - nsteps);
cur = cur->last;
}
}
return path;
}
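/* an example of the layout (illustrative): with stateNum = 2 and back-pointer
   chains predicting (3, 7, 2) and (3, 5), distance = 3 and the resulting path
   tensor is {{3, 7, 2}, {0, 3, 5}}; shorter hypotheses keep the leading zeros
   written by SetZeroAll() */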
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-13
* This is the first source file I created in 2019 - new start!
*/
#ifndef __T2TPREDICTOR_H__
#define __T2TPREDICTOR_H__
#include "T2TModel.h"
#include "T2TLengthPenalty.h"
namespace transformer
{
#define T2T_PID_EMPTY -1
/* state for search. It keeps the path (back-pointer), prediction distribution,
etc. It can be regarded as a hypothesis in translation. */
class T2TState
{
public:
/* we assume that the prediction is an integer */
int prediction;
/* id of the problem. One can regard it as the sentence id when we
translate a number of sentences in the batched manner. The hypothesis
is empty if id = -1 */
int pid;
/* indicates whether the state is an end */
bool isEnd;
/* indicates whether the state is the start */
bool isStart;
/* indicates whether the state is completed */
bool isCompleted;
/* probability of every prediction (last state of the path) */
float prob;
/* probability of every path */
float probPath;
/* model score of every path. A model score = path probability + some other stuff */
float modelScore;
/* number of steps we go over so far */
int nstep;
/* pointer to the previous state */
T2TState * last;
};
/* a bundle of states */
class T2TStateBundle
{
public:
/* predictions */
XTensor prediction;
/* id of the previous state that generates the current one */
XTensor preID;
/* mark that indicates whether each hypothesis is completed */
XTensor endMark;
/* probability of every prediction (last state of the path) */
XTensor prob;
/* probability of every path */
XTensor probPath;
/* model score of every path */
XTensor modelScore;
/* step number of each hypothesis */
XTensor nstep;
/* layers on the encoder side. We actually use the encoder output instead
of all hidden layers. */
TensorList layersEnc;
/* layers on the decoder side */
TensorList layersDec;
/* list of states */
T2TState * states;
/* number of states */
int stateNum;
/* indicates whether it is the first state */
bool isStart;
public:
/* constructor */
T2TStateBundle();
/* de-constructor */
~T2TStateBundle();
/* create states */
void MakeStates(int num);
};
/* The predictor reads the current state and then predicts the next.
It is exactly the same procedure as MT inference -
we get the state of previous words and then generate the next word.
Here, a state can be regarded as the representation of words (word
indices, hidden states, embeddings, etc.). */
class T2TPredictor
{
private:
/* pointer to the transformer model */
T2TModel * m;
/* current state */
T2TStateBundle * s;
/* start symbol */
int startSymbol;
public:
/* constructor */
T2TPredictor();
/* de-constructor */
~T2TPredictor();
/* create an initial state */
void Create(T2TModel * model, XTensor * top, const XTensor * input, int beamSize, T2TStateBundle * state);
/* set the start symbol */
void SetStartSymbol(int symbol);
/* read a state */
void Read(T2TModel * model, T2TStateBundle * state);
/* predict the next state */
void Predict(T2TStateBundle * next, XTensor * encoding, XTensor * inputEnc, XTensor * paddingEnc);
/* generate paths up to the states of the current step */
XTensor GeneratePaths(T2TStateBundle * state);
};
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
*/
#ifndef __T2TSEARCH_H__
#define __T2TSEARCH_H__
#include "T2TModel.h"
#include "T2TPredictor.h"
namespace transformer
{
/* The class organizes the search process. It calls "predictors" to generate
distributions of the predictions and prunes the search space by beam pruning.
This makes a graph where each path represents a translation hypothesis.
The output can be the path with the highest model score. */
class T2TSearch
{
private:
/* the alpha parameter controls the length preference */
float alpha;
/* predictor */
T2TPredictor predictor;
/* max length of the generated sequence */
int maxLength;
/* beam size */
int beamSize;
/* batch size */
int batchSize;
/* we keep the final hypotheses in a heap for each sentence in the batch. */
XHeap<MIN_HEAP, float> * fullHypos;
/* array of the end symbols */
int * endSymbols;
/* number of the end symbols */
int endSymbolNum;
/* start symbol */
int startSymbol;
public:
/* constructor */
T2TSearch();
/* de-constructor */
~T2TSearch();
/* initialize the model */
void Init(int argc, char ** argv);
/* search for the most promising states */
void Search(T2TModel * model, XTensor * input, XTensor * padding, XTensor * output);
/* preparation */
void Prepare(int myBatchSize, int myBeamSize);
/* compute the model score for each hypothesis */
void Score(T2TStateBundle * prev, T2TStateBundle * beam);
/* generate token indices via beam pruning */
void Generate(T2TStateBundle * beam);
/* expand the search graph */
void Expand(T2TStateBundle * prev, T2TStateBundle * beam);
/* collect hypotheses with ending symbol */
void Collect(T2TStateBundle * beam);
/* fill the hypothesis heap with incomplete hypotheses */
void FillHeap(T2TStateBundle * beam);
/* save the output sequences in a tensor */
void Dump(XTensor * output);
/* check if the token is an end symbol */
bool IsEnd(int token);
/* set end symbols for search */
void SetEnd(const int * tokens, const int tokenNum);
/* make a mask to prevent duplicated entries in beam expansion for the first position */
XTensor MakeFirstMask(T2TStateBundle * beam);
};
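/* a minimal usage sketch (illustrative; it assumes model, input, padding and
   output are initialized; both calls below are declared above):

       T2TSearch searcher;
       searcher.Init(argc, argv);
       searcher.Search(&model, &input, &padding, &output);

   Search() drives the predictor step by step, scores and prunes hypotheses
   by beam pruning, and dumps the best sequences into "output" */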
}
#endif
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
*/
#include <math.h>
#include "T2TUtility.h"
#include "T2TTester.h"
#include "T2TSearch.h"
#include "../../tensor/XUtility.h"
#include "../../tensor/core/CHeader.h"
#include "../../network/XNoder.h"
using namespace nts;
namespace transformer
{
/* constructor */
T2TTester::T2TTester()
{
}
/* de-constructor */
T2TTester::~T2TTester()
{
}
/* initialize the model */
void T2TTester::Init(int argc, char ** argv)
{
LoadParamInt(argc, argv, "vsize", &vSize, 1);
LoadParamInt(argc, argv, "vsizetgt", &vSizeTgt, vSize);
batchLoader.Init(argc, argv);
seacher.Init(argc, argv);
}
/*
test the model
>> fn - test data file
>> ofn - output data file
>> model - model that is trained
*/
void T2TTester::Test(const char * fn, const char * ofn, T2TModel * model)
{
int wc = 0;
int ws = 0;
int wordCount = 0;
int wordCountTotal = 0;
int sentCount = 0;
int batchCount = 0;
float loss = 0;
/* data files */
FILE * file = fopen(fn, "rb");
CheckNTErrors(file, "Cannot read the test file");
FILE * ofile = fopen(ofn, "wb");
CheckNTErrors(ofile, "Cannot open the output file");
int devID = model->devID;
XNet net;
double startT = GetClockSec();
wordCount = 0;
/* batch of input sequences */
XTensor batchEnc;
XTensor batchDec;
/* label */
XTensor label;
/* padding */
XTensor paddingEnc;
XTensor paddingDec;
/* gold standard */
XTensor gold;
/* an array that keeps the sequences */
int * seqs = new int[MILLION];
batchLoader.SetRandomBatch(false);
batchLoader.ClearBuf();
while(batchLoader.LoadBatch(file, model->isLM,
&batchEnc, &paddingEnc, &paddingDec, &paddingDec, &gold, &label,
seqs, vSize, vSizeTgt,
1, 1, false, ws, wc, devID, false))
{
CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch!");
CheckNTErrors(!model->isLM, "Only MT model is supported!");
XTensor output;
seacher.Search(model, &batchEnc, &paddingEnc, &output);
Dump(ofile, &output);
float prob = 0;
loss += -prob;
wc = batchEnc.GetDim(-1);
wordCount += wc;
wordCountTotal += wc;
sentCount += batchEnc.GetDim(-2);
batchCount += 1;
if (batchCount % 1 == 0) {
double elapsed = GetClockSec() - startT;
XPRINT3(0, stderr,
"[INFO] elapsed=%.1fs, sentence=%d, sword=%d\n",
elapsed, sentCount, wordCount);
}
}
fclose(file);
fclose(ofile);
delete[] seqs;
double elapsed = GetClockSec() - startT;
XPRINT3(0, stderr, "[INFO] test finished (took %.1fs, word=%d, and ppl=%.3f)\n",
elapsed, wordCountTotal, exp(loss/wordCount));
}
/*
dump the result into the file
>> file - data file
>> output - output tensor
*/
void T2TTester::Dump(FILE * file, XTensor * output)
{
int seqLength = output->GetDim(-1);
for (int i = 0; i < output->unitNum; i += seqLength) {
for (int j = 0; j < seqLength; j++) {
int w = output->GetInt(i + j);
fprintf(file, "%d ", w);
if (w < 0)
break;
}
fprintf(file, "\n");
}
}
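/* an example of the format (illustrative): a row of output tokens
   (12, 7, 3, -1, 0, 0) is written as "12 7 3 -1" followed by a newline,
   i.e., dumping stops right after the first negative (end) token */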
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2019-03-27
* A week with no trips :)
*/
#ifndef __T2TTESTER_H__
#define __T2TTESTER_H__
#include "T2TSearch.h"
#include "T2TBatchLoader.h"
namespace transformer
{
/* This class translates test sentences with a trained model. */
class T2TTester
{
public:
/* vocabulary size of the source side */
int vSize;
/* vocabulary size of the target side */
int vSizeTgt;
/* for batching */
T2TBatchLoader batchLoader;
/* decoder for inference */
T2TSearch seacher;
public:
/* constructor */
T2TTester();
/* de-constructor */
~T2TTester();
/* initialize the model */
void Init(int argc, char ** argv);
/* test the model */
void Test(const char * fn, const char * ofn, T2TModel * model);
/* dump the result into the file */
void Dump(FILE * file, XTensor * output);
};
}
#endif
\ No newline at end of file
......@@ -23,35 +23,14 @@
#define __T2TTRAINER_H__
#include "T2TModel.h"
#include "T2TBatchLoader.h"
#include "../../tensor/function/FHeader.h"
#define MAX_SEQUENCE_LENGTH 1024 * 4
using namespace nts;
namespace transformer
{
/* node to keep batch information */
struct BatchNode
{
/* beginning position */
int beg;
/* end position */
int end;
/* maximum word number on the encoder side */
int maxEnc;
/* maximum word number on the decoder side */
int maxDec;
/* a key for sorting */
int key;
};
/* trainer of the T2T model */
class T2TTrainer
{
......@@ -61,42 +40,6 @@ public:
/* parameter array */
char ** argArray;
/* buffer for loading words */
int * buf;
/* another buffer */
int * buf2;
/* batch buf */
BatchNode * bufBatch;
/* buffer size */
int bufSize;
/* size of batch buffer */
int bufBatchSize;
/* length of each sequence */
int * seqLen;
/* another array */
int * seqLen2;
/* offset of the first word for each sequence */
int * seqOffset;
/* number of sequences in the buffer */
int nseqBuf;
/* offset for next sequence in the buffer */
int nextSeq;
/* offset for next batch */
int nextBatch;
/* indicates whether the sequence is sorted by length */
bool isLenSorted;
/* dimension size of each inner layer */
int d;
......@@ -139,10 +82,10 @@ public:
float adamBeta2T;
/* list of the moment of the parameter matrices */
XList moments;
TensorList moments;
/* list of the 2nd order moment of the parameter matrices */
XList moments2nd;
TensorList moments2nd;
/* indicates whether the data file is shuffled for training */
bool isShuffled;
......@@ -158,26 +101,15 @@ public:
/* number of batches on which we do model update */
int updateStep;
/* indicates whether we double the </s> symbol for the output of lms */
bool isDoubledEnd;
/* indicates whether we use batchsize = max * sc
rather than batchsize = word-number, where max is the maximum
length and sc is the sentence number */
bool isSmallBatch;
/* counterpart of "isSmallBatch" */
bool isBigBatch;
/* randomize batches */
bool isRandomBatch;
/* indicates whether we intend to debug the net */
bool isDebugged;
/* bucket size */
int bucketSize;
/* indicates whether the sequence is sorted by length */
bool isLenSorted;
/* for batching */
T2TBatchLoader batchLoader;
public:
/* constructor */
......@@ -197,46 +129,6 @@ public:
/* make a checkpoint */
void MakeCheckpoint(T2TModel * model, const char * validFN, const char * modelFN, const char * label, int id);
/* load data to buffer */
int LoadBuf(FILE * file, bool isSorted, int step);
/* clear data buffer */
void ClearBuf();
/* load a batch of sequences */
int LoadBatch(FILE * file, bool isLM,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold, XTensor * label,
int * seqs,
int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &ws, int &wCount,
int devID, XMem * mem,
bool isTraining);
/* load a batch of sequences (for language modeling) */
int LoadBatchLM(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold, XTensor * label,
int * seqs, int vs, int sBatch, int wBatch,
bool isSorted, int &wCount,
int devID, XMem * mem,
bool isTraining);
/* load a batch of sequences (for machine translation) */
int LoadBatchMT(FILE * file,
XTensor * batchEnc, XTensor * paddingEnc,
XTensor * batchDec, XTensor * paddingDec,
XTensor * gold, XTensor * label,
int * seqs, int vsEnc, int vsDec, int sBatch, int wBatch,
bool isSorted, int &ws, int &wCount,
int devID, XMem * mem,
bool isTraining);
/* shuffle the data file */
void Shuffle(const char * srcFile, const char * tgtFile);
/* get word probabilities for a batch of sequences */
float GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs);
......
......@@ -25,6 +25,8 @@
#include "T2TModel.h"
#include "T2TUtility.h"
#include "T2TTrainer.h"
#include "T2TPredictor.h"
#include "T2TTester.h"
#include "../../tensor/XDevice.h"
#include "../../tensor/XUtility.h"
#include "../../tensor/XGlobal.h"
......@@ -47,6 +49,7 @@ int TransformerMain(int argc, const char ** argv)
ShowParams(argc, args);
bool isBeamSearch = false;
char * trainFN = new char[MAX_LINE_LENGTH];
char * modelFN = new char[MAX_LINE_LENGTH];
char * testFN = new char[MAX_LINE_LENGTH];
......@@ -56,6 +59,7 @@ int TransformerMain(int argc, const char ** argv)
LoadParamString(argc, args, "model", modelFN, "");
LoadParamString(argc, args, "test", testFN, "");
LoadParamString(argc, args, "output", outputFN, "");
LoadParamBool(argc, args, "beamsearch", &isBeamSearch, false);
srand((unsigned int)time(NULL));
......@@ -64,28 +68,35 @@ int TransformerMain(int argc, const char ** argv)
T2TModel model;
model.InitModel(argc, args);
//if(strcmp(modelFN, ""))
// model.Read(modelFN);
/* learn model parameters */
if(strcmp(trainFN, ""))
trainer.Train(trainFN, testFN, strcmp(modelFN, "") ? modelFN : "checkpoint.model", &model);
/* save the final model */
//if(strcmp(modelFN, "") && strcmp(trainFN, ""))
//model.Dump(modelFN);
if(strcmp(modelFN, "") && strcmp(trainFN, ""))
model.Dump(modelFN);
/* load the model if necessary */
//if(strcmp(modelFN, ""))
//model.Read(modelFN);
T2TTrainer tester;
tester.Init(argc, args);
if(strcmp(modelFN, ""))
model.Read(modelFN);
/* test the model on the new data */
if(strcmp(testFN, "") && strcmp(outputFN, ""))
tester.Test(testFN, outputFN, &model);
if(strcmp(testFN, "") && strcmp(outputFN, "")){
/* beam search */
if(isBeamSearch){
T2TTester searcher;
searcher.Init(argc, args);
searcher.Test(testFN, outputFN, &model);
}
/* forced decoding */
else{
T2TTrainer tester;
tester.Init(argc, args);
tester.Test(testFN, outputFN, &model);
}
}
delete[] trainFN;
delete[] modelFN;
......
......@@ -274,7 +274,7 @@ void T2TTest2()
//XTensor * probs = NewTensor(3, dimSize, X_FLOAT, 1.0F, -1);
//myRead(probs, "probs.txt", " ");
_SetDataFixedFloat(probs, 1.0F);
_SetDataFixed(probs, 1.0F);
probs->Reshape(1, probs->unitNum);
......
......@@ -60,7 +60,7 @@ TENSOR_DATA_TYPE GetDataType(const char * typeName)
}
}
/****************************************************
/*
Below is for calling CPU BLAS for fast matrix operations
I'm not sure how fast it is. But it seems that other
guys are crazy about this. So I decided to have a try.
......@@ -81,35 +81,4 @@ _XINLINE_ float Float16ToFloat(unsigned short h)
return f;
}
/*
data type conversion
>> devID - device id
>> s - source data array
>> typeS - source data type
>> t - target data array
>> typeT - target data type
>> size - number of the items in s (and t)
*/
void ConvertDataType(int devID, void * s, TENSOR_DATA_TYPE typeS, void * t, TENSOR_DATA_TYPE typeT, int size)
{
CheckNTErrors((devID < 0), "This code must be run on CPUs!");
if(typeS == typeT)
return;
if(typeS == X_FLOAT && typeT == X_FLOAT16){
for(int i = 0; i < size; i++){
((unsigned short*)t)[i] = FloatToFloat16(((float*)s)[i]);
}
}
else if(typeS == X_FLOAT16 && typeT == X_FLOAT){
for(int i = 0; i < size; i++){
((float*)t)[i] = Float16ToFloat(((unsigned short*)s)[i]);
}
}
else{
ShowNTErrors("Unsupported data types for conversion!");
}
}
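/* A minimal usage sketch of the CPU converter above (illustration only,
   not part of the patch): round-trip a small float array through the
   16-bit representation and inspect the rounding error.

       float src[4] = {1.0F, 0.5F, 3.14159F, -2.0F};
       unsigned short half[4];
       float dst[4];

       ConvertDataType(-1, src, X_FLOAT, half, X_FLOAT16, 4);
       ConvertDataType(-1, half, X_FLOAT16, dst, X_FLOAT, 4);

       for(int i = 0; i < 4; i++)
           printf("%f -> %f\n", src[i], dst[i]);
*/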
} /* end of the nts (NiuTrans.Tensor) namespace */
......@@ -49,15 +49,6 @@ extern TENSOR_DATA_TYPE GetDataType(const char * typeName);
/* data conversion (for lower precision computation) */
unsigned short FloatToFloat16(float f);
float Float16ToFloat(unsigned short h);
void ConvertDataType(int devID,
void * s, TENSOR_DATA_TYPE typeS,
void * t, TENSOR_DATA_TYPE typeT, int size);
#ifdef USE_CUDA
void CudaConvertDataType(int devID,
void * s, TENSOR_DATA_TYPE typeS,
void * t, TENSOR_DATA_TYPE typeT, int size);
#endif
} /* end of the nts (NiuTrans.Tensor) namespace */
......
......@@ -201,7 +201,8 @@ void XDevice::SetGPUDevice(int devID)
cudaError_t error = cudaSetDevice(devID);
if (error != cudaSuccess){
fprintf(stderr, "Error! Calling cudaSetDevice(%d) fails(%d:%s)\n", devID, error, cudaGetErrorString(error));
fprintf(stderr, "Error! Calling cudaSetDevice(%d) fails(%d:%s)\n",
devID, error, cudaGetErrorString(error));
exit(1);
}
#else
......@@ -216,7 +217,7 @@ void XDevice::SetGPUDeviceFast(int devID)
SetFastFlags();
}
/* switch to a get current dev */
/* get the id of the current GPU device */
int XDevice::GetGPUDevice()
{
#ifdef USE_CUDA
......@@ -224,7 +225,8 @@ int XDevice::GetGPUDevice()
cudaError_t error = cudaGetDevice(&devID);
if (error != cudaSuccess){
fprintf(stderr, "Error! Calling cudaGetDevice(%d) fails(%d:%s)\n", devID, error, cudaGetErrorString(error));
fprintf(stderr, "Error! Calling cudaGetDevice(%d) fails(%d:%s)\n",
devID, error, cudaGetErrorString(error));
exit(1);
}
......@@ -248,7 +250,7 @@ void XDevice::SetFastFlags()
#endif
}
/* reset cuda flag for more efficient cuda execution (all devices) */
/* reset the cuda flag for more efficient cuda execution (all devices) */
void XDevice::SetFastFlagsAllDevices()
{
#ifdef USE_CUDA
......@@ -266,10 +268,6 @@ XDevManager::XDevManager()
{
Clear();
Init();
#ifndef USE_CPP11
fprintf(stderr, "Warning!!! c++ 11 is RECOMMENDED for compilation.\n");
#endif
}
/* de-constructor */
......@@ -278,7 +276,7 @@ XDevManager::~XDevManager()
}
/* initialize it and get the CPU and GPU information */
/* initialization */
void XDevManager::Init()
{
srand((unsigned int)time(NULL));
......@@ -322,7 +320,7 @@ void XDevManager::Clear()
#ifdef USE_CUDA
/* get the handle of GPU */
/* get the handle of a given GPU */
cublasHandle_t * XDevManager::GetCudaHandle(const int devID)
{
CheckNTErrors(devID < nGPU, "index of GPU is out of range.");
......@@ -330,7 +328,7 @@ cublasHandle_t * XDevManager::GetCudaHandle(const int devID)
return GPUs[devID].GetCublasHandle();
}
/* get the stream of cuda */
/* get the stream of a given GPU */
cudaStream_t * XDevManager::GetCudaStream(const int devID)
{
CheckNTErrors(devID < nGPU, "index of GPU is out of range.");
......@@ -478,7 +476,7 @@ split a string
>> items - splitting result
<< return - how many items are there
*/
int SplitALine(char * inputString, const char * seperator, XList * items)
int SplitALine(char * inputString, const char * seperator, StrList* items)
{
items->Clear();
......@@ -527,12 +525,12 @@ get device ids for the given device information
devInfo = "0:CPU-1 1:GPU-0 2:CPU-1"
means that the first device is CPU, the second device
is GPU-0, the third device is CPU.
>> devIDs - device sequence specified by devInfo
>> devIDs - device IDs specified by devInfo
<< return - number of devices
*/
int XDevManager::GetDeviceIDs(char * devInfo, int * devIDs)
{
XList * terms = new XList(1);
StrList* terms = new StrList(1);
SplitALine(devInfo, " ", terms);
for(int i = 0; i < terms->count; i++){
......@@ -569,7 +567,7 @@ int XDevManager::GetDeviceIDs(char * devInfo, int * devIDs)
return devCount;
}
/* show id sequence */
/* show device IDs */
void XDevManager::ShowDeviceIDs(char * devInfo, char * msg)
{
msg[0] = 0;
......
......@@ -236,6 +236,18 @@ extern XDevManager GDevs;
cudaSetDevice(devIDBackup); \
} \
#define CheckDev(a, b) \
{ \
if((a < 0 && b >= 0) || (a >= 0 && b < 0)){ \
fprintf(stderr, "[ERROR] (%s line %d): we must run the code on the same device (%d vs %d)\n", __FILENAME__, __LINE__, a, b); \
exit(1); \
} \
else if (a >= 0 && b >= 0 && a != b) { \
fprintf(stderr, "[ERROR] (%s line %d): we must run the code on the same device (%d vs %d)\n", __FILENAME__, __LINE__, a, b); \
exit(1); \
} \
} \
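/* A minimal usage sketch (illustration only): CheckDev guards binary
   tensor operations against mixed-device inputs, e.g.,

       CheckDev(a->devID, b->devID);

   aborts if one tensor sits on the CPU and the other on a GPU, or if
   the two tensors sit on different GPUs. */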
} /* end of the nts (NiuTrans.Tensor) namespace */
#endif
......@@ -51,7 +51,13 @@ bool CONST_TRUE = true;
int verboseLevel = 0;
bool useBLAS = false;
bool useCUDA = false;
#ifdef USE_CUDA
bool useCUDA = true;
#else
bool useCUDA = false;
#endif
FILE * tmpLog = NULL;
double myTime = 0;
......
......@@ -45,10 +45,6 @@ typedef int8_t __int8;
/* the nts (NiuTrans.Tensor) namespace */
namespace nts {
#if (__cplusplus >= 201103L || _MSC_VER >= 1700)
#define USE_CPP11
#endif
#define _XINLINE_
//#define DOUBELPRICSION
......@@ -159,7 +155,9 @@ extern bool useCUDA;
#define XPRINT7(VERBOSE,FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7);FFLUSH(FILEH);}}
#define XPRINT8(VERBOSE,FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7,ARG8) {if(VERBOSE<=verboseLevel) {fprintf(FILEH,STR,ARG,ARG2,ARG3,ARG4,ARG5,ARG6,ARG7,ARG8);FFLUSH(FILEH);}}
#define B2I(V) V==0?false:true
#define B2I(V) V == 0 ? false : true
#define MODX(a, b) int(b == 0 ? a : a - floor(double(a)/b) * b)
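/* MODX is a floor-style modulo, so negative indices wrap around:
   MODX(-1, 4) == 3 and MODX(5, 4) == 1. This lets callers address
   dimensions from the end, e.g.,

       int dim = MODX(-1, a->order);   // index of the last dimension of a

   as _DivDim does when normalizing its dimension argument. */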
/* BLAS interfaces */
#ifdef DOUBELPRICSION
......
......@@ -31,15 +31,15 @@ namespace nts{
/* constructor */
template<HeapType hType, typename T>
XHeap<hType, T>::XHeap()
{
}
/* constructor */
template<HeapType hType, typename T>
XHeap<hType, T>::XHeap(int mySize, XMem * myMem)
{
mem = myMem;
size = mySize;
count = 0;
if (mem == NULL)
items = new HeapNode<T>[mySize];
else
mem->Alloc(mem->devID, mySize * sizeof(T));
Init(mySize, myMem);
}
/* deconstructor */
......@@ -50,6 +50,19 @@ XHeap<hType, T>::~XHeap()
}
template<HeapType hType, typename T>
void XHeap<hType, T>::Init(int mySize, XMem * myMem)
{
mem = myMem;
size = mySize;
count = 0;
if (mem == NULL)
items = new HeapNode<T>[mySize];
else
mem->Alloc(mem->devID, mySize * sizeof(T));
}
template<HeapType hType, typename T>
void XHeap<hType, T>::Clear(T initValue)
{
count = 0;
......@@ -89,10 +102,24 @@ _XINLINE_ HeapNode<T> XHeap<hType, T>::End()
template<HeapType hType, typename T>
_XINLINE_ void XHeap<hType, T>::Push(HeapNode<T> node)
{
//CheckNTErrors((count < size), "Heap is full!");
items[count] = node;
Up(count);
count++;
if (count < size) {
items[count] = node;
Up(count);
count++;
}
else if(count == size){
HeapNode<T> & item0 = items[0];
if (hType == MIN_HEAP && item0.value >= node.value)
return;
else if (hType == MAX_HEAP && item0.value <= node.value)
return;
items[0] = node;
Down(0);
}
else {
ShowNTErrors("Overflow of the heap!");
}
}
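/* With the bounded behavior above, a fixed-size MIN_HEAP keeps the k
   largest items pushed so far: once the heap is full, a new node evicts
   the current minimum only if its value is larger. A minimal sketch of
   top-4 selection (illustration only):

       XHeap<MIN_HEAP, float> heap(4);
       float scores[6] = {0.1F, 0.9F, 0.4F, 0.7F, 0.8F, 0.2F};
       for (int i = 0; i < 6; i++)
           heap.Push(HeapNode<float>(i, scores[i]));
       // the heap now holds the four largest scores: 0.4, 0.7, 0.8 and 0.9
*/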
/* replace the top-most item and update the heap */
......@@ -107,7 +134,7 @@ _XINLINE_ void XHeap<hType, T>::ReplaceTop(HeapNode<T> node)
template<HeapType hType, typename T>
_XINLINE_ HeapNode<T> XHeap<hType, T>::Pop()
{
//CheckNTErrors((size > 0), "Empty heap!");
CheckNTErrors(count > 0, "Empty heap!");
HeapNode<T> node = items[0];
items[0] = items[count - 1];
count--;
......
......@@ -39,7 +39,7 @@ template <typename T>
struct HeapNode
{
/* node index */
int index;
long long index;
/* value of the node */
T value;
......@@ -52,9 +52,16 @@ struct HeapNode
HeapNode(int i, T v)
{
index = i;
index = (long long)i;
value = v;
};
HeapNode(void * i, T v)
{
index = (long long)i;
value = v;
}
};
/* a heap that keeps a data array of T */
......@@ -76,11 +83,17 @@ public:
public:
/* constructor */
XHeap();
/* constructor */
XHeap(int mySize, XMem * myMem = NULL);
/* deconstructor */
~XHeap();
/* initialization */
void Init(int mySize, XMem * myMem = NULL);
/* clear the data */
void Clear(T initValue);
......@@ -107,6 +120,9 @@ public:
/* move item k up the tree */
void Up(int k);
/* how many items are kept in the heap */
inline int Count() { return count; };
};
} /* end of the nts (NiuTrans.Tensor) namespace */
......
......@@ -300,9 +300,9 @@ void XLink::MakeLink(const XTensor * t1, const XTensor * t2, XTensor * h, int id
if(h == NULL)
return;
XList list(2);
list.Add(t1);
list.Add(t2);
TensorList list(2);
list.Add((XTensor*)t1);
list.Add((XTensor*)t2);
MakeLink(&list, h, id);
}
......@@ -320,10 +320,10 @@ void XLink::MakeLink(const XTensor * t1, const XTensor * t2, const XTensor * t3,
if (h == NULL)
return;
XList list(3);
list.Add(t1);
list.Add(t2);
list.Add(t3);
TensorList list(3);
list.Add((XTensor*)t1);
list.Add((XTensor*)t2);
list.Add((XTensor*)t3);
MakeLink(&list, h, id);
}
......@@ -334,7 +334,7 @@ create a hyper edge with a list of tensors and a output tensor
>> h - head tensor
>> id - id of the edge type
*/
void XLink::MakeLink(const XList * list, XTensor * h, int id)
void XLink::MakeLink(const TensorList * list, XTensor * h, int id)
{
/* forward */
XLink &income = h->income;
......@@ -368,7 +368,7 @@ create a hyper edge with a input tensors and a list of output tensors
>> list - a list of output tensors
>> id - id of the edge type
*/
void XLink::MakeLink(XTensor * t, XList * list, int id)
void XLink::MakeLink(XTensor * t, TensorList * list, int id)
{
/* forward */
for(int i = 0; i < list->count; i++){
......@@ -624,7 +624,7 @@ void XLink::CopyIncoming(const XTensor * reference, XTensor * target)
ClearIncoming(target);
int tailNum = reference->income.tailNum;
XList tails(tailNum);
TensorList tails(tailNum);
for(int i = 0; i < tailNum; i++){
XTensor * tail = (XTensor*)reference->income.tails[i];
tails.Add(tail);
......@@ -743,7 +743,7 @@ search for a node in a top-down manner by its name
>> top - the top most node
<< return - the node we found
*/
/*XTensor * XLink::SearchNode(XTensor * top, const char * name)
XTensor * XLink::SearchNode(XTensor * top, const char * name)
{
if(!strcmp(top->name, name))
return top;
......@@ -758,7 +758,7 @@ search for a node in a top-down manner by its name
}
return NULL;
}*/
}
} // namespace nts(NiuTrans.Tensor)
......
......@@ -138,17 +138,17 @@ struct XLink
static
void MakeLink(const XTensor * t1, const XTensor * t2, XTensor * h, int id);
/* create a hyper edge with two input tensors and a output tensor */
/* create a hyper edge with three input tensors and a output tensor */
static
void MakeLink(const XTensor * t1, const XTensor * t2, const XTensor * t3, XTensor * h, int id);
/* create a hyper edge with a list of input tensors and a output tensor */
static
void MakeLink(const XList * list, XTensor * h, int id);
void MakeLink(const TensorList * list, XTensor * h, int id);
/* create a hyper edge with a input tensors and a list of output tensors */
static
void MakeLink(XTensor * h, XList * list, int id);
void MakeLink(XTensor * h, TensorList * list, int id);
/* add a parameter */
static
......@@ -191,8 +191,8 @@ struct XLink
void ShowNode(FILE * file, XTensor * node);
/* search a node in a top-down manner by its name */
//static
//XTensor * SearchNode(XTensor * top, const char * name);
static
XTensor * SearchNode(XTensor * top, const char * name);
};
} // namespace nts(NiuTrans.Tensor)
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* Copyright (C) 2019, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
......@@ -15,32 +15,31 @@
* limitations under the License.
*/
/*
*
* Implementation of list that keeps data items
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-04-17
* The first coding job this year!
*
*/
#ifndef __XLIST_H__
#define __XLIST_H__
/*
*
* Implementation of template list that keeps data items
*
* $Created by: HU Chi (huchinlp@foxmail.com)
*
*/
#include "XMem.h"
#include "XGlobal.h"
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
#ifndef __TensorList_H__
#define __TensorList_H__
typedef int (* ListCompare)(const void * item1, const void * item2);
/* the XList class */
class XList
{
/* the nts (NiuTrans.Tensor) namespace */
namespace nts {
/* the TensorListBase class */
template <typename T>
struct TensorListBase {
public:
/* data items */
void ** items;
T *items;
/* number of items */
int count;
......@@ -49,56 +48,88 @@ public:
int maxNum;
/* the memory pool for data array allocation */
XMem * mem;
/* indicates whether data items are integers */
bool isIntList;
XMem* mem;
public:
/* constructor */
XList();
TensorListBase();
/* constructor */
XList(int myMaxNum, bool isIntListOrNot = false);
TensorListBase(int myMaxNum);
/* constructor */
XList(int myMaxNum, XMem * myMem, bool isIntListOrNot = false);
TensorListBase(int myMaxNum, XMem* myMem);
/* de-constructor */
~XList();
/* utilities */
void Create(int myMaxNum, XMem * myMem);
void Add(const void * item);
void Add(void ** inputItems, int inputItemCount);
void AddList(XList * l);
void AddInt(int i);
void Insert(int pos, void * item);
void * GetItem(int i) const;
int GetItemInt(int i);
void SetItem(int i, void * item);
void SetItemInt(int i, int item);
int FindFirst(void * item);
~TensorListBase();
/* add an item into the list */
void Add(T&& item);
/* add an item into the list */
void Add(const T& item);
/* add a number of items into the list */
void Add(T* inputItems, int inputItemCount);
/* append a list to the current list */
void AddList(TensorListBase* l);
/* insert an item to the given position of the list */
void Insert(int pos, const T& item);
/* insert an item to the given position of the list */
void Insert(int pos, T&& item);
/* get the item at position i */
T& GetItem(int i) const;
/* set the item at position i */
void SetItem(int i, const T& item);
/* set the item at position i */
void SetItem(int i, T&& item);
/* find the position of the first matched item */
int FindFirst(const T& item);
/* clear the data array */
void Clear();
void ClearStringList();
void Sort(int itemSize, ListCompare comp);
/* sort the list */
void Sort(int itemSize);
/* reverse the list */
void Reverse();
/* remove the item at position i */
void Remove(int i);
XList * Copy(XMem * myMem);
/* copy the list */
TensorListBase* Copy(XMem* myMem);
/* shuffle the list */
void Shuffle(int nround = 10, int beg = -1, int len = 0);
/* short */
_XINLINE_ void * Get(int i) {return GetItem(i);};
_XINLINE_ int GetInt(int i) {return GetItemInt(i);};
_XINLINE_ void Set(int i, void * item) {SetItem(i, item);};
_XINLINE_ void SetInt(int i, int item) {SetItemInt(i, item);};
T& operator[] (int i) {
return GetItem(i);
};
T& Get(int i) { return GetItem(i); };
void Set(int i, T item) { SetItem(i, item); };
};
extern XList NULLList;
struct XTensor;
typedef TensorListBase<int> IntList;
typedef TensorListBase<char> CharList;
typedef TensorListBase<char*> StrList;
typedef TensorListBase<long> LongList;
typedef TensorListBase<float> FloatList;
typedef TensorListBase<short> ShortList;
typedef TensorListBase<void*> XList;
typedef TensorListBase<XTensor*> TensorList;
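/* A minimal usage sketch of the specializations above (illustration only;
   t1 and t2 are assumed to be initialized XTensor objects):

       TensorList list(2);
       list.Add(&t1);
       list.Add(&t2);
       XTensor * first = list[0];

       IntList ids(8);
       ids.Add(42);
       int id = ids[0];   // 42
*/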
}
/* end of the nts (NiuTrans.Tensor) namespace */
} /* end of the nts (NiuTrans.Tensor) namespace */
#endif
#endif // __TensorList_H__
......@@ -34,6 +34,11 @@ namespace nts{
int testxmemid = 0;
void * recordp = NULL;
/*
for managing the memories
*/
XMemManager GMems;
XMem * GMem;
/* constructor */
......@@ -48,6 +53,7 @@ XMem::XMem()
strcpy(name, "xmem");
signature = 0;
mergeFreeOTF = true;
isInitialized = false;
}
/*
......@@ -58,7 +64,7 @@ constructor
>> myMode - mode of running the memory pool
UNI_FREE: free all the space at the end of using the memory pool
FREE_ON_THE_FLY: normal "malloc" and "free" mode
>> myBlockSize - size of memory block
>> myBlockSize - size of a memory block
>> myBlockNum - number of memory blocks
>> myBufSize - size of buffer
*/
......@@ -103,7 +109,7 @@ initialize it
>> myMode - mode of running the memory pool
UNI_FREE: free all the space at the end of using the memory pool
FREE_ON_THE_FLY: normal "malloc" and "free" mode
>> myBlockSize - size of memory block
>> myBlockSize - size of a memory block
>> myBlockNum - number of memory blocks
>> myBufSize - size of buffer
*/
......@@ -164,6 +170,7 @@ void XMem::Initialize(int myDevID, MEMPOOL_MODE myMode, MTYPE myBlockSize, int m
#endif
signature++;
isInitialized = true;
}
/* free memory */
......@@ -216,9 +223,9 @@ void XMem::Free(int myDevID, void * mem)
}
}
/*
get signature
<< return - return the signature
/*
get the signature
<< return - the signature
*/
MTYPE XMem::GetSignature()
{
......@@ -226,7 +233,7 @@ MTYPE XMem::GetSignature()
}
/*
use string as the name of the memory pool
set the name of the memory pool
>> myName - name of the memory pool
*/
void XMem::SetName(const char * myName)
......@@ -259,7 +266,7 @@ void XMem::SetDevice(int myDevID)
}
/*
switch to the device (with fast cuda execution mode) we want to work
switch to the device (with fast cuda execution mode) we intend to work on
>> myDevID - device id(-1: CPU memory, >=0: GPU device ID)
*/
void XMem::SetDeviceFast(int myDevID)
......@@ -275,7 +282,7 @@ void XMem::SetDeviceFast(int myDevID)
}
/*
run in static mode
run in the static mode
>> myIsStatic - specify if the memory allocation is static
*/
void XMem::SetStaticMode(bool myIsStatic)
......@@ -300,7 +307,7 @@ void XMem::SetComputationMode(bool myIsForComputation)
cublasDestroy(cublasHandle);
if(myIsForComputation)
CheckNTErrors((enum curandStatus)cublasCreate(&cublasHandle) == CURAND_STATUS_SUCCESS,
"Cannot create the cublas handle.");
"Cannot create the cublas handle.");
SetDevice(devIDBackup);
#endif
......@@ -316,11 +323,11 @@ void XMem::SetIndex(INT_64 indexSize, MTYPE minSizeFirst, int minSizeNum)
{
delete[] memIndex;
delete[] memIndex2;
delete[] minSizeIndex;
delete[] minSizeIndex;
nodeNum = indexSize;
nodeNumUsed = minSizeNum * 2;
indexEntryNum = minSizeNum;
nodeNum = indexSize;
nodeNumUsed = minSizeNum * 2;
indexEntryNum = minSizeNum;
memIndex = new MPieceNode[nodeNum];
memset(memIndex, 0, sizeof(MPieceNode) * nodeNum);
......@@ -328,12 +335,12 @@ void XMem::SetIndex(INT_64 indexSize, MTYPE minSizeFirst, int minSizeNum)
memIndex2 = new MPieceNode[nodeNum];
memset(memIndex2, 0, sizeof(MPieceNode) * nodeNum);
minSizeIndex = new MTYPE[indexEntryNum];
memset(minSizeIndex, 0, sizeof(MTYPE) * indexEntryNum);
minSizeIndex = new MTYPE[indexEntryNum];
memset(minSizeIndex, 0, sizeof(MTYPE) * indexEntryNum);
minSizeIndex[0] = minSizeFirst;
for(int i = 1; i < indexEntryNum; i++)
minSizeIndex[i] = minSizeIndex[i - 1] * 2;
minSizeIndex[0] = minSizeFirst;
for(int i = 1; i < indexEntryNum; i++)
minSizeIndex[i] = minSizeIndex[i - 1] * 2;
indexOffset = GetMSB(minSizeFirst);
}
......@@ -752,8 +759,8 @@ void * XMem::AllocStandard(int myDevID, MTYPE mySize, bool myIsRebuiltIndex)
/* if all index nodes are used, we rebuild the index to release the nodes that are free */
if(nodeNumUsed == nodeNum){
RebuildIndex();
CheckNTErrors(nodeNumUsed < nodeNum, "No enough index nodes for the memory pool!");
RebuildIndex();
CheckNTErrors(nodeNumUsed < nodeNum, "No enough index nodes for the memory pool!");
}
/*if(testxmemid == 30){
......@@ -956,8 +963,8 @@ release a piece of memory as "free"
*/
void XMem::ReleaseStandard(int myDevID, void * p, MTYPE size)
{
if(p == NULL)
return;
if(p == NULL)
return;
if(size <= minSizeIndex[0])
size = minSizeIndex[0];
......@@ -1087,7 +1094,7 @@ void XMem::RebuildIndex()
block->mem = NULL;
}
else{
/* if the block is in use, we build the index */
/* if the block is in use, we build the index */
int pieceCount = 0;
MTYPE size = 0;
MHeader * newLast = NULL;
......@@ -1488,4 +1495,179 @@ cublasHandle_t * XMem::GetCublasHandle()
#endif
/* constructor */
XMemManager::XMemManager()
{
Initialize();
}
/* de-constructor */
XMemManager::~XMemManager()
{
}
/* get the size of available CPU memory */
MTYPE XMemManager::GetAvailableMemory()
{
unsigned long freeMem = 0;
#if __APPLE__
int mib[2] = {CTL_HW, HW_MEMSIZE};
unsigned int namelen = sizeof(mib) / sizeof(mib[0]);
unsigned long long size;
size_t len = sizeof(size);
if (sysctl(mib, namelen, &size, &len, NULL, 0) < 0){
ShowNTErrors("Cannot get memory size on Mac!");
}
else{
return size;
}
#elif _WIN32
MEMORYSTATUSEX memoryStatus;
memoryStatus.dwLength = sizeof(memoryStatus);
if (GlobalMemoryStatusEx(&memoryStatus)){
freeMem = memoryStatus.ullAvailPhys;
}
#else
long pages = sysconf(_SC_AVPHYS_PAGES);
long page_size = sysconf(_SC_PAGE_SIZE);
freeMem = pages * page_size;
#endif
return (MTYPE)freeMem;
}
/* get GPU memory size */
MTYPE XMemManager::GetAvailableGPUMemory(int devID)
{
size_t freeMem = 0;
#ifdef USE_CUDA
size_t totalMem = 0;
cudaSetDevice(devID);
if (cudaMemGetInfo(&freeMem, &totalMem) != cudaSuccess){
XPRINT(0, stderr, "cannot get GPU memory information.");
exit(1);
}
#endif
return (MTYPE)freeMem;
}
/* get buffer size */
void XMemManager::GetBufferSize(MTYPE freeMem, MTYPE * myBufSize)
{
*myBufSize = 0;
if (freeMem >= MILLION * 128){
*myBufSize = MILLION * 32;
if (freeMem >= MILLION * 256){
*myBufSize = MILLION * 64;
if (freeMem >= MILLION * 512){
*myBufSize = MILLION * 128;
if (freeMem >= MILLION * 1024) {
*myBufSize = MILLION * 256;
if (freeMem >= MILLION * 2048)
*myBufSize = MILLION * 512;
}
}
}
}
}
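/* A worked example of the cascade above: freeMem = MILLION * 1536 clears
   the 128M, 256M, 512M and 1024M thresholds but not the 2048M one, so
   *myBufSize ends up as MILLION * 256; only machines with more than 2GB
   free reach the 512M cap. */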
/* initialize it and set the global memory information */
void XMemManager::Initialize()
{
srand((unsigned int)time(NULL));
Free();
/* CPUs (we actually do not care about how many CPUs are in use) */
nCPUMem = 1;
/* GPUs */
nGPUMem = 0;
#ifdef USE_CUDA
if (cudaGetDeviceCount(&nGPUMem) != cudaSuccess) {
XPRINT(0, stderr, "cannot get GPU information.");
exit(1);
}
#endif
}
/* free it */
void XMemManager::Free()
{
for (int i = 0; i < MAX_CPU_MEM_NUM; i++)
CPUMems[i].Free();
for (int i = 0; i < MAX_GPU_MEM_NUM; i++)
GPUMems[i].Free();
}
/* get global memory pool */
XMem * XMemManager::GetMem(const int devID)
{
XMem * mem = NULL;
if (devID < 0){
if(!CPUMems[0].isInitialized){
MTYPE freeMem = GetAvailableMemory();
MTYPE myBufSize = 0;
GetBufferSize(freeMem, &myBufSize);
CPUMems[0].Initialize(-1, FREE_ON_THE_FLY,
MIN_BLOCK_SIZE_FOR_MEMPOOL,
MIN_BLOCK_NUM_FOR_MEMPOOL,
myBufSize);
}
mem = CPUMems;
}
else{
if (devID < nGPUMem){
if(!GPUMems[devID].isInitialized){
MTYPE freeMem = GetAvailableGPUMemory(devID);
MTYPE myBufSize = 0;
GetBufferSize(freeMem, &myBufSize);
GPUMems[devID].Initialize(devID, FREE_ON_THE_FLY,
MIN_BLOCK_SIZE_FOR_MEMPOOL,
MIN_BLOCK_NUM_FOR_MEMPOOL,
myBufSize);
}
mem = GPUMems + devID;
}
else{
XPRINT1(0, stderr, "Cannot get the memory (%d). Please check your device id!", devID);
}
}
return mem;
}
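/* A minimal usage sketch (illustration only): pools are created lazily on
   the first request, so callers simply ask the global manager for a pool
   and allocate from it.

       XMem * mem = GMems.GetMem(-1);              // the CPU pool
       void * p = mem->Alloc(mem->devID, 1024);    // 1KB from that pool
*/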
/* get global memory size */
int XMemManager::GetMemSize(const int devID, MTYPE * myBlockSize, int * myBlockNum, MTYPE * myBufSize)
{
XMem * mem = GetMem(devID);
int result = 0;
if (mem != NULL){
*myBlockSize = mem->maxBlockSize;
*myBlockNum = mem->blockNum;
*myBufSize = mem->bufSize;
result = 1;
}
return result;
}
/* show memory information */
void XMemManager::ShowMemInfo()
{
XPRINT(1, stderr, "Memory Information:\n");
MTYPE myBlockSize, myBufSize;
int myBlockNum;
for(int i = 0; i < nCPUMem; i++){
GetMemSize(-1, &myBlockSize, &myBlockNum, &myBufSize);
XPRINT3(1, stderr, " - id:-1 CPU, blockSize:%lld, blockNum:%d, bufSize:%lld\n", myBlockSize, myBlockNum, myBufSize);
}
for(int i = 0; i < nGPUMem; i++){
GetMemSize(i, &myBlockSize, &myBlockNum, &myBufSize);
XPRINT4(1, stderr, " - id:%2d GPU, blockSize:%lld, blockNum:%d, bufSize:%lld\n", i, myBlockSize, myBlockNum, myBufSize);
}
}
} /* end of the nts (NiuTrans.Tensor) namespace */
......@@ -39,6 +39,15 @@
#include <curand.h>
#endif
#ifdef __APPLE__
#include <sys/types.h>
#include <sys/sysctl.h>
#elif WIN32
#include <windows.h>
#else
#include <unistd.h>
#endif
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
......@@ -51,8 +60,10 @@ typedef long long INT_64;
#define CUDA_HOST_MALLOC 1
#define MY_PITCH CUDA_PITCH
#define BUF_PITCH 256
#define MIN_BLOCK_SIZE_FOR_MEMPOOL 128 * 1024 * 1024
#define MIN_BLOCK_SIZE_FOR_MEMPOOL 256 * 1024 * 1024
#define MIN_BLOCK_NUM_FOR_MEMPOOL 1024
#define MAX_CPU_MEM_NUM 16
#define MAX_GPU_MEM_NUM 16
/*
mode of running a memory pool
......@@ -202,6 +213,9 @@ public:
MTYPE curUsedPin;
MTYPE bufUsedPin;
/* indicates whether the memory pool is initialized */
bool isInitialized;
#ifdef USE_CUDA
/* handle used for cublas */
cublasHandle_t cublasHandle;
......@@ -413,6 +427,61 @@ public:
};
/*
a class for the management of memory
*/
class XMemManager
{
private:
/* cpu memory pool information */
XMem CPUMems[MAX_CPU_MEM_NUM];
/* number of cpu memory pools */
int nCPUMem;
/* gpu memory pool information */
XMem GPUMems[MAX_GPU_MEM_NUM];
/* number of gpu memory pools */
int nGPUMem;
public:
/* constructor */
XMemManager();
/* de-constructor */
~XMemManager();
/* get the size of available CPU memory */
MTYPE GetAvailableMemory();
/* get GPU memory size */
MTYPE GetAvailableGPUMemory(int devID);
/* get buffer size */
void GetBufferSize(MTYPE freeMem, MTYPE * myBufSize);
/* initialize it and set the global memory information */
void Initialize();
/* free it */
void Free();
/* get global memory pool */
XMem * GetMem(const int devID);
/* get global memory size */
int GetMemSize(const int devID, MTYPE * myBlockSize, int * myBlockNum, MTYPE * myBufSize);
/* show memory information */
void ShowMemInfo();
};
/* managing the memories */
extern XMemManager GMems;
extern XMem * GMem;
extern int testxmemid;
......
......@@ -59,6 +59,8 @@ const char * GetOPName(int type)
return "M_DIV";
else if (type == MATH_DIVDIM)
return "M_DIVDIM";
else if (type == MATH_MASK)
return "M_MASK";
else if (type == MATH_MATRIXMUL)
return "M_MATRIXMUL";
else if (type == MATH_MATRIXMULBATCHED)
......@@ -108,7 +110,7 @@ const char * GetOPName(int type)
else if (type == REDUCE_REDUCEVARIANCE)
return "R_REDUCEVARIANCE";
}
else if ((type & DATA_BASE) != 0) {
else if ((type & DATA_BASE) != 0) {
if (type == GETANDSET_CONVERTDATATYPE)
return "G_CONVERTDATATYPE";
else if (type == GETANDSET_INDEXTOONEHOT)
......@@ -118,8 +120,10 @@ const char * GetOPName(int type)
else if (type == GETANDSET_SELECT)
return "G_SELECT";
}
else if ((type & SHAPE_BASE) != 0) {
if (type == MOVEMENT_COPYINDEXED)
else if ((type & SHAPE_BASE) != 0){
if (type == GETANDSET_SELECT)
return "G_SELECT";
else if (type == MOVEMENT_COPYINDEXED)
return "M_COPYINDEXED";
else if (type == MOVEMENT_COPYVALUES)
return "M_COPYVALUES";
......
......@@ -48,7 +48,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_CLIP MATH_ROUND + 1
#define MATH_DIV MATH_CLIP + 1
#define MATH_DIVDIM MATH_DIV + 1
#define MATH_MATRIXMUL MATH_DIVDIM + 1
#define MATH_MASK MATH_DIVDIM + 1
#define MATH_MATRIXMUL MATH_MASK + 1
#define MATH_MATRIXMULBATCHED MATH_MATRIXMUL + 1
#define MATH_MULTIPLY MATH_MATRIXMULBATCHED + 1
#define MATH_MULTIPLYDIM MATH_MULTIPLY + 1
......@@ -85,6 +86,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define GETANDSET_SELECT GETANDSET_ONEHOTTOINDEX + 1
#define SHAPE_BASE DATA_BASE * 2
#define MOVEMENT SHAPE_BASE + 1
#define MOVEMENT_COPYINDEXED MOVEMENT + 1
#define MOVEMENT_COPYVALUES MOVEMENT_COPYINDEXED + 1
......
......@@ -146,7 +146,7 @@ run a set of jobs in parallel
>> jobArgs - the list of arguments for each job
>> sleepTime - time to sleep (in ms) for each round
*/
void XPRunner::Run(XList * jobFunctions, XList * jobArgs, float sleepTime)
void XPRunner::Run(TensorList * jobFunctions, TensorList * jobArgs, float sleepTime)
{
if(threadNum <= 0){
XPRINT(1, stderr, "Error! No threads were created!\n");
......@@ -195,7 +195,7 @@ void XPRunner::Run(XList * jobFunctions, XList * jobArgs, float sleepTime)
TFunction function = (TFunction)jobFunctions->GetItem(jobArgs->count - c);
/* the arguments that are passed to the function */
volatile XList * args = (XList*)jobArgs->GetItem(jobArgs->count - c);
volatile TensorList * args = (TensorList*)jobArgs->GetItem(jobArgs->count - c);
/* thread */
XThread * thread = threads + availableThreads[i];
......
......@@ -106,7 +106,7 @@ public:
void KillThreads();
/* run a set of jobs in parallel */
void Run(XList * jobFunctions, XList * jobArgs, float sleepTime = 0);
void Run(TensorList * jobFunctions, TensorList * jobArgs, float sleepTime = 0);
/* get the number of parallel jobs to run */
int GetJobNum(int size);
......
......@@ -42,7 +42,7 @@ job item used in queues
JobQueueNode::JobQueueNode()
{
job = NULL;
args = new XList(1);
args = new TensorList(1);
}
/* de-constructor */
......@@ -67,7 +67,7 @@ XQueue::XQueue(int mySize)
head = 0;
tail = 0;
isJobQueue = false;
jobDequeuerArgs = new XList(1);
jobDequeuerArgs = new TensorList(1);
jobDequeuerBreak = false;
runningJobCount = 0;
jobStream = NULL;
......@@ -188,8 +188,10 @@ void XQueue::RunJobConsumer(int jobDevID)
isJobQueue = true;
jobDequeuerArgs->Clear();
jobDequeuerArgs->Add(this);
jobDequeuerArgs->Add(jobDevID >= 0 ? devids + jobDevID : &cpuid);
// warning: these casts smuggle non-tensor pointers through a TensorList and may cause unknown errors
jobDequeuerArgs->Add((XTensor*)this);
jobDequeuerArgs->Add(jobDevID >= 0 ? (XTensor*)(devids + jobDevID) : (XTensor*)&cpuid);
jobDequeuer.function = (TFunction)DequeueJobs;
jobDequeuer.argv = jobDequeuerArgs;
......@@ -211,7 +213,7 @@ void XQueue::StopJobConsumer()
}
/* add a job item to process */
void XQueue::EnqueueJob(void * job, XList * jobArgs)
void XQueue::EnqueueJob(void * job, TensorList * jobArgs)
{
MUTEX_LOCK(jobQueueMutex);
runningJobCount++;
......@@ -225,7 +227,7 @@ void XQueue::EnqueueJob(void * job, XList * jobArgs)
}
/* job item consumer */
void XQueue::DequeueJobs(XList * args)
void XQueue::DequeueJobs(TensorList * args)
{
CheckNTErrors((args->count == 2), "Illegal arguments!");
......
......@@ -52,7 +52,7 @@ public:
void * job;
/* arguments of the job */
XList * args;
TensorList * args;
public:
/* constructor */
......@@ -102,7 +102,7 @@ private:
XThread jobDequeuer;
/* argument list of jobDequeuer */
XList * jobDequeuerArgs;
TensorList * jobDequeuerArgs;
/* indicates whether jobDequeuer stops */
bool jobDequeuerBreak;
......@@ -141,11 +141,11 @@ public:
void StopJobConsumer();
/* add a job item to process */
void EnqueueJob(void * job, XList * jobArgs);
void EnqueueJob(void * job, TensorList * jobArgs);
/* job item consumer */
static
void DequeueJobs(XList * args);
void DequeueJobs(TensorList * args);
/* get the break flag */
bool GetJobBreak();
......
......@@ -85,7 +85,7 @@ namespace nts{
#endif
typedef void (*TFunction) (volatile XList*);
typedef void (*TFunction) (volatile TensorList*);
/*
This is a class that wraps the standard implementation of threading
......@@ -133,7 +133,7 @@ public:
/* arguments (for the function to run) */
volatile
XList * argv;
TensorList * argv;
/* a flag to break */
volatile
......
......@@ -36,13 +36,9 @@
#include "arithmetic/MatrixMulBatched.h"
#include "arithmetic/Multiply.h"
#include "arithmetic/MultiplyDim.h"
#include "arithmetic/Negate.h"
#include "arithmetic/Sign.h"
#include "arithmetic/Sub.h"
#include "arithmetic/SubDim.h"
#include "arithmetic/Sum.h"
#include "arithmetic/SumByColumnTV.h"
#include "arithmetic/SumByColumnVT.h"
#include "arithmetic/SumDim.h"
#include "arithmetic/XTensorBLAS.h"
#include "arithmetic/MulAndShift.h"
......@@ -56,7 +52,6 @@
#include "math/Clip.h"
#include "math/Compare.h"
#include "math/Normalize.h"
#include "math/Power.h"
#include "math/ScaleAndShift.h"
#include "math/Unary.h"
......
......@@ -21,6 +21,7 @@
#include "../../XTensor.h"
#include "../../XName.h"
#include "../../XUtility.h"
#include "Div.h"
#include "Div.cuh"
#include "DivDim.h"
......@@ -41,12 +42,15 @@ where i is the index of the item
*/
void _Div(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
{
int leadingDimRDI = a->order - leadingDim - 1;
CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
"Unmatched tensors in multiplication!");
CheckNTErrors((a->order == b->order && a->order == c->order),
"Unmatched tensors!");
CheckDev(a->devID, b->devID);
int leadingDimRDI = a->order - leadingDim - 1;
#ifdef USE_CUDA
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
_CudaDiv(a, b, c, alpha, leadingDim);
......@@ -138,6 +142,23 @@ void _DivMe(XTensor * a, const XTensor * b, DTYPE alpha, int leadingDim)
_Div(a, b, a, alpha, leadingDim);
}
/*
element-wise division of two tensors (do it on site)
keep the result in the input tensor a and return nothing
a(i) = a(i)/b(i) + \alpha * a(i)
where i is the index of the item
>> a - tensor a (where keep the result)
>> b - tensor b
>> alpha - the coefficient
>> leadingDim - the dimension along which we perform broadcasting
*/
void DivMe(XTensor& a, const XTensor& b, DTYPE alpha, int leadingDim)
{
_Div(&a, &b, &a, alpha, leadingDim);
}
/*
return a dimension if the division is performed as DivDim (in more details in DivDim.h)
>> a - a tensor
......@@ -225,9 +246,8 @@ where i is the index of the item
>> c - result tensor
>> alpha - the coefficient
>> leadingDim - the dimension along which we perform broadcasting
>> requireLink - if add operation to network
*/
void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadingDim, bool requireLink)
void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadingDim)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
......@@ -241,7 +261,7 @@ void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadin
/* call _Div function */
_Div(&a, &b, &c, 0, leadingDim);
if (requireLink) {
if (c.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIV);
XLink::AddParamToHead(&c, alpha);
......@@ -252,7 +272,7 @@ void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadin
/* call _DivDim function */
_DivDim(&a, &b, &c, n, alpha);
if (requireLink) {
if (c.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
XLink::AddParamToHeadInt(&c, n);
......
......@@ -229,4 +229,4 @@ void _CudaDiv(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, in
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
} // namespace nts(NiuTrans.Tensor)
......@@ -40,6 +40,7 @@ a(i) = a(i)/b(i) + \alpha * a(i)
where i is the index of the element
*/
void _DivMe(XTensor * a, const XTensor * b, DTYPE alpha = 0.0, int leadingDim = 0);
void DivMe(XTensor & a, const XTensor & b, DTYPE alpha = 0.0, int leadingDim = 0);
/*
element-wise division of two tensors (return an XTensor structure)
......@@ -54,7 +55,7 @@ element-wise division of two tensors:
c(i) = a(i)/b(i) + \alpha * c(i)
where i is the index of the element
*/
void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha = 0.0, int leadingDim = 0, bool requireLink = false);
void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha = 0.0, int leadingDim = 0);
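/*
A minimal usage sketch of the two forms above (illustration only; a and b
are assumed to be initialized tensors of the same shape):

    XTensor c;
    Div(a, b, c);    // c(i) = a(i)/b(i)
    DivMe(a, b);     // a(i) = a(i)/b(i), on site
*/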
} // namespace nts(NiuTrans.Tensor)
......
......@@ -19,10 +19,12 @@
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-15
*/
#include <math.h>
#include "Div.h"
#include "DivDim.h"
#include "DivDim.cuh"
#include "../../XName.h"
#include "../../XUtility.h"
#include "../movement/CopyValues.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -42,6 +44,8 @@ i.e., a is divided with b by broadcasting
*/
void _DivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha)
{
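    /* wrap negative dimension indices so that, e.g., n = -1 refers to the last dimension of a */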
n = MODX(n, a->order);
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in division!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
......@@ -50,6 +54,8 @@ void _DivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alp
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
CheckDev(a->devID, b->devID);
if(XTensor::IsSameShaped(a, b)){
_Div(a, b, c, alpha);
return;
......@@ -151,6 +157,8 @@ XTensor DivDim(const XTensor &a, const XTensor &b, int n, DTYPE alpha)
{
XTensor c(&a);
c.SetTMPFlag();
n = MODX(n, a.order);
/* call _Div function */
_DivDim(&a, &b, &c, n, alpha);
......@@ -175,9 +183,8 @@ i.e., a is divided with b by broadcasting
>> c - where we put the result
>> n - the dimension index
>> alpha - the scaling factor
>> requireLink - if add operation to network
*/
void DivDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE alpha, bool requireLink)
void DivDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE alpha)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
......@@ -186,7 +193,7 @@ void DivDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE alpha,
/* call _Div function */
_DivDim(&a, &b, &c, n, alpha);
if (requireLink) {
if (c.enableGrad == true) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
XLink::AddParamToHeadInt(&c, n);
......
......@@ -59,7 +59,7 @@ c(i) = a/b + \alpha * c
where the size of b is equal to the n-th dimension of a,
i.e., a is divided with b by broadcasting
*/
void DivDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE alpha = (DTYPE)0.0, bool requireLink = false);
void DivDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE alpha = (DTYPE)0.0);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -130,6 +130,17 @@ void _MaskMe(XTensor * a, const XTensor * mask, DTYPE alpha)
}
/*
mask entries of a given tensor (on site):
a(i) = a(i) if mask(i) is non-zero
a(i) = alpha if mask(i) = 0
where i is the index of the element
*/
void MaskMe(XTensor& a, const XTensor& mask, DTYPE alpha)
{
_Mask(&a, &mask, &a, alpha);
}
/*
mask entries of a given tensor (return an XTensor structure):
a(i) = a(i) if mask(i) is non-zero
a(i) = alpha if mask(i) = 0
......@@ -140,16 +151,35 @@ XTensor Mask(const XTensor &a, const XTensor &mask, DTYPE alpha)
XTensor c(&a);
c.SetTMPFlag();
/* call _Sum function */
/* call _Mask function */
_Mask(&a, &mask, &c, alpha);
/* tensor connections */
//XLink::MakeLink(&a, &mask, &c, MATH_SUM);
//XLink::AddParamToHead(&c, alpha);
// TODO!!
ShowNTErrors("TODO!");
XLink::MakeLink(&a, &mask, &c, MATH_MASK);
XLink::AddParamToHead(&c, alpha);
return c;
}
/*
mask entries of a given tensor (put the result in the output tensor c):
c(i) = a(i) if mask(i) is non-zero
c(i) = alpha if mask(i) = 0
where i is the index of the element
*/
void Mask(const XTensor &a, const XTensor &mask, XTensor &c, DTYPE alpha)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
}
/* call _Mask function */
_Mask(&a, &mask, &c, alpha);
if (c.enableGrad) {
XLink::MakeLink(&a, &mask, &c, MATH_MASK);
XLink::AddParamToHead(&c, alpha);
}
}
}
\ No newline at end of file
......@@ -16,10 +16,10 @@
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2019-04-24
* I'll attend several conferences and workshops in the following weeks -
* busy days :(
*/
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2019-04-24
* I'll attend several conferences and workshops in the following weeks -
* busy days :(
*/
#ifndef __MASK_H__
#define __MASK_H__
......@@ -28,21 +28,22 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
/*
mask entries of a given tensor:
c(i) = a(i) if mask(i) is non-zero
c(i) = alpha if mask(i) = 0
where i is the index of the element
*/
void _Mask(const XTensor * a, const XTensor * mask, XTensor * c, DTYPE alpha);
void _Mask(const XTensor * a, const XTensor * mask, XTensor * c, DTYPE alpha = 0.0);
/*
/*
mask entries of a given tensor (on site):
a(i) = a(i) if mask(i) is non-zero
a(i) = alpha if mask(i) = 0
where i is the index of the element
*/
void _MaskMe(XTensor * a, const XTensor * mask, DTYPE alpha);
void _MaskMe(XTensor * a, const XTensor * mask, DTYPE alpha = 0.0);
void MaskMe(XTensor & a, const XTensor & mask, DTYPE alpha = 0.0);
/*
mask entries of a given tensor (return an XTensor structure):
......@@ -52,7 +53,14 @@ where i is the index of the element
*/
XTensor Mask(const XTensor &a, const XTensor &mask, DTYPE alpha = 0.0);
/*
mask entries of a given tensor (put the result in the output tensor c):
c(i) = a(i) if mask(i) is non-zero
c(i) = alpha if mask(i) = 0
where i is the index of the element
*/
void Mask(const XTensor &a, const XTensor &mask, XTensor &c, DTYPE alpha = 0.0);
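/*
A minimal usage sketch (illustration only; a and mask are assumed to be
tensors of the same shape, with mask holding 0/1 entries):

    XTensor c;
    Mask(a, mask, c, -1e9F);   // c(i) = a(i) where mask(i) != 0, else -1e9
*/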
} // namespace nts(NiuTrans.Tensor)
#endif // __MASK_H__
......@@ -106,9 +106,9 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
cBlockNum *= b->dimSizeRDI[i];
}
XList * aList = new XList(10);
XList * bList = new XList(10);
XList * cList = new XList(10);
TensorList * aList = new TensorList(10);
TensorList * bList = new TensorList(10);
TensorList * cList = new TensorList(10);
int aDimSize[2] = { -a->dimSizeRDI[1], a->dimSizeRDI[0] };
int bDimSize[2] = { -b->dimSizeRDI[1], b->dimSizeRDI[0] };
int cDimSize[2] = { -c->dimSizeRDI[1], c->dimSizeRDI[0] };
......@@ -200,7 +200,9 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
delete cList;
}
bool CheckMMulShape(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c)
bool CheckMMulShape(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c)
{
if (!(a && b && c))
return false;
......@@ -229,10 +231,13 @@ bool CheckMMulShape(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTen
dimSize[sub++] = bm;
for (int i = 0; i < order; i++) {
if (dimSize[i] != c->dimSize[i])
if (dimSize[i] != c->dimSize[i]) {
delete[] dimSize;
return false;
}
}
delete[] dimSize;
return true;
}
......@@ -355,11 +360,9 @@ XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
return c;
}
void MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
const XTensor &b, MATRIX_TRANS_TYPE transposedB, XTensor &c,
DTYPE alpha, XPRunner * parallelRunner, bool requireLink)
const XTensor &b, MATRIX_TRANS_TYPE transposedB, XTensor &c,
DTYPE alpha, DTYPE beta, XPRunner * parallelRunner)
{
CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have an order >= 2!");
......@@ -392,9 +395,9 @@ void MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
}
/* call _MatrixMul function */
_MatrixMul(&a, transposedA, &b, transposedB, &c, alpha, 0, parallelRunner);
_MatrixMul(&a, transposedA, &b, transposedB, &c, alpha, beta, parallelRunner);
if (requireLink) {
if (c.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL);
XLink::AddParamToHeadTrans(&c, transposedA);
......@@ -455,7 +458,7 @@ XTensor MatrixMul(const XTensor &a, const XTensor &b,
}
void MatrixMul(const XTensor &a, const XTensor &b, XTensor &c,
DTYPE alpha, XPRunner * parallelRunner, bool requireLink)
DTYPE alpha, XPRunner * parallelRunner)
{
CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have an order >= 2!");
......@@ -490,7 +493,7 @@ void MatrixMul(const XTensor &a, const XTensor &b, XTensor &c,
/* call _MatrixMul function */
_MatrixMul(&a, X_NOTRANS, &b, X_NOTRANS, &c, alpha, 0, parallelRunner);
if (requireLink) {
if (c.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL);
XLink::AddParamToHeadTrans(&c, X_NOTRANS);
......
......@@ -40,8 +40,11 @@ bj is the j-th element tensor of B, and c_{i,j} is the (i,j) elementtensor of th
C should be a tensor of z * x * n * m.
Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x * y.
*/
void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0,
XPRunner * parallelRunner = NULL);
/*
matrix multiplication (return an XTensor structure) c = trans(a) * trans(b) * alpha
......@@ -56,22 +59,28 @@ bj is the j-th element tensor of B, and c_{i,j} is the (i,j) elementtensor of th
C should be a tensor of z * x * n * m.
Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x * y.
*/
XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
TENSOR_DATA_TYPE dataType, DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
void MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
XTensor &c, DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL, bool requireLink = false);
XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
const XTensor &b, MATRIX_TRANS_TYPE transposedB,
DTYPE alpha = (DTYPE)1.0,
XPRunner * parallelRunner = NULL);
XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
const XTensor &b, MATRIX_TRANS_TYPE transposedB,
TENSOR_DATA_TYPE dataType, DTYPE alpha = (DTYPE)1.0,
XPRunner * parallelRunner = NULL);
void MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
const XTensor &b, MATRIX_TRANS_TYPE transposedB,
XTensor &c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0,
XPRunner * parallelRunner = NULL);
/* matrix multiplication with no transposition c = a * b * alpha*/
XTensor MatrixMul(const XTensor &a, const XTensor &b,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
void MatrixMul(const XTensor &a, const XTensor &b, XTensor &c,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL, bool requireLink = false);
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
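/*
A minimal usage sketch of the overloads above (illustration only; a is
assumed to be a 2 * 3 tensor and b a 3 * 4 tensor):

    XTensor c = MatrixMul(a, X_NOTRANS, b, X_NOTRANS);   // c is 2 * 4
*/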
} // namespace nts(NiuTrans.Tensor)
......
......@@ -54,15 +54,15 @@ void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
CheckNTErrors((a->order == 2 && b->order == 2 && c->order == 2),
"Input tensors must have a order = 2!");
int an = a->dimSize[0], am = a->dimSize[1];
int bn = b->dimSize[0], bm = b->dimSize[1];
int cn = c->dimSize[0], cm = c->dimSize[1];
int am2 = transposedA == X_TRANS ? an : am;
int an2 = transposedA == X_TRANS ? am : an;
int bm2 = transposedB == X_TRANS ? bn : bm;
int bn2 = transposedB == X_TRANS ? bm : bn;
int cm2 = cm;
int cn2 = cn;
int an = a->dimSize[0], am = a->dimSize[1];
int bn = b->dimSize[0], bm = b->dimSize[1];
int cn = c->dimSize[0], cm = c->dimSize[1];
int am2 = transposedA == X_TRANS ? an : am;
int an2 = transposedA == X_TRANS ? am : an;
int bm2 = transposedB == X_TRANS ? bn : bm;
int bn2 = transposedB == X_TRANS ? bm : bn;
int cm2 = cm;
int cn2 = cn;
CheckNTErrors((am2 == bn2 && an2 == cn2 && bm2 == cm2),
"Unmatched tensors in multiplication!");
......
......@@ -38,17 +38,23 @@ argument5: matrix a
argument6: matrix b
argument7: matrix c (c=a*b*\alpha + c*beta)
*/
void _MatrixMul2DMultiTheading(XList * args)
void _MatrixMul2DMultiTheading(TensorList * args)
{
int x1 = *(int*)args->GetItem(0);
int y1 = *(int*)args->GetItem(1);
int x2 = *(int*)args->GetItem(2);
int y2 = *(int*)args->GetItem(3);
XTensor * a = (XTensor*)args->GetItem(4);
XTensor * b = (XTensor*)args->GetItem(5);
XTensor * c = (XTensor*)args->GetItem(6);
DTYPE alpha = *(DTYPE*)args->GetItem(7);
DTYPE beta = *(DTYPE*)args->GetItem(8);
CheckNTErrors(args->count == 2, "invalid argument number!");
IntList * indexArgs = (IntList*)args->GetItem(0);
TensorList * matrixArgs = (TensorList*)args->GetItem(1);
CheckNTErrors(indexArgs->count == 4, "invalid argument number!");
CheckNTErrors(matrixArgs->count == 5, "invalid argument number!");
XTensor * a = matrixArgs->GetItem(0);
XTensor * b = matrixArgs->GetItem(1);
XTensor * c = matrixArgs->GetItem(2);
DTYPE alpha = *(DTYPE*)(matrixArgs->GetItem(3));
DTYPE beta = *(DTYPE*)(matrixArgs->GetItem(4));
int x1 = indexArgs->GetItem(0);
int y1 = indexArgs->GetItem(1);
int x2 = indexArgs->GetItem(2);
int y2 = indexArgs->GetItem(3);
#ifdef FAST_MATRIX
int am = a->dimSize[1];
......
......@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
matrix multiplication for a block (x1,y1) - (x2,y2)
where (x1,y1) is the upper-left corner and (x2,y2) is the bottom-right corner
*/
void _MatrixMul2DMultiTheading(XList * args);
void _MatrixMul2DMultiTheading(TensorList * args);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -154,7 +154,7 @@ void _MatrixMulBatchedCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta)
{
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors(a && b && c, "Empty input tensors!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
"Input tensors should have the same data type!");
CheckNTErrors(a->order >= 2 && b->order >= 2 && c->order >= 2,
......@@ -227,9 +227,9 @@ c_i = trans(a_i) * trans(b_i) * \alpha + c_i * \beta for each i in [0,count-1]
>> alpha - scalar
>> beta - scalar
*/
void _MatrixMulBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA,
const XList * b, MATRIX_TRANS_TYPE transposedB,
XList * c, DTYPE alpha, DTYPE beta)
void _MatrixMulBatchedCPU(const TensorList * a, MATRIX_TRANS_TYPE transposedA,
const TensorList * b, MATRIX_TRANS_TYPE transposedB,
TensorList * c, DTYPE alpha, DTYPE beta)
{
CheckNTErrors(a && b && c, "Empty input lists!");
CheckNTErrors(a->count == b->count && a->count == c->count, "Input lists must be of the same size!");
......
......@@ -58,8 +58,8 @@ void _MatrixMulBatchedCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA, cons
matrix multiplication of the two tensors c = trans(a) * trans(b) * alpha + c * beta (for list inputs)
optimized for CPU
*/
void _MatrixMulBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA, const XList * b, MATRIX_TRANS_TYPE transposedB,
XList * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
void _MatrixMulBatchedCPU(const TensorList * a, MATRIX_TRANS_TYPE transposedA, const TensorList * b, MATRIX_TRANS_TYPE transposedB,
TensorList * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
/*
matrix multiplication of the two tensors (return an XTensor structure) c = trans(a) * trans(b) * alpha
......
......@@ -66,7 +66,7 @@ operation c = x * w + b MulAndShift
<< return - the result of matrix multiplication
*/
XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
DTYPE alpha, XPRunner * parallelRunner)
DTYPE alpha, XPRunner * parallelRunner)
{
CheckNTErrors(x.dataType == w.dataType, "Input tensors should have the same data type!");
CheckNTErrors(x.order >= 2 && w.order >= 2, "Input tensors must have a order >= 2!");
......@@ -129,9 +129,6 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
DelTensorBuf(tmp);
return c;
}
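/* A minimal usage sketch (illustration only; x is assumed to be of size
   m * n, w of size n * k, and the bias b of size k):

       XTensor y = MulAndShift(x, w, b);   // y = x * w + b, of size m * k
*/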
}
\ No newline at end of file
......@@ -29,7 +29,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
} // namespace nts(NiuTrans.Tensor)
......
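A hedged usage sketch for the declaration above. Shapes are illustrative, and InitTensor1D/InitTensor2D and SetDataRand are assumed helpers from the wider library:
XTensor x, w, b;
InitTensor2D(&x, 8, 16);     /* 8 rows, 16 input features (illustrative) */
InitTensor2D(&w, 16, 32);    /* weight matrix */
InitTensor1D(&b, 32);        /* bias, broadcast over rows */
x.SetDataRand();
w.SetDataRand();
b.SetDataRand();
/* the fused linear-layer operation y = x * w + b */
XTensor y = MulAndShift(x, w, b);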
......@@ -21,6 +21,7 @@
#include "../../XTensor.h"
#include "../../XName.h"
#include "../../XUtility.h"
#include "Multiply.h"
#include "Multiply.cuh"
#include "MultiplyDim.h"
......@@ -41,12 +42,15 @@ where i is the index of the item
*/
void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
{
int leadingDimRDI = a->order - leadingDim - 1;
CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
"Unmatched tensors in multiplication!");
CheckNTErrors((a->order == b->order && a->order == c->order),
"Unmatched tensors!");
CheckDev(a->devID, b->devID);
int leadingDimRDI = a->order - leadingDim - 1;
#ifdef USE_CUDA
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
_CudaMultiply(a, b, c, alpha, leadingDim);
......@@ -139,6 +143,23 @@ void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha, int leadingDim)
_Multiply(a, b, a, alpha, leadingDim);
}
/*
element-wise product of two tensors (do it on site)
keep the result in the input tensor a and return nothing
a(i) = a(i)*b(i) + \alpha * a(i)
where i is the index of the item
>> a - tensor a (where we keep the result)
>> b - tensor b
>> alpha - the coefficient
>> leadingDim - the dimension along which we perform broadcasting
*/
void MultiplyMe(XTensor& a, const XTensor& b, DTYPE alpha, int leadingDim)
{
_Multiply(&a, &b, &a, alpha, leadingDim);
}
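The reference overload added here mirrors the existing pointer form; a minimal sketch (InitTensor2D and SetDataRand are assumed helpers):
XTensor a, b;
InitTensor2D(&a, 4, 4);
InitTensor2D(&b, 4, 4);
a.SetDataRand();
b.SetDataRand();
MultiplyMe(a, b);    /* a(i) = a(i) * b(i), result kept in a */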
/*
return a dimension if the multiplication is performed as MultiplyDim (in more details in MultiplyDim.h)
>> a - a tensor
......@@ -226,9 +247,8 @@ where i is the index of the item
>> c - result tensor
>> alpha - the coefficient
>> leadingDim - the dimension along which we perform broadcasting
>> requireLink - if add operation to network
*/
void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadingDim, bool requireLink)
void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadingDim)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
......@@ -242,7 +262,7 @@ void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int l
/* call _Multiply function */
_Multiply(&a, &b, &c, 0, leadingDim);
if (requireLink) {
if (c.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLY);
XLink::AddParamToHead(&c, alpha);
......@@ -253,7 +273,7 @@ void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int l
/* call _MultiplyDim function */
_MultiplyDim(&a, &b, &c, n, alpha);
if (requireLink) {
if (c.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
XLink::AddParamToHeadInt(&c, n);
......
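With this change, whether a call is recorded in the computation graph follows the output tensor's own enableGrad flag rather than a per-call requireLink argument. A hedged sketch of the new calling pattern (it assumes the flag persists through InitTensor; helpers as above):
XTensor a, b, c;
InitTensor2D(&a, 4, 4);
InitTensor2D(&b, 4, 4);
a.SetDataRand();
b.SetDataRand();
c.enableGrad = true;     /* record MATH_MULTIPLY in the graph */
Multiply(a, b, c);
c.enableGrad = false;    /* pure numeric call, no graph node */
Multiply(a, b, c);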
......@@ -122,10 +122,10 @@ where i is the item index
*/
void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
{
int leadingDimRDI = a->order - leadingDim - 1;
CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
int leadingDimRDI = a->order - leadingDim - 1;
CheckNTErrors(a->unitNum <= c->unitNum && b->unitNum <= c->unitNum,
"Unmatched tensors in multiplication!");
CheckNTErrors((a->order == b->order && a->order == c->order), "Unmatched tensors!");
CheckNTErrors(a->order == b->order && a->order == c->order, "Unmatched tensors!");
int stride = 1;
int blockSizeA = 1;
......
......@@ -40,6 +40,7 @@ a(i) = a(i)*b(i) + \alpha * a(i)
where i is the index of the element
*/
void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha = 0.0, int leadingDim = 0);
void MultiplyMe(XTensor & a, const XTensor & b, DTYPE alpha = 0.0, int leadingDim = 0);
/*
element-wise product of two tensors (return an XTensor structure)
......@@ -54,7 +55,7 @@ element-wise product of two tensors:
c(i) = a(i)*b(i) + \alpha * c(i)
where i is the index of the element
*/
void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha = 0.0, int leadingDim = 0, bool requireLink = false);
void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha = 0.0, int leadingDim = 0);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -19,6 +19,7 @@
* $Created by: JIANG Yufan (email: jiangyufan2018@outlook.com) 2018-08-14
*/
#include <math.h>
#include "Multiply.h"
#include "MultiplyDim.h"
#include "MultiplyDim.cuh"
......@@ -42,8 +43,10 @@ i.e., a is multiplied with b by broadcasting
>> n - the dimension index
>> alpha - the scaling factor
*/
void _MultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha) {
void _MultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE alpha)
{
n = MODX(n, a->order);
CheckNTErrors(a && b && c, "Empty tensor input!");
CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in multiplication!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
......@@ -52,6 +55,8 @@ void _MultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYP
CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");
CheckDev(a->devID, b->devID);
if(XTensor::IsSameShaped(a, b)){
_Multiply(a, b, c, alpha);
return;
......@@ -134,6 +139,24 @@ void _MultiplyDimMe(XTensor * a, const XTensor * b, int n, DTYPE alpha)
}
/*
tensor multiplication (do it on site)
keep the result in the input tensor a and return nothing
a = a * b + \alpha * a
where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> n - the dimension index
>> alpha - the scaling factor
*/
void MultiplyDimMe(XTensor& a, const XTensor& b, int n, DTYPE alpha)
{
_MultiplyDim(&a, &b, &a, n, alpha);
}
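For reference, the broadcasting product that _MultiplyDim performs for a 2-dimensional a with n = 1 can be written as a plain loop (a sketch of the semantics only; the flat row-major layout is illustrative):
/* c[i][j] = a[i][j] * b[j] + alpha * c[i][j]
   where a and c are m x k and b has k elements */
void MultiplyDim1Ref(const float * a, const float * b, float * c,
                     int m, int k, float alpha)
{
    for (int i = 0; i < m; i++)
        for (int j = 0; j < k; j++)
            c[i * k + j] = a[i * k + j] * b[j] + alpha * c[i * k + j];
}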
/*
tensor multiplication (return an XTensor structure and make tensor connections)
make a new tensor to keep the result and return it
......@@ -151,6 +174,8 @@ XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n)
XTensor c(&a);
c.SetTMPFlag();
n = MODX(n, a.order);
/* call _Multiply function */
_MultiplyDim(&a, &b, &c, n, 0);
......@@ -173,9 +198,8 @@ i.e., a is multiplied with b by broadcasting
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a * b + \alpha * c
>> n - the dimension index
>> requireLink - if add operation to network
*/
void MultiplyDim(const XTensor &a, const XTensor &b, XTensor &c, int n, bool requireLink)
void MultiplyDim(const XTensor &a, const XTensor &b, XTensor &c, int n)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
......@@ -184,7 +208,7 @@ void MultiplyDim(const XTensor &a, const XTensor &b, XTensor &c, int n, bool req
/* call _Multiply function */
_MultiplyDim(&a, &b, &c, n, 0);
if (requireLink) {
if (c.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
XLink::AddParamToHeadInt(&c, n);
......@@ -340,9 +364,8 @@ where some of dimensions of b can be of size 1
>> a - a tensor
>> b - another tensor that would be broadcasted
>> c - the resulting tensor
>> requireLink - if add operation to network
*/
void MultiplyBroadcast(const XTensor &a, const XTensor &b, XTensor &c, bool requireLink)
void MultiplyBroadcast(const XTensor &a, const XTensor &b, XTensor &c)
{
if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
InitTensor(&c, &a);
......@@ -351,7 +374,7 @@ void MultiplyBroadcast(const XTensor &a, const XTensor &b, XTensor &c, bool requ
/* call _SumBroadcast function */
_MultiplyBroadcast(&a, &b, &c, 0);
if (requireLink) {
if (c.enableGrad) {
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYBROADCAST);
XLink::AddParamToHead(&c, 0);
......
......@@ -17,6 +17,7 @@
/*
* $Created by: JIANG Yufan (email: jiangyufan2018@outlook.com) 2018-08-14
* $Updated by: LinYe (email: linye2015@outlook.com) 2019-07-30 float16 added
*/
#include "../../XDevice.h"
......
......@@ -33,6 +33,7 @@ void _MultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYP
/* tensor multiplication a = a * b + \alpha * a where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting. We keep the result in the input tensor a and return nothing */
void _MultiplyDimMe(XTensor * a, const XTensor * b, int n, DTYPE alpha = 0.0);
void MultiplyDimMe(XTensor & a, const XTensor & b, int n, DTYPE alpha = 0.0);
/* tensor multiplication c = a * b where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting. We make a new tensor c to keep the result and return it */
......@@ -40,7 +41,7 @@ XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n);
/* tensor multiplication c = a * b + \alpha * c where the size of b is equal to the n-th dimension of a,
i.e., a is multiplied with b by broadcasting */
void MultiplyDim(const XTensor &a, const XTensor &b, XTensor &c, int n, bool requireLink = false);
void MultiplyDim(const XTensor &a, const XTensor &b, XTensor &c, int n);
/* tensor multiplication summation c = a * b + c * \beta where some of dimensions of b can be of size 1 */
void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
......@@ -50,7 +51,7 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
XTensor MultiplyBroadcast(const XTensor &a, const XTensor &b);
/* tensor multiplication summation c = a * b + c * \beta where some of dimensions of b can be of size 1 */
void MultiplyBroadcast(const XTensor &a, const XTensor &b, XTensor &c, bool requireLink = false);
void MultiplyBroadcast(const XTensor &a, const XTensor &b, XTensor &c);
} // namespace nts(NiuTrans.Tensor)
......
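A hedged usage sketch for the declarations above (shapes are illustrative; InitTensor1D/InitTensor2D and SetDataRand are assumed helpers):
XTensor a, b;
InitTensor2D(&a, 2, 3);
InitTensor1D(&b, 3);
a.SetDataRand();
b.SetDataRand();
/* each row of a is scaled element-wise by b (broadcast over dimension 1) */
XTensor c = MultiplyDim(a, b, 1);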
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../../XTensor.h"
#include "../../XName.h"
#include "Negate.h"
#include "Negate.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
set every entry to its minus value
>> a - input tensor we are processing
>> b - output tensor we are processing
*/
void _Negate(const XTensor * a, XTensor * b)
{
#ifdef USE_CUDA
/* run it on GPUs */
if (a->devID >= 0) {
_CudaNegate(a, b);
return;
}
#endif
CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same shape!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
DTYPE * d = (DTYPE*)a->data;
DTYPE * db = (DTYPE*)b->data;
for (int i = 0; i < a->unitNum; i++)
db[i] = -d[i];
}
/*
set every entry to its minus value (do it on site)
keep the result in the input tensor a and return nothing
>> a - the tensor we are processing
*/
void _NegateMe(XTensor * a)
{
_Negate(a, a);
}
/*
set every entry to its minus value (return an XTensor structure)
make a new tensor to keep the result and return it
>> a - input tensor we are processing
<< return - the minus value of input tensor
*/
XTensor Negate(const XTensor & a)
{
XTensor b(&a);
b.SetTMPFlag();
/* call _Negate function */
_Negate(&a, &b);
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_NEGATE);
return b;
}
/*
set every entry to its minus value
>> a - input tensor we are processing
>> b - output tensor we are processing
>> requireLink - whether to add the operation to the network
*/
void Negate(const XTensor & a, XTensor & b, bool requireLink)
{
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {
InitTensor(&b, &a);
}
/* call _Negate function */
_Negate(&a, &b);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_NEGATE);
}
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
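A minimal usage sketch for the file above. Note that, unlike the Multiply family in this commit, Negate here still takes the older requireLink flag. InitTensor1D and SetDataRand are assumed helpers:
XTensor a;
InitTensor1D(&a, 8);
a.SetDataRand(-1.0F, 1.0F);
XTensor b = Negate(a);    /* b(i) = -a(i), with a graph link */
_NegateMe(&a);            /* in-place pointer form */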
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../../XDevice.h"
#include "../../XTensor.h"
#include "Negate.h"
#include "Negate.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
set each entry to its negative value (CUDA Kernel)
>> a - pointer to the input data array
>> b - pointer to the output data array
>> size - size of the data array
*/
template <class T>
__global__
void KernelNegate(T * a, T * b, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
b[i] = -a[i];
}
/*
set each entry to its negative value
>> a - input tensor
>> b - output tensor
*/
void _CudaNegate(const XTensor * a, XTensor * b)
{
CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same shape!");
CheckNTErrors((a->isSparse == false), "TODO!");
int gridSize[3];
int blockSize[3];
GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
int devIDBackup;
ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) {
KernelNegate << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum);
}
else if (a->dataType == X_FLOAT16) {
KernelNegate << <blocks, threads >> >((__half*)a->data, (__half*)b->data, a->unitNum);
}
else {
ShowNTErrors("TODO!");
}
BacktoCudaDev(a->devID, devIDBackup);
}
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
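The templated kernel above serves both DTYPE and __half with a single definition, which is how the float16 path added in this commit reuses the float code. For reference, GDevs.GetCudaThread picks a one-dimensional launch configuration along these lines (the block size is illustrative; the grid size is the usual ceiling division; a, b and size are the kernel arguments from above):
int threadsPerBlock = 256;    /* illustrative */
int blocks = (size + threadsPerBlock - 1) / threadsPerBlock;
KernelNegate<<<blocks, threadsPerBlock>>>(a, b, size);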
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#ifndef __NEGATE_CUH__
#define __NEGATE_CUH__
#include "Negate.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* set each entry to its negative value (CUDA Kernel) */
template <class T>
__global__
void KernelNegate(T * a, T * b, int size);
/* set each entry to its negative value */
void _CudaNegate(const XTensor * a, XTensor * b);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif // __NEGATE_CUH__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#ifndef __NEGATE_H__
#define __NEGATE_H__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* set every entry to its minus value */
void _Negate(const XTensor * a, XTensor * b);
/*
set every entry to its minus value (do it on site)
keep the result in the input tensor a and return nothing
*/
void _NegateMe(XTensor * a);
/*
set every entry to its minus value (return an XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor Negate(const XTensor & a);
/* set every entry to its minus value */
void Negate(const XTensor & a, XTensor & b, bool requireLink = false);
} // namespace nts(NiuTrans.Tensor)
#endif // __NEGATE_H__
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
*/
#include "../../XTensor.h"
#include "../../XName.h"
#include "Sign.h"
#include "Sign.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
set every entry to its sign value
>> a - input tensor we are processing
>> b - output tensor we are processing
*/
void _Sign(const XTensor * a, XTensor * b)
{
#ifdef USE_CUDA
/* run it on GPUs */
if (a->devID >= 0) {
_CudaSign(a, b);
return;
}
#endif
CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same shape!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
DTYPE * d = (DTYPE*)a->data;
DTYPE * db = (DTYPE*)b->data;
for (int i = 0; i < a->unitNum; i++) {
if (d[i] > 0)
db[i] = 1.0F;
else if (d[i] == 0)
db[i] = 0.0F;
else
db[i] = -1.0F;
}
}
/*
set every entry to its sign value (do it on site)
keep the result in the input tensor a and return nothing
>> a - the tensor we are processing
*/
void _SignMe(XTensor * a)
{
_Sign(a, a);
}
/*
set every entry to its sign value (return an XTensor structure)
make a new tensor to keep the result and return it
>> a - input tensor we are processing
<< return - the sign value of the input tensor
*/
XTensor Sign(const XTensor & a)
{
XTensor b(&a);
b.SetTMPFlag();
/* call _Sign function */
_Sign(&a, &b);
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_SIGN);
return b;
}
/*
set every entry to its sign value
>> a - input tensor we are processing
>> b - output tensor we are processing
>> requireLink - whether to add the operation to the network
*/
void Sign(const XTensor & a, XTensor & b, bool requireLink)
{
if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {
InitTensor(&b, &a);
}
/* call _Sign function */
_Sign(&a, &b);
if (requireLink) {
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_SIGN);
}
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
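A minimal usage sketch for the file above (sign maps positives to 1.0F, zeros to 0.0F and negatives to -1.0F, as in the CPU loop; InitTensor1D and SetDataRand are assumed helpers):
XTensor a;
InitTensor1D(&a, 5);
a.SetDataRand(-1.0F, 1.0F);
XTensor s = Sign(a);    /* s(i) in { -1, 0, 1 } */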