merge float16/int8 implementation into liyinqiao's branch

29d2352b · ltb · 2c4061e9 · 29d2352b · 29d2352b · 29d2352b
Commit 29d2352b authored Jul 30, 2019 by ltb
--- a/source/network/Main.cpp
+++ b/source/network/Main.cpp
@@ -24,6 +24,7 @@
 #include "../tensor/XUtility.h"
 #include "../tensor/function/FHeader.h"
 #include "../tensor/core/CHeader.h"
+#include "../tensor/test/Test.h"
 #include "../sample/fnnlm/FNNLM.h"
 #include "../sample/transformer/Transformer.h"

@@ -44,7 +45,9 @@ int main( int argc, const char ** argv )
    //_CrtSetDbgFlag(_CrtSetDbgFlag(_CRTDBG_REPORT_FLAG) | _CRTDBG_LEAK_CHECK_DF);
    //_CrtSetBreakAlloc(2708);

-    if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
+    if(argc > 1 && !strcmp(argv[1], "-test"))
+        Test();
+    else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
        FNNLMMain(argc - 1, argv + 1);
    else if(argc > 1 && !strcmp(argv[1], "-t2t"))
        TransformerMain(argc - 1, argv + 1);
@@ -53,6 +56,7 @@ int main( int argc, const char ** argv )
        fprintf(stderr, "neural networks in an easy way. \n\n");
        fprintf(stderr, "Run this program with \"-test\" for unit test!\n");
        fprintf(stderr, "Or run this program with \"-fnnlm\" for sample FNNLM!\n");
+        fprintf(stderr, "Or run this program with \"-t2t\" for sample Transformer!\n");
    }

    //_CrtDumpMemoryLeaks();
@@ -67,6 +71,9 @@ void BackwardTest()
    XTensor a;
    XTensor b;
    XTensor c;
+    a.enableGrad = true;
+    b.enableGrad = false;
+    c.enableGrad = false;
    XTensor mean;
    XTensor origin;
    InitTensor2D(&a, 2, 3);
@@ -84,14 +91,15 @@ void BackwardTest()
    b.Set1D(2.0F, 0);
    b.Set1D(1.0F, 1);

-    c = DivDim(a, b, 0);
+    DivDim(a, b, c, 0);
    c.Dump(stderr, "c:");
+    auto loss = CrossEntropy(c, a);

    //XLink::ShowNetwork(stderr, &c);

-    net.Backward(c);
+    net.Backward(loss);

-    net.Dump(stderr);
+    a.grad->Dump(stderr);

 }


--- a/source/network/XBackwardData.cpp
+++ b/source/network/XBackwardData.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * backward computation for data operation
+ * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-26
+ */
+
+#include "XNoder.h"
+#include "XBackwardData.h"
+#include "../tensor/XName.h"
+#include "../tensor/XUtility.h"
+#include "../tensor/core/CHeader.h"
+#include "../tensor/core/getandset/SetData.h"
+
+namespace nts{
+
+/* 
+compute dE/dx of a node
+the type id of the operation that produced the node (node->income.typeID)
+selects the gradient routine: ConvertDataType, IndexToOnehot or OnehotToIndex;
+any other type id is reported via ShowNTErrors("TODO!")
+>> node - the node for backward computation (node->grad must not be NULL)
+>> isEfficent - passed through unchanged to the selected gradient routine
+*/
+void XDataGrad::MakeGrad(XTensor * node, bool isEfficent)
+{
+    CheckNTErrors(node->grad != NULL, "No gradient found!");
+
+    XLink &income = node->income;
+    int operID = income.typeID;
+
+    if(operID == GETANDSET_CONVERTDATATYPE)
+        GradConvertDataType(node, isEfficent);
+    else if(operID == GETANDSET_INDEXTOONEHOT)
+        GradIndexToOnehot(node, isEfficent);
+    else if(operID == GETANDSET_ONEHOTTOINDEX)
+        GradOnehotToIndex(node, isEfficent);
+    else{
+        ShowNTErrors("TODO!");
+    }
+}
+
+/* 
+indicates whether the node is for a data operation
+>> node - the node to check
+<< return - true if the type id of the operation that produced the node
+            (node->income.typeID) has at least one bit in common with DATA_BASE
+*/
+bool XDataGrad::IsDataOP(XTensor * node)
+{
+    XLink &income = node->income;
+    return (income.typeID & DATA_BASE) != 0;
+}
+
+/* 
+gradient computation for ConvertDataType
+for
+b = convertdatatype(a) 
+we have
+dE/da += convertdatatype(dE/db)
+the gradient of b (node->grad) is converted into a temporary buffer tensor
+obtained by NewTensorBuf(input->grad, ...), the buffer is then passed to
+_SumMe together with the gradient of a (presumably an in-place sum into
+input->grad - TODO confirm), and finally the buffer is released and the
+node is marked as NODE_FINISHED
+>> node - the node (b) for backward computation
+>> isEfficent - indicates whether the computation is in
+                an efficient manner (not used in this function)
+*/
+void XDataGrad::GradConvertDataType(XTensor * node, bool isEfficent)
+{
+    XLink &income = node->income;
+    CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for ConvertDataType!");
+
+    XTensor * input = income.tails[0];
+
+    XNoder::MakeGrad(input);
+
+	XTensor * tmp = NewTensorBuf(input->grad, input->devID, input->mem);
+    _ConvertDataType(node->grad, tmp);
+	_SumMe(input->grad, tmp);
+	DelTensorBuf(tmp);
+
+	node->visitMark = NODE_FINISHED;
+}
+
+/* 
+gradient computation for OnehotToIndex
+for
+b = OnehotToIndex(a) 
+no gradient is passed from b to a in this function: it only calls
+XNoder::MakeGrad on the input (presumably to make sure that the gradient
+tensor of the input exists - TODO confirm) and marks the node as NODE_FINISHED
+>> node - the node (b) for backward computation
+>> isEfficent - indicates whether the computation is in
+                an efficient manner (not used in this function)
+*/
+void XDataGrad::GradOnehotToIndex(XTensor * node, bool isEfficent)
+{
+    XLink &income = node->income;
+    /* the message names this operation (it used to say IndexToOnehot by mistake) */
+    CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for OnehotToIndex!");
+
+    XTensor * input = income.tails[0];
+
+    XNoder::MakeGrad(input);
+
+    node->visitMark = NODE_FINISHED;
+}
+
+/* 
+gradient computation for IndexToOnehot
+for
+b = IndexToOnehot(a) 
+no gradient is passed from b to a in this function: it only calls
+XNoder::MakeGrad on the input (presumably to make sure that the gradient
+tensor of the input exists - TODO confirm) and marks the node as NODE_FINISHED
+>> node - the node (b) for backward computation
+>> isEfficent - indicates whether the computation is in
+                an efficient manner (not used in this function)
+*/
+void XDataGrad::GradIndexToOnehot(XTensor * node, bool isEfficent)
+{
+    XLink &income = node->income;
+    CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for IndexToOnehot!");
+
+    XTensor * input = income.tails[0];
+
+    XNoder::MakeGrad(input);
+
+	node->visitMark = NODE_FINISHED;
+}
+
+} // namespace nts(NiuTrans.Tensor)
--- a/source/network/XBackwardData.h
+++ b/source/network/XBackwardData.h
+/* NiuTrans.Tensor - an open-source tensor library
+ * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
+ * All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * backward computation for data operation
+ * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-12-26
+ */
+
+#include "../tensor/XTensor.h"
+#include "../tensor/function/FHeader.h"
+
+#ifndef __XBACKWARDDATA_H__
+#define __XBACKWARDDATA_H__
+
+namespace nts{
+
+/* this class computes the gradient for tensor data operations (ConvertDataType, IndexToOnehot, OnehotToIndex) given a node */
+class XDataGrad
+{
+public:
+    /* compute dE/dx of a node; isEfficent is handed on to the per-operation routines below */
+    static
+    void MakeGrad(XTensor * node, bool isEfficent);
+
+    /* indicates whether the node is for a data operation */
+    static
+    bool IsDataOP(XTensor * node);
+
+private:
+    
+    /* gradient computation for ConvertDataType: b = convertdatatype(a, datatype) */
+    static
+    void GradConvertDataType(XTensor * node, bool isEfficent);
+
+    /* gradient computation for IndexToOnehot: b = indextoonehot(a, num) */
+    static
+    void GradIndexToOnehot(XTensor * node, bool isEfficent);
+
+    /* gradient computation for OnehotToIndex: b = onehottoindex(a, num) */
+    static
+    void GradOnehotToIndex(XTensor * node, bool isEfficent);
+};
+
+} // namespace nts(NiuTrans.Tensor)
+
+#endif
\ No newline at end of file
--- a/source/network/XBackwardFunc.cpp
+++ b/source/network/XBackwardFunc.cpp
@@ -43,18 +43,18 @@ void XFuncGrad::MakeGrad(XTensor * node, bool isEfficient)
    XNoder::MakeGrad(input);

    if(operID == FUNC_HARDTANH)
-        _HardTanHBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
+        _HardTanHBackward(output, input, output->grad, input->grad);
    else if(operID == FUNC_IDENTITY)
-        _IdentityBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
+        _IdentityBackward(output, input, output->grad, input->grad);
    else if(operID == FUNC_LOGSOFTMAX){
        int leadDim = income.GetParamInt(0);
        CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in logsoftmax!");
        _LogSoftmaxBackward(NULL, output, input, output->grad, input->grad, NULL, leadDim, NOLOSS);
    }
    else if(operID == FUNC_RECTIFY)
-        _RectifyBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
+        _RectifyBackward(output, input, output->grad, input->grad);
    else if(operID == FUNC_SIGMOID)
-        _SigmoidBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
+        _SigmoidBackward(output, input, output->grad, input->grad);
    else if(operID == FUNC_SOFTMAX){
        int leadDim = income.GetParamInt(0);
        CheckNTErrors(leadDim >= 0 && leadDim < input->order, "wrong leading dimension in softmax!");

--- a/source/network/XBackwardLoss.cpp
+++ b/source/network/XBackwardLoss.cpp
@@ -52,15 +52,7 @@ void XLossGrad::MakeGrad(XTensor * node, bool isEfficient)
    XTensor * dedy = output->grad;

    if (income.tailNum == 1) {
-        if(dedy->dataType == X_FLOAT)
        _SetDataFixed(dedy, 1.0F);
-        else if(dedy->dataType == X_DOUBLE)
-            _SetDataFixed(dedy, 1.0);
-        else if(dedy->dataType == X_INT)
-            _SetDataFixed(dedy, 1);
-        else
-            ShowNTErrors("TODO");
-
        return;
    }

@@ -98,39 +90,39 @@ compute dE/dx for a given function y = f(x)
 >> params - parameters of the function
 >> lossName - name of the loss, e.g., cross entropy
 */
-void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x, 
-                        XTensor * dedy, XTensor * dedx, XTensor * padding,
-                        int funcID, void * params,
-                        LOSS_FUNCTION_NAME lossName)
-{
-    CheckNTErrors(gold && y && x, "Empty input tensors!");
-    CheckNTErrors(dedx, "Empty gradient tensors!");
-    CheckNTErrors((funcID & FUNCTION_BASE) != 0, "Illegal function id");
-
-    if(funcID == FUNC_HARDTANH){
-        _HardTanHBackward(gold, y, x, dedy, dedx, lossName);
-    }
-    else if(funcID == FUNC_IDENTITY){
-        _IdentityBackward(gold, y, x, dedy, dedx, lossName);
-    }
-    else if(funcID == FUNC_LOGSOFTMAX){
-        int leadDim = *(int*)params;
-        _LogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
-    }
-    else if(funcID == FUNC_RECTIFY){
-        _RectifyBackward(gold, y, x, dedy, dedx, lossName);
-    }
-    else if(funcID == FUNC_SIGMOID){
-        _SigmoidBackward(gold, y, x, dedy, dedx, lossName);
-    }else if(funcID == FUNC_SOFTMAX){
-        int leadDim = *(int*)params;
-        _SoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
-    }
-    else{
-        ShowNTErrors("wrong function found when call the backward process!");
-    }
-
-}
+//void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x, 
+//                        XTensor * dedy, XTensor * dedx, XTensor * padding,
+//                        int funcID, void * params,
+//                        LOSS_FUNCTION_NAME lossName)
+//{
+//    CheckNTErrors(gold && y && x, "Empty input tensors!");
+//    CheckNTErrors(dedx, "Empty gradient tensors!");
+//    CheckNTErrors((funcID & FUNCTION_BASE) != 0, "Illegal function id");
+//
+//    if(funcID == FUNC_HARDTANH){
+//        _HardTanHBackward(gold, y, x, dedy, dedx, lossName);
+//    }
+//    else if(funcID == FUNC_IDENTITY){
+//        _IdentityBackward(gold, y, x, dedy, dedx, lossName);
+//    }
+//    else if(funcID == FUNC_LOGSOFTMAX){
+//        int leadDim = *(int*)params;
+//        _LogSoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
+//    }
+//    else if(funcID == FUNC_RECTIFY){
+//        _RectifyBackward(gold, y, x, dedy, dedx, lossName);
+//    }
+//    else if(funcID == FUNC_SIGMOID){
+//        _SigmoidBackward(gold, y, x, dedy, dedx, lossName);
+//    }else if(funcID == FUNC_SOFTMAX){
+//        int leadDim = *(int*)params;
+//        _SoftmaxBackward(gold, y, x, dedy, dedx, padding, leadDim, lossName);
+//    }
+//    else{
+//        ShowNTErrors("wrong function found when call the backward process!");
+//    }
+//
+//}

 /* 
 compute dE/dy for variable y and error(loss) function E
@@ -139,27 +131,27 @@ compute dE/dy for variable y and error(loss) function E
 >> dedy - dE/dy
 >> lossName - name of the loss, e.g., cross entropy
 */
-void XLossGrad::Compute(XTensor * gold, XTensor * y, 
-                        XTensor * dedy, XTensor * padding,
-                        LOSS_FUNCTION_NAME lossName)
-{
-    if(gold == NULL){
-        if(dedy->dataType == X_FLOAT)
-            _SetDataFixed(dedy, 1.0F);
-        else if(dedy->dataType == X_DOUBLE)
-            _SetDataFixed(dedy, 1.0);
-        else if(dedy->dataType == X_INT)
-            _SetDataFixed(dedy, 1);
-        else{
-            ShowNTErrors("TODO");
-        }
-        return;
-    }
-
-    //_LossBackward(dedy, gold, y, lossName);
-    if(lossName == CROSSENTROPY)
-        _CrossEntropyBackward(dedy, y, gold, NULL, padding);
-
-}
+//void XLossGrad::Compute(XTensor * gold, XTensor * y, 
+//                        XTensor * dedy, XTensor * padding,
+//                        LOSS_FUNCTION_NAME lossName)
+//{
+//    if(gold == NULL){
+//        if(dedy->dataType == X_FLOAT)
+//            _SetDataFixedFloat(dedy, 1.0F);
+//        else if(dedy->dataType == X_DOUBLE)
+//            _SetDataFixedDouble(dedy, 1.0);
+//        else if(dedy->dataType == X_INT)
+//            _SetDataFixedInt(dedy, 1);
+//        else{
+//            ShowNTErrors("TODO");
+//        }
+//        return;
+//    }
+//
+//    //_LossBackward(dedy, gold, y, lossName);
+//    if(lossName == CROSSENTROPY)
+//        _CrossEntropyBackward(dedy, y, gold, NULL, padding);
+//
+//}

 }
\ No newline at end of file
--- a/source/network/XBackwardLoss.h
+++ b/source/network/XBackwardLoss.h
@@ -43,11 +43,11 @@ public:
    static
    bool IsLossOP(XTensor * node);

-    /* compute dE/dx for a given function y = f(x) */
-    void Compute(XTensor * gold, XTensor * y, XTensor * x, 
-                 XTensor * dedy, XTensor * dedx, XTensor * padding,
-                 int funcID, void * params,
-                 LOSS_FUNCTION_NAME lossName);
+    ///* compute dE/dx for a given function y = f(x) */
+    //void Compute(XTensor * gold, XTensor * y, XTensor * x, 
+    //             XTensor * dedy, XTensor * dedx, XTensor * padding,
+    //             int funcID, void * params,
+    //             LOSS_FUNCTION_NAME lossName);

    /* compute dE/dy for variable y and error(loss) function E */
    void Compute(XTensor * gold, XTensor * y, 

--- a/source/network/XBackwardMath.cpp
+++ b/source/network/XBackwardMath.cpp
--- a/source/network/XBackwardMath.h
+++ b/source/network/XBackwardMath.h
--- a/source/network/XBackwardShape.cpp
+++ b/source/network/XBackwardShape.cpp
@@ -68,7 +68,7 @@ void XShapeGrad::MakeGrad(XTensor * node, bool isEfficent)
 bool XShapeGrad::IsShapeOP(XTensor * node)
 {
    XLink &income = node->income;
-    return (income.typeID & DATA_BASE) != 0;
+    return (income.typeID & SHAPE_BASE) != 0;
 }

 /* post processing of a node */

--- a/source/network/XNet.cpp
+++ b/source/network/XNet.cpp
@@ -24,6 +24,7 @@
 #include "XBackwardLoss.h"
 #include "XBackwardMath.h"
 #include "XBackwardFunc.h"
+#include "XBackwardData.h"
 #include "XBackwardShape.h"
 #include "../tensor/XName.h"

@@ -264,6 +265,8 @@ void XNet::BackwardNode(XTensor * node, bool isEfficent)
            XMathGrad::MakeGrad(node, isEfficent);
        else if(XFuncGrad::IsFunc(node))
            XFuncGrad::MakeGrad(node, isEfficent);
+		else if (XDataGrad::IsDataOP(node))
+			XDataGrad::MakeGrad(node, isEfficent);
        else if(XShapeGrad::IsShapeOP(node))
            XShapeGrad::MakeGrad(node, isEfficent);
        else if(XLossGrad::IsLossOP(node))

--- a/source/sample/fnnlm/FNNLM.cpp
+++ b/source/sample/fnnlm/FNNLM.cpp
--- a/source/sample/transformer/T2TAttention.cpp
+++ b/source/sample/transformer/T2TAttention.cpp
@@ -51,14 +51,12 @@ initialize the model
 >> myIgnored - number of position ignored in attention (from the begining)
 >> myIsMasked - indicates whether the attention is with a mask
 >> myDevID - device id
->> myMem - the memory pool
 */
 void T2TAttention::InitModel(int argc, char ** argv, 
                             bool myIsMasked, int myIgnored, 
-                             int myDevID, XMem * myMem)
+                             int myDevID)
 {
    devID = myDevID;
-    mem = myMem;
    isMasked = myIsMasked;
    ignored = myIgnored;
    
@@ -71,11 +69,11 @@ void T2TAttention::InitModel(int argc, char ** argv,
    LoadParamFloat(argc, argv, "attminmax", &minmax, 0.1F);
    LoadParamFloat(argc, argv, "dropoutatt", &dropoutP, 0);

-    InitTensor2D(&wk, d, dk, X_FLOAT, devID, mem);
-    InitTensor2D(&wq, d, dk, X_FLOAT, devID, mem);
-    InitTensor2D(&wv, d, dv, X_FLOAT, devID, mem);
-    InitTensor2D(&wa, d, d, X_FLOAT, devID, mem);
-    InitTensor2D(&wbig, d, 3 * d, X_FLOAT, devID, mem);
+    InitTensor2DV2(&wk, d, dk, X_FLOAT, devID);
+    InitTensor2DV2(&wq, d, dk, X_FLOAT, devID);
+    InitTensor2DV2(&wv, d, dv, X_FLOAT, devID);
+    InitTensor2DV2(&wa, d, d, X_FLOAT, devID);
+    InitTensor2DV2(&wbig, d, 3 * d, X_FLOAT, devID);

    float scale = 1.0F;
    float finfoutk = (float)sqrt(6.0F * scale/(d + dk));
@@ -135,9 +133,9 @@ XTensor T2TAttention::MakeBig(XTensor &kqv, XTensor &mask, bool isTraining)
    int d2 = kqv2.GetDim(1);
    int d3 = kqv2.GetDim(2) / 3;
    
-    InitTensor3D(&k2, d1, d2, d3, X_FLOAT, devID, mem);
-    InitTensor3D(&q2, d1, d2, d3, X_FLOAT, devID, mem);
-    InitTensor3D(&v2, d1, d2, d3, X_FLOAT, devID, mem);
+    InitTensor3DV2(&k2, d1, d2, d3, X_FLOAT, devID);
+    InitTensor3DV2(&q2, d1, d2, d3, X_FLOAT, devID);
+    InitTensor3DV2(&v2, d1, d2, d3, X_FLOAT, devID);
    
    split.Add(&q2);
    split.Add(&k2);

--- a/source/sample/transformer/T2TAttention.h
+++ b/source/sample/transformer/T2TAttention.h
@@ -42,9 +42,6 @@ public:
    /* device id */
    int devID;
    
-    /* memory pool */
-    XMem * mem;
-    
    /* head number */
    int nhead;

@@ -94,7 +91,7 @@ public:
    /* initialize the model */
    void InitModel(int argc, char ** argv, 
                   bool myIsMasked, int myIgnored, 
-                   int myDevID = -1, XMem * myMem = NULL);
+                   int myDevID = -1);

    /* make the network */
    XTensor Make(XTensor &k, XTensor &q, XTensor &v, XTensor &mask, bool isTraining);

--- a/source/sample/transformer/T2TBatchLoader.cpp
+++ b/source/sample/transformer/T2TBatchLoader.cpp
@@ -62,7 +62,7 @@ initialization
 */
 void T2TBatchLoader::Init(int argc, char ** argv)
 {
-    LoadParamInt(argc, argv, "bufsize", &bufSize, 65000);
+    LoadParamInt(argc, argv, "bufsize", &bufSize, 50000);
    LoadParamBool(argc, argv, "doubledend", &isDoubledEnd, false);
    LoadParamBool(argc, argv, "smallbatch", &isSmallBatch, true);
    LoadParamBool(argc, argv, "bigbatch", &isBigBatch, false);
@@ -167,8 +167,7 @@ int T2TBatchLoader::LoadBuf(FILE * file, bool isSorted, int step)
        if(wordCount >= bufSize - MAX_SEQUENCE_LENGTH)
            break;

-        CheckNTErrors(seqCount % step == 0, 
-                      "Wrong number of sequences! This line might be too long!");
+        CheckNTErrors(seqCount % step == 0, "Wrong number of sequences!");
    }

    nseqBuf = seqCount;
@@ -176,8 +175,7 @@ int T2TBatchLoader::LoadBuf(FILE * file, bool isSorted, int step)

    /* sort the sequences by length */
    if (isSorted) {
-        CheckNTErrors(seqCount % step == 0, 
-                      "Wrong number of sequences! This line might be too long!");
+        CheckNTErrors(seqCount % step == 0, "Wrong number of sequences!");
        SampleNode * nodes = new SampleNode[seqCount];
        int count = 0;
        int offset = 0;
@@ -282,7 +280,6 @@ load a batch of sequences
 >> isSorted - indicates whether the sequences are sorted by length
 >> wCount - word count
 >> devID - device id
->> mem - memory pool
 >> isTraining - indicates whether we are training the model
 */
 int T2TBatchLoader::LoadBatch(FILE * file, bool isLM, 
@@ -292,18 +289,17 @@ int T2TBatchLoader::LoadBatch(FILE * file, bool isLM,
                          int * seqs,
                          int vsEnc, int vsDec, int sBatch, int wBatch, 
                          bool isSorted, int &ws, int &wCount,
-                          int devID, XMem * mem, 
-						  bool isTraining)
+                          int devID, bool isTraining)
 {
    if(isLM){
        return LoadBatchLM(file, batchEnc, paddingEnc, batchDec, paddingDec, gold, label,
                           seqs, vsEnc, sBatch, wBatch, 
-                           isSorted, wCount, devID, mem, isTraining);
+                           isSorted, wCount, devID, isTraining);
    }
    else{
        return LoadBatchMT(file, batchEnc, paddingEnc, batchDec, paddingDec, gold, label,
                           seqs, vsEnc, vsDec, sBatch, wBatch, 
-                           isSorted, ws, wCount, devID, mem, isTraining);
+                           isSorted, ws, wCount, devID, isTraining);
    }
 }

@@ -324,7 +320,6 @@ load a batch of sequences (for LM)
 >> isSorted - indicates whether the sequences are sorted by length
 >> wCount - word count
 >> devID - device id
->> mem - memory pool
 >> isTraining - indicates whether we are training the model
 */
 int T2TBatchLoader::LoadBatchLM(FILE * file, 
@@ -334,8 +329,7 @@ int T2TBatchLoader::LoadBatchLM(FILE * file,
                            int * seqs,
                            int vSize, int sBatch, int wBatch, 
                            bool isSorted, int &wCount,
-                            int devID, XMem * mem,
-							bool isTraining)
+                            int devID, bool isTraining)
 {
    if(nextSeq < 0 || nextSeq >= nseqBuf)
        LoadBuf(file, isSorted, 1);
@@ -371,11 +365,11 @@ int T2TBatchLoader::LoadBatchLM(FILE * file,
    dims[1] = max;
    dims[2] = vSize;

-    InitTensor2D(batchEnc, sc, max, X_INT, devID, mem);
-    InitTensor2D(label, sc, max, X_INT, devID, mem);
-    InitTensor(gold, 3, dims, X_FLOAT, 1.0F, devID, mem);
-    InitTensor2D(paddingEnc, sc, max, X_FLOAT, devID, mem);
-    InitTensor2D(paddingDec, sc, max, X_FLOAT, devID, mem);
+    InitTensor2DV2(batchEnc, sc, max, X_INT, devID);
+    InitTensor2DV2(label, sc, max, X_INT, devID);
+    InitTensorV2(gold, 3, dims, X_FLOAT, devID);
+    InitTensor2DV2(paddingEnc, sc, max, X_FLOAT, devID);
+    InitTensor2DV2(paddingDec, sc, max, X_FLOAT, devID);

    batchEnc->SetZeroAll();
    label->SetZeroAll();
@@ -439,12 +433,12 @@ int T2TBatchLoader::LoadBatchLM(FILE * file,
    paddingEnc->SetDataBatched(paddingEncOffsets, 1.0F, wCount);
    paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCount);

-    /*XTensor * tmp = NewTensorBuf(paddingEnc, devID, mem);
+    /*XTensor * tmp = NewTensorBufV2(paddingEnc, devID);
    _ConvertDataType(batchEnc, tmp);
    _NotEqual(tmp, paddingEnc, 0);
    DelTensorBuf(tmp);
        
-    XTensor * tmp2 = NewTensorBuf(paddingDec, devID, mem);
+    XTensor * tmp2 = NewTensorBufV2(paddingDec, devID);
    _ConvertDataType(batchEnc, tmp2);
    _NotEqual(tmp2, paddingDec, 0);
    DelTensorBuf(tmp2);*/
@@ -483,7 +477,6 @@ load a batch of sequences (for MT)
 >> isSorted - indicates whether the sequences are sorted by length
 >> wCount - word count
 >> devID - device id
->> mem - memory pool
 >> isTraining - indicates whether we are training the model
 */
 int T2TBatchLoader::LoadBatchMT(FILE * file, 
@@ -493,8 +486,7 @@ int T2TBatchLoader::LoadBatchMT(FILE * file,
                            int * seqs,
                            int vSizeEnc, int vSizeDec, int sBatch, int wBatch, 
                            bool isSorted, int &ws, int &wCount,
-                            int devID, XMem * mem, 
-							bool isTraining)
+                            int devID, bool isTraining)
 {
    if (nextBatch < 0 || nextBatch >= bufBatchSize) {
        LoadBuf(file, isSorted, 2);
@@ -571,12 +563,12 @@ int T2TBatchLoader::LoadBatchMT(FILE * file,
    int sCount = sc/2;
    int seqSize = 0;

-    InitTensor2D(batchEnc, sCount, maxEnc, X_INT, devID, mem);
-    InitTensor2D(paddingEnc, sCount, maxEnc, X_FLOAT, devID, mem);
-    InitTensor2D(batchDec, sCount, maxDec, X_INT, devID, mem);
-    InitTensor2D(paddingDec, sCount, maxDec, X_FLOAT, devID, mem);
-    InitTensor2D(label, sCount, maxDec, X_INT, devID, mem);
-    //InitTensor(gold, 3, dimsDec, X_FLOAT, 1.0F, devID, mem);
+    InitTensor2DV2(batchEnc, sCount, maxEnc, X_INT, devID);
+    InitTensor2DV2(paddingEnc, sCount, maxEnc, X_FLOAT, devID);
+    InitTensor2DV2(batchDec, sCount, maxDec, X_INT, devID);
+    InitTensor2DV2(paddingDec, sCount, maxDec, X_FLOAT, devID);
+    InitTensor2DV2(label, sCount, maxDec, X_INT, devID);
+    //InitTensorV2(gold, 3, dimsDec, X_FLOAT, devID);

    batchEnc->SetZeroAll();
    paddingEnc->SetZeroAll();
@@ -615,7 +607,7 @@ int T2TBatchLoader::LoadBatchMT(FILE * file,
    ws = wCountEnc;
    batchEnc->SetData(batchEncValues, batchEnc->unitNum);
    paddingEnc->SetDataBatched(paddingEncOffsets, 1.0F, wCountEnc);
-    //XTensor * tmp = NewTensorBuf(paddingEnc, devID, mem);
+    //XTensor * tmp = NewTensorBufV2(paddingEnc, devID);
    //_ConvertDataType(batchEnc, tmp);
    //tmp->Dump(stderr, "tmp:");
    //_NotEqual(tmp, paddingEnc, 0);
@@ -664,7 +656,7 @@ int T2TBatchLoader::LoadBatchMT(FILE * file,
    label->SetData(labelValues, label->unitNum);
    paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCountPad);

-    //XTensor * tmp2 = NewTensorBuf(paddingDec, devID, mem);
+    //XTensor * tmp2 = NewTensorBufV2(paddingDec, devID);
    //_ConvertDataType(batchDec, tmp2);
    //_NotEqual(tmp2, paddingDec, 0);
    //DelTensorBuf(tmp2);

--- a/source/sample/transformer/T2TBatchLoader.h
+++ b/source/sample/transformer/T2TBatchLoader.h
@@ -30,7 +30,7 @@ using namespace nts;
 namespace transformer
 {

-#define MAX_SEQUENCE_LENGTH 1024 * 16
+#define MAX_SEQUENCE_LENGTH 1024 * 4

 /* node to keep batch information */
 struct BatchNode
@@ -131,8 +131,7 @@ public:
                  int * seqs,
                  int vsEnc, int vsDec, int sBatch, int wBatch, 
                  bool isSorted, int &ws, int &wCount,
-                  int devID, XMem * mem, 
-				  bool isTraining);
+                  int devID, bool isTraining);

    /* load a batch of sequences (for language modeling) */
    int LoadBatchLM(FILE * file, 
@@ -141,8 +140,7 @@ public:
                    XTensor * gold, XTensor * label,
                    int * seqs, int vs, int sBatch, int wBatch, 
                    bool isSorted, int &wCount,
-                    int devID, XMem * mem, 
-					bool isTraining);
+                    int devID, bool isTraining);

    /* load a batch of sequences (for machine translation) */
    int LoadBatchMT(FILE * file, 
@@ -151,8 +149,7 @@ public:
                    XTensor * gold, XTensor * label,
                    int * seqs, int vsEnc, int vsDec, int sBatch, int wBatch, 
                    bool isSorted, int &ws, int &wCount,
-                    int devID, XMem * mem, 
-					bool isTraining);
+                    int devID, bool isTraining);

    /* shuffle the data file */
    void Shuffle(const char * srcFile, const char * tgtFile);

--- a/source/sample/transformer/T2TDecoder.cpp
+++ b/source/sample/transformer/T2TDecoder.cpp
@@ -57,16 +57,14 @@ initialize the model
 >> myIsMasked - indicates whether the masked attention is employed
 >> myIgnored - number of positions ignored in attention (from the start)
 >> myDevID - device id
->> myMem - the memory pool
 */
 void AttDecoder::InitModel(int argc, char ** argv, 
                           bool myIsMasked, int myIgnored, 
-                           int myDevID, XMem * myMem)
+                           int myDevID)
 {
-    //AttEncoder::InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
+    //AttEncoder::InitModel(argc, argv, myIsMasked, myIgnored, myDevID);

    devID = myDevID;
-    mem = myMem;
    ignored = myIgnored;

    LoadParamInt(argc, argv, "nlayer", &nlayer, 6);
@@ -79,7 +77,7 @@ void AttDecoder::InitModel(int argc, char ** argv,
    CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsizetgt\"");

    /* embedding model */
-    embedder.InitModel(argc, argv, devID, mem, false);
+    embedder.InitModel(argc, argv, devID, false);

    attentions = new T2TAttention[nlayer];
    fnns = new T2TFNN[nlayer];
@@ -90,12 +88,12 @@ void AttDecoder::InitModel(int argc, char ** argv,

    /* initialize the stacked layers */
    for (int i = 0; i < nlayer; i++) {
-        attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
-        fnns[i].InitModel(argc, argv, myDevID, myMem);
-        attLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
-        fnnLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
-        attentionsEnde[i].InitModel(argc, argv, true, myIgnored, myDevID, myMem);
-        attEndeLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
+        attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID);
+        fnns[i].InitModel(argc, argv, myDevID);
+        attLayerNorms[i].InitModel(argc, argv, myDevID);
+        fnnLayerNorms[i].InitModel(argc, argv, myDevID);
+        attentionsEnde[i].InitModel(argc, argv, true, myIgnored, myDevID);
+        attEndeLayerNorms[i].InitModel(argc, argv, myDevID);
    }
 }


--- a/source/sample/transformer/T2TDecoder.h
+++ b/source/sample/transformer/T2TDecoder.h
@@ -37,9 +37,6 @@ public:
    /* device id */
    int devID;

-    /* memory pool */
-    XMem * mem;
-
    /* layer number */
    int nlayer;

@@ -95,7 +92,7 @@ public:
    /* initialize the model */
    void InitModel(int argc, char ** argv, 
                   bool myIsMasked, int myIgnored, 
-                   int myDevID = -1, XMem * myMem = NULL);
+                   int myDevID = -1);

    /* make the decoding network */
    XTensor Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, XTensor &maskEncDec, bool isTraining);

--- a/source/sample/transformer/T2TEmbedding.cpp
+++ b/source/sample/transformer/T2TEmbedding.cpp
@@ -31,7 +31,6 @@ namespace transformer
 T2TEmbedder::T2TEmbedder()
 {
    devID = -1;
-    mem = NULL;
    vSize = -1;
    maxLength = -1;
 }
@@ -46,12 +45,10 @@ initialize the model
 >> argc - number of arguments
 >> argv - list of pointers to the arguments
 >> myDevID - device id
->> myMem - the memory pool
 */
-void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem, bool isEnc)
+void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, bool isEnc)
 {
    devID = myDevID;
-    mem = myMem;
    
    if(isEnc){
        LoadParamInt(argc, argv, "vsize", &vSize, -1);
@@ -64,7 +61,7 @@ void T2TEmbedder::InitModel(int argc, char ** argv, int myDevID, XMem * myMem, b
    LoadParamInt(argc, argv, "d", &eSize, DEFAULT_EMBEDDING_SIZE);
    LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);

-    InitTensor2D(&w, vSize, eSize, X_FLOAT, devID, mem);
+    InitTensor2DV2(&w, vSize, eSize, X_FLOAT, devID);

    DTYPE v = 1.0F/(float)sqrt((float)eSize);
    w.SetDataRandn(0, v);
@@ -81,7 +78,7 @@ make positional embeddings (of size eSize * length)
 */
 void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length)
 {
-    InitTensor2D(&posEmbeddingBase, length, eSize, X_FLOAT, devID, mem);
+    InitTensor2DV2(&posEmbeddingBase, length, eSize, X_FLOAT, devID);

    float * data = new float[posEmbeddingBase.unitNum];

@@ -145,9 +142,9 @@ XTensor T2TEmbedder::Make(XTensor &input)
    /* we make positional embeddings first */
    //if(!match){
    if(true){
-        InitTensor(&posEmbedding, input.order + 1, dims, X_FLOAT, 1.0F, devID, mem);
+        InitTensorV2(&posEmbedding, input.order + 1, dims, X_FLOAT, devID);

-        XTensor * posTMP = NewTensorBuf(2, dims + 1, X_FLOAT, 1.0F, devID, mem);
+        XTensor * posTMP = NewTensorBufV2(2, dims + 1, X_FLOAT, devID);

        _CopyValues(&posEmbeddingBase, 0, posTMP->unitNum, posTMP, 0);
        _Unsqueeze(posTMP, &posEmbedding, 0, dims[0]);

--- a/source/sample/transformer/T2TEmbedding.h
+++ b/source/sample/transformer/T2TEmbedding.h
@@ -41,9 +41,6 @@ public:
    /* device id */
    int devID;
    
-    /* memory pool */
-    XMem * mem;
-    
    /* vocabulary size */
    int vSize;

@@ -71,7 +68,7 @@ public:
    ~T2TEmbedder();

    /* initialize the model */
-    void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL, bool isEnc = true);
+    void InitModel(int argc, char ** argv, int myDevID = -1, bool isEnc = true);

    /* make positional embeddings */
    void MakePosEmbedding(int eSize, int d, int length);

--- a/source/sample/transformer/T2TEncoder.cpp
+++ b/source/sample/transformer/T2TEncoder.cpp
@@ -52,15 +52,12 @@ initialize the model
 >> argv - list of pointers to the arguments
 >> myIsMasked - indicates whether the masked attention is employed
 >> myIgnored - number of positions ignored in attention (from the start)
->> myDevID - device id
->> myMem - the memory pool
-*/
+>> myDevID - device id */
 void AttEncoder::InitModel(int argc, char ** argv, 
                           bool myIsMasked, int myIgnored, 
-                           int myDevID, XMem * myMem)
+                           int myDevID)
 {
    devID = myDevID;
-    mem = myMem;
    ignored = myIgnored;
    
    LoadParamInt(argc, argv, "nlayer", &nlayer, 6);
@@ -73,7 +70,7 @@ void AttEncoder::InitModel(int argc, char ** argv,
    CheckNTErrors(vSize > 1, "set vocabulary size by \"-vsize\"");

    /* embedding model */
-    embedder.InitModel(argc, argv, devID, mem);
+    embedder.InitModel(argc, argv, devID);

    attentions = new T2TAttention[nlayer];
    fnns = new T2TFNN[nlayer];
@@ -82,10 +79,10 @@ void AttEncoder::InitModel(int argc, char ** argv,

    /* initialize the stacked layers */
    for(int i = 0; i < nlayer; i++){
-        attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
-        fnns[i].InitModel(argc, argv, myDevID, myMem);
-        attLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
-        fnnLayerNorms[i].InitModel(argc, argv, myDevID, myMem);
+        attentions[i].InitModel(argc, argv, myIsMasked, myIgnored, myDevID);
+        fnns[i].InitModel(argc, argv, myDevID);
+        attLayerNorms[i].InitModel(argc, argv, myDevID);
+        fnnLayerNorms[i].InitModel(argc, argv, myDevID);
    }
 }


--- a/source/sample/transformer/T2TEncoder.h
+++ b/source/sample/transformer/T2TEncoder.h
@@ -65,9 +65,6 @@ public:
    /* device id */
    int devID;

-    /* memory pool */
-    XMem * mem;
-
    /* layer number */
    int nlayer;

@@ -118,7 +115,7 @@ public:
    /* initialize the model */
    void InitModel(int argc, char ** argv, 
                   bool myIsMasked, int myIgnored, 
-                   int myDevID = -1, XMem * myMem = NULL);
+                   int myDevID = -1);

    /* make the encoding network */
    XTensor Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, bool isTraining);

--- a/source/sample/transformer/T2TFNN.cpp
+++ b/source/sample/transformer/T2TFNN.cpp
@@ -47,12 +47,10 @@ initialize the model
 >> argc - number of arguments
 >> argv - list of pointers to the arguments
 >> myDevID - device id
->> myMem - the memory pool
 */
-void T2TFNN::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
+void T2TFNN::InitModel(int argc, char ** argv, int myDevID)
 {
    devID = myDevID;
-    mem = myMem;
    
    float minmax = 0;

@@ -62,11 +60,11 @@ void T2TFNN::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
    LoadParamFloat(argc, argv, "fnnminmax", &minmax, 0.1F);
    LoadParamFloat(argc, argv, "dropoutfnn", &dropoutP, 0);

-    InitTensor2D(&w1, inSize, hSize, X_FLOAT, devID, mem);
-    InitTensor1D(&b1, hSize, X_FLOAT, devID, mem);
+    InitTensor2DV2(&w1, inSize, hSize, X_FLOAT, devID);
+    InitTensor1DV2(&b1, hSize, X_FLOAT, devID);

-    InitTensor2D(&w2, hSize, outSize, X_FLOAT, devID, mem);
-    InitTensor1D(&b2, outSize, X_FLOAT, devID, mem);
+    InitTensor2DV2(&w2, hSize, outSize, X_FLOAT, devID);
+    InitTensor1DV2(&b2, outSize, X_FLOAT, devID);

    float scale = 1.0F;
    float finfout1 = (float)sqrt(6.0F * scale/(inSize + hSize));

--- a/source/sample/transformer/T2TFNN.h
+++ b/source/sample/transformer/T2TFNN.h
@@ -36,9 +36,6 @@ public:
    /* device id */
    int devID;

-    /* memory pool */
-    XMem * mem;
-
    /* size of input vector */
    int inSize;

@@ -72,7 +69,7 @@ public:
    ~T2TFNN();

    /* initialize the model */
-    void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL);
+    void InitModel(int argc, char ** argv, int myDevID = -1);

    /* make the network */
    XTensor Make(XTensor &input, bool isTraining);

--- a/source/sample/transformer/T2TLayerNormal.cpp
+++ b/source/sample/transformer/T2TLayerNormal.cpp
@@ -32,7 +32,6 @@ namespace transformer
 T2TLN::T2TLN()
 {
    devID = -1;
-    mem = NULL;
    d = 0;
 }

@@ -46,18 +45,16 @@ initialize the model
 >> argc - number of arguments
 >> argv - list of pointers to the arguments
 >> myDevID - device id
->> myMem - the memory pool
 */
-void T2TLN::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
+void T2TLN::InitModel(int argc, char ** argv, int myDevID)
 {
    devID = myDevID;
-    mem = myMem;

    d = 0;
    LoadParamInt(argc, argv, "d", &d, DEFAULT_EMBEDDING_SIZE);

-    InitTensor1D(&w, d, X_FLOAT, devID, mem);
-    InitTensor1D(&b, d, X_FLOAT, devID, mem);
+    InitTensor1DV2(&w, d, X_FLOAT, devID);
+    InitTensor1DV2(&b, d, X_FLOAT, devID);

    w.SetDataRand(1.0F, 1.0F);
    b.SetZeroAll();

--- a/source/sample/transformer/T2TLayerNormal.h
+++ b/source/sample/transformer/T2TLayerNormal.h
@@ -37,9 +37,6 @@ public:
    /* device id */
    int devID;

-    /* memory pool */
-    XMem * mem;
-
    /* the transformation matrix w */
    XTensor w;

@@ -57,7 +54,7 @@ public:
    ~T2TLN();
    
    /* initialize the model */
-    void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL);
+    void InitModel(int argc, char ** argv, int myDevID = -1);
    
    /* make the network */
    XTensor Make(XTensor &input);

--- a/source/sample/transformer/T2TLengthPenalty.cpp
+++ b/source/sample/transformer/T2TLengthPenalty.cpp
@@ -35,7 +35,9 @@ XTensor T2TLengthPenalizer::GNMT(const XTensor & length, float alpha)
    XTensor base;
    XTensor lp;

+    //base = ScaleAndShift(ScaleAndShift(length, 0, 5.0F), 1.0F/(5 + 1));
    base = (length + 5)/(1 + 5);
+
    lp = Power(base, alpha);
    
    return lp;

--- a/source/sample/transformer/T2TModel.cpp
+++ b/source/sample/transformer/T2TModel.cpp
@@ -32,7 +32,6 @@ namespace transformer
 T2TModel::T2TModel()
 {
    devID = -1;
-    mem = NULL;
    isLM = false;
    isMT = false;
    nhead = 1;
@@ -48,10 +47,6 @@ T2TModel::~T2TModel()
    delete encoder;
    delete decoder;
    delete outputLayer;
-    
-    /* we delete "mem" at the end because other members are using it and we must
-       remove the memory space before all tensors are destroyed. */
-    delete mem;
 }

 /* 
@@ -61,29 +56,16 @@ initialize the model
 */
 void T2TModel::InitModel(int argc, char ** argv)
 {
-    bool useMem = false;
-    int memSize = 0;
-    bool isMemFreeOTF = false;
-
    LoadParamInt(argc, argv, "dev", &devID, -1);
-    LoadParamBool(argc, argv, "mem", &useMem, useMem);
-    LoadParamInt(argc, argv, "memsize", &memSize, 1024);
    LoadParamBool(argc, argv, "mt", &isMT, false);
    LoadParamBool(argc, argv, "lm", &isLM, !isMT);
    LoadParamInt(argc, argv, "nhead", &nhead, 8);
-    LoadParamBool(argc, argv, "freeotf", &isMemFreeOTF, false);
-
-    if(useMem){
-        delete mem;
-        mem = new XMem(devID, FREE_ON_THE_FLY, (MTYPE)MILLION * 256, 1024, MILLION * 128);
-        mem->SetDesiredSize(devID, 0, (MTYPE)memSize * MILLION);
-    }

-    encoder->InitModel(argc, argv, true, 0, devID, mem);
-    outputLayer->InitModel(argc, argv, devID, mem);
+    encoder->InitModel(argc, argv, true, 0, devID);
+    outputLayer->InitModel(argc, argv, devID);

    if(isMT)
-        decoder->InitModel(argc, argv, true, 0, devID, mem);
+        decoder->InitModel(argc, argv, true, 0, devID);

    TensorList params(10);
    GetParams(params);
@@ -149,7 +131,8 @@ void T2TModel::MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool is
        dims[i + 1] = input.GetDim(i);
    dims[0] = nhead;
    dims[input.order + 1] = len;
-    XTensor mask(input.order + 2, dims, X_FLOAT, 1.0F, padding.devID, padding.mem);
+    XTensor mask;
+    InitTensorV2(&mask, input.order + 2, dims, X_FLOAT, padding.devID);

    /* a upper triangular matrix where the cells of the upper triangular are set to -1e-9.
        this matrix can be used to prevent the attention to current or following words in
@@ -163,15 +146,15 @@ void T2TModel::MakeLM(XTensor &input, XTensor &output, XTensor &padding, bool is
    dimsPadding[padding.order - 1] = padding.GetDim(-1);
    dimsPadding[padding.order] = padding.GetDim(-1);

-    XTensor * padding2 = NewTensorBuf(padding.order + 1, dimsPadding, padding.dataType,
-                                      padding.denseRatio, padding.devID, padding.mem);
+    XTensor * padding2 = NewTensorBufV2(padding.order + 1, dimsPadding, padding.dataType,
+                                        padding.devID);

    for(int i = 0; i < padding2->order; i++)
        dimsPadding[i + 1] = padding2->GetDim(i);
    dimsPadding[0] = nhead;

-    //XTensor * padding3 = NewTensorBuf(padding.order + 2, dimsPadding, padding.dataType,
-    //                                  padding.denseRatio, padding.devID, padding.mem);
+    //XTensor * padding3 = NewTensorBufV2(padding.order + 2, dimsPadding, padding.dataType,
+    //                                    padding.devID);
    //    
    ///* mask of the padding */
    //_Unsqueeze(&padding, padding2, padding.order - 1, padding.GetDim(-1));
@@ -241,7 +224,7 @@ void T2TModel::MakeMTMask(XTensor &inputEnc,   XTensor &inputDec,
        dims[i + 1] = inputDec.GetDim(i);
    dims[0] = nhead;
    dims[inputDec.order + 1] = len;
-    InitTensor(&maskDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingDec.devID, paddingDec.mem);
+    InitTensorV2(&maskDec, inputDec.order + 2, dims, X_FLOAT, paddingDec.devID);
        
    /* an upper triangular matrix where the cells of the upper triangular are set to -1e-9.
       this matrix can be used to prevent the attention to current or following words in
@@ -251,11 +234,11 @@ void T2TModel::MakeMTMask(XTensor &inputEnc,   XTensor &inputDec,

    /* encoder-decoder mask that prevents the attention to padding dummy words */
    dims[inputDec.order + 1] = inputEnc.GetDim(inputEnc.order - 1);
-    InitTensor(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingEnc.devID, paddingEnc.mem);
+    InitTensorV2(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, paddingEnc.devID);

-    XTensor * maskEncDecTMPEnc = NewTensorBuf(paddingEnc.order + 1, dims + 1, paddingEnc.dataType,
-                                              paddingEnc.denseRatio, paddingEnc.devID, paddingEnc.mem);
-    XTensor * maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID, paddingEnc.mem);
+    XTensor * maskEncDecTMPEnc = NewTensorBufV2(paddingEnc.order + 1, dims + 1, paddingEnc.dataType,
+                                                paddingEnc.devID);
+    XTensor * maskEncDecTMPDec = NewTensorBufV2(maskEncDecTMPEnc, paddingEnc.devID);

    _Unsqueeze(&paddingEnc, maskEncDecTMPEnc, paddingEnc.order - 1, paddingDec.GetDim(-1));
    _ScaleAndShiftMe(maskEncDecTMPEnc, 1e9F, -1e9F);
@@ -271,15 +254,15 @@ void T2TModel::MakeMTMask(XTensor &inputEnc,   XTensor &inputDec,
    dimsPadding[paddingEnc.order - 1] = paddingEnc.GetDim(-1);
    dimsPadding[paddingEnc.order] = paddingEnc.GetDim(-1);

-    XTensor * padding2 = NewTensorBuf(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
-                                      paddingEnc.denseRatio, paddingEnc.devID, paddingEnc.mem);
+    XTensor * padding2 = NewTensorBufV2(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
+                                        paddingEnc.devID);

    for (int i = 0; i < padding2->order; i++)
        dimsPadding[i + 1] = padding2->GetDim(i);
    dimsPadding[0] = nhead;

-    XTensor * padding3 = NewTensorBuf(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType,
-                                      paddingEnc.denseRatio, paddingEnc.devID, paddingEnc.mem);
+    XTensor * padding3 = NewTensorBufV2(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType,
+                                        paddingEnc.devID);

    /* mask of the padding */
    _Unsqueeze(&paddingEnc, padding2, paddingEnc.order - 1, paddingEnc.GetDim(-1));
@@ -287,7 +270,7 @@ void T2TModel::MakeMTMask(XTensor &inputEnc,   XTensor &inputDec,

    _ScaleAndShiftMe(padding3, 1e9F, -1e9F);

-    InitTensor(&maskEnc, padding3);
+    InitTensorV2(&maskEnc, padding3);
    maskEnc.SetZeroAll();

    /* generate the mask on the source language side (for padding) */
@@ -315,15 +298,15 @@ void T2TModel::MakeMTMaskEnc(XTensor &inputEnc, XTensor &paddingEnc, XTensor &ma
    dimsPadding[paddingEnc.order - 1] = paddingEnc.GetDim(-1);
    dimsPadding[paddingEnc.order] = paddingEnc.GetDim(-1);
    
-    XTensor * padding2 = NewTensorBuf(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
-                                      paddingEnc.denseRatio, paddingEnc.devID, paddingEnc.mem);
+    XTensor * padding2 = NewTensorBufV2(paddingEnc.order + 1, dimsPadding, paddingEnc.dataType,
+                                        paddingEnc.devID);
    
    for (int i = 0; i < padding2->order; i++)
        dimsPadding[i + 1] = padding2->GetDim(i);
    dimsPadding[0] = nhead;
    
-    XTensor * padding3 = NewTensorBuf(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType,
-                                      paddingEnc.denseRatio, paddingEnc.devID, paddingEnc.mem);
+    XTensor * padding3 = NewTensorBufV2(paddingEnc.order + 2, dimsPadding, paddingEnc.dataType,
+                                        paddingEnc.devID);
    
    /* mask of the padding */
    _Unsqueeze(&paddingEnc, padding2, paddingEnc.order - 1, paddingEnc.GetDim(-1));
@@ -331,7 +314,7 @@ void T2TModel::MakeMTMaskEnc(XTensor &inputEnc, XTensor &paddingEnc, XTensor &ma
    
    _ScaleAndShiftMe(padding3, 1e9F, -1e9F);
    
-    InitTensor(&maskEnc, padding3);
+    InitTensorV2(&maskEnc, padding3);
    maskEnc.SetZeroAll();
    
    /* generate the mask on the source language side (for padding) */
@@ -361,7 +344,7 @@ void T2TModel::MakeMTMaskDec(XTensor &inputEnc, XTensor &inputDec,
        dims[i + 1] = inputDec.GetDim(i);
    dims[0] = nhead;
    dims[inputDec.order + 1] = len;
-    InitTensor(&maskDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingDec.devID, paddingDec.mem);
+    InitTensorV2(&maskDec, inputDec.order + 2, dims, X_FLOAT, paddingDec.devID);
    
    /* An upper triangular matrix where the cells of the upper triangular are set to -1e-9.
       This matrix can be used to block the attention to current or following words in
@@ -376,11 +359,11 @@ void T2TModel::MakeMTMaskDec(XTensor &inputEnc, XTensor &inputDec,

    /* encoder-decoder mask that prevents the attention to padding dummy words */
    dims[inputDec.order + 1] = inputEnc.GetDim(inputEnc.order - 1);
-    InitTensor(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, 1.0F, paddingEnc.devID, paddingEnc.mem);
+    InitTensorV2(&maskEncDec, inputDec.order + 2, dims, X_FLOAT, paddingEnc.devID);
    
-    XTensor * maskEncDecTMPEnc = NewTensorBuf(paddingEnc.order + 1, dims + 1, paddingEnc.dataType,
-                                              paddingEnc.denseRatio, paddingEnc.devID, paddingEnc.mem);
-    XTensor * maskEncDecTMPDec = NewTensorBuf(maskEncDecTMPEnc, paddingEnc.devID, paddingEnc.mem);
+    XTensor * maskEncDecTMPEnc = NewTensorBufV2(paddingEnc.order + 1, dims + 1, paddingEnc.dataType,
+                                                paddingEnc.devID);
+    XTensor * maskEncDecTMPDec = NewTensorBufV2(maskEncDecTMPEnc, paddingEnc.devID);
    
    _Unsqueeze(&paddingEnc, maskEncDecTMPEnc, paddingEnc.order - 1, paddingDec.GetDim(-1));


--- a/source/sample/transformer/T2TModel.h
+++ b/source/sample/transformer/T2TModel.h
@@ -40,9 +40,6 @@ public:
    /* device id */
    int devID;

-    /* memory pool */
-    XMem * mem;
-
    /* the encoder */
    AttEncoder * encoder;


--- a/source/sample/transformer/T2TOutput.cpp
+++ b/source/sample/transformer/T2TOutput.cpp
@@ -31,7 +31,6 @@ namespace transformer
 T2TOutput::T2TOutput()
 {
    devID = -1;
-    mem = NULL;
    vSize = -1;
    inSize = -1;
    hSize = -1;
@@ -47,12 +46,10 @@ initialize the model
 >> argc - number of arguments
 >> argv - list of pointers to the arguments
 >> myDevID - device id
->> myMem - the memory pool
 */
-void T2TOutput::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
+void T2TOutput::InitModel(int argc, char ** argv, int myDevID)
 {
    devID = myDevID;
-    mem = myMem;

    float minmax = 0;

@@ -61,7 +58,7 @@ void T2TOutput::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
    LoadParamInt(argc, argv, "d", &hSize, DEFAULT_EMBEDDING_SIZE);
    LoadParamFloat(argc, argv, "outputminmax", &minmax, 0.08F);

-    InitTensor2D(&w, hSize, vSize, X_FLOAT, devID, mem);
+    InitTensor2DV2(&w, hSize, vSize, X_FLOAT, devID);
    
    float scale = 1.0F;
    float finfout = (float)sqrt(6.0F * scale/(hSize + vSize));

--- a/source/sample/transformer/T2TOutput.h
+++ b/source/sample/transformer/T2TOutput.h
@@ -38,9 +38,6 @@ public:
    /* device id */
    int devID;

-    /* memory pool */
-    XMem * mem;
-
    /* vocabulary size */
    int vSize;

@@ -61,7 +58,7 @@ public:
    ~T2TOutput();

    /* initialize the model */
-    void InitModel(int argc, char ** argv, int myDevID = -1, XMem * myMem = NULL);
+    void InitModel(int argc, char ** argv, int myDevID = -1);

    /* make the network */
    XTensor Make(XTensor &input);

--- a/source/sample/transformer/T2TPredictor.cpp
+++ b/source/sample/transformer/T2TPredictor.cpp
@@ -105,9 +105,9 @@ void T2TPredictor::Create(T2TModel * model, XTensor * top, const XTensor * input
        dims[i] = input->GetDim(i);
    dims[input->order - 1] = beamSize;

-    InitTensor(&state->probPath, input->order, dims, X_FLOAT, 1.0F, input->devID, input->mem);
-    InitTensor(&state->nstep, input->order, dims, X_FLOAT, 1.0F, input->devID, input->mem);
-    InitTensor(&state->endMark, input->order, dims, X_INT, 1.0F, input->devID, input->mem);
+    InitTensorV2(&state->probPath, input->order, dims, X_FLOAT, input->devID);
+    InitTensorV2(&state->nstep, input->order, dims, X_FLOAT, input->devID);
+    InitTensorV2(&state->endMark, input->order, dims, X_INT, input->devID);

    state->probPath.SetZeroAll();
    state->nstep.SetZeroAll();
@@ -170,7 +170,7 @@ void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
        dims[i] = inputEnc->GetDim(i);
    dims[inputEnc->order - 1] = 1;

-    InitTensor(&first, inputEnc->order, dims, X_INT, 1.0F, inputEnc->devID, inputEnc->mem);
+    InitTensorV2(&first, inputEnc->order, dims, X_INT, inputEnc->devID);
    _SetDataFixed(&first, startSymbol);

    /* add a new word into the input sequence of the decoder side */
@@ -179,7 +179,7 @@ void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
    }
    else{
        inputDec = GeneratePaths(s);
-        inputDec.SetDevice(inputEnc->devID, inputEnc->mem);
+        inputDec.SetDevice(inputEnc->devID);

        inputDec = Concatenate(first, inputDec, inputDec.order - 1);
    }
@@ -194,8 +194,8 @@ void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
    dims[inputDec.order - 1] = inputDec.GetDim(-1);
    
    XTensor paddingDec;
-    InitTensor(&paddingDec, inputDec.order, dims, X_INT, 1.0F, paddingEnc->devID, paddingEnc->mem);
-    SetDataFixedInt(paddingDec, 1);
+    InitTensorV2(&paddingDec, inputDec.order, dims, X_INT, paddingEnc->devID);
+    SetDataFixed(paddingDec, 1);
    
    XTensor maskDec;
    XTensor maskEncDec;
@@ -213,14 +213,14 @@ void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,

    int stride = decoding.GetDim(decoding.order - 2);

-    InitTensor1D(&selectSrc, 1, X_INT);
-    InitTensor1D(&selectTgt, 1, X_INT);
+    InitTensor1DV2(&selectSrc, 1, X_INT);
+    InitTensor1DV2(&selectTgt, 1, X_INT);

    selectSrc.SetInt(stride - 1, 0);
    selectTgt.SetInt(0, 0);

-    selectSrc.SetDevice(decoding.devID, decoding.mem);
-    selectTgt.SetDevice(decoding.devID, decoding.mem);
+    selectSrc.SetDevice(decoding.devID);
+    selectTgt.SetDevice(decoding.devID);
    
    /* the decoder output of the last position */
    decodingStep = CopyIndexed(decoding, decoding.order - 2, selectSrc, selectTgt);
@@ -228,8 +228,6 @@ void T2TPredictor::Predict(T2TStateBundle * next, XTensor * encoding,
    /* generate the output probabilities */
    m->outputLayer->Make(decodingStep, output);
    
-    _LogMe(&output);
-    
    next->layersEnc.AddList(&s->layersEnc);
    next->layersDec.Add(&inputDec);
    next->layersDec.Add(&output);
@@ -259,7 +257,7 @@ XTensor T2TPredictor::GeneratePaths(T2TStateBundle * state)
    }

    XTensor path;
-    InitTensor2D(&path, state->stateNum, distance, X_INT);
+    InitTensor2DV2(&path, state->stateNum, distance, X_INT);
    path.SetZeroAll();

    for(int i = 0; i < state->stateNum; i++){

--- a/source/sample/transformer/T2TSearch.cpp
+++ b/source/sample/transformer/T2TSearch.cpp
--- a/source/sample/transformer/T2TSearch.h
+++ b/source/sample/transformer/T2TSearch.h
@@ -73,8 +73,7 @@ public:
    void Init(int argc, char ** argv);

    /* search for the most promising states */
-    void Search(T2TModel * model, XTensor * input, XTensor * padding, 
-                XTensor * output, XTensor * score);
+    void Search(T2TModel * model, XTensor * input, XTensor * padding, XTensor * output);

    /* preparation */
    void Prepare(int myBatchSize,int myBeamSize);
@@ -95,7 +94,7 @@ public:
    void FillHeap(T2TStateBundle * beam);

    /* save the output sequences in a tensor */
-    void Dump(XTensor * output, XTensor * score);
+    void Dump(XTensor * output);

    /* check if the token is an end symbol */
    bool IsEnd(int token);
@@ -103,9 +102,6 @@ public:
    /* set end symbols for search */
    void SetEnd(const int * tokens, const int tokenNum);

-    /* check whether all hypotheses are completed */
-    bool IsAllCompleted(T2TStateBundle * beam);
-
    /* make a mask to prevent duplicated entries in beam expansion for the first position */
    XTensor MakeFirstMask(T2TStateBundle * beam);
 };

--- a/source/sample/transformer/T2TTester.cpp
+++ b/source/sample/transformer/T2TTester.cpp
@@ -75,7 +75,6 @@ void T2TTester::Test(const char * fn, const char * ofn, T2TModel * model)
    CheckNTErrors(ofile, "Cannot open the output file");

    int devID = model->devID;
-    XMem * mem = model->mem;

    XNet net;
    
@@ -106,15 +105,14 @@ void T2TTester::Test(const char * fn, const char * ofn, T2TModel * model)
    while(batchLoader.LoadBatch(file, model->isLM, 
                                &batchEnc, &paddingEnc, &paddingDec, &paddingDec, &gold, &label,
                                seqs, vSize, vSizeTgt,
-                                1, 1, false, ws, wc, devID, mem, false))
+                                1, 1, false, ws, wc, devID, false))
    {
        CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch!");
        CheckNTErrors(!model->isLM, "Only MT model is supported!");
        
        XTensor output;
-        XTensor score;

-        seacher.Search(model, &batchEnc, &paddingEnc, &output, &score);
+        seacher.Search(model, &batchEnc, &paddingEnc, &output);

        Dump(ofile, &output);

@@ -130,7 +128,7 @@ void T2TTester::Test(const char * fn, const char * ofn, T2TModel * model)
        if (batchCount % 1 == 0) {
            double elapsed = GetClockSec() - startT;
            XPRINT3(0, stderr, 
-                   "[INFO] elapsed=%.1fs, sent=%d, sword=%d\n",
+                   "[INFO] elapsed=%.1fs, sentence=%d, sword=%d\n",
                    elapsed, sentCount, wordCount);
        }
    }
@@ -142,8 +140,8 @@ void T2TTester::Test(const char * fn, const char * ofn, T2TModel * model)
    
    double elapsed = GetClockSec() - startT;

-    XPRINT4(0, stderr, "[INFO] test finished (took %.1fs, word=%d, sent=%d, and ppl=%.3f)\n",
-            elapsed,wordCountTotal, sentCount, exp(loss/wordCount));
+    XPRINT3(0, stderr, "[INFO] test finished (took %.1fs, word=%d, and ppl=%.3f)\n",
+            elapsed,wordCountTotal, exp(loss/wordCount));
 }

 /*

--- a/source/sample/transformer/T2TTrainer.cpp
+++ b/source/sample/transformer/T2TTrainer.cpp
@@ -75,9 +75,6 @@ void T2TTrainer::Init(int argc, char ** argv)
        strcpy(argArray[i], argv[i]);
    }

-    bool useMem = false;
-
-    LoadParamBool(argc, argv, "mem", &useMem, useMem);
    LoadParamFloat(argc, argv, "lrate", &lrate, 1.0F);
    LoadParamFloat(argc, argv, "lrbias", &lrbias, 0);
    LoadParamInt(argc, argv, "sbatch", &sBatchSize, 1);
@@ -142,7 +139,6 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
 #endif

    int devID = model->devID;
-    XMem * mem = model->mem;
    XNet net;

    if(isDebugged)
@@ -184,7 +180,7 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
        while (batchLoader.LoadBatch(file, model->isLM, 
                                     &batchEnc, &paddingEnc, &batchDec, &paddingDec, &gold, &label,
                                     NULL, vSize, vSizeTgt,
-                                     sBatchSize, wBatchSize, isLenSorted, ws, wc, devID, mem, true)) 
+                                     sBatchSize, wBatchSize, isLenSorted, ws, wc, devID, true)) 
        {

            CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");
@@ -321,7 +317,6 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
    CheckNTErrors(ofile, "Cannot open the output file");

    int devID = model->devID;
-    XMem * mem = model->mem;

    XNet net;
    
@@ -351,7 +346,7 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)
    while(batchLoader.LoadBatch(file, model->isLM, 
                                &batchEnc, &paddingEnc, &batchDec, &paddingDec, &gold, &label,
                                seqs, vSize, vSizeTgt,
-                                1, 1, false, ws, wc, devID, mem, false))
+                                1, 1, false, ws, wc, devID, false))
    {
        CheckNTErrors(batchEnc.order == 2, "wrong tensor order of the sequence batch");
            
@@ -372,7 +367,7 @@ void T2TTrainer::Test(const char * fn, const char * ofn, T2TModel * model)

        /* prediction probabilities */
        XTensor probs;
-        InitTensor1D(&probs, bSize * length);
+        InitTensor1DV2(&probs, bSize * length);

        XTensor labelOnehot;

@@ -463,7 +458,7 @@ float T2TTrainer::GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs)
    
    /* probability of each word */
    XTensor wprobs;
-    InitTensor1D(&wprobs, output->unitNum/output->GetDim(-1), X_FLOAT, output->devID, output->mem);
+    InitTensor1DV2(&wprobs, output->unitNum/output->GetDim(-1), X_FLOAT, output->devID);
    
    int dims[2] = {output->unitNum/output->GetDim(-1), output->GetDim(-1)};
    probs.Reshape(2, dims);
@@ -480,7 +475,7 @@ float T2TTrainer::GetProb(XTensor * output, XTensor * gold, XTensor * wordProbs)
    
    /* probability for the batch */
    XTensor result;
-    InitTensor1D(&result, 1, X_FLOAT, output->devID, output->mem);
+    InitTensor1DV2(&result, 1, X_FLOAT, output->devID);
    _ReduceSum(&probs, &result, 1);
    
    return result.Get1D(0);
@@ -527,7 +522,7 @@ void T2TTrainer::Update(T2TModel * model, const float lr)
            _ScaleAndShiftMe(v, (1.0F - adamBeta2), 0);

            /* v2 = m / (sqrt(v) + delta) */
-            XTensor * v2 = NewTensorBuf(v, v->devID, v->mem);
+            XTensor * v2 = NewTensorBufV2(v, v->devID);
            _Power(v, v2, 0.5F);
            _ScaleAndShiftMe(v2, 1.0F, d);
            _Div(m, v2, v2);
@@ -598,7 +593,7 @@ void T2TTrainer::PadOutput(XTensor * output, XTensor * gold, XTensor * padding)

    output->Reshape(output->unitNum/dimso[output->order - 1], dimso[output->order - 1]);

-    XTensor * padding2 = NewTensorBuf(1, &padding->unitNum, X_FLOAT, 1.0F, padding->devID, padding->mem);
+    XTensor * padding2 = NewTensorBufV2(1, &padding->unitNum, X_FLOAT, padding->devID);

    _CopyValues(padding, padding2);
    _MultiplyDim(output, padding2, output, 0);
@@ -652,7 +647,7 @@ void T2TTrainer::LabelSmooth(XTensor * gold, XTensor * smoothed, DTYPE p)
    DTYPE q = 1.0F - p;
    DTYPE gift = p / n;
    
-    InitTensor(smoothed, gold);
+    InitTensorV2(smoothed, gold);
    _CopyValues(gold, smoothed);
    
    if(p == 0)

--- a/source/tensor/Main.cpp
+++ b/source/tensor/Main.cpp
--- a/source/tensor/XDataType.cpp
+++ b/source/tensor/XDataType.cpp
@@ -60,7 +60,7 @@ TENSOR_DATA_TYPE GetDataType(const char * typeName)
    }
 }

-/****************************************************
+/*
 Below is for calling CPU BLAS for fast matrix operations
 I'm not sure how fast it is. But it seems that other
 guys are crazy about this. So I decided to have a try.
@@ -81,35 +81,4 @@ _XINLINE_ float Float16ToFloat(unsigned short h)
    return f;
 }

-/* 
-data type conversion
->> devID - device id
->> s - source data array
->> typeS - source data type
->> t - target data array
->> typeT - target data type
->> size - number of the items in s (and t)
-*/
-void ConvertDataType(int devID, void * s, TENSOR_DATA_TYPE typeS, void * t, TENSOR_DATA_TYPE typeT, int size)
-{
-    CheckNTErrors((devID < 0), "This code must be run on CPUs!");
-
-    if(typeS == typeT)
-        return;
-
-    if(typeS == X_FLOAT && typeT == X_FLOAT16){
-        for(int i = 0; i < size; i++){
-            ((unsigned short*)t)[i] = FloatToFloat16(((float*)s)[i]);
-        }
-    }
-    else if(typeS == X_FLOAT16 && typeT == X_FLOAT){
-        for(int i = 0; i < size; i++){
-            ((float*)t)[i] = Float16ToFloat(((unsigned short*)s)[i]);
-        }
-    }
-    else{
-        ShowNTErrors("Unsupported data types for conversion!");
-    }
-}
-
 } /* end of the nts (NiuTrans.Tensor) namespace */
--- a/source/tensor/XDataType.h
+++ b/source/tensor/XDataType.h
@@ -49,15 +49,6 @@ extern TENSOR_DATA_TYPE GetDataType(const char * typeName);
 /* data conversion (for lower precision computation) */
 unsigned short FloatToFloat16(float f);
 float Float16ToFloat(unsigned short h);
-void ConvertDataType(int devID, 
-                     void * s, TENSOR_DATA_TYPE typeS, 
-                     void * t, TENSOR_DATA_TYPE typeT, int size);
-
-#ifdef USE_CUDA
-void CudaConvertDataType(int devID, 
-                         void * s, TENSOR_DATA_TYPE typeS, 
-                         void * t, TENSOR_DATA_TYPE typeT, int size);
-#endif

 } /* end of the nts (NiuTrans.Tensor) namespace */


--- a/source/tensor/XGlobal.cpp
+++ b/source/tensor/XGlobal.cpp
@@ -51,7 +51,13 @@ bool CONST_TRUE = true;

 int verboseLevel = 0;
 bool useBLAS = false;
-bool useCUDA = false;
+
+
+#ifdef USE_CUDA
+    bool useCUDA = true;
+#else
+    bool useCUDA = false;
+#endif

 FILE * tmpLog = NULL;
 double myTime = 0;

--- a/source/tensor/XGlobal.h
+++ b/source/tensor/XGlobal.h
@@ -45,10 +45,6 @@ typedef int8_t __int8;
 /* the nts (NiuTrans.Tensor) namespace */
 namespace nts {

-#if (__cplusplus >= 201103L || _MSC_VER >= 1700)
-#define USE_CPP11
-#endif
-
 #define _XINLINE_  

 //#define DOUBELPRICSION

--- a/source/tensor/XList.cpp
+++ b/source/tensor/XList.cpp
@@ -101,7 +101,6 @@ void TensorListBase<T>::Add(T&& item)
        maxNum = maxNum * 2 + 1;
    }
    items[count++] = item;
-	
 }

 /*

--- a/source/tensor/XList.h
+++ b/source/tensor/XList.h
--- a/source/tensor/XMem.cpp
+++ b/source/tensor/XMem.cpp
@@ -1596,9 +1596,9 @@ void XMemManager::Initialize()
 /* free it */
 void XMemManager::Free()
 {
-    for (int i = 0; i < MAX_CPU_NUM; i++)
+    for (int i = 0; i < MAX_CPU_MEM_NUM; i++)
        CPUMems[i].Free();
-    for (int i = 0; i < MAX_GPU_NUM; i++)
+    for (int i = 0; i < MAX_GPU_MEM_NUM; i++)
        GPUMems[i].Free();
 }


--- a/source/tensor/XMem.h
+++ b/source/tensor/XMem.h
@@ -60,10 +60,10 @@ typedef long long          INT_64;
 #define CUDA_HOST_MALLOC 1
 #define MY_PITCH CUDA_PITCH
 #define BUF_PITCH 256
-#define MIN_BLOCK_SIZE_FOR_MEMPOOL 128 * 1024 * 1024
+#define MIN_BLOCK_SIZE_FOR_MEMPOOL 256 * 1024 * 1024
 #define MIN_BLOCK_NUM_FOR_MEMPOOL 1024
-#define MAX_CPU_NUM 16
-#define MAX_GPU_NUM 16
+#define MAX_CPU_MEM_NUM 16
+#define MAX_GPU_MEM_NUM 16

 /* 
 mode of runnig a memory pool 
@@ -434,13 +434,13 @@ class XMemManager
 {
 private:
    /* cpu memory pool information */
-    XMem CPUMems[MAX_CPU_NUM];
+    XMem CPUMems[MAX_CPU_MEM_NUM];

    /* number of cpu memory pools */
    int nCPUMem;

    /* gpu memory pool information */
-    XMem GPUMems[MAX_GPU_NUM];
+    XMem GPUMems[MAX_GPU_MEM_NUM];

    /* number of gpu memory pools */
    int nGPUMem;

--- a/source/tensor/XName.cpp
+++ b/source/tensor/XName.cpp
@@ -15,7 +15,7 @@
 * limitations under the License.
 */

- /*
+/*
 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-05
 */

@@ -24,9 +24,9 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
    
 /* get operator name */
-	const char * GetOPName(int type)
-	{
-		if ((type & MATH_BASE) != 0) {
+const char * GetOPName(int type)
+{
+    if ((type & MATH_BASE) != 0){
        if (type == MATH_ABSOLUTE)
            return "M_ABSOLUTE";
        else if (type == MATH_CEIL)
@@ -59,6 +59,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
            return "M_DIV";
        else if (type == MATH_DIVDIM)
            return "M_DIVDIM";
+        else if (type == MATH_MASK)
+            return "M_MASK";
        else if (type == MATH_MATRIXMUL)
            return "M_MATRIXMUL";
        else if (type == MATH_MATRIXMULBATCHED)
@@ -109,6 +111,16 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
            return "R_REDUCEVARIANCE";
    }
 		else if ((type & DATA_BASE) != 0) {
+		if (type == GETANDSET_CONVERTDATATYPE)
+			return "G_CONVERTDATATYPE";
+		else if (type == GETANDSET_INDEXTOONEHOT)
+			return "G_INDEXTOONEHOT";
+		else if (type == GETANDSET_ONEHOTTOINDEX)
+			return "G_ONEHOTTOINDEX";
+		else if (type == GETANDSET_SELECT)
+			return "G_SELECT";
+	}
+    else if ((type & SHAPE_BASE) != 0){
        if (type == GETANDSET_SELECT)
            return "G_SELECT";
        else if (type == MOVEMENT_COPYINDEXED)
@@ -144,7 +156,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
        else if (type == SORT_TOPK)
            return "S_TOPK";
    }
-		else if ((type & FUNCTION_BASE) != 0) {
+    else if ((type & FUNCTION_BASE) != 0){
        if (type == FUNC_DROPOUT)
            return "F_DROPOUT";
        else if (type == FUNC_HARDTANH)
@@ -166,7 +178,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
    }
    
    return "NULL";
-	}
+}
    
 } // namespace nts(NiuTrans.Tensor)

--- a/source/tensor/XName.h
+++ b/source/tensor/XName.h
@@ -15,13 +15,13 @@
 * limitations under the License.
 */

- /*
+/*
 *
 * We define various names here
 *
 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-05
 * It was really HOT these days. I can't imagine it is SO hot here in Shenyang!
- */
+*/

 #ifndef __XNAME_H__
 #define __XNAME_H__
@@ -31,7 +31,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 /* math operations */
 #define MATH_BASE               0x00001000

-#define GETANDSET_CONVERTDATATYPE MATH_BASE * 8
 #define MATH_ABSOLUTE           MATH_BASE + 1
 #define MATH_CEIL               MATH_ABSOLUTE + 1
 #define MATH_EXP                MATH_CEIL + 1
@@ -49,7 +48,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #define MATH_CLIP               MATH_ROUND + 1
 #define MATH_DIV                MATH_CLIP + 1
 #define MATH_DIVDIM             MATH_DIV + 1
-#define MATH_MATRIXMUL          MATH_DIVDIM + 1
+#define MATH_MASK               MATH_DIVDIM + 1
+#define MATH_MATRIXMUL          MATH_MASK + 1
 #define MATH_MATRIXMULBATCHED   MATH_MATRIXMUL + 1
 #define MATH_MULTIPLY           MATH_MATRIXMULBATCHED + 1
 #define MATH_MULTIPLYDIM        MATH_MULTIPLY + 1
@@ -80,9 +80,14 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 /* data and shape related operations */
 #define DATA_BASE               MATH_BASE * 2
 #define GETANDSET               DATA_BASE + 1
-#define GETANDSET_SELECT        GETANDSET + 1
+#define GETANDSET_CONVERTDATATYPE GETANDSET + 1
+#define GETANDSET_INDEXTOONEHOT GETANDSET_CONVERTDATATYPE + 1
+#define GETANDSET_ONEHOTTOINDEX GETANDSET_INDEXTOONEHOT + 1
+#define GETANDSET_SELECT        GETANDSET_ONEHOTTOINDEX + 1
+
+#define SHAPE_BASE              DATA_BASE * 2

-#define MOVEMENT                GETANDSET_SELECT + 1
+#define MOVEMENT                SHAPE_BASE + 1
 #define MOVEMENT_COPYINDEXED    MOVEMENT + 1
 #define MOVEMENT_COPYVALUES     MOVEMENT_COPYINDEXED + 1
 #define MOVEMENT_GATHER         MOVEMENT_COPYVALUES + 1
@@ -105,7 +110,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #define SORT_TOPK               SORT_SORT + 1

 /* activation functions */
-#define FUNCTION_BASE           DATA_BASE * 2
+#define FUNCTION_BASE           SHAPE_BASE * 2
 #define FUNC_DROPOUT            FUNCTION_BASE + 1
 #define FUNC_HARDTANH           FUNC_DROPOUT + 1
 #define FUNC_IDENTITY           FUNC_HARDTANH + 1
@@ -118,7 +123,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #define LOSS_CROSSENTROPY       LOSS_BASE + 1

 /* get operator name */
-	const char * GetOPName(int type);
+const char * GetOPName(int type);

 } // namespace nts(NiuTrans.Tensor)


--- a/source/tensor/XQueue.cpp
+++ b/source/tensor/XQueue.cpp
--- a/source/tensor/XTensor.cpp
+++ b/source/tensor/XTensor.cpp
@@ -48,6 +48,7 @@
 #include "core/math/ScaleAndShift.h"
 #include "core/getandset/SetData.h"
 #include "function/Identity.h"
+#include "core/CHeader.h"

 #ifdef USE_CUDA

@@ -279,6 +280,7 @@ void XTensor::Init()
    isTmp =  false;
    isGrad = false;
    isVar  = false;
+    enableGrad = false;
    visitMark = 0;
    grad = NULL;
 }
@@ -309,6 +311,7 @@ void XTensor::ShallowCopy(const XTensor &tensor)
 {
    strcpy(name, tensor.name);
    order = tensor.order;
+    enableGrad = tensor.enableGrad;
    memcpy(dimSize, tensor.dimSize, sizeof(int) * MAX_TENSOR_DIM_NUM);
    memcpy(dimSizeRDI, tensor.dimSizeRDI, sizeof(int) * MAX_TENSOR_DIM_NUM);
    dataType = tensor.dataType;
@@ -483,6 +486,12 @@ XTensor XTensor::operator- (const DTYPE shift) const
    return ScaleAndShift(*this, 1, -shift);
 }

+/* overloading of the unary minus-sign (calls Negate on this tensor) */
+XTensor XTensor::operator- () const
+{
+    return Negate(*this);
+}
+
 /* overloading of the division-sign */
 XTensor XTensor::operator/ (const XTensor& tensor) const
 {
@@ -517,7 +526,7 @@ void XTensor::SetDevice(int myDevId, XMem * myMem)
        isInGlobalMem = false;
    }
    else {
-        ShowNTErrors("TODO!");
+        myMem = GMems.GetMem(myDevId);
    }
 }

@@ -835,6 +844,12 @@ void XTensor::SetData(const void * d, int num, int beg)
    XMemCopy((char*)data + beg * unitSize, devID, d, -1, num * unitSize);
 }

+/* generate data items with a uniform distribution in [0, 1] */
+void XTensor::Rand(int rNum, int cNum)
+{
+    _SetDataRand(this, rNum, cNum);
+}
+
 /* 
 set the tensor items by a uniform distribution in range [lower, upper]
 >> lower - lower value of the range
@@ -2168,6 +2183,11 @@ void InitTensorV2(XTensor * tensor,
                const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType,
                const int myDevID)
 {
+    if (tensor->mem == NULL) {
+        XMem * myMem = GMems.GetMem(myDevID);
+        tensor->mem = myMem;
+        tensor->devID = myMem->devID;
+    }
    if(tensor->mem != NULL){
        tensor->Resize(myOrder, myDimSize, myDataType, 1.0F);
    }
@@ -2440,6 +2460,7 @@ void InitTensor(XTensor * tensor, const XTensor * reference)
    if(reference->order < 0)
        return;

+    tensor->enableGrad = reference->enableGrad;
    InitTensor(tensor, reference->order, reference->dimSize, 
               reference->dataType, reference->denseRatio, 
               reference->devID, reference->mem);
@@ -2455,6 +2476,7 @@ void InitTensorV2(XTensor * tensor, const XTensor * reference)
    if(reference->order < 0)
        return;

+    tensor->enableGrad = reference->enableGrad;
    InitTensorV2(tensor, reference->order, reference->dimSize, 
               reference->dataType, reference->devID);
 }
@@ -2469,9 +2491,9 @@ void InitTensorOnCPU(XTensor * tensor, const XTensor * reference)
    if(reference->order < 0)
        return;
    
-    InitTensor(tensor, reference->order, reference->dimSize,
-               reference->dataType, reference->denseRatio,
-               -1);
+    tensor->enableGrad = reference->enableGrad;
+    InitTensorV2(tensor, reference->order, reference->dimSize,
+               reference->dataType, -1);
 }
    
 /* generate a XTensor with no initialization */
@@ -2574,7 +2596,7 @@ XTensor * NewTensorBufV2(const int myOrder, const int * myDimSize,

    dims[0] = -abs(dims[0]);

-    XTensor * tensor = NewTensor(myOrder, dims, myDataType, 1.0F, devID);
+    XTensor * tensor = NewTensorV2(myOrder, dims, myDataType, devID);

    if (tensor->unitNum * tensor->unitSize == 176657664) {
        tensor->Dump(stderr, "", 200);

--- a/source/tensor/XTensor.h
+++ b/source/tensor/XTensor.h
@@ -151,6 +151,9 @@ public:
    /* indicates whether the tensor keeps the gradient when used as model parameters */
    bool isGrad;

+    /* indicates whether the gradient of the tensor should be computed */
+    bool enableGrad;
+
    /* indicates whether the tensor is used as paramters (or variables) */
    bool isVar;

@@ -235,6 +238,9 @@ public:
    /* overloading of the minus-sign */
    XTensor  operator- (const DTYPE shift) const;

+    /* overloading of the unary minus-sign */
+    XTensor  operator- () const;
+
    /* overloading of the division-sign */
    XTensor  operator/ (const XTensor &tensor) const;
    
@@ -298,6 +304,9 @@ public:
    /* set the tensor with an data array */
    void SetData(const void * d, int num, int beg = 0);

+    /* generate data items with a uniform distribution in [0, 1] */
+    void Rand(int rNum, int cNum);
+
    /* set tensor items by a uniform distribution */
    void SetDataRand(DTYPE lower = 0.0F, DTYPE upper = 1.0F);


--- a/source/tensor/core/CHeader.h
+++ b/source/tensor/core/CHeader.h
@@ -36,13 +36,9 @@
 #include "arithmetic/MatrixMulBatched.h"
 #include "arithmetic/Multiply.h"
 #include "arithmetic/MultiplyDim.h"
-#include "arithmetic/Negate.h"
-#include "arithmetic/Sign.h"
 #include "arithmetic/Sub.h"
 #include "arithmetic/SubDim.h"
 #include "arithmetic/Sum.h"
-#include "arithmetic/SumByColumnTV.h"
-#include "arithmetic/SumByColumnVT.h"
 #include "arithmetic/SumDim.h"
 #include "arithmetic/XTensorBLAS.h"
 #include "arithmetic/MulAndShift.h"
@@ -56,7 +52,6 @@
 #include "math/Clip.h"
 #include "math/Compare.h"
 #include "math/Normalize.h"
-#include "math/Power.h"
 #include "math/ScaleAndShift.h"
 #include "math/Unary.h"

@@ -97,5 +92,4 @@
 #include "utilities/XMatrixSegment.h"
 #include "utilities/FlushToMem.h"

-#include "../function/DropoutWithIndex.h"
 #endif // __CHEADER_H__
--- a/source/tensor/core/arithmetic/Div.cpp
+++ b/source/tensor/core/arithmetic/Div.cpp
@@ -143,6 +143,23 @@ void _DivMe(XTensor * a, const XTensor * b, DTYPE alpha, int leadingDim)
 }

 /*
+element-wise division of two tensors (do it on site)
+keep the result in the input tensor a and return nothing
+
+a(i) = a(i)/b(i) + \alpha * a(i)
+where i is the index of the item
+
+>> a - tensor a (where we keep the result)
+>> b - tensor b
+>> alpha - the coefficient
+>> leadingDim - the dimension along which we perform broadcasting
+*/
+void DivMe(XTensor& a, const XTensor& b, DTYPE alpha, int leadingDim)
+{
+    _Div(&a, &b, &a, alpha, leadingDim);
+}
+
+/* 
 return a dimension if the division is performed as DivDim (in more details in DivDim.h)
 >> a - a tensor
 >> b - another tensor for division
@@ -229,9 +246,8 @@ where i is the index of the item
 >> c - result tensor
 >> alpha - the coefficient
 >> leadingDim - the dimension along which we perform broadcasting
->> requireLink - if add operation to network
 */
-void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadingDim, bool requireLink)
+void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadingDim)
 {
    if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
        InitTensor(&c, &a);
@@ -245,7 +261,7 @@ void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadin
        /* call _Div function */
        _Div(&a, &b, &c, 0, leadingDim);

-        if (requireLink) {
+        if (c.enableGrad) {
            /* tensor connections */
            XLink::MakeLink(&a, &b, &c, MATH_DIV);
            XLink::AddParamToHead(&c, alpha);
@@ -256,7 +272,7 @@ void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadin
        /* call _DivDim function */
        _DivDim(&a, &b, &c, n, alpha);

-        if (requireLink) {
+        if (c.enableGrad) {
            /* tensor connections */
            XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
            XLink::AddParamToHeadInt(&c, n);

--- a/source/tensor/core/arithmetic/Div.cu
+++ b/source/tensor/core/arithmetic/Div.cu
--- a/source/tensor/core/arithmetic/Div.h
+++ b/source/tensor/core/arithmetic/Div.h
@@ -40,6 +40,7 @@ a(i) = a(i)/b(i) + \alpha * a(i)
 where i is the index of the element 
 */
 void _DivMe(XTensor * a, const XTensor * b, DTYPE alpha = 0.0, int leadingDim = 0);
+void DivMe(XTensor & a, const XTensor & b, DTYPE alpha = 0.0, int leadingDim = 0);

 /* 
 element-wise division of two tensors (return an XTensor structure)
@@ -54,7 +55,7 @@ element-wise division of two tensors:
 c(i) = a(i)/b(i) + \alpha * c(i)
 where i is the index of the element
 */
-void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha = 0.0, int leadingDim = 0, bool requireLink = false);
+void Div(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha = 0.0, int leadingDim = 0);

 } // namespace nts(NiuTrans.Tensor)


--- a/source/tensor/core/arithmetic/DivDim.cpp
+++ b/source/tensor/core/arithmetic/DivDim.cpp
@@ -183,9 +183,8 @@ i.e., a is divided with b by broadcasting
 >> c - where we put result. we save it in a if c is NULL
 >> n - the dimension index
 >> alpha - the scaling factor
->> requireLink - if add operation to network
 */
-void DivDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE alpha, bool requireLink)
+void DivDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE alpha)
 {
    if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
        InitTensor(&c, &a);
@@ -194,7 +193,7 @@ void DivDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE alpha, 
    /* call _Div function */
    _DivDim(&a, &b, &c, n, alpha);

-    if (requireLink) {
+    if (c.enableGrad == true) {
        /* tensor connections */
        XLink::MakeLink(&a, &b, &c, MATH_DIVDIM);
        XLink::AddParamToHeadInt(&c, n);

--- a/source/tensor/core/arithmetic/DivDim.cu
+++ b/source/tensor/core/arithmetic/DivDim.cu
@@ -17,6 +17,7 @@

 /*
 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-08-15
+ * $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-15 float16 added
 */

 #include "DivDim.cuh"
@@ -168,6 +169,34 @@ void _CudaDivDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE
            ShowNTErrors("Something is wrong!");
        }
    }
+    else if (a->dataType == X_FLOAT16) {
+        half alpha1 = __float2half(alpha);
+        if (stride > 1){
+            GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
+            if (alpha == (DTYPE)0.0F)
+                KernelDivWithCol<__half, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
+                                                 ((__half*)a->data, (__half*)b->data, (__half*)c->data,
+                                                   blockSize, stride, blockSize * stride, blockNum, alpha1);
+            else
+                KernelDivWithCol<__half, true>  <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
+                                                 ((__half*)a->data, (__half*)b->data, (__half*)c->data,
+                                                   blockSize, stride, blockSize * stride, blockNum, alpha1);
+        }
+        else if (stride == 1){
+            GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
+            if (alpha == (DTYPE)0.0F)
+                KernelDivWithRow<__half, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
+                                                 ((__half*)a->data, (__half*)b->data, (__half*)c->data,
+                                                   blockNum, blockSize, alpha1);
+            else
+                KernelDivWithRow<__half, true>  <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
+                                                 ((__half*)a->data, (__half*)b->data, (__half*)c->data,
+                                                   blockNum, blockSize, alpha1);
+        }
+        else {
+            ShowNTErrors("Something is wrong!");
+        }
+    }
    else {
        ShowNTErrors("TODO!");
    }

--- a/source/tensor/core/arithmetic/DivDim.h
+++ b/source/tensor/core/arithmetic/DivDim.h
@@ -59,7 +59,7 @@ c(i) = a/b + \alpha * c
 where the size of b is equal to the n-th dimension of a, 
 i.e., a is divided with b by broadcasting 
 */
-void DivDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE alpha = (DTYPE)0.0, bool requireLink = false);
+void DivDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE alpha = (DTYPE)0.0);
    
 } // namespace nts(NiuTrans.Tensor)


--- a/source/tensor/core/arithmetic/Mask.cpp
+++ b/source/tensor/core/arithmetic/Mask.cpp
@@ -130,6 +130,17 @@ void _MaskMe(XTensor * a, const XTensor * mask, DTYPE alpha)
 }

 /*
+mask entries of a given tensor (on site):
+a(i) = a(i) if mask(i) is non-zero
+a(i) = alpha if mask(i) = 0
+where i is the index of the element
+*/
+void MaskMe(XTensor& a, const XTensor& mask, DTYPE alpha)
+{
+    _Mask(&a, &mask, &a, alpha);
+}
+
+/*
 mask entries of a given tensor (return an XTensor structure):
 a(i) = a(i) if mask(i) is non-zero
 a(i) = alpha if mask(i) = 0
@@ -140,16 +151,35 @@ XTensor Mask(const XTensor &a, const XTensor &mask, DTYPE alpha)
    XTensor c(&a);
    c.SetTMPFlag();

-    /* call _Sum function */
+    /* call _Mask function */
    _Mask(&a, &mask, &c, alpha);

    /* tensor connections */
-    //XLink::MakeLink(&a, &mask, &c, MATH_SUM);
-    //XLink::AddParamToHead(&c, alpha);
-    // TODO!!
-    ShowNTErrors("TODO!");
+    XLink::MakeLink(&a, &mask, &c, MATH_MASK);
+    XLink::AddParamToHead(&c, alpha);

    return c;
 }

+/*
+mask entries of a given tensor (keep the result in tensor c and return nothing):
+c(i) = a(i) if mask(i) is non-zero
+c(i) = alpha if mask(i) = 0
+where i is the index of the element
+*/
+void Mask(const XTensor &a, const XTensor &mask, XTensor &c, DTYPE alpha)
+{
+    if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
+        InitTensor(&c, &a);
+    }
+
+    /* call _Mask function */
+    _Mask(&a, &mask, &c, alpha);
+
+    if (c.enableGrad) {
+        XLink::MakeLink(&a, &mask, &c, MATH_MASK);
+        XLink::AddParamToHead(&c, alpha);
+    }
+}
+
 }
\ No newline at end of file
--- a/source/tensor/core/arithmetic/Mask.h
+++ b/source/tensor/core/arithmetic/Mask.h
@@ -34,7 +34,7 @@ c(i) = a(i) if mask(i) is non-zero
 c(i) = alpha if mask(i) = 0
 where i is the index of the element
 */
-void _Mask(const XTensor * a, const XTensor * mask, XTensor * c, DTYPE alpha);
+void _Mask(const XTensor * a, const XTensor * mask, XTensor * c, DTYPE alpha = 0.0);

 /* 
 mask entries of a given tensor (on site):
@@ -42,7 +42,8 @@ a(i) = a(i) if mask(i) is non-zero
 a(i) = alpha if mask(i) = 0
 where i is the index of the element
 */
-void _MaskMe(XTensor * a, const XTensor * mask, DTYPE alpha);
+void _MaskMe(XTensor * a, const XTensor * mask, DTYPE alpha = 0.0);
+void MaskMe(XTensor & a, const XTensor & mask, DTYPE alpha = 0.0);

 /*
 mask entries of a given tensor (return an XTensor structure):
@@ -52,6 +53,14 @@ where i is the index of the element
 */
 XTensor Mask(const XTensor &a, const XTensor &mask, DTYPE alpha = 0.0);

+/*
+mask entries of a given tensor (keep the result in tensor c and return nothing):
+c(i) = a(i) if mask(i) is non-zero
+c(i) = alpha if mask(i) = 0
+where i is the index of the element
+*/
+void Mask(const XTensor &a, const XTensor &mask, XTensor &c, DTYPE alpha = 0.0);
+
 } // namespace nts(NiuTrans.Tensor)

 #endif // __MASK_H__
--- a/source/tensor/core/arithmetic/MatrixMul.cpp
+++ b/source/tensor/core/arithmetic/MatrixMul.cpp
@@ -54,8 +54,6 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
                XTensor * c, DTYPE alpha, DTYPE beta, XPRunner * parallelRunner)
 {
    CheckNTErrors(a && b && c, "Empty input tensors!");
-    CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
-                  "Input tensors should have the same data type!");
    CheckNTErrors(a->order >= 2 && b->order >= 2 && c->order >= 2,
                  "Input tensors must have a order >= 2!");
    CheckNTErrors(c->order == a->order + b->order - 2, "wrong tensor order")
@@ -202,7 +200,9 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
    delete cList;
 }

-bool CheckMMulShape(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c)
+bool CheckMMulShape(const XTensor * a, MATRIX_TRANS_TYPE transposedA, 
+                    const XTensor * b, MATRIX_TRANS_TYPE transposedB, 
+                    XTensor * c)
 {
    if (!(a && b && c))
        return false;
@@ -231,10 +231,13 @@ bool CheckMMulShape(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTen
    dimSize[sub++] = bm;

    for (int i = 0; i < order; i++) {
-        if (dimSize[i] != c->dimSize[i])
+        if (dimSize[i] != c->dimSize[i]) {
+            delete[] dimSize;
            return false;
        }
+    }
    
+    delete[] dimSize;
    return true;
 }

@@ -357,11 +360,9 @@ XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,

    return c;
 }
-
-
 void MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
               const XTensor &b, MATRIX_TRANS_TYPE transposedB, XTensor &c, 
-    DTYPE alpha, XPRunner * parallelRunner, bool requireLink)
+               DTYPE alpha, DTYPE beta, XPRunner * parallelRunner)
 {
    CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
    CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
@@ -394,9 +395,9 @@ void MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
    }

    /* call _MatrixMul function */
-    _MatrixMul(&a, transposedA, &b, transposedB, &c, alpha, 0, parallelRunner);
+    _MatrixMul(&a, transposedA, &b, transposedB, &c, alpha, beta, parallelRunner);

-    if (requireLink) {
+    if (c.enableGrad) {
        /* tensor connections */
        XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL);
        XLink::AddParamToHeadTrans(&c, transposedA);
@@ -457,7 +458,7 @@ XTensor MatrixMul(const XTensor &a, const XTensor &b,
 }

 void MatrixMul(const XTensor &a, const XTensor &b, XTensor &c,
-    DTYPE alpha, XPRunner * parallelRunner, bool requireLink)
+               DTYPE alpha, XPRunner * parallelRunner)
 {
    CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
    CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
@@ -492,7 +493,7 @@ void MatrixMul(const XTensor &a, const XTensor &b, XTensor &c,
    /* call _MatrixMul function */
    _MatrixMul(&a, X_NOTRANS, &b, X_NOTRANS, &c, alpha, 0, parallelRunner);

-    if (requireLink) {
+    if (c.enableGrad) {
        /* tensor connections */
        XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL);
        XLink::AddParamToHeadTrans(&c, X_NOTRANS);

--- a/source/tensor/core/arithmetic/MatrixMul.h
+++ b/source/tensor/core/arithmetic/MatrixMul.h
@@ -40,8 +40,11 @@ bj is the j-th element tensor of B, and c_{i,j} is the (i,j) elementtensor of th
 C should be a tensor of z * x * n * m. 
 Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x * y.
 */
-void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
-                DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
+void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA, 
+                const XTensor * b, MATRIX_TRANS_TYPE transposedB, 
+                XTensor * c,
+                DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, 
+                XPRunner * parallelRunner = NULL);

 /* 
 matrix multiplication (return an XTensor structure) c = trans(a) * trans(b) * alpha
@@ -56,22 +59,28 @@ bj is the j-th element tensor of B, and c_{i,j} is the (i,j) elementtensor of th
 C should be a tensor of z * x * n * m. 
 Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x * y.
 */
-XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB, 
-                  DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
-
-XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
-                  TENSOR_DATA_TYPE dataType, DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
-
-void MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
-    XTensor &c, DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL, bool requireLink = false);
+XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, 
+                  const XTensor &b, MATRIX_TRANS_TYPE transposedB, 
+                  DTYPE alpha = (DTYPE)1.0, 
+                  XPRunner * parallelRunner = NULL);
+				  
+XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, 
+                  const XTensor &b, MATRIX_TRANS_TYPE transposedB,
+                  TENSOR_DATA_TYPE dataType, DTYPE alpha = (DTYPE)1.0, 
+				  XPRunner * parallelRunner = NULL);
+
+void MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, 
+               const XTensor &b, MATRIX_TRANS_TYPE transposedB,
+               XTensor &c, 
+               DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, 
+               XPRunner * parallelRunner = NULL);

 /* matrix multiplication with no transposition c = a * b * alpha*/
 XTensor MatrixMul(const XTensor &a, const XTensor &b, 
                  DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);

 void MatrixMul(const XTensor &a, const XTensor &b, XTensor &c, 
-    DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL, bool requireLink = false);
-
+               DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);

 } // namespace nts(NiuTrans.Tensor)


--- a/source/tensor/core/arithmetic/MatrixMul2D.cpp
+++ b/source/tensor/core/arithmetic/MatrixMul2D.cpp
--- a/source/tensor/core/arithmetic/MatrixMul2DMultiTheading.cpp
+++ b/source/tensor/core/arithmetic/MatrixMul2DMultiTheading.cpp
--- a/source/tensor/core/arithmetic/MatrixMulBatched.cpp
+++ b/source/tensor/core/arithmetic/MatrixMulBatched.cpp
@@ -154,7 +154,7 @@ void _MatrixMulBatchedCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
                          const XTensor * b, MATRIX_TRANS_TYPE transposedB,
                          XTensor * c, DTYPE alpha, DTYPE beta)
 {
-CheckNTErrors((a && b && c), "Empty input tensors!");
+    CheckNTErrors(a && b && c, "Empty input tensors!");
    CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
                 "Input tensors should have the same data type!");
    CheckNTErrors(a->order >= 2 && b->order >= 2 && c->order >= 2,

--- a/source/tensor/core/arithmetic/MulAndShift.cpp
+++ b/source/tensor/core/arithmetic/MulAndShift.cpp
@@ -129,9 +129,6 @@ XTensor MulAndShift(const XTensor &x, const XTensor &w, const XTensor &b,
    DelTensorBuf(tmp);

    return c;
-
 }

-
-
 }
\ No newline at end of file
--- a/source/tensor/core/arithmetic/MulAndShift.h
+++ b/source/tensor/core/arithmetic/MulAndShift.h
--- a/source/tensor/core/arithmetic/Multiply.cpp
+++ b/source/tensor/core/arithmetic/Multiply.cpp
@@ -144,6 +144,23 @@ void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha, int leadingDim)
 }

 /*
+element-wise product of two tensors (do it on site)
+keep the result in the input tensor a and return nothing
+
+a(i) = a(i)*b(i) + \alpha * a(i)
+where i is the index of the item
+
+>> a - tensor a (where we keep the result)
+>> b - tensor b
+>> alpha - the coefficient
+>> leadingDim - the dimension along which we perform broadcasting
+*/
+void MultiplyMe(XTensor& a, const XTensor& b, DTYPE alpha, int leadingDim)
+{
+    _Multiply(&a, &b, &a, alpha, leadingDim);
+}
+
+/* 
 return a dimension if the multiplication is performed as MultiplyDim (in more details in MultiplyDim.h)
 >> a - a tensor
 >> b - another tensor for multiplication
@@ -230,9 +247,8 @@ where i is the index of the item
 >> c - result tensor
 >> alpha - the coefficient
 >> leadingDim - the dimension along which we perform broadcasting
->> requireLink - if add operation to network
 */
-void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadingDim, bool requireLink)
+void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int leadingDim)
 {
    if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
        InitTensor(&c, &a);
@@ -246,7 +262,7 @@ void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int l
        /* call _Multiply function */
        _Multiply(&a, &b, &c, 0, leadingDim);

-        if (requireLink) {
+        if (c.enableGrad) {
            /* tensor connections */
            XLink::MakeLink(&a, &b, &c, MATH_MULTIPLY);
            XLink::AddParamToHead(&c, alpha);
@@ -257,7 +273,7 @@ void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha, int l
        /* call _MultiplyDim function */
        _MultiplyDim(&a, &b, &c, n, alpha);

-        if (requireLink) {
+        if (c.enableGrad) {
            /* tensor connections */
            XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
            XLink::AddParamToHeadInt(&c, n);

--- a/source/tensor/core/arithmetic/Multiply.cu
+++ b/source/tensor/core/arithmetic/Multiply.cu
@@ -123,9 +123,9 @@ where i is the item index
 void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
 {
    int leadingDimRDI = a->order - leadingDim - 1;
-    CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
+    CheckNTErrors(a->unitNum <= c->unitNum && b->unitNum <= c->unitNum,
                  "Unmatched tensors in multiplication!");
-    CheckNTErrors((a->order == b->order && a->order == c->order), "Unmatched tensors!");
+    CheckNTErrors(a->order == b->order && a->order == c->order, "Unmatched tensors!");

    int stride = 1;
    int blockSizeA = 1;

--- a/source/tensor/core/arithmetic/Multiply.h
+++ b/source/tensor/core/arithmetic/Multiply.h
@@ -40,6 +40,7 @@ a(i) = a(i)*b(i) + \alpha * a(i)
 where i is the index of the element 
 */
 void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha = 0.0, int leadingDim = 0);
+void MultiplyMe(XTensor & a, const XTensor & b, DTYPE alpha = 0.0, int leadingDim = 0);

 /* 
 element-wise product of two tensors (return an XTensor structure)
@@ -54,7 +55,7 @@ element-wise product of two tensors:
 c(i) = a(i)*b(i) + \alpha * c(i) 
 where i is the index of the element
 */
-void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha = 0.0, int leadingDim = 0, bool requireLink = false);
+void Multiply(const XTensor &a, const XTensor &b, XTensor &c, DTYPE alpha = 0.0, int leadingDim = 0);

 } // namespace nts(NiuTrans.Tensor)


--- a/source/tensor/core/arithmetic/MultiplyDim.cpp
+++ b/source/tensor/core/arithmetic/MultiplyDim.cpp
@@ -139,6 +139,24 @@ void _MultiplyDimMe(XTensor * a, const XTensor * b, int n, DTYPE alpha)
 }

 /*
+tensor multiplication (do it on site)
+keep the result in the input tensor a and return nothing
+
+a = a * b + \alpha * a
+where the size of b is equal to the n-th dimension of a,
+i.e., a is multiplied with b by broadcasting
+
+>> a - a tensor
+>> b - another tensor whose size is equal to that of dimension n of a
+>> n - the dimension index
+>> alpha - the scaling factor
+*/
+void MultiplyDimMe(XTensor& a, const XTensor& b, int n, DTYPE alpha)
+{
+    _MultiplyDim(&a, &b, &a, n, alpha);
+}
+
+/*
 tensor multiplication (return an XTensor structure and make tensor connections)
 make a new tensor to keep the result and return it

@@ -180,9 +198,8 @@ i.e., a is multiplied with b by broadcasting
 >> b - another tensor whose size is equal to that of dimension n of a
 >> c - where we put a * b + \alpha * c. we save it in a if c is NULL
 >> n - the dimension index
->> requireLink - if add operation to network
 */
-void MultiplyDim(const XTensor &a, const XTensor &b, XTensor &c, int n, bool requireLink)
+void MultiplyDim(const XTensor &a, const XTensor &b, XTensor &c, int n)
 {
    if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
        InitTensor(&c, &a);
@@ -191,7 +208,7 @@ void MultiplyDim(const XTensor &a, const XTensor &b, XTensor &c, int n, bool req
    /* call _Multiply function */
    _MultiplyDim(&a, &b, &c, n, 0);

-    if (requireLink) {
+    if (c.enableGrad) {
        /* tensor connections */
        XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYDIM);
        XLink::AddParamToHeadInt(&c, n);
@@ -347,9 +364,8 @@ where some of dimensions of b can be of size 1
 >> a - a tensor
 >> b - another tensor that would be broadcasted
 >> c - the resulting tensor
->> requireLink - if add operation to network
 */
-void MultiplyBroadcast(const XTensor &a, const XTensor &b, XTensor &c, bool requireLink)
+void MultiplyBroadcast(const XTensor &a, const XTensor &b, XTensor &c)
 {
    if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
        InitTensor(&c, &a);
@@ -358,7 +374,7 @@ void MultiplyBroadcast(const XTensor &a, const XTensor &b, XTensor &c, bool requ
    /* call _SumBroadcast function */
    _MultiplyBroadcast(&a, &b, &c, 0);

-    if (requireLink) {
+    if (c.enableGrad) {
        /* tensor connections */
        XLink::MakeLink(&a, &b, &c, MATH_MULTIPLYBROADCAST);
        XLink::AddParamToHead(&c, 0);

--- a/source/tensor/core/arithmetic/MultiplyDim.cu
+++ b/source/tensor/core/arithmetic/MultiplyDim.cu
@@ -17,6 +17,7 @@

 /*
 * $Created by: JIANG Yufan (email: jiangyufan2018@outlook.com) 2018-08-14
+* $Updated by: LinYe (email: linye2015@outlook.com) 2019-07-30 float16 added
 */

 #include "../../XDevice.h"

--- a/source/tensor/core/arithmetic/MultiplyDim.h
+++ b/source/tensor/core/arithmetic/MultiplyDim.h
@@ -33,6 +33,7 @@ void _MultiplyDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYP
 /* tensor multiplication a = a * b + \alpha * c where the size of b is equal to the n-th dimension of a,
   i.e., a is multiplied with b by broadcasting. we keep the result in the input tensor a and return nothing */
 void _MultiplyDimMe(XTensor * a, const XTensor * b, int n, DTYPE alpha = 0.0);
+void MultiplyDimMe(XTensor & a, const XTensor & b, int n, DTYPE alpha = 0.0);

 /* tensor multiplication c = a * b where the size of b is equal to the n-th dimension of a,
   i.e., a is multiplied with b by broadcasting. We make a new tensor c to keep the result and return it */
@@ -40,7 +41,7 @@ XTensor MultiplyDim(const XTensor &a, const XTensor &b, int n);

 /* tensor multiplication c = a * b + \alpha * c  where the size of b is equal to the n-th dimension of a,
   i.e., a is multiplied with b by broadcasting */
-void MultiplyDim(const XTensor &a, const XTensor &b, XTensor &c, int n, bool requireLink = false);
+void MultiplyDim(const XTensor &a, const XTensor &b, XTensor &c, int n);

 /* tensor multiplication summation c = a * b + c * \beta where some of dimensions of b can be of size 1 */
 void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
@@ -50,7 +51,7 @@ void _MultiplyBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE
 XTensor MultiplyBroadcast(const XTensor &a, const XTensor &b);

 /* tensor multiplication summation c = a * b + c * \beta where some of dimensions of b can be of size 1 */
-void MultiplyBroadcast(const XTensor &a, const XTensor &b, XTensor &c, bool requireLink = false);
+void MultiplyBroadcast(const XTensor &a, const XTensor &b, XTensor &c);

 } // namespace nts(NiuTrans.Tensor)


--- a/source/tensor/core/arithmetic/Negate.cpp
+++ b/source/tensor/core/arithmetic/Negate.cpp
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
-* All rights reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-/*
-* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
-*/
-
-#include "../../XTensor.h"
-#include "../../XName.h"
-#include "Negate.h"
-#include "Negate.cuh"
-
-namespace nts { // namespace nts(NiuTrans.Tensor)
-
-/*
-set every entry to its minus value
->> a - input tensor we are processing
->> b - output tensor we are processing
-*/
-void _Negate(const XTensor * a, XTensor * b)
-{
-#ifdef USE_CUDA
-    /* run it on GPUs */
-    if (a->devID >= 0) {
-        _CudaNegate(a, b);
-    return;
-    }
-#endif
-
-    CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
-    CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
-    DTYPE * d = (DTYPE*)a->data;
-    DTYPE * db = (DTYPE*)b->data;
-    for (int i = 0; i < a->unitNum; i++)
-        db[i] = -d[i];
-}
-
-/*
-set every entry to its minus value (do it on site)
-keep the result in the input tensor a and return nothing
->> a - the tensor we are processing
-*/
-void _NegateMe(XTensor * a)
-{
-    _Negate(a, a);
-}
-
-/*
-set every entry to its minus value (return an XTensor structure)
-make a new tensor to keep the result and return it
->> a - input tensor we are processing
-<< return - the minus value of input tensor
-*/
-XTensor Negate(const XTensor & a)
-{
-    XTensor b(&a);
-    b.SetTMPFlag();
-    
-    /* call _Negate function */
-    _Negate(&a, &b);
-    
-    /* tensor connections */
-    XLink::MakeLink(&a, NULL, &b, MATH_NEGATE);
-    
-    return b;
-}
-
-/*
-set every entry to its minus value
->> a - input tensor we are processing
->> b - output tensor we are processing
->> requireLink - if add operation to network
-*/
-void Negate(const XTensor & a, XTensor & b, bool requireLink)
-{
-    if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {
-        InitTensor(&b, &a);
-    }
-
-    /* call _Negate function */
-    _Negate(&a, &b);
-
-    if (requireLink) {
-        /* tensor connections */
-        XLink::MakeLink(&a, NULL, &b, MATH_NEGATE);
-    }
-}
-
-} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/tensor/core/arithmetic/Negate.cu
+++ b/source/tensor/core/arithmetic/Negate.cu
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
-* All rights reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-/*
-* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
-*/
-
-#include "../../XDevice.h"
-#include "../../XTensor.h"
-#include "Negate.h"
-#include "Negate.cuh"
-
-namespace nts { // namespace nts(NiuTrans.Tensor)
-
-#ifdef USE_CUDA
-/*
-set each entry to its negtive value (CUDA Kernel)
->> a - pointer to the input data array
->> b - pointer to the output data array
->> size - size of the data array
-*/
-template <class T>
-__global__
-void KernelNegate(T * a, T * b, int size)
-{
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
-
-    if (i < size)
-        b[i] = -a[i];
-}
-
-
-/*
-set each entry to its negtive value
->> a - input tensor
->> b - output tensor
-*/
-void _CudaNegate(const XTensor * a, XTensor * b)
-{
-    CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
-    CheckNTErrors((a->isSparse == false), "TODO!");
-
-    int gridSize[3];
-    int blockSize[3];
-
-    GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
-
-    dim3 blocks(gridSize[0]);
-    dim3 threads(blockSize[0]);
-
-    int devIDBackup;
-    ProtectCudaDev(a->devID, devIDBackup);
-
-    if (a->dataType == DEFAULT_DTYPE) {
-        KernelNegate << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum);
-    }
-    else if (a->dataType == X_FLOAT16) {
-        KernelNegate << <blocks, threads >> >((__half*)a->data, (__half*)b->data, a->unitNum);
-    }
-    else {
-        ShowNTErrors("TODO!");
-    }
-
-    BacktoCudaDev(a->devID, devIDBackup);
-}
-
-#endif // USE_CUDA
-} // namespace nts(NiuTrans.Tensor)
--- a/source/tensor/core/arithmetic/Negate.cuh
+++ b/source/tensor/core/arithmetic/Negate.cuh
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
-* All rights reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-/*
-* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
-*/
-
-#ifndef __NEGATE_CUH__
-#define __NEGATE_CUH__
-
-#include "Negate.h"
-
-namespace nts { // namespace nts(NiuTrans.Tensor)
-
-#ifdef USE_CUDA
-
-/* set each entry to its negtive value (CUDA Kernel) */
-template <class T>
-__global__
-void KernelNegate(T * a, T * b, int size);
-
-/* set each entry to its negtive value */
-void _CudaNegate(const XTensor * a, XTensor * b);
-
-#endif // USE_CUDA
-
-} // namespace nts(NiuTrans.Tensor)
-
-#endif // __NEGATE_CUH__
\ No newline at end of file
--- a/source/tensor/core/arithmetic/Negate.h
+++ b/source/tensor/core/arithmetic/Negate.h
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
-* All rights reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-/*
-* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
-*/
-
-#ifndef __NEGATE_H__
-#define __NEGATE_H__
-
-#include "../../XTensor.h"
-
-namespace nts { // namespace nts(NiuTrans.Tensor)
-
-/* set every entry to its minus value */
-void _Negate(const XTensor * a, XTensor * b);
-
-/* 
-set every entry to its minus value (do it on site)
-keep the result in the input tensor a and return nothing
-*/
-void _NegateMe(XTensor * a);
-
-/* 
-set every entry to its minus value (return an XTensor structure)
-make a new tensor to keep the result and return it
-*/
-XTensor Negate(const XTensor & a);
-
-/* set every entry to its minus value */
-void Negate(const XTensor & a, XTensor & b, bool requireLink = false);
-
-} // namespace nts(NiuTrans.Tensor)
-
-#endif // __NEGATE_H__
--- a/source/tensor/core/arithmetic/Sign.cpp
+++ b/source/tensor/core/arithmetic/Sign.cpp
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
-* All rights reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-/*
-* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
-*/
-
-#include "../../XTensor.h"
-#include "../../XName.h"
-#include "Sign.h"
-#include "Sign.cuh"
-
-namespace nts { // namespace nts(NiuTrans.Tensor)
-
-/*
-set every entry to its sign value
->> a - input tensor we are processing
->> b - output tensor we are processing
-*/
-void _Sign(const XTensor * a, XTensor * b)
-{
-#ifdef USE_CUDA
-    /* run it on GPUs */
-    if (a->devID >= 0) {
-        _CudaSign(a, b);
-    return;
-}
-#endif
-
-    CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
-    CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
-    DTYPE * d = (DTYPE*)a->data;
-    DTYPE * db = (DTYPE*)b->data;
-    for (int i = 0; i < a->unitNum; i++) {
-        if (d[i] > 0)
-            db[i] = 1.0F;
-        else if (d[i] == 0)
-            db[i] = 0.0F;
-        else
-            db[i] = -1.0F;
-    }
-}
-
-/*
-set every entry to its sign value (do it on site)
-keep the result in the input tensor a and return nothing
->> a - the tensor we are processing
-*/
-void _SignMe(XTensor * a)
-{
-    _Sign(a, a);
-}
-
-/*
-set every entry to its sign value (return an XTensor structure)
-make a new tensor to keep the result and return it
->> a - input tensor we are processing
-<< return - the sign value of the input tensor
-*/
-XTensor Sign(const XTensor & a)
-{
-    XTensor b(&a);
-    b.SetTMPFlag();
-
-    /* call _Sign function */
-    _Sign(&a, &b);
-
-    /* tensor connections */
-    XLink::MakeLink(&a, NULL, &b, MATH_SIGN);
-
-    return b;
-}
-
-/*
-set every entry to its sign value
->> a - input tensor we are processing
->> b - output tensor we are processing
->> requireLink - if add operation to network
-*/
-void Sign(const XTensor & a, XTensor & b, bool requireLink)
-{
-    if (!b.isInit || !XTensor::IsSameShaped(&a, &b)) {
-        InitTensor(&b, &a);
-    }
-
-    /* call _Sign function */
-    _Sign(&a, &b);
-
-    if (requireLink) {
-        /* tensor connections */
-        XLink::MakeLink(&a, NULL, &b, MATH_SIGN);
-    }
-}
-} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/tensor/core/arithmetic/Sign.cu
+++ b/source/tensor/core/arithmetic/Sign.cu
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
-* All rights reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-/*
-* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
-* $Update by: Lin Ye (email: linye2015@outlook.com) 2019-07-24 float16 added
-*/
-
-#include "../../XDevice.h"
-#include "../../XTensor.h"
-#include "Sign.h"
-#include "Sign.cuh"
-
-namespace nts { // namespace nts(NiuTrans.Tensor)
-
-#ifdef USE_CUDA
-/*
-set each entry to its sign value (CUDA Kernel)
->> a - pointer to input data array
->> b - pointer to output data array
->> size - size of the data array
-*/
-template<class T>
-__global__
-void KernelSign(T * a, T * b, int size)
-{
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
-
-    if (i < size){
-        if (a[i] > (T)0)
-            b[i] = 1.0F;
-        else if (a[i] == (T)0)
-            b[i] = 0.0F;
-        else
-            b[i] = -1.0F;
-    }
-}
-
-/*
-set each entry to its sign value
->> a - input tensor we are processing
->> b - output tensor we are processing
-*/
-void _CudaSign(const XTensor * a, XTensor * b)
-{
-    CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
-    CheckNTErrors((a->isSparse == false), "TODO!");
-
-    int gridSize[3];
-    int blockSize[3];
-
-    GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
-
-    dim3 blocks(gridSize[0]);
-    dim3 threads(blockSize[0]);
-
-    int devIDBackup;
-    ProtectCudaDev(a->devID, devIDBackup);
-
-    if (a->dataType == DEFAULT_DTYPE) {
-        KernelSign<<<blocks, threads>>>((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum);
-    }
-    else if (a->dataType == X_FLOAT16) {
-        KernelSign<<<blocks, threads>>>((__half*)a->data, (__half*)b->data, a->unitNum);
-    }
-    else {
-        ShowNTErrors("TODO!");
-    }
-
-    BacktoCudaDev(a->devID, devIDBackup);
-}
-
-#endif // USE_CUDA
-} // namespace nts(NiuTrans.Tensor)
--- a/source/tensor/core/arithmetic/Sign.cuh
+++ b/source/tensor/core/arithmetic/Sign.cuh
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
-* All rights reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-/*
-* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
-*/
-
-#ifndef __SIGN_CUH__
-#define __SIGN_CUH__
-
-#include "Sign.h"
-
-namespace nts { // namespace nts(NiuTrans.Tensor)
-
-#ifdef USE_CUDA
-
-/* set each entry to its sign value (CUDA Kernel) */
-template<class T>
-__global__
-void KernelSign(T * a, T * b, int size);
-
-/* set each entry to its sign value */
-void _CudaSign(const XTensor * a, XTensor * b);
-
-#endif // USE_CUDA
-
-} // namespace nts(NiuTrans.Tensor)
-
-#endif // __SIGN_H__
\ No newline at end of file
--- a/source/tensor/core/arithmetic/Sign.h
+++ b/source/tensor/core/arithmetic/Sign.h
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
-* All rights reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-/*
-* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
-*/
-
-#ifndef __SIGN_H__
-#define __SIGN_H__
-
-#include "../../XTensor.h"
-
-namespace nts { // namespace nts(NiuTrans.Tensor)
-
-/* set every entry to its sign value */
-void _Sign(const XTensor * a, XTensor * b);
-
-/* 
-set every entry to its sign value (do it on site)
-keep the result in the input tensor a and return nothing
-*/
-void _SignMe(XTensor * a);
-
-/* 
-set every entry to its sign value  (return an XTensor structure)
-make a new tensor to keep the result and return it
-*/
-XTensor Sign(const XTensor & a);
-
-/* set every entry to its sign value */
-void Sign(const XTensor & a, XTensor & b, bool requireLink = false);
-
-} // namespace nts(NiuTrans.Tensor)
-
-#endif // __SIGN_H__
--- a/source/tensor/core/arithmetic/Sub.cpp
+++ b/source/tensor/core/arithmetic/Sub.cpp
@@ -128,6 +128,19 @@ void _SubMe(XTensor * a, const XTensor * b, DTYPE beta)
 }

 /*
+tensor subtraction a = a - b * \beta (do it on site)
+keep the result in the tensor a and return nothing
+
+>> a - a tensor
+>> b - another tensor
+>> beta - the scaling factor
+*/
+void SubMe(XTensor& a, const XTensor& b, DTYPE beta)
+{
+    _Sub(&a, &b, &a, beta);
+}
+  
+/* 
 return a dimension if the subtraction is performed as SubDim (in more details in SubDim.h)
 >> a - a tensor
 >> b - another tensor for subtraction
@@ -203,9 +216,8 @@ tensor subtraction c = a - b * \beta
 >> b - another tensor
 >> c - where we put a-b*\beta. we save it in a if c is NULL
 >> beta - the scaling factor
->> requireLink - if add operation to network
 */
-void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta, bool requireLink)
+void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta)
 {
    if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
        InitTensor(&c, &a);
@@ -217,7 +229,7 @@ void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta, bool requir
        /* call _Sub function */
        _Sub(&a, &b, &c, beta);

-        if (requireLink) {
+        if (c.enableGrad) {
            /* tensor connections */
            XLink::MakeLink(&a, &b, &c, MATH_SUB);
            XLink::AddParamToHead(&c, beta);
@@ -227,7 +239,7 @@ void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta, bool requir
        /* call _SubDim function */
        _SubDim(&a, &b, &c, n, beta);

-        if (requireLink) {
+        if (c.enableGrad) {
            /* tensor connections */
            XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
            XLink::AddParamToHeadInt(&c, n);

--- a/source/tensor/core/arithmetic/Sub.cu
+++ b/source/tensor/core/arithmetic/Sub.cu
@@ -47,6 +47,7 @@ void KernelSUB(T * a, T * b, T * c, int size, T beta)
        c[i] = a[i] - b[i] * beta;
 }

+
 /*
 tensor subtraction c = a - b * \beta (cuda version)
 >> a - a tensor
@@ -79,7 +80,7 @@ void _CudaSub(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
            GDevs.GetCudaThread(a->devID, a->unitNum, gridSize, blockSize);
            dim3 blocks(gridSize[0]);
            dim3 threads(blockSize[0]);
-            KernelSUB<<<blocks, threads>>>((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, a->unitNum, beta);
+            KernelSUB << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, a->unitNum, beta);
        }
        else if (a->dataType == X_FLOAT16 &&
                 b->dataType == X_FLOAT16 &&

--- a/source/tensor/core/arithmetic/Sub.h
+++ b/source/tensor/core/arithmetic/Sub.h
@@ -35,6 +35,7 @@ tensor subtraction a = a - b * \beta
 keep the result in the input tensor a and return nothing
 */
 void _SubMe(XTensor * a, const XTensor * b, DTYPE beta = (DTYPE)1.0);
+void SubMe(XTensor & a, const XTensor & b, DTYPE beta = (DTYPE)1.0);
    
 /*
 tensor subtraction c = a - b * \beta
@@ -43,7 +44,7 @@ make a new tensor c to keep the result and return it
 XTensor Sub(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0);

 /* tensor subtraction c = a - b * \beta */
-void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta = (DTYPE)1.0, bool requireLink = false);
+void Sub(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta = (DTYPE)1.0);

 } // namespace nts(NiuTrans.Tensor)


--- a/source/tensor/core/arithmetic/SubDim.cpp
+++ b/source/tensor/core/arithmetic/SubDim.cpp
@@ -183,9 +183,8 @@ i.e., a is subtracted with b by broadcasting
 >> c - where we put a-b*\beta. we save it in a if c is NULL
 >> n - the dimension index
 >> beta - the scaling factor
->> requireLink - if add operation to network
 */
-void SubDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta, bool requireLink)
+void SubDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta)
 {
    if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
        InitTensor(&c, &a);
@@ -194,7 +193,7 @@ void SubDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta, b
    /* call _Sub function */
    _SubDim(&a, &b, &c, n, beta);

-    if (requireLink) {
+    if (c.enableGrad) {
        /* tensor connections */
        XLink::MakeLink(&a, &b, &c, MATH_SUBDIM);
        XLink::AddParamToHeadInt(&c, n);

--- a/source/tensor/core/arithmetic/SubDim.cu
+++ b/source/tensor/core/arithmetic/SubDim.cu
--- a/source/tensor/core/arithmetic/SubDim.h
+++ b/source/tensor/core/arithmetic/SubDim.h
@@ -40,7 +40,7 @@ XTensor SubDim(const XTensor &a, const XTensor &b, int n, DTYPE beta = (DTYPE)1.

 /* tensor subtraction c = a - b * \beta where the size of b is equal to the n-th dimension of a, 
   i.e., a is subtracted with b by broadcasting*/
-void SubDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta = (DTYPE)1.0, bool requireLink = false);
+void SubDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta = (DTYPE)1.0);

 } // namespace nts(NiuTrans.Tensor)


--- a/source/tensor/core/arithmetic/Sum.cpp
+++ b/source/tensor/core/arithmetic/Sum.cpp
@@ -133,6 +133,19 @@ void _SumMe(XTensor * a, const XTensor * b, DTYPE beta)
 }

 /*
+tensor summation a = a + b * \beta (do it on site)
+keep the result in the tensor a and return nothing
+
+>> a - a tensor
+>> b - another tensor
+>> beta - the scaling factor
+*/
+void SumMe(XTensor& a, const XTensor& b, DTYPE beta)
+{
+    _Sum(&a, &b, &a, beta);
+}
+
+/* 
 return a dimension if the sum is performed as SumDim (in more details in SumDim.h)
 >> a - a tensor
 >> b - another tensor for sum
@@ -207,9 +220,8 @@ tensor summation c = a + b * \beta
 >> a - a tensor
 >> b - another tensor
 >> beta - the scaling factor
->> requireLink - if add operation to network
 */
-void Sum(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta, bool requireLink)
+void Sum(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta)
 {
    if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
        InitTensor(&c, &a);
@@ -221,7 +233,7 @@ void Sum(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta, bool requir
        /* call _Sum function */
        _Sum(&a, &b, &c, beta);

-        if (requireLink) {
+        if (c.enableGrad) {
            /* tensor connections */
            XLink::MakeLink(&a, &b, &c, MATH_SUM);
            XLink::AddParamToHead(&c, beta);
@@ -231,7 +243,7 @@ void Sum(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta, bool requir
        /* call _SumDim function */
        _SumDim(&a, &b, &c, n, beta);

-        if (requireLink) {
+        if (c.enableGrad) {
            /* tensor connections */
            XLink::MakeLink(&a, &b, &c, MATH_SUMDIM);
            XLink::AddParamToHeadInt(&c, n);

--- a/source/tensor/core/arithmetic/Sum.cu
+++ b/source/tensor/core/arithmetic/Sum.cu
@@ -48,15 +48,6 @@ void KernelADD(T * a, T * b, T * c, int size, T beta)

 }

-__global__
-void KernelADDInt(int * a, int * b, int * c, int size, DTYPE beta)
-{
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
-
-    if (i < size)
-        c[i] = a[i] + b[i] * (int)beta;
-
-}

 /*
 tensor summation c = a + b * \beta (cuda version)

--- a/source/tensor/core/arithmetic/Sum.h
+++ b/source/tensor/core/arithmetic/Sum.h
@@ -34,6 +34,7 @@ tensor summation a = a + b * \beta
 keep the result in the input tensor a and return nothing
 */
 void _SumMe(XTensor * a, const XTensor * b, DTYPE beta = (DTYPE)1.0);
+void SumMe(XTensor & a, const XTensor & b, DTYPE beta = (DTYPE)1.0);
    
 /*
 tensor summation c = a + b * \beta
@@ -42,7 +43,7 @@ make a new tensor c to keep the result and return it
 XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0);

 /* tensor summation c = a + b * \beta */
-void Sum(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta = (DTYPE)1.0, bool requireLink = false);
+void Sum(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta = (DTYPE)1.0);

 } // namespace nts(NiuTrans.Tensor)


--- a/source/tensor/core/arithmetic/SumByColumnTV.cpp
+++ b/source/tensor/core/arithmetic/SumByColumnTV.cpp
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
-* All rights reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-/*
-* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
-*/
-
-#include "../../XTensor.h"
-#include "SumByColumnTV.h"
-#include "SumByColumnTV.cuh"
-
-namespace nts { // namespace nts(NiuTrans.Tensor)
-
-/*
-sum of a tensor and a vector (column vector) in a column by column manner
-
-for each column a_col (in a block), we have
-c_col = a_col + b * \beta
-where b is a vector.
-
->> a - a tensor
->> b - a vector with the same column size with a
->> c - where we put a+b. we save it in a if c is NULL
->> beta - the scaling factor
-*/
-void _SumByColumnTV(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
-{
-    CheckNTErrors((a && b && c), "Empty input tensors!");
-    CheckNTErrors((XTensor::IsSameShaped(a, c)), "Unmatched tensors in addition!");
-    CheckNTErrors((b->order == 2 && b->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]),
-                  "Illegal input vector size!");
-
-    int rowNum = a->dimSize[0];
-    int colNum = a->dimSize[1];
-    int blockNum = 1;
-    for (int i = 2; i < a->order; i++)
-        blockNum *= a->dimSizeRDI[i];
-    int blockSize = colNum * rowNum;
-
-    if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
-#ifdef USE_CUDA
-        _CudaSumByColumnTV(a, b, c, beta);
-#endif
-    }
-    else {
-        if (!a->isSparse && !b->isSparse) {
-            CheckNTErrors(!c->isSparse, "TODO!");
-
-            if (a->dataType == DEFAULT_DTYPE &&
-                b->dataType == DEFAULT_DTYPE &&
-                c->dataType == DEFAULT_DTYPE)
-            {
-                for (int k = 0; k < blockNum; k++) {
-                    for (int i = 0; i < rowNum; i++) {
-                        DTYPE * ap = (DTYPE*)a->data + k * blockSize + i * colNum;
-                        DTYPE * bp = (DTYPE*)b->data;
-                        DTYPE * cp = (DTYPE*)c->data + k * blockSize + i * colNum;
-                        DTYPE v = bp[i];
-                        for (int j = 0; j < colNum; j++)
-                            cp[j] = ap[j] + v * beta;
-                    }
-                }
-            }
-            else {
-                // TODO!!
-                ShowNTErrors("TODO!");
-            }
-        }
-        else {
-            // TODO!!
-            ShowNTErrors("TODO!");
-        }
-    }
-}
-
-} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/tensor/core/arithmetic/SumByColumnTV.cu
+++ b/source/tensor/core/arithmetic/SumByColumnTV.cu
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
-* All rights reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-/*
-* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
-*/
-
-#include "../../XDevice.h"
-#include "../../XTensor.h"
-#include "SumByColumnTV.h"
-#include "SumByColumnTV.cuh"
-
-namespace nts { // namespace nts(NiuTrans.Tensor)
-
-#ifdef USE_CUDA
-
-/*
-summation of a tensor and a vector (column vector)
-c_col = a_col  + b * \beta
->> a - a tensor
->> b - a vector with the same column size with a
->> c - where we put a+b. we save it in a
->> colNum - column number (of a block)
->> blockSize - size of a block
->> size - size of the entire data array
->> beta - the scaling factor
-*/
-__global__
-void KernelADDByColumnTV(DTYPE * a, DTYPE * b, DTYPE * c, int colNum, int blockSize, int size, DTYPE beta)
-{
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
-
-    if (i >= size)
-        return;
-
-    int offset = i % blockSize;
-    int row = offset / colNum;
-
-    c[i] = a[i] + b[row] * beta;
-}
-
-/*
-summation of a tensor and a vector (column vector)
-for each column a_col (in a block), we have
-c_col = a_col + b * \beta
-where b is a vector.
-
->> a - a tensor
->> b - a vector with the same column size with a
->> c - where we put a+b. we save it in a if c is NULL
->> beta - the scaling factor
-*/
-void _CudaSumByColumnTV(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
-{
-    CheckNTErrors((a && b && c), "Empty input tensors!");
-    CheckNTErrors((XTensor::IsSameShaped(a, c)), "Unmatched tensors in addition!");
-    CheckNTErrors((b->order == 2 && b->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]),
-                  "Illegal input vector size!");
-    CheckNTErrors((a->dataType == DEFAULT_DTYPE && b->dataType == DEFAULT_DTYPE &&
-                  c->dataType == DEFAULT_DTYPE), "TODO");
-
-    int rowNum = a->dimSize[0];
-    int colNum = a->dimSize[1];
-    int blockNum = 1;
-    for (int i = 2; i < a->order; i++)
-        blockNum *= a->dimSizeRDI[i];
-
-    int cudaGridSize[3];
-    int cudaBlockSize[3];
-
-    GDevs.GetCudaThread(c->devID, a->unitNum, cudaGridSize, cudaBlockSize);
-
-    int devIDBackup;
-    ProtectCudaDev(a->devID, devIDBackup);
-
-    KernelADDByColumnTV << <dim3(cudaGridSize[0]), dim3(cudaBlockSize[0]) >> >
-                          ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, colNum, rowNum * colNum, a->unitNum, beta);
-
-    BacktoCudaDev(a->devID, devIDBackup);
-}
-
-#endif // USE_CUDA
-
-} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/tensor/core/arithmetic/SumByColumnTV.cuh
+++ b/source/tensor/core/arithmetic/SumByColumnTV.cuh
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
-* All rights reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-/*
-* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
-*/
-
-#ifndef __REDUCEMAX_CUH__
-#define __REDUCEMAX_CUH__
-
-#include "../reduce/ReduceMax.h"
-
-namespace nts { // namespace nts(NiuTrans.Tensor)
-
-#ifdef USE_CUDA
-
-/* summation of a tensor and a vector (column vector) */
-void _CudaSumByColumnTV(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
-
-#endif // USE_CUDA
-
-} // namespace nts(NiuTrans.Tensor)
-
-#endif // __REDUCEMAX_CUH__
-
--- a/source/tensor/core/arithmetic/SumByColumnTV.h
+++ b/source/tensor/core/arithmetic/SumByColumnTV.h
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
-* All rights reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-/*
-* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
-*/
-
-#ifndef __SUMBYCOLUMNTV_H__
-#define __SUMBYCOLUMNTV_H__
-
-#include "../../XTensor.h"
-
-namespace nts { // namespace nts(NiuTrans.Tensor)
-
-/* sum of a tensor and a (column) vector */
-void _SumByColumnTV(const XTensor * a, const XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
-
-} // namespace nts(NiuTrans.Tensor)
-
-#endif // __SUMBYCOLUMNTV_H__
--- a/source/tensor/core/arithmetic/SumByColumnVT.cpp
+++ b/source/tensor/core/arithmetic/SumByColumnVT.cpp
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
-* All rights reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-/*
-* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
-*/
-
-#include "../../XTensor.h"
-#include "SumByColumnVT.h"
-#include "SumByColumnVT.cuh"
-
-namespace nts { // namespace nts(NiuTrans.Tensor)
-
-/*
-sum of a vector (column vector) and a tensor in a column by column manner
-
-for each column b_col, we have
-c = a + \sum{col} b_col * \beta
-where c and a are vectors, and b_col is a column in b.
-
->> a - a tensor
->> b - a vector with the same column size with a
->> c - where we put a+b. we save it in a if c is NULL
->> beta - the scaling factor
-*/
-void _SumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
-{
-    CheckNTErrors((a && b && c), "Empty input tensors!");
-    CheckNTErrors((XTensor::IsSameShaped(a, c)), "Unmatched tensors in addition!");
-    CheckNTErrors((a->order == 2 && a->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]),
-                  "Illegal input vector size!");
-
-    if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
-#ifdef USE_CUDA
-        _CudaSumByColumnVT(a, b, c, beta);
-#endif
-    }
-    else {
-        int rowNum = b->dimSize[0];
-        int colNum = b->dimSize[1];
-        int blockNum = 1;
-        for (int i = 2; i < b->order; i++)
-            blockNum *= b->dimSizeRDI[i];
-        int blockSize = colNum * rowNum;
-
-        if (!a->isSparse && !b->isSparse) {
-            CheckNTErrors(!c->isSparse, "TODO!");
-
-            if (a->dataType == DEFAULT_DTYPE &&
-                b->dataType == DEFAULT_DTYPE &&
-                c->dataType == DEFAULT_DTYPE)
-            {
-                for (int k = 0; k < blockNum; k++) {
-                    for (int i = 0; i < rowNum; i++) {
-                        DTYPE * ap = (DTYPE*)a->data;
-                        DTYPE * bp = (DTYPE*)b->data + k * blockSize + i * colNum;
-                        DTYPE * cp = (DTYPE*)c->data;
-                        DTYPE sum = 0;
-                        for (int j = 0; j < colNum; j++)
-                            sum += bp[j];
-                        cp[i] = ap[i] + sum * beta;
-                    }
-                }
-            }
-            else {
-                // TODO!!
-                ShowNTErrors("TODO!");
-            }
-        }
-        else {
-            // TODO!!
-            ShowNTErrors("TODO!");
-        }
-    }
-}
-
-} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/tensor/core/arithmetic/SumByColumnVT.cu
+++ b/source/tensor/core/arithmetic/SumByColumnVT.cu
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
-* All rights reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-/*
-* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
-*/
-
-#include "../../XDevice.h"
-#include "../../XTensor.h"
-#include "SumByColumnVT.h"
-#include "SumByColumnVT.cuh"
-
-namespace nts { // namespace nts(NiuTrans.Tensor)
-
-#ifdef USE_CUDA
-
-/*
-summation of a vector (column vector) and a tensor
-c = a + \sum{col} b_col * \beta
->> a - a vector with the same column size with b
->> b - a tensor
->> c - where we put a+b. we save it in a
->> colNum - column number (of a block)
->> blockSize - size of a block
->> size - size of the entire data array
->> beta - the scaling factor
-*/
-__global__
-void KernelADDByColumnVT(DTYPE * a, DTYPE * b, DTYPE * c, int colNum, int rowNum, int blockNum, DTYPE beta)
-{
-    int row = blockDim.x * blockIdx.x + threadIdx.x;
-
-    if (row >= rowNum)
-        return;
-
-    DTYPE sum = 0;
-    for (int k = 0; k < blockNum; k++) {
-        DTYPE * bp = b + (rowNum * k + row) * colNum;
-        if (colNum % 4 == 0) {
-            for (int i = 0; i < colNum; i += 4)
-                sum += bp[i] + bp[i + 1] + bp[i + 2] + bp[i + 3];
-        }
-        else if (colNum % 2 == 0) {
-            for (int i = 0; i < colNum; i += 2)
-                sum += bp[i] + bp[i + 1];
-        }
-        else {
-            for (int i = 0; i < colNum; i++)
-                sum += bp[i];
-        }
-        __syncthreads();
-    }
-
-    c[row] = a[row] + beta * sum;
-}
-
-/*
-summation of a vector (column vector) and a tensor
-
-for each column b_col, we have
-c = a + \sum{col} b_col * \beta
-where c and a are vectors, and b_col is a column in b.
-
->> a - a vector with the same column size with b
->> b - a tensor
->> c - where we put a+b. we save it in a if c is NULL
->> beta - the scaling factor
-*/
-void _CudaSumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
-{
-    CheckNTErrors((a && b && c), "Empty input tensors!");
-    CheckNTErrors((XTensor::IsSameShaped(a, c)), "Unmatched tensors in addition!");
-    CheckNTErrors((a->order == 2 && a->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]),
-                  "Illegal input vector size!");
-    CheckNTErrors((a->dataType == DEFAULT_DTYPE && b->dataType == DEFAULT_DTYPE &&
-                  c->dataType == DEFAULT_DTYPE), "TODO");
-
-    int rowNum = b->dimSize[0];
-    int colNum = b->dimSize[1];
-    int blockNum = 1;
-    for (int i = 2; i < b->order; i++)
-        blockNum *= b->dimSizeRDI[i];
-
-    int cudaGridSize[3];
-    int cudaBlockSize[3];
-
-    GDevs.GetCudaThread(c->devID, a->dimSizeRDI[1], cudaGridSize, cudaBlockSize);
-
-    int devIDBackup = 0;
-    ProtectCudaDev(a->devID, devIDBackup);
-
-    KernelADDByColumnVT << <dim3(cudaGridSize[0]), dim3(cudaBlockSize[0]) >> >
-                         ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data, colNum, rowNum, blockNum, beta);
-
-    BacktoCudaDev(a->devID, devIDBackup);
-}
-#endif // USE_CUDA
-
-} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/tensor/core/arithmetic/SumByColumnVT.cuh
+++ b/source/tensor/core/arithmetic/SumByColumnVT.cuh
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
-* All rights reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-/*
-* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
-*/
-
-#ifndef __SUMBYCOLUMNVT_CUH__
-#define __SUMBYCOLUMNVT_CUH__
-
-#include "SumByColumnVT.h"
-
-namespace nts { // namespace nts(NiuTrans.Tensor)
-
-#ifdef USE_CUDA
-
-/* summation of a vector (column vector) and a tensor */
-void _CudaSumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
-
-#endif // USE_CUDA
-
-} // namespace nts(NiuTrans.Tensor)
-
-#endif // __SUMBYCOLUMNVT_CUH__
-
--- a/source/tensor/core/arithmetic/SumByColumnVT.h
+++ b/source/tensor/core/arithmetic/SumByColumnVT.h
-/* NiuTrans.Tensor - an open-source tensor library
-* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
-* All rights reserved.
-*
-* Licensed under the Apache License, Version 2.0 (the "License");
-* you may not use this file except in compliance with the License.
-* You may obtain a copy of the License at
-*
-*   http://www.apache.org/licenses/LICENSE-2.0
-*
-* Unless required by applicable law or agreed to in writing, software
-* distributed under the License is distributed on an "AS IS" BASIS,
-* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-* See the License for the specific language governing permissions and
-* limitations under the License.
-*/
-
-/*
-* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
-*/
-
-#ifndef __SUMBYCOLUMNVT_H__
-#define __SUMBYCOLUMNVT_H__
-
-#include "../../XTensor.h"
-
-namespace nts { // namespace nts(NiuTrans.Tensor)
-
-/* sum of a (column) vector and a tensor */
-void _SumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
-
-} // namespace nts(NiuTrans.Tensor)
-
-#endif // __SUMBYCOLUMNVT_H__
--- a/source/tensor/core/arithmetic/SumDim.cpp
+++ b/source/tensor/core/arithmetic/SumDim.cpp
@@ -200,9 +200,8 @@ i.e., a is summed with b by broadcasting
 >> c - where we put a+b*\beta. we save it in a if c is NULL
 >> n - the dimension index
 >> beta - the scaling factor
->> requireLink - if add operation to network
 */
-void SumDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta, bool requireLink)
+void SumDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta)
 {
    if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
        InitTensor(&c, &a);
@@ -211,7 +210,7 @@ void SumDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta, b
    /* call _SumDim function */
    _SumDim(&a, &b, &c, n, beta);

-    if (requireLink) {
+    if (c.enableGrad) {
        /* tensor connections */
        XLink::MakeLink(&a, &b, &c, MATH_SUMDIM);
        XLink::AddParamToHeadInt(&c, n);
@@ -368,9 +367,8 @@ c = a + b * \beta
 >> b - another tensor that would be broadcasted
 >> c - the resulting tensor
 >> beta - the scaling factor
->> requireLink - if add operation to network
 */
-void SumBroadcast(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta, bool requireLink)
+void SumBroadcast(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta)
 {
    if (!c.isInit || !XTensor::IsSameShaped(&a, &c)) {
        InitTensor(&c, &a);
@@ -379,7 +377,7 @@ void SumBroadcast(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta, bo
    /* call _SumBroadcast function */
    _SumBroadcast(&a, &b, &c, beta);

-    if (requireLink) {
+    if (c.enableGrad) {
        /* tensor connections */
        XLink::MakeLink(&a, &b, &c, MATH_SUMBROADCAST);
        XLink::AddParamToHead(&c, beta);

--- a/source/tensor/core/arithmetic/SumDim.cu
+++ b/source/tensor/core/arithmetic/SumDim.cu
@@ -88,17 +88,17 @@ void KernelAddWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize
    int col = colIndex % colNum;
    int block = colIndex / colNum;

-    if(row >= rowNum || block >= blockNum)
+    if (row >= rowNum || block >= blockNum)
        return;

-    if(threadIdx.x == 0)
+    if (threadIdx.x == 0)
        bv[threadIdx.y] = b[row];

    __syncthreads();

    int offset = block * blockSize + row * colNum + col;

-    if(betaFired)
+    if (betaFired)
        c[offset] = a[offset] + bv[threadIdx.y] * beta;
    else
        c[offset] = a[offset] + bv[threadIdx.y];

--- a/source/tensor/core/arithmetic/SumDim.h
+++ b/source/tensor/core/arithmetic/SumDim.h
@@ -44,7 +44,7 @@ XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta = (DTYPE)1.

 /* tensor summation c = a + b * \beta where the size of b is equal to the n-th dimension of a, 
   i.e., a is summed with b by broadcasting */
-void SumDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta = (DTYPE)1.0, bool requireLink = false);
+void SumDim(const XTensor &a, const XTensor &b, XTensor &c, int n, DTYPE beta = (DTYPE)1.0);

 /* tensor broadcast summation c = a + b * \beta where some of dimensions of b can be of size 1 */
 void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
@@ -54,7 +54,7 @@ void _SumBroadcast(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta
 XTensor SumBroadcast(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0);

 /* tensor broadcast summation c = a + b * \beta where some of dimensions of b can be of size 1 */
-void SumBroadcast(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta = (DTYPE)1.0, bool requireLink = false);
+void SumBroadcast(const XTensor &a, const XTensor &b, XTensor &c, DTYPE beta = (DTYPE)1.0);
    
 } // namespace nts(NiuTrans.Tensor)


--- a/source/tensor/core/arithmetic/XTensorBLAS.cpp
+++ b/source/tensor/core/arithmetic/XTensorBLAS.cpp
--- a/source/tensor/core/arithmetic/XTensorBLAS.cu
+++ b/source/tensor/core/arithmetic/XTensorBLAS.cu
--- a/source/tensor/core/getandset/ConvertDataType.cpp
+++ b/source/tensor/core/getandset/ConvertDataType.cpp
--- a/source/tensor/core/getandset/ConvertDataType.cu
+++ b/source/tensor/core/getandset/ConvertDataType.cu
--- a/source/tensor/core/getandset/ConvertDataType.cuh
+++ b/source/tensor/core/getandset/ConvertDataType.cuh
--- a/source/tensor/core/getandset/ConvertDataType.h
+++ b/source/tensor/core/getandset/ConvertDataType.h
--- a/source/tensor/core/getandset/SetData.cpp
+++ b/source/tensor/core/getandset/SetData.cpp
--- a/source/tensor/core/getandset/SetData.cu
+++ b/source/tensor/core/getandset/SetData.cu
--- a/source/tensor/core/getandset/SetData.cuh
+++ b/source/tensor/core/getandset/SetData.cuh
--- a/source/tensor/core/getandset/SetData.h
+++ b/source/tensor/core/getandset/SetData.h
--- a/source/tensor/core/math/Binary.cpp
+++ b/source/tensor/core/math/Binary.cpp
--- a/source/tensor/core/math/Binary.cu
+++ b/source/tensor/core/math/Binary.cu
--- a/source/tensor/core/math/Binary.cuh
+++ b/source/tensor/core/math/Binary.cuh
--- a/source/tensor/core/math/Binary.h
+++ b/source/tensor/core/math/Binary.h
--- a/source/tensor/core/math/Clip.cpp
+++ b/source/tensor/core/math/Clip.cpp
--- a/source/tensor/core/math/Clip.cuh
+++ b/source/tensor/core/math/Clip.cuh
--- a/source/tensor/core/math/Clip.h
+++ b/source/tensor/core/math/Clip.h
--- a/source/tensor/core/math/Compare.cpp
+++ b/source/tensor/core/math/Compare.cpp
--- a/source/tensor/core/math/Compare.h
+++ b/source/tensor/core/math/Compare.h
--- a/source/tensor/core/math/Normalize.cpp
+++ b/source/tensor/core/math/Normalize.cpp
--- a/source/tensor/core/math/Normalize.cu
+++ b/source/tensor/core/math/Normalize.cu
--- a/source/tensor/core/math/Normalize.h
+++ b/source/tensor/core/math/Normalize.h
--- a/source/tensor/core/math/ScaleAndShift.cpp
+++ b/source/tensor/core/math/ScaleAndShift.cpp
--- a/source/tensor/core/math/ScaleAndShift.cu
+++ b/source/tensor/core/math/ScaleAndShift.cu
--- a/source/tensor/core/math/ScaleAndShift.h
+++ b/source/tensor/core/math/ScaleAndShift.h
--- a/source/tensor/core/math/Unary.cpp
+++ b/source/tensor/core/math/Unary.cpp
--- a/source/tensor/core/math/Unary.cu
+++ b/source/tensor/core/math/Unary.cu
--- a/source/tensor/core/math/Unary.cuh
+++ b/source/tensor/core/math/Unary.cuh
--- a/source/tensor/core/math/Unary.h
+++ b/source/tensor/core/math/Unary.h
--- a/source/tensor/core/movement/CopyIndexed.cpp
+++ b/source/tensor/core/movement/CopyIndexed.cpp
--- a/source/tensor/core/movement/CopyIndexed.h
+++ b/source/tensor/core/movement/CopyIndexed.h
--- a/source/tensor/core/movement/CopyValues.cpp
+++ b/source/tensor/core/movement/CopyValues.cpp
--- a/source/tensor/core/movement/CopyValues.cu
+++ b/source/tensor/core/movement/CopyValues.cu
--- a/source/tensor/core/movement/Gather.cpp
+++ b/source/tensor/core/movement/Gather.cpp
--- a/source/tensor/core/movement/Gather.cu
+++ b/source/tensor/core/movement/Gather.cu
--- a/source/tensor/core/movement/Gather.h
+++ b/source/tensor/core/movement/Gather.h
--- a/source/tensor/core/movement/Spread.cpp
+++ b/source/tensor/core/movement/Spread.cpp
--- a/source/tensor/core/reduce/ReduceMax.cpp
+++ b/source/tensor/core/reduce/ReduceMax.cpp
--- a/source/tensor/core/reduce/ReduceMax.cu
+++ b/source/tensor/core/reduce/ReduceMax.cu
--- a/source/tensor/core/reduce/ReduceMax.h
+++ b/source/tensor/core/reduce/ReduceMax.h
--- a/source/tensor/core/reduce/ReduceMean.cpp
+++ b/source/tensor/core/reduce/ReduceMean.cpp
--- a/source/tensor/core/reduce/ReduceMean.h
+++ b/source/tensor/core/reduce/ReduceMean.h
--- a/source/tensor/core/reduce/ReduceSum.cpp
+++ b/source/tensor/core/reduce/ReduceSum.cpp
--- a/source/tensor/core/reduce/ReduceSum.cu
+++ b/source/tensor/core/reduce/ReduceSum.cu
--- a/source/tensor/core/reduce/ReduceSum.h
+++ b/source/tensor/core/reduce/ReduceSum.h
--- a/source/tensor/core/reduce/ReduceSumSquared.cpp
+++ b/source/tensor/core/reduce/ReduceSumSquared.cpp
--- a/source/tensor/core/reduce/ReduceSumSquared.h
+++ b/source/tensor/core/reduce/ReduceSumSquared.h
--- a/source/tensor/core/reduce/ReduceVariance.cpp
+++ b/source/tensor/core/reduce/ReduceVariance.cpp
--- a/source/tensor/core/reduce/ReduceVariance.h
+++ b/source/tensor/core/reduce/ReduceVariance.h
--- a/source/tensor/core/shape/Concatenate.cpp
+++ b/source/tensor/core/shape/Concatenate.cpp
--- a/source/tensor/core/shape/Concatenate.h
+++ b/source/tensor/core/shape/Concatenate.h
--- a/source/tensor/core/shape/ConcatenateSolely.cpp
+++ b/source/tensor/core/shape/ConcatenateSolely.cpp
--- a/source/tensor/core/shape/Merge.cpp
+++ b/source/tensor/core/shape/Merge.cpp
--- a/source/tensor/core/shape/Merge.h
+++ b/source/tensor/core/shape/Merge.h
--- a/source/tensor/core/shape/Permute.h
+++ b/source/tensor/core/shape/Permute.h
--- a/source/tensor/core/shape/Reshape.cpp
+++ b/source/tensor/core/shape/Reshape.cpp
--- a/source/tensor/core/shape/Reshape.h
+++ b/source/tensor/core/shape/Reshape.h
--- a/source/tensor/core/shape/Split.cpp
+++ b/source/tensor/core/shape/Split.cpp
--- a/source/tensor/core/shape/Split.h
+++ b/source/tensor/core/shape/Split.h
--- a/source/tensor/core/shape/Squeeze.cpp
+++ b/source/tensor/core/shape/Squeeze.cpp
--- a/source/tensor/core/shape/Squeeze.h
+++ b/source/tensor/core/shape/Squeeze.h
--- a/source/tensor/core/shape/Unsqueeze.cpp
+++ b/source/tensor/core/shape/Unsqueeze.cpp
--- a/source/tensor/core/shape/Unsqueeze.h
+++ b/source/tensor/core/shape/Unsqueeze.h
--- a/source/tensor/core/sort/Sort.cpp
+++ b/source/tensor/core/sort/Sort.cpp
--- a/source/tensor/core/sort/Sort.cu
+++ b/source/tensor/core/sort/Sort.cu
--- a/source/tensor/core/sort/Sort.h
+++ b/source/tensor/core/sort/Sort.h
--- a/source/tensor/core/sort/TopK.cu
+++ b/source/tensor/core/sort/TopK.cu
--- a/source/tensor/core/utilities/SetAscendingOrder.cu
+++ b/source/tensor/core/utilities/SetAscendingOrder.cu
--- a/source/tensor/core/utilities/XMatrixSegment.cpp
+++ b/source/tensor/core/utilities/XMatrixSegment.cpp
--- a/source/tensor/function/FHeader.h
+++ b/source/tensor/function/FHeader.h
--- a/source/tensor/function/HardTanH.cpp
+++ b/source/tensor/function/HardTanH.cpp
--- a/source/tensor/function/HardTanH.cu
+++ b/source/tensor/function/HardTanH.cu
--- a/source/tensor/function/HardTanH.cuh
+++ b/source/tensor/function/HardTanH.cuh
--- a/source/tensor/function/HardTanH.h
+++ b/source/tensor/function/HardTanH.h
--- a/source/tensor/function/Identity.cpp
+++ b/source/tensor/function/Identity.cpp
--- a/source/tensor/function/Identity.h
+++ b/source/tensor/function/Identity.h
--- a/source/tensor/function/LogSoftmax.cu
+++ b/source/tensor/function/LogSoftmax.cu
--- a/source/tensor/function/LogSoftmax.cuh
+++ b/source/tensor/function/LogSoftmax.cuh
--- a/source/tensor/function/LogSoftmax.h
+++ b/source/tensor/function/LogSoftmax.h
--- a/source/tensor/function/Loss.cu
+++ b/source/tensor/function/Loss.cu
--- a/source/tensor/function/Rectify.cpp
+++ b/source/tensor/function/Rectify.cpp
--- a/source/tensor/function/Rectify.cu
+++ b/source/tensor/function/Rectify.cu
--- a/source/tensor/function/Rectify.cuh
+++ b/source/tensor/function/Rectify.cuh
--- a/source/tensor/function/Rectify.h
+++ b/source/tensor/function/Rectify.h
--- a/source/tensor/function/Sigmoid.cpp
+++ b/source/tensor/function/Sigmoid.cpp
--- a/source/tensor/function/Sigmoid.cu
+++ b/source/tensor/function/Sigmoid.cu
--- a/source/tensor/function/Sigmoid.cuh
+++ b/source/tensor/function/Sigmoid.cuh
--- a/source/tensor/function/Sigmoid.h
+++ b/source/tensor/function/Sigmoid.h
--- a/source/tensor/function/Softmax.cpp
+++ b/source/tensor/function/Softmax.cpp
--- a/source/tensor/function/Softmax.h
+++ b/source/tensor/function/Softmax.h
--- a/source/tensor/loss/CrossEntropy.cpp
+++ b/source/tensor/loss/CrossEntropy.cpp
--- a/source/tensor/loss/CrossEntropy.cu
+++ b/source/tensor/loss/CrossEntropy.cu
--- a/source/tensor/test/TAbsolute.cpp
+++ b/source/tensor/test/TAbsolute.cpp
--- a/source/tensor/test/TClip.cpp
+++ b/source/tensor/test/TClip.cpp
--- a/source/tensor/test/TCompare.cpp
+++ b/source/tensor/test/TCompare.cpp
--- a/source/tensor/test/TConcatenate.cpp
+++ b/source/tensor/test/TConcatenate.cpp
--- a/source/tensor/test/TConcatenateSolely.cpp
+++ b/source/tensor/test/TConcatenateSolely.cpp
--- a/source/tensor/test/TConvertDataType.cpp
+++ b/source/tensor/test/TConvertDataType.cpp
--- a/source/tensor/test/TCos.cpp
+++ b/source/tensor/test/TCos.cpp
--- a/source/tensor/test/TDiv.cpp
+++ b/source/tensor/test/TDiv.cpp
--- a/source/tensor/test/TDivDim.cpp
+++ b/source/tensor/test/TDivDim.cpp
--- a/source/tensor/test/TExp.cpp
+++ b/source/tensor/test/TExp.cpp
--- a/source/tensor/test/TGather.cpp
+++ b/source/tensor/test/TGather.cpp
--- a/source/tensor/test/TGather.h
+++ b/source/tensor/test/TGather.h
--- a/source/tensor/test/THardTanH.cpp
+++ b/source/tensor/test/THardTanH.cpp
--- a/source/tensor/test/TIdentity.cpp
+++ b/source/tensor/test/TIdentity.cpp
--- a/source/tensor/test/TLog.cpp
+++ b/source/tensor/test/TLog.cpp
--- a/source/tensor/test/TLogSoftmax.cpp
+++ b/source/tensor/test/TLogSoftmax.cpp
--- a/source/tensor/test/TMatrixMul.cpp
+++ b/source/tensor/test/TMatrixMul.cpp
--- a/source/tensor/test/TMatrixMul.h
+++ b/source/tensor/test/TMatrixMul.h
--- a/source/tensor/test/TMerge.cpp
+++ b/source/tensor/test/TMerge.cpp
--- a/source/tensor/test/TMultiply.cpp
+++ b/source/tensor/test/TMultiply.cpp
--- a/source/tensor/test/TMultiplyDim.cpp
+++ b/source/tensor/test/TMultiplyDim.cpp
--- a/source/tensor/test/TNegate.cpp
+++ b/source/tensor/test/TNegate.cpp
--- a/source/tensor/test/TNegate.h
+++ b/source/tensor/test/TNegate.h
--- a/source/tensor/test/TNormalize.cpp
+++ b/source/tensor/test/TNormalize.cpp
--- a/source/tensor/test/TPower.cpp
+++ b/source/tensor/test/TPower.cpp
--- a/source/tensor/test/TPower.h
+++ b/source/tensor/test/TPower.h
--- a/source/tensor/test/TRectify.cpp
+++ b/source/tensor/test/TRectify.cpp
--- a/source/tensor/test/TReduceMax.cpp
+++ b/source/tensor/test/TReduceMax.cpp
--- a/source/tensor/test/TReduceSum.cpp
+++ b/source/tensor/test/TReduceSum.cpp
--- a/source/tensor/test/TRound.cpp
+++ b/source/tensor/test/TRound.cpp
--- a/source/tensor/test/TScaleAndShift.cpp
+++ b/source/tensor/test/TScaleAndShift.cpp
--- a/source/tensor/test/TSetData.cpp
+++ b/source/tensor/test/TSetData.cpp
--- a/source/tensor/test/TSigmoid.cpp
+++ b/source/tensor/test/TSigmoid.cpp
--- a/source/tensor/test/TSign.cpp
+++ b/source/tensor/test/TSign.cpp
--- a/source/tensor/test/TSign.h
+++ b/source/tensor/test/TSign.h
--- a/source/tensor/test/TSin.cpp
+++ b/source/tensor/test/TSin.cpp
--- a/source/tensor/test/TSoftmax.cpp
+++ b/source/tensor/test/TSoftmax.cpp
--- a/source/tensor/test/TSplit.cpp
+++ b/source/tensor/test/TSplit.cpp
--- a/source/tensor/test/TSpread.cpp
+++ b/source/tensor/test/TSpread.cpp
--- a/source/tensor/test/TSub.cpp
+++ b/source/tensor/test/TSub.cpp
--- a/source/tensor/test/TSubDim.cpp
+++ b/source/tensor/test/TSubDim.cpp
--- a/source/tensor/test/TSum.cpp
+++ b/source/tensor/test/TSum.cpp
--- a/source/tensor/test/TSum.h
+++ b/source/tensor/test/TSum.h
--- a/source/tensor/test/TSumByColumnTV.cpp
+++ b/source/tensor/test/TSumByColumnTV.cpp
--- a/source/tensor/test/TSumByColumnTV.h
+++ b/source/tensor/test/TSumByColumnTV.h
--- a/source/tensor/test/TSumByColumnVT.cpp
+++ b/source/tensor/test/TSumByColumnVT.cpp
--- a/source/tensor/test/TSumByColumnVT.h
+++ b/source/tensor/test/TSumByColumnVT.h
--- a/source/tensor/test/TSumDim.cpp
+++ b/source/tensor/test/TSumDim.cpp
--- a/source/tensor/test/TTan.cpp
+++ b/source/tensor/test/TTan.cpp
--- a/source/tensor/test/TTranspose.cpp
+++ b/source/tensor/test/TTranspose.cpp
--- a/source/tensor/test/Test.cpp
+++ b/source/tensor/test/Test.cpp
--- a/source/tensor/test/Test.h
+++ b/source/tensor/test/Test.h