Commit b3a76184 by xuchen

Big change: 1. modify all interfaces; 2. modify the test cases; 3. merge with the latest code from xiao.

parent 2ed5a029
...@@ -21,12 +21,16 @@
#include <stdio.h>
#include "XNet.h"
#include "../tensor/function/FHeader.h"
#include "../tensor/core/CHeader.h"
#include "../sample/fnnlm/FNNLM.h"
//#define CRTDBG_MAP_ALLOC
//#include <stdlib.h>
//#include <crtdbg.h>
using namespace nts;
using namespace samplefnnlm;
int main( int argc, const char ** argv )
...@@ -34,15 +38,43 @@ int main( int argc, const char ** argv )
if(argc > 1 && !strcmp(argv[1], "-test"))
1;//Test();
else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
FNNLMMain(argc - 1, argv + 1);
else{
fprintf(stderr, "Thanks for using NiuTrans.Network! This is a library for building\n");
fprintf(stderr, "neural networks in an easy way. \n\n");
fprintf(stderr, "Run this program with \"-test\" for unit test!\n");
fprintf(stderr, "Or run this program with \"-fnnlm\" for sample FNNLM!\n");
}
XNet net;
XTensor a;
XTensor b;
XTensor c;
InitTensor2D(&a, 2, 2);
InitTensor2D(&b, 2, 4);
InitTensor2D(&c, 2, 4);
a.SetZeroAll();
b.SetZeroAll();
c.SetZeroAll();
SetDataFixed(a, 0.1F);
a.Set2D(0.3F, 1, 0);
a.Set2D(0.4F, 1, 1);
b = Merge(a, a, 1);
c = HTanH(MMul(a, b));
a.Dump(stderr, "a:");
b.Dump(stderr, "b:");
c.Dump(stderr, "c:");
XLink::ShowNetwork(stderr, &c);
net.Backward(c);
net.Dump(stderr);
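/* A sketch of what the toy graph above computes (assuming Merge concatenates along the
   given dimension and MMul is a plain 2-D matrix product):
       a : 2 x 2, filled with 0.1 except a(1,0) = 0.3 and a(1,1) = 0.4
       b = Merge(a, a, 1)      -> 2 x 4, i.e. a concatenated with itself along dim 1
       c = HTanH(MMul(a, b))   -> 2 x 4, since (2 x 2) * (2 x 4) = 2 x 4
   net.Backward(c) should then fill a.grad (2 x 2) and b.grad (2 x 4). */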
//_CrtDumpMemoryLeaks();
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* backward computation for activation function
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-18
* Dingdang won 5 games in the GO training yesterday, hahaha ...
*/
#include "XNoder.h"
#include "XBackwardFunc.h"
#include "../tensor/XName.h"
#include "../tensor/function/FHeader.h"
namespace nts{
/* compute dE/dx of a node */
void XFuncGrad::MakeGrad(XTensor * node)
{
XLink &income = node->income;
int operID = income.typeID;
CheckNTErrors(node->grad != NULL, "No gradient found!");
CheckNTErrors(income.tailNum == 1, "Too many input tensors for the function!");
XTensor * input = income.tails[0];
XTensor * output = node;
XNoder::MakeGrad(input);
if(operID == FUNC_HARDTANH)
_HardTanHBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
else if(operID == FUNC_IDENTITY)
_IdentityBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
else if(operID == FUNC_LOGSOFTMAX){
int leadDim = income.GetParamInt(0);
_LogSoftmaxBackward(NULL, output, input, output->grad, input->grad, leadDim, NOLOSS);
}
else if(operID == FUNC_RECTIFY)
_RectifyBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
else if(operID == FUNC_SIGMOID)
_SigmoidBackward(NULL, output, input, output->grad, input->grad, NOLOSS);
else if(operID == FUNC_SOFTMAX){
int leadDim = income.GetParamInt(0);
_SoftmaxBackward(NULL, output, input, output->grad, input->grad, leadDim, NOLOSS);
}
else{
ShowNTErrors("Wrong activation function type!");
}
}
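/* A worked instance for the FUNC_HARDTANH branch above (a sketch, assuming the usual
   definition hardtanh(x) = -1 for x < -1, x for -1 <= x <= 1, 1 for x > 1):
       y = hardtanh(x)  =>  dy/dx = 1 if -1 < x < 1, else 0
   so _HardTanHBackward is expected to set dE/dx = dE/dy elementwise where the input lies
   inside (-1, 1) and 0 elsewhere. */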
/* indicates whether the node is for an activation function */
bool XFuncGrad::IsFunc(XTensor * node)
{
XLink &income = node->income;
return (income.typeID & FUNCTION_BASE) != 0;
}
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* backward computation for activation function
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-18
* Dingdang won 5 games in the GO training yesterday, hahaha ...
*/
#include "../tensor/XTensor.h"
#include "../tensor/function/FHeader.h"
#ifndef __XBACKWARDFUNC_H__
#define __XBACKWARDFUNC_H__
namespace nts{
/* this class computes the gradient for activation functions given a node */
class XFuncGrad
{
public:
/* compute dE/dx of a node */
static
void MakeGrad(XTensor * node);
/* indicates whether the node is for an activation function */
static
bool IsFunc(XTensor * node);
};
}
#endif
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-17
*/
#include "XBackwardLoss.h"
#include "../tensor/XName.h"
#include "../tensor/function/HardTanH.h"
#include "../tensor/function/LogSoftmax.h"
namespace nts{
/*
compute dE/dx for a given function y = f(x)
>> gold - gold standard to measure error (or loss)
>> y - output of the function
>> x - input of the function
>> dedy - dE/dy
>> dedx - dE/dx
>> funcID - id of the function f
>> params - parameters of the function
>> lossName - name of the loss, e.g., cross entropy
*/
void XLossGrad::Compute(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
int funcID, void * params,
LOSS_FUNCTION_NAME lossName)
{
CheckNTErrors(gold && y && x, "Empty input tensors!");
CheckNTErrors(dedx, "Empty gradient tensors!");
CheckNTErrors((funcID & FUNCTION_BASE) != 0, "Illegal function id");
if(funcID == FUNC_HARDTANH){
_HardTanHBackward(gold, y, x, dedy, dedx, lossName);
}
else if(funcID == FUNC_LOGSOFTMAX){
int leadDim = *(int*)params;
_LogSoftmaxBackward(gold, y, x, dedy, dedx, leadDim, lossName);
}
else{
ShowNTErrors("wrong function found when call the backward process!");
}
}
/*
compute dE/dy for variable y and error(loss) function E
>> gold - gold standard to measure error (or loss)
>> y - output of the function
>> dedy - dE/dy
>> lossName - name of the loss, e.g., cross entropy
*/
void XLossGrad::Compute(XTensor * gold, XTensor * y,
XTensor * dedy,
LOSS_FUNCTION_NAME lossName)
{
_LossBackward(dedy, gold, y, lossName);
}
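/* A worked equation for this output-side gradient (a sketch, assuming _LossBackward
   follows the textbook definitions): for squared error E = 1/2 * ||y - gold||^2 we get
   dE/dy = y - gold, and for cross entropy with a probability output, dE/dy = -gold / y
   (elementwise). The actual formula used is whatever lossName selects. */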
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-17
* My students worked all night to prepare a submission to CWMT. Good luck
* to them!
*/
#include "../tensor/XTensor.h"
#include "../tensor/function/FHeader.h"
#ifndef __XBACKWARDLOSS_H__
#define __XBACKWARDLOSS_H__
namespace nts{
/* this class computes the gradient (of an output node)
with respect to the loss */
class XLossGrad
{
public:
/* compute dE/dx for a given function y = f(x) */
void Compute(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
int funcID, void * params,
LOSS_FUNCTION_NAME lossName);
/* compute dE/dy for variable y and error(loss) function E */
void Compute(XTensor * gold, XTensor * y,
XTensor * dedy,
LOSS_FUNCTION_NAME lossName);
};
}
#endif
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* backward computation for math operations
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-18
*/
#include "XNoder.h"
#include "XBackwardMath.h"
#include "../tensor/XName.h"
#include "../tensor/core/CHeader.h"
namespace nts{
/* compute dE/dx of a node */
void XMathGrad::MakeGrad(XTensor * node)
{
CheckNTErrors(node->grad != NULL, "No gradient found!");
XLink &income = node->income;
int operID = income.typeID;
if(operID == MATH_SUM)
GradSum(node);
else if(operID == MATH_MULTIPLY)
GradMultiply(node);
else if(operID == MATH_MATRIXMUL)
GradMatrixMul(node);
else{
ShowNTErrors("TODO!");
}
}
/* indicates whether the node is for a math operation */
bool XMathGrad::IsMathOP(XTensor * node)
{
XLink &income = node->income;
return (income.typeID & MATH_BASE) != 0;
}
/*
gradient for sum
for
c = a + b * \beta
we have
dE/da = dE/dc
dE/db = dE/dc * \beta
>> node - the node (c) for backward computation
*/
void XMathGrad::GradSum(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for SUM!");
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
DTYPE beta = income.GetParam(0);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
_Sum(a->grad, node->grad, a->grad);
_Sum(b->grad, node->grad, b->grad, beta);
}
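/* A small numeric check for the rule above (a sketch): with c = a + b * beta, beta = 2
   and dE/dc = [[1, 1], [1, 1]], the two _Sum calls accumulate
       dE/da += [[1, 1], [1, 1]]    (dE/dc itself)
       dE/db += [[2, 2], [2, 2]]    (dE/dc scaled by beta)
   on top of whatever gradient a and b already hold. */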
/*
gradient for multiply (element-wise product)
for
c = a * b
we have
dE/da = dE/dc * b
dE/db = dE/dc * a
>> node - the node (c) for backward computation
*/
void XMathGrad::GradMultiply(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for MULTIPLY!");
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
CheckNTErrors(XTensor::IsIdentical(a, b), "Wrong sized input tensors!");
_Multiply(node->grad, b, a->grad, 1.0F);
_Multiply(node->grad, a, b->grad, 1.0F);
}
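/* A small numeric check for the element-wise product rule above (a sketch, assuming the
   trailing 1.0F makes _Multiply accumulate into the existing gradient): with a = [2, 3],
   b = [4, 5], c = a * b = [8, 15] and dE/dc = [1, 1], the two calls accumulate
   dE/da += dE/dc * b = [4, 5] and dE/db += dE/dc * a = [2, 3]. */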
/*
gradient for matrix multiply
for c = matmul(a, b) * \alpha
we have
dE/da = dE/dc * b^T * \alpha
dE/db = a^T * dE/dc * \alpha
>> node - the node (c) for backward computation
*/
void XMathGrad::GradMatrixMul(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for MATRIXMUL!");
CheckNTErrors(income.paramNum == 3, "Wrong parameter number for MATRIXMUL!");
XTensor * a = income.tails[0];
XTensor * b = income.tails[1];
MATRIX_TRANS_TYPE transA = income.GetParamTrans(0);
MATRIX_TRANS_TYPE transB = income.GetParamTrans(1);
DTYPE alpha = income.GetParam(2);
XNoder::MakeGrad(a);
XNoder::MakeGrad(b);
XTensor * dedc = node->grad;
XTensor * deda = a->grad;
XTensor * dedb = b->grad;
/* c = a * b * \alpha */
if(transA == X_NOTRANS && transB == X_NOTRANS){
/* dE/da = dE/dc * b^T * \alpha */
_MatrixMul(dedc, X_NOTRANS, b, X_TRANS, deda, alpha, 1.0F);
/* dE/db = a^T * dE/dc * \alpha */
_MatrixMul(a, X_TRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F);
}
/* c = a^T * b * \alpha */
else if(transA == X_TRANS && transB == X_NOTRANS){
/* dE/da = dE/dc * b^T * \alpha */
_MatrixMul(dedc, X_NOTRANS, b, X_TRANS, deda, alpha, 1.0F);
/* dE/db = a * dE/dc * \alpha */
_MatrixMul(a, X_NOTRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F);
}
/* c = a * b^T * \alpha */
else if(transA == X_NOTRANS && transB == X_TRANS){
/* dE/da = dE/dc * b * \alpha */
_MatrixMul(dedc, X_NOTRANS, b, X_NOTRANS, deda, alpha, 1.0F);
/* dE/db = a^T * dE/dc * \alpha */
_MatrixMul(a, X_TRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F);
}
/* c = a^T * b^T * \alpha */
else if(transA == X_TRANS && transB == X_TRANS){
/* dE/da = dE/dc * b * \alpha */
_MatrixMul(dedc, X_NOTRANS, b, X_NOTRANS, deda, alpha, 1.0F);
/* dE/db = a * dE/dc * \alpha */
_MatrixMul(a, X_NOTRANS, dedc, X_NOTRANS, dedb, alpha, 1.0F);
}
}
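/* A shape check for the X_NOTRANS / X_NOTRANS case above (a sketch): if a is m x k and
   b is k x n, then c and dE/dc are m x n, and
       dE/da = dE/dc * b^T * alpha   ->  (m x n)(n x k) = m x k
       dE/db = a^T * dE/dc * alpha   ->  (k x m)(m x n) = k x n
   which matches the shapes of a and b, as required for accumulating into their grads. */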
}
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* backward computation for math operations
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-18
*/
#include "../tensor/XTensor.h"
#ifndef __XBACKWARDMATH_H__
#define __XBACKWARDMATH_H__
namespace nts{
/* this class computes the gradient for math operations given a node */
class XMathGrad
{
public:
/* compute dE/dx of a node */
static
void MakeGrad(XTensor * node);
/* indicates whether the node is for a math operation */
static
bool IsMathOP(XTensor * node);
private:
/* gradient for sum: c = a + b * \beta */
static
void GradSum(XTensor * node);
/* gradient for multiply (element-wise product): c = a * b */
static
void GradMultiply(XTensor * node);
/* gradient for matrix multiply: c = matmul(a, b) */
static
void GradMatrixMul(XTensor * node);
};
}
#endif
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* backward computation for shaping and data movement
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-19
* It was chilly when I came into the office this morning ...
* because I forgot to turn the air-conditioner off last night :(
*/
#include "XNoder.h"
#include "XBackwardShape.h"
#include "../tensor/XName.h"
#include "../tensor/core/CHeader.h"
namespace nts{
/* compute dE/dx of a node */
void XShapeGrad::MakeGrad(XTensor * node)
{
CheckNTErrors(node->grad != NULL, "No gradient found!");
XLink &income = node->income;
int operID = income.typeID;
if(operID == SHAPE_MERGE)
GradMerge(node);
else if(operID == SHAPE_MERGE_LIST)
GradMergeList(node);
else if(operID == SHAPE_UNSQUEEZE)
GradUnsqueeze(node);
else{
ShowNTErrors("TODO!");
}
}
/* indicates whether the node is for a shaping operation */
bool XShapeGrad::IsShapeOP(XTensor * node)
{
XLink &income = node->income;
return (income.typeID & DATA_BASE) != 0;
}
/*
gradient for merge
for
c = merge(a_0, a_1, ...)
where a_i is the i-th block in a tensor a
we have
dE/da_0 = dE/dc_{split_0}
dE/db_1 = dE/dc_{split_1}
...
i.e.,
dE/da = split(dE/dc)
>> node - the node (c) for backward computation
*/
void XShapeGrad::GradMerge(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for MERGE!");
XTensor * input = income.tails[0];
int whereToMerge = income.GetParamInt(0);
int leadDim = income.GetParamInt(1);
int blockSize = 1;
int blockNum = 1;
for(int i = 0; i < input->order; i++){
if(i < leadDim)
blockNum *= input->dimSize[i];
}
blockSize = input->GetDataSizeInChar() / blockNum;
XNoder::MakeGrad(input);
int * dims = new int[input->order];
for(int i = 0, j = 0; i < input->order; i++){
if(i >= leadDim){
dims[j++] = input->dimSize[i];
}
}
dims[0] = -dims[0];
XTensor gradInputSmall(input->order - leadDim, dims,
input->dataType, input->denseRatio,
input->devID, input->mem);
dims[whereToMerge - leadDim] *= dims[0];
XTensor gradNodeSmall(node->order - leadDim, dims,
node->dataType, node->denseRatio,
node->devID, node->mem);
/* we can simply split the gradient tensor
if the input is used in merging only */
if(input->outgo.tailNum == 1){
for(int i = 0; i < blockNum; i++){
gradNodeSmall.data = (char*)node->grad->data + i * blockSize;
gradInputSmall.data = (char*)input->grad->data + i * blockSize;
_Split(&gradNodeSmall, &gradInputSmall, whereToMerge - leadDim, input->dimSize[leadDim]);
}
}
/* a more complicated case is that the input tensor is used for
other operations somewhere else. So we have to do gradient
accumulation after splitting, i.e., we need an additional
SUM operation */
else{
XTensor gradInputSmallBuf(&gradInputSmall);
for(int i = 0; i < blockNum; i++){
gradNodeSmall.data = (char*)node->grad->data + i * blockSize;
gradInputSmall.data = (char*)input->grad->data + i * blockSize;
_Split(&gradNodeSmall, &gradInputSmallBuf, whereToMerge - leadDim, input->dimSize[leadDim]);
_Sum(&gradInputSmall, &gradInputSmallBuf, &gradInputSmall);
}
}
gradNodeSmall.data = NULL;
gradInputSmall.data = NULL;
delete[] dims;
}
/*
gradient for merging a list of tensors
for
c = merge(list(a, b, ...))
where a, b ... are of the same size
we have
dE/da = dE/dc_{split_0}
dE/db = dE/dc_{split_1}
i.e.,
list(dE/da, dE/db, ...) = split(dE/dc)
>> node - the node (c) for backward computation
*/
void XShapeGrad::GradMergeList(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum > 0, "Wrong input tensor number for MERGE!");
XTensor * last = NULL;
XList smalls(income.tailNum);
XList smallsGrad(income.tailNum);
bool mergeOnly = true;
for(int i = 0; i < income.tailNum; i++){
XTensor * tail = income.tails[i];
XNoder::MakeGrad(tail);
smalls.Add(tail);
smallsGrad.Add(tail->grad);
if(i > 1){
CheckNTErrors(XTensor::IsIdentical(last, tail),
"Input tensors must be of the same size!");
}
if(tail->outgo.tailNum > 1)
mergeOnly = false;
last = tail;
}
int whereToMerge = income.GetParamInt(0);
/* we can simply split the gradient tensor into the input tensors
if the inputs are used in merging only */
if(mergeOnly)
_Split(node->grad, &smallsGrad, whereToMerge, smalls.count);
/* a more complicated case is that the input tensors are used for
other operations somewhere else. So we have to do gradient
accumulation after splitting, i.e., we need an additional
SUM operation */
else{
int * dims = new int[last->order + 1];
dims[0] = smalls.count;
for(int i = 0; i < last->order; i++)
dims[i + 1] = last->dimSize[i];
XTensor gradSplit(last->order + 1, dims,
last->dataType, last->denseRatio,
last->devID, last->mem);
_Split(node->grad, &gradSplit, whereToMerge, smalls.count);
memcpy(dims, last->dimSize, sizeof(int) * last->order);
dims[0] = -dims[0];
XTensor gradSmall(last->order, dims,
last->dataType, last->denseRatio,
last->devID, last->mem);
/* gradient accumulation for each split */
for(int i = 0; i < smalls.count; i++){
XTensor * inputGrad = (XTensor*)smallsGrad.Get(i);
gradSmall.data = (char*)gradSplit.data + i * last->unitNum * last->unitSize;
_Sum(inputGrad, &gradSmall, inputGrad);
}
gradSmall.data = NULL;
delete[] dims;
}
}
/*
gradient for unsqueezing a tensor
for
c = unsqueeze(a)
we have
dE/da = reducesum(dE/dc)
>> node - the node (c) for backward computation
*/
void XShapeGrad::GradUnsqueeze(XTensor * node)
{
XLink &income = node->income;
CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for UNSQUEEZE!");
XTensor * output = node;
XTensor * input = income.tails[0];
XNoder::MakeGrad(input);
int dim = income.GetParamInt(0);
int dSize = income.GetParamInt(1);
CheckNTErrors(dSize == output->GetDim(dim), "Wrong dim size for UNSQUEEZE!");
CheckNTErrors(output->unitNum == input->unitNum * dSize, "Wrong tensor size!");
_ReduceSum(output->grad, input->grad, dim);
}
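/* A worked instance for the rule above (a sketch, assuming Unsqueeze copies the input
   dSize times along the new dimension dim): if a is 2 x 3 and c = unsqueeze(a, 1, 4) is
   2 x 4 x 3, then dE/da(i, j) = sum_k dE/dc(i, k, j), i.e. exactly the reduce-sum over
   dim that the _ReduceSum call performs. */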
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* backward computation for shaping and data movement
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-18
*/
#include "../tensor/XTensor.h"
#include "../tensor/function/FHeader.h"
#ifndef __XBACKWARDSHAPE_H__
#define __XBACKWARDSHAPE_H__
namespace nts{
/* this class computes the gradient for tensor shaping and movement given a node */
class XShapeGrad
{
public:
/* compute dE/dx of a node */
static
void MakeGrad(XTensor * node);
/* indicates whether the node is for a shaping operation */
static
bool IsShapeOP(XTensor * node);
private:
/* gradient for merge: c = merge(a, b, ...) */
static
void GradMerge(XTensor * node);
/* gradient for merging a list of tensors : c = merge(list(a, b, ...)) */
static
void GradMergeList(XTensor * node);
/* gradient for unsqueezing a tensor : c = unsqueeze(a) */
static
void GradUnsqueeze(XTensor * node);
};
}
#endif
\ No newline at end of file
...@@ -20,6 +20,12 @@
*/
#include "XNet.h"
#include "XNoder.h"
#include "XBackwardLoss.h"
#include "XBackwardMath.h"
#include "XBackwardFunc.h"
#include "XBackwardShape.h"
#include "../tensor/XName.h"
namespace nts{
...@@ -78,6 +84,22 @@ void XNet::Backward(XTensor &root, XTensor &gold, LOSS_FUNCTION_NAME loss)
}
/*
backward propagation to obtain gradient
>> root - root node (output) of the network
>> loss - name of loss function
*/
void XNet::Backward(XTensor &root, LOSS_FUNCTION_NAME loss)
{
XList roots(1);
roots.Add(&root);
XList golds(1);
golds.Add(NULL);
Backward(roots, golds, loss);
}
/*
backward propagation to obtain gradient wrt. the loss/error function
with a number of root nodes
>> root - a list of root nodes (output) of the network
...@@ -87,6 +109,85 @@ with a number of root nodes
void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
{
Traverse(roots);
for(int i = 0; i < nodes.count; i++){
XTensor * node = (XTensor*)nodes.Get(i);
node->visitMark = NODE_UNFINISHED;
}
XLossGrad lossGrad;
/* we start with the gradient with respect to the loss for output layers */
for(int i = 0; i < roots.count; i++){
XTensor * root = (XTensor*)roots.Get(i);
XTensor * gold = (XTensor*)golds.Get(i);
XLink &income = root->income;
int funcID = income.typeID;
void * params = income.params;
/* we compute dE/dx if the output is generated by an activation function y = f(x).
Note that we do not need to obtain dE/dy here because it is of no use in the
following process of back-propagation */
if(gold != NULL && income.tailNum == 1 && (funcID & FUNCTION_BASE)){
XTensor * x = income.tails[0];
XNoder::MakeGrad(x);
lossGrad.Compute(gold, root, x, NULL, x->grad, funcID, params, loss);
root->visitMark = NODE_FINISHED;
}
/* we compute dE/dy (y is the output) if no predefined activation function is used */
else{
XNoder::MakeGrad(root);
lossGrad.Compute(gold, root, root->grad, loss);
}
}
/* back-propagation from output to input */
for(int i = nodes.count - 1; i >= 0; i--){
XTensor * node = (XTensor*)nodes.Get(i);
if(node->visitMark == NODE_FINISHED)
continue;
BackwardNode(node);
}
}
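/* A minimal usage sketch for the routine above (hypothetical tensors; it mirrors the toy
   example in Network/Main.cpp and the FNNLM sample):
       XNet net;
       XTensor y = LogSoftmax(MMul(x, w), 1);   // x and w are assumed to be initialized
       net.Backward(y, gold, CROSSENTROPY);     // single-root overload declared in XNet.h
       // afterwards x.grad and w.grad should hold dE/dx and dE/dw
   With gold given and the root produced by an activation function, the first branch above
   computes dE/dx directly and marks the root as finished. */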
/*
backward propagation to obtain gradient
with a number of root nodes
>> root - a list of root nodes (output) of the network
>> loss - name of loss function
*/
void XNet::Backward(XList &roots, LOSS_FUNCTION_NAME loss)
{
XList golds(roots.count);
for(int i = 0; i < roots.count; i++)
golds.Add(NULL);
Backward(roots, golds, loss);
}
/*
backward computation for a given node
>> node - the node keeps the result of an operation (e.g., activation function)
*/
void XNet::BackwardNode(XTensor * node)
{
if(node == NULL || node->visitMark == NODE_FINISHED)
return;
if(!XNoder::IsLeaf(node)){
if(XMathGrad::IsMathOP(node))
XMathGrad::MakeGrad(node);
else if(XFuncGrad::IsFunc(node))
XFuncGrad::MakeGrad(node);
else if(XShapeGrad::IsShapeOP(node))
XShapeGrad::MakeGrad(node);
else{
ShowNTErrors("Wrong node type!");
}
}
node->visitMark = NODE_FINISHED;
}
/*
...@@ -115,6 +216,15 @@ void XNet::Traverse(XList &roots)
for (int i = 0; i < roots.count; i++)
TarjanVisit((XTensor*)roots.Get(i), nodes, id);
for(int i = 0; i < nodes.count; i++){
XTensor * node = (XTensor*)nodes.Get(i);
if(XNoder::IsRoot(node))
outputs.Add(node);
if(XNoder::IsLeaf(node))
inputs.Add(node);
if(XNoder::IsGrad(node))
gradNodes.Add(node);
}
}
/*
...@@ -145,4 +255,22 @@ void XNet::TarjanVisit(XTensor * node, XList &orders, const unsigned int code)
}
}
/*
dump network information
>> file - the file for dumping
*/
void XNet::Dump(FILE * file)
{
for(int i = 0; i < nodes.count; i++){
XTensor * node = (XTensor*)nodes.Get(i);
fprintf(file, "node %d: %d\n", i, node->id);
node->Dump(file, "tensor: ");
if(node->grad != NULL)
node->grad->Dump(file, "grad: ");
else
fprintf(file, "no gradient!\n");
fprintf(file, "\n");
}
}
}
\ No newline at end of file
...@@ -57,11 +57,21 @@ struct XNet
void Clear();
/* backward propagation to obtain gradient wrt. the loss/error function */
void Backward(XTensor &root, XTensor &gold, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward propagation to obtain gradient */
void Backward(XTensor &root, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward propagation to obtain gradient wrt. the loss/error function
with a number of root nodes */
void Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward propagation to obtain gradient
with a number of root nodes */
void Backward(XList &roots, LOSS_FUNCTION_NAME loss = NOLOSS);
/* backward computation for a given node */
void BackwardNode(XTensor * node);
/* traverse the net and find the topological order by
depth-first search (Tarjan's algorithm) */
...@@ -73,6 +83,9 @@ struct XNet
/* depth-first search given a node (Tarjan's algorithm for topological ordering) */
void TarjanVisit(XTensor * node, XList &orders, const unsigned int code);
/* dump network information */
void Dump(FILE * file);
};
/* we make a unique id for every tensor */
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-18
*/
#include "XNoder.h"
namespace nts{
/* make gradient tensor for a node */
void XNoder::MakeGrad(XTensor * node)
{
if(node == NULL)
return;
if(!XTensor::IsIdentical(node, node->grad)){
delete node->grad;
node->grad = NewTensor(node);
node->grad->SetZeroAll();
}
}
/* the node is a leaf node (input) or not */
bool XNoder::IsLeaf(XTensor * node)
{
if(node == NULL)
return false;
if(node->income.tailNum == 0)
return true;
else
return false;
}
/* the node is a root node (output) or not */
bool XNoder::IsRoot(XTensor * node)
{
if(node == NULL)
return false;
if(node->outgo.tailNum == 0)
return true;
else
return false;
}
/* the node keeps the gradient or not */
bool XNoder::IsGrad(XTensor * node)
{
if(node == NULL)
return false;
if(node->isGrad)
return true;
else
return false;
}
}
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* low-level utilities
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-18
*/
#include "../tensor/XTensor.h"
#ifndef __XNODER_H__
#define __XNODER_H__
namespace nts{
#define NODE_UNFINISHED 0
#define NODE_DOING 1
#define NODE_FINISHED 2
/* node management */
class XNoder
{
public:
/* make gradient tensor for a node */
static
void MakeGrad(XTensor * node);
/* the node is a leaf node (input) or not */
static
bool IsLeaf(XTensor * node);
/* the node is a root node (output) or not */
static
bool IsRoot(XTensor * node);
/* the node keeps the gradient or not */
static
bool IsGrad(XTensor * node);
};
}
#endif
\ No newline at end of file
...@@ -27,10 +27,11 @@
#include <math.h>
#include "FNNLM.h"
#include "../../tensor/XGlobal.h"
#include "../../tensor/XUtility.h"
#include "../../tensor/XDevice.h"
#include "../../tensor/function/FHeader.h"
#include "../../network/XNet.h"
namespace samplefnnlm
{
...@@ -50,6 +51,7 @@ float minmax = 0.08F; // range [-p,p] for parameter initializati
int sentBatch = 0; // batch size at the sentence level
int wordBatch = 1; // batch size at the word level
bool shuffled = false; // shuffled the training data file or not
bool autoDiff = false; // indicator of automatic differentiation
void LoadArgs(int argc, const char ** argv, FNNModel &model);
void Init(FNNModel &model);
...@@ -59,7 +61,7 @@ void Clear(FNNModel &model);
void InitModelTensor1D(XTensor &tensor, int num, FNNModel &model);
void InitModelTensor2D(XTensor &tensor, int rowNum, int colNum, FNNModel &model);
void Train(const char * train, bool isShuffled, FNNModel &model);
void Update(FNNModel &model, FNNModel &grad, float epsilon, bool isNodeGrad);
float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs = NULL);
void Dump(const char * fn, FNNModel &model);
void Read(const char * fn, FNNModel &model);
...@@ -71,6 +73,8 @@ void MakeWordBatch(XTensor &batch, NGram * ngrams, int ngramNum, int n, int vSiz
void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net);
void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NAME loss,
FNNModel &model, FNNModel &grad, FNNNet &net);
void FBInOne(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NAME loss,
FNNModel &model, XNet &net);
/*
entry of the program
...@@ -98,6 +102,7 @@ arguments:
-devid D: the id of the device used
-1: CPU, >=0: GPUs
-mempool: use memory pools for memory management
-autodiff: use automatic differentiation for training
where S=string, D=integer and F=float.
All words in the training and test data files
...@@ -182,6 +187,8 @@ void LoadArgs(int argc, const char ** argv, FNNModel &model)
wordBatch = atoi(argv[i + 1]);
if(!strcmp(argv[i], "-shuffle"))
shuffled = true;
if(!strcmp(argv[i], "-autodiff"))
autoDiff = true;
if(!strcmp(argv[i], "-dev") && i + 1 < argc)
model.devID = atoi(argv[i + 1]);
}
...@@ -350,6 +357,9 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
FNNModel grad;
Copy(grad, model);
/* XNet for automatic differentiation */
XNet autoDiffer;
double startT = GetClockSec();
/* iterate for a number of epochs */
...@@ -380,9 +390,6 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
/* the gold standard */
XTensor gold;
/* make the input tensor for position i */
for(int i = 0; i < model.n - 1; i++)
MakeWordBatch(inputs[i], ngrams, ngramNum, i, model.vSize, model.devID, model.mem);
...@@ -390,17 +397,29 @@
/* make the gold tensor */
MakeWordBatch(gold, ngrams, ngramNum, model.n - 1, model.vSize, model.devID, model.mem);
if(!autoDiff){
/* prepare an empty network for building the fnn */
FNNNet net;
/* gradient = 0 */
Clear(grad);
/* forward computation */
Forward(inputs, output, model, net);
/* backward computation to obtain gradients */
Backward(inputs, output, gold, CROSSENTROPY, model, grad, net);
/* update model parameters */
Update(model, grad, learningRate, false);
}
else{
/* forward + backward process */
FBInOne(inputs, output, gold, CROSSENTROPY, model, autoDiffer);
/* update model parameters */
Update(model, grad, learningRate, true);
}
/* get probabilities */
float prob = GetProb(output, gold);
...@@ -442,26 +461,45 @@ update the model parameters using the delta rule
>> model - the model to update
>> grad - gradients
>> epsilon - learning rate
>> isNodeGrad - indicates whether the gradient is associated with the node
*/
void Update(FNNModel &model, FNNModel &grad, float epsilon, bool isNodeGrad)
{
XList paraList(10);
XList gradList(10);
paraList.Add(&model.outputW);
paraList.Add(&model.outputB);
for (int i = 0; i < model.hDepth; i++) {
paraList.Add(&model.hiddenW[i]);
paraList.Add(&model.hiddenB[i]);
}
paraList.Add(&model.embeddingW);
if(!isNodeGrad){
gradList.Add(&grad.outputW);
gradList.Add(&grad.outputB);
for (int i = 0; i < model.hDepth; i++) {
gradList.Add(&grad.hiddenW[i]);
gradList.Add(&grad.hiddenB[i]);
}
gradList.Add(&grad.embeddingW);
}
else{
gradList.Add(model.outputW.grad);
gradList.Add(model.outputB.grad);
for (int i = 0; i < model.hDepth; i++) {
gradList.Add(model.hiddenW[i].grad);
gradList.Add(model.hiddenB[i].grad);
}
gradList.Add(model.embeddingW.grad);
}
for (int i = 0; i < paraList.count; i++) {
XTensor * para = (XTensor*)paraList.GetItem(i);
...@@ -773,7 +811,7 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
/* for y = softmax(s), we get dE/ds
where E is the error function (defined by loss) */
_LogSoftmaxBackward(&gold, &y, &s, NULL, &deds, 1, loss);
/* for s = x * w, we get
dE/dw_{i,j} = dE/ds_j * ds/dw_{i,j}
...@@ -818,7 +856,7 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
/* backpropagation through the activation function:
dE/ds = dE/dh * dh/ds */
_HardTanHBackward(NULL, &h, &s, &dedh, &deds, NOLOSS);
/* gradient of the weight: dE/dw = x^T * dE/ds */
_MatrixMul(&x, X_TRANS, &deds, X_NOTRANS, &dedw);
...@@ -863,6 +901,55 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
}
}
/*
forward + backward in one procedure
>> inputs - input word representations
>> output - output probability
>> gold - gold standard
>> loss - loss function name
>> model - the fnn model
*/
void FBInOne(XTensor inputs[], XTensor &output, XTensor &gold,
LOSS_FUNCTION_NAME loss, FNNModel &model, XNet &net)
{
int batchSize = gold.GetDim(0);
int n = model.n;
int depth = model.hDepth;
XTensor words;
XTensor embeddingBig;
XTensor hidden;
XTensor b;
XList inputList(n - 1);
for(int i = 0; i < n - 1; i++)
inputList.Add(inputs + i);
/* represent n - 1 words in one tensor */
words = Merge(inputList, 0);
/* word embedding */
embeddingBig = MMul(words, model.embeddingW);
/* input of the first hidden layer */
hidden = Split(embeddingBig, 0, n - 1);
hidden = Merge(hidden, 2, 0);
/* hidden layers */
for(int i = 0; i < depth; i++){
b = Unsqueeze(model.hiddenB[i], 1, batchSize);
hidden = MMul(hidden, model.hiddenW[i]) + b;
}
b = Unsqueeze(model.outputB, 1, batchSize);
/* output layer */
output = LogSoftmax(MMul(hidden, model.outputW) + b, 1);
/* automatic differentiation */
net.Backward(output);
}
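/* A rough shape sketch for the graph built above (assuming Merge/Split concatenate and
   split along the stated dimensions, and using the model's eSize/hSize for the embedding
   and hidden sizes):
       words        : ((n - 1) * batchSize) x vSize
       embeddingBig : ((n - 1) * batchSize) x eSize
       hidden       : batchSize x ((n - 1) * eSize), then batchSize x hSize per layer
       output       : batchSize x vSize   (log-probabilities over the vocabulary)
   net.Backward(output) then back-propagates through every node recorded by these calls. */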
/*
dump the model to the disk space
>> fn - where to keep the model
......
...@@ -30,9 +30,9 @@
#ifndef __FNNLM_H__
#define __FNNLM_H__
#include "../../tensor/XGlobal.h"
#include "../../tensor/XTensor.h"
#include "../../tensor/core/CHeader.h"
using namespace nts;
......
...@@ -28,7 +28,6 @@
#include <time.h>
#include "XTensor.h"
#include "XDevice.h"
#include "./test/Test.h"
//#define CRTDBG_MAP_ALLOC
...@@ -36,7 +35,6 @@
//#include <crtdbg.h>
using namespace nts;
void SmallTest();
...@@ -45,21 +43,17 @@ int main( int argc, const char ** argv )
//_CrtSetBreakAlloc(123);
/* a tiny test */
SmallTest();
//_CrtDumpMemoryLeaks();
//return 0;
if(argc > 1 && !strcmp(argv[1], "-test"))
Test();
else{
fprintf(stderr, "Thanks for using NiuTrans.Tensor! This is a library that eases the\n");
fprintf(stderr, "use of tensors. All you need is to ... \n\n");
fprintf(stderr, "Run this program with \"-test\" for unit test!\n");
}
//_CrtDumpMemoryLeaks();
......
...@@ -25,7 +25,7 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
int XLink::paramSize = PARAM_UNTI_SIZE;
/* constructor */
XLink::XLink()
...@@ -114,6 +118,8 @@ void XLink::ClearOutgoing(XTensor * node)
outgo.ClearTail();
outgo.typeID = 0;
outgo.type[0] = 0;
delete[] (char*)outgo.params;
outgo.params = NULL;
}
/*
...@@ -152,6 +154,8 @@ void XLink::ClearIncoming(XTensor * node)
income.ClearTail();
income.typeID = 0;
income.type[0] = 0;
delete[] (char*)income.params;
income.params = NULL;
}
/*
...@@ -210,7 +214,7 @@ add a parameter
void XLink::AddParam(DTYPE param)
{
void * ps = params;
params = new char[(paramNum + 1) * paramSize];
memcpy(params, ps, paramNum * paramSize);
DTYPE * p = (DTYPE*)((char*)params + paramNum * paramSize);
*p = param;
...@@ -226,7 +230,7 @@ add a parameter
void XLink::AddParam(void * param, int size)
{
void * ps = params;
params = new char[(paramNum + 1) * paramSize];
memcpy(params, ps, paramNum * paramSize);
char * p = (char*)params + paramNum * paramSize;
memcpy(p, param, size);
...@@ -235,6 +239,42 @@ void XLink::AddParam(void * param, int size)
}
/*
get a parameter in default type
>> i - id of the parameter
<< return - the parameter in default type
*/
DTYPE XLink::GetParam(int i)
{
CheckNTErrors(params != NULL, "parameter array cannot be empty!");
char * p = (char*)params + i * paramSize;
return *(DTYPE*)p;
}
/*
get a parameter in integer
>> i - id of the parameter
<< return - the parameter in integer
*/
int XLink::GetParamInt(int i)
{
CheckNTErrors(params != NULL, "parameter array cannot be empty!");
char * p = (char*)params + i * paramSize;
return *(int*)p;
}
/*
get a parameter in MATRIX_TRANS_TYPE
>> i - id of the parameter
<< return - the parameter in MATRIX_TRANS_TYPE
*/
MATRIX_TRANS_TYPE XLink::GetParamTrans(int i)
{
CheckNTErrors(params != NULL, "parameter array cannot be empty!");
char * p = (char*)params + i * paramSize;
return *(MATRIX_TRANS_TYPE*)p;
}
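/* A sketch of the parameter store these getters read (assuming PARAM_UNTI_SIZE stays 64):
   params is a flat char buffer of paramNum fixed-width slots, one value per slot, so slot
   i always starts at (char*)params + i * paramSize. For a MatrixMul node presumably built
   with
       XLink::AddParamToHeadTrans(&c, X_NOTRANS);   // slot 0
       XLink::AddParamToHeadTrans(&c, X_TRANS);     // slot 1
       XLink::AddParamToHead(&c, 2.0F);             // slot 2
   the readers give c.income.GetParamTrans(0) == X_NOTRANS, GetParamTrans(1) == X_TRANS and
   GetParam(2) == 2.0F, matching how XMathGrad::GradMatrixMul reads them. */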
/*
create a hyperedge with two input tensors and a output tensor create a hyperedge with two input tensors and a output tensor
>> t1 - a tail tensor >> t1 - a tail tensor
>> t2 - another tail tensor >> t2 - another tail tensor
...@@ -288,14 +328,44 @@ void XLink::MakeLink(const XList * list, XTensor * h, int id) ...@@ -288,14 +328,44 @@ void XLink::MakeLink(const XList * list, XTensor * h, int id)
} }
/* /*
create a hyper edge with an input tensor and a list of output tensors
>> t - the input tensor
>> list - a list of output tensors
>> id - id of the edge type
*/
void XLink::MakeLink(XTensor * t, XList * list, int id)
{
/* forward */
for(int i = 0; i < list->count; i++){
XTensor * h = (XTensor*)list->GetItem(i);
if(h == NULL)
continue;
XLink &income = h->income;
income.Reset();
income.SetHead(h);
income.SetType(id);
income.AddTail(t);
}
/* backward */
XLink &outgo = t->outgo;
CheckNTErrors(outgo.head == NULL || outgo.head == t, "Wrong head of the hyperedge!");
for(int i = 0; i < list->count; i++){
XTensor * t = (XTensor*)list->GetItem(i);
if(t == NULL)
continue;
outgo.AddTail(t);
}
}
/*
add parameters add parameters
>> h - head >> h - head
>> param - parameter we want introduce >> param - parameter we want introduce
*/ */
void XLink::AddParamToHead(XTensor * h, DTYPE param) void XLink::AddParamToHead(XTensor * h, DTYPE param)
{ {
if(h != NULL) CheckNTErrors(h != NULL, "head tensor cannot be empty!");
return;
h->income.AddParam(param); h->income.AddParam(param);
} }
...@@ -306,8 +376,7 @@ add an integer parameter
*/
void XLink::AddParamToHeadInt(XTensor * h, int param)
{
CheckNTErrors(h != NULL, "head tensor cannot be empty!");
h->income.AddParam(&param, sizeof(int));
}
...@@ -318,8 +387,7 @@ add a MATRIX_TRANS_TYPE parameter
*/
void XLink::AddParamToHeadTrans(XTensor * h, MATRIX_TRANS_TYPE param)
{
CheckNTErrors(h != NULL, "head tensor cannot be empty!");
h->income.AddParam(&param, sizeof(MATRIX_TRANS_TYPE));
}
...@@ -376,6 +444,11 @@ void XLink::Replace(const XTensor * oldOne, XTensor * newOne)
newIncome.tailNum = oldOne->income.tailNum;
memcpy(newIncome.tails, oldOne->income.tails, sizeof(XTensor*) * newIncome.tailNum);
int paraArraySize = oldOne->income.paramNum * oldOne->income.paramSize;
newIncome.params = new char[paraArraySize];
memcpy(newIncome.params, oldOne->income.params, paraArraySize);
newIncome.paramNum = oldOne->income.paramNum;
/* update the link to each child node */
for(int i = 0; i < newIncome.tailNum; i++){
XTensor * child = newIncome.tails[i];
......
...@@ -34,6 +34,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
struct XTensor;
#define MAX_OP_NAME_LENGTH 16
#define PARAM_UNTI_SIZE 64
/*
This defines the link among tensors in networks. XLink can be
...@@ -115,12 +116,21 @@ struct XLink
/* add two tails in one time */
void AddTwoTails(XTensor * t1, XTensor * t2);
/* add a parameter in default type */
void AddParam(DTYPE param);
/* add a parameter */
void AddParam(void * param, int size);
/* get a parameter in default type */
DTYPE GetParam(int i);
/* get a parameter in integer */
int GetParamInt(int i);
/* get a parameter in MATRIX_TRANS_TYPE */
MATRIX_TRANS_TYPE GetParamTrans(int i);
/* create a hyper edge with two input tensors and an output tensor */
static
void MakeLink(const XTensor * t1, const XTensor * t2, XTensor * h, int id);
...@@ -129,6 +139,10 @@ struct XLink
static
void MakeLink(const XList * list, XTensor * h, int id);
/* create a hyper edge with an input tensor and a list of output tensors */
static
void MakeLink(XTensor * h, XList * list, int id);
/* add a parameter */
static
void AddParamToHead(XTensor * h, DTYPE param);
......
...@@ -26,57 +26,81 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* get operator name */
const char * GetOPName(int type)
{
if ((type & MATH_BASE) != 0){
if (type == MATH_ABSOLUTE)
return "M_ABSOLUTE";
else if (type == MATH_MATRIXMUL)
return "M_MATRIXMUL";
else if (type == MATH_MATRIXMULBATCHED)
return "M_MATRIXMULBATCHED";
else if (type == MATH_MULTIPLY)
return "M_MULTIPLY";
else if (type == MATH_NEGATE)
return "M_NEGATE";
else if (type == MATH_SIGN)
return "M_SIGN";
else if (type == MATH_SUM)
return "M_SUM";
else if (type == MATH_LOG)
return "M_LOG";
else if (type == MATH_NORMALIZE)
return "M_NORMALIZE";
else if (type == MATH_POWER)
return "M_POWER";
else if (type == MATH_SCALEANDSHIFT)
return "M_SCALEANDSHIFT";
else if (type == REDUCE_REDUCEMAX)
return "R_REDUCEMAX";
else if (type == REDUCE_REDUCEMEAN)
return "R_REDUCEMEAN";
else if (type == REDUCE_REDUCESUM)
return "R_REDUCESUM";
else if (type == REDUCE_REDUCESUMSQUARED)
return "R_REDUCESUMSQUARED";
else if (type == REDUCE_REDUCEVARIANCE)
return "R_REDUCEVARIANCE";
}
else if ((type & DATA_BASE) != 0){
if (type == GETANDSET_SELECT)
return "G_SELECT";
else if (type == MOVEMENT_COPYINDEXED)
return "M_COPYINDEXED";
else if (type == MOVEMENT_COPYVALUES)
return "M_COPYVALUES";
else if (type == SHAPE_CONCATENATE)
return "S_CONCATENATE";
else if (type == SHAPE_MERGE)
return "S_MERGE";
else if (type == SHAPE_MERGE_LIST)
return "S_MERGE_LIST";
else if (type == SHAPE_PERMUTE)
return "S_PERMUTE";
else if (type == SHAPE_SPLIT)
return "S_SPLIT";
else if (type == SHAPE_SPLIT_LIST)
return "S_SPLIT_LIST";
else if (type == SHAPE_TRANSPOSE)
return "S_TRANSPOSE";
else if (type == SHAPE_UNSQUEEZE)
return "S_UNSQUEEZE";
else if (type == SORT_SORT)
return "S_SORT";
else if (type == SORT_TOPK)
return "S_TOPK";
}
else if ((type & FUNCTION_BASE) != 0){
if (type == FUNC_HARDTANH)
return "F_HARDTANH";
else if (type == FUNC_IDENTITY)
return "F_IDENTITY";
else if (type == FUNC_LOGSOFTMAX)
return "F_LOGSOFTMAX";
else if (type == FUNC_RECTIFY)
return "F_RECTIFY";
else if (type == FUNC_SIGMOID)
return "F_SIGMOID";
else if (type == FUNC_SOFTMAX)
return "F_SOFTMAX";
}
return "NULL";
......
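To make the new grouping concrete, here is a minimal usage sketch of GetOPName(); the include path and the printf call are assumptions for illustration, only the operator constants and GetOPName() itself come from the code above.

#include <cstdio>
#include "XName.h"            // assumed include path for the operator IDs and GetOPName()

using namespace nts;

/* print the symbolic name of one operator ID from each category */
void PrintOpNames()
{
    int ops[3] = { MATH_MATRIXMUL, SHAPE_MERGE, FUNC_HARDTANH };
    for (int i = 0; i < 3; i++)
        printf("op %d -> %s\n", ops[i], GetOPName(ops[i]));
}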
...@@ -28,8 +28,9 @@ ...@@ -28,8 +28,9 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_ARITHMETIC 0x00001000 /* math operations */
#define MATH_ABSOLUTE MATH_ARITHMETIC + 1 #define MATH_BASE 0x00001000
#define MATH_ABSOLUTE MATH_BASE + 1
#define MATH_MATRIXMUL MATH_ABSOLUTE + 1 #define MATH_MATRIXMUL MATH_ABSOLUTE + 1
#define MATH_MATRIXMULBATCHED MATH_MATRIXMUL + 1 #define MATH_MATRIXMULBATCHED MATH_MATRIXMUL + 1
#define MATH_MULTIPLY MATH_MATRIXMULBATCHED + 1 #define MATH_MULTIPLY MATH_MATRIXMULBATCHED + 1
...@@ -42,28 +43,45 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -42,28 +43,45 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_POWER MATH_NORMALIZE + 1 #define MATH_POWER MATH_NORMALIZE + 1
#define MATH_SCALEANDSHIFT MATH_POWER + 1 #define MATH_SCALEANDSHIFT MATH_POWER + 1
#define GETANDSET MATH_SCALEANDSHIFT + 1 #define REDUCE MATH_SCALEANDSHIFT + 1
#define GETANDSET_SELECT GETANDSET + 1
#define MOVEMENT GETANDSET_SELECT + 1
#define MOVEMENT_COPYINDEXED MOVEMENT + 1
#define MOVEMENT_COPYVALUES MOVEMENT_COPYINDEXED + 1
#define REDUCE MOVEMENT_COPYVALUES + 1
#define REDUCE_REDUCEMAX REDUCE + 1 #define REDUCE_REDUCEMAX REDUCE + 1
#define REDUCE_REDUCEMEAN REDUCE_REDUCEMAX + 1 #define REDUCE_REDUCEMEAN REDUCE_REDUCEMAX + 1
#define REDUCE_REDUCESUM REDUCE_REDUCEMEAN + 1 #define REDUCE_REDUCESUM REDUCE_REDUCEMEAN + 1
#define REDUCE_REDUCESUMSQUARED REDUCE_REDUCESUM + 1 #define REDUCE_REDUCESUMSQUARED REDUCE_REDUCESUM + 1
#define REDUCE_REDUCEVARIANCE REDUCE_REDUCESUMSQUARED + 1 #define REDUCE_REDUCEVARIANCE REDUCE_REDUCESUMSQUARED + 1
#define SHAPE REDUCE_REDUCEVARIANCE + 1 /* data and shape related operations */
#define DATA_BASE MATH_BASE * 2
#define GETANDSET DATA_BASE + 1
#define GETANDSET_SELECT GETANDSET + 1
#define MOVEMENT GETANDSET_SELECT + 1
#define MOVEMENT_COPYINDEXED MOVEMENT + 1
#define MOVEMENT_COPYVALUES MOVEMENT_COPYINDEXED + 1
#define SHAPE MOVEMENT_COPYVALUES + 1
#define SHAPE_CONCATENATE SHAPE + 1 #define SHAPE_CONCATENATE SHAPE + 1
#define SHAPE_MERGE SHAPE_CONCATENATE + 1 #define SHAPE_MERGE SHAPE_CONCATENATE + 1
#define SHAPE_PERMUTE SHAPE_MERGE + 1 #define SHAPE_MERGE_LIST SHAPE_MERGE + 1
#define SHAPE_PERMUTE SHAPE_MERGE_LIST + 1
#define SHAPE_SPLIT SHAPE_PERMUTE + 1 #define SHAPE_SPLIT SHAPE_PERMUTE + 1
#define SHAPE_TRANSPOSE SHAPE_SPLIT + 1 #define SHAPE_SPLIT_LIST SHAPE_SPLIT + 1
#define SHAPE_TRANSPOSE SHAPE_SPLIT_LIST + 1
#define SHAPE_UNSQUEEZE SHAPE_TRANSPOSE + 1 #define SHAPE_UNSQUEEZE SHAPE_TRANSPOSE + 1
#define SORT SHAPE_UNSQUEEZE + 1
#define SORT_SORT SORT + 1
#define SORT_TOPK SORT_SORT + 1
/* activation functions */
#define FUNCTION_BASE DATA_BASE * 2
#define FUNC_HARDTANH FUNCTION_BASE + 1
#define FUNC_IDENTITY FUNC_HARDTANH + 1
#define FUNC_LOGSOFTMAX FUNC_IDENTITY + 1
#define FUNC_RECTIFY FUNC_LOGSOFTMAX + 1
#define FUNC_SIGMOID FUNC_RECTIFY + 1
#define FUNC_SOFTMAX FUNC_SIGMOID + 1
/* get operator name */ /* get operator name */
const char * GetOPName(int type); const char * GetOPName(int type);
......
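Because MATH_BASE, DATA_BASE and FUNCTION_BASE sit on separate bits (0x1000, 0x2000, 0x4000), an operator ID encodes its own category, which is what the three branches in GetOPName() test. A small sketch of that property (the helper names below are hypothetical, not part of the library):

/* hypothetical helpers built only on the macros above */
inline bool IsMathOp(int type)     { return (type & MATH_BASE)     != 0; }
inline bool IsDataOp(int type)     { return (type & DATA_BASE)     != 0; }
inline bool IsFunctionOp(int type) { return (type & FUNCTION_BASE) != 0; }

/* e.g. IsFunctionOp(FUNC_SOFTMAX) is true while IsMathOp(FUNC_SOFTMAX) is false,
   which holds as long as each category defines fewer than 0x1000 operators */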
...@@ -134,8 +134,6 @@ constructor ...@@ -134,8 +134,6 @@ constructor
XTensor::XTensor(const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType, XTensor::XTensor(const int myOrder, const int * myDimSize, const TENSOR_DATA_TYPE myDataType,
const float myDenseRatio, int myDevID, XMem * myMem) const float myDenseRatio, int myDevID, XMem * myMem)
{ {
CheckNTErrors((myOrder > 0), "Illegal tensor order1");
Init(); Init();
SetDataPointer(); SetDataPointer();
...@@ -144,7 +142,8 @@ XTensor::XTensor(const int myOrder, const int * myDimSize, const TENSOR_DATA_TYP ...@@ -144,7 +142,8 @@ XTensor::XTensor(const int myOrder, const int * myDimSize, const TENSOR_DATA_TYP
mem = myMem; mem = myMem;
devID = myMem != NULL ? myMem->devID : myDevID; devID = myMem != NULL ? myMem->devID : myDevID;
Resize(myOrder, myDimSize, myDataType, myDenseRatio); if(order >= 0)
Resize(myOrder, myDimSize, myDataType, myDenseRatio);
} }
/* copy constructor */ /* copy constructor */
...@@ -211,6 +210,9 @@ XTensor::~XTensor() ...@@ -211,6 +210,9 @@ XTensor::~XTensor()
XLink::ClearIncoming(this); XLink::ClearIncoming(this);
DestroyData(); DestroyData();
if(grad != NULL)
delete grad;
} }
/* initialize member variables */ /* initialize member variables */
...@@ -237,7 +239,9 @@ void XTensor::Init() ...@@ -237,7 +239,9 @@ void XTensor::Init()
memset(isAllValued, 0, sizeof(bool) * MAX_TENSOR_DIM_NUM); memset(isAllValued, 0, sizeof(bool) * MAX_TENSOR_DIM_NUM);
isInit = false; isInit = false;
isTmp = false; isTmp = false;
isGrad = false;
visitMark = 0; visitMark = 0;
grad = NULL;
} }
/* delete data arrays */ /* delete data arrays */
...@@ -294,7 +298,7 @@ XTensor& XTensor::operator= (const XTensor& tensor) ...@@ -294,7 +298,7 @@ XTensor& XTensor::operator= (const XTensor& tensor)
} }
else{ else{
DestroyData(); DestroyData();
if(isInit){ if(!isInit){
devID = tensor.devID; devID = tensor.devID;
mem = tensor.mem; mem = tensor.mem;
} }
...@@ -347,6 +351,9 @@ judge whether the two matrices are in the same type and size ...@@ -347,6 +351,9 @@ judge whether the two matrices are in the same type and size
*/ */
bool XTensor::IsIdentical(const XTensor * a, const XTensor * b) bool XTensor::IsIdentical(const XTensor * a, const XTensor * b)
{ {
if(a == NULL || b == NULL)
return false;
if(a->order != b->order) if(a->order != b->order)
return false; return false;
...@@ -1043,7 +1050,7 @@ int XTensor::GetNonzeroSize() ...@@ -1043,7 +1050,7 @@ int XTensor::GetNonzeroSize()
/* /*
set the tensor as "temporary" set the tensor as "temporary"
>> myIsTMP - flag >> myIsTMP - the flag
*/ */
void XTensor::SetTMP(bool myIsTmp) void XTensor::SetTMP(bool myIsTmp)
{ {
...@@ -1051,6 +1058,15 @@ void XTensor::SetTMP(bool myIsTmp) ...@@ -1051,6 +1058,15 @@ void XTensor::SetTMP(bool myIsTmp)
} }
/* /*
set the tensor as "keep-gradient"
>> myIsGrad - the flag
*/
void XTensor::SetGrad(bool myIsGrad)
{
isGrad = myIsGrad;
}
/*
resize a tensor with a specified tensor size resize a tensor with a specified tensor size
>> myOrder - order of the tensor >> myOrder - order of the tensor
>> myDimSize - the size of each dimension >> myDimSize - the size of each dimension
...@@ -1105,7 +1121,7 @@ bool XTensor::Resize(const int myOrder, const int * myDimSize, ...@@ -1105,7 +1121,7 @@ bool XTensor::Resize(const int myOrder, const int * myDimSize,
if(isSparse){ if(isSparse){
/* /*
for sparse matrices, we use a list of tuple (key, value), for sparse matrices, we use a list of tuple (key, value),
ordered by key. Take a (2-dimensional) matrice as examples, ordered by key. Take a (2-dimensional) matrix as an example,
we have key = m * i + j; we have key = m * i + j;
The data array is The data array is
--------- ---------
...@@ -1148,9 +1164,9 @@ bool XTensor::Resize(const int myOrder, const int * myDimSize, ...@@ -1148,9 +1164,9 @@ bool XTensor::Resize(const int myOrder, const int * myDimSize,
if(filledData){ if(filledData){
/* allocate the new one */ /* allocate the new one */
if(mem == NULL){ if(mem == NULL){
data = (void*)new char[unitNum * unitSize]; data = XMemAlloc(devID, unitNum * unitSize);
#if defined(UNSAFE_BUT_FAST_MEM) #if defined(UNSAFE_BUT_FAST_MEM)
memset(data, 0, unitNum * unitSize); XMemSet(devID, data, 0, unitNum * unitSize);
#endif #endif
} }
else else
...@@ -1982,10 +1998,12 @@ generate a copy of XTensor ...@@ -1982,10 +1998,12 @@ generate a copy of XTensor
*/ */
XTensor * NewTensor(XTensor * a, bool isFilledData) XTensor * NewTensor(XTensor * a, bool isFilledData)
{ {
int dims[MAX_TENSOR_DIM_NUM];
CheckNTErrors((a != NULL), "Empty input!"); CheckNTErrors((a != NULL), "Empty input!");
int * dims = new int[a->order]; if(a->order > 0)
memcpy(dims, a->dimSize, sizeof(int) * a->order); memcpy(dims, a->dimSize, sizeof(int) * a->order);
if(!isFilledData) if(!isFilledData)
dims[0] = -dims[0]; dims[0] = -dims[0];
...@@ -1994,8 +2012,6 @@ XTensor * NewTensor(XTensor * a, bool isFilledData) ...@@ -1994,8 +2012,6 @@ XTensor * NewTensor(XTensor * a, bool isFilledData)
a->dataType, a->denseRatio, a->dataType, a->denseRatio,
a->devID, a->mem); a->devID, a->mem);
delete[] dims;
return newTensor; return newTensor;
} }
......
...@@ -139,8 +139,14 @@ public: ...@@ -139,8 +139,14 @@ public:
/* indicates whether the tensor is created temporarily */ /* indicates whether the tensor is created temporarily */
bool isTmp; bool isTmp;
/* indicates whether the tensor keeps the gradient when used as model parameters */
bool isGrad;
/* mark for traversing the graph */ /* mark for traversing the graph */
unsigned int visitMark; unsigned int visitMark;
/* gradient (for back-propagation) */
XTensor * grad;
/* /*
the link used to form networks. Note that when we compute on tensors, we actually create a the link used to form networks. Note that when we compute on tensors, we actually create a
...@@ -300,6 +306,9 @@ public: ...@@ -300,6 +306,9 @@ public:
/* set the tensor as "temporary" */ /* set the tensor as "temporary" */
void SetTMP(bool myIsTmp = true); void SetTMP(bool myIsTmp = true);
/* set the tensor as "keep-gradient" */
void SetGrad(bool myIsGrad = true);
/* resize a matrix with a specified matrix size */ /* resize a matrix with a specified matrix size */
bool Resize(const int myOrder, const int * myDimSize, bool Resize(const int myOrder, const int * myDimSize,
const TENSOR_DATA_TYPE myDataType = DEFAULT_DTYPE, const TENSOR_DATA_TYPE myDataType = DEFAULT_DTYPE,
......
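The new isGrad flag and grad pointer give parameter tensors a place to keep gradients, and the destructor change above releases grad automatically. A minimal sketch of how a parameter might be tagged (the constructor arguments and the X_FLOAT default are assumptions here; how grad gets filled depends on the backward pass):

int dims[2] = {8, 16};
XTensor w(2, dims, X_FLOAT, 1.0F, -1, NULL);   /* a CPU parameter tensor (assumed defaults) */
w.SetGrad(true);                               /* isGrad = true: keep the gradient */

/* ... forward computation and a backward pass that fills w.grad ... */

if(w.isGrad && w.grad != NULL)
    fprintf(stderr, "grad of w has %d entries\n", w.grad->unitNum);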
...@@ -32,9 +32,6 @@ ...@@ -32,9 +32,6 @@
#define USE_PTHREAD // for linux #define USE_PTHREAD // for linux
#endif #endif
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
////////////////////////////////////////////////// //////////////////////////////////////////////////
// necessary libs // necessary libs
#ifdef USE_PTHREAD #ifdef USE_PTHREAD
...@@ -46,12 +43,15 @@ namespace nts{ ...@@ -46,12 +43,15 @@ namespace nts{
#endif #endif
#endif #endif
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
#if(defined(_WIN32) && !defined (__CYGWIN__)) #if(defined(_WIN32) && !defined (__CYGWIN__))
#define CRFPP_USE_THREAD 1 #define CRFPP_USE_THREAD 1
#define BEGINTHREAD(src, stack, func, arg, flag, id) \ #define BEGINTHREAD(src, stack, func, arg, flag, id) \
(HANDLE)_beginthreadex((void *)(src), (unsigned)(stack), \ (HANDLE)_beginthreadex((void *)(src), (unsigned)(stack), \
(unsigned(_stdcall *)(void *))(func), (void *)(arg), \ (unsigned(_stdcall *)(void *))(func), (void *)(arg), \
(unsigned)(flag), (unsigned *)(id)) (unsigned)(flag), (unsigned *)(id))
#endif #endif
////////////////////////////////////////////////// //////////////////////////////////////////////////
......
...@@ -176,12 +176,16 @@ void XMemCopy(void * t, int devIDT, const void * s, int devIDS, size_t size) ...@@ -176,12 +176,16 @@ void XMemCopy(void * t, int devIDT, const void * s, int devIDS, size_t size)
} }
#ifdef USE_CUDA #ifdef USE_CUDA
else if(devIDT >= 0 && devIDS < 0){ else if(devIDT >= 0 && devIDS < 0){
CheckNTErrors((cudaMemcpy(t, s, size, cudaMemcpyHostToDevice) == cudaSuccess), cudaError_t error = cudaMemcpy(t, s, size, cudaMemcpyHostToDevice);
"cudaMemcpy error (cudaMemcpyHostToDevice)"); if(error != cudaSuccess){
ShowNTErrors("cudaMemcpy error (cudaMemcpyHostToDevice)");
}
} }
else if(devIDT < 0 && devIDS >= 0){ else if(devIDT < 0 && devIDS >= 0){
CheckNTErrors((cudaMemcpy(t, s, size, cudaMemcpyDeviceToHost) == cudaSuccess), cudaError_t error = cudaMemcpy(t, s, size, cudaMemcpyDeviceToHost);
"cudaMemcpy error (cudaMemcpyDeviceToHost)"); if(error != cudaSuccess){
ShowNTErrors("cudaMemcpy error (cudaMemcpyDeviceToHost)");
}
} }
else{ else{
//if(devIDT == devIDS){ //if(devIDT == devIDS){
...@@ -482,8 +486,9 @@ quick sorting ...@@ -482,8 +486,9 @@ quick sorting
NOTE: this means that the items may not be placed in a contiguous memory space NOTE: this means that the items may not be placed in a contiguous memory space
>> comp - the comparison function >> comp - the comparison function
*/ */
void XQSort(void * data, void * index, int num, int width, int stride, int (*comp)(const void *, const void *)) void XQSort(void * dataA, void * dataB, void * index, int num, int width, int stride, int (*comp)(const void *, const void *))
{ {
XMemCopy(dataB, -1, dataA, -1, num * width);
char *lo, *hi; // ends of sub-array currently sorting char *lo, *hi; // ends of sub-array currently sorting
int *indexlo, *indexhi; int *indexlo, *indexhi;
char *mid; // points to middle of subarray char *mid; // points to middle of subarray
...@@ -502,8 +507,8 @@ void XQSort(void * data, void * index, int num, int width, int stride, int (*com ...@@ -502,8 +507,8 @@ void XQSort(void * data, void * index, int num, int width, int stride, int (*com
stackptr = 0; stackptr = 0;
lo = (char*)data; lo = (char*)dataB;
hi = (char*)data + realStride * (num - 1); hi = (char*)dataB + realStride * (num - 1);
indexlo = (int*)index; indexlo = (int*)index;
indexhi = index != NULL ? (int*)index + stride * (num - 1) : NULL; indexhi = index != NULL ? (int*)index + stride * (num - 1) : NULL;
......
...@@ -53,7 +53,7 @@ extern void XSleep(int sleepTime); ...@@ -53,7 +53,7 @@ extern void XSleep(int sleepTime);
extern double GetClock(); extern double GetClock();
extern double GetClockSec(); extern double GetClockSec();
extern void XQSort(void * data, void * index, int num, int width, int stride, int (*comp)(const void *, const void *)); extern void XQSort(void * dataA, void * dataB, void * index, int num, int width, int stride, int (*comp)(const void *, const void *));
extern int CompXFloat(const void * a, const void * b); extern int CompXFloat(const void * a, const void * b);
#ifdef USE_CUDA #ifdef USE_CUDA
......
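The reworked XQSort takes a source array and a separate destination: the data is copied from dataA to dataB first, so the input keeps its order and only the copy (plus the optional index) is sorted. A sketch, under the assumption that width is the item size in bytes and stride is 1 for a tightly packed array:

float src[5]   = {3.0F, 1.0F, 4.0F, 1.5F, 2.0F};
float dst[5];
int   index[5] = {0, 1, 2, 3, 4};

/* sort a copy of src into dst, reordering index alongside; src is untouched */
XQSort(src, dst, index, 5, sizeof(float), 1, CompXFloat);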
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include <math.h> #include <math.h>
#include "../../XTensor.h" #include "../../XTensor.h"
#include "../../XName.h"
#include "Absolute.h" #include "Absolute.h"
#include "Absolute.cuh" #include "Absolute.cuh"
...@@ -28,21 +29,54 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -28,21 +29,54 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
set every entry to its absolute value set every entry to its absolute value
>> a - the tensor we are processing >> a - input tensor we are processing
>> b - output tensor we are processing
*/ */
void _Absolute(XTensor * a) void _Absolute(const XTensor * a, XTensor * b)
{ {
#ifdef USE_CUDA #ifdef USE_CUDA
/* run it on GPUs */ /* run it on GPUs */
if (a->devID >= 0) { if (a->devID >= 0) {
_CudaAbsolute(a); _CudaAbsolute(a, b);
return; return;
} }
#endif #endif
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!"); CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
DTYPE * d = (DTYPE*)a->data; DTYPE * d = (DTYPE*)a->data;
DTYPE * db = (DTYPE*)b->data;
for (int i = 0; i < a->unitNum; i++) for (int i = 0; i < a->unitNum; i++)
d[i] = (DTYPE)fabs(d[i]); db[i] = (DTYPE)fabs(d[i]);
}
/*
set every entry to its absolute value (do it on site)
keep the result in the input tensor a and return nothing
>> a - the tensor we are processing
*/
void _AbsoluteMe(XTensor * a)
{
_Absolute(a, a);
}
/*
set every entry to its absolute value (return a XTensor structure)
make a new tensor to keep the result and return it
>> a - input tensor we are processing
<< return - the absolute value of input tensor
*/
XTensor Absolute(const XTensor & a)
{
XTensor b(&a);
b.SetTMP();
/* call _Absolute function */
_Absolute(&a, &b);
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_ABSOLUTE);
return b;
} }
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
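Each unary operation now comes in three flavours: _Op(a, b) writes into a separate output, _OpMe(a) works in place, and Op(a) returns a fresh tensor and records the operation for back-propagation. A short sketch for Absolute (a is assumed to be an initialized dense tensor):

void AbsoluteDemo(XTensor &a)
{
    XTensor b(&a);            /* same shape and type as a, as in Absolute() above */
    _Absolute(&a, &b);        /* b[i] = |a[i]|, a is unchanged */
    _AbsoluteMe(&a);          /* in-place: a[i] = |a[i]| */
    XTensor c = Absolute(a);  /* new tensor; XLink records MATH_ABSOLUTE for backward */
}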
...@@ -29,37 +29,41 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -29,37 +29,41 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/* /*
set each entry to its absolute value (CUDA Kernel) set each entry to its absolute value (CUDA Kernel)
>> d - pointer to the data array >> a - pointer to input data array
>> b - pointer to output data array
>> size - size of the data array >> size - size of the data array
*/ */
__global__ __global__
void KernelAbsolute(DTYPE * d, int size) void KernelAbsolute(DTYPE * a, DTYPE * b, int size)
{ {
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) if (i < size)
d[i] = fabs(d[i]); b[i] = fabs(a[i]);
} }
/* /*
set each entry to its absolute value (CUDA Kernel) set each entry to its absolute value (CUDA Kernel)
This is for float16 computation This is for float16 computation
>> d - pointer to the data array >> a - pointer to input data array
>> b - pointer to output data array
>> size - size of the data array >> size - size of the data array
*/ */
__global__ __global__
void KernelAbsolute(__half * d, int size) void KernelAbsolute(__half * a, __half * b, int size)
{ {
return; return;
} }
/* /*
set each entry to its with float16 data type value set each entry to its absolute value
>> a - the tensor >> a - input tensor
>> b - output tensor
*/ */
extern "C" extern "C"
void _CudaAbsolute(XTensor * a) void _CudaAbsolute(const XTensor * a, XTensor * b)
{ {
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->isSparse == false), "TODO!"); CheckNTErrors((a->isSparse == false), "TODO!");
int gridSize[3]; int gridSize[3];
...@@ -74,10 +78,10 @@ void _CudaAbsolute(XTensor * a) ...@@ -74,10 +78,10 @@ void _CudaAbsolute(XTensor * a)
ProtectCudaDev(a->devID, devIDBackup); ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) { if (a->dataType == DEFAULT_DTYPE) {
KernelAbsolute << <blocks, threads >> >((DTYPE*)a->data, a->unitNum); KernelAbsolute << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum);
} }
else if (a->dataType == X_FLOAT16) { else if (a->dataType == X_FLOAT16) {
KernelAbsolute << <blocks, threads >> >((__half*)a->data, a->unitNum); KernelAbsolute << <blocks, threads >> >((__half*)a->data, (__half*)b->data, a->unitNum);
} }
else { else {
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
......
...@@ -27,15 +27,15 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -27,15 +27,15 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* set each entry to its absolute value (CUDA Kernel) */ /* set each entry to its absolute value (CUDA Kernel) */
__global__ __global__
void KernelAbsolute(DTYPE * d, int size); void KernelAbsolute(DTYPE * a, DTYPE * b, int size);
/* set each entry to its absolute value (CUDA Kernel) with float16 data type*/ /* set each entry to its absolute value (CUDA Kernel) with float16 data type*/
__global__ __global__
void KernelAbsolute(__half * d, int size); void KernelAbsolute(__half * a, __half * b, int size);
/* set each entry to its absolute value */ /* set each entry to its absolute value */
extern "C" extern "C"
void _CudaAbsolute(XTensor * a); void _CudaAbsolute(const XTensor * a, XTensor * b);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -27,8 +27,19 @@ ...@@ -27,8 +27,19 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* set every entry to its absolute value */ /* set every entry to its absolute value */
extern "C" void _Absolute(const XTensor * a, XTensor * b);
void _Absolute(XTensor * a);
/*
set every entry to its absolute value (do it on site)
keep the result in the input tensor a and return nothing
*/
void _AbsoluteMe(XTensor * a);
/*
set every entry to its absolute value (return a XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor Absolute(const XTensor & a);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -38,11 +38,11 @@ c_i = trans(a_i) * trans(b_i) * \alpha + c_i * \beta for each i in [0,count-1] ...@@ -38,11 +38,11 @@ c_i = trans(a_i) * trans(b_i) * \alpha + c_i * \beta for each i in [0,count-1]
>> beta - scalar >> beta - scalar
*/ */
void _MatrixMULBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA, void _MatrixMULBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA,
const XList * b, MATRIX_TRANS_TYPE transposedB, const XList * b, MATRIX_TRANS_TYPE transposedB,
XList * c, DTYPE alpha, DTYPE beta) XList * c, DTYPE alpha, DTYPE beta)
{ {
CheckNTErrors((a && b && c), "Empty input lists!"); CheckNTErrors(a && b && c, "Empty input lists!");
CheckNTErrors((a->count == b->count && a->count == c->count), "Input lists must be of the same size!"); CheckNTErrors(a->count == b->count && a->count == c->count, "Input lists must be of the same size!");
if (a->count == 0) if (a->count == 0)
return; return;
......
...@@ -28,8 +28,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -28,8 +28,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* matrix multiplication in batch mode (CPU code) */ /* matrix multiplication in batch mode (CPU code) */
extern "C" extern "C"
void _MatrixMULBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA, const XList * b, MATRIX_TRANS_TYPE transposedB, XList * c, void _MatrixMULBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA, const XList * b, MATRIX_TRANS_TYPE transposedB,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0); XList * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -30,7 +30,7 @@ ...@@ -30,7 +30,7 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
matrix multiplication matrix multiplication c = trans(a) * trans(b) * alpha + c * beta
For the input tensors a and b, we perform matrix multiplication on the first two dimensions. For the input tensors a and b, we perform matrix multiplication on the first two dimensions.
E.g., let A be a tensor of size y * z * m and B be a tensor of size x * y * n. E.g., let A be a tensor of size y * z * m and B be a tensor of size x * y * n.
...@@ -66,8 +66,7 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -66,8 +66,7 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
int cn = c->dimSizeRDI[1]; int cn = c->dimSizeRDI[1];
int cm = c->dimSizeRDI[0]; int cm = c->dimSizeRDI[0];
CheckNTErrors((am == bn && an == cn && bm == cm), CheckNTErrors((am == bn && an == cn && bm == cm), "Unmatched tensors in multiplication!");
"Unmatched tensors in multiplication!");
int aBlockSize = a->dimSizeRDI[0] * a->dimSizeRDI[1]; int aBlockSize = a->dimSizeRDI[0] * a->dimSizeRDI[1];
int bBlockSize = b->dimSizeRDI[0] * b->dimSizeRDI[1]; int bBlockSize = b->dimSizeRDI[0] * b->dimSizeRDI[1];
...@@ -80,13 +79,13 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -80,13 +79,13 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
int cBlockNum = 1; int cBlockNum = 1;
for (int i = 2; i < a->order; i++) { for (int i = 2; i < a->order; i++) {
CheckNTErrors((a->dimSizeRDI[i] == c->dimSizeRDI[i - 2 + b->order]), "Incorrect tensor sizes!"); CheckNTErrors(a->dimSizeRDI[i] == c->dimSizeRDI[i - 2 + b->order], "Incorrect tensor sizes!");
aBlockNum *= a->dimSizeRDI[i]; aBlockNum *= a->dimSizeRDI[i];
cBlockNum *= a->dimSizeRDI[i]; cBlockNum *= a->dimSizeRDI[i];
} }
for (int i = 2; i < b->order; i++) { for (int i = 2; i < b->order; i++) {
CheckNTErrors((b->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!"); CheckNTErrors(b->dimSizeRDI[i] == c->dimSizeRDI[i], "Incorrect tensor sizes!");
bBlockNum *= b->dimSizeRDI[i]; bBlockNum *= b->dimSizeRDI[i];
cBlockNum *= b->dimSizeRDI[i]; cBlockNum *= b->dimSizeRDI[i];
} }
...@@ -186,7 +185,7 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -186,7 +185,7 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
} }
/* /*
matrix multiplication (return a XTensor structure) matrix multiplication (return a XTensor structure) c = trans(a) * trans(b) * alpha
make a new tensor to keep the result and return it make a new tensor to keep the result and return it
For the input tensors a and b, we perform matrix multiplication on the first two dimensions. For the input tensors a and b, we perform matrix multiplication on the first two dimensions.
...@@ -203,14 +202,13 @@ Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x ...@@ -203,14 +202,13 @@ Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x
>> b - tensor b >> b - tensor b
>> transposedB - indicates whether the matrices in b are transposed >> transposedB - indicates whether the matrices in b are transposed
>> alpha - a coefficient >> alpha - a coefficient
>> beta - another coefficient
>> parallelRunner - parallel processing module >> parallelRunner - parallel processing module
<< return - the result of matrix multiplication << return - the result of matrix multiplication
*/ */
XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB, XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA,
DTYPE alpha, DTYPE beta, XPRunner * parallelRunner) const XTensor &b, MATRIX_TRANS_TYPE transposedB,
DTYPE alpha, XPRunner * parallelRunner)
{ {
CheckNTErrors(&a != &NULLTensor && &b != &NULLTensor, "Empty input tensors!");
CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!"); CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have an order >= 2!"); CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have an order >= 2!");
...@@ -224,10 +222,10 @@ XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor ...@@ -224,10 +222,10 @@ XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor
int order = a.order + b.order - 2; int order = a.order + b.order - 2;
int sub = 0; int sub = 0;
int * dimSize = new int[order]; int * dimSize = new int[order];
for (int i = 2; i < b.order; i++)
dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
for (int i = 2; i < a.order; i++) for (int i = 2; i < a.order; i++)
dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i]; dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
for (int i = 2; i < b.order; i++)
dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
dimSize[sub++] = an; dimSize[sub++] = an;
dimSize[sub++] = bm; dimSize[sub++] = bm;
...@@ -236,14 +234,65 @@ XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor ...@@ -236,14 +234,65 @@ XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor
c.SetTMP(); c.SetTMP();
/* call _MatrixMul function */ /* call _MatrixMul function */
_MatrixMul(&a, transposedA, &b, transposedB, &c, alpha, beta, parallelRunner); _MatrixMul(&a, transposedA, &b, transposedB, &c, alpha, 0, parallelRunner);
/* tensor connections */ /* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL); XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL);
XLink::AddParamToHeadTrans(&c, transposedA); XLink::AddParamToHeadTrans(&c, transposedA);
XLink::AddParamToHeadTrans(&c, transposedB); XLink::AddParamToHeadTrans(&c, transposedB);
XLink::AddParamToHead(&c, alpha); XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHead(&c, beta);
/* destroy variables */
delete[] dimSize;
return c;
}
/*
matrix multiplication with no transposition c = a * b * alpha
>> a - tensor a
>> b - tensor b
>> alpha - a coefficient
>> parallelRunner - parallel processing module
<< return - the result of matrix multiplication
*/
XTensor MatrixMul(const XTensor &a, const XTensor &b,
DTYPE alpha, XPRunner * parallelRunner)
{
CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have an order >= 2!");
int an = a.dimSizeRDI[1];
int am = a.dimSizeRDI[0];
int bn = b.dimSizeRDI[1];
int bm = b.dimSizeRDI[0];
CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
int order = a.order + b.order - 2;
int sub = 0;
int * dimSize = new int[order];
for (int i = 2; i < a.order; i++)
dimSize[sub++] = a.dimSizeRDI[a.order + 1 - i];
for (int i = 2; i < b.order; i++)
dimSize[sub++] = b.dimSizeRDI[b.order + 1 - i];
dimSize[sub++] = an;
dimSize[sub++] = bm;
float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
c.SetTMP();
/* call _MatrixMul function */
_MatrixMul(&a, X_NOTRANS, &b, X_NOTRANS, &c, alpha, 0, parallelRunner);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL);
XLink::AddParamToHeadTrans(&c, X_NOTRANS);
XLink::AddParamToHeadTrans(&c, X_NOTRANS);
XLink::AddParamToHead(&c, alpha);
/* destroy variables */ /* destroy variables */
delete[] dimSize; delete[] dimSize;
......
...@@ -26,8 +26,10 @@ ...@@ -26,8 +26,10 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
#define MMul MatrixMul
/* /*
matrix multiplication matrix multiplication c = trans(a) * trans(b) * alpha + c * beta
For the input tensors a and b, we perform matrix multiplication on the first two dimensions. For the input tensors a and b, we perform matrix multiplication on the first two dimensions.
E.g., let A be a tensor of size y * z * m and B be a tensor of size x * y * n. E.g., let A be a tensor of size y * z * m and B be a tensor of size x * y * n.
...@@ -42,7 +44,7 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor ...@@ -42,7 +44,7 @@ void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL); DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
/* /*
matrix multiplication (return a XTensor structure) matrix multiplication (return a XTensor structure) c = trans(a) * trans(b) * alpha
make a new tensor c to keep the result and return it make a new tensor c to keep the result and return it
For the input tensors a and b, we perform matrix multiplication on the first two dimensions. For the input tensors a and b, we perform matrix multiplication on the first two dimensions.
...@@ -55,8 +57,13 @@ C should be a tensor of z * x * n * m. ...@@ -55,8 +57,13 @@ C should be a tensor of z * x * n * m.
Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x * y. Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x * y.
*/ */
XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB, XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL); DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
/* matrix multiplication with no transposition c = a * b * alpha */
XTensor MatrixMul(const XTensor &a, const XTensor &b,
DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __MATRIXMUL_H__ #endif // __MATRIXMUL_H__
\ No newline at end of file
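The XTensor-returning MatrixMul no longer exposes beta (it always calls the low-level routine with beta = 0), and a second overload drops the transposition flags entirely; MMul is a shorthand for MatrixMul. A sketch of the two calls, assuming the inner dimensions of a and b match:

XTensor c1 = MatrixMul(a, X_TRANS, b, X_NOTRANS, 2.0F);  /* c1 = trans(a) * b * 2 */
XTensor c2 = MatrixMul(a, b);                            /* c2 = a * b, alpha defaults to 1 */
XTensor c3 = MMul(a, b);                                 /* same as c2 via the alias */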
...@@ -45,9 +45,9 @@ where trans() return the transposed matrix if the flag is fired ...@@ -45,9 +45,9 @@ where trans() return the transposed matrix if the flag is fired
>> stream - the stream for creating the job pipeline >> stream - the stream for creating the job pipeline
*/ */
void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta, XTensor * c, DTYPE alpha, DTYPE beta,
XPRunner * parallelRunner, XStream * stream) XPRunner * parallelRunner, XStream * stream)
{ {
CheckNTErrors((a && b && c), "Empty input tensors!"); CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((a->dataType == b->dataType), "Input tensors should have the same data type!"); CheckNTErrors((a->dataType == b->dataType), "Input tensors should have the same data type!");
......
...@@ -122,9 +122,8 @@ where trans() return the transposed matrix if the flag is fired ...@@ -122,9 +122,8 @@ where trans() return the transposed matrix if the flag is fired
>> stream - the stream for creating the job pipeline >> stream - the stream for creating the job pipeline
*/ */
void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, XTensor * c, DTYPE alpha, DTYPE beta, XStream * stream)
DTYPE alpha, DTYPE beta, XStream * stream)
{ {
int an = transposedA == X_TRANS ? a->dimSize[1] : a->dimSize[0]; int an = transposedA == X_TRANS ? a->dimSize[1] : a->dimSize[0];
int am = transposedA == X_TRANS ? a->dimSize[0] : a->dimSize[1]; int am = transposedA == X_TRANS ? a->dimSize[0] : a->dimSize[1];
...@@ -158,9 +157,12 @@ void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -158,9 +157,12 @@ void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
cublasSetStream(*handle, stream->stream); cublasSetStream(*handle, stream->stream);
if (a->dataType == X_FLOAT && b->dataType == X_FLOAT && c->dataType == X_FLOAT) { if (a->dataType == X_FLOAT && b->dataType == X_FLOAT && c->dataType == X_FLOAT) {
_CudaBLASMatrixMUL(handle, a->data, transposedA, a->dataType, b->data, transposedB, a->dataType, c->data, c->dataType, _CudaBLASMatrixMUL(handle, a->data, transposedA, a->dataType,
a->dimSize[0], a->dimSize[1], b->dimSize[0], b->dimSize[1], c->dimSize[0], c->dimSize[1], b->data, transposedB, a->dataType, c->data, c->dataType,
alpha, beta); a->dimSize[0], a->dimSize[1],
b->dimSize[0], b->dimSize[1],
c->dimSize[0], c->dimSize[1],
alpha, beta);
} }
else { else {
// TODO!! // TODO!!
......
...@@ -44,7 +44,7 @@ where trans() return the transposed matrix if the flag is fired ...@@ -44,7 +44,7 @@ where trans() return the transposed matrix if the flag is fired
*/ */
extern "C" extern "C"
void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c, void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XStream * stream = NULL); DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XStream * stream = NULL);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -32,7 +32,7 @@ c = trans(a) * trans(b) * alpha + c * beta ...@@ -32,7 +32,7 @@ c = trans(a) * trans(b) * alpha + c * beta
where trans() returns the transposed matrix if the flag is fired where trans() returns the transposed matrix if the flag is fired
*/ */
void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c, void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL, XStream * stream = NULL); DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL, XStream * stream = NULL);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -41,8 +41,8 @@ where trans() return the transposed matrix if the flag is fired ...@@ -41,8 +41,8 @@ where trans() return the transposed matrix if the flag is fired
>> parallelRunner - parallel processing module >> parallelRunner - parallel processing module
*/ */
void _MatrixMul2DParallel(const XTensor * a, MATRIX_TRANS_TYPE transposedA, void _MatrixMul2DParallel(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta, XPRunner * parallelRunner) XTensor * c, DTYPE alpha, DTYPE beta, XPRunner * parallelRunner)
{ {
CheckNTErrors((a && b && c), "Empty input tensors!"); CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((a->order == 2 && b->order == 2 && c->order == 2), CheckNTErrors((a->order == 2 && b->order == 2 && c->order == 2),
......
...@@ -32,8 +32,8 @@ c = trans(a) * trans(b) * alpha + c * beta ...@@ -32,8 +32,8 @@ c = trans(a) * trans(b) * alpha + c * beta
where trans() returns the transposed matrix if the flag is fired. where trans() returns the transposed matrix if the flag is fired.
*/ */
extern "C" extern "C"
void _MatrixMul2DParallel(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c, void _MatrixMul2DParallel(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL); XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -46,9 +46,8 @@ where trans() returns the transposed matrix if the flag is fired ...@@ -46,9 +46,8 @@ where trans() returns the transposed matrix if the flag is fired
>> parallelRunner - parallel processing module >> parallelRunner - parallel processing module
*/ */
void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA, void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta, XTensor * c, DTYPE alpha, DTYPE beta, XPRunner * parallelRunner)
XPRunner * parallelRunner)
{ {
CheckNTErrors((a && b && c), "Empty input tensors!"); CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((a->dataType == b->dataType && a->dataType == c->dataType), CheckNTErrors((a->dataType == b->dataType && a->dataType == c->dataType),
...@@ -156,6 +155,7 @@ void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -156,6 +155,7 @@ void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
/* /*
matrix multiplication of the two tensors (do it on site) matrix multiplication of the two tensors (do it on site)
c = trans(a) * trans(b) * alpha
make a new tensor to keep the result and return it make a new tensor to keep the result and return it
for each 2-dimensional data array in a (denoted as ai) and for each 2-dimensional data array in a (denoted as ai) and
...@@ -168,14 +168,12 @@ where trans() returns the transposed matrix if the flag is fired. ...@@ -168,14 +168,12 @@ where trans() returns the transposed matrix if the flag is fired.
>> b - tensor b >> b - tensor b
>> transposedB - indicates whether the matrices in b are transposed >> transposedB - indicates whether the matrices in b are transposed
>> alpha - a coefficient >> alpha - a coefficient
>> beta - another coefficient
>> parallelRunner - parallel processing module >> parallelRunner - parallel processing module
<< return - the result of matrix multiplication of the two tensors << return - the result of matrix multiplication of the two tensors
*/ */
XTensor MatrixMulBatched(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB, XTensor MatrixMulBatched(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
DTYPE alpha, DTYPE beta, XPRunner * parallelRunner) DTYPE alpha, XPRunner * parallelRunner)
{ {
CheckNTErrors(&a != &NULLTensor && &b != &NULLTensor, "Empty input tensors!");
CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!"); CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have an order >= 2!"); CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have an order >= 2!");
CheckNTErrors(a.order == b.order, "Input tensor and output tensor must have the same order!"); CheckNTErrors(a.order == b.order, "Input tensor and output tensor must have the same order!");
...@@ -190,24 +188,23 @@ XTensor MatrixMulBatched(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const ...@@ -190,24 +188,23 @@ XTensor MatrixMulBatched(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const
int order = a.order; int order = a.order;
int sub = 0; int sub = 0;
int * dimSize = new int[order]; int * dimSize = new int[order];
for (int i = 2; i < a.order; i++) for (int i = 0; i < a.order - 2; i++)
dimSize[sub++] = a.dimSizeRDI[i]; dimSize[sub++] = a.dimSize[i];
dimSize[sub++] = an; dimSize[sub++] = an;
dimSize[sub++] = bm; dimSize[sub++] = bm;
XTensor c = NewTensor(order, dimSize, a.dataType, a.denseRatio, a.devID, a.mem); float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
c.SetZeroAll(); XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
c.SetTMP(); c.SetTMP();
/*call _MatrixMulBatched function */ /*call _MatrixMulBatched function */
_MatrixMulBatched(&a, transposedA, &b, transposedB, &c, alpha, beta, parallelRunner); _MatrixMulBatched(&a, transposedA, &b, transposedB, &c, alpha, 0, parallelRunner);
/* tensor connections */ /* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MATRIXMULBATCHED); XLink::MakeLink(&a, &b, &c, MATH_MATRIXMULBATCHED);
XLink::AddParamToHeadTrans(&c, transposedA); XLink::AddParamToHeadTrans(&c, transposedA);
XLink::AddParamToHeadTrans(&c, transposedB); XLink::AddParamToHeadTrans(&c, transposedB);
XLink::AddParamToHead(&c, alpha); XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHead(&c, beta);
/* destroy variables */ /* destroy variables */
delete[] dimSize; delete[] dimSize;
......
...@@ -27,7 +27,7 @@ ...@@ -27,7 +27,7 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
matrix multiplication of the two tensors matrix multiplication of the two tensors c = trans(a) * trans(b) * alpha + c * beta
for each 2-dimensional data array in a (denoted as ai) and for each 2-dimensional data array in a (denoted as ai) and
each 2-dimensional data array in b (denoted as bi), we have each 2-dimensional data array in b (denoted as bi), we have
...@@ -35,10 +35,10 @@ ci = trans(ai) * trans(bi) * alpha + cm * beta ...@@ -35,10 +35,10 @@ ci = trans(ai) * trans(bi) * alpha + cm * beta
where trans() returns the transposed matrix if the flag is fired where trans() returns the transposed matrix if the flag is fired
*/ */
void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL); XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
/* /*
matrix multiplication of the two tensors (return a XTensor structure) matrix multiplication of the two tensors (return a XTensor structure) c = trans(a) * trans(b) * alpha
make a new tensor to keep the result and return it make a new tensor to keep the result and return it
for each 2-dimensional data array in a (denoted as ai) and for each 2-dimensional data array in a (denoted as ai) and
...@@ -47,7 +47,7 @@ ci = trans(ai) * trans(bi) * alpha + cm * beta ...@@ -47,7 +47,7 @@ ci = trans(ai) * trans(bi) * alpha + cm * beta
where trans() returns the transposed matrix if the flag is fired where trans() returns the transposed matrix if the flag is fired
*/ */
XTensor MatrixMulBatched(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB, XTensor MatrixMulBatched(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL); DTYPE alpha = (DTYPE)1.0, XPRunner * parallelRunner = NULL);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
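MatrixMulBatched follows the same pattern: beta is gone from the front end and the leading dimensions of the result now follow a.dimSize directly. A sketch, assuming a is batch * n * k and b is batch * k * m so that each 2-D slice can be multiplied:

XTensor c = MatrixMulBatched(a, X_NOTRANS, b, X_NOTRANS);      /* c_i = a_i * b_i, result is batch * n * m */
XTensor d = MatrixMulBatched(a, X_TRANS, b, X_NOTRANS, 0.5F);  /* d_i = trans(a_i) * b_i * 0.5, provided the transposed shapes still match */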
...@@ -142,16 +142,15 @@ void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha, int leadingDim) ...@@ -142,16 +142,15 @@ void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha, int leadingDim)
element-wise product of two tensors (return a XTensor structure) element-wise product of two tensors (return a XTensor structure)
make a new tensor c to keep the result and return it make a new tensor c to keep the result and return it
c(i) = a(i)*b(i) + \alpha * c(i) c(i) = a(i)*b(i)
where i is the index of the item where i is the index of the item
>> a - tensor a >> a - tensor a
>> b - tensor b >> b - tensor b
>> alpha - the coefficient
>> leadingDim - the dimension along which we perform broadcasting >> leadingDim - the dimension along which we perform broadcasting
<< return - the product of the tensors << return - the product of the tensors
*/ */
XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim) XTensor Multiply(const XTensor &a, const XTensor &b, int leadingDim)
{ {
CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!"); CheckNTErrors(a.dimSize[leadingDim] == b.dimSize[leadingDim], "TODO!");
...@@ -159,11 +158,10 @@ XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim ...@@ -159,11 +158,10 @@ XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim
c.SetTMP(); c.SetTMP();
/* call _Multiply function */ /* call _Multiply function */
_Multiply(&a, &b, &c, alpha, leadingDim); _Multiply(&a, &b, &c, 0, leadingDim);
/* tensor connections */ /* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MULTIPLY); XLink::MakeLink(&a, &b, &c, MATH_MULTIPLY);
XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHeadInt(&c, leadingDim); XLink::AddParamToHeadInt(&c, leadingDim);
return c; return c;
......
...@@ -44,10 +44,10 @@ void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha = 0, int leadingDim ...@@ -44,10 +44,10 @@ void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha = 0, int leadingDim
/* /*
element-wise product of two tensors (return a XTensor structure) element-wise product of two tensors (return a XTensor structure)
make a new tensor to keep the result and return it make a new tensor to keep the result and return it
c(i) = a(i)*b(i) + \alpha * c(i) c(i) = a(i)*b(i)
where i is the index of the element where i is the index of the element
*/ */
XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha = 0, int leadingDim = 0); XTensor Multiply(const XTensor &a, const XTensor &b, int leadingDim = 0);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
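With alpha removed, the front-end Multiply is a pure element-wise product. A sketch, assuming a and b are initialized tensors with matching sizes along the chosen dimension:

XTensor c = Multiply(a, b);     /* c(i) = a(i) * b(i) */
XTensor d = Multiply(a, b, 1);  /* leadingDim = 1: the dimension along which broadcasting is performed */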
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
*/ */
#include "../../XTensor.h" #include "../../XTensor.h"
#include "../../XName.h"
#include "Negate.h" #include "Negate.h"
#include "Negate.cuh" #include "Negate.cuh"
...@@ -27,21 +28,55 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -27,21 +28,55 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
set every entry to its minus value set every entry to its minus value
>> a - the tensor we are processing >> a - input tensor we are processing
>> b - output tensor we are processing
*/ */
void _Negate(XTensor * a) void _Negate(const XTensor * a, XTensor * b)
{ {
#ifdef USE_CUDA #ifdef USE_CUDA
/* run it on GPUs */ /* run it on GPUs */
if (a->devID >= 0) { if (a->devID >= 0) {
_CudaNegate(a); _CudaNegate(a, b);
return; return;
} }
#endif #endif
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!"); CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
DTYPE * d = (DTYPE*)a->data; DTYPE * d = (DTYPE*)a->data;
DTYPE * db = (DTYPE*)b->data;
for (int i = 0; i < a->unitNum; i++) for (int i = 0; i < a->unitNum; i++)
d[i] = -d[i]; db[i] = -d[i];
}
/*
set every entry to its minus value (do it on site)
keep the result in the input tensor a and return nothing
>> a - the tensor we are processing
*/
void _NegateMe(XTensor * a)
{
_Negate(a, a);
} }
/*
set every entry to its minus value (return a XTensor structure)
make a new tensor to keep the result and return it
>> a - input tensor we are processing
<< return - the minus value of input tensor
*/
XTensor Negate(const XTensor & a)
{
XTensor b(&a);
b.SetTMP();
/* call _Negate function */
_Negate(&a, &b);
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_NEGATE);
return b;
}
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
...@@ -29,45 +29,49 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -29,45 +29,49 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/* /*
set each entry to its negative value (CUDA Kernel) set each entry to its negative value (CUDA Kernel)
>> d - pointer to the data array >> a - pointer to the input data array
>> b - pointer to the output data array
>> size - size of the data array >> size - size of the data array
*/ */
__global__ __global__
void KernelNegate(DTYPE * d, int size) void KernelNegate(DTYPE * a, DTYPE * b, int size)
{ {
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) if (i < size)
d[i] = -d[i]; b[i] = -a[i];
} }
/* /*
set each entry to its negative value (CUDA Kernel) set each entry to its negative value (CUDA Kernel)
This is for float16 computation This is for float16 computation
>> d - pointer to the data array >> a - pointer to the input data array
>> b - pointer to the output data array
>> size - size of the data array >> size - size of the data array
*/ */
__global__ __global__
void KernelNegate(__half * d, int size) void KernelNegate(__half * a, __half * b, int size)
{ {
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) #if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
if (i < size) if (i < size)
d[i] = __hsub(__float2half(0), d[i]); b[i] = __hsub(__float2half(0), a[i]);
#else #else
if (i < size) if (i < size)
d[i] = __float2half(-__half2float(d[i])); b[i] = __float2half(-__half2float(a[i]));
#endif #endif
} }
/* /*
set each entry to its negative value set each entry to its negative value
>> a - the tensor >> a - input tensor
>> b - output tensor
*/ */
extern "C" extern "C"
void _CudaNegate(XTensor * a) void _CudaNegate(const XTensor * a, XTensor * b)
{ {
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->isSparse == false), "TODO!"); CheckNTErrors((a->isSparse == false), "TODO!");
int gridSize[3]; int gridSize[3];
...@@ -82,10 +86,10 @@ void _CudaNegate(XTensor * a) ...@@ -82,10 +86,10 @@ void _CudaNegate(XTensor * a)
ProtectCudaDev(a->devID, devIDBackup); ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) { if (a->dataType == DEFAULT_DTYPE) {
KernelNegate << <blocks, threads >> >((DTYPE*)a->data, a->unitNum); KernelNegate << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum);
} }
else if (a->dataType == X_FLOAT16) { else if (a->dataType == X_FLOAT16) {
KernelNegate << <blocks, threads >> >((__half*)a->data, a->unitNum); KernelNegate << <blocks, threads >> >((__half*)a->data, (__half*)b->data, a->unitNum);
} }
else { else {
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
......
...@@ -30,15 +30,15 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -30,15 +30,15 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* set each entry to its negative value (CUDA Kernel) */ /* set each entry to its negative value (CUDA Kernel) */
__global__ __global__
void KernelNegate(DTYPE * d, int size); void KernelNegate(DTYPE * a, DTYPE * b, int size);
/* set each entry to its negative value (CUDA Kernel) with float16 data type */ /* set each entry to its negative value (CUDA Kernel) with float16 data type */
__global__ __global__
void KernelNegate(__half * d, int size); void KernelNegate(__half * a, __half * b, int size);
/* set each entry to its negative value */ /* set each entry to its negative value */
extern "C" extern "C"
void _CudaNegate(XTensor * a); void _CudaNegate(const XTensor * a, XTensor * b);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -27,8 +27,19 @@ ...@@ -27,8 +27,19 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* set every entry to its minus value */ /* set every entry to its minus value */
extern "C" void _Negate(const XTensor * a, XTensor * b);
void _Negate(XTensor * a);
/*
set every entry to its minus value (do it on site)
keep the result in the input tensor a and return nothing
*/
void _NegateMe(XTensor * a);
/*
set every entry to its minus value (return a XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor Negate(const XTensor & a);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
*/ */
#include "../../XTensor.h" #include "../../XTensor.h"
#include "../../XName.h"
#include "Sign.h" #include "Sign.h"
#include "Sign.cuh" #include "Sign.cuh"
...@@ -27,27 +28,60 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -27,27 +28,60 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
set every entry to its sign value set every entry to its sign value
>> a - the tensor we are processing >> a - input tensor we are processing
>> b - output tensor we are processing
*/ */
void _Sign(XTensor * a) void _Sign(const XTensor * a, XTensor * b)
{ {
#ifdef USE_CUDA #ifdef USE_CUDA
/* run it on GPUs */ /* run it on GPUs */
if (a->devID >= 0) { if (a->devID >= 0) {
_CudaSign(a); _CudaSign(a, b);
return; return;
} }
#endif #endif
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!"); CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
DTYPE * d = (DTYPE*)a->data; DTYPE * d = (DTYPE*)a->data;
DTYPE * db = (DTYPE*)b->data;
for (int i = 0; i < a->unitNum; i++) { for (int i = 0; i < a->unitNum; i++) {
if (d[i] > 0) if (d[i] > 0)
d[i] = 1.0F; db[i] = 1.0F;
else if (d[i] == 0) else if (d[i] == 0)
d[i] = 0.0F; db[i] = 0.0F;
else else
d[i] = -1.0F; db[i] = -1.0F;
} }
} }
/*
set every entry to its sign value (do it on site)
keep the result in the input tensor a and return nothing
>> a - the tensor we are processing
*/
void _SignMe(XTensor * a)
{
_Sign(a, a);
}
/*
set every entry to its sign value (return a XTensor structure)
make a new tensor to keep the result and return it
>> a - input tensor we are processing
<< return - the sign value of the input tensor
*/
XTensor Sign(const XTensor & a)
{
XTensor b(&a);
b.SetTMP();
/* call _Sign function */
_Sign(&a, &b);
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_SIGN);
return b;
}
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
...@@ -29,43 +29,47 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -29,43 +29,47 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/* /*
set each entry to its sign value (CUDA Kernel) set each entry to its sign value (CUDA Kernel)
>> d - pointer to the data array >> a - pointer to input data array
>> b - pointer to output data array
>> size - size of the data array >> size - size of the data array
*/ */
__global__ __global__
void KernelSign(DTYPE * d, int size) void KernelSign(DTYPE * a, DTYPE * b, int size)
{ {
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) { if (i < size) {
if (d[i] > 0) if (a[i] > 0)
d[i] = 1.0F; b[i] = 1.0F;
else if (d[i] == 0) else if (a[i] == 0)
d[i] = 0.0F; b[i] = 0.0F;
else else
d[i] = -1.0F; b[i] = -1.0F;
} }
} }
/* /*
set each entry to its sign value (CUDA Kernel) set each entry to its sign value with float16 data type value (CUDA Kernel)
This is for float16 computation This is for float16 computation
>> d - pointer to the data array >> a - pointer to input data array
>> b - pointer to output data array
>> size - size of the data array >> size - size of the data array
*/ */
__global__ __global__
void KernelSign(__half * d, int size) void KernelSign(__half * a, __half * b, int size)
{ {
return; return;
} }
/* /*
set each entry to its with float16 data type value set each entry to its sign value
>> a - the tensor >> a - input tensor we are processing
>> b - output tensor we are processing
*/ */
extern "C" extern "C"
void _CudaSign(XTensor * a) void _CudaSign(const XTensor * a, XTensor * b)
{ {
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->isSparse == false), "TODO!"); CheckNTErrors((a->isSparse == false), "TODO!");
int gridSize[3]; int gridSize[3];
...@@ -80,10 +84,10 @@ void _CudaSign(XTensor * a) ...@@ -80,10 +84,10 @@ void _CudaSign(XTensor * a)
ProtectCudaDev(a->devID, devIDBackup); ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) { if (a->dataType == DEFAULT_DTYPE) {
KernelSign << <blocks, threads >> >((DTYPE*)a->data, a->unitNum); KernelSign << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum);
} }
else if (a->dataType == X_FLOAT16) { else if (a->dataType == X_FLOAT16) {
KernelSign << <blocks, threads >> >((__half*)a->data, a->unitNum); KernelSign << <blocks, threads >> >((__half*)a->data, (__half*)b->data, a->unitNum);
} }
else { else {
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
......
...@@ -30,15 +30,15 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -30,15 +30,15 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* set each entry to its sign value (CUDA Kernel) */ /* set each entry to its sign value (CUDA Kernel) */
__global__ __global__
void KernelSign(DTYPE * d, int size); void KernelSign(DTYPE * a, DTYPE * b, int size);
/* set each entry to its sign value (CUDA Kernel) with float16 data type*/ /* set each entry to its sign value (CUDA Kernel) with float16 data type*/
__global__ __global__
void KernelSign(__half * d, int size); void KernelSign(__half * a, __half * b, int size);
/* set each entry to its sign value */ /* set each entry to its sign value */
extern "C" extern "C"
void _CudaSign(XTensor * a); void _CudaSign(const XTensor * a, XTensor * b);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -27,8 +27,19 @@ ...@@ -27,8 +27,19 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* set every entry to its sign value */ /* set every entry to its sign value */
extern "C" void _Sign(const XTensor * a, XTensor * b);
void _Sign(XTensor * a);
/*
set every entry to its sign value (do it on site)
keep the result in the input tensor a and return nothing
*/
void _SignMe(XTensor * a);
/*
set every entry to its sign value (return a XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor Sign(const XTensor & a);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -37,8 +37,8 @@ c = trans(a) * trans(b) * \alpha + c * \beta ...@@ -37,8 +37,8 @@ c = trans(a) * trans(b) * \alpha + c * \beta
>> c - output matrix (2d tensor) >> c - output matrix (2d tensor)
*/ */
void _MatrixMULCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA, void _MatrixMULCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta) XTensor * c, DTYPE alpha, DTYPE beta)
{ {
CheckNTErrors((a && b && c), "Empty input tensors!"); CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((a->order == 2 && b->order == 2 && c->order == 2), CheckNTErrors((a->order == 2 && b->order == 2 && c->order == 2),
......
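For reference, a hedged sketch of a plain (non-transposed) call computing c = a * b with alpha = 1 and beta = 0. X_NOTRANS is assumed to be the MATRIX_TRANS_TYPE value for "no transposition", and InitTensor2D / _SetDataRand are assumed helpers; shapes are illustrative only.
XTensor a, b, c;
InitTensor2D(&a, 2, 3);
InitTensor2D(&b, 3, 4);
InitTensor2D(&c, 2, 4);
_SetDataRand(&a, -1.0F, 1.0F);
_SetDataRand(&b, -1.0F, 1.0F);
c.SetZeroAll();
/* c = a * b, i.e. alpha = 1, beta = 0 (the declared defaults) */
_MatrixMULCPU(&a, X_NOTRANS, &b, X_NOTRANS, &c, 1.0F, 0.0F);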
...@@ -32,11 +32,11 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -32,11 +32,11 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
matrix multiplication via cuda version BLAS matrix multiplication via cuda version BLAS
*/ */
void _CudaBLASMatrixMUL(cublasHandle_t * handle, void _CudaBLASMatrixMUL(cublasHandle_t * handle,
const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
void * c, TENSOR_DATA_TYPE dataTypeC, void * c, TENSOR_DATA_TYPE dataTypeC,
int na, int ma, int nb, int mb, int nc, int mc, int na, int ma, int nb, int mb, int nc, int mc,
DTYPE alpha, DTYPE beta) DTYPE alpha, DTYPE beta)
{ {
/* /*
matrix-matrix multiplication matrix-matrix multiplication
...@@ -89,11 +89,11 @@ void _CudaBLASMatrixMUL(cublasHandle_t * handle, ...@@ -89,11 +89,11 @@ void _CudaBLASMatrixMUL(cublasHandle_t * handle,
matrix multiplication via cuda version BLAS matrix multiplication via cuda version BLAS
*/ */
void _CudaBLASMatrixMULBatched(cublasHandle_t * handle, void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
const void ** a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, const void ** a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
const void ** b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, const void ** b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
void ** c, TENSOR_DATA_TYPE dataTypeC, void ** c, TENSOR_DATA_TYPE dataTypeC,
int count, int na, int ma, int nb, int mb, int nc, int mc, int count, int na, int ma, int nb, int mb, int nc, int mc,
DTYPE alpha, DTYPE beta) DTYPE alpha, DTYPE beta)
{ {
/* /*
matrix-matrix multiplication matrix-matrix multiplication
...@@ -145,11 +145,11 @@ void _CudaBLASMatrixMULBatched(cublasHandle_t * handle, ...@@ -145,11 +145,11 @@ void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
/* matrix multiplication in batch and strided mode via cuda version BLAS */ /* matrix multiplication in batch and strided mode via cuda version BLAS */
extern "C" extern "C"
void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle, void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, long long int strideA, const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, long long int strideA,
const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, long long int strideB, const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, long long int strideB,
void * c, TENSOR_DATA_TYPE dataTypeC, long long int strideC, void * c, TENSOR_DATA_TYPE dataTypeC, long long int strideC,
int count, int na, int ma, int nb, int mb, int nc, int mc, int count, int na, int ma, int nb, int mb, int nc, int mc,
DTYPE alpha, DTYPE beta) DTYPE alpha, DTYPE beta)
{ {
/* /*
matrix-matrix multiplication matrix-matrix multiplication
...@@ -202,10 +202,10 @@ void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle, ...@@ -202,10 +202,10 @@ void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
matrix multiplication via cuda version BLAS matrix multiplication via cuda version BLAS
*/ */
void _CudaBLASMatrixMULList(cublasHandle_t * handle, void _CudaBLASMatrixMULList(cublasHandle_t * handle,
const XList * a, MATRIX_TRANS_TYPE transposedA, const XList * a, MATRIX_TRANS_TYPE transposedA,
const XList * b, MATRIX_TRANS_TYPE transposedB, const XList * b, MATRIX_TRANS_TYPE transposedB,
XList * c, XList * c,
int count, DTYPE alpha, DTYPE beta) int count, DTYPE alpha, DTYPE beta)
{ {
CheckNTErrors((a && b && c), "Empty input lists!"); CheckNTErrors((a && b && c), "Empty input lists!");
CheckNTErrors((a->count == b->count && a->count == c->count), "Input lists must be of the same size!"); CheckNTErrors((a->count == b->count && a->count == c->count), "Input lists must be of the same size!");
...@@ -298,12 +298,12 @@ void _CudaBLASMatrixMULList(cublasHandle_t * handle, ...@@ -298,12 +298,12 @@ void _CudaBLASMatrixMULList(cublasHandle_t * handle,
cudaMemcpy(cpGPU, cp, sizeof(DTYPE*) * c->count, cudaMemcpyHostToDevice); cudaMemcpy(cpGPU, cp, sizeof(DTYPE*) * c->count, cudaMemcpyHostToDevice);
_CudaBLASMatrixMULBatched(handle, _CudaBLASMatrixMULBatched(handle,
(const void**)apGPU, transposedA, a0->dataType, (const void**)apGPU, transposedA, a0->dataType,
(const void**)bpGPU, transposedB, b0->dataType, (const void**)bpGPU, transposedB, b0->dataType,
(void**)cpGPU, c0->dataType, a->count, (void**)cpGPU, c0->dataType, a->count,
a0->dimSize[0], a0->dimSize[1], a0->dimSize[0], a0->dimSize[1],
b0->dimSize[0], b0->dimSize[1], b0->dimSize[0], b0->dimSize[1],
c0->dimSize[0], c0->dimSize[1], alpha, beta); c0->dimSize[0], c0->dimSize[1], alpha, beta);
delete[] ap; delete[] ap;
delete[] bp; delete[] bp;
delete[] cp; delete[] cp;
......
...@@ -28,38 +28,42 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -28,38 +28,42 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* matrix multiplication (BLAS) */ /* matrix multiplication (BLAS) */
extern "C" extern "C"
void _MatrixMULCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0); void _MatrixMULCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
#ifdef USE_CUDA #ifdef USE_CUDA
/* matrix multiplication via cuda version BLAS */ /* matrix multiplication via cuda version BLAS */
extern "C" extern "C"
void _CudaBLASMatrixMUL(cublasHandle_t * handle, void _CudaBLASMatrixMUL(cublasHandle_t * handle,
const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
void * c, TENSOR_DATA_TYPE dataTypeC, void * c, TENSOR_DATA_TYPE dataTypeC,
int na, int ma, int nb, int mb, int nc, int mc, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0); int na, int ma, int nb, int mb, int nc, int mc, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
/* matrix multiplication in batch mode via cuda version BLAS */ /* matrix multiplication in batch mode via cuda version BLAS */
extern "C" extern "C"
void _CudaBLASMatrixMULBatched(cublasHandle_t * handle, void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
const void ** a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, const void ** a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
const void ** b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, const void ** b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
void ** c, TENSOR_DATA_TYPE dataTypeC, void ** c, TENSOR_DATA_TYPE dataTypeC,
int count, int na, int ma, int nb, int mb, int nc, int mc, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0); int count, int na, int ma, int nb, int mb, int nc, int mc,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
/* matrix multiplication in batch and strided mode via cuda version BLAS */ /* matrix multiplication in batch and strided mode via cuda version BLAS */
extern "C" extern "C"
void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle, void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, long long int strideA, const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, long long int strideA,
const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, long long int strideB, const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, long long int strideB,
void * c, TENSOR_DATA_TYPE dataTypeC, long long int strideC, void * c, TENSOR_DATA_TYPE dataTypeC, long long int strideC,
int count, int na, int ma, int nb, int mb, int nc, int mc, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0); int count, int na, int ma, int nb, int mb, int nc, int mc,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
/* matrix multiplication in batch mode via cuda version BLAS */ /* matrix multiplication in batch mode via cuda version BLAS */
extern "C" extern "C"
void _CudaBLASMatrixMULList(cublasHandle_t * handle, const XList * a, MATRIX_TRANS_TYPE transposedA, const XList * b, MATRIX_TRANS_TYPE transposedB, XList * c, void _CudaBLASMatrixMULList(cublasHandle_t * handle, const XList * a, MATRIX_TRANS_TYPE transposedA,
int count, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0); const XList * b, MATRIX_TRANS_TYPE transposedB, XList * c,
int count, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
#endif #endif
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -96,7 +96,6 @@ XTensor SelectRange(const XTensor &a, int dim, int low, int high) ...@@ -96,7 +96,6 @@ XTensor SelectRange(const XTensor &a, int dim, int low, int high)
int order = a.order; int order = a.order;
int * dimSize = new int[order]; int * dimSize = new int[order];
CheckNTErrors(&a != NULL, "Empty input tensors!");
CheckNTErrors(dim >= 0 && dim < a.order, "The input dimension is out of bounds!"); CheckNTErrors(dim >= 0 && dim < a.order, "The input dimension is out of bounds!");
CheckNTErrors(low < high, "Illegal range specified!"); CheckNTErrors(low < high, "Illegal range specified!");
...@@ -110,8 +109,8 @@ XTensor SelectRange(const XTensor &a, int dim, int low, int high) ...@@ -110,8 +109,8 @@ XTensor SelectRange(const XTensor &a, int dim, int low, int high)
dimSize[i] = a.dimSize[i]; dimSize[i] = a.dimSize[i];
} }
XTensor c = NewTensor(order, dimSize, a.dataType, a.denseRatio, a.devID, a.mem); float dr = (!a.isSparse) ? 1.0F : a.denseRatio;
c.SetZeroAll(); XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
c.SetTMP(); c.SetTMP();
/* call _SelectRange function */ /* call _SelectRange function */
......
...@@ -21,6 +21,8 @@ ...@@ -21,6 +21,8 @@
*/ */
#include "SetData.h" #include "SetData.h"
#include "SetData.cuh"
#include "../../XUtility.h"
#include "../movement/CopyValues.h" #include "../movement/CopyValues.h"
#if !defined( WIN32 ) && !defined( _WIN32 ) #if !defined( WIN32 ) && !defined( _WIN32 )
...@@ -35,13 +37,151 @@ ...@@ -35,13 +37,151 @@
namespace nts{ // namespace nts(NiuTrans.Tensor) namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
generate data items with a fixed value p
>> tensor - the tensor whose data array would be initialized
>> valuePointer - pointer to the value used to initialize the tensor
*/
void _SetDataFixed(XTensor * tensor, void * valuePointer)
{
int num = tensor->unitNum;
if(tensor->dataType == X_INT){
int p = *(int*)valuePointer;
if(tensor->devID < 0){
int * d = (int*)tensor->data;
if(num % 4 == 0){
for(int i = 0; i < num; i += 4){
d[i] = p;
d[i + 1] = p;
d[i + 2] = p;
d[i + 3] = p;
}
}
else{
for(int i = 0; i < num; i++)
d[i] = p;
}
}
else{
#ifdef USE_CUDA
CudaSetDataFixedInt(tensor, p);
#endif
}
}
else if(tensor->dataType == X_FLOAT){
float p = *(float*)valuePointer;
if(tensor->devID < 0){
float * d = (float*)tensor->data;
if(num % 4 == 0){
for(int i = 0; i < num; i += 4){
d[i] = p;
d[i + 1] = p;
d[i + 2] = p;
d[i + 3] = p;
}
}
else{
for(int i = 0; i < num; i++)
d[i] = p;
}
}
else{
#ifdef USE_CUDA
CudaSetDataFixedFloat(tensor, p);
#endif
}
}
else if(tensor->dataType == X_DOUBLE){
double p = *(double*)valuePointer;
if(tensor->devID < 0){
double * d = (double*)tensor->data;
if(num % 4 == 0){
for(int i = 0; i < num; i += 4){
d[i] = p;
d[i + 1] = p;
d[i + 2] = p;
d[i + 3] = p;
}
}
else{
for(int i = 0; i < num; i++)
d[i] = p;
}
}
else{
#ifdef USE_CUDA
CudaSetDataFixedDouble(tensor, p);
#endif
}
}
else{
ShowNTErrors("TODO");
}
}
/*
generate data items with a fixed value p (in default type)
>> tensor - the tensor whose data array would be initialized
>> p - number in default type
*/
void SetDataFixed(XTensor &tensor, DTYPE p)
{
_SetDataFixed(&tensor, &p);
}
/*
generate data items with a fixed value p (in integer)
>> tensor - the tensor whose data array would be initialized
>> p - an int-valued number
*/
void _SetDataFixedInt(XTensor * tensor, int p)
{
CheckNTErrors(tensor->dataType == X_INT, "the tensor must be in X_INT");
if(p == 0)
tensor->SetZeroAll();
else
_SetDataFixed(tensor, &p);
}
/*
generate data items with a fixed value p (in float)
>> tensor - the tensor whose data array would be initialized
>> p - a float-valued number
*/
void _SetDataFixedFloat(XTensor * tensor, float p)
{
CheckNTErrors(tensor->dataType == X_FLOAT, "the tensor must be in X_FLOAT");
if(p == 0)
tensor->SetZeroAll();
else
_SetDataFixed(tensor, &p);
}
/*
generate data items with a fixed value p (in double)
>> tensor - the tensor whose data array would be initialized
>> p - a double-valued number
*/
void _SetDataFixedDouble(XTensor * tensor, double p)
{
CheckNTErrors(tensor->dataType == X_DOUBLE, "the tensor must be in X_DOUBLE");
if(p == 0)
tensor->SetZeroAll();
else
_SetDataFixed(tensor, &p);
}
/* /*
generate data items with a uniform distribution in [low,high] generate data items with a uniform distribution in [low,high]
>> tensor - the tensor whose data array would be initialized >> tensor - the tensor whose data array would be initialized
>> low - lower value of the range >> low - lower value of the range
>> high - higher value of the range >> high - higher value of the range
*/ */
void SetDataRand(XTensor * tensor, DTYPE low, DTYPE high) void _SetDataRand(XTensor * tensor, DTYPE low, DTYPE high)
{ {
if(tensor == NULL) if(tensor == NULL)
return; return;
...@@ -76,7 +216,7 @@ void SetDataRand(XTensor * tensor, DTYPE low, DTYPE high) ...@@ -76,7 +216,7 @@ void SetDataRand(XTensor * tensor, DTYPE low, DTYPE high)
*/ */
else{ else{
XTensor * t2 = NewTensor(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, -1); XTensor * t2 = NewTensor(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, -1);
SetDataRand(t2, low, high); _SetDataRand(t2, low, high);
_CopyValues(t2, tensor); _CopyValues(t2, tensor);
delete t2; delete t2;
} }
......
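A short sketch of how the new initialization entry points fit together (illustrative only; InitTensor2D is an assumed helper and the default data type is assumed to be X_FLOAT). SetDataFixed dispatches on tensor->dataType, while the typed _SetDataFixed* helpers check the type and short-circuit p == 0 to SetZeroAll:
XTensor t;
InitTensor2D(&t, 4, 8);            /* dense tensor, assumed X_FLOAT by default */
SetDataFixed(t, 3.14F);            /* every entry becomes 3.14 (float branch) */
_SetDataFixedFloat(&t, 0.0F);      /* p == 0 falls back to SetZeroAll() */
_SetDataRand(&t, -0.1F, 0.1F);     /* uniform values in [-0.1, 0.1] */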
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-18
* I'm surprised that I did not write this file till today.
*/
#include "SetData.cuh"
#include "../../XDevice.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
set an integer data array with a fixed value p (in int)
>> d - pointer to the data array
>> size - size of the array
>> p - the initial value
*/
__global__
void KernelSetDataFixedInt(int * d, int size, int p)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
d[i] = p;
}
/*
generate data items with a fixed value p (in int)
>> tensor - the tensor for initialization
>> p - the initial value
*/
void CudaSetDataFixedInt(XTensor * tensor, int p)
{
CheckNTErrors(tensor->dataType == X_INT, "the tensor must be in X_INT!");
int gridSize[3];
int blockSize[3];
GDevs.GetCudaThread(tensor->devID, tensor->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
KernelSetDataFixedInt <<<blocks, threads >>>((int*)tensor->data, tensor->unitNum, p);
BacktoCudaDev(tensor->devID, devIDBackup);
}
/*
set a float data array with a fixed value p (in float)
>> d - pointer to the data array
>> size - size of the array
>> p - the initial value
*/
__global__
void KernelSetDataFixedFloat(float * d, int size, float p)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
d[i] = p;
}
/*
generate data items with a fixed value p (in float)
>> tensor - the tensor for initialization
>> p - the initial value
*/
void CudaSetDataFixedFloat(XTensor * tensor, float p)
{
CheckNTErrors(tensor->dataType == X_FLOAT, "the tensor must be in X_FLOAT!");
int gridSize[3];
int blockSize[3];
GDevs.GetCudaThread(tensor->devID, tensor->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
KernelSetDataFixedFloat <<<blocks, threads >>>((float*)tensor->data, tensor->unitNum, p);
BacktoCudaDev(tensor->devID, devIDBackup);
}
/*
set a double data array with a fixed value p (in double)
>> d - pointer to the data array
>> size - size of the array
>> p - the initial value
*/
__global__
void KernelSetDataFixedDouble(double * d, int size, double p)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
d[i] = p;
}
/*
generate data items with a fixed value p (in double)
>> tensor - the tensor for initialization
>> p - the initial value
*/
void CudaSetDataFixedDouble(XTensor * tensor, double p)
{
CheckNTErrors(tensor->dataType == X_DOUBLE, "the tensor must be in X_DOUBLE!");
int gridSize[3];
int blockSize[3];
GDevs.GetCudaThread(tensor->devID, tensor->unitNum, gridSize, blockSize);
dim3 blocks(gridSize[0]);
dim3 threads(blockSize[0]);
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
KernelSetDataFixedDouble <<<blocks, threads >>>((double*)tensor->data, tensor->unitNum, p);
BacktoCudaDev(tensor->devID, devIDBackup);
}
} // namespace nts(NiuTrans.Tensor)
/*
* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-18
* I'm surprised that I did not write this file till today.
*/
#ifndef __SETDATA_CUH__
#define __SETDATA_CUH__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* generate data items with a fixed value p (in int) */
void CudaSetDataFixedInt(XTensor * tensor, int p);
/* generate data items with a fixed value p (in float) */
void CudaSetDataFixedFloat(XTensor * tensor, float p);
/* generate data items with a fixed value p (in double) */
void CudaSetDataFixedDouble(XTensor * tensor, double p);
} // namespace nts(NiuTrans.Tensor)
#endif // __SETDATA_CUH__
\ No newline at end of file
...@@ -28,28 +28,25 @@ ...@@ -28,28 +28,25 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* generate data items with a fixed value p */ /* generate data items with a fixed value p */
extern "C" void _SetDataFixed(XTensor * tensor, void * valuePointer);
void SetDataFixed(XTensor * tensor, void * valuePointer);
/* generate data items with a fixed value p (in default type) */
void SetDataFixed(XTensor &tensor, DTYPE p);
/* generate data items with a fixed value p (in int) */ /* generate data items with a fixed value p (in int) */
extern "C" void _SetDataFixedInt(XTensor * tensor, int p);
void SetDataFixedInt(XTensor * tensor, int p);
/* generate data items with a fixed value p (in float) */ /* generate data items with a fixed value p (in float) */
extern "C" void _SetDataFixedFloat(XTensor * tensor, float p);
void SetDataFixedFloat(XTensor * tensor, float p);
/* generate data items with a fixed value p (in double) */ /* generate data items with a fixed value p (in double) */
extern "C" void _SetDataFixedDouble(XTensor * tensor, double p);
void SetDataFixedDouble(XTensor * tensor, double p);
/* generate data items with a uniform distribution in [low,high] */ /* generate data items with a uniform distribution in [low,high] */
extern "C" void _SetDataRand(XTensor * tensor, DTYPE low, DTYPE high);
void SetDataRand(XTensor * tensor, DTYPE low, DTYPE high);
/* generate data items with a normal distribution with specified mean and standard deviation */ /* generate data items with a normal distribution with specified mean and standard deviation */
extern "C" void _SetDataRandN(XTensor * tensor, DTYPE mean, DTYPE standardDeviation);
void SetDataRandN(XTensor * tensor, DTYPE mean, DTYPE standardDeviation);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
*/ */
#include "../../XTensor.h" #include "../../XTensor.h"
#include "../../XName.h"
#include "Log.h" #include "Log.h"
#include "Log.cuh" #include "Log.cuh"
#include <math.h> #include <math.h>
...@@ -27,22 +28,55 @@ ...@@ -27,22 +28,55 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
set every entry to its log value set every entry to its log value
>> a - the tensor we are processing >> a - input tensor we are processing
>> b - output tensor we are processing
*/ */
void _Log(XTensor * a) void _Log(const XTensor * a, XTensor * b)
{ {
#ifdef USE_CUDA #ifdef USE_CUDA
/* run it on GPUs */ /* run it on GPUs */
if (a->devID >= 0) { if (a->devID >= 0) {
_CudaLog(a); _CudaLog(a, b);
return; return;
} }
#endif #endif
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!"); CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
DTYPE * d = (DTYPE*)a->data; DTYPE * d = (DTYPE*)a->data;
DTYPE * db = (DTYPE*)b->data;
for (int i = 0; i < a->unitNum; i++) for (int i = 0; i < a->unitNum; i++)
d[i] = (DTYPE)log(d[i]); db[i] = (DTYPE)log(d[i]);
}
/*
set every entry to its log value (do it on site)
keep the result in the input tensor a and return nothing
>> a - the tensor we are processing
*/
void _LogMe(XTensor * a)
{
_Log(a, a);
}
/*
set every entry to its log value (return a XTensor structure)
make a new tensor to keep the result and return it
>> a - input tensor we are processing
<< return - the log value of the input tensor
*/
XTensor Log(const XTensor & a)
{
XTensor b(&a);
b.SetTMP();
/* call _Log function */
_Log(&a, &b);
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_LOG);
return b;
} }
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
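A minimal sketch of the three Log entry points (illustrative; InitTensor2D and SetDataFixed are assumed helpers):
XTensor x, y;
InitTensor2D(&x, 2, 2);
SetDataFixed(x, 1.0F);
y = Log(x);                        /* element-wise log, here all zeros; MATH_LOG is recorded */
_LogMe(&x);                        /* in-place variant, calls _Log(x, x) */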
...@@ -29,37 +29,41 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -29,37 +29,41 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/* /*
set each entry to its log value (CUDA Kernel) set each entry to its log value (CUDA Kernel)
>> d - pointer to the data array >> a - pointer to input data array
>> b - pointer to output data array
>> size - size of the data array >> size - size of the data array
*/ */
__global__ __global__
void KernelLog(DTYPE * d, int size) void KernelLog(DTYPE * a, DTYPE * b, int size)
{ {
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) if (i < size)
d[i] = log(d[i]); b[i] = log(a[i]);
} }
/* /*
set each entry to its log value (CUDA Kernel) set each entry to its log value (CUDA Kernel)
This is for float16 computation This is for float16 computation
>> d - pointer to the data array >> a - pointer to input data array
>> b - pointer to output data array
>> size - size of the data array >> size - size of the data array
*/ */
__global__ __global__
void KernelLog(__half * d, int size) void KernelLog(__half * a, __half * b, int size)
{ {
return; return;
} }
/* /*
set each entry to its log value set each entry to its log value
>> a - the tensor >> a - input tensor
>> b - output tensor
*/ */
extern "C" extern "C"
void _CudaLog(XTensor * a) void _CudaLog(const XTensor * a, XTensor * b)
{ {
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->isSparse == false), "TODO!"); CheckNTErrors((a->isSparse == false), "TODO!");
int gridSize[3]; int gridSize[3];
...@@ -74,10 +78,10 @@ void _CudaLog(XTensor * a) ...@@ -74,10 +78,10 @@ void _CudaLog(XTensor * a)
ProtectCudaDev(a->devID, devIDBackup); ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) { if (a->dataType == DEFAULT_DTYPE) {
KernelLog << <blocks, threads >> >((DTYPE*)a->data, a->unitNum); KernelLog << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum);
} }
else if (a->dataType == X_FLOAT16) { else if (a->dataType == X_FLOAT16) {
KernelLog << <blocks, threads >> >((__half*)a->data, a->unitNum); KernelLog << <blocks, threads >> >((__half*)a->data, (__half*)b->data, a->unitNum);
} }
else { else {
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
......
...@@ -30,15 +30,15 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -30,15 +30,15 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* set each entry to its log value (CUDA Kernel) */ /* set each entry to its log value (CUDA Kernel) */
__global__ __global__
void KernelLog(DTYPE * d, int size); void KernelLog(DTYPE * a, DTYPE * b, int size);
/* set each entry to its log value (CUDA Kernel) with float16 data type*/ /* set each entry to its log value (CUDA Kernel) with float16 data type*/
__global__ __global__
void KernelLog(__half * d, int size); void KernelLog(__half * a, __half * b, int size);
/* set each entry to its log value */ /* set each entry to its log value */
extern "C" extern "C"
void _CudaLog(XTensor * a); void _CudaLog(const XTensor * a, XTensor * b);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -27,8 +27,19 @@ ...@@ -27,8 +27,19 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* set every entry to its log value */ /* set every entry to its log value */
extern "C" void _Log(const XTensor * a, XTensor * b);
void _Log(XTensor * a);
/*
set every entry to its log value (do it on site)
keep the result in the input tensor a and return nothing
*/
void _LogMe(XTensor * a);
/*
set every entry to its log value (return a XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor Log(const XTensor & a);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include <math.h> #include <math.h>
#include "../../XTensor.h" #include "../../XTensor.h"
#include "../../XName.h"
#include "Power.h" #include "Power.h"
#include "Power.cuh" #include "Power.cuh"
...@@ -28,38 +29,73 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -28,38 +29,73 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
get the power(a, p) get the power(a, p)
>> a - the tensor >> a - input tensor
>> p - as it is >> b - output tensor
>> p - parameter
*/ */
void _Power(XTensor * a, DTYPE p) void _Power(const XTensor * a, XTensor * b, DTYPE p)
{ {
#ifdef USE_CUDA #ifdef USE_CUDA
/* run it on GPUs */ /* run it on GPUs */
if (a->devID >= 0) { if (a->devID >= 0) {
_CudaPower(a, p); _CudaPower(a, b, p);
return; return;
} }
#endif #endif
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!"); CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
DTYPE * d = (DTYPE*)a->data; DTYPE * aData = (DTYPE*)a->data;
DTYPE * bData = (DTYPE*)b->data;
if (p == 0) { if (p == 0) {
for (int i = 0; i < a->unitNum; i++) for (int i = 0; i < a->unitNum; i++)
d[i] = (DTYPE)1.0; bData[i] = (DTYPE)1.0;
} }
else if (p == (DTYPE)0.5) { else if (p == (DTYPE)0.5) {
for (int i = 0; i < a->unitNum; i++) for (int i = 0; i < a->unitNum; i++)
d[i] = (DTYPE)sqrt(d[i]); bData[i] = (DTYPE)sqrt(aData[i]);
} }
else if (p == (DTYPE)2.0) { else if (p == (DTYPE)2.0) {
for (int i = 0; i < a->unitNum; i++) for (int i = 0; i < a->unitNum; i++)
d[i] = d[i] * d[i]; bData[i] = aData[i] * aData[i];
} }
else { else {
for (int i = 0; i < a->unitNum; i++) for (int i = 0; i < a->unitNum; i++)
d[i] = (DTYPE)pow(d[i], p); bData[i] = (DTYPE)pow(aData[i], p);
} }
} }
/*
get the power(a, p) (do it on site)
keep the result in the input tensor a and return nothing
>> a - the tensor
>> p - parameter
*/
void _PowerMe(XTensor * a, DTYPE p)
{
_Power(a, a, p);
}
/*
get the power(a, p) (return a XTensor structure)
make a new tensor to keep the result and return it
>> a - input tensor
>> p - parameter
<< return - the power value of the input tensor
*/
XTensor Power(const XTensor & a, DTYPE p)
{
XTensor b(&a);
b.SetTMP();
/* call _Power function */
_Power(&a, &b, p);
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_POWER);
XLink::AddParamToHead(&b, p);
return b;
}
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
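A small usage sketch of the new Power interface (illustrative; positive inputs are used so that the p = 0.5 branch is well defined, and InitTensor2D / _SetDataRand are assumed helpers):
XTensor x, sq, root;
InitTensor2D(&x, 3, 3);
_SetDataRand(&x, 1.0F, 2.0F);      /* keep entries positive */
sq = Power(x, 2.0F);               /* element-wise square */
root = Power(x, 0.5F);             /* element-wise square root */
_PowerMe(&x, 3.0F);                /* overwrite x with x^3 */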
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
#include "../../XDevice.h" #include "../../XDevice.h"
#include "../../XTensor.h" #include "../../XTensor.h"
#include "../movement/CopyValues.cuh"
#include "Power.h" #include "Power.h"
#include "Power.cuh" #include "Power.cuh"
...@@ -30,74 +31,80 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -30,74 +31,80 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
set all entries to its root (CUDA Kernel) set all entries to its root (CUDA Kernel)
>> d - data array >> a - input data array
>> b - output data array
>> size - size of the data array >> size - size of the data array
*/ */
__global__ __global__
void KernelSqrtV2(DTYPE * d, int size) void KernelSqrtV2(DTYPE * a, DTYPE * b, int size)
{ {
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) if (i < size)
d[i] = sqrt(d[i]); b[i] = sqrt(a[i]);
} }
/* /*
set all entries to its root (CUDA Kernel) set all entries to its root (CUDA Kernel)
>> d - data array >> a - input data array
>> b - output data array
>> size - size of the data array >> size - size of the data array
*/ */
__global__ __global__
void KernelSqrtV2(__half * d, int size) void KernelSqrtV2(__half * a, __half * b, int size)
{ {
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) #if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
if (i < size) if (i < size)
d[i] = hsqrt(d[i]); b[i] = hsqrt(a[i]);
#else #else
if (i < size) if (i < size)
d[i] = __float2half(sqrt(__half2float(d[i]))); b[i] = __float2half(sqrt(__half2float(a[i])));
#endif #endif
} }
/* /*
get power(d[i], p) get power(d[i], p)
>> d - data array >> a - input data array
>> b - output data array
>> p - power >> p - power
>> size - size of the data array >> size - size of the data array
*/ */
__global__ __global__
void KernelPower(DTYPE * d, DTYPE p, int size) void KernelPower(DTYPE * a, DTYPE * b, DTYPE p, int size)
{ {
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) if (i < size)
d[i] = pow(d[i], p); b[i] = pow(a[i], p);
} }
/* /*
get power(d[i], p) get power(d[i], p)
>> d - data array >> a - input data array
>> b - output data array
>> p - power >> p - power
>> size - size of the data array >> size - size of the data array
*/ */
__global__ __global__
void KernelPower(__half * d, __half p, int size) void KernelPower(__half * a, __half * b, __half p, int size)
{ {
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__) #if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
#else #else
int i = blockDim.x * blockIdx.x + threadIdx.x; int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) if (i < size)
d[i] = __float2half(pow(__half2float(d[i]), __half2float(p))); b[i] = __float2half(pow(__half2float(a[i]), __half2float(p)));
#endif #endif
} }
/* get the power of the entries */ /* get the power of the entries */
extern "C" extern "C"
void _CudaPower(XTensor * a, DTYPE p) void _CudaPower(const XTensor * a, XTensor * b, DTYPE p)
{ {
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
int gridSize[3]; int gridSize[3];
int blockSize[3]; int blockSize[3];
...@@ -111,15 +118,18 @@ void _CudaPower(XTensor * a, DTYPE p) ...@@ -111,15 +118,18 @@ void _CudaPower(XTensor * a, DTYPE p)
if (a->dataType == DEFAULT_DTYPE) { if (a->dataType == DEFAULT_DTYPE) {
if (p == (DTYPE)0.5) { if (p == (DTYPE)0.5) {
KernelSqrtV2 << <blocks, threads >> >((DTYPE*)a->data, a->unitNum); KernelSqrtV2 << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum);
}
else if (p == (DTYPE)1.0) {
_CudaCopyValues(a, b);
} }
else if (p != (DTYPE)1.0) { else if (p != (DTYPE)1.0) {
KernelPower << <blocks, threads >> >((DTYPE*)a->data, p, a->unitNum); KernelPower << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, p, a->unitNum);
} }
} }
else if (a->dataType == X_FLOAT16) { else if (a->dataType == X_FLOAT16) {
if (p == (DTYPE)0.5) { if (p == (DTYPE)0.5) {
KernelSqrtV2 << <blocks, threads >> >((__half*)a->data, a->unitNum); KernelSqrtV2 << <blocks, threads >> >((__half*)a->data, (__half*)b->data, a->unitNum);
} }
else if (p != (DTYPE)1.0) { else if (p != (DTYPE)1.0) {
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
......
...@@ -30,15 +30,15 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -30,15 +30,15 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* set all entries to its root (CUDA Kernel) */ /* set all entries to its root (CUDA Kernel) */
__global__ __global__
void KernelSqrtV2(DTYPE * d, int size); void KernelSqrtV2(DTYPE * a, DTYPE * b, int size);
/* set all entries to its root (CUDA Kernel) */ /* set all entries to its root (CUDA Kernel) */
__global__ __global__
void KernelSqrtV2(__half * d, int size); void KernelSqrtV2(__half * a, __half * b, int size);
/* get the power of the entries */ /* get the power of the entries */
extern "C" extern "C"
void _CudaPower(XTensor * a, DTYPE p); void _CudaPower(const XTensor * a, XTensor * b, DTYPE p);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -27,8 +27,19 @@ ...@@ -27,8 +27,19 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* get the power(x, y) */ /* get the power(x, y) */
extern "C" void _Power(const XTensor * a, XTensor * b, DTYPE p);
void _Power(XTensor * a, DTYPE p);
/*
get the power(x, y) (do it on site)
keep the result in the input tensor a and return nothing
*/
void _PowerMe(XTensor * a, DTYPE p);
/*
get the power(x, y) (return a XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor Power(const XTensor & a, DTYPE p);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -110,8 +110,7 @@ make a new tensor to keep the result and return it ...@@ -110,8 +110,7 @@ make a new tensor to keep the result and return it
*/ */
XTensor CopyIndexed(const XTensor &s, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum) XTensor CopyIndexed(const XTensor &s, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum)
{ {
CheckNTErrors(&s, "Empty input tensor!"); CheckNTErrors(dim >= 0 && dim < s.order, "A too larget dimension specified!");
CheckNTErrors((dim >= 0 && dim < s.order), "A too larget dimension specified!");
int order = s.order; int order = s.order;
int * dimSize = new int[order]; int * dimSize = new int[order];
...@@ -123,16 +122,13 @@ XTensor CopyIndexed(const XTensor &s, int dim, int * srcIndex, int indexSize, in ...@@ -123,16 +122,13 @@ XTensor CopyIndexed(const XTensor &s, int dim, int * srcIndex, int indexSize, in
dimSize[i] = s.dimSize[i]; dimSize[i] = s.dimSize[i];
} }
XTensor t = NewTensor(order, dimSize, s.dataType, s.denseRatio, s.devID, s.mem); float dr = (!s.isSparse) ? 1.0F : s.denseRatio;
t.SetZeroAll(); XTensor t(order, dimSize, s.dataType, dr, s.devID, s.mem);
t.SetTMP(); t.SetTMP();
/* call _CopyIndexed function */ /* call _CopyIndexed function */
_CopyIndexed(&s, &t, dim, srcIndex, indexSize, tgtIndex, copyNum); _CopyIndexed(&s, &t, dim, srcIndex, indexSize, tgtIndex, copyNum);
/* destroy variables */
delete[] dimSize;
/* tensor connection */ /* tensor connection */
XLink::MakeLink(&s, NULL, &t, MOVEMENT_COPYINDEXED); XLink::MakeLink(&s, NULL, &t, MOVEMENT_COPYINDEXED);
XLink::AddParamToHeadInt(&t, dim); XLink::AddParamToHeadInt(&t, dim);
...@@ -140,7 +136,10 @@ XTensor CopyIndexed(const XTensor &s, int dim, int * srcIndex, int indexSize, in ...@@ -140,7 +136,10 @@ XTensor CopyIndexed(const XTensor &s, int dim, int * srcIndex, int indexSize, in
XLink::AddParamToHeadInt(&t, indexSize); XLink::AddParamToHeadInt(&t, indexSize);
XLink::AddParamToHeadPointer(&t, tgtIndex); XLink::AddParamToHeadPointer(&t, tgtIndex);
XLink::AddParamToHeadInt(&t, copyNum); XLink::AddParamToHeadInt(&t, copyNum);
/* destroy variables */
delete[] dimSize;
return t; return t;
} }
......
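A hedged sketch of the returning CopyIndexed overload (illustrative; the output size along dim is assumed to be indexSize * copyNum, and InitTensor2D / _SetDataRand are assumed helpers):
XTensor s, t;
InitTensor2D(&s, 4, 3);
_SetDataRand(&s, -1.0F, 1.0F);
int srcIndex[] = {0, 2};           /* copy columns 0 and 2 of s ...   */
int tgtIndex[] = {0, 1};           /* ... into columns 0 and 1 of t   */
t = CopyIndexed(s, 1, srcIndex, 2, tgtIndex, 1);   /* t is expected to be 4 x 2 */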
...@@ -101,32 +101,31 @@ make a new tensor to keep the result and return it ...@@ -101,32 +101,31 @@ make a new tensor to keep the result and return it
*/ */
XTensor ReduceMax(const XTensor &input, int dim) XTensor ReduceMax(const XTensor &input, int dim)
{ {
CheckNTErrors(&input, "Empty input or output tensors!"); CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
CheckNTErrors((dim >= 0 && dim < input.order), "Illegal dimension to reduce!");
int order = input.order - 1; int order = input.order - 1;
int * dimSize = new int[order]; int * dimSize = new int[order];
for(int i = 0; i < input.order; i++){ for(int i = 0; i < order; i++){
if(i < dim) if(i < dim)
dimSize[i] = input.dimSize[i]; dimSize[i] = input.dimSize[i];
else if(i > dim) else if(i >= dim)
dimSize[i] = input.dimSize[i + 1]; dimSize[i] = input.dimSize[i + 1];
} }
XTensor output = NewTensor(order, dimSize, input.dataType, input.denseRatio, input.devID, input.mem); float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
output.SetZeroAll(); XTensor output(order, dimSize, input.dataType, dr, input.devID, input.mem);
output.SetTMP(); output.SetTMP();
/* call _ReduceMax function */ /* call _ReduceMax function */
_ReduceMax(&input, &output, dim); _ReduceMax(&input, &output, dim);
/* destroy variables */
delete[] dimSize;
/* tensor connection */ /* tensor connection */
XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCEMAX); XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCEMAX);
XLink::AddParamToHeadInt(&output, dim); XLink::AddParamToHeadInt(&output, dim);
/* destroy variables */
delete[] dimSize;
return output; return output;
} }
......
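A minimal sketch of the returning ReduceMax (illustrative; reducing an n x m tensor along dimension 1 yields an order-1 tensor of size n, and InitTensor2D / _SetDataRand are assumed helpers):
XTensor input, maxAlongDim1;
InitTensor2D(&input, 2, 4);
_SetDataRand(&input, 0.0F, 1.0F);
maxAlongDim1 = ReduceMax(input, 1);   /* order-1 tensor with 2 entries, REDUCE_REDUCEMAX recorded */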
...@@ -58,20 +58,19 @@ For a 1-dimensional data array a, mean = (1/n) * sum_i input_i ...@@ -58,20 +58,19 @@ For a 1-dimensional data array a, mean = (1/n) * sum_i input_i
*/ */
XTensor ReduceMean(const XTensor &input, int dim) XTensor ReduceMean(const XTensor &input, int dim)
{ {
CheckNTErrors(&input, "Empty input or output tensors!"); CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
CheckNTErrors((dim >= 0 && dim < input.order), "Illegal dimension to reduce!");
int order = input.order - 1; int order = input.order - 1;
int * dimSize = new int[order]; int * dimSize = new int[order];
for(int i = 0; i < input.order; i++){ for(int i = 0; i < order; i++){
if(i < dim) if(i < dim)
dimSize[i] = input.dimSize[i]; dimSize[i] = input.dimSize[i];
else if(i > dim) else if(i >= dim)
dimSize[i] = input.dimSize[i + 1]; dimSize[i] = input.dimSize[i + 1];
} }
XTensor output = NewTensor(order, dimSize, input.dataType, input.denseRatio, input.devID, input.mem); float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
output.SetZeroAll(); XTensor output(order, dimSize, input.dataType, dr, input.devID, input.mem);
output.SetTMP(); output.SetTMP();
/* call _ReduceMean function */ /* call _ReduceMean function */
...@@ -87,4 +86,4 @@ XTensor ReduceMean(const XTensor &input, int dim) ...@@ -87,4 +86,4 @@ XTensor ReduceMean(const XTensor &input, int dim)
return output; return output;
} }
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
...@@ -214,20 +214,19 @@ sum = \sum_i exp((a_i - shift)^power) if isExp == true ...@@ -214,20 +214,19 @@ sum = \sum_i exp((a_i - shift)^power) if isExp == true
*/ */
XTensor ReduceSum(const XTensor &input, int dim, const XTensor &shift, DTYPE power, bool isExp) XTensor ReduceSum(const XTensor &input, int dim, const XTensor &shift, DTYPE power, bool isExp)
{ {
CheckNTErrors(&input, "Empty input or output tensors!"); CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
CheckNTErrors((dim >= 0 && dim < input.order), "Illegal dimension to reduce!");
int order = input.order - 1; int order = input.order - 1;
int * dimSize = new int[order]; int * dimSize = new int[order];
for(int i = 0; i < input.order; i++){ for(int i = 0; i < order; i++){
if(i < dim) if(i < dim)
dimSize[i] = input.dimSize[i]; dimSize[i] = input.dimSize[i];
else if(i > dim) else if(i >= dim)
dimSize[i] = input.dimSize[i + 1]; dimSize[i] = input.dimSize[i + 1];
} }
XTensor output = NewTensor(order, dimSize, input.dataType, input.denseRatio, input.devID, input.mem); float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
output.SetZeroAll(); XTensor output(order, dimSize, input.dataType, dr, input.devID, input.mem);
output.SetTMP(); output.SetTMP();
/* call _ReduceSum function */ /* call _ReduceSum function */
...@@ -237,6 +236,53 @@ XTensor ReduceSum(const XTensor &input, int dim, const XTensor &shift, DTYPE pow ...@@ -237,6 +236,53 @@ XTensor ReduceSum(const XTensor &input, int dim, const XTensor &shift, DTYPE pow
XLink::MakeLink(&input, &shift, &output, REDUCE_REDUCESUM); XLink::MakeLink(&input, &shift, &output, REDUCE_REDUCESUM);
XLink::AddParamToHeadInt(&output, dim); XLink::AddParamToHeadInt(&output, dim);
XLink::AddParamToHead(&output, power); XLink::AddParamToHead(&output, power);
XLink::AddParamToHeadBool(&output, isExp);
/* destroy variables */
delete[] dimSize;
return output;
}
/*
sum the items along a dimension of the tensor (return a XTensor structure)
make a new tensor to keep the result and return it
For a 1-dimensional data array a,
sum = \sum_i (a_i)^power if isExp == false
sum = \sum_i exp((a_i)^power) if isExp == true
>> input - the input tensor
>> dim - the dimension where the reduction is performed on
>> power - we perform pow(item_i, power) on each item in the array
>> isExp - specify if the exp() is performed
<< return - the sum along a dimension of the tensor
*/
XTensor ReduceSum(const XTensor &input, int dim, DTYPE power, bool isExp)
{
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
int order = input.order - 1;
int * dimSize = new int[order];
for(int i = 0; i < order; i++){
if(i < dim)
dimSize[i] = input.dimSize[i];
else if(i >= dim)
dimSize[i] = input.dimSize[i + 1];
}
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
XTensor output(order, dimSize, input.dataType, dr, input.devID, input.mem);
output.SetTMP();
/* call _ReduceSum function */
_ReduceSum(&input, &output, dim, NULL, power, isExp);
/* tensor connection */
XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCESUM);
XLink::AddParamToHeadInt(&output, dim);
XLink::AddParamToHead(&output, power);
XLink::AddParamToHeadBool(&output, isExp);
/* destroy variables */ /* destroy variables */
delete[] dimSize; delete[] dimSize;
......
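A sketch contrasting the existing shifted ReduceSum with the new shift-free overload, which passes NULL for the shift internally (illustrative; InitTensor2D and _SetDataRand are assumed helpers):
XTensor input, s1, s2, s3;
InitTensor2D(&input, 2, 4);
_SetDataRand(&input, 0.0F, 1.0F);
s1 = ReduceSum(input, 1);              /* plain sum along dimension 1        */
s2 = ReduceSum(input, 1, 2.0F);        /* sum of squared entries (power = 2) */
s3 = ReduceSum(input, 1, 1.0F, true);  /* sum of exp(a_i) (isExp = true)     */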
...@@ -43,7 +43,16 @@ For a 1-dimensional data array a, ...@@ -43,7 +43,16 @@ For a 1-dimensional data array a,
sum = \sum_i (a_i - shift) if isExp == false sum = \sum_i (a_i - shift) if isExp == false
sum = \sum_i exp(a_i - shift) if isExp == true sum = \sum_i exp(a_i - shift) if isExp == true
*/ */
XTensor ReduceSum(const XTensor &input, int dim, const XTensor &shift = NULL, DTYPE power = (DTYPE)1.0F, bool isExp = false); XTensor ReduceSum(const XTensor &input, int dim, const XTensor &shift, DTYPE power = (DTYPE)1.0F, bool isExp = false);
/*
sum the items along a dimension of the tensor (return a XTensor structure)
make a new tensor to keep the result and return it
For a 1-dimensional data array a,
sum = \sum_i (a_i) if isExp == false
sum = \sum_i exp(a_i) if isExp == true
*/
XTensor ReduceSum(const XTensor &input, int dim, DTYPE power = (DTYPE)1.0F, bool isExp = false);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -54,25 +54,24 @@ For a 1-dimensional data array a, sum = \sum_i (a_i - shift)^2 ...@@ -54,25 +54,24 @@ For a 1-dimensional data array a, sum = \sum_i (a_i - shift)^2
*/ */
XTensor ReduceSumSquared(const XTensor &input, int dim, const XTensor &shift) XTensor ReduceSumSquared(const XTensor &input, int dim, const XTensor &shift)
{ {
CheckNTErrors(&input, "Empty input or output tensors!"); CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
CheckNTErrors((dim >= 0 && dim < input.order), "Illegal dimension to reduce!");
int order = input.order - 1; int order = input.order - 1;
int * dimSize = new int[order]; int * dimSize = new int[order];
for(int i = 0; i < input.order; i++){ for(int i = 0; i < order; i++){
if(i < dim) if(i < dim)
dimSize[i] = input.dimSize[i]; dimSize[i] = input.dimSize[i];
else if(i > dim) else if(i >= dim)
dimSize[i] = input.dimSize[i + 1]; dimSize[i] = input.dimSize[i + 1];
} }
XTensor output = NewTensor(order, dimSize, input.dataType, input.denseRatio, input.devID, input.mem); float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
output.SetZeroAll(); XTensor output(order, dimSize, input.dataType, dr, input.devID, input.mem);
output.SetTMP(); output.SetTMP();
/* call _ReduceSumSquared function */ /* call _ReduceSumSquared function */
_ReduceSumSquared(&input, &output, dim, &shift); _ReduceSumSquared(&input, &output, dim, &shift);
/* tensor connection */ /* tensor connection */
XLink::MakeLink(&input, &shift, &output, REDUCE_REDUCESUMSQUARED); XLink::MakeLink(&input, &shift, &output, REDUCE_REDUCESUMSQUARED);
XLink::AddParamToHeadInt(&output, dim); XLink::AddParamToHeadInt(&output, dim);
...@@ -83,4 +82,4 @@ XTensor ReduceSumSquared(const XTensor &input, int dim, const XTensor &shift) ...@@ -83,4 +82,4 @@ XTensor ReduceSumSquared(const XTensor &input, int dim, const XTensor &shift)
return output; return output;
} }
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../../XName.h"
#include "../math/ScaleAndShift.h" #include "../math/ScaleAndShift.h"
#include "ReduceSum.h" #include "ReduceSum.h"
#include "ReduceVariance.h" #include "ReduceVariance.h"
...@@ -56,24 +57,27 @@ For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2 ...@@ -56,24 +57,27 @@ For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2
*/ */
XTensor ReduceVariance(const XTensor &input, int dim, const XTensor &mean) XTensor ReduceVariance(const XTensor &input, int dim, const XTensor &mean)
{ {
CheckNTErrors(&input, "Empty input or output tensors!"); CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
CheckNTErrors((dim >= 0 && dim < input.order), "Illegal dimension to reduce!");
int order = input.order - 1; int order = input.order - 1;
int * dimSize = new int[order]; int * dimSize = new int[order];
for(int i = 0; i < input.order; i++){ for(int i = 0; i < order; i++){
if(i < dim) if(i < dim)
dimSize[i] = input.dimSize[i]; dimSize[i] = input.dimSize[i];
else if(i > dim) else if(i >= dim)
dimSize[i] = input.dimSize[i + 1]; dimSize[i] = input.dimSize[i + 1];
} }
XTensor output = NewTensor(order, dimSize, input.dataType, input.denseRatio, input.devID, input.mem); float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
output.SetZeroAll(); XTensor output(order, dimSize, input.dataType, dr, input.devID, input.mem);
output.SetTMP(); output.SetTMP();
/* call _ReduceVariance function */ /* call _ReduceVariance function */
_ReduceVariance(&input, &output, dim, &mean); _ReduceVariance(&input, &output, dim, &mean);
/* tensor connection */
XLink::MakeLink(&input, &mean, &output, REDUCE_REDUCEVARIANCE);
XLink::AddParamToHeadInt(&output, dim);
/* destroy variables */ /* destroy variables */
delete[] dimSize; delete[] dimSize;
......
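A minimal sketch showing how ReduceMean and ReduceVariance are typically chained, since the variance takes the mean tensor as its shift (illustrative shapes; InitTensor2D and _SetDataRand are assumed helpers):
XTensor input, mean, var;
InitTensor2D(&input, 2, 8);
_SetDataRand(&input, -1.0F, 1.0F);
mean = ReduceMean(input, 1);           /* mean over dimension 1                  */
var = ReduceVariance(input, 1, mean);  /* 1/n * \sum_i (a_i - mean)^2 over dim 1 */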
...@@ -68,8 +68,7 @@ or "Merge" by means of the tensor shapes ...@@ -68,8 +68,7 @@ or "Merge" by means of the tensor shapes
*/ */
XTensor Concatenate(const XList &smalls, int dim) XTensor Concatenate(const XList &smalls, int dim)
{ {
CheckNTErrors(&smalls != NULL, "Invalid list!"); CheckNTErrors(smalls.count > 0, "Empty list!");
CheckNTErrors((smalls.count > 0), "Empty list!");
CheckNTErrors(dim >= 0, "Illegal dimension to concatenate!"); CheckNTErrors(dim >= 0, "Illegal dimension to concatenate!");
bool uniform = true; bool uniform = true;
...@@ -80,40 +79,35 @@ XTensor Concatenate(const XList &smalls, int dim) ...@@ -80,40 +79,35 @@ XTensor Concatenate(const XList &smalls, int dim)
if (!XTensor::IsIdentical(a, b)) if (!XTensor::IsIdentical(a, b))
uniform = false; uniform = false;
} }
XTensor * tensor = (XTensor*)smalls.GetItem(0);
int order = tensor->order;
int * dimSize = new int[order];
int * dimSize;
if (uniform) { if (uniform) {
XTensor * tensor = (XTensor*)smalls.GetItem(0);
int order = tensor->order;
dimSize = new int[order];
for (int i = 0; i < tensor->order; i++) { for (int i = 0; i < tensor->order; i++) {
if (i != dim) if (i != dim)
dimSize[i] = tensor->dimSize[i]; dimSize[i] = tensor->dimSize[i];
else else
dimSize[i] = tensor->dimSize[dim] * smalls.count; dimSize[i] = tensor->dimSize[dim] * smalls.count;
} }
XTensor big = XTensor(order, dimSize, tensor->dataType, tensor->denseRatio, tensor->devID, tensor->mem);
float dr = (!tensor->isSparse) ? 1.0F : tensor->denseRatio;
big.SetZeroAll(); XTensor big(order, dimSize, tensor->dataType, dr, tensor->devID, tensor->mem);
big.SetTMP(); big.SetTMP();
/* call _Merge function */ /* call _Merge function */
_Merge(&smalls, &big, dim); _Merge(&smalls, &big, dim);
///* tensor connection */ /* tensor connection */
//XLink::MakeLink(&smalls, &big, SHAPE_CONCATENATE); XLink::MakeLink(&smalls, &big, SHAPE_MERGE);
//XLink::AddParamToHead(&big, dim); XLink::AddParamToHeadInt(&big, dim);
/* destroy variables */
delete dimSize;
/* destroy variables */
delete[] dimSize;
return big; return big;
} }
else { else {
XTensor * tensor = (XTensor*)smalls.GetItem(0);
int order = tensor->order;
dimSize = new int[order];
for (int i = 0; i < tensor->order; i++) for (int i = 0; i < tensor->order; i++)
if (i != dim) if (i != dim)
dimSize[i] = tensor->dimSize[i]; dimSize[i] = tensor->dimSize[i];
...@@ -125,15 +119,19 @@ XTensor Concatenate(const XList &smalls, int dim) ...@@ -125,15 +119,19 @@ XTensor Concatenate(const XList &smalls, int dim)
} }
dimSize[dim] = catDimSize; dimSize[dim] = catDimSize;
XTensor big = NewTensor(order, dimSize, tensor->dataType, tensor->denseRatio, tensor->devID, tensor->mem); float dr = (!tensor->isSparse) ? 1.0F : tensor->denseRatio;
big.SetZeroAll(); XTensor big(order, dimSize, tensor->dataType, dr, tensor->devID, tensor->mem);
big.SetTMP(); big.SetTMP();
/* call _ConcatenateSolely function */ /* call _ConcatenateSolely function */
_ConcatenateSolely(&smalls, &big, dim); _ConcatenateSolely(&smalls, &big, dim);
/* tensor connection */
XLink::MakeLink(&smalls, &big, SHAPE_CONCATENATE);
XLink::AddParamToHeadInt(&big, dim);
/* destroy variables */ /* destroy variables */
delete dimSize; delete[] dimSize;
return big; return big;
} }
@@ -168,12 +166,76 @@ make a new tensor to keep the result and return it.
 */
XTensor Concatenate(const XTensor &smallA, const XTensor &smallB, int dim)
{
+   CheckNTErrors(dim >= 0, "Illegal dimension to concatenate!");

    XList smalls(2);
    smalls.Add(&smallA);
    smalls.Add(&smallB);

-   /* call Concatenate function */
-   return Concatenate(smalls, dim);
+   bool uniform = true;
+   for (int i = 1; i < smalls.count; i++) {
+       XTensor * a = (XTensor*)smalls.Get(i - 1);
+       XTensor * b = (XTensor*)smalls.Get(i);
+       CheckNTErrors((a && b), "Empty input tensors!");
+       if (!XTensor::IsIdentical(a, b))
+           uniform = false;
+   }
+
+   XTensor * tensor = (XTensor*)smalls.Get(0);
+   int order = tensor->order;
+   int * dimSize = new int[order];
+
+   if (uniform) {
+       for (int i = 0; i < tensor->order; i++) {
+           if (i != dim)
+               dimSize[i] = tensor->dimSize[i];
+           else
+               dimSize[i] = tensor->dimSize[dim] * smalls.count;
+       }
+
+       float dr = (!tensor->isSparse) ? 1.0F : tensor->denseRatio;
+       XTensor big(order, dimSize, tensor->dataType, dr, tensor->devID, tensor->mem);
+       big.SetTMP();
+
+       /* call _Merge function */
+       _Merge(&smalls, &big, dim);
+
+       /* tensor connection */
+       XLink::MakeLink(&smalls, &big, SHAPE_MERGE);
+       XLink::AddParamToHeadInt(&big, dim);
+
+       /* destroy variables */
+       delete[] dimSize;
+
+       return big;
+   }
+   else {
+       for (int i = 0; i < tensor->order; i++)
+           if (i != dim)
+               dimSize[i] = tensor->dimSize[i];
+
+       int catDimSize = 0;
+       for (int i = 0; i < smalls.count; i++) {
+           XTensor * tensor = (XTensor*)smalls.Get(i);
+           catDimSize += tensor->dimSize[dim];
+       }
+       dimSize[dim] = catDimSize;
+
+       float dr = (!tensor->isSparse) ? 1.0F : tensor->denseRatio;
+       XTensor big(order, dimSize, tensor->dataType, dr, tensor->devID, tensor->mem);
+       big.SetTMP();
+
+       /* call _ConcatenateSolely function */
+       _ConcatenateSolely(&smalls, &big, dim);
+
+       /* tensor connection */
+       XLink::MakeLink(&smalls, &big, SHAPE_CONCATENATE);
+       XLink::AddParamToHeadInt(&big, dim);
+
+       /* destroy variables */
+       delete[] dimSize;
+
+       return big;
+   }
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
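For reference, a minimal usage sketch of the two-tensor Concatenate added above (illustrative only, not part of the commit; the tensor shapes are assumptions). Identically shaped inputs go through the merge path.

/* illustrative sketch, not part of the diff */
XTensor x, y;
InitTensor2D(&x, 2, 3);
InitTensor2D(&y, 2, 3);
x.SetZeroAll();
y.SetZeroAll();

XTensor z = Concatenate(x, y, 1);   /* identical 2 x 3 inputs -> z is 2 x 6 */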
@@ -36,7 +36,7 @@ concatenate a list of tensors along a given dimension
 */
void _ConcatenateSolely(const XList * smalls, XTensor * big, int dim)
{
-   CheckNTErrors((big->order > dim && dim >= 0), "Illegal dimension to concatenate!");
+   CheckNTErrors(big->order > dim && dim >= 0, "Illegal dimension to concatenate!");

    int catDimSize = 0;
    int dimRDI = big->order - dim - 1;
...
@@ -36,7 +36,7 @@ set target data block index for the data movement in merge
>> mem - the memory pool
 */
void _MakeMergeBlockIndex(int * blockIndex, int blockNum, int blockNumInMerge,
                          int splitSizeInGrid, int gridSize, int gridNum, XMem * mem)
{
    if (mem != NULL && mem->devID >= 0) {
#ifdef USE_CUDA
...
@@ -40,7 +40,7 @@ set target data block index for the data movement in split (device code)
 */
__global__
void KernelMakeMergeBlockIndex(int * blockIndex, int blockNum, int blockNumInMerge,
                               int splitSizeInGrid, int gridSize, int gridNum)
{
    /* block index */
    int i = blockDim.x * blockIdx.x + threadIdx.x;
@@ -71,8 +71,8 @@ set target data block index for the data movement in split
 */
extern "C"
void _CudaMakeMergeBlockIndex(int devID,
                              int * blockIndex, int blockNum, int blockNumInMerge,
                              int splitSizeInGrid, int gridSize, int gridNum)
{
    int cudaGrids[3];
    int cudaBlocks[3];
...
@@ -30,9 +30,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* set target data block index for the data movement in split */
extern "C"
-void _CudaMakeMergeBlockIndex(int devID,
-                              int * blockIndex, int blockNum, int blockNumInMerge,
-                              int splitSizeInGrid, int gridSize, int gridNum);
+void _CudaMakeMergeBlockIndex(int devID, int * blockIndex, int blockNum, int blockNumInMerge,
+                              int splitSizeInGrid, int gridSize, int gridNum);

#endif // USE_CUDA
...
@@ -29,7 +29,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* set target data block index for the data movement in merge */
extern "C"
void _MakeMergeBlockIndex(int * blockIndex, int blockNum, int blockNumInMerge,
                          int splitSizeInGrid, int gridSize, int gridNum, XMem * mem);

} // namespace nts(NiuTrans.Tensor)
...
@@ -161,8 +161,7 @@ e.g., (N/3, M, 3) -> (N, M)
 */
XTensor Merge(const XTensor &s, int whereToMerge, int leadingDim)
{
-   CheckNTErrors(&s != NULL, "Invalid tensors!");
-   CheckNTErrors((leadingDim < whereToMerge), "Invalid leading dimension!");
+   CheckNTErrors(leadingDim < whereToMerge, "Invalid leading dimension!");

    if (leadingDim < 0)
        leadingDim = 0;
@@ -180,13 +179,18 @@ XTensor Merge(const XTensor &s, int whereToMerge, int leadingDim)
        }
    }

-   XTensor t = NewTensor(order, dimSize, s.dataType, s.denseRatio, s.devID, s.mem);
-   t.SetZeroAll();
+   float dr = (!s.isSparse) ? 1.0F : s.denseRatio;
+   XTensor t(order, dimSize, s.dataType, dr, s.devID, s.mem);
    t.SetTMP();

    /* call _Merge function */
    _Merge(&s, &t, whereToMerge, leadingDim);

+   /* tensor connections */
+   XLink::MakeLink(&s, NULL, &t, SHAPE_MERGE);
+   XLink::AddParamToHeadInt(&t, whereToMerge);
+   XLink::AddParamToHeadInt(&t, leadingDim);

    /* destroy variables */
    delete[] dimSize;
@@ -327,13 +331,58 @@ XTensor Merge(const XList &smalls, int whereToMerge)
            dimSize[i] = tensor->dimSize[whereToMerge] * smalls.count;
    }

-   XTensor big = NewTensor(order, dimSize, tensor->dataType, tensor->denseRatio, tensor->devID, tensor->mem);
-   big.SetZeroAll();
+   float dr = (!tensor->isSparse) ? 1.0F : tensor->denseRatio;
+   XTensor big(order, dimSize, tensor->dataType, dr, tensor->devID, tensor->mem);
    big.SetTMP();

    /* call _Merge function */
    _Merge(&smalls, &big, whereToMerge);

+   /* tensor connections */
+   XLink::MakeLink(&smalls, &big, SHAPE_MERGE_LIST);
+   XLink::AddParamToHeadInt(&big, whereToMerge);

    /* destroy variables */
    delete[] dimSize;

+   return big;
+}
+
+/*
+merge two tensors into a big tensor (return a XTensor structure)
+>> smallA - the first of the two tensors to merge
+>> smallB - the second of the two tensors to merge
+>> whereToMerge - the merging operation is along with which dimension
+<< return - the big tensor merged by small tensors
+*/
+XTensor Merge(const XTensor &smallA, const XTensor &smallB, int whereToMerge)
+{
+   CheckNTErrors(XTensor::IsIdentical(&smallA, &smallB),
+                 "The two tensors must be of the same size!");
+
+   int order = smallA.order;
+   int * dimSize = new int[order];
+
+   for (int i = 0; i < smallA.order; i++) {
+       if (i != whereToMerge)
+           dimSize[i] = smallA.dimSize[i];
+       else
+           dimSize[i] = smallA.dimSize[whereToMerge] * 2;
+   }
+
+   float dr = (!smallA.isSparse) ? 1.0F : smallA.denseRatio;
+   XTensor big(order, dimSize, smallA.dataType, dr, smallA.devID, smallA.mem);
+   big.SetTMP();
+
+   XList smalls(2);
+   smalls.Add(&smallA);
+   smalls.Add(&smallB);
+
+   /* call _Merge function */
+   _Merge(&smalls, &big, whereToMerge);
+
+   /* tensor connections */
+   XLink::MakeLink(&smalls, &big, SHAPE_MERGE_LIST);
+   XLink::AddParamToHeadInt(&big, whereToMerge);
+
+   /* destroy variables */
+   delete[] dimSize;
...
@@ -29,22 +29,19 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* transform a tensor by merging it along a dimension, e.g., (M, N/3, 3) -> (M, N) */
void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim = -1);

-/*
-transform a tensor by merging it along a dimension (return a XTensor structure).
-make a new tensor to keep the result and return it.
-e.g., (M, N/3, 3) -> (M, N)
-*/
+/* transform a tensor by merging it along a dimension (return a XTensor structure)
+   e.g., (M, N/3, 3) -> (M, N) */
XTensor Merge(const XTensor &s, int whereToMerge, int leadingDim = -1);

/* merge small tensors into a big tensor */
void _Merge(const XList * smalls, XTensor * big, int whereToMerge);

-/*
-merge small tensors into a big tensor (return a XTensor structure).
-make a new tensor to keep the result and return it.
-*/
+/* merge small tensors into a big tensor (return a XTensor structure) */
XTensor Merge(const XList &smalls, int whereToMerge);

+/* merge two tensors into a big tensor (return a XTensor structure) */
+XTensor Merge(const XTensor &smallA, const XTensor &smallB, int whereToMerge);

} // namespace nts(NiuTrans.Tensor)

#endif // __MERGE_H__
\ No newline at end of file
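A minimal usage sketch of the Merge entry points declared above (illustrative only, not part of the commit; tensor names and shapes are assumptions):

/* illustrative sketch, not part of the diff */
XTensor a, b;
InitTensor2D(&a, 2, 4);
InitTensor2D(&b, 2, 4);
a.SetZeroAll();
b.SetZeroAll();

XTensor m1 = Merge(a, b, 1);       /* two-tensor overload: m1 is 2 x 8 */

XList parts(2);
parts.Add(&a);
parts.Add(&b);
XTensor m2 = Merge(parts, 1);      /* list overload: also 2 x 8 */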
@@ -19,10 +19,12 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../../XTensor.h"
-#include "../../XUtility.h"
#include "Split.h"
#include "MakeSplitBlockIndex.h"
+#include "../../XName.h"
+#include "../../XTensor.h"
+#include "../../XUtility.h"
#include "../movement/CopyBlocksOnSite.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
@@ -146,20 +148,25 @@ XTensor Split(const XTensor &s, int whereToSplit, int splitNum)
    int order = s.order + 1;
    int * dimSize = new int[order];

+   dimSize[0] = splitNum;
    for (int i = 0; i < s.order; i++) {
        if (i == whereToSplit)
-           dimSize[i] = s.dimSize[i] / splitNum;
+           dimSize[i+1] = s.dimSize[i] / splitNum;
        else
-           dimSize[i] = s.dimSize[i];
+           dimSize[i+1] = s.dimSize[i];
    }
-   dimSize[-1] = splitNum;

-   XTensor t = NewTensor(order, dimSize, s.dataType, s.denseRatio, s.devID, s.mem);
-   t.SetZeroAll();
+   float dr = (!s.isSparse) ? 1.0F : s.denseRatio;
+   XTensor t(order, dimSize, s.dataType, dr, s.devID, s.mem);
    t.SetTMP();

    /* call _Split function */
    _Split(&s, &t, whereToSplit, splitNum);

+   /* tensor connections */
+   XLink::MakeLink(&s, NULL, &t, SHAPE_SPLIT);
+   XLink::AddParamToHeadInt(&t, whereToSplit);
+   XLink::AddParamToHeadInt(&t, splitNum);

    /* destroy variables */
    delete[] dimSize;
@@ -168,7 +175,7 @@ XTensor Split(const XTensor &s, int whereToSplit, int splitNum)
}
/*
-split a big tensor into small tensors.
+split a big tensor into small tensors
>> big - the source tensor
>> smalls - the list that keeps the resulting tensors (for return)
@@ -274,43 +281,29 @@ void _Split(const XTensor * big, XList * smalls, int whereToSplit, int splitNum)
}
/*
-split a big tensor into small tensors (return a XList structure).
-make a new list to keep the result and return it.
+split a big tensor into small tensors
>> big - the source tensor
+>> smalls - the list that keeps the resulting tensors (for return)
+   NOTE that all the "small" tensors have already been placed in the list in advance.
>> whereToSplit - which dimension of the tensor is to split
>> splitNum - how many splits
-<< return - a list of small tensors by splitting a big tensor
*/
-XList SplitList(const XTensor &big, int whereToSplit, int splitNum)
+void Split(const XTensor &big, XList &smalls, int whereToSplit, int splitNum)
{
-   CheckNTErrors(&big, "Invalid tensors!");
-
-   XList smalls = XList(splitNum);
-
-   int order = big.order;
-   int * dimSize = new int[order];
-
-   for (int i = 0; i < big.order; i++) {
-       if (i != whereToSplit)
-           dimSize[i] = big.dimSize[i];
-       else
-           dimSize[i] = big.dimSize[i] / splitNum;
-   }
-
-   for (int i = 0; i < splitNum; i++) {
-       XTensor tensor = NewTensor(order, dimSize, big.dataType, big.denseRatio, big.devID, big.mem);
-       tensor.SetZeroAll();
-       tensor.SetTMP();
-       smalls.Add(&tensor);
-   }

    /* call _Split function */
    _Split(&big, &smalls, whereToSplit, splitNum);

-   /* destroy variables */
-   delete[] dimSize;
-
-   return smalls;
+   /* tensor connections */
+   for(int i = 0; i < smalls.count; i++){
+       XTensor * s = (XTensor*)smalls.Get(i);
+       XLink::MakeLink(&big, NULL, s, SHAPE_SPLIT_LIST);
+       XLink::AddParamToHeadInt(s, whereToSplit);
+
+       /* it is tricky here that we keep the id of each
+          block, rather than the total number of splits */
+       XLink::AddParamToHeadInt(s, i);
+   }
}
} // namespace nts(NiuTrans.Tensor)
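To close, a minimal usage sketch of the two Split entry points shown above (illustrative only, not part of the commit; shapes are assumptions). The overload that returns a tensor adds a leading dimension of size splitNum; the list-based overload fills small tensors that the caller has already created and added to the list.

/* illustrative sketch, not part of the diff */
XTensor whole;
InitTensor2D(&whole, 2, 6);
whole.SetZeroAll();

XTensor piece = Split(whole, 1, 3);     /* 2 x 6 -> 3 x 2 x 2 (leading dim = splitNum) */

XTensor s0, s1, s2;
InitTensor2D(&s0, 2, 2);
InitTensor2D(&s1, 2, 2);
InitTensor2D(&s2, 2, 2);

XList pieces(3);
pieces.Add(&s0);
pieces.Add(&s1);
pieces.Add(&s2);

Split(whole, pieces, 1, 3);             /* the three 2 x 2 tensors receive the slices */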