Commit 2837e09f by xuchen

merge with xu

parents 087efa88 394e8340
@@ -53,8 +53,8 @@ int main( int argc, const char ** argv )
if(argc > 1 && !strcmp(argv[1], "-test"))
Test();
-else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
-FNNLMMain(argc - 1, argv + 1);
+//else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
+// FNNLMMain(argc - 1, argv + 1);
else{
fprintf(stderr, "Thanks for using NiuTrans.Tensor! This is a library that eases the\n");
fprintf(stderr, "use of tensors. All you need is to ... \n\n");
......
@@ -82,7 +82,7 @@ _XINLINE_ float Float16ToFloat(unsigned short h)
}
/*
-data conversion
+data type conversion
>> devID - device id
>> s - source data array
>> typeS - source data type
@@ -92,7 +92,7 @@ data conversion
*/
void ConvertDataType(int devID, void * s, TENSOR_DATA_TYPE typeS, void * t, TENSOR_DATA_TYPE typeT, int size)
{
-CheckNTErrors((devID < 0), "This code must be run on GPUs!");
+CheckNTErrors((devID < 0), "This code must be run on CPUs!");
if(typeS == typeT)
return;
......
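A minimal usage sketch (not part of the commit), assuming the TENSOR_DATA_TYPE members X_FLOAT and X_FLOAT16 and the unsigned-short storage used by Float16ToFloat above; devID = -1 selects the CPU, which the CheckNTErrors guard requires.

float src[4] = {1.0F, -2.0F, 0.5F, 3.0F};
unsigned short dst[4];                                  /* half-precision values stored as 16-bit words */
ConvertDataType(-1, src, X_FLOAT, dst, X_FLOAT16, 4);   /* convert 4 items from float to float16 on the CPU */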
@@ -229,6 +229,7 @@ void XLink::AddParam(void * param, int size)
paramNum++;
delete[] (char*)ps;
}

/*
create a hyperedge with two input tensors and an output tensor
>> t1 - a tail tensor
@@ -254,7 +255,7 @@ create a hyper edge with a list of tensors and a output tensor
>> h - head tensor
>> id - id of the edge type
*/
-void XLink::MakeLink(XList * list, XTensor * h, int id)
+void XLink::MakeLink(const XList * list, XTensor * h, int id)
{
/* forward */
XLink &income = h->income;
@@ -307,6 +308,43 @@ void XLink::AddParamToHeadInt(XTensor * h, int param)
}
/*
add a MATRIX_TRANS_TYPE parameter
>> h - head
>> param - parameter we want to introduce
*/
void XLink::AddParamToHeadTrans(XTensor * h, MATRIX_TRANS_TYPE param)
{
if(h == NULL)
return;
h->income.AddParam(&param, sizeof(MATRIX_TRANS_TYPE));
}
/*
add a boolean parameter
>> h - head
>> param - parameter we want to introduce
*/
void XLink::AddParamToHeadBool(XTensor * h, bool param)
{
if(h == NULL)
return;
h->income.AddParam(&param, sizeof(bool));
}
/*
add a pointer parameter
>> h - head
>> param - parameter we want to introduce
*/
void XLink::AddParamToHeadPointer(XTensor * h, void * param)
{
if(h == NULL)
return;
h->income.AddParam(&param, sizeof(param));
}
/*
replace a node with another, i.e., we redirect the links to the new node
>> oldOne - the node to be replaced
>> newOne - the new node
......
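A minimal sketch (not part of the commit) of how the new AddParamToHead* helpers are used together with MakeLink when an operation is recorded on the graph; it mirrors the calls made in MatrixMul later in this diff, and a, b and c are assumed to be valid XTensor objects.

XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL);   /* c becomes the head, a and b the tails      */
XLink::AddParamToHeadTrans(&c, X_NOTRANS);     /* record the transpose flags on the head ... */
XLink::AddParamToHeadTrans(&c, X_NOTRANS);
XLink::AddParamToHead(&c, (DTYPE)1.0);         /* ... followed by the scalars alpha and beta */
XLink::AddParamToHead(&c, (DTYPE)0.0);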
@@ -127,7 +127,7 @@ struct XLink
/* create a hyperedge with a list of input tensors and an output tensor */
static
-void MakeLink(XList * list, XTensor * h, int id);
+void MakeLink(const XList * list, XTensor * h, int id);
/* add a parameter */
static
@@ -137,6 +137,18 @@ struct XLink
static
void AddParamToHeadInt(XTensor * h, int param);
/* add a MATRIX_TRANS_TYPE parameter */
static
void AddParamToHeadTrans(XTensor * h, MATRIX_TRANS_TYPE param);
/* add a boolean parameter */
static
void AddParamToHeadBool(XTensor * h, bool param);
/* add a pointer parameter */
static
void AddParamToHeadPointer(XTensor * h, void * param);
/* replace a node with another, i.e., we redirect the links to the new node */
static
void Replace(const XTensor * oldOne, XTensor * newOne);
......
@@ -206,7 +206,7 @@ void XList::Insert(int pos, void * item)
}
/* get the item at position i */
-void * XList::GetItem(int i)
+void * XList::GetItem(int i) const
{
if( i >= 0 && i < count )
return items[i];
......
@@ -74,7 +74,7 @@ public:
void AddList(XList * l);
void AddInt(int i);
void Insert(int pos, void * item);
-void * GetItem(int i);
+void * GetItem(int i) const;
int GetItemInt(int i);
void SetItem(int i, void * item);
void SetItemInt(int i, int item);
......
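A small sketch (not part of the commit) of what the const qualifier buys: a list received through a const pointer, as in the new MakeLink(const XList *, ...), can still be traversed with GetItem; the count member is assumed to be accessible as it is inside XList above.

void DumpItems(const XList * list)
{
    for (int i = 0; i < list->count; i++)
        fprintf(stderr, "item %d lives at %p\n", i, list->GetItem(i));   /* GetItem is const, so this compiles */
}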
@@ -27,12 +27,56 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
const char * GetOPName(int type)
{
if((type & MATH_ARITHMETIC) != 0){
-if(type == MATH_SUM)
-return "M_SUM";
+if(type == MATH_ABSOLUTE)
+return "M_ABSOLUTE";
else if(type == MATH_MATRIXMUL)
return "M_MATRIXMUL";
else if(type == MATH_MATRIXMULBATCHED)
return "M_MATRIXMULBATCHED";
else if(type == MATH_MULTIPLY)
return "M_MULTIPLY";
else if(type == MATH_NEGATE)
return "M_NEGATE";
else if(type == MATH_SIGN)
return "M_SIGN";
else if(type == MATH_SUM)
return "M_SUM";
else if(type == MATH_LOG)
return "M_LOG";
else if(type == MATH_NORMALIZE)
return "M_NORMALIZE";
else if(type == MATH_POWER)
return "M_POWER";
else if(type == MATH_SCALEANDSHIFT)
return "M_SCALEANDSHIFT";
else if(type == GETANDSET_SELECT)
return "G_SELECT";
else if(type == MOVEMENT_COPYINDEXED)
return "M_COPYINDEXED";
else if(type == MOVEMENT_COPYVALUES)
return "M_COPYVALUES";
else if(type == REDUCE_REDUCEMAX)
return "R_REDUCEMAX";
else if(type == REDUCE_REDUCEMEAN)
return "R_REDUCEMEAN";
else if(type == REDUCE_REDUCESUM)
return "R_REDUCESUM";
else if(type == REDUCE_REDUCESUMSQUARED)
return "R_REDUCESUMSQUARED";
else if(type == REDUCE_REDUCEVARIANCE)
return "R_REDUCEVARIANCE";
else if(type == SHAPE_CONCATENATE)
return "S_CONCATENATE";
else if(type == SHAPE_MERGE)
return "S_MERGE";
else if(type == SHAPE_PERMUTE)
return "S_PERMUTE";
else if(type == SHAPE_SPLIT)
return "S_SPLIT";
else if(type == SHAPE_TRANSPOSE)
return "S_TRANSPOSE";
else if(type == SHAPE_UNSQUEEZE)
return "S_UNSQUEEZE";
}
return "NULL";
......
@@ -29,9 +29,40 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_ARITHMETIC 0x00001000
-#define MATH_SUM MATH_ARITHMETIC + 1
-#define MATH_MULTIPLY MATH_SUM + 1
-#define MATH_SCALEANDSHIFT MATH_MULTIPLY + 1
+#define MATH_ABSOLUTE MATH_ARITHMETIC + 1
+#define MATH_MATRIXMUL MATH_ABSOLUTE + 1
+#define MATH_MATRIXMULBATCHED MATH_MATRIXMUL + 1
#define MATH_MULTIPLY MATH_MATRIXMULBATCHED + 1
#define MATH_NEGATE MATH_MULTIPLY + 1
#define MATH_SIGN MATH_NEGATE + 1
#define MATH_SUM MATH_SIGN + 1
#define MATH_LOG MATH_SUM + 1
#define MATH_NORMALIZE MATH_LOG + 1
#define MATH_POWER MATH_NORMALIZE + 1
#define MATH_SCALEANDSHIFT MATH_POWER + 1
#define GETANDSET MATH_SCALEANDSHIFT + 1
#define GETANDSET_SELECT GETANDSET + 1
#define MOVEMENT GETANDSET_SELECT + 1
#define MOVEMENT_COPYINDEXED MOVEMENT + 1
#define MOVEMENT_COPYVALUES MOVEMENT_COPYINDEXED + 1
#define REDUCE MOVEMENT_COPYVALUES + 1
#define REDUCE_REDUCEMAX REDUCE + 1
#define REDUCE_REDUCEMEAN REDUCE_REDUCEMAX + 1
#define REDUCE_REDUCESUM REDUCE_REDUCEMEAN + 1
#define REDUCE_REDUCESUMSQUARED REDUCE_REDUCESUM + 1
#define REDUCE_REDUCEVARIANCE REDUCE_REDUCESUMSQUARED + 1
#define SHAPE REDUCE_REDUCEVARIANCE + 1
#define SHAPE_CONCATENATE SHAPE + 1
#define SHAPE_MERGE SHAPE_CONCATENATE + 1
#define SHAPE_PERMUTE SHAPE_MERGE + 1
#define SHAPE_SPLIT SHAPE_PERMUTE + 1
#define SHAPE_TRANSPOSE SHAPE_SPLIT + 1
#define SHAPE_UNSQUEEZE SHAPE_TRANSPOSE + 1
/* get operator name */
const char * GetOPName(int type);
......
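The ids above are built by repeated + 1 on top of the 0x00001000 base, so every operator id carries the MATH_ARITHMETIC bit and falls into the arithmetic branch of GetOPName (MATH_ABSOLUTE is 0x1001, ..., SHAPE_UNSQUEEZE is 0x101D). Because the macros expand without parentheses, they are best used as plain ids rather than inside arithmetic expressions. A tiny illustration (not part of the commit):

printf("%s\n", GetOPName(MATH_MATRIXMUL));     /* expected to print "M_MATRIXMUL" */
printf("%s\n", GetOPName(REDUCE_REDUCESUM));   /* expected to print "R_REDUCESUM" */
printf("%s\n", GetOPName(SHAPE_UNSQUEEZE));    /* expected to print "S_UNSQUEEZE" */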
@@ -173,7 +173,7 @@ XTensor::XTensor(const XTensor &reference)
devID = reference.devID;
mem = reference.mem;
InitTensor(this, &reference);
-CopyValues(&reference, this);
+_CopyValues(&reference, this);
}
if(reference.isTmp)
@@ -300,7 +300,7 @@ XTensor& XTensor::operator= (const XTensor& tensor)
}
Resize(tensor.order, tensor.dimSize, tensor.dataType, tensor.denseRatio);
-CopyValues(&tensor, this);
+_CopyValues(&tensor, this);
}
/* copy member variables */
@@ -345,7 +345,7 @@ judge whether the two matrices are in the same type and size
>> b - another tensor to compare with
<< return - whether the two input tensors are identical
*/
-bool XTensor::IsIdentical(XTensor * a, XTensor * b)
+bool XTensor::IsIdentical(const XTensor * a, const XTensor * b)
{
if(a->order != b->order)
return false;
@@ -427,7 +427,7 @@ void XTensor::Reshape(const int myOrder, const int * myDimSize)
}
/* get the number of items in the data array */
-int XTensor::GetSize()
+int XTensor::GetSize() const
{
if(isSparse)
return unitNumNonZero;
@@ -743,7 +743,7 @@ get the pointer to a cell
>> size - size of index
<< return - pointer to the cell
*/
-void * XTensor::GetCell(int index[], int size)
+void * XTensor::GetCell(int index[], int size) const
{
CheckNTErrors((size == order), "Illegal index!");
@@ -795,7 +795,7 @@ get the value of a cell in a 2d tensor in default type
>> mi - column index
<< return - value of cell(ni, mi) in float
*/
-DTYPE XTensor::Get2D(int ni, int mi)
+DTYPE XTensor::Get2D(int ni, int mi) const
{
CheckNTErrors((order == 2), "Cannot get a 2d cell for a tensor whose order is not 2!");
CheckNTErrors((ni >= 0 && ni < dimSize[0]), "dimension 0 is out of range!");
@@ -1243,7 +1243,7 @@ binary search to find an element in a sparse tensor
it is the previous one if there is no hit
<< return - find it or not?
*/
-bool XTensor::BinarySearch(int key, DTYPE &value, void * &position)
+bool XTensor::BinarySearch(int key, DTYPE &value, void * &position) const
{
CheckNTErrors((isSparse), "A sparse tensor is required!");
CheckNTErrors((dataType == DEFAULT_DTYPE), "The tensor is not in the default type.");
......
@@ -201,7 +201,7 @@ public:
/* judge whether the two matrices are in the same type and size */
static
-bool IsIdentical(XTensor * a, XTensor * b);
+bool IsIdentical(const XTensor * a, const XTensor * b);
/* judge whether the three matrices are in the same type and size */
static
@@ -217,7 +217,7 @@ public:
void Reshape(const int order, const int * myDimSize);
/* get the number of items in the data array */
-int GetSize();
+int GetSize() const;
/* get size of the memory used */
int GetDataSizeInChar();
@@ -253,13 +253,13 @@ public:
DTYPE Get(int index[], int size = -1);
/* get the pointer to a cell */
-void * GetCell(int index[], int size = -1);
+void * GetCell(int index[], int size = -1) const;
/* get the default type value of a cell in a 1d tensor */
DTYPE Get1D(int i);
/* get the default type value of a cell in a 2d tensor */
-DTYPE Get2D(int ni, int mi);
+DTYPE Get2D(int ni, int mi) const;
/* get the default type value of a cell in a 3d tensor */
DTYPE Get3D(int d0, int d1, int d2);
@@ -314,7 +314,7 @@ public:
bool Resize(const XTensor * myTensor);
/* binary search to find an element in a sparse matrix */
-bool BinarySearch(int key, DTYPE &value, void * &position);
+bool BinarySearch(int key, DTYPE &value, void * &position) const;
/* dump data to a file */
void Dump(FILE * file, const char * label = NULL, const int n = -1, const int verbose = 0);
......
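A brief illustration (not part of the commit) of why the added const qualifiers matter: read-only accessors can now be called on a tensor passed by const reference; t is assumed to be an initialized 2-d tensor of the default float type.

void PrintCorner(const XTensor &t)
{
    /* GetSize and Get2D are const-qualified after this change, so both calls compile here */
    fprintf(stderr, "%d items, t(0,0) = %f\n", t.GetSize(), t.Get2D(0, 0));
}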
@@ -19,6 +19,7 @@
* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
*/
#include <math.h>
#include "../../XTensor.h"
#include "Absolute.h"
#include "Absolute.cuh"
@@ -29,12 +30,12 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
set every entry to its absolute value
>> a - the tensor we are processing
*/
-void Absolute(XTensor * a)
+void _Absolute(XTensor * a)
{
#ifdef USE_CUDA
/* run it on GPUs */
if (a->devID >= 0) {
-CudaAbsolute(a);
+_CudaAbsolute(a);
return;
}
#endif
......
@@ -58,7 +58,7 @@ set each entry to its with float16 data type value
>> a - the tensor
*/
extern "C"
-void CudaAbsolute(XTensor * a)
+void _CudaAbsolute(XTensor * a)
{
CheckNTErrors((a->isSparse == false), "TODO!");
......
@@ -35,7 +35,7 @@ void KernelAbsolute(__half * d, int size);
/* set each entry to its absolute value */
extern "C"
-void CudaAbsolute(XTensor * a);
+void _CudaAbsolute(XTensor * a);
#endif // USE_CUDA
......
@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* set every entry to its absolute value */
extern "C"
-void Absolute(XTensor * a);
+void _Absolute(XTensor * a);
} // namespace nts(NiuTrans.Tensor)
......
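A short sketch (not part of the commit) of the renamed in-place routine, mirroring the NewTensor call used later in this diff; the exact construction arguments are an assumption. _Absolute overwrites the tensor with absolute values, dispatching to _CudaAbsolute when the tensor lives on a GPU and to the CPU loop otherwise.

int dims[2] = {2, 3};
XTensor a = NewTensor(2, dims, X_FLOAT, 1.0F, -1, NULL);   /* a dense 2 x 3 tensor on the CPU (devID = -1) */
a.SetZeroAll();
_Absolute(&a);                                             /* every entry becomes its absolute value, in place */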
@@ -37,8 +37,8 @@ c_i = trans(a_i) * trans(b_i) * \alpha + c_i * \beta for each i in [0,count-1]
>> alpha - scalar
>> beta - scalar
*/
-void MatrixMULBatchedCPU(XList * a, MATRIX_TRANS_TYPE transposedA,
-XList * b, MATRIX_TRANS_TYPE transposedB,
+void _MatrixMULBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA,
+const XList * b, MATRIX_TRANS_TYPE transposedB,
XList * c, DTYPE alpha, DTYPE beta)
{
CheckNTErrors((a && b && c), "Empty input lists!");
@@ -73,11 +73,11 @@ void MatrixMULBatchedCPU(XList * a, MATRIX_TRANS_TYPE transposedA,
CheckNTErrors((ci->order == 2), "2d tensor (i.e., matrix) is required!");
#ifdef USE_BLAS
if (useBLAS)
-MatrixMULCPU(ai, transposedA, bi, transposedB, ci, alpha, beta);
+_MatrixMULCPU(ai, transposedA, bi, transposedB, ci, alpha, beta);
else
-MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
+_MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
#else
-MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
+_MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
#endif
}
//}
......
@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* matrix multiplication in batch mode (CPU code) */
extern "C"
-void MatrixMULBatchedCPU(XList * a, MATRIX_TRANS_TYPE transposedA, XList * b, MATRIX_TRANS_TYPE transposedB, XList * c,
+void _MatrixMULBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA, const XList * b, MATRIX_TRANS_TYPE transposedB, XList * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
} // namespace nts(NiuTrans.Tensor)
......
@@ -30,34 +30,34 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
matrix multiplication

For the input tensors a and b, we perform matrix multiplication on the first two dimensions.
E.g., let A be a tensor of size y * z * m and B be a tensor of size x * y * n.
For A * B, we go over each order-2 tensor of A (of size x * y) and each order-2 tensor B (of size z * x),
like this c_{i,j} = trans(ai) * trans(bj) * alpha + c_{i,j} * beta
where trans() returns the transposed matrix if the flag is fired, ai is the i-th element tensor of A,
bj is the j-th element tensor of B, and c_{i,j} is the (i,j) element tensor of the result C.
C should be a tensor of z * x * n * m.
Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x * y.

>> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b
>> transposedB - indicates whether the matrices in b are transposed
>> c - where we keep a*b
>> alpha - a coefficient
>> beta - another coefficient
>> parallelRunner - parallel processing module
*/
-void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA,
-XTensor * b, MATRIX_TRANS_TYPE transposedB,
-XTensor * c, DTYPE alpha, DTYPE beta,
-XPRunner * parallelRunner)
+void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
+const XTensor * b, MATRIX_TRANS_TYPE transposedB,
+XTensor * c, DTYPE alpha, DTYPE beta, XPRunner * parallelRunner)
{
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((a->dataType == b->dataType && a->dataType == c->dataType),
"Input tensors should have the same data type!");
CheckNTErrors((a->order >= 2 && b->order >= 2 && c->order >= 2),
-"Input tensors must have a order > 2!");
+"Input tensors must have an order >= 2!");
int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
@@ -132,7 +132,7 @@ void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA,
XTensor * ai = (XTensor*)aList->GetItem(i);
XTensor * bi = (XTensor*)bList->GetItem(i);
XTensor * ci = (XTensor*)cList->GetItem(i);
-MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta, parallelRunner);
+_MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta, parallelRunner);
}
}
else if (a->devID >= 0 && b->devID >= 0 && c->devID >= 0) {
@@ -144,7 +144,7 @@ void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA,
ProtectCudaDev(a->devID, devIDBackup);
cublasHandle_t * handle = a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID);
-CudaBLASMatrixMULList(handle,
+_CudaBLASMatrixMULList(handle,
aList, transposedA,
bList, transposedB,
cList, aList->count,
@@ -157,7 +157,7 @@ void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA,
}
else {
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
-MatrixMULBatchedCPU(aList, transposedA,
+_MatrixMULBatchedCPU(aList, transposedA,
bList, transposedB,
cList, alpha, beta);
}
@@ -184,4 +184,74 @@ void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA,
delete bList;
delete cList;
}
/*
matrix multiplication (return a XTensor structure)
make a new tensor to keep the result and return it
For the input tensors a and b, we perform matrix multiplication on the first two dimensions.
E.g., let A be a tensor of size y * z * m and B be a tensor of size x * y * n.
For A * B, we go over each order-2 tensor of A (of size x * y) and each order-2 tensor B (of size z * x),
like this c_{i,j} = trans(ai) * trans(bj) * alpha + c_{i,j} * beta
where trans() returns the transposed matrix if the flag is fired, ai is the i-th element tensor of A,
bj is the j-th element tensor of B, and c_{i,j} is the (i,j) element tensor of the result C.
The result C should be a tensor of z * x * n * m.
Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x * y.
>> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b
>> transposedB - indicates whether the matrices in b are transposed
>> alpha - a coefficient
>> beta - another coefficient
>> parallelRunner - parallel processing module
<< return - the result of matrix multiplication
*/
XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
DTYPE alpha, DTYPE beta, XPRunner * parallelRunner)
{
CheckNTErrors((&a && &b), "Empty input tensors!");
CheckNTErrors((a.dataType == b.dataType), "Input tensors should have the same data type!");
CheckNTErrors((a.order >= 2 && b.order >= 2), "Input tensors must have an order >= 2!");
int an = transposedA == X_TRANS ? a.dimSizeRDI[0] : a.dimSizeRDI[1];
int am = transposedA == X_TRANS ? a.dimSizeRDI[1] : a.dimSizeRDI[0];
int bn = transposedB == X_TRANS ? b.dimSizeRDI[0] : b.dimSizeRDI[1];
int bm = transposedB == X_TRANS ? b.dimSizeRDI[1] : b.dimSizeRDI[0];
CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
int order = a.order + b.order - 2;
int sub = 0;
int * dimSize = new int[order];
for (int i = 2; i < a.order; i++)
dimSize[sub++] = a.dimSizeRDI[i];
for (int i = 2; i < b.order; i++)
dimSize[sub++] = b.dimSizeRDI[i];
dimSize[sub++] = an;
dimSize[sub++] = bm;
XTensor c = NewTensor(order, dimSize, a.dataType, a.denseRatio, a.devID, a.mem);
c.SetZeroAll();
c.SetTMP();
/* call _MatrixMul function */
_MatrixMul(&a, transposedA, &b, transposedB, &c, alpha, beta, parallelRunner);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL);
XLink::AddParamToHeadTrans(&c, transposedA);
XLink::AddParamToHeadTrans(&c, transposedB);
XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHead(&c, beta);
/* destroy variables */
delete[] dimSize;
return c;
}
} // namespace nts(NiuTrans.Tensor)
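A usage sketch (not part of the commit) for the simplest case covered by the comment above; x and w are assumed to be initialized 2 x 3 and 3 x 4 tensors, so the call reduces to ordinary matrix multiplication.

XTensor y = MatrixMul(x, X_NOTRANS, w, X_NOTRANS);   /* y is a 2 x 4 tensor; alpha defaults to 1 and beta to 0 */
/* y is marked as temporary (SetTMP) and carries a MATH_MATRIXMUL link back to x and w */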
@@ -27,20 +27,36 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
matrix multiplication

For the input tensors a and b, we perform matrix multiplication on the first two dimensions.
E.g., let A be a tensor of size y * z * m and B be a tensor of size x * y * n.
For A * B, we go over each order-2 tensor of A (of size x * y) and each order-2 tensor B (of size z * x),
like this c_{i,j} = trans(ai) * trans(bj) * alpha + c_{i,j} * beta
where trans() returns the transposed matrix if the flag is fired, ai is the i-th element tensor of A,
bj is the j-th element tensor of B, and c_{i,j} is the (i,j) element tensor of the result C.
C should be a tensor of z * x * n * m.
Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x * y.
*/
-extern "C"
-void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
+void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
/*
matrix multiplication (return a XTensor structure)
make a new tensor c to keep the result and return it
For the input tensors a and b, we perform matrix multiplication on the first two dimensions.
E.g., let A be a tensor of size y * z * m and B be a tensor of size x * y * n.
For A * B, we go over each order-2 tensor of A (of size x * y) and each order-2 tensor B (of size z * x),
like this c_{i,j} = trans(ai) * trans(bj) * alpha + c_{i,j} * beta
where trans() returns the transposed matrix if the flag is fired, ai is the i-th element tensor of A,
bj is the j-th element tensor of B, and c_{i,j} is the (i,j) element tensor of the result C.
C should be a tensor of z * x * n * m.
Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x * y.
*/
XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
} // namespace nts(NiuTrans.Tensor)
#endif // __MATRIXMUL_H__
\ No newline at end of file
@@ -30,8 +30,10 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
matrix multiplication (for 2d tensors)

c = trans(a) * trans(b) * alpha + c * beta
where trans() return the transposed matrix if the flag is fired

>> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b
@@ -42,8 +44,8 @@ where trans() return the transposed matrix if the flag is fired
>> parallelRunner - parallel processing module
>> stream - the string for creating the job pipeline
*/
-void MatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA,
-XTensor * b, MATRIX_TRANS_TYPE transposedB,
+void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
+const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta,
XPRunner * parallelRunner, XStream * stream)
{
@@ -67,7 +69,7 @@ void MatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA,
#ifdef USE_CUDA
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
-CudaMatrixMul2D(a, transposedA, b, transposedB, c, alpha, beta, stream);
+_CudaMatrixMul2D(a, transposedA, b, transposedB, c, alpha, beta, stream);
return;
}
#endif
@@ -81,9 +83,9 @@ void MatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA,
c->dataType == DEFAULT_DTYPE)
{
if (useBLAS)
-MatrixMULCPU(a, transposedA, b, transposedB, c, alpha, beta);
+_MatrixMULCPU(a, transposedA, b, transposedB, c, alpha, beta);
else
-MatrixMul2DParallel(a, transposedA, b, transposedB, c, alpha, beta, parallelRunner);
+_MatrixMul2DParallel(a, transposedA, b, transposedB, c, alpha, beta, parallelRunner);
}
else {
// TODO!!
......
@@ -108,8 +108,10 @@ void KernelMatrixMulDenseMSparseMV2(DTYPE * a, MATRIX_TRANS_TYPE transposedA, in
/*
matrix multiplication (for 2d tensors) (cuda version)

c = trans(a) * trans(b) * alpha + c * beta
where trans() return the transposed matrix if the flag is fired

>> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b
@@ -119,8 +121,8 @@ where trans() return the transposed matrix if the flag is fired
>> beta - another coefficient
>> stream - the string for creating the job pipeline
*/
-void CudaMatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA,
-XTensor * b, MATRIX_TRANS_TYPE transposedB,
+void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
+const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c,
DTYPE alpha, DTYPE beta, XStream * stream)
{
@@ -156,7 +158,7 @@ void CudaMatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA,
cublasSetStream(*handle, stream->stream);
if (a->dataType == X_FLOAT && b->dataType == X_FLOAT && c->dataType == X_FLOAT) {
-CudaBLASMatrixMUL(handle, a->data, transposedA, a->dataType, b->data, transposedB, a->dataType, c->data, c->dataType,
+_CudaBLASMatrixMUL(handle, a->data, transposedA, a->dataType, b->data, transposedB, a->dataType, c->data, c->dataType,
a->dimSize[0], a->dimSize[1], b->dimSize[0], b->dimSize[1], c->dimSize[0], c->dimSize[1],
alpha, beta);
}
......
@@ -43,7 +43,7 @@ c = trans(a) * trans(b) * alpha + c * beta
where trans() return the transposed matrix if the flag is fired
*/
extern "C"
-void CudaMatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
+void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XStream * stream = NULL);
#endif // USE_CUDA
......
@@ -31,9 +31,8 @@ matrix multiplication (for 2d tensors)
c = trans(a) * trans(b) * alpha + c * beta
where trans() return the transposed matrix if the flag is fired
*/
-extern "C"
-void MatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
+void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL, XStream * stream = NULL);
} // namespace nts(NiuTrans.Tensor)
......
@@ -38,7 +38,7 @@ argument5: matrix a
argument6: matrix b
argument7: matrix c (c=a*b*\alpha + c*beta)
*/
-void MatrixMul2DMultiTheading(XList * args)
+void _MatrixMul2DMultiTheading(XList * args)
{
int x1 = *(int*)args->GetItem(0);
int y1 = *(int*)args->GetItem(1);
......
@@ -31,7 +31,7 @@ matrix multiplication for a block (x1,y1) - (x2,y2)
where (x1,y1) is the upper-left corner and (x2,y2) is the bottom-right corner
*/
extern "C"
-void MatrixMul2DMultiTheading(XList * args);
+void _MatrixMul2DMultiTheading(XList * args);
} // namespace nts(NiuTrans.Tensor)
......
@@ -30,6 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
matrix multiplication (for 2d tensors) with multi-threading
c = trans(a) * trans(b) * alpha + c * beta
where trans() return the transposed matrix if the flag is fired

>> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b
@@ -39,10 +40,9 @@ where trans() return the transposed matrix if the flag is fired
>> beta - another coefficient
>> parallelRunner - parallel processing module
*/
-void MatrixMul2DParallel(XTensor * a, MATRIX_TRANS_TYPE transposedA,
-XTensor * b, MATRIX_TRANS_TYPE transposedB,
-XTensor * c, DTYPE alpha, DTYPE beta,
-XPRunner * parallelRunner)
+void _MatrixMul2DParallel(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
+const XTensor * b, MATRIX_TRANS_TYPE transposedB,
+XTensor * c, DTYPE alpha, DTYPE beta, XPRunner * parallelRunner)
{
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((a->order == 2 && b->order == 2 && c->order == 2),
@@ -56,7 +56,7 @@ void MatrixMul2DParallel(XTensor * a, MATRIX_TRANS_TYPE transposedA,
/* a * b */
if (transposedA == X_NOTRANS && transposedB == X_NOTRANS) {
-RunParallel2D(parallelRunner, (void*)MatrixMul2DMultiTheading, an * am * bm,
+RunParallel2D(parallelRunner, (void*)_MatrixMul2DMultiTheading, an * am * bm,
cn, cm, 5,
a, b, c, &alpha, &beta);
}
......
@@ -27,12 +27,12 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
matrix multiplication (for 2d tensors) with multi-threading.
c = trans(a) * trans(b) * alpha + c * beta
where trans() return the transposed matrix if the flag is fired.
*/
extern "C"
-void MatrixMul2DParallel(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
+void _MatrixMul2DParallel(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
} // namespace nts(NiuTrans.Tensor)
......
@@ -30,10 +30,12 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
matrix multiplication of the two tensors

for each 2-dimensional data array in a (denoted as ai) and
each 2-dimensional data array in b (denoted as bi), we have
ci = trans(ai) * trans(bi) * alpha + cm * beta
where trans() returns the transposed matrix if the flag is fired

>> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b
@@ -43,8 +45,8 @@ where trans() returns the transposed matrix if the flag is fired
>> beta - another coefficient
>> parallelRunner - parallel processing module
*/
-void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
-XTensor * b, MATRIX_TRANS_TYPE transposedB,
+void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
+const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta,
XPRunner * parallelRunner)
{
@@ -52,7 +54,9 @@ void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
CheckNTErrors((a->dataType == b->dataType && a->dataType == c->dataType),
"Input tensors should have the same data type!");
CheckNTErrors((a->order >= 2 && b->order >= 2 && c->order >= 2),
-"Input tensors must have a order > 2!");
+"Input tensors must have an order >= 2!");
CheckNTErrors((a->order == b->order && a->order == c->order),
"Input tensor and output tensor must have same order!");
int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
@@ -109,7 +113,7 @@ void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
ProtectCudaDev(a->devID, devIDBackup);
cublasHandle_t * handle = a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID);
-CudaBLASMatrixMULList(handle,
+_CudaBLASMatrixMULList(handle,
aList, transposedA,
bList, transposedB,
cList, aList->count,
@@ -122,7 +126,7 @@ void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
}
else {
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
-MatrixMULBatchedCPU(aList, transposedA,
+_MatrixMULBatchedCPU(aList, transposedA,
bList, transposedB,
cList, alpha, beta);
}
@@ -150,4 +154,65 @@ void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
delete cList;
}
/*
matrix multiplication of the two tensors (return a XTensor structure)
make a new tensor to keep the result and return it
for each 2-dimensional data array in a (denoted as ai) and
each 2-dimensional data array in b (denoted as bi), we have
ci = trans(ai) * trans(bi) * alpha + cm * beta
where trans() returns the transposed matrix if the flag is fired.
>> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b
>> transposedB - indicates whether the matrices in b are transposed
>> alpha - a coefficient
>> beta - another coefficient
>> parallelRunner - parallel processing module
<< return - the result of matrix multiplication of the two tensors
*/
XTensor MatrixMulBatched(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
DTYPE alpha, DTYPE beta, XPRunner * parallelRunner)
{
CheckNTErrors((&a && &b), "Empty input tensors!");
CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
CheckNTErrors((a.order >= 2 && b.order >= 2), "Input tensors must have an order >= 2!");
CheckNTErrors(a.order == b.order, "Input tensor and output tensor must have same order!");
int an = transposedA == X_TRANS ? a.dimSizeRDI[0] : a.dimSizeRDI[1];
int am = transposedA == X_TRANS ? a.dimSizeRDI[1] : a.dimSizeRDI[0];
int bn = transposedB == X_TRANS ? b.dimSizeRDI[0] : b.dimSizeRDI[1];
int bm = transposedB == X_TRANS ? b.dimSizeRDI[1] : b.dimSizeRDI[0];
CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
int order = a.order;
int sub = 0;
int * dimSize = new int[order];
for (int i = 2; i < a.order; i++)
dimSize[sub++] = a.dimSizeRDI[i];
dimSize[sub++] = an;
dimSize[sub++] = bm;
XTensor c = NewTensor(order, dimSize, a.dataType, a.denseRatio, a.devID, a.mem);
c.SetZeroAll();
c.SetTMP();
/* call _MatrixMulBatched function */
_MatrixMulBatched(&a, transposedA, &b, transposedB, &c, alpha, beta, parallelRunner);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MATRIXMULBATCHED);
XLink::AddParamToHeadTrans(&c, transposedA);
XLink::AddParamToHeadTrans(&c, transposedB);
XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHead(&c, beta);
/* destroy variables */
delete[] dimSize;
return c;
}
} // namespace nts(NiuTrans.Tensor)
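A usage sketch (not part of the commit): unlike MatrixMul, the batched variant requires a and b to have the same order and keeps the batch dimensions; a and b are assumed to be initialized order-3 tensors whose slices are 2 x 3 and 3 x 4 matrices respectively.

XTensor c = MatrixMulBatched(a, X_NOTRANS, b, X_NOTRANS);   /* each 2 x 3 slice of a multiplies the matching 3 x 4 slice of b */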
@@ -28,13 +28,25 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
matrix multiplication of the two tensors
for each 2-dimensional data array in a (denoted as ai) and
each 2-dimensional data array in b (denoted as bi), we have
ci = trans(ai) * trans(bi) * alpha + cm * beta
where trans() returns the transposed matrix if the flag is fired
*/
void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
/*
matrix multiplication of the two tensors (return a XTensor structure)
make a new tensor to keep the result and return it
for each 2-dimensional data array in a (denoted as ai) and
each 2-dimensional data array in b (denoted as bi), we have
ci = trans(ai) * trans(bi) * alpha + cm * beta
where trans() returns the transposed matrix if the flag is fired
*/
-extern "C"
-void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
+XTensor MatrixMulBatched(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
} // namespace nts(NiuTrans.Tensor)
......
@@ -28,14 +28,15 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
element-wise product of two tensors
c(i) = a(i)*b(i) + \alpha * c(i)
where i is the index of the item

>> a - matrix a
>> b - matrix b
>> c - result matrix
>> alpha - the coefficient
>> leadingDim - the dimension along which we perform broadcasting
*/
void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
{
@@ -121,9 +122,12 @@ void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, i
}
/*
-element-wise product of two tensors and keep the result in the input
+element-wise product of two tensors (do it on site)
keep the result in the input tensor a and return nothing
a(i) = a(i)*b(i) + \alpha * a(i)
where i is the index of the item

>> a - tensor a (where we keep the result)
>> b - tensor b
>> alpha - the coefficient
@@ -135,9 +139,12 @@ void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha, int leadingDim)
}
/*
-make a tensor of the element-wise product for two input tensors:
+element-wise product of two tensors (return a XTensor structure)
make a new tensor c to keep the result and return it
c(i) = a(i)*b(i) + \alpha * c(i)
where i is the index of the item

>> a - tensor a
>> b - tensor b
>> alpha - the coefficient
@@ -151,7 +158,7 @@ XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim
XTensor c(&a);
c.SetTMP();
-/* computation */
+/* call _Multiply function */
_Multiply(&a, &b, &c, alpha, leadingDim);
/* tensor connections */
......
@@ -26,19 +26,27 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
element-wise product of two tensors:
c(i) = a(i)*b(i) + \alpha * c(i)
where i is the index of the element
*/
void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha = 0, int leadingDim = 0);
/*
element-wise product of two tensors (do it on site)
keep the result in the input tensor a and return nothing
a(i) = a(i)*b(i) + \alpha * a(i)
where i is the index of the element
*/
void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha = 0, int leadingDim = 0);
/*
element-wise product of two tensors (return a XTensor structure)
make a new tensor to keep the result and return it
c(i) = a(i)*b(i) + \alpha * c(i)
where i is the index of the element
*/
XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha = 0, int leadingDim = 0);
} // namespace nts(NiuTrans.Tensor)
......
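A short numerical illustration (not part of the commit) of the formula above with the default alpha = 0; a and b are assumed to be initialized 1-d tensors holding (2, 3) and (4, 5).

XTensor c = Multiply(a, b);   /* c(i) = a(i) * b(i), so c holds (8, 15); a nonzero alpha would add alpha * c(i) on top */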
@@ -29,12 +29,12 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
set every entry to its minus value
>> a - the tensor we are processing
*/
-void Negate(XTensor * a)
+void _Negate(XTensor * a)
{
#ifdef USE_CUDA
/* run it on GPUs */
if (a->devID >= 0) {
-CudaNegate(a);
+_CudaNegate(a);
return;
}
#endif
......
@@ -66,7 +66,7 @@ set each entry to its negtive value
>> a - the tensor
*/
extern "C"
-void CudaNegate(XTensor * a)
+void _CudaNegate(XTensor * a)
{
CheckNTErrors((a->isSparse == false), "TODO!");
......
@@ -19,6 +19,9 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#ifndef __NEGATE_CUH__
#define __NEGATE_CUH__
#include "Negate.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
@@ -35,8 +38,10 @@ void KernelNegate(__half * d, int size);
/* set each entry to its negtive value */
extern "C"
-void CudaNegate(XTensor * a);
+void _CudaNegate(XTensor * a);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
#endif // __NEGATE_CUH__
\ No newline at end of file
@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* set every entry to its minus value */
extern "C"
-void Negate(XTensor * a);
+void _Negate(XTensor * a);
} // namespace nts(NiuTrans.Tensor)
......
@@ -29,12 +29,12 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
set every entry to its sign value
>> a - the tensor we are processing
*/
-void Sign(XTensor * a)
+void _Sign(XTensor * a)
{
#ifdef USE_CUDA
/* run it on GPUs */
if (a->devID >= 0) {
-CudaSign(a);
+_CudaSign(a);
return;
}
#endif
......
@@ -64,7 +64,7 @@ set each entry to its with float16 data type value
>> a - the tensor
*/
extern "C"
-void CudaSign(XTensor * a)
+void _CudaSign(XTensor * a)
{
CheckNTErrors((a->isSparse == false), "TODO!");
......
@@ -19,6 +19,9 @@
* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
*/
#ifndef __SIGN_CUH__
#define __SIGN_CUH__
#include "Sign.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
@@ -35,8 +38,10 @@ void KernelSign(__half * d, int size);
/* set each entry to its sign value */
extern "C"
-void CudaSign(XTensor * a);
+void _CudaSign(XTensor * a);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
#endif // __SIGN_CUH__
\ No newline at end of file
...@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* set every entry to its sign value */ /* set every entry to its sign value */
extern "C" extern "C"
void Sign(XTensor * a); void _Sign(XTensor * a);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
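As a quick illustration of the renamed in-place unary operations above, a minimal sketch, assuming the X_FLOAT type constant and the five-argument NewTensor form used elsewhere in this diff (the sizes and random initialization are only for illustration and are not part of the commit):

    int dims[2] = {2, 3};
    XTensor * t = NewTensor(2, dims, X_FLOAT, 1.0F, -1);   /* a dense 2 x 3 tensor on the CPU */
    SetDataRand(t, -1.0F, 1.0F);
    _Negate(t);   /* every entry becomes its minus value, in place */
    _Sign(t);     /* every entry becomes -1, 0 or +1, in place */
    delete t;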
...@@ -29,7 +29,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -29,7 +29,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
tensor summation c = a + b * \beta tensor summation c = a + b * \beta
return a pointer
>> a - a tensor >> a - a tensor
>> b - another tensor >> b - another tensor
>> c - where we put a+b*\beta. we save it in a if c is NULL >> c - where we put a+b*\beta. we save it in a if c is NULL
...@@ -112,8 +112,9 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta) ...@@ -112,8 +112,9 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
} }
/* /*
tensor summation a = a + b * \beta tensor summation a = a + b * \beta (do it on site)
do it on site keep the result in the tensor a and return nothing
>> a - a tensor >> a - a tensor
>> b - another tensor >> b - another tensor
>> beta - the scaling factor >> beta - the scaling factor
...@@ -124,18 +125,20 @@ void _SumMe(XTensor * a, const XTensor * b, DTYPE beta) ...@@ -124,18 +125,20 @@ void _SumMe(XTensor * a, const XTensor * b, DTYPE beta)
} }
/* /*
tensor summation a = a + b * \beta tensor summation c = a + b * \beta (return a XTensor structure)
return a XTensor structure make a new tensor c to keep the result and return it
>> a - a tensor >> a - a tensor
>> b - another tensor >> b - another tensor
>> beta - the scaling factor >> beta - the scaling factor
<< return - the result of tensor summation
*/ */
XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta) XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta)
{ {
XTensor c(&a); XTensor c(&a);
c.SetTMP(); c.SetTMP();
/* computation */ /* call _Sum function */
_Sum(&a, &b, &c, beta); _Sum(&a, &b, &c, beta);
/* tensor connections */ /* tensor connections */
......
...@@ -29,10 +29,16 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -29,10 +29,16 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* tensor summation c = a + b * \beta */ /* tensor summation c = a + b * \beta */
void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0); void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
/* tensor summation a = a + b * \beta (return a pointer) */ /*
tensor summation a = a + b * \beta
keep the result in the input tensor a and return nothing
*/
void _SumMe(XTensor * a, const XTensor * b, DTYPE beta = (DTYPE)1.0); void _SumMe(XTensor * a, const XTensor * b, DTYPE beta = (DTYPE)1.0);
/* tensor summation c = a + b * \beta (return a structure) */ /*
tensor summation c = a + b * \beta
make a new tensor c to keep the result and return it
*/
XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0); XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
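The three summation interfaces follow the naming convention used throughout this commit: a leading underscore for the pointer-based version that writes into a caller-provided tensor, a "Me" suffix for the in-place version, and an underscore-free function that returns a new XTensor. A minimal usage sketch, again assuming X_FLOAT and illustrative sizes:

    int dims[2] = {2, 3};
    XTensor * a = NewTensor(2, dims, X_FLOAT, 1.0F, -1);
    XTensor * b = NewTensor(2, dims, X_FLOAT, 1.0F, -1);
    XTensor * c = NewTensor(2, dims, X_FLOAT, 1.0F, -1);
    SetDataRand(a, -1.0F, 1.0F);
    SetDataRand(b, -1.0F, 1.0F);
    _Sum(a, b, c, (DTYPE)0.5);   /* c = a + b * 0.5, written into the caller-provided c */
    _SumMe(a, b);                /* a = a + b, kept in a, nothing returned */
    XTensor d = Sum(*a, *b);     /* a new tensor d = a + b, with the tensor connection recorded */
    delete a; delete b; delete c;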
...@@ -37,11 +37,8 @@ where b is a vector. ...@@ -37,11 +37,8 @@ where b is a vector.
>> c - where we put a+b. we save it in a if c is NULL >> c - where we put a+b. we save it in a if c is NULL
>> beta - the scaling factor >> beta - the scaling factor
*/ */
void SumByColumnTV(XTensor * a, XTensor * b, XTensor * c, DTYPE beta) void _SumByColumnTV(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{ {
if (c == NULL)
c = a;
CheckNTErrors((a && b && c), "Empty input tensors!"); CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((XTensor::IsIdentical(a, c)), "Unmatched tensors in addition!"); CheckNTErrors((XTensor::IsIdentical(a, c)), "Unmatched tensors in addition!");
CheckNTErrors((b->order == 2 && b->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]), CheckNTErrors((b->order == 2 && b->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]),
...@@ -56,7 +53,7 @@ void SumByColumnTV(XTensor * a, XTensor * b, XTensor * c, DTYPE beta) ...@@ -56,7 +53,7 @@ void SumByColumnTV(XTensor * a, XTensor * b, XTensor * c, DTYPE beta)
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) { if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
#ifdef USE_CUDA #ifdef USE_CUDA
CudaSumByColumnTV(a, b, c, beta); _CudaSumByColumnTV(a, b, c, beta);
#endif #endif
} }
else { else {
......
...@@ -64,11 +64,8 @@ where b is a vector. ...@@ -64,11 +64,8 @@ where b is a vector.
>> c - where we put a+b. we save it in a if c is NULL >> c - where we put a+b. we save it in a if c is NULL
>> beta - the scaling factor >> beta - the scaling factor
*/ */
void CudaSumByColumnTV(XTensor * a, XTensor * b, XTensor * c, DTYPE beta) void _CudaSumByColumnTV(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{ {
if (c == NULL)
c = a;
CheckNTErrors((a && b && c), "Empty input tensors!"); CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((XTensor::IsIdentical(a, c)), "Unmatched tensors in addition!"); CheckNTErrors((XTensor::IsIdentical(a, c)), "Unmatched tensors in addition!");
CheckNTErrors((b->order == 2 && b->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]), CheckNTErrors((b->order == 2 && b->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]),
......
...@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* summation of a tensor and a vector (column vector) */ /* summation of a tensor and a vector (column vector) */
extern "C" extern "C"
void CudaSumByColumnTV(XTensor * a, XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0); void _CudaSumByColumnTV(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* sum of a tensor and a (column) vector */ /* sum of a tensor and a (column) vector */
extern "C" extern "C"
void SumByColumnTV(XTensor * a, XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0); void _SumByColumnTV(const XTensor * a, const XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -37,11 +37,8 @@ where c and a are vectors, and b_col is a column in b. ...@@ -37,11 +37,8 @@ where c and a are vectors, and b_col is a column in b.
>> c - where we put a+b. we save it in a if c is NULL >> c - where we put a+b. we save it in a if c is NULL
>> beta - the scaling factor >> beta - the scaling factor
*/ */
void SumByColumnVT(XTensor * a, XTensor * b, XTensor * c, DTYPE beta) void _SumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{ {
if (c == NULL)
c = a;
CheckNTErrors((a && b && c), "Empty input tensors!"); CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((XTensor::IsIdentical(a, c)), "Unmatched tensors in addition!"); CheckNTErrors((XTensor::IsIdentical(a, c)), "Unmatched tensors in addition!");
CheckNTErrors((a->order == 2 && a->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]), CheckNTErrors((a->order == 2 && a->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]),
...@@ -49,7 +46,7 @@ void SumByColumnVT(XTensor * a, XTensor * b, XTensor * c, DTYPE beta) ...@@ -49,7 +46,7 @@ void SumByColumnVT(XTensor * a, XTensor * b, XTensor * c, DTYPE beta)
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) { if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
#ifdef USE_CUDA #ifdef USE_CUDA
CudaSumByColumnVT(a, b, c, beta); _CudaSumByColumnVT(a, b, c, beta);
#endif #endif
} }
else { else {
......
...@@ -80,11 +80,8 @@ where c and a are vectors, and b_col is a column in b. ...@@ -80,11 +80,8 @@ where c and a are vectors, and b_col is a column in b.
>> c - where we put a+b. we save it in a if c is NULL >> c - where we put a+b. we save it in a if c is NULL
>> beta - the scaling factor >> beta - the scaling factor
*/ */
void CudaSumByColumnVT(XTensor * a, XTensor * b, XTensor * c, DTYPE beta) void _CudaSumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{ {
if (c == NULL)
c = a;
CheckNTErrors((a && b && c), "Empty input tensors!"); CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((XTensor::IsIdentical(a, c)), "Unmatched tensors in addition!"); CheckNTErrors((XTensor::IsIdentical(a, c)), "Unmatched tensors in addition!");
CheckNTErrors((a->order == 2 && a->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]), CheckNTErrors((a->order == 2 && a->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]),
......
...@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* summation of a vector (column vector) and a tensor */ /* summation of a vector (column vector) and a tensor */
extern "C" extern "C"
void CudaSumByColumnVT(XTensor * a, XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0); void _CudaSumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* sum of a (column) vector and a tensor */ /* sum of a (column) vector and a tensor */
extern "C" extern "C"
void SumByColumnVT(XTensor * a, XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0); void _SumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
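To make the shape checks above concrete: _SumByColumnTV expects a to be an n x m tensor and b an n x 1 column vector, and b is added to every column of a. A sketch, assuming X_FLOAT; note that the implementation no longer falls back to c = a when c is NULL, so a result tensor should be passed explicitly:

    int dimsA[2] = {3, 4};
    int dimsB[2] = {3, 1};
    XTensor * a = NewTensor(2, dimsA, X_FLOAT, 1.0F, -1);
    XTensor * b = NewTensor(2, dimsB, X_FLOAT, 1.0F, -1);
    XTensor * c = NewTensor(2, dimsA, X_FLOAT, 1.0F, -1);
    SetDataRand(a, -1.0F, 1.0F);
    SetDataRand(b, -1.0F, 1.0F);
    _SumByColumnTV(a, b, c);   /* c[i][j] = a[i][j] + b[i] * beta, with beta defaulting to 1.0 */
    delete a; delete b; delete c;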
...@@ -36,9 +36,9 @@ c = trans(a) * trans(b) * \alpha + c * \beta ...@@ -36,9 +36,9 @@ c = trans(a) * trans(b) * \alpha + c * \beta
>> beta - scalar >> beta - scalar
>> c - output matrix (2d tensor) >> c - output matrix (2d tensor)
*/ */
void MatrixMULCPU(XTensor * a, MATRIX_TRANS_TYPE transposedA, void _MatrixMULCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
XTensor * b, MATRIX_TRANS_TYPE transposedB, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta) XTensor * c, DTYPE alpha, DTYPE beta)
{ {
CheckNTErrors((a && b && c), "Empty input tensors!"); CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((a->order == 2 && b->order == 2 && c->order == 2), CheckNTErrors((a->order == 2 && b->order == 2 && c->order == 2),
......
...@@ -31,9 +31,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -31,9 +31,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
matrix multiplication via cuda version BLAS matrix multiplication via cuda version BLAS
*/ */
void CudaBLASMatrixMUL(cublasHandle_t * handle, void _CudaBLASMatrixMUL(cublasHandle_t * handle,
void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
void * c, TENSOR_DATA_TYPE dataTypeC, void * c, TENSOR_DATA_TYPE dataTypeC,
int na, int ma, int nb, int mb, int nc, int mc, int na, int ma, int nb, int mb, int nc, int mc,
DTYPE alpha, DTYPE beta) DTYPE alpha, DTYPE beta)
...@@ -88,7 +88,7 @@ void CudaBLASMatrixMUL(cublasHandle_t * handle, ...@@ -88,7 +88,7 @@ void CudaBLASMatrixMUL(cublasHandle_t * handle,
/* /*
matrix multiplication via cuda version BLAS matrix multiplication via cuda version BLAS
*/ */
void CudaBLASMatrixMULBatched(cublasHandle_t * handle, void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
const void ** a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, const void ** a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
const void ** b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, const void ** b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
void ** c, TENSOR_DATA_TYPE dataTypeC, void ** c, TENSOR_DATA_TYPE dataTypeC,
...@@ -144,7 +144,7 @@ void CudaBLASMatrixMULBatched(cublasHandle_t * handle, ...@@ -144,7 +144,7 @@ void CudaBLASMatrixMULBatched(cublasHandle_t * handle,
/* matrix multiplication in batch and strided mode via cuda version BLAS */ /* matrix multiplication in batch and strided mode via cuda version BLAS */
extern "C" extern "C"
void CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle, void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, long long int strideA, const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, long long int strideA,
const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, long long int strideB, const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, long long int strideB,
void * c, TENSOR_DATA_TYPE dataTypeC, long long int strideC, void * c, TENSOR_DATA_TYPE dataTypeC, long long int strideC,
...@@ -201,9 +201,9 @@ void CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle, ...@@ -201,9 +201,9 @@ void CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
/* /*
matrix multiplication via cuda version BLAS matrix multiplication via cuda version BLAS
*/ */
void CudaBLASMatrixMULList(cublasHandle_t * handle, void _CudaBLASMatrixMULList(cublasHandle_t * handle,
XList * a, MATRIX_TRANS_TYPE transposedA, const XList * a, MATRIX_TRANS_TYPE transposedA,
XList * b, MATRIX_TRANS_TYPE transposedB, const XList * b, MATRIX_TRANS_TYPE transposedB,
XList * c, XList * c,
int count, DTYPE alpha, DTYPE beta) int count, DTYPE alpha, DTYPE beta)
{ {
...@@ -255,7 +255,7 @@ void CudaBLASMatrixMULList(cublasHandle_t * handle, ...@@ -255,7 +255,7 @@ void CudaBLASMatrixMULList(cublasHandle_t * handle,
if (isUniform) { if (isUniform) {
XMem * mem = a0->mem; XMem * mem = a0->mem;
if (isStrided && a->count > 1) { if (isStrided && a->count > 1) {
CudaBLASMatrixMULBatchedStrided(handle, _CudaBLASMatrixMULBatchedStrided(handle,
a0->data, transposedA, a0->dataType, strideA / a0->unitSize, a0->data, transposedA, a0->dataType, strideA / a0->unitSize,
b0->data, transposedB, b0->dataType, strideB / b0->unitSize, b0->data, transposedB, b0->dataType, strideB / b0->unitSize,
c0->data, c0->dataType, strideC / c0->unitSize, a->count, c0->data, c0->dataType, strideC / c0->unitSize, a->count,
...@@ -297,7 +297,7 @@ void CudaBLASMatrixMULList(cublasHandle_t * handle, ...@@ -297,7 +297,7 @@ void CudaBLASMatrixMULList(cublasHandle_t * handle,
cudaMemcpy(bpGPU, bp, sizeof(DTYPE*) * b->count, cudaMemcpyHostToDevice); cudaMemcpy(bpGPU, bp, sizeof(DTYPE*) * b->count, cudaMemcpyHostToDevice);
cudaMemcpy(cpGPU, cp, sizeof(DTYPE*) * c->count, cudaMemcpyHostToDevice); cudaMemcpy(cpGPU, cp, sizeof(DTYPE*) * c->count, cudaMemcpyHostToDevice);
CudaBLASMatrixMULBatched(handle, _CudaBLASMatrixMULBatched(handle,
(const void**)apGPU, transposedA, a0->dataType, (const void**)apGPU, transposedA, a0->dataType,
(const void**)bpGPU, transposedB, b0->dataType, (const void**)bpGPU, transposedB, b0->dataType,
(void**)cpGPU, c0->dataType, a->count, (void**)cpGPU, c0->dataType, a->count,
...@@ -324,7 +324,7 @@ void CudaBLASMatrixMULList(cublasHandle_t * handle, ...@@ -324,7 +324,7 @@ void CudaBLASMatrixMULList(cublasHandle_t * handle,
XTensor * bi = (XTensor*)b->GetItem(i); XTensor * bi = (XTensor*)b->GetItem(i);
XTensor * ci = (XTensor*)c->GetItem(i); XTensor * ci = (XTensor*)c->GetItem(i);
CudaBLASMatrixMUL(handle, _CudaBLASMatrixMUL(handle,
ai->data, transposedA, ai->dataType, ai->data, transposedA, ai->dataType,
bi->data, transposedB, bi->dataType, bi->data, transposedB, bi->dataType,
ci->data, ci->dataType, ci->data, ci->dataType,
......
...@@ -28,21 +28,21 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -28,21 +28,21 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* matrix multiplication (BLAS) */ /* matrix multiplication (BLAS) */
extern "C" extern "C"
void MatrixMULCPU(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0); void _MatrixMULCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
#ifdef USE_CUDA #ifdef USE_CUDA
/* matrix multiplication via cuda version BLAS */ /* matrix multiplication via cuda version BLAS */
extern "C" extern "C"
void CudaBLASMatrixMUL(cublasHandle_t * handle, void _CudaBLASMatrixMUL(cublasHandle_t * handle,
void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
void * c, TENSOR_DATA_TYPE dataTypeC, void * c, TENSOR_DATA_TYPE dataTypeC,
int na, int ma, int nb, int mb, int nc, int mc, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0); int na, int ma, int nb, int mb, int nc, int mc, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
/* matrix multiplication in batch mode via cuda version BLAS */ /* matrix multiplication in batch mode via cuda version BLAS */
extern "C" extern "C"
void CudaBLASMatrixMULBatched(cublasHandle_t * handle, void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
const void ** a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, const void ** a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
const void ** b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, const void ** b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
void ** c, TENSOR_DATA_TYPE dataTypeC, void ** c, TENSOR_DATA_TYPE dataTypeC,
...@@ -50,7 +50,7 @@ void CudaBLASMatrixMULBatched(cublasHandle_t * handle, ...@@ -50,7 +50,7 @@ void CudaBLASMatrixMULBatched(cublasHandle_t * handle,
/* matrix multiplication in batch and strided mode via cuda version BLAS */ /* matrix multiplication in batch and strided mode via cuda version BLAS */
extern "C" extern "C"
void CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle, void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, long long int strideA, const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, long long int strideA,
const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, long long int strideB, const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, long long int strideB,
void * c, TENSOR_DATA_TYPE dataTypeC, long long int strideC, void * c, TENSOR_DATA_TYPE dataTypeC, long long int strideC,
...@@ -58,7 +58,7 @@ void CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle, ...@@ -58,7 +58,7 @@ void CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
/* matrix multiplication in batch mode via cuda version BLAS */ /* matrix multiplication in batch mode via cuda version BLAS */
extern "C" extern "C"
void CudaBLASMatrixMULList(cublasHandle_t * handle, XList * a, MATRIX_TRANS_TYPE transposedA, XList * b, MATRIX_TRANS_TYPE transposedB, XList * c, void _CudaBLASMatrixMULList(cublasHandle_t * handle, const XList * a, MATRIX_TRANS_TYPE transposedA, const XList * b, MATRIX_TRANS_TYPE transposedB, XList * c,
int count, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0); int count, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
#endif #endif
......
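A quick sketch of the renamed CPU BLAS wrapper, assuming the X_NOTRANS transpose flag and the X_FLOAT constant defined elsewhere in the library (sizes are illustrative only):

    int dimsA[2] = {2, 3};
    int dimsB[2] = {3, 4};
    int dimsC[2] = {2, 4};
    XTensor * a = NewTensor(2, dimsA, X_FLOAT, 1.0F, -1);
    XTensor * b = NewTensor(2, dimsB, X_FLOAT, 1.0F, -1);
    XTensor * c = NewTensor(2, dimsC, X_FLOAT, 1.0F, -1);
    SetDataRand(a, -1.0F, 1.0F);
    SetDataRand(b, -1.0F, 1.0F);
    c->SetZeroAll();
    _MatrixMULCPU(a, X_NOTRANS, b, X_NOTRANS, c);   /* c = a * b, since alpha defaults to 1 and beta to 0 */
    delete a; delete b; delete c;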
...@@ -30,15 +30,15 @@ convert data type ...@@ -30,15 +30,15 @@ convert data type
>> input - input tensor >> input - input tensor
>> output - output tensor >> output - output tensor
*/ */
void ConvertTensorDataType(XTensor * input, XTensor * output) void _ConvertDataType(const XTensor * input, XTensor * output)
{ {
CheckNTErrors(XTensor::IsIdentical(input, output), "Input and Output are different in type or size!"); CheckNTErrors((input->unitSize == output->unitSize), "Input and Output must be same in size!");
if (input->dataType == output->dataType) if (input->dataType == output->dataType)
return; return;
#ifdef USE_CUDA #ifdef USE_CUDA
/* run it on GPUs */ /* run it on GPUs */
if (input->devID >= 0) { if (input->devID >= 0) {
CudaConvertDataType(input, output); _CudaConvertDataType(input, output);
return; return;
} }
#endif #endif
......
...@@ -78,7 +78,7 @@ data conversion (cuda code) ...@@ -78,7 +78,7 @@ data conversion (cuda code)
>> typeT - target data type >> typeT - target data type
>> size - number of the items in s (and t) >> size - number of the items in s (and t)
*/ */
void CudaConvertDataType(int devID, void * s, TENSOR_DATA_TYPE typeS, void * t, TENSOR_DATA_TYPE typeT, int size) void _CudaConvertDataType(int devID, void * s, TENSOR_DATA_TYPE typeS, void * t, TENSOR_DATA_TYPE typeT, int size)
{ {
CheckNTErrors((devID >= 0), "This code must be run on GPUs!"); CheckNTErrors((devID >= 0), "This code must be run on GPUs!");
...@@ -112,9 +112,9 @@ convert data type (cuda code) ...@@ -112,9 +112,9 @@ convert data type (cuda code)
>> input - input tensor >> input - input tensor
>> output - output tensor >> output - output tensor
*/ */
void CudaConvertDataType(XTensor * input, XTensor * output) void _CudaConvertDataType(const XTensor * input, XTensor * output)
{ {
CheckNTErrors(XTensor::IsIdentical(input, output), "Input and Output are different in type or size!"); CheckNTErrors((input->unitSize == output->unitSize), "Input and Output must be same in size!");
if (input->dataType == output->dataType) if (input->dataType == output->dataType)
return; return;
......
...@@ -19,6 +19,9 @@ ...@@ -19,6 +19,9 @@
* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11 * $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
*/ */
#ifndef __CONVERTDATATYPE_CUH__
#define __CONVERTDATATYPE_CUH__
#include "ConvertDataType.h" #include "ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -42,8 +45,10 @@ __global__ ...@@ -42,8 +45,10 @@ __global__
void KernelIntToFloat(int * inputData, float * outputData, int size); void KernelIntToFloat(int * inputData, float * outputData, int size);
/* convert data type */ /* convert data type */
void CudaConvertDataType(XTensor * input, XTensor * output); void _CudaConvertDataType(const XTensor * input, XTensor * output);
#endif // USE_CUDA #endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
#endif // __CONVERTDATATYPE_CUH__
\ No newline at end of file
...@@ -27,7 +27,7 @@ ...@@ -27,7 +27,7 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* convert data type */ /* convert data type */
void ConvertDataType(XTensor * input, XTensor * output); void _ConvertDataType(const XTensor * input, XTensor * output);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
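A sketch of the renamed tensor-level conversion, assuming the X_FLOAT and X_INT type constants; note that the new check only requires the two tensors to share the same unit size, so a float-to-int conversion of same-shaped tensors passes it:

    int dims[2] = {2, 2};
    XTensor * f = NewTensor(2, dims, X_FLOAT, 1.0F, -1);
    XTensor * n = NewTensor(2, dims, X_INT, 1.0F, -1);
    SetDataRand(f, 0.0F, 10.0F);
    _ConvertDataType(f, n);   /* every float entry of f is converted to an int entry of n */
    delete f; delete n;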
...@@ -26,8 +26,10 @@ ...@@ -26,8 +26,10 @@
namespace nts{ // namespace nts(NiuTrans.Tensor) namespace nts{ // namespace nts(NiuTrans.Tensor)
/* /*
generate a tensor with seleccted data in range[low,high] along the given dimension generate a tensor with selected data in range[low,high] along the given dimension
c = select(a) c = select(a)
>> a - input tensor >> a - input tensor
>> c - result tensor >> c - result tensor
>> dim - the dimension along with which we do the job >> dim - the dimension along with which we do the job
...@@ -35,7 +37,7 @@ c = select(a) ...@@ -35,7 +37,7 @@ c = select(a)
>> high - higher bound. >> high - higher bound.
Note that range [1,3] means that we select 1 and 2. Note that range [1,3] means that we select 1 and 2.
*/ */
void SelectRange(XTensor * a, XTensor * c, int dim, int low, int high) void _SelectRange(const XTensor * a, XTensor * c, int dim, int low, int high)
{ {
CheckNTErrors(a != NULL && c != NULL, "empty tensors!"); CheckNTErrors(a != NULL && c != NULL, "empty tensors!");
CheckNTErrors(a->order == c->order, "The input and output tensors must be in the same order!"); CheckNTErrors(a->order == c->order, "The input and output tensors must be in the same order!");
...@@ -76,4 +78,55 @@ void SelectRange(XTensor * a, XTensor * c, int dim, int low, int high) ...@@ -76,4 +78,55 @@ void SelectRange(XTensor * a, XTensor * c, int dim, int low, int high)
} }
} }
/*
generate a tensor with selected data in range[low,high] along the given dimension (return a XTensor structure)
make a new tensor to keep the result and return it
c = select(a)
>> a - input tensor
>> dim - the dimension along with which we do the job
>> low - lower bound
>> high - higher bound.
Note that range [1,3] means that we select 1 and 2.
<< return - the result of the generated tensor with selected data
*/
XTensor SelectRange(const XTensor &a, int dim, int low, int high)
{
int order = a.order;
int * dimSize = new int[order];
CheckNTErrors(&a != NULL, "Empty input tensors!");
CheckNTErrors(dim >= 0 && dim < a.order, "The input dimension is out of bounds!");
CheckNTErrors(low < high, "Illegal range specified!");
for(int i = 0; i < a.order; i++){
if(i == dim){
CheckNTErrors(low >= 0 && low < a.dimSize[dim], "Illegal range specified!");
CheckNTErrors(high > 0 && high <= a.dimSize[dim], "Illegal range specified!");
dimSize[i] = high - low;
}
else
dimSize[i] = a.dimSize[i];
}
XTensor c = NewTensor(order, dimSize, a.dataType, a.denseRatio, a.devID, a.mem);
c.SetZeroAll();
c.SetTMP();
/* call _SelectRange function */
_SelectRange(&a, &c, dim, low, high);
/* tensor connection */
XLink::MakeLink(&a, NULL, &c, GETANDSET_SELECT);
XLink::AddParamToHead(&c, low);
XLink::AddParamToHead(&c, high);
/* destroy variables */
delete[] dimSize;
return c;
}
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-04
*/
#ifndef __SELECT_CUH__
#define __SELECT_CUH__
#include "Select.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* generate a tensor with selected data c = select(a) */
extern "C"
void _CudaSelect(const XTensor * a, XTensor * c, XTensor * indexCPU);
/*
generate a tensor with selected data in range[low,high] along the given dimension
c = select(a)
*/
extern "C"
void _CudaSelectRange(const XTensor * a, XTensor * c, int dim, int low, int high);
} // namespace nts(NiuTrans.Tensor)
#endif // __SELECT_CUH__
\ No newline at end of file
...@@ -26,14 +26,29 @@ ...@@ -26,14 +26,29 @@
namespace nts{ // namespace nts(NiuTrans.Tensor) namespace nts{ // namespace nts(NiuTrans.Tensor)
/* generate a tensor with seleccted data c = select(a) */ /* generate a tensor with selected data c = select(a) */
extern "C" extern "C"
void Select(XTensor * a, XTensor * c, XTensor * indexCPU); void _Select(const XTensor * a, XTensor * c, XTensor * indexCPU);
/*
generate a tensor with selected data c = select(a) (return a XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor Select(const XTensor &a, XTensor &indexCPU);
/* generate a tensor with seleccted data in range[low,high] along the given dimension /*
c = select(a) */ generate a tensor with selected data in range[low,high] along the given dimension
c = select(a)
*/
extern "C" extern "C"
void SelectRange(XTensor * a, XTensor * c, int dim, int low, int high); void _SelectRange(const XTensor * a, XTensor * c, int dim, int low, int high);
/*
generate a tensor with selected data in range[low,high] along the given dimension (return a XTensor structure)
make a new tensor to keep the result and return it
c = select(a)
*/
XTensor SelectRange(const XTensor &a, int dim, int low, int high);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
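A sketch of the two SelectRange interfaces above, assuming X_FLOAT and illustrative sizes; recall from the comments that the range [low, high) excludes high, so [1, 3) keeps indices 1 and 2:

    int dims[2] = {4, 5};
    XTensor * a = NewTensor(2, dims, X_FLOAT, 1.0F, -1);
    SetDataRand(a, -1.0F, 1.0F);
    XTensor c = SelectRange(*a, 0, 1, 3);     /* new-tensor form: rows 1 and 2 along dimension 0 */
    int dimsC[2] = {2, 5};
    XTensor * c2 = NewTensor(2, dimsC, X_FLOAT, 1.0F, -1);
    _SelectRange(a, c2, 0, 1, 3);             /* pointer form: the caller allocates the result first */
    delete a; delete c2;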
...@@ -77,7 +77,7 @@ void SetDataRand(XTensor * tensor, DTYPE low, DTYPE high) ...@@ -77,7 +77,7 @@ void SetDataRand(XTensor * tensor, DTYPE low, DTYPE high)
else{ else{
XTensor * t2 = NewTensor(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, -1); XTensor * t2 = NewTensor(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, -1);
SetDataRand(t2, low, high); SetDataRand(t2, low, high);
CopyValues(t2, tensor); _CopyValues(t2, tensor);
delete t2; delete t2;
} }
} }
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include "../../XTensor.h" #include "../../XTensor.h"
#include "Log.h" #include "Log.h"
#include "Log.cuh" #include "Log.cuh"
#include <math.h>
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -29,12 +30,12 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -29,12 +30,12 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
set every entry to its log value set every entry to its log value
>> a - the tensor we are processing >> a - the tensor we are processing
*/ */
void Log(XTensor * a) void _Log(XTensor * a)
{ {
#ifdef USE_CUDA #ifdef USE_CUDA
/* run it on GPUs */ /* run it on GPUs */
if (a->devID >= 0) { if (a->devID >= 0) {
CudaLog(a); _CudaLog(a);
return; return;
} }
#endif #endif
......
...@@ -58,7 +58,7 @@ set each entry to its log value ...@@ -58,7 +58,7 @@ set each entry to its log value
>> a - the tensor >> a - the tensor
*/ */
extern "C" extern "C"
void CudaLog(XTensor * a) void _CudaLog(XTensor * a)
{ {
CheckNTErrors((a->isSparse == false), "TODO!"); CheckNTErrors((a->isSparse == false), "TODO!");
......
...@@ -19,6 +19,9 @@ ...@@ -19,6 +19,9 @@
* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11 * $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
*/ */
#ifndef __LOG_CUH__
#define __LOG_CUH__
#include "Log.h" #include "Log.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -35,8 +38,10 @@ void KernelLog(__half * d, int size); ...@@ -35,8 +38,10 @@ void KernelLog(__half * d, int size);
/* set each entry to its log value */ /* set each entry to its log value */
extern "C" extern "C"
void CudaLog(XTensor * a); void _CudaLog(XTensor * a);
#endif // USE_CUDA #endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
#endif // __LOG_CUH__
\ No newline at end of file
...@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* set every entry to its log value */ /* set every entry to its log value */
extern "C" extern "C"
void Log(XTensor * a); void _Log(XTensor * a);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -21,15 +21,18 @@ ...@@ -21,15 +21,18 @@
#include <math.h> #include <math.h>
#include "../../XTensor.h" #include "../../XTensor.h"
#include "../../XName.h"
#include "Normalize.h" #include "Normalize.h"
#include "Normalize.cuh" #include "Normalize.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
normalize the data with normal distribution. For an input x, normalize the data with normal distribution
y = a * (x-mean)/sqrt(variance+\epsilon) + b
For an input x, y = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter. where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
>> input - the input tensor >> input - the input tensor
>> output - the output tensor >> output - the output tensor
>> dim - dimension along which we generate the mean and variance >> dim - dimension along which we generate the mean and variance
...@@ -39,7 +42,7 @@ where a and b are the scalar and bias respectively, and \epsilon is the adjustme ...@@ -39,7 +42,7 @@ where a and b are the scalar and bias respectively, and \epsilon is the adjustme
>> b - the bias >> b - the bias
>> epsilon - a parameter >> epsilon - a parameter
*/ */
void Normalize(XTensor * input, XTensor * output, int dim, XTensor * mean, XTensor * var, XTensor * a, XTensor * b, DTYPE epsilon) void _Normalize(const XTensor * input, XTensor * output, int dim, const XTensor * mean, const XTensor * var, const XTensor * a, const XTensor * b, DTYPE epsilon)
{ {
int dimRDI = input->order - dim - 1; int dimRDI = input->order - dim - 1;
CheckNTErrors((XTensor::IsIdentical(input, output)), "Unmatched input tensors!"); CheckNTErrors((XTensor::IsIdentical(input, output)), "Unmatched input tensors!");
...@@ -68,7 +71,7 @@ void Normalize(XTensor * input, XTensor * output, int dim, XTensor * mean, XTens ...@@ -68,7 +71,7 @@ void Normalize(XTensor * input, XTensor * output, int dim, XTensor * mean, XTens
if (input->devID >= 0 || output->devID >= 0) { if (input->devID >= 0 || output->devID >= 0) {
#ifdef USE_CUDA #ifdef USE_CUDA
CudaNormalize(input, output, dim, mean, var, a, b, epsilon); _CudaNormalize(input, output, dim, mean, var, a, b, epsilon);
#else #else
ShowNTErrors("Please specify USE_CUDA and recompile the code!"); ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif #endif
...@@ -91,4 +94,61 @@ void Normalize(XTensor * input, XTensor * output, int dim, XTensor * mean, XTens ...@@ -91,4 +94,61 @@ void Normalize(XTensor * input, XTensor * output, int dim, XTensor * mean, XTens
} }
} }
} }
/*
normalize the data with normal distribution (do it on site)
keep the result in the input tensor and return nothing
For an input x, x = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
>> input - the input tensor
>> dim - dimension along which we generate the mean and variance
>> mean - the mean of the input
>> var - the variance of the input
>> a - the scalar
>> b - the bias
>> epsilon - a parameter
*/
void _NormalizeMe(XTensor * input, int dim, const XTensor * mean, const XTensor * var, const XTensor * a, const XTensor * b, DTYPE epsilon)
{
_Normalize(input, input, dim, mean, var, a, b, epsilon);
}
/*
normalize the data with normal distribution (return a XTensor structure)
make a new tensor to keep the result and return it
For an input x, y = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
>> input - the input tensor
>> dim - dimension along which we generate the mean and variance
>> mean - the mean of the input
>> var - the variance of the input
>> a - the scalar
>> b - the bias
>> epsilon - a parameter
<< return - the result of normalizing the data with normal distribution
*/
XTensor Normalize(const XTensor &input, int dim, const XTensor &mean, const XTensor &var, const XTensor &a, const XTensor &b, DTYPE epsilon)
{
XTensor output(&input);
output.SetTMP();
/* call _Normalize function */
_Normalize(&input, &output, dim, &mean, &var, &a, &b, epsilon);
/* tensor connections */
XList list(5);
list.Add(&input);
list.Add(&mean);
list.Add(&var);
list.Add(&a);
list.Add(&b);
XLink::MakeLink(&list, &output, MATH_NORMALIZE);
XLink::AddParamToHeadInt(&output, dim);
XLink::AddParamToHead(&output, epsilon);
return output;
}
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
...@@ -89,9 +89,9 @@ where a and b are the scalar and bias respectively, and \epsilon is the adjustme ...@@ -89,9 +89,9 @@ where a and b are the scalar and bias respectively, and \epsilon is the adjustme
>> epsilon - a parameter >> epsilon - a parameter
*/ */
extern "C" extern "C"
void CudaNormalize(XTensor * input, XTensor * output, int dim, void _CudaNormalize(const XTensor * input, XTensor * output, int dim,
XTensor * mean, XTensor * var, const XTensor * mean, const XTensor * var,
XTensor * a, XTensor * b, const XTensor * a, const XTensor * b,
DTYPE epsilon) DTYPE epsilon)
{ {
CheckNTErrors((input->dataType == DEFAULT_DTYPE), "TODO!"); CheckNTErrors((input->dataType == DEFAULT_DTYPE), "TODO!");
......
...@@ -44,9 +44,9 @@ y = a * (x-mean)/sqrt(variance+\epsilon) + b ...@@ -44,9 +44,9 @@ y = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter
*/ */
extern "C" extern "C"
void CudaNormalize(XTensor * input, XTensor * output, int dim, void _CudaNormalize(const XTensor * input, XTensor * output, int dim,
XTensor * mean, XTensor * var, const XTensor * mean, const XTensor * var,
XTensor * a, XTensor * b, DTYPE epsilon); const XTensor * a, const XTensor * b, DTYPE epsilon);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -27,12 +27,29 @@ ...@@ -27,12 +27,29 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
normalize the data with normal distribution. For an input x, normalize the data with normal distribution.
y = a * (x-mean)/sqrt(variance+\epsilon) + b For an input x, y = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter. where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
*/ */
extern "C" extern "C"
void Normalize(XTensor * input, XTensor * output, int dim, XTensor * mean, XTensor * var, XTensor * a, XTensor * b, DTYPE epsilon); void _Normalize(const XTensor * input, XTensor * output, int dim, const XTensor * mean, const XTensor * var, const XTensor * a, const XTensor * b, DTYPE epsilon);
/*
normalize the data with normal distribution (do it on site)
keep the result in the input tensor and return nothing
For an input x, x = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
*/
extern "C"
void _NormalizeMe(XTensor * input, int dim, const XTensor * mean, const XTensor * var, const XTensor * a, const XTensor * b, DTYPE epsilon);
/*
normalize the data with normal distribution (return a XTensor structure)
make a new tensor to keep the result and return it
For an input x, y = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
*/
XTensor Normalize(const XTensor &input, int dim, const XTensor &mean, const XTensor &var, const XTensor &a, const XTensor &b, DTYPE epsilon);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
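A sketch of how the three normalization interfaces above are meant to be called; here x, y, mean, var, g and bias are assumed to be existing XTensor objects whose shapes satisfy the checks in _Normalize, and nothing in this sketch is part of the commit:

    DTYPE epsilon = (DTYPE)1e-6;
    _Normalize(&x, &y, 0, &mean, &var, &g, &bias, epsilon);    /* result written into the caller-provided y */
    _NormalizeMe(&x, 0, &mean, &var, &g, &bias, epsilon);      /* x is overwritten in place */
    XTensor z = Normalize(x, 0, mean, var, g, bias, epsilon);  /* a new tensor z keeps the result */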
...@@ -31,12 +31,12 @@ get the power(a, p) ...@@ -31,12 +31,12 @@ get the power(a, p)
>> a - the tensor >> a - the tensor
>> p - as it is >> p - as it is
*/ */
void Power(XTensor * a, DTYPE p) void _Power(XTensor * a, DTYPE p)
{ {
#ifdef USE_CUDA #ifdef USE_CUDA
/* run it on GPUs */ /* run it on GPUs */
if (a->devID >= 0) { if (a->devID >= 0) {
CudaPower(a, p); _CudaPower(a, p);
return; return;
} }
#endif #endif
......
...@@ -96,7 +96,7 @@ void KernelPower(__half * d, __half p, int size) ...@@ -96,7 +96,7 @@ void KernelPower(__half * d, __half p, int size)
/* get the power of the entries */ /* get the power of the entries */
extern "C" extern "C"
void CudaPower(XTensor * a, DTYPE p) void _CudaPower(XTensor * a, DTYPE p)
{ {
int gridSize[3]; int gridSize[3];
int blockSize[3]; int blockSize[3];
......
...@@ -38,7 +38,7 @@ void KernelSqrtV2(__half * d, int size); ...@@ -38,7 +38,7 @@ void KernelSqrtV2(__half * d, int size);
/* get the power of the entries */ /* get the power of the entries */
extern "C" extern "C"
void CudaPower(XTensor * a, DTYPE p); void _CudaPower(XTensor * a, DTYPE p);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* get the power(x, y) */ /* get the power(x, y) */
extern "C" extern "C"
void Power(XTensor * a, DTYPE p); void _Power(XTensor * a, DTYPE p);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
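The same in-place pattern applies to the other renamed unary math functions such as _Log and _Power; a brief sketch, assuming X_FLOAT and illustrative sizes:

    int dims[2] = {2, 2};
    XTensor * t = NewTensor(2, dims, X_FLOAT, 1.0F, -1);
    SetDataRand(t, 1.0F, 2.0F);   /* positive entries so that the log is well defined */
    _Power(t, (DTYPE)2.0);        /* each entry becomes its square, in place */
    _Log(t);                      /* each entry becomes its log value, in place */
    delete t;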
...@@ -28,8 +28,10 @@ ...@@ -28,8 +28,10 @@
namespace nts{ // namespace nts(NiuTrans.Tensor) namespace nts{ // namespace nts(NiuTrans.Tensor)
/* /*
scale and shift all tensor entries b = a * scale + shift scale and shift all tensor entries
b = a * scale + shift b = a * scale + shift
>> a - the input tensor >> a - the input tensor
>> b - the output tensor >> b - the output tensor
>> scale - the scaling factor >> scale - the scaling factor
...@@ -76,8 +78,11 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift) ...@@ -76,8 +78,11 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
} }
/* /*
scale and shift all tensor entries on site b = a * scale + shift scale and shift all tensor entries (do it on site)
b = a * scale + shift keep the result in the input tensor a and return nothing
a = a * scale + shift
>> a - the input/output tensor >> a - the input/output tensor
>> scale - the scaling factor >> scale - the scaling factor
>> shift - the shift factor >> shift - the shift factor
...@@ -88,19 +93,22 @@ void _ScaleAndShiftMe(XTensor * a, DTYPE scale, DTYPE shift) ...@@ -88,19 +93,22 @@ void _ScaleAndShiftMe(XTensor * a, DTYPE scale, DTYPE shift)
} }
/* /*
scale and shift all tensor entries b = a * scale + shift scale and shift all tensor entries (return a XTensor structure)
make a new tensor to keep the result and return it
b = a * scale + shift b = a * scale + shift
>> a - the input tensor >> a - the input tensor
>> b - the output tensor
>> scale - the scaling factor >> scale - the scaling factor
>> shift - the shift factor >> shift - the shift factor
<< return - the result of scaling and shifting all tensor entries
*/ */
XTensor ScaleAndShift(const XTensor &a, DTYPE scale, DTYPE shift) XTensor ScaleAndShift(const XTensor &a, DTYPE scale, DTYPE shift)
{ {
XTensor b(&a); XTensor b(&a);
b.SetTMP(); b.SetTMP();
/* computation */ /* call _ScaleAndShift function */
_ScaleAndShift(&a, &b, scale, shift); _ScaleAndShift(&a, &b, scale, shift);
/* tensor connections */ /* tensor connections */
......
...@@ -30,13 +30,24 @@ namespace nts{ // namespace nts(NiuTrans.Tensor) ...@@ -30,13 +30,24 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
#define _LinearMe _ScaleAndShiftMe #define _LinearMe _ScaleAndShiftMe
#define Linear ScaleAndShift #define Linear ScaleAndShift
/* scale and shift all tensor entries b = a * scale + shift */ /*
scale and shift all tensor entries
b = a * scale + shift
*/
void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift = 0); void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift = 0);
/* scale and shift all tensor entries on site a = a * scale + shift */ /*
scale and shift all tensor entries
keep the result in the input tensor a and return nothing
a = a * scale + shift
*/
void _ScaleAndShiftMe(XTensor * a, DTYPE scale, DTYPE shift = 0); void _ScaleAndShiftMe(XTensor * a, DTYPE scale, DTYPE shift = 0);
/* scale and shift all tensor entries b = a * scale + shift, and return the result tensor b */ /*
scale and shift all tensor entries
make a new tensor to keep the result and return it
b = a * scale + shift
*/
XTensor ScaleAndShift(const XTensor &a, DTYPE scale, DTYPE shift = 0); XTensor ScaleAndShift(const XTensor &a, DTYPE scale, DTYPE shift = 0);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
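A sketch of the three ScaleAndShift interfaces (and their Linear aliases) declared above; a is assumed to be an existing XTensor, and the constants are illustrative only:

    XTensor * b = NewTensor(a.order, a.dimSize, a.dataType, a.denseRatio, a.devID, a.mem);
    _ScaleAndShift(&a, b, (DTYPE)2.0, (DTYPE)1.0);          /* b = a * 2 + 1, written into the caller-provided b */
    _ScaleAndShiftMe(&a, (DTYPE)0.5);                       /* a = a * 0.5, in place, shift defaulting to 0 */
    XTensor c = ScaleAndShift(a, (DTYPE)2.0, (DTYPE)1.0);   /* a new tensor c = a * 2 + 1 */
    delete b;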
...@@ -36,7 +36,7 @@ copy a number of blocks to target positions ...@@ -36,7 +36,7 @@ copy a number of blocks to target positions
>> targetBlocks - target positions of the copy >> targetBlocks - target positions of the copy
>> myMem - the memory pool >> myMem - the memory pool
*/ */
void CopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem) void _CopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem)
{ {
if (myMem != NULL && myMem->devID >= 0) { if (myMem != NULL && myMem->devID >= 0) {
#ifdef USE_CUDA #ifdef USE_CUDA
...@@ -44,7 +44,7 @@ void CopyBlocks(void * source, int blockSize, int blockNum, void * target, int * ...@@ -44,7 +44,7 @@ void CopyBlocks(void * source, int blockSize, int blockNum, void * target, int *
int * targetBlocksTMP = (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int)); int * targetBlocksTMP = (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int));
XMemCopy(targetBlocksTMP, myMem->devID, targetBlocks, -1, blockNum * sizeof(int)); XMemCopy(targetBlocksTMP, myMem->devID, targetBlocks, -1, blockNum * sizeof(int));
CopyBlocksOnSite(source, blockSize, blockNum, target, targetBlocksTMP, myMem); _CopyBlocksOnSite(source, blockSize, blockNum, target, targetBlocksTMP, myMem);
myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int)); myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
#else #else
...@@ -52,7 +52,7 @@ void CopyBlocks(void * source, int blockSize, int blockNum, void * target, int * ...@@ -52,7 +52,7 @@ void CopyBlocks(void * source, int blockSize, int blockNum, void * target, int *
#endif #endif
} }
else { else {
CopyBlocksOnSite(source, blockSize, blockNum, target, targetBlocks, myMem); _CopyBlocksOnSite(source, blockSize, blockNum, target, targetBlocks, myMem);
} }
} }
...@@ -66,14 +66,14 @@ copy a number of blocks from source positions to target positions ...@@ -66,14 +66,14 @@ copy a number of blocks from source positions to target positions
>> targetBlocks - target positions of the copy >> targetBlocks - target positions of the copy
>> myMem - the memory pool >> myMem - the memory pool
*/ */
void CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID) void _CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID)
{ {
if (myMem != NULL) if (myMem != NULL)
CheckNTErrors((myMem->devID == devID), "DevIDs are different between memory pool and input devID!"); CheckNTErrors((myMem->devID == devID), "DevIDs are different between memory pool and input devID!");
if (devID >= 0) { if (devID >= 0) {
#ifdef USE_CUDA #ifdef USE_CUDA
CudaCopyBlocksSelected(source, blockSize, sourceBlocks, blockNum, target, targetBlocks, myMem, devID); _CudaCopyBlocksSelected(source, blockSize, sourceBlocks, blockNum, target, targetBlocks, myMem, devID);
#else #else
ShowNTErrors("Plesae specify USE_CUDA and recompile the code!"); ShowNTErrors("Plesae specify USE_CUDA and recompile the code!");
#endif #endif
......
...@@ -27,10 +27,10 @@ ...@@ -27,10 +27,10 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy a number of blocks to target positions */ /* copy a number of blocks to target positions */
void CopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem); void _CopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem);
/* copy a number of blocks from source positions to target positions */ /* copy a number of blocks from source positions to target positions */
void CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID); void _CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
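For the block-copy helpers above, a small CPU-side sketch with raw buffers. The CPU path is not shown in this diff, so the assumptions here are that block i of the source is copied to position targetBlocks[i] of the target and that block sizes are given in bytes (as the CUDA checks suggest):

    float src[8] = {0, 1, 2, 3, 4, 5, 6, 7};   /* two blocks of four floats each */
    float tgt[8] = {0};
    int targetBlocks[2] = {1, 0};               /* write block 0 to slot 1 and block 1 to slot 0 */
    _CopyBlocks(src, (int)(4 * sizeof(float)), 2, tgt, targetBlocks, NULL);
    /* under the stated assumptions, tgt now holds {4, 5, 6, 7, 0, 1, 2, 3} */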
...@@ -38,7 +38,7 @@ Note that a grid may have a number of blocks ...@@ -38,7 +38,7 @@ Note that a grid may have a number of blocks
>> myMem - the memory pool >> myMem - the memory pool
>> isIndexOnDev - indicates whether the index is on the device already >> isIndexOnDev - indicates whether the index is on the device already
*/ */
void CopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, void * target, void _CopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, void * target,
int * index, int unitSize, bool isIndexOnDev, XMem * myMem) int * index, int unitSize, bool isIndexOnDev, XMem * myMem)
{ {
CheckNTErrors((unitSize == sizeof(int)), "TODO!"); CheckNTErrors((unitSize == sizeof(int)), "TODO!");
...@@ -51,7 +51,7 @@ void CopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, v ...@@ -51,7 +51,7 @@ void CopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, v
XMemCopy(indexGPU, myMem->devID, index, -1, blockNum * gridNum * sizeof(int)); XMemCopy(indexGPU, myMem->devID, index, -1, blockNum * gridNum * sizeof(int));
} }
CudaCopyBlocksInGrid(source, blockSize, blockNum, gridNum, target, indexGPU, unitSize, myMem); _CudaCopyBlocksInGrid(source, blockSize, blockNum, gridNum, target, indexGPU, unitSize, myMem);
if (!isIndexOnDev) if (!isIndexOnDev)
myMem->ReleaseBuf(myMem->devID, blockNum * gridNum * sizeof(int)); myMem->ReleaseBuf(myMem->devID, blockNum * gridNum * sizeof(int));
......
...@@ -216,7 +216,7 @@ Note that a grid may have a number of blocks ...@@ -216,7 +216,7 @@ Note that a grid may have a number of blocks
>> itemSize - size of each data item >> itemSize - size of each data item
>> myMem - the memory pool >> myMem - the memory pool
*/ */
void CudaCopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, void * target, int * index, int itemSize, XMem * myMem) void _CudaCopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, void * target, int * index, int itemSize, XMem * myMem)
{ {
CheckNTErrors((myMem != NULL && myMem->devID >= 0), "This code must be run on GPUs!"); CheckNTErrors((myMem != NULL && myMem->devID >= 0), "This code must be run on GPUs!");
CheckNTErrors((itemSize == sizeof(int)), "TODO!"); CheckNTErrors((itemSize == sizeof(int)), "TODO!");
......
...@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy data by index */ /* copy data by index */
extern "C" extern "C"
void CudaCopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, void * target, int * index, int unitSize, XMem * myMem); void _CudaCopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, void * target, int * index, int unitSize, XMem * myMem);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy a number of blocks in grid */ /* copy a number of blocks in grid */
extern "C" extern "C"
void CopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, void * target, int * index, int unitSize, bool isIndexOnDev, XMem * myMem); void _CopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, void * target, int * index, int unitSize, bool isIndexOnDev, XMem * myMem);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -36,11 +36,11 @@ all the data has been on the device (CPU/GPU) already. ...@@ -36,11 +36,11 @@ all the data has been on the device (CPU/GPU) already.
>> targetBlocks - target positions of the copy >> targetBlocks - target positions of the copy
>> myMem - the memory pool >> myMem - the memory pool
*/ */
void CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem) void _CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem)
{ {
if (myMem != NULL && myMem->devID >= 0) { if (myMem != NULL && myMem->devID >= 0) {
#ifdef USE_CUDA #ifdef USE_CUDA
CudaCopyBlocks(source, blockSize, blockNum, target, targetBlocks, myMem); _CudaCopyBlocks(source, blockSize, blockNum, target, targetBlocks, myMem);
#else #else
ShowNTErrors("Plesae specify USE_CUDA and recompile the code!"); ShowNTErrors("Plesae specify USE_CUDA and recompile the code!");
#endif #endif
......
...@@ -80,7 +80,7 @@ copy a number of blocks to target positions (cuda version) ...@@ -80,7 +80,7 @@ copy a number of blocks to target positions (cuda version)
>> targetBlocks - target positions of the copy (on the device) >> targetBlocks - target positions of the copy (on the device)
>> myMem - memory pool >> myMem - memory pool
*/ */
void CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem) void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem)
{ {
CheckNTErrors((myMem != NULL), "No memory pool!"); CheckNTErrors((myMem != NULL), "No memory pool!");
CheckNTErrors((myMem->devID >= 0), "Wrong device to run!"); CheckNTErrors((myMem->devID >= 0), "Wrong device to run!");
......
...@@ -34,7 +34,7 @@ void KernelCopyBlocks(DTYPE * source, int blockSize, int blockNum, DTYPE * targe ...@@ -34,7 +34,7 @@ void KernelCopyBlocks(DTYPE * source, int blockSize, int blockNum, DTYPE * targe
/* copy a number of blocks to target positions (cuda version) */ /* copy a number of blocks to target positions (cuda version) */
extern "C" extern "C"
void CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem); void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy a number of blocks to target positions (on site) */ /* copy a number of blocks to target positions (on site) */
extern "C" extern "C"
void CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem); void _CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -70,7 +70,7 @@ copy a number of blocks from source positions to target positions (cuda version) ...@@ -70,7 +70,7 @@ copy a number of blocks from source positions to target positions (cuda version)
>> targetBlocks - target positions of the copy >> targetBlocks - target positions of the copy
>> myMem - memory pool >> myMem - memory pool
*/ */
void CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID) void _CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID)
{ {
CheckNTErrors((devID >= 0), "Wrong device to run!"); CheckNTErrors((devID >= 0), "Wrong device to run!");
CheckNTErrors((blockSize % sizeof(DTYPE) == 0), "Unsupported block size!"); CheckNTErrors((blockSize % sizeof(DTYPE) == 0), "Unsupported block size!");
......
...@@ -34,7 +34,7 @@ void KernelCopyBlocksSelected(DTYPE * source, int blockSize, int * sourceBlocks, ...@@ -34,7 +34,7 @@ void KernelCopyBlocksSelected(DTYPE * source, int blockSize, int * sourceBlocks,
/* copy a number of blocks from source positions to target positions (cuda version) */ /* copy a number of blocks from source positions to target positions (cuda version) */
extern "C" extern "C"
void CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID); void _CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -36,7 +36,7 @@ copy data blocks by 2d layout ...@@ -36,7 +36,7 @@ copy data blocks by 2d layout
>> n - height of each block >> n - height of each block
>> myMem - the memory pool >> myMem - the memory pool
*/ */
void CopyData2D(void ** s, int sPitch, void ** t, int tPitch, int blockNum, int mSize, int n, XMem * myMem) void _CopyData2D(void ** s, int sPitch, void ** t, int tPitch, int blockNum, int mSize, int n, XMem * myMem)
{ {
int devID = myMem != NULL ? myMem->devID : -1; int devID = myMem != NULL ? myMem->devID : -1;
......
...@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy data blocks by 2d layout */ /* copy data blocks by 2d layout */
extern "C" extern "C"
void CopyData2D(void ** s, int sPitch, void ** t, int tPitch, int count, int mSize, int n, XMem * myMem); void _CopyData2D(void ** s, int sPitch, void ** t, int tPitch, int count, int mSize, int n, XMem * myMem);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -36,7 +36,7 @@ in the k-th grid ...@@ -36,7 +36,7 @@ in the k-th grid
>> blockNumInGrid - number of blocks in each grid >> blockNumInGrid - number of blocks in each grid
>> isIndexOnDev - indicates whether the index is on the device already >> isIndexOnDev - indicates whether the index is on the device already
*/ */
void CopyInGrid(XTensor * s, XTensor * t, int * index, int blockDim, int blockNumInGrid, bool isIndexOnDev) void _CopyInGrid(const XTensor * s, XTensor * t, int * index, int blockDim, int blockNumInGrid, bool isIndexOnDev)
{ {
CheckNTErrors((XTensor::IsIdentical(s, t)), "Unmatched tensors!"); CheckNTErrors((XTensor::IsIdentical(s, t)), "Unmatched tensors!");
...@@ -50,7 +50,7 @@ void CopyInGrid(XTensor * s, XTensor * t, int * index, int blockDim, int blockNu ...@@ -50,7 +50,7 @@ void CopyInGrid(XTensor * s, XTensor * t, int * index, int blockDim, int blockNu
CheckNTErrors((s->unitNum % (blockSize * blockNum) == 0), "Illegal block number!"); CheckNTErrors((s->unitNum % (blockSize * blockNum) == 0), "Illegal block number!");
gridNum = s->unitNum / (blockSize * blockNum); gridNum = s->unitNum / (blockSize * blockNum);
CopyBlocksInGrid(s->data, blockSize, blockNum, gridNum, t->data, index, s->unitSize, isIndexOnDev, s->mem); _CopyBlocksInGrid(s->data, blockSize, blockNum, gridNum, t->data, index, s->unitSize, isIndexOnDev, s->mem);
} }
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
...@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy a number of blocks in a grid, i.e., reorder the data blocks within the same memory piece */ /* copy a number of blocks in a grid, i.e., reorder the data blocks within the same memory piece */
extern "C" extern "C"
void CopyInGrid(XTensor * s, XTensor * t, int * index, int blockDim, int blockNumInGrid, bool isIndexOnDev = false); void _CopyInGrid(const XTensor * s, XTensor * t, int * index, int blockDim, int blockNumInGrid, bool isIndexOnDev = false);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -21,11 +21,13 @@ ...@@ -21,11 +21,13 @@
#include "CopyIndexed.h" #include "CopyIndexed.h"
#include "CopyBlocks.h" #include "CopyBlocks.h"
#include "../../XName.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
copy indexed sub-tensors copy indexed sub-tensors
>> s - the source tensor >> s - the source tensor
>> t - the target tensor >> t - the target tensor
>> dim - the leading dimension to define "sub-tensors" >> dim - the leading dimension to define "sub-tensors"
...@@ -34,11 +36,11 @@ copy indexed sub-tensors ...@@ -34,11 +36,11 @@ copy indexed sub-tensors
>> srcIndex - index of the source sub-tensors >> srcIndex - index of the source sub-tensors
>> indexSize - length of srcIndex (and tgtIndex) >> indexSize - length of srcIndex (and tgtIndex)
>> tgtIndex - index of the target sub-tensors >> tgtIndex - index of the target sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index, e.g., >> copyNum - number of the sub-tensors we copy for each source index,
for srcIndex = [1,4] and copyNum = 2, we actually copy the source sub-tensors 1, 2, 4, 5 e.g., for srcIndex = [1,4] and copyNum = 2,
<< return - whether copy indexed operation was successful we actually copy the source sub-tensors 1, 2, 4, 5
*/ */
bool CopyIndexed(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum) void _CopyIndexed(const XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum)
{ {
CheckNTErrors((s && t), "Invalid tensors!"); CheckNTErrors((s && t), "Invalid tensors!");
CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)), CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)),
...@@ -84,12 +86,62 @@ bool CopyIndexed(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSiz ...@@ -84,12 +86,62 @@ bool CopyIndexed(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSiz
CheckNTErrors((tgtIndex[i] < blockNumTgt), "Index is out of range!"); CheckNTErrors((tgtIndex[i] < blockNumTgt), "Index is out of range!");
} }
CopyBlocks(s->data, blockSizeSrc * s->unitSize, realSrcIndex, realIndexSize, t->data, realTgtIndex, s->mem, s->devID); _CopyBlocks(s->data, blockSizeSrc * s->unitSize, realSrcIndex, realIndexSize, t->data, realTgtIndex, s->mem, s->devID);
delete[] realSrcIndex; delete[] realSrcIndex;
delete[] realTgtIndex; delete[] realTgtIndex;
}
/*
copy indexed sub-tensors (return an XTensor structure)
make a new tensor to keep the result and return it
>> s - the source tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3,2)
>> srcIndex - index of the source sub-tensors
>> indexSize - length of srcIndex (and tgtIndex)
>> tgtIndex - index of the target sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index,
e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5
<< return - the result of copying indexed sub-tensors
*/
XTensor CopyIndexed(const XTensor &s, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum)
{
CheckNTErrors(&s, "Empty input tensor!");
CheckNTErrors((dim >= 0 && dim < s.order), "A too large dimension specified!");
int order = s.order;
int * dimSize = new int[order];
for (int i = 0; i < s.order; i++) {
if (i == dim)
dimSize[i] = indexSize * copyNum;
else
dimSize[i] = s.dimSize[i];
}
XTensor t = NewTensor(order, dimSize, s.dataType, s.denseRatio, s.devID, s.mem);
t.SetZeroAll();
t.SetTMP();
/* call _CopyIndexed function */
_CopyIndexed(&s, &t, dim, srcIndex, indexSize, tgtIndex, copyNum);
/* destroy variables */
delete[] dimSize;
/* tensor connection */
XLink::MakeLink(&s, NULL, &t, MOVEMENT_COPYINDEXED);
XLink::AddParamToHead(&t, dim);
XLink::AddParamToHeadPointer(&t, srcIndex);
XLink::AddParamToHead(&t, indexSize);
XLink::AddParamToHeadPointer(&t, tgtIndex);
XLink::AddParamToHead(&t, copyNum);
return true; return t;
} }
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
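For illustration, a minimal usage sketch of the two CopyIndexed interfaces above; the tensor s, the pre-allocated tensor t2, and the index values are assumptions made up for this sketch and are not part of the commit:

/* assume s is an existing dense XTensor of size (3, 2, 4) and t2 is a pre-allocated (3, 2, 2) tensor */
int srcIndex[2] = {0, 2};    /* take sub-tensors 0 and 2 along dimension 2 */
int tgtIndex[2] = {0, 1};    /* place them at positions 0 and 1 of the result */

/* new interface: allocates a temporary result and records the MOVEMENT_COPYINDEXED link */
XTensor t = CopyIndexed(s, 2, srcIndex, 2, tgtIndex, 1);    /* t has size (3, 2, 2) */

/* old-style in-place interface, now renamed with the leading underscore */
_CopyIndexed(&s, &t2, 2, srcIndex, 2, tgtIndex, 1);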
...@@ -28,7 +28,13 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -28,7 +28,13 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy selected sub-tensors */ /* copy selected sub-tensors */
extern "C" extern "C"
bool CopyIndexed(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum); void _CopyIndexed(const XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum);
/*
copy selected sub-tensors (return an XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor CopyIndexed(const XTensor &s, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -27,18 +27,15 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -27,18 +27,15 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* /*
copy s to t copy s to t
>> s - source >> s - source
>> t - target >> t - target
>> stream - the stream for creating the job pipeline >> stream - the stream for creating the job pipeline
<< return - succeeded or not
*/ */
bool CopyValues(const XTensor * s, XTensor * t, XStream * stream) void _CopyValues(const XTensor * s, XTensor * t, XStream * stream)
{ {
if (s == NULL || t == NULL) CheckNTErrors((s != NULL && t != NULL), "The input tensor and output tensor must be nonempty!");
return false; CheckNTErrors((s->data != NULL), "Cannot copy from an empty data array!");
if (s->data == NULL || t->data == NULL)
return false;
CheckNTErrors((t->data != NULL), "Cannot copy to an empty data array!"); CheckNTErrors((t->data != NULL), "Cannot copy to an empty data array!");
CheckNTErrors((s->unitNum == t->unitNum), "Unmatched data item number!"); CheckNTErrors((s->unitNum == t->unitNum), "Unmatched data item number!");
...@@ -48,12 +45,13 @@ bool CopyValues(const XTensor * s, XTensor * t, XStream * stream) ...@@ -48,12 +45,13 @@ bool CopyValues(const XTensor * s, XTensor * t, XStream * stream)
"The code must be run on the same device!"); "The code must be run on the same device!");
CheckNTErrors((s->isSparse || t->isSparse), "TODO!"); CheckNTErrors((s->isSparse || t->isSparse), "TODO!");
ConvertDataType(s->devID, s->data, s->dataType, t->data, t->dataType, s->unitNum); ConvertDataType(s->devID, s->data, s->dataType, t->data, t->dataType, s->unitNum);
return true;
} }
#ifdef USE_CUDA #ifdef USE_CUDA
if (s->devID >= 0 || t->devID >= 0) if (s->devID >= 0 || t->devID >= 0) {
return CudaCopyValues(s, t, stream); _CudaCopyValues(s, t, stream);
return;
}
#endif #endif
if (!s->isSparse && !t->isSparse) { if (!s->isSparse && !t->isSparse) {
...@@ -68,8 +66,28 @@ bool CopyValues(const XTensor * s, XTensor * t, XStream * stream) ...@@ -68,8 +66,28 @@ bool CopyValues(const XTensor * s, XTensor * t, XStream * stream)
else { else {
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
} }
}
/*
copy s to t (return an XTensor structure)
make a new tensor to keep the result and return it
>> s - source
>> stream - the stream for creating the job pipeline
<< return - the copied tensor t
*/
XTensor CopyValues(const XTensor &s, XStream * stream)
{
XTensor t(&s);
t.SetTMP();
/* call _CopyValues function */
_CopyValues(&s, &t, stream);
/* tensor connection */
XLink::MakeLink(&s, NULL, &t, MOVEMENT_COPYVALUES);
return true; return t;
} }
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
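A short, hedged sketch of how the reworked copy interfaces are called; the tensors a and b are assumed to exist already with matching shapes and are not defined in the commit:

/* new interface: build a temporary copy of a and record the MOVEMENT_COPYVALUES link */
XTensor c = CopyValues(a);

/* old-style in-place interface: the caller owns the target tensor b;
   errors are now reported through CheckNTErrors instead of a bool return value */
_CopyValues(&a, &b);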
...@@ -35,11 +35,9 @@ copy a range of elements from a source vector to a target vector ...@@ -35,11 +35,9 @@ copy a range of elements from a source vector to a target vector
>> stream - the stream for creating the job pipeline >> stream - the stream for creating the job pipeline
<< return - succeed or not << return - succeed or not
*/ */
bool CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream) void _CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream)
{ {
if (s == NULL || t == NULL) CheckNTErrors((s != NULL && t != NULL), "The input tensor and output tensor must be nonempty!");
return false;
CheckNTErrors(s->dataType == t->dataType, "Unmatched data type!"); CheckNTErrors(s->dataType == t->dataType, "Unmatched data type!");
CheckNTErrors((s->unitSize == t->unitSize), "Incompatible vectors in value copy."); CheckNTErrors((s->unitSize == t->unitSize), "Incompatible vectors in value copy.");
CheckNTErrors((s->denseRatio <= t->denseRatio), "Incompatible vectors in value copy."); CheckNTErrors((s->denseRatio <= t->denseRatio), "Incompatible vectors in value copy.");
...@@ -83,8 +81,6 @@ bool CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream) ...@@ -83,8 +81,6 @@ bool CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream)
else { else {
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
} }
return true;
} }
......
...@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy all elements from a source matrix to a target matrix */ /* copy all elements from a source matrix to a target matrix */
extern "C" extern "C"
bool CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream = NULL); void _CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream = NULL);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -28,7 +28,13 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -28,7 +28,13 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy s to t */ /* copy s to t */
extern "C" extern "C"
bool CopyValues(const XTensor * s, XTensor * t, XStream * stream = NULL); void _CopyValues(const XTensor * s, XTensor * t, XStream * stream = NULL);
/*
copy s to t (return an XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor CopyValues(const XTensor &s, XStream * stream = NULL);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -27,12 +27,13 @@ ...@@ -27,12 +27,13 @@
namespace nts{ // namespace nts(NiuTrans.Tensor) namespace nts{ // namespace nts(NiuTrans.Tensor)
/* /*
get the max value of the items along a dimension of the tensor. get the max value of the items along a dimension of the tensor
>> input - the input tensor >> input - the input tensor
>> output - the output tensor >> output - the output tensor
>> dim - the dimension where the reduction is performed on >> dim - the dimension where the reduction is performed on
*/ */
void ReduceMax(XTensor * input, XTensor * output, int dim) void _ReduceMax(const XTensor * input, XTensor * output, int dim)
{ {
CheckNTErrors((input->devID == output->devID || (input->devID < 0 && output->devID < 0)), CheckNTErrors((input->devID == output->devID || (input->devID < 0 && output->devID < 0)),
"This code must be run on the same device!"); "This code must be run on the same device!");
...@@ -55,7 +56,7 @@ void ReduceMax(XTensor * input, XTensor * output, int dim) ...@@ -55,7 +56,7 @@ void ReduceMax(XTensor * input, XTensor * output, int dim)
if(input->devID >= 0){ if(input->devID >= 0){
#ifdef USE_CUDA #ifdef USE_CUDA
CudaReduceMax(input, output, dim); _CudaReduceMax(input, output, dim);
#endif #endif
} }
else{ else{
...@@ -90,4 +91,43 @@ void ReduceMax(XTensor * input, XTensor * output, int dim) ...@@ -90,4 +91,43 @@ void ReduceMax(XTensor * input, XTensor * output, int dim)
} }
} }
/*
get the max value of the items along a dimension of the tensor (return an XTensor structure).
make a new tensor to keep the result and return it
>> input - the input tensor
>> dim - the dimension where the reduction is performed on
<< return - the max value of the items along a dimension of the tensor
*/
XTensor ReduceMax(const XTensor &input, int dim)
{
CheckNTErrors(&input, "Empty input tensor!");
CheckNTErrors((dim >= 0 && dim < input.order), "Illegal dimension to reduce!");
int order = input.order - 1;
int * dimSize = new int[order];
for(int i = 0; i < order; i++){
if(i < dim)
dimSize[i] = input.dimSize[i];
else
dimSize[i] = input.dimSize[i + 1];
}
XTensor output = NewTensor(order, dimSize, input.dataType, input.denseRatio, input.devID, input.mem);
output.SetZeroAll();
output.SetTMP();
/* call _ReduceMax function */
_ReduceMax(&input, &output, dim);
/* destroy variables */
delete[] dimSize;
/* tensor connection */
XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCEMAX);
XLink::AddParamToHead(&output, dim);
return output;
}
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
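For illustration only, a sketch of the new ReduceMax interface; the input tensor x (assumed to be a dense matrix of size (2, 4)) and the pre-allocated tensor m2 are made-up examples, not part of the commit:

/* reduce along dimension 1: that dimension is dropped from the result */
XTensor m = ReduceMax(x, 1);    /* m has order 1 and size (2); m[i] is the max over x[i][*] */

/* equivalent in-place call with the renamed low-level function */
_ReduceMax(&x, &m2, 1);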
...@@ -334,7 +334,7 @@ sum_i = max_{0<=j<strideNum} input_{i,j} ...@@ -334,7 +334,7 @@ sum_i = max_{0<=j<strideNum} input_{i,j}
>> output - the output tensor >> output - the output tensor
>> dim - which dimension to reduce >> dim - which dimension to reduce
*/ */
void CudaReduceMax(XTensor * input, XTensor * output, int dim) void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
{ {
CheckNTErrors((input && output), "Empty input or output tensors!"); CheckNTErrors((input && output), "Empty input or output tensors!");
CheckNTErrors((input->order == output->order + 1), "Incorrect tensor sizes!"); CheckNTErrors((input->order == output->order + 1), "Incorrect tensor sizes!");
......
...@@ -30,7 +30,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor) ...@@ -30,7 +30,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/* get the max-valued items along a dimension of the tensor (cuda version) */ /* get the max-valued items along a dimension of the tensor (cuda version) */
extern "C" extern "C"
void CudaReduceMax(XTensor * input, XTensor * output, int dim); void _CudaReduceMax(const XTensor * input, XTensor * output, int dim);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -28,7 +28,13 @@ namespace nts{ // namespace nts(NiuTrans.Tensor) ...@@ -28,7 +28,13 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/* get the max value of the items along a dimension of the tensor. */ /* get the max value of the items along a dimension of the tensor. */
extern "C" extern "C"
void ReduceMax(XTensor * input, XTensor * output, int dim); void _ReduceMax(const XTensor * input, XTensor * output, int dim);
/*
get the max value of the items along a dimension of the tensor (return an XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor ReduceMax(const XTensor &input, int dim);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......