Commit 906eebb7 by xuchen

merge with xuchen branch

parents 7283b3dc b3a76184
......@@ -70,7 +70,7 @@ void XLossGrad::Compute(XTensor * gold, XTensor * y,
XTensor * dedy,
LOSS_FUNCTION_NAME lossName)
{
LossBackward(dedy, gold, y, lossName);
_LossBackward(dedy, gold, y, lossName);
}
}
\ No newline at end of file
......@@ -328,6 +328,37 @@ void XLink::MakeLink(const XList * list, XTensor * h, int id)
}
/*
create a hyper edge with an input tensor and a list of output tensors
>> t - the input tensor
>> list - a list of output tensors
>> id - id of the edge type
*/
void XLink::MakeLink(XTensor * t, XList * list, int id)
{
/* forward */
for(int i = 0; i < list->count; i++){
XTensor * h = (XTensor*)list->GetItem(i);
if(h == NULL)
continue;
XLink &income = h->income;
income.Reset();
income.SetHead(h);
income.SetType(id);
income.AddTail(t);
}
/* backward */
XLink &outgo = t->outgo;
CheckNTErrors(outgo.head == NULL || outgo.head == t, "Wrong head of the hyperedge!");
for(int i = 0; i < list->count; i++){
XTensor * h = (XTensor*)list->GetItem(i);
if(h == NULL)
continue;
outgo.AddTail(h);
}
}
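A minimal usage sketch of this new one-to-many MakeLink (tensor names and the op id are illustrative, not from this commit):
/* record that one input tensor produces several outputs */
XList outputs(2);
outputs.Add(&half0);
outputs.Add(&half1);
XLink::MakeLink(&big, &outputs, SHAPE_SPLIT);
/* each output's income edge now points back to &big,
   and big's outgo edge lists both outputs */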
/*
add parameters
>> h - head
>> param - the parameter we want to introduce
......
......@@ -139,6 +139,10 @@ struct XLink
static
void MakeLink(const XList * list, XTensor * h, int id);
/* create a hyper edge with an input tensor and a list of output tensors */
static
void MakeLink(XTensor * h, XList * list, int id);
/* add a parameter */
static
void AddParamToHead(XTensor * h, DTYPE param);
......
......@@ -26,76 +26,80 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* get operator name */
const char * GetOPName(int type)
{
if((type & MATH_BASE) != 0){
if(type == MATH_ABSOLUTE)
if ((type & MATH_BASE) != 0){
if (type == MATH_ABSOLUTE)
return "M_ABSOLUTE";
else if(type == MATH_MATRIXMUL)
else if (type == MATH_MATRIXMUL)
return "M_MATRIXMUL";
else if(type == MATH_MATRIXMULBATCHED)
else if (type == MATH_MATRIXMULBATCHED)
return "M_MATRIXMULBATCHED";
else if(type == MATH_MULTIPLY)
else if (type == MATH_MULTIPLY)
return "M_MULTIPLY";
else if(type == MATH_NEGATE)
else if (type == MATH_NEGATE)
return "M_NEGATE";
else if(type == MATH_SIGN)
else if (type == MATH_SIGN)
return "M_SIGN";
else if(type == MATH_SUM)
else if (type == MATH_SUM)
return "M_SUM";
else if(type == MATH_LOG)
return "M_NORMALIZE";
else if(type == MATH_NORMALIZE)
else if (type == MATH_LOG)
return "M_LOG";
else if(type == MATH_POWER)
else if (type == MATH_NORMALIZE)
return "M_NORMALIZE";
else if (type == MATH_POWER)
return "M_POWER";
else if(type == MATH_SCALEANDSHIFT)
else if (type == MATH_SCALEANDSHIFT)
return "M_SCALEANDSHIFT";
else if(type == REDUCE_REDUCEMAX)
else if (type == REDUCE_REDUCEMAX)
return "R_REDUCEMAX";
else if(type == REDUCE_REDUCEMEAN)
else if (type == REDUCE_REDUCEMEAN)
return "R_REDUCEMEAN";
else if(type == REDUCE_REDUCESUM)
else if (type == REDUCE_REDUCESUM)
return "R_REDUCESUM";
else if(type == REDUCE_REDUCESUMSQUARED)
else if (type == REDUCE_REDUCESUMSQUARED)
return "R_REDUCESUMSQUARED";
else if(type == REDUCE_REDUCEVARIANCE)
else if (type == REDUCE_REDUCEVARIANCE)
return "R_REDUCEVARIANCE";
}
else if((type & DATA_BASE) != 0){
if(type == GETANDSET_SELECT)
else if ((type & DATA_BASE) != 0){
if (type == GETANDSET_SELECT)
return "G_SELECT";
else if(type == MOVEMENT_COPYINDEXED)
else if (type == MOVEMENT_COPYINDEXED)
return "M_COPYINDEXED";
else if(type == MOVEMENT_COPYVALUES)
else if (type == MOVEMENT_COPYVALUES)
return "M_COPYVALUES";
else if(type == SHAPE_CONCATENATE)
else if (type == SHAPE_CONCATENATE)
return "S_CONCATENATE";
else if(type == SHAPE_MERGE)
else if (type == SHAPE_MERGE)
return "S_MERGE";
else if(type == SHAPE_MERGE_LIST)
else if (type == SHAPE_MERGE_LIST)
return "S_MERGE_LIST";
else if(type == SHAPE_PERMUTE)
else if (type == SHAPE_PERMUTE)
return "S_PERMUTE";
else if(type == SHAPE_SPLIT)
else if (type == SHAPE_SPLIT)
return "S_SPLIT";
else if(type == SHAPE_SPLIT_LIST)
else if (type == SHAPE_SPLIT_LIST)
return "S_SPLIT_LIST";
else if(type == SHAPE_TRANSPOSE)
else if (type == SHAPE_TRANSPOSE)
return "S_TRANSPOSE";
else if(type == SHAPE_UNSQUEEZE)
else if (type == SHAPE_UNSQUEEZE)
return "S_UNSQUEEZE";
else if (type == SORT_SORT)
return "S_SORT";
else if (type == SORT_TOPK)
return "S_TOPK";
}
else if((type & FUNCTION_BASE) != 0){
if(type == FUNC_HARDTANH)
else if ((type & FUNCTION_BASE) != 0){
if (type == FUNC_HARDTANH)
return "F_HARDTANH";
else if(type == FUNC_IDENTITY)
else if (type == FUNC_IDENTITY)
return "F_IDENTITY";
else if(type == FUNC_LOGSOFTMAX)
else if (type == FUNC_LOGSOFTMAX)
return "F_LOGSOFTMAX";
else if(type == FUNC_RECTIFY)
else if (type == FUNC_RECTIFY)
return "F_RECTIFY";
else if(type == FUNC_SIGMOID)
else if (type == FUNC_SIGMOID)
return "F_SIGMOID";
else if(type == FUNC_SOFTMAX)
else if (type == FUNC_SOFTMAX)
return "F_SOFTMAX";
}
......
......@@ -69,6 +69,10 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define SHAPE_TRANSPOSE SHAPE_SPLIT_LIST + 1
#define SHAPE_UNSQUEEZE SHAPE_TRANSPOSE + 1
#define SORT SHAPE_UNSQUEEZE + 1
#define SORT_SORT SORT + 1
#define SORT_TOPK SORT_SORT + 1
/* activation functions */
#define FUNCTION_BASE DATA_BASE * 2
#define FUNC_HARDTANH FUNCTION_BASE + 1
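An editorial aside on these defines (a general C preprocessor point; the parenthesized form below is a sketch, not what the commit adds): the right-hand sides are unparenthesized sums, which happens to be safe for the `+ 1` chains and the equality tests in GetOPName, but parenthesizing macro arithmetic is the defensive form:
/* defensive variant (sketch), assuming the same id scheme */
#define SORT       (SHAPE_UNSQUEEZE + 1)
#define SORT_SORT  (SORT + 1)
#define SORT_TOPK  (SORT_SORT + 1)
/* e.g. SORT_TOPK * 2 would otherwise expand to SORT_SORT + 1 * 2 */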
......
......@@ -1121,7 +1121,7 @@ bool XTensor::Resize(const int myOrder, const int * myDimSize,
if(isSparse){
/*
for sparse matrices, we use a list of tuple (key, value),
ordered by key. Take a (2-dimensional) matrix as an examples,
ordered by key. Take a (2-dimensional) matrix as an example,
we have key = m * i + j;
The data array is
---------
......
......@@ -486,8 +486,9 @@ quick sorting
NOTE: this means that the items may not be placed in a continuous memory space
>> comp - the comparison function
*/
void XQSort(void * data, void * index, int num, int width, int stride, int (*comp)(const void *, const void *))
void XQSort(void * dataA, void * dataB, void * index, int num, int width, int stride, int (*comp)(const void *, const void *))
{
XMemCopy(dataB, -1, dataA, -1, num * width);
char *lo, *hi; // ends of sub-array currently sorting
int *indexlo, *indexhi;
char *mid; // points to middle of subarray
......@@ -506,8 +507,8 @@ void XQSort(void * data, void * index, int num, int width, int stride, int (*com
stackptr = 0;
lo = (char*)data;
hi = (char*)data + realStride * (num - 1);
lo = (char*)dataB;
hi = (char*)dataB + realStride * (num - 1);
indexlo = (int*)index;
indexhi = index != NULL ? (int*)index + stride * (num - 1) : NULL;
......
......@@ -53,7 +53,7 @@ extern void XSleep(int sleepTime);
extern double GetClock();
extern double GetClockSec();
extern void XQSort(void * data, void * index, int num, int width, int stride, int (*comp)(const void *, const void *));
extern void XQSort(void * dataA, void * dataB, void * index, int num, int width, int stride, int (*comp)(const void *, const void *));
extern int CompXFloat(const void * a, const void * b);
#ifdef USE_CUDA
......
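A usage sketch of the revised XQSort, assuming the contract implied by the XMemCopy added above: dataA is copied into dataB and the sort runs on dataB, leaving the source untouched (buffers are illustrative):
/* sort a copy of 'src' into 'dst' without disturbing 'src' */
float src[8] = {3, 1, 4, 1, 5, 9, 2, 6};
float dst[8];
XQSort(src, dst, NULL, 8, sizeof(float), 1, CompXFloat);
/* src is unchanged; dst holds the items in the CompXFloat ordering */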
......@@ -21,6 +21,7 @@
#include <math.h>
#include "../../XTensor.h"
#include "../../XName.h"
#include "Absolute.h"
#include "Absolute.cuh"
......@@ -28,21 +29,54 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
set every entry to its absolute value
>> a - the tensor we are processing
>> a - input tensor we are processing
>> b - output tensor we are processing
*/
void _Absolute(XTensor * a)
void _Absolute(const XTensor * a, XTensor * b)
{
#ifdef USE_CUDA
/* run it on GPUs */
if (a->devID >= 0) {
_CudaAbsolute(a);
_CudaAbsolute(a, b);
return;
}
#endif
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
DTYPE * d = (DTYPE*)a->data;
DTYPE * db = (DTYPE*)b->data;
for (int i = 0; i < a->unitNum; i++)
d[i] = (DTYPE)fabs(d[i]);
db[i] = (DTYPE)fabs(d[i]);
}
/*
set every entry to its absolute value (do it on site)
keep the result in the input tensor a and return nothing
>> a - the tensor we are processing
*/
void _AbsoluteMe(XTensor * a)
{
_Absolute(a, a);
}
/*
set every entry to its absolute value (return a XTensor structure)
make a new tensor to keep the result and return it
>> a - input tensor we are processing
<< return - the absolute value of the input tensor
*/
XTensor Absolute(const XTensor & a)
{
XTensor b(&a);
b.SetTMP();
/* call _Absolute function */
_Absolute(&a, &b);
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_ABSOLUTE);
return b;
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
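The Absolute hunk sets the template that Negate, Sign, Log and Power follow below: a two-tensor _Op, an in-place _OpMe wrapper, and an Op that returns a TMP tensor and records the operation with XLink::MakeLink. A minimal usage sketch (InitTensor2D is assumed from the library's init helpers; sizes illustrative):
XTensor a, b;
InitTensor2D(&a, 2, 3);
InitTensor2D(&b, 2, 3);
_Absolute(&a, &b);         /* write |a| into a preallocated b */
_AbsoluteMe(&a);           /* in-place: a = |a| */
XTensor c = Absolute(a);   /* TMP result wired into the network via XLink */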
......@@ -29,37 +29,41 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
set each entry to its absolute value (CUDA Kernel)
>> d - pointer to the data array
>> a - pointer to input data array
>> b - pointer to output data array
>> size - size of the data array
*/
__global__
void KernelAbsolute(DTYPE * d, int size)
void KernelAbsolute(DTYPE * a, DTYPE * b, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
d[i] = fabs(d[i]);
b[i] = fabs(a[i]);
}
/*
set each entry to its absolute value (CUDA Kernel)
This is for float16 computation
>> d - pointer to the data array
>> a - pointer to input data array
>> b - pointer to output data array
>> size - size of the data array
*/
__global__
void KernelAbsolute(__half * d, int size)
void KernelAbsolute(__half * a, __half * b, int size)
{
return;
}
/*
set each entry to its with float16 data type value
>> a - the tensor
set each entry to its absolute value
>> a - input tensor
>> b - output tensor
*/
extern "C"
void _CudaAbsolute(XTensor * a)
void _CudaAbsolute(const XTensor * a, XTensor * b)
{
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->isSparse == false), "TODO!");
int gridSize[3];
......@@ -74,10 +78,10 @@ void _CudaAbsolute(XTensor * a)
ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) {
KernelAbsolute << <blocks, threads >> >((DTYPE*)a->data, a->unitNum);
KernelAbsolute << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum);
}
else if (a->dataType == X_FLOAT16) {
KernelAbsolute << <blocks, threads >> >((__half*)a->data, a->unitNum);
KernelAbsolute << <blocks, threads >> >((__half*)a->data, (__half*)b->data, a->unitNum);
}
else {
ShowNTErrors("TODO!");
......
......@@ -27,15 +27,15 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* set each entry to its absolute value (CUDA Kernel) */
__global__
void KernelAbsolute(DTYPE * d, int size);
void KernelAbsolute(DTYPE * a, DTYPE * b, int size);
/* set each entry to its absolute value (CUDA Kernel) with float16 data type*/
__global__
void KernelAbsolute(__half * d, int size);
void KernelAbsolute(__half * a, __half * b, int size);
/* set each entry to its absolute value */
extern "C"
void _CudaAbsolute(XTensor * a);
void _CudaAbsolute(const XTensor * a, XTensor * b);
#endif // USE_CUDA
......
......@@ -27,8 +27,19 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* set every entry to its absolute value */
extern "C"
void _Absolute(XTensor * a);
void _Absolute(const XTensor * a, XTensor * b);
/*
set every entry to its absolute value (do it on site)
keep the result in the input tensor a and return nothing
*/
void _AbsoluteMe(XTensor * a);
/*
set every entry to its absolute value (return a XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor Absolute(const XTensor & a);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -38,11 +38,11 @@ c_i = trans(a_i) * trans(b_i) * \alpha + c_i * \beta for each i in [0,count-1]
>> beta - scalar
*/
void _MatrixMULBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA,
const XList * b, MATRIX_TRANS_TYPE transposedB,
XList * c, DTYPE alpha, DTYPE beta)
const XList * b, MATRIX_TRANS_TYPE transposedB,
XList * c, DTYPE alpha, DTYPE beta)
{
CheckNTErrors((a && b && c), "Empty input lists!");
CheckNTErrors((a->count == b->count && a->count == c->count), "Input lists must be of the same size!");
CheckNTErrors(a && b && c, "Empty input lists!");
CheckNTErrors(a->count == b->count && a->count == c->count, "Input lists must be of the same size!");
if (a->count == 0)
return;
......
......@@ -28,8 +28,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* matrix multiplication in batch mode (CPU code) */
extern "C"
void _MatrixMULBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA, const XList * b, MATRIX_TRANS_TYPE transposedB, XList * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
void _MatrixMULBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA, const XList * b, MATRIX_TRANS_TYPE transposedB,
XList * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -66,4 +66,4 @@ XTensor MatrixMul(const XTensor &a, const XTensor &b,
} // namespace nts(NiuTrans.Tensor)
#endif // __MATRIXMUL_H__
#endif // __MATRIXMUL_H__
\ No newline at end of file
......@@ -45,9 +45,9 @@ where trans() return the transposed matrix if the flag is fired
>> stream - the stream for creating the job pipeline
*/
void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta,
XPRunner * parallelRunner, XStream * stream)
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta,
XPRunner * parallelRunner, XStream * stream)
{
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((a->dataType == b->dataType), "Input tensors should have the same data type!");
......
......@@ -122,9 +122,8 @@ where trans() return the transposed matrix if the flag is fired
>> stream - the stream for creating the job pipeline
*/
void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c,
DTYPE alpha, DTYPE beta, XStream * stream)
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta, XStream * stream)
{
int an = transposedA == X_TRANS ? a->dimSize[1] : a->dimSize[0];
int am = transposedA == X_TRANS ? a->dimSize[0] : a->dimSize[1];
......
......@@ -44,7 +44,7 @@ where trans() return the transposed matrix if the flag is fired
*/
extern "C"
void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XStream * stream = NULL);
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XStream * stream = NULL);
#endif // USE_CUDA
......
......@@ -41,8 +41,8 @@ where trans() return the transposed matrix if the flag is fired
>> parallelRunner - parallel processing module
*/
void _MatrixMul2DParallel(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta, XPRunner * parallelRunner)
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta, XPRunner * parallelRunner)
{
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((a->order == 2 && b->order == 2 && c->order == 2),
......
......@@ -32,8 +32,8 @@ c = trans(a) * trans(b) * alpha + c * beta
where trans() returns the transposed matrix if the flag is fired.
*/
extern "C"
void _MatrixMul2DParallel(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
void _MatrixMul2DParallel(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -46,9 +46,8 @@ where trans() returns the transposed matrix if the flag is fired
>> parallelRunner - parallel processing module
*/
void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta,
XPRunner * parallelRunner)
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta, XPRunner * parallelRunner)
{
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((a->dataType == b->dataType && a->dataType == c->dataType),
......@@ -169,14 +168,12 @@ where trans() returns the transposed matrix if the flag is fired.
>> b - tensor b
>> transposedB - indicates whether the matrices in b are transposed
>> alpha - a coefficient
>> beta - another coefficient
>> parallelRunner - parallel processing module
<< return - the result of matrix multiplication of the two tensors
*/
XTensor MatrixMulBatched(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
DTYPE alpha, XPRunner * parallelRunner)
DTYPE alpha, XPRunner * parallelRunner)
{
CheckNTErrors(&a != &NULLTensor && &b != &NULLTensor, "Empty input tensors!");
CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
CheckNTErrors(a.order >= 2 && b.order >= 2, "Input tensors must have a order >= 2!");
CheckNTErrors(a.order == b.order, "Input tensor and output tensor must have same order!");
......@@ -191,13 +188,13 @@ XTensor MatrixMulBatched(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const
int order = a.order;
int sub = 0;
int * dimSize = new int[order];
for (int i = 2; i < a.order; i++)
dimSize[sub++] = a.dimSizeRDI[i];
for (int i = 0; i < a.order - 2; i++)
dimSize[sub++] = a.dimSize[i];
dimSize[sub++] = an;
dimSize[sub++] = bm;
XTensor c = NewTensor(order, dimSize, a.dataType, a.denseRatio, a.devID, a.mem);
c.SetZeroAll();
float dr = (!a.isSparse || !b.isSparse) ? 1.0F : MAX(a.denseRatio, b.denseRatio);
XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
c.SetTMP();
/* call _MatrixMulBatched function */
......
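A worked shape example for the rewritten dimension loop (the old code read dimSizeRDI, which stores dimensions in reverse order; the new loop reads dimSize directly):
/* sketch: a is (4, 2, 3), b is (4, 3, 5), no transposition
   loop i in [0, a.order - 2): dimSize = {4}   (batch dims, natural order)
   then append an = 2 and bm = 5               →  c is (4, 2, 5) */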
......@@ -20,6 +20,7 @@
*/
#include "../../XTensor.h"
#include "../../XName.h"
#include "Negate.h"
#include "Negate.cuh"
......@@ -27,21 +28,55 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
set every entry to its minus value
>> a - the tensor we are processing
>> a - input tensor we are processing
>> b - output tensor we are processing
*/
void _Negate(XTensor * a)
void _Negate(const XTensor * a, XTensor * b)
{
#ifdef USE_CUDA
/* run it on GPUs */
if (a->devID >= 0) {
_CudaNegate(a);
_CudaNegate(a, b);
return;
}
#endif
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
DTYPE * d = (DTYPE*)a->data;
DTYPE * db = (DTYPE*)b->data;
for (int i = 0; i < a->unitNum; i++)
d[i] = -d[i];
db[i] = -d[i];
}
/*
set every entry to its minus value (do it on site)
keep the result in the input tensor a and return nothing
>> a - the tensor we are processing
*/
void _NegateMe(XTensor * a)
{
_Negate(a, a);
}
/*
set every entry to its minus value (return a XTensor structure)
make a new tensor to keep the result and return it
>> a - input tensor we are processing
<< return - the minus value of the input tensor
*/
XTensor Negate(const XTensor & a)
{
XTensor b(&a);
b.SetTMP();
/* call _Negate function */
_Negate(&a, &b);
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_NEGATE);
return b;
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -29,45 +29,49 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
set each entry to its negative value (CUDA Kernel)
>> d - pointer to the data array
>> a - pointer to the input data array
>> b - pointer to the output data array
>> size - size of the data array
*/
__global__
void KernelNegate(DTYPE * d, int size)
void KernelNegate(DTYPE * a, DTYPE * b, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
d[i] = -d[i];
b[i] = -a[i];
}
/*
set each entry to its negative value (CUDA Kernel)
This is for float16 computation
>> d - pointer to the data array
>> a - pointer to the input data array
>> b - pointer to the output data array
>> size - size of the data array
*/
__global__
void KernelNegate(__half * d, int size)
void KernelNegate(__half * a, __half * b, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
if (i < size)
d[i] = __hsub(__float2half(0), d[i]);
b[i] = __hsub(__float2half(0), a[i]);
#else
if (i < size)
d[i] = __float2half(-__half2float(d[i]));
b[i] = __float2half(-__half2float(a[i]));
#endif
}
/*
set each entry to its negative value
>> a - the tensor
>> a - input tensor
>> b - output tensor
*/
extern "C"
void _CudaNegate(XTensor * a)
void _CudaNegate(const XTensor * a, XTensor * b)
{
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->isSparse == false), "TODO!");
int gridSize[3];
......@@ -82,10 +86,10 @@ void _CudaNegate(XTensor * a)
ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) {
KernelNegate << <blocks, threads >> >((DTYPE*)a->data, a->unitNum);
KernelNegate << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum);
}
else if (a->dataType == X_FLOAT16) {
KernelNegate << <blocks, threads >> >((__half*)a->data, a->unitNum);
KernelNegate << <blocks, threads >> >((__half*)a->data, (__half*)b->data, a->unitNum);
}
else {
ShowNTErrors("TODO!");
......
......@@ -30,15 +30,15 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* set each entry to its negative value (CUDA Kernel) */
__global__
void KernelNegate(DTYPE * d, int size);
void KernelNegate(DTYPE * a, DTYPE * b, int size);
/* set each entry to its negative value (CUDA Kernel) with float16 data type*/
__global__
void KernelNegate(__half * d, int size);
void KernelNegate(__half * a, __half * b, int size);
/* set each entry to its negative value */
extern "C"
void _CudaNegate(XTensor * a);
void _CudaNegate(const XTensor * a, XTensor * b);
#endif // USE_CUDA
......
......@@ -27,8 +27,19 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* set every entry to its minus value */
extern "C"
void _Negate(XTensor * a);
void _Negate(const XTensor * a, XTensor * b);
/*
set every entry to its minus value (do it on site)
keep the result in the input tensor a and return nothing
*/
void _NegateMe(XTensor * a);
/*
set every entry to its minus value (return a XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor Negate(const XTensor & a);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -20,6 +20,7 @@
*/
#include "../../XTensor.h"
#include "../../XName.h"
#include "Sign.h"
#include "Sign.cuh"
......@@ -27,27 +28,60 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
set every entry to its sign value
>> a - the tensor we are processing
>> a - input tensor we are processing
>> b - output tensor we are processing
*/
void _Sign(XTensor * a)
void _Sign(const XTensor * a, XTensor * b)
{
#ifdef USE_CUDA
/* run it on GPUs */
if (a->devID >= 0) {
_CudaSign(a);
_CudaSign(a, b);
return;
}
#endif
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
DTYPE * d = (DTYPE*)a->data;
DTYPE * db = (DTYPE*)b->data;
for (int i = 0; i < a->unitNum; i++) {
if (d[i] > 0)
d[i] = 1.0F;
db[i] = 1.0F;
else if (d[i] == 0)
d[i] = 0.0F;
db[i] = 0.0F;
else
d[i] = -1.0F;
db[i] = -1.0F;
}
}
/*
set every entry to its sign value (do it on site)
keep the result in the input tensor a and return nothing
>> a - the tensor we are processing
*/
void _SignMe(XTensor * a)
{
_Sign(a, a);
}
/*
set every entry to its sign value (return a XTensor structure)
make a new tensor to keep the result and return it
>> a - input tensor we are processing
<< return - the sign value of the input tensor
*/
XTensor Sign(const XTensor & a)
{
XTensor b(&a);
b.SetTMP();
/* call _Sign function */
_Sign(&a, &b);
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_SIGN);
return b;
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -29,43 +29,47 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
set each entry to its sign value (CUDA Kernel)
>> d - pointer to the data array
>> a - pointer to input data array
>> b - pointer to output data array
>> size - size of the data array
*/
__global__
void KernelSign(DTYPE * d, int size)
void KernelSign(DTYPE * a, DTYPE * b, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size) {
if (d[i] > 0)
d[i] = 1.0F;
else if (d[i] == 0)
d[i] = 0.0F;
if (a[i] > 0)
b[i] = 1.0F;
else if (a[i] == 0)
b[i] = 0.0F;
else
d[i] = -1.0F;
b[i] = -1.0F;
}
}
/*
set each entry to its sign value (CUDA Kernel)
set each entry to its sign value with float16 data type (CUDA Kernel)
This is for float16 computation
>> d - pointer to the data array
>> a - pointer to input data array
>> b - pointer to output data array
>> size - size of the data array
*/
__global__
void KernelSign(__half * d, int size)
void KernelSign(__half * a, __half * b, int size)
{
return;
}
/*
set each entry to its with float16 data type value
>> a - the tensor
set each entry to its sign value
>> a - input tensor we are processing
>> b - output tensor we are processing
*/
extern "C"
void _CudaSign(XTensor * a)
void _CudaSign(const XTensor * a, XTensor * b)
{
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->isSparse == false), "TODO!");
int gridSize[3];
......@@ -80,10 +84,10 @@ void _CudaSign(XTensor * a)
ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) {
KernelSign << <blocks, threads >> >((DTYPE*)a->data, a->unitNum);
KernelSign << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum);
}
else if (a->dataType == X_FLOAT16) {
KernelSign << <blocks, threads >> >((__half*)a->data, a->unitNum);
KernelSign << <blocks, threads >> >((__half*)a->data, (__half*)b->data, a->unitNum);
}
else {
ShowNTErrors("TODO!");
......
......@@ -30,15 +30,15 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* set each entry to its sign value (CUDA Kernel) */
__global__
void KernelSign(DTYPE * d, int size);
void KernelSign(DTYPE * a, DTYPE * b, int size);
/* set each entry to its sign value (CUDA Kernel) with float16 data type*/
__global__
void KernelSign(__half * d, int size);
void KernelSign(__half * a, __half * b, int size);
/* set each entry to its sign value */
extern "C"
void _CudaSign(XTensor * a);
void _CudaSign(const XTensor * a, XTensor * b);
#endif // USE_CUDA
......
......@@ -27,8 +27,19 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* set every entry to its sign value */
extern "C"
void _Sign(XTensor * a);
void _Sign(const XTensor * a, XTensor * b);
/*
set every entry to its sign value (do it on site)
keep the result in the input tensor a and return nothing
*/
void _SignMe(XTensor * a);
/*
set every entry to its sign value (return a XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor Sign(const XTensor & a);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -37,8 +37,8 @@ c = trans(a) * trans(b) * \alpha + c * \beta
>> c - output matrix (2d tensor)
*/
void _MatrixMULCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta)
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta)
{
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((a->order == 2 && b->order == 2 && c->order == 2),
......
......@@ -89,11 +89,11 @@ void _CudaBLASMatrixMUL(cublasHandle_t * handle,
matrix multiplication via cuda version BLAS
*/
void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
const void ** a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
const void ** b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
void ** c, TENSOR_DATA_TYPE dataTypeC,
int count, int na, int ma, int nb, int mb, int nc, int mc,
DTYPE alpha, DTYPE beta)
const void ** a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
const void ** b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
void ** c, TENSOR_DATA_TYPE dataTypeC,
int count, int na, int ma, int nb, int mb, int nc, int mc,
DTYPE alpha, DTYPE beta)
{
/*
matrix-matrix multiplication
......@@ -145,11 +145,11 @@ void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
/* matrix multiplication in batch and strided mode via cuda version BLAS */
extern "C"
void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, long long int strideA,
const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, long long int strideB,
void * c, TENSOR_DATA_TYPE dataTypeC, long long int strideC,
int count, int na, int ma, int nb, int mb, int nc, int mc,
DTYPE alpha, DTYPE beta)
const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, long long int strideA,
const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, long long int strideB,
void * c, TENSOR_DATA_TYPE dataTypeC, long long int strideC,
int count, int na, int ma, int nb, int mb, int nc, int mc,
DTYPE alpha, DTYPE beta)
{
/*
matrix-matrix multiplication
......@@ -202,10 +202,10 @@ void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
matrix multiplication via cuda version BLAS
*/
void _CudaBLASMatrixMULList(cublasHandle_t * handle,
const XList * a, MATRIX_TRANS_TYPE transposedA,
const XList * b, MATRIX_TRANS_TYPE transposedB,
XList * c,
int count, DTYPE alpha, DTYPE beta)
const XList * a, MATRIX_TRANS_TYPE transposedA,
const XList * b, MATRIX_TRANS_TYPE transposedB,
XList * c,
int count, DTYPE alpha, DTYPE beta)
{
CheckNTErrors((a && b && c), "Empty input lists!");
CheckNTErrors((a->count == b->count && a->count == c->count), "Input lists must be of the same size!");
......
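For readers new to the strided-batched entry point, the addressing convention it exposes is the standard cuBLAS one (as in cublasSgemmStridedBatched, to which a wrapper like this would typically forward); a sketch, with strides counted in elements:
/* matrix i of each operand sits at a fixed element offset:
   a_i = (DTYPE*)a + i * strideA
   b_i = (DTYPE*)b + i * strideB
   c_i = (DTYPE*)c + i * strideC      for i in [0, count) */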
......@@ -28,38 +28,42 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* matrix multiplication (BLAS) */
extern "C"
void _MatrixMULCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
void _MatrixMULCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
#ifdef USE_CUDA
/* matrix multiplication via cuda version BLAS */
extern "C"
void _CudaBLASMatrixMUL(cublasHandle_t * handle,
const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
void * c, TENSOR_DATA_TYPE dataTypeC,
int na, int ma, int nb, int mb, int nc, int mc, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
void * c, TENSOR_DATA_TYPE dataTypeC,
int na, int ma, int nb, int mb, int nc, int mc, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
/* matrix multiplication in batch mode via cuda version BLAS */
extern "C"
void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
const void ** a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
const void ** b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
void ** c, TENSOR_DATA_TYPE dataTypeC,
int count, int na, int ma, int nb, int mb, int nc, int mc, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
const void ** a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
const void ** b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
void ** c, TENSOR_DATA_TYPE dataTypeC,
int count, int na, int ma, int nb, int mb, int nc, int mc,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
/* matrix multiplication in batch and strided mode via cuda version BLAS */
extern "C"
void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, long long int strideA,
const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, long long int strideB,
void * c, TENSOR_DATA_TYPE dataTypeC, long long int strideC,
int count, int na, int ma, int nb, int mb, int nc, int mc, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, long long int strideA,
const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, long long int strideB,
void * c, TENSOR_DATA_TYPE dataTypeC, long long int strideC,
int count, int na, int ma, int nb, int mb, int nc, int mc,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
/* matrix multiplication in batch mode via cuda version BLAS */
extern "C"
void _CudaBLASMatrixMULList(cublasHandle_t * handle, const XList * a, MATRIX_TRANS_TYPE transposedA, const XList * b, MATRIX_TRANS_TYPE transposedB, XList * c,
int count, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
void _CudaBLASMatrixMULList(cublasHandle_t * handle, const XList * a, MATRIX_TRANS_TYPE transposedA,
const XList * b, MATRIX_TRANS_TYPE transposedB, XList * c,
int count, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
#endif
} // namespace nts(NiuTrans.Tensor)
......
......@@ -96,7 +96,6 @@ XTensor SelectRange(const XTensor &a, int dim, int low, int high)
int order = a.order;
int * dimSize = new int[order];
CheckNTErrors(&a != NULL, "Empty input tensors!");
CheckNTErrors(dim >= 0 && dim < a.order, "The input dimension is out of bounds!");
CheckNTErrors(low < high, "Illegal range specified!");
......@@ -110,8 +109,8 @@ XTensor SelectRange(const XTensor &a, int dim, int low, int high)
dimSize[i] = a.dimSize[i];
}
XTensor c = NewTensor(order, dimSize, a.dataType, a.denseRatio, a.devID, a.mem);
c.SetZeroAll();
float dr = (!a.isSparse) ? 1.0F : a.denseRatio;
XTensor c(order, dimSize, a.dataType, dr, a.devID, a.mem);
c.SetTMP();
/* call _SelectRange function */
......
......@@ -20,6 +20,7 @@
*/
#include "../../XTensor.h"
#include "../../XName.h"
#include "Log.h"
#include "Log.cuh"
#include <math.h>
......@@ -27,22 +28,55 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
set every entry to its log value
>> a - the tensor we are processing
set every entry to its log value
>> a - input tensor we are processing
>> b - output tensor we are processing
*/
void _Log(XTensor * a)
void _Log(const XTensor * a, XTensor * b)
{
#ifdef USE_CUDA
/* run it on GPUs */
if (a->devID >= 0) {
_CudaLog(a);
_CudaLog(a, b);
return;
}
#endif
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
DTYPE * d = (DTYPE*)a->data;
DTYPE * db = (DTYPE*)b->data;
for (int i = 0; i < a->unitNum; i++)
d[i] = (DTYPE)log(d[i]);
db[i] = (DTYPE)log(d[i]);
}
/*
set every entry to its log value
keep the result in the input tensor a and return nothing
>> a - the tensor we are processing
*/
void _LogMe(XTensor * a)
{
_Log(a, a);
}
/*
set every entry to its log value (return a XTensor structure)
make a new tensor to keep the result and return it
>> a - input tensor we are processing
<< return - the log value of the input tensor
*/
XTensor Log(const XTensor & a)
{
XTensor b(&a);
b.SetTMP();
/* call _Log function */
_Log(&a, &b);
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_LOG);
return b;
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -29,37 +29,41 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
set each entry to its log value (CUDA Kernel)
>> d - pointer to the data array
>> a - pointer to input data array
>> b - pointer to output data array
>> size - size of the data array
*/
__global__
void KernelLog(DTYPE * d, int size)
void KernelLog(DTYPE * a, DTYPE * b, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
d[i] = log(d[i]);
b[i] = log(a[i]);
}
/*
set each entry to its log value (CUDA Kernel)
This is for float16 computation
>> d - pointer to the data array
>> a - pointer to input data array
>> b - pointer to output data array
>> size - size of the data array
*/
__global__
void KernelLog(__half * d, int size)
void KernelLog(__half * a, __half * b, int size)
{
return;
}
/*
set each entry to its log value
>> a - the tensor
>> a - input tensor
>> b - output tensor
*/
extern "C"
void _CudaLog(XTensor * a)
void _CudaLog(const XTensor * a, XTensor * b)
{
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((a->isSparse == false), "TODO!");
int gridSize[3];
......@@ -74,10 +78,10 @@ void _CudaLog(XTensor * a)
ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE) {
KernelLog << <blocks, threads >> >((DTYPE*)a->data, a->unitNum);
KernelLog << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum);
}
else if (a->dataType == X_FLOAT16) {
KernelLog << <blocks, threads >> >((__half*)a->data, a->unitNum);
KernelLog << <blocks, threads >> >((__half*)a->data, (__half*)b->data, a->unitNum);
}
else {
ShowNTErrors("TODO!");
......
......@@ -30,15 +30,15 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* set each entry to its log value (CUDA Kernel) */
__global__
void KernelLog(DTYPE * d, int size);
void KernelLog(DTYPE * a, DTYPE * b, int size);
/* set each entry to its log value (CUDA Kernel) with float16 data type*/
__global__
void KernelLog(__half * d, int size);
void KernelLog(__half * a, __half * b, int size);
/* set each entry to its log value */
extern "C"
void _CudaLog(XTensor * a);
void _CudaLog(const XTensor * a, XTensor * b);
#endif // USE_CUDA
......
......@@ -27,8 +27,19 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* set every entry to its log value */
extern "C"
void _Log(XTensor * a);
void _Log(const XTensor * a, XTensor * b);
/*
set every entry to its log value (do it on site)
keep the result in the input tensor a and return nothing
*/
void _LogMe(XTensor * a);
/*
set every entry to its log value (return a XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor Log(const XTensor & a);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -21,6 +21,7 @@
#include <math.h>
#include "../../XTensor.h"
#include "../../XName.h"
#include "Power.h"
#include "Power.cuh"
......@@ -28,38 +29,73 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
get the power(a, p)
>> a - the tensor
>> p - as it is
>> a - input tensor
>> b - output tensor
>> p - parameter
*/
void _Power(XTensor * a, DTYPE p)
void _Power(const XTensor * a, XTensor * b, DTYPE p)
{
#ifdef USE_CUDA
/* run it on GPUs */
if (a->devID >= 0) {
_CudaPower(a, p);
_CudaPower(a, b, p);
return;
}
#endif
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
DTYPE * d = (DTYPE*)a->data;
DTYPE * aData = (DTYPE*)a->data;
DTYPE * bData = (DTYPE*)b->data;
if (p == 0) {
for (int i = 0; i < a->unitNum; i++)
d[i] = (DTYPE)1.0;
bData[i] = (DTYPE)1.0;
}
else if (p == (DTYPE)0.5) {
for (int i = 0; i < a->unitNum; i++)
d[i] = (DTYPE)sqrt(d[i]);
bData[i] = (DTYPE)sqrt(aData[i]);
}
else if (p == (DTYPE)2.0) {
for (int i = 0; i < a->unitNum; i++)
d[i] = d[i] * d[i];
bData[i] = aData[i] * aData[i];
}
else {
for (int i = 0; i < a->unitNum; i++)
d[i] = (DTYPE)pow(d[i], p);
bData[i] = (DTYPE)pow(aData[i], p);
}
}
/*
get the power(a, p) (do it on site)
keep the result in the input tensor a and return nothing
>> a - the tensor
>> p - parameter
*/
void _PowerMe(XTensor * a, DTYPE p)
{
_Power(a, a, p);
}
/*
get the power(a, p) (return a XTensor structure)
make a new tensor to keep the result and return it
>> a - input tensor
>> p - parameter
<< return - the power value of the input tensor
*/
XTensor Power(const XTensor & a, DTYPE p)
{
XTensor b(&a);
b.SetTMP();
/* call _Power function */
_Power(&a, &b, p);
/* tensor connections */
XLink::MakeLink(&a, NULL, &b, MATH_POWER);
XLink::AddParamToHead(&b, p);
return b;
}
} // namespace nts(NiuTrans.Tensor)
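Power is the first refactored unary op here that takes a scalar argument, which is why its functional form also records p on the link (XLink::AddParamToHead above) for any later backward pass. A usage sketch of the special-cased exponents (InitTensor2D is an assumed init helper; sizes illustrative):
XTensor x;
InitTensor2D(&x, 2, 2);
XTensor ones = Power(x, 0.0F);   /* fast path: every entry becomes 1 */
XTensor root = Power(x, 0.5F);   /* fast path: sqrt per entry */
XTensor sq   = Power(x, 2.0F);   /* fast path: square per entry */
XTensor gen  = Power(x, 3.7F);   /* generic pow() fallback */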
......@@ -21,6 +21,7 @@
#include "../../XDevice.h"
#include "../../XTensor.h"
#include "../movement/CopyValues.cuh"
#include "Power.h"
#include "Power.cuh"
......@@ -30,74 +31,80 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
set each entry to its square root (CUDA Kernel)
>> d - data array
>> a - input data array
>> b - output data array
>> size - size of the data array
*/
__global__
void KernelSqrtV2(DTYPE * d, int size)
void KernelSqrtV2(DTYPE * a, DTYPE * b, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
d[i] = sqrt(d[i]);
b[i] = sqrt(a[i]);
}
/*
set each entry to its square root (CUDA Kernel)
>> d - data array
>> a - input data array
>> b - output data array
>> size - size of the data array
*/
__global__
void KernelSqrtV2(__half * d, int size)
void KernelSqrtV2(__half * a, __half * b, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
if (i < size)
d[i] = hsqrt(d[i]);
b[i] = hsqrt(a[i]);
#else
if (i < size)
d[i] = __float2half(sqrt(__half2float(d[i])));
b[i] = __float2half(sqrt(__half2float(a[i])));
#endif
}
/*
get power(d[i], p)
>> d - data array
>> a - input data array
>> b - output data array
>> p - power
>> size - size of the data array
*/
__global__
void KernelPower(DTYPE * d, DTYPE p, int size)
void KernelPower(DTYPE * a, DTYPE * b, DTYPE p, int size)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
d[i] = pow(d[i], p);
b[i] = pow(a[i], p);
}
/*
get power(d[i], p)
>> d - data array
>> a - input data array
>> b - output data array
>> p - power
>> size - size of the data array
*/
__global__
void KernelPower(__half * d, __half p, int size)
void KernelPower(__half * a, __half * b, __half p, int size)
{
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
#else
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < size)
d[i] = __float2half(pow(__half2float(d[i]), __half2float(p)));
b[i] = __float2half(pow(__half2float(a[i]), __half2float(p)));
#endif
}
/* get the power of the entries */
extern "C"
void _CudaPower(XTensor * a, DTYPE p)
void _CudaPower(const XTensor * a, XTensor * b, DTYPE p)
{
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
int gridSize[3];
int blockSize[3];
......@@ -111,15 +118,18 @@ void _CudaPower(XTensor * a, DTYPE p)
if (a->dataType == DEFAULT_DTYPE) {
if (p == (DTYPE)0.5) {
KernelSqrtV2 << <blocks, threads >> >((DTYPE*)a->data, a->unitNum);
KernelSqrtV2 << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, a->unitNum);
}
else if (p == (DTYPE)1.0) {
_CudaCopyValues(a, b);
}
else if (p != (DTYPE)1.0) {
KernelPower << <blocks, threads >> >((DTYPE*)a->data, p, a->unitNum);
KernelPower << <blocks, threads >> >((DTYPE*)a->data, (DTYPE*)b->data, p, a->unitNum);
}
}
else if (a->dataType == X_FLOAT16) {
if (p == (DTYPE)0.5) {
KernelSqrtV2 << <blocks, threads >> >((__half*)a->data, a->unitNum);
KernelSqrtV2 << <blocks, threads >> >((__half*)a->data, (__half*)b->data, a->unitNum);
}
else if (p != (DTYPE)1.0) {
ShowNTErrors("TODO!");
......
......@@ -30,15 +30,15 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* set each entry to its square root (CUDA Kernel) */
__global__
void KernelSqrtV2(DTYPE * d, int size);
void KernelSqrtV2(DTYPE * a, DTYPE * b, int size);
/* set each entry to its square root (CUDA Kernel) */
__global__
void KernelSqrtV2(__half * d, int size);
void KernelSqrtV2(__half * a, __half * b, int size);
/* get the power of the entries */
extern "C"
void _CudaPower(XTensor * a, DTYPE p);
void _CudaPower(const XTensor * a, XTensor * b, DTYPE p);
#endif // USE_CUDA
......
......@@ -27,8 +27,19 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* get the power(x, y) */
extern "C"
void _Power(XTensor * a, DTYPE p);
void _Power(const XTensor * a, XTensor * b, DTYPE p);
/*
get the power(x, y) (do it on site)
keep the result in the input tensor a and return nothing
*/
void _PowerMe(XTensor * a, DTYPE p);
/*
get the power(x, y) (return a XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor Power(const XTensor & a, DTYPE p);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -110,8 +110,7 @@ make a new tensor to keep the result and return it
*/
XTensor CopyIndexed(const XTensor &s, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum)
{
CheckNTErrors(&s, "Empty input tensor!");
CheckNTErrors((dim >= 0 && dim < s.order), "A too larget dimension specified!");
CheckNTErrors(dim >= 0 && dim < s.order, "A too large dimension specified!");
int order = s.order;
int * dimSize = new int[order];
......@@ -123,16 +122,13 @@ XTensor CopyIndexed(const XTensor &s, int dim, int * srcIndex, int indexSize, in
dimSize[i] = s.dimSize[i];
}
XTensor t = NewTensor(order, dimSize, s.dataType, s.denseRatio, s.devID, s.mem);
t.SetZeroAll();
float dr = (!s.isSparse) ? 1.0F : s.denseRatio;
XTensor t(order, dimSize, s.dataType, dr, s.devID, s.mem);
t.SetTMP();
/* call _CopyIndexed function */
_CopyIndexed(&s, &t, dim, srcIndex, indexSize, tgtIndex, copyNum);
/* destroy variables */
delete[] dimSize;
/* tensor connection */
XLink::MakeLink(&s, NULL, &t, MOVEMENT_COPYINDEXED);
XLink::AddParamToHeadInt(&t, dim);
......@@ -140,7 +136,10 @@ XTensor CopyIndexed(const XTensor &s, int dim, int * srcIndex, int indexSize, in
XLink::AddParamToHeadInt(&t, indexSize);
XLink::AddParamToHeadPointer(&t, tgtIndex);
XLink::AddParamToHeadInt(&t, copyNum);
/* destroy variables */
delete[] dimSize;
return t;
}
......
......@@ -101,32 +101,31 @@ make a new tensor to keep the result and return it
*/
XTensor ReduceMax(const XTensor &input, int dim)
{
CheckNTErrors(&input, "Empty input or output tensors!");
CheckNTErrors((dim >= 0 && dim < input.order), "Illegal dimension to reduce!");
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
int order = input.order - 1;
int * dimSize = new int[order];
for(int i = 0; i < input.order; i++){
for(int i = 0; i < order; i++){
if(i < dim)
dimSize[i] = input.dimSize[i];
else if(i > dim)
else if(i >= dim)
dimSize[i] = input.dimSize[i + 1];
}
XTensor output = NewTensor(order, dimSize, input.dataType, input.denseRatio, input.devID, input.mem);
output.SetZeroAll();
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
XTensor output(order, dimSize, input.dataType, dr, input.devID, input.mem);
output.SetTMP();
/* call _ReduceMax function */
_ReduceMax(&input, &output, dim);
/* destroy variables */
delete[] dimSize;
/* tensor connection */
XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCEMAX);
XLink::AddParamToHeadInt(&output, dim);
/* destroy variables */
delete[] dimSize;
return output;
}
......
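A worked example of the loop fix (the same correction recurs in ReduceMean, ReduceSum, ReduceSumSquared and ReduceVariance below): reducing dim = 1 of a (2, 3, 4) input must give (2, 4):
/* sketch: order = input.order - 1 = 2
   i = 0:  i < dim   →  dimSize[0] = input.dimSize[0] = 2
   i = 1:  i >= dim  →  dimSize[1] = input.dimSize[2] = 4
   the old loop ran i up to input.order, never set dimSize[dim],
   and wrote one slot past the end of dimSize */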
......@@ -58,20 +58,19 @@ For a 1-dimensional data array a, mean = (1/n) * sum_i input_i
*/
XTensor ReduceMean(const XTensor &input, int dim)
{
CheckNTErrors(&input, "Empty input or output tensors!");
CheckNTErrors((dim >= 0 && dim < input.order), "Illegal dimension to reduce!");
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
int order = input.order - 1;
int * dimSize = new int[order];
for(int i = 0; i < input.order; i++){
for(int i = 0; i < order; i++){
if(i < dim)
dimSize[i] = input.dimSize[i];
else if(i > dim)
else if(i >= dim)
dimSize[i] = input.dimSize[i + 1];
}
XTensor output = NewTensor(order, dimSize, input.dataType, input.denseRatio, input.devID, input.mem);
output.SetZeroAll();
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
XTensor output(order, dimSize, input.dataType, dr, input.devID, input.mem);
output.SetTMP();
/* call _ReduceMean function */
......@@ -87,4 +86,4 @@ XTensor ReduceMean(const XTensor &input, int dim)
return output;
}
} // namespace nts(NiuTrans.Tensor)
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -214,20 +214,19 @@ sum = \sum_i exp((a_i - shift)^power) if isExp == true
*/
XTensor ReduceSum(const XTensor &input, int dim, const XTensor &shift, DTYPE power, bool isExp)
{
CheckNTErrors(&input, "Empty input or output tensors!");
CheckNTErrors((dim >= 0 && dim < input.order), "Illegal dimension to reduce!");
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
int order = input.order - 1;
int * dimSize = new int[order];
for(int i = 0; i < input.order; i++){
for(int i = 0; i < order; i++){
if(i < dim)
dimSize[i] = input.dimSize[i];
else if(i > dim)
else if(i >= dim)
dimSize[i] = input.dimSize[i + 1];
}
XTensor output = NewTensor(order, dimSize, input.dataType, input.denseRatio, input.devID, input.mem);
output.SetZeroAll();
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
XTensor output(order, dimSize, input.dataType, dr, input.devID, input.mem);
output.SetTMP();
/* call _ReduceSum function */
......@@ -237,6 +236,53 @@ XTensor ReduceSum(const XTensor &input, int dim, const XTensor &shift, DTYPE pow
XLink::MakeLink(&input, &shift, &output, REDUCE_REDUCESUM);
XLink::AddParamToHeadInt(&output, dim);
XLink::AddParamToHead(&output, power);
XLink::AddParamToHeadBool(&output, isExp);
/* destroy variables */
delete[] dimSize;
return output;
}
/*
sum the items along a dimension of the tensor (return a XTensor structure)
make a new tensor to keep the result and return it
For a 1-dimensional data array a,
sum = \sum_i (a_i)^power if isExp == false
sum = \sum_i exp((a_i)^power) if isExp == true
>> input - the input tensor
>> dim - the dimension where the reduction is performed on
>> power - we perform pow(item_i, power) on each item in the array
>> isExp - specify whether exp() is performed
<< return - the sum along a dimension of the tensor
*/
XTensor ReduceSum(const XTensor &input, int dim, DTYPE power, bool isExp)
{
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
int order = input.order - 1;
int * dimSize = new int[order];
for(int i = 0; i < order; i++){
if(i < dim)
dimSize[i] = input.dimSize[i];
else if(i >= dim)
dimSize[i] = input.dimSize[i + 1];
}
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
XTensor output(order, dimSize, input.dataType, dr, input.devID, input.mem);
output.SetTMP();
/* call _ReduceSum function */
_ReduceSum(&input, &output, dim, NULL, power, isExp);
/* tensor connection */
XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCESUM);
XLink::AddParamToHeadInt(&output, dim);
XLink::AddParamToHead(&output, power);
XLink::AddParamToHeadBool(&output, isExp);
/* destroy variables */
delete[] dimSize;
......
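A usage sketch contrasting the two ReduceSum overloads after this hunk (tensors illustrative; the new overload passes NULL for the shift internally):
XTensor s1 = ReduceSum(input, 1, shift);         /* \sum_i (a_i - shift)  */
XTensor s2 = ReduceSum(input, 1);                /* \sum_i a_i, no shift  */
XTensor s3 = ReduceSum(input, 1, 2.0F, true);    /* \sum_i exp((a_i)^2)   */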
......@@ -43,7 +43,16 @@ For a 1-dimensional data array a,
sum = \sum_i (a_i - shift)^power if isExp == false
sum = \sum_i exp((a_i - shift)^power) if isExp == true
*/
XTensor ReduceSum(const XTensor &input, int dim, const XTensor &shift = NULL, DTYPE power = (DTYPE)1.0F, bool isExp = false);
XTensor ReduceSum(const XTensor &input, int dim, const XTensor &shift, DTYPE power = (DTYPE)1.0F, bool isExp = false);
/*
sum the items along a dimension of the tensor (return a XTensor structure)
make a new tensor to keep the result and return it
For a 1-dimensional data array a,
sum = \sum_i (a_i)^power if isExp == false
sum = \sum_i exp((a_i)^power) if isExp == true
*/
XTensor ReduceSum(const XTensor &input, int dim, DTYPE power = (DTYPE)1.0F, bool isExp = false);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -54,25 +54,24 @@ For a 1-dimensional data array a, sum = \sum_i (a_i - shift)^2
*/
XTensor ReduceSumSquared(const XTensor &input, int dim, const XTensor &shift)
{
CheckNTErrors(&input, "Empty input or output tensors!");
CheckNTErrors((dim >= 0 && dim < input.order), "Illegal dimension to reduce!");
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
int order = input.order - 1;
int * dimSize = new int[order];
for(int i = 0; i < input.order; i++){
for(int i = 0; i < order; i++){
if(i < dim)
dimSize[i] = input.dimSize[i];
else if(i > dim)
else if(i >= dim)
dimSize[i] = input.dimSize[i + 1];
}
XTensor output = NewTensor(order, dimSize, input.dataType, input.denseRatio, input.devID, input.mem);
output.SetZeroAll();
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
XTensor output(order, dimSize, input.dataType, dr, input.devID, input.mem);
output.SetTMP();
/* call _ReduceSumSquared function */
_ReduceSumSquared(&input, &output, dim, &shift);
/* tensor connection */
XLink::MakeLink(&input, &shift, &output, REDUCE_REDUCESUMSQUARED);
XLink::AddParamToHeadInt(&output, dim);
......@@ -83,4 +82,4 @@ XTensor ReduceSumSquared(const XTensor &input, int dim, const XTensor &shift)
return output;
}
} // namespace nts(NiuTrans.Tensor)
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -19,6 +19,7 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../../XName.h"
#include "../math/ScaleAndShift.h"
#include "ReduceSum.h"
#include "ReduceVariance.h"
......@@ -56,24 +57,27 @@ For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2
*/
XTensor ReduceVariance(const XTensor &input, int dim, const XTensor &mean)
{
CheckNTErrors(&input, "Empty input or output tensors!");
CheckNTErrors((dim >= 0 && dim < input.order), "Illegal dimension to reduce!");
CheckNTErrors(dim >= 0 && dim < input.order, "Illegal dimension to reduce!");
int order = input.order - 1;
int * dimSize = new int[order];
for(int i = 0; i < input.order; i++){
for(int i = 0; i < order; i++){
if(i < dim)
dimSize[i] = input.dimSize[i];
else if(i > dim)
else if(i >= dim)
dimSize[i] = input.dimSize[i + 1];
}
XTensor output = NewTensor(order, dimSize, input.dataType, input.denseRatio, input.devID, input.mem);
output.SetZeroAll();
float dr = (!input.isSparse) ? 1.0F : input.denseRatio;
XTensor output(order, dimSize, input.dataType, dr, input.devID, input.mem);
output.SetTMP();
/* call _ReduceVariance function */
_ReduceVariance(&input, &output, dim, &mean);
/* tensor connection */
XLink::MakeLink(&input, &mean, &output, REDUCE_REDUCEVARIANCE);
XLink::AddParamToHeadInt(&output, dim);
/* destroy variables */
delete[] dimSize;
......
......@@ -36,7 +36,7 @@ concatenate a list of tensors along a given dimension
*/
void _ConcatenateSolely(const XList * smalls, XTensor * big, int dim)
{
CheckNTErrors((big->order > dim && dim >= 0), "Illegal dimension to concatenate!");
CheckNTErrors(big->order > dim && dim >= 0, "Illegal dimension to concatenate!");
int catDimSize = 0;
int dimRDI = big->order - dim - 1;
......
......@@ -36,7 +36,7 @@ set target data block index for the data movement in merge
>> mem - the memory pool
*/
void _MakeMergeBlockIndex(int * blockIndex, int blockNum, int blockNumInMerge,
int splitSizeInGrid, int gridSize, int gridNum, XMem * mem)
int splitSizeInGrid, int gridSize, int gridNum, XMem * mem)
{
if (mem != NULL && mem->devID >= 0) {
#ifdef USE_CUDA
......
......@@ -40,7 +40,7 @@ set target data block index for the data movement in split (device code)
*/
__global__
void KernelMakeMergeBlockIndex(int * blockIndex, int blockNum, int blockNumInMerge,
int splitSizeInGrid, int gridSize, int gridNum)
int splitSizeInGrid, int gridSize, int gridNum)
{
/* block index */
int i = blockDim.x * blockIdx.x + threadIdx.x;
......@@ -71,8 +71,8 @@ set target data block index for the data movement in split
*/
extern "C"
void _CudaMakeMergeBlockIndex(int devID,
int * blockIndex, int blockNum, int blockNumInMerge,
int splitSizeInGrid, int gridSize, int gridNum)
int * blockIndex, int blockNum, int blockNumInMerge,
int splitSizeInGrid, int gridSize, int gridNum)
{
int cudaGrids[3];
int cudaBlocks[3];
......
......@@ -30,9 +30,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* set target data block index for the data movement in split */
extern "C"
void _CudaMakeMergeBlockIndex(int devID,
int * blockIndex, int blockNum, int blockNumInMerge,
int splitSizeInGrid, int gridSize, int gridNum);
void _CudaMakeMergeBlockIndex(int devID, int * blockIndex, int blockNum, int blockNumInMerge,
int splitSizeInGrid, int gridSize, int gridNum);
#endif // USE_CUDA
......
......@@ -29,7 +29,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* set target data block index for the data movement in merge */
extern "C"
void _MakeMergeBlockIndex(int * blockIndex, int blockNum, int blockNumInMerge,
int splitSizeInGrid, int gridSize, int gridNum, XMem * mem);
int splitSizeInGrid, int gridSize, int gridNum, XMem * mem);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -161,8 +161,7 @@ e.g., (N/3, M, 3) -> (N, M)
*/
XTensor Merge(const XTensor &s, int whereToMerge, int leadingDim)
{
CheckNTErrors(&s != NULL, "Invalid tensors!");
CheckNTErrors((leadingDim < whereToMerge), "Invalid leading dimension!");
CheckNTErrors(leadingDim < whereToMerge, "Invalid leading dimension!");
if (leadingDim < 0)
leadingDim = 0;
......@@ -180,8 +179,8 @@ XTensor Merge(const XTensor &s, int whereToMerge, int leadingDim)
}
}
XTensor t = NewTensor(order, dimSize, s.dataType, s.denseRatio, s.devID, s.mem);
t.SetZeroAll();
float dr = (!s.isSparse) ? 1.0F : s.denseRatio;
XTensor t(order, dimSize, s.dataType, dr, s.devID, s.mem);
t.SetTMP();
/* call _Merge function */
......@@ -314,6 +313,7 @@ void _Merge(const XList * smalls, XTensor * big, int whereToMerge)
/*
merge small tensors into a big tensor (return a XTensor structure)
make a new tensor to keep the result and return it
>> smalls - the list of the small tensors
>> whereToMerge - the merging operation is along with which dimension
......@@ -331,15 +331,13 @@ XTensor Merge(const XList &smalls, int whereToMerge)
dimSize[i] = tensor->dimSize[whereToMerge] * smalls.count;
}
XTensor big = NewTensor(order, dimSize,
tensor->dataType, tensor->denseRatio,
tensor->devID, tensor->mem);
big.SetZeroAll();
float dr = (!tensor->isSparse) ? 1.0F : tensor->denseRatio;
XTensor big(order, dimSize, tensor->dataType, dr, tensor->devID, tensor->mem);
big.SetTMP();
/* call _Merge function */
_Merge(&smalls, &big, whereToMerge);
/* tensor connections */
XLink::MakeLink(&smalls, &big, SHAPE_MERGE_LIST);
XLink::AddParamToHeadInt(&big, whereToMerge);
......@@ -370,10 +368,8 @@ XTensor Merge(const XTensor &smallA, const XTensor &smallB, int whereToMerge)
dimSize[i] = smallA.dimSize[whereToMerge] * 2;
}
XTensor big = NewTensor(order, dimSize,
smallA.dataType, smallA.denseRatio,
smallA.devID, smallA.mem);
big.SetZeroAll();
float dr = (!smallA.isSparse) ? 1.0F : smallA.denseRatio;
XTensor big(order, dimSize, smallA.dataType, dr, smallA.devID, smallA.mem);
big.SetTMP();
XList smalls(2);
......
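A usage sketch for the user-level Merge introduced above (shapes and data are illustrative; the call mirrors the Merge tests further down):

int dimSize[3] = {2, 2, 3};
XTensor * s = NewTensor(3, dimSize);
s->SetData(sData, 12);                 /* sData assumed prepared elsewhere */
XTensor t = Merge(*s, 1, 0);           /* merge along dim 1, leading dim 0 */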
......@@ -148,21 +148,21 @@ XTensor Split(const XTensor &s, int whereToSplit, int splitNum)
int order = s.order + 1;
int * dimSize = new int[order];
dimSize[0] = splitNum;
for (int i = 0; i < s.order; i++) {
if (i == whereToSplit)
dimSize[i + 1] = s.dimSize[i] / splitNum;
dimSize[i+1] = s.dimSize[i] / splitNum;
else
dimSize[i + 1] = s.dimSize[i];
dimSize[i+1] = s.dimSize[i];
}
dimSize[0] = splitNum;
XTensor t = NewTensor(order, dimSize, s.dataType, s.denseRatio, s.devID, s.mem);
t.SetZeroAll();
float dr = (!s.isSparse) ? 1.0F : s.denseRatio;
XTensor t(order, dimSize, s.dataType, dr, s.devID, s.mem);
t.SetTMP();
/* call _Split function */
_Split(&s, &t, whereToSplit, splitNum);
/* tensor connections */
XLink::MakeLink(&s, NULL, &t, SHAPE_SPLIT);
XLink::AddParamToHeadInt(&t, whereToSplit);
......@@ -175,7 +175,7 @@ XTensor Split(const XTensor &s, int whereToSplit, int splitNum)
}
/*
split a big tensor into small tensors.
split a big tensor into small tensors
>> big - the source tensor
>> smalls - the list that keeps the resulting tensors (for return)
......@@ -281,41 +281,19 @@ void _Split(const XTensor * big, XList * smalls, int whereToSplit, int splitNum)
}
/*
split a big tensor into small tensors (return a XList structure).
make a new list to keep the result and return it.
split a big tensor into small tensors
>> big - the source tensor
>> smalls - the list that keeps the resulting tensors (for return)
NOTE that all the "small" tensors have already been placed in the list in advance.
>> whereToSplit - which dimension of the tensor is to split
>> splitNum - how many splits
<< return - a list of small tensors by splitting a big tensor
*/
XList SplitList(const XTensor &big, int whereToSplit, int splitNum)
void Split(const XTensor &big, XList &smalls, int whereToSplit, int splitNum)
{
CheckNTErrors(&big, "Invalid tensors!");
XList smalls = XList(splitNum);
int order = big.order;
int * dimSize = new int[order];
for (int i = 0; i < big.order; i++) {
if (i != whereToSplit)
dimSize[i] = big.dimSize[i];
else
dimSize[i] = big.dimSize[i] / splitNum;
}
for (int i = 0; i < splitNum; i++) {
XTensor tensor = NewTensor(order, dimSize,
big.dataType, big.denseRatio,
big.devID, big.mem);
tensor.SetZeroAll();
tensor.SetTMP();
smalls.Add(&tensor);
}
/* call _Split function */
_Split(&big, &smalls, whereToSplit, splitNum);
/* tensor connections */
for(int i = 0; i < smalls.count; i++){
XTensor * s = (XTensor*)smalls.Get(i);
......@@ -326,11 +304,6 @@ XList SplitList(const XTensor &big, int whereToSplit, int splitNum)
block, rather than the total number of splits */
XLink::AddParamToHeadInt(s, i);
}
/* destroy variables */
delete[] dimSize;
return smalls;
}
} // namespace nts(NiuTrans.Tensor)
......@@ -46,7 +46,7 @@ void _Split(const XTensor * big, XList * smalls, int whereToSplit, int splitNum)
split a big tensor into small tensors (the list of output tensors is provided)
NOTE that all the "small" tensors have already been placed in the list in advance
*/
XList SplitList(const XTensor &big, int whereToSplit, int splitNum);
void Split(const XTensor &big, XList &smalls, int whereToSplit, int splitNum);
} // namespace nts(NiuTrans.Tensor)
......
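With SplitList gone, the caller now owns the output tensors. A usage sketch of the replacement (illustrative; s0 and s1 are small tensors pre-allocated with the split shape, as the NOTE above requires):

XList smalls(2);
smalls.Add(s0);                        /* pre-allocated small tensor */
smalls.Add(s1);
Split(big, smalls, 0, 2);              /* split dimension 0 into 2 pieces */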
......@@ -108,8 +108,6 @@ make a new tensor to keep the result and return it
*/
XTensor Unsqueeze(const XTensor &a, int dim, int dSize)
{
CheckNTErrors(&a, "Empty input tensors!");
int order = a.order + 1;
int * dimSize = new int[order];
......@@ -122,8 +120,8 @@ XTensor Unsqueeze(const XTensor &a, int dim, int dSize)
dimSize[i] = a.dimSize[i - 1];
}
XTensor b = NewTensor(order, dimSize, a.dataType, a.denseRatio, a.devID, a.mem);
b.SetZeroAll();
float dr = (!a.isSparse) ? 1.0F : a.denseRatio;
XTensor b(order, dimSize, a.dataType, dr, a.devID, a.mem);
b.SetTMP();
/* call _Unsqueeze function */
......
......@@ -29,13 +29,14 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
sort the tensor along a given dimension
>> a - the tensor
>> a - input tensor
>> b - output tensor
>> index - index of the items in the resulting tensor
>> dim - the dimension along which the sorting is performed
*/
void _Sort(XTensor * a, XTensor * index, int dim)
void _Sort(const XTensor * a, XTensor * b, XTensor * index, int dim)
{
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
CheckNTErrors((dim >= 0 && dim < a->order), "Incorrect dimension specified!");
CheckNTErrors((a->order == index->order), "Unmatched input tensors!");
CheckNTErrors((index->dataType == X_INT), "Wrong data type!");
......@@ -46,7 +47,7 @@ void _Sort(XTensor * a, XTensor * index, int dim)
if (a->devID >= 0) {
#ifdef USE_CUDA
_CudaSortBig(a, a, index, index, dim);
_CudaSortBig(a, b, index, index, dim);
#else
ShowNTErrors("Plesae specify USE_CUDA and recompile the code!");
#endif
......@@ -64,12 +65,13 @@ void _Sort(XTensor * a, XTensor * index, int dim)
for (int k = 0; k < blockNum; k++) {
for (int i = 0; i < stride; i++) {
void * data = (char*)a->data + (k * blockSize + i) * a->unitSize;
void * dataA = (char*)a->data + (k * blockSize + i) * a->unitSize;
void * dataB = (char*)b->data + (k * blockSize + i) * b->unitSize;
void * indexData = (char*)index->data + (k * blockSize + i) * sizeof(int);
/* we sort the data array along "dim" */
if (a->dataType == X_FLOAT)
XQSort(data, indexData, strideNum, a->unitSize, stride, CompXFloat);
XQSort(dataA, dataB, indexData, strideNum, a->unitSize, stride, CompXFloat);
else {
ShowNTErrors("TODO!");
}
......@@ -78,4 +80,40 @@ void _Sort(XTensor * a, XTensor * index, int dim)
}
}
/*
sort the tensor along a given dimension (do it on site)
keep the result in the input tensor a and return nothing
>> a - input tensor
>> index - index of the items in the resulting tensor
>> dim - the dimension along which the sorting is performed
*/
void _SortMe(XTensor * a, XTensor * index, int dim)
{
_Sort(a, a, index, dim);
}
/*
sort the tensor along a given dimension (user-level interface)
keep the result in the output tensor b and the sorting order in the index tensor
>> a - input tensor
>> b - output tensor
>> index - index of the items in the resulting tensor
>> dim - the dimension along which the sorting is performed
*/
void Sort(XTensor & a, XTensor & b, XTensor & index, int dim)
{
/* call _Sort function */
_Sort(&a, &b, &index, dim);
/* tensor connections */
XList list(2);
list.Add(&b);
list.Add(&index);
XLink::MakeLink(&a, &list, SORT_SORT);
XLink::AddParamToHeadInt(&b, dim);
XLink::AddParamToHeadInt(&index, dim);
}
} // namespace nts(NiuTrans.Tensor)
......@@ -39,7 +39,7 @@ bitonic sort (for each row in a matrix)
>> n - row number of the matrix
*/
template<class T> __global__
void KernelBitonicSort2D(void * data, int j, int k, int m, int n)
void KernelBitonicSort2D(void * data, int j, int k, int m, int n)
{
const unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x;
const unsigned int row = blockDim.y * blockIdx.y + threadIdx.y;
......@@ -74,7 +74,7 @@ bitonic sort (for each row in a matrix) with index
>> n - row number of the matrix
*/
template<class T> __global__
void KernelBitonicSort2D(void * data, int * index, int j, int k, int m, int n)
void KernelBitonicSort2D(void * data, int * index, int j, int k, int m, int n)
{
const unsigned int idx = blockDim.x * blockIdx.x + threadIdx.x;
const unsigned int row = blockDim.y * blockIdx.y + threadIdx.y;
......
......@@ -27,8 +27,20 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* sort the data along a given dimension */
void _Sort(const XTensor * a, XTensor * b, XTensor * index, int dim);
/*
sort the data along a given dimension (do it on site)
keep the result in the input tensor a and return nothing
*/
void _SortMe(XTensor * a, XTensor * index, int dim);
/*
sort the data along a given dimension (user-level interface)
keep the result in the output tensor b and the sorting order in the index tensor
*/
extern "C"
void _Sort(XTensor * a, XTensor * index, int dim);
void Sort(XTensor & a, XTensor & b, XTensor & index, int dim);
} // namespace nts(NiuTrans.Tensor)
......
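Putting the new sorting interface together, a usage sketch (sizes illustrative; _Sort checks that index has the same order as a and is X_INT; the NewTensor overload with an explicit data type and devID -1 for CPU is an assumption):

int dimSize[2] = {2, 4};
XTensor * a = NewTensor(2, dimSize);
XTensor * b = NewTensor(2, dimSize);
XTensor * index = NewTensor(2, dimSize, X_INT, 1.0F, -1);
a->SetData(aData, 8);                  /* aData assumed prepared */
Sort(*a, *b, *index, 1);               /* b gets the values, index the order */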
......@@ -105,4 +105,29 @@ void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
}
}
}
/*
get the top-k items along a given dimension
>> a - input tensor
>> b - output tensor (top-k result)
>> index - index of the top-k items
>> dim - the dimension along which the sorting is performed
>> k - how many items returned after sorting
*/
void TopK(XTensor &a, XTensor &b, XTensor &index, int dim, int k)
{
_TopK(&a, &b, &index, dim, k);
/* tensor connection */
XList list(2);
list.Add(&b);
list.Add(&index);
XLink::MakeLink(&a, &list, SORT_TOPK);
XLink::AddParamToHeadInt(&b, dim);
XLink::AddParamToHeadInt(&index, k);
XLink::AddParamToHeadInt(&b, dim);
XLink::AddParamToHeadInt(&index, k);
}
} // namespace nts(NiuTrans.Tensor)
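A usage sketch for the new user-level TopK (illustrative; b and index are sized k along the search dimension; the X_INT index tensor and devID -1 for CPU are assumptions about the available NewTensor overloads):

int sDim[2] = {2, 4};
int tDim[2] = {2, 2};                  /* k = 2 along dimension 1 */
XTensor * a = NewTensor(2, sDim);
XTensor * b = NewTensor(2, tDim);
XTensor * index = NewTensor(2, tDim, X_INT, 1.0F, -1);
a->SetData(aData, 8);                  /* aData assumed prepared */
TopK(*a, *b, *index, 1, 2);            /* top-2 items along dimension 1 */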
......@@ -30,6 +30,10 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
extern "C"
void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k);
/* get the top-k items along a given dimension */
extern "C"
void TopK(XTensor &a, XTensor &b, XTensor &index, int dim, int k);
} // namespace nts(NiuTrans.Tensor)
#endif // __TOPK_H__
\ No newline at end of file
......@@ -60,7 +60,9 @@ void _HardTanH(const XTensor * x, XTensor * y)
}
/*
hard tanh function (return a structure)
hard tanh function (return a XTensor structure)
make a new tensor to keep the result and return it
y = 1 if x > 1
x if -1 <= x <= 1
-1 if x < -1
......@@ -72,8 +74,10 @@ XTensor HardTanH(const XTensor &x)
XTensor y(&x);
y.SetTMP();
/* call _HardTanH function */
_HardTanH(&x, &y);
/* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_HARDTANH);
return y;
......@@ -116,7 +120,7 @@ void _HardTanHBackward(XTensor * gold, XTensor * y, XTensor * x,
{
/* calculate dE/dy */
if(lossName != NOLOSS)
LossBackward(dedy, gold, y, lossName);
_LossBackward(dedy, gold, y, lossName);
DTYPE * dedyp = (DTYPE*)dedy->data;
DTYPE * dedxp = (DTYPE*)dedx->data;
......
......@@ -137,7 +137,7 @@ void _CudaHardTanHBackward(XTensor * gold, XTensor * y, XTensor * x,
/* calculate dE/dy */
if(lossName != NOLOSS)
LossBackward(dedy, gold, y, lossName);
_LossBackward(dedy, gold, y, lossName);
int gridSize[3], blockSize[3];
......
......@@ -37,7 +37,7 @@ y = 1 if x > 1
*/
void _HardTanH(const XTensor * x, XTensor * y);
/* hard tanh function (return a structure) */
/* hard tanh function (return a XTensor structure) */
XTensor HardTanH(const XTensor &x);
/* de/dx */
......
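A worked example of the clipping defined above, using the new user-level call (values illustrative):

float xData[4] = {-2.0F, -0.5F, 0.3F, 1.7F};
/* expected y:     -1.0F, -0.5F, 0.3F, 1.0F   (clipped to [-1, 1]) */
int dimSize[1] = {4};
XTensor * x = NewTensor(1, dimSize);
x->SetData(xData, 4);
XTensor y = HardTanH(*x);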
......@@ -19,6 +19,7 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-27
*/
#include "../XName.h"
#include "Identity.h"
#include "../XUtility.h"
#include "../core/movement/CopyValues.h"
......@@ -36,6 +37,26 @@ void _Identity(const XTensor * x, XTensor * y)
}
/*
identity function y = x (return a XTensor structure)
make a new tensor to keep the result and return it
>> x - input tensor
<< return - y
*/
XTensor Identity(const XTensor &x)
{
XTensor y(&x);
y.SetTMP();
/* call _Identity function */
_Identity(&x, &y);
/* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_IDENTITY);
return y;
}
/*
backward computation for identity function y = x
dE/dx = dE/dy * dy/dx = dE/dy
......@@ -58,7 +79,7 @@ void _IdentityBackward(XTensor * gold, XTensor * y, XTensor * x,
{
/* calculate dE/dy */
if(lossName != NOLOSS)
LossBackward(dedy, gold, y, lossName);
_LossBackward(dedy, gold, y, lossName);
if(dedy->data != dedx->data)
_CopyValues(dedy, dedx);
......
......@@ -28,11 +28,12 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* identity function y = x */
extern "C"
void _Identity(const XTensor * x, XTensor * y);
/* identity function y = x (return a XTensor structure) */
XTensor Identity(const XTensor &x);
/* de/dx */
extern "C"
void _IdentityBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
LOSS_FUNCTION_NAME lossName);
......
......@@ -161,19 +161,23 @@ void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
ShowNTErrors("TODO!");
}
/*
log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (return a structure)
/*
log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (return a XTensor structure)
make a new tensor to keep the result and return it
>> x - input vector
>> leadDim - leading dimension (along which we perform reduction)
<< return - result
<< return - y
*/
XTensor LogSoftmax(const XTensor &x, int leadDim)
{
XTensor y(&x);
y.SetTMP();
/* call _LogSoftmax function */
_LogSoftmax(&x, &y, leadDim);
/* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_LOGSOFTMAX);
XLink::AddParamToHeadInt(&y, leadDim);
......
......@@ -30,7 +30,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) */
void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim);
/* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (return a structure) */
/* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (return a XTensor structure) */
XTensor LogSoftmax(const XTensor &x, int leadDim);
/* de/dx */
......
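For reference, the numerically stable way to evaluate the formula above is to subtract the per-row maximum before exponentiating; whether _LogSoftmax does this internally is not visible in these hunks, though the ReduceMax include in Softmax.cpp below points the same way:

y_i = (x_i - m) - log \sum_{j} e^{x_j - m},   where m = max_{j} x_j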
......@@ -42,7 +42,7 @@ compute the loss
>> oBeg - where to start in the model output (along the leading dimension)
<< return - error in model prediction with respect to gold standard
*/
DTYPE LossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
DTYPE _LossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
bool isLogOutput, int leadDim, int gBeg, int gLen, int oBeg)
{
DTYPE error = 0.0F;
......@@ -66,7 +66,7 @@ DTYPE LossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
blockNum = output->unitNum / blockSize;
if(isLogOutput)
return LossComputeForLogScale(gold, output, LFName, leadDim, gBeg, gLen, oBeg);
return _LossComputeForLogScale(gold, output, LFName, leadDim, gBeg, gLen, oBeg);
DTYPE * gp = (DTYPE*)gold->data;
DTYPE * op = (DTYPE*)output->data;
......@@ -180,7 +180,7 @@ DTYPE LossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
}
else {
#ifdef USE_CUDA
error = CudaLossCompute(gold, output, LFName, isLogOutput, leadDim, gBeg, gLen, oBeg);
error = _CudaLossCompute(gold, output, LFName, isLogOutput, leadDim, gBeg, gLen, oBeg);
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
......@@ -201,7 +201,7 @@ the log version of loss computation
>> oBeg - where to start in the model output (along the leading dimension)
<< return - error in model prediction with respect to gold standard
*/
DTYPE LossComputeForLogScale(XTensor * gold, XTensor * output,
DTYPE _LossComputeForLogScale(XTensor * gold, XTensor * output,
LOSS_FUNCTION_NAME LFName,
int leadDim, int gBeg, int gLen, int oBeg)
{
......@@ -343,7 +343,7 @@ with respect to gold standard, and y is the model output
>> LFName - name of loss function
<< return dE/dy
*/
DTYPE LossBackwardPoint(DTYPE t, DTYPE y, LOSS_FUNCTION_NAME LFName)
DTYPE _LossBackwardPoint(DTYPE t, DTYPE y, LOSS_FUNCTION_NAME LFName)
{
/*
squared error
......@@ -380,7 +380,7 @@ with respect to gold standard, and y is the model output
>> tLen - segment length from tBeg (along the leading dimension)
>> yBeg - where to start in the model output (along the leading dimension)
*/
void LossBackward(XTensor * dedy, XTensor * t, XTensor * y,
void _LossBackward(XTensor * dedy, XTensor * t, XTensor * y,
LOSS_FUNCTION_NAME LFName,
int leadDim, int tBeg, int tLen, int yBeg)
{
......@@ -496,7 +496,7 @@ void LossBackward(XTensor * dedy, XTensor * t, XTensor * y,
}
else {
#ifdef USE_CUDA
CudaLossBackward(dedy, t, y, LFName, leadDim, tBeg, tLen, yBeg);
_CudaLossBackward(dedy, t, y, LFName, leadDim, tBeg, tLen, yBeg);
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
......
......@@ -51,7 +51,7 @@ compute the loss
>> yBeg - where to start in the model output (along the leading dimension)
<< return - error in model prediction with respect to gold standard
*/
DTYPE CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
DTYPE _CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
bool isLogOutput, int leadDim, int gBeg, int gLen, int yBeg)
{
CheckNTErrors((gLen >= 0 && gLen <= y->unitNum), "Illegal input length!");
......@@ -65,7 +65,7 @@ DTYPE CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
CheckNTErrors((gLen == gold->dimSize[leadDim] && gBeg == 0 && yBeg == 0), "TODO!");
if(isLogOutput)
return LossComputeForLogScale(gold, y, LFName, leadDim, gBeg, gLen, yBeg);
return _LossComputeForLogScale(gold, y, LFName, leadDim, gBeg, gLen, yBeg);
DTYPE error = 0.0F;
......@@ -77,7 +77,7 @@ DTYPE CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
if(LFName == SQUAREDERROR){
XTensor * diff = NewTensor(gold->order, gold->dimSize, gold->dataType, gold->denseRatio, gold->devID, gold->mem);
_Sum(gold, y, diff, -1.0F);
_Power(diff, 2.0F);
_PowerMe(diff, 2.0F);
_ScaleAndShiftMe(diff, 0.5F, 0.0F);
int reduceTimes = diff->order;
......@@ -110,9 +110,9 @@ DTYPE CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
if(LFName == CROSSENTROPY){
XTensor * diff = NewTensor(y->order, y->dimSize, y->dataType, y->denseRatio, y->devID, y->mem);
_CopyValues(y, diff);
_Log(diff);
_LogMe(diff);
_Multiply(gold, diff, diff);
_Negate(diff);
_NegateMe(diff);
int reduceTimes = diff->order;
for (int i = 0; i < reduceTimes; i++) {
......@@ -148,7 +148,7 @@ DTYPE CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
_CopyValues(y, yOnehot);
_Multiply(gold, y, yOnehot);
_Sum(gold, yOnehot, diff, -1.0F);
_Power(diff, 2.0F);
_PowerMe(diff, 2.0F);
_ScaleAndShiftMe(diff, 0.5F, 0.0F);
int reduceTimes = diff->order;
......@@ -190,7 +190,7 @@ the log version of loss computation
>> yBeg - where to start in the model output (along the leading dimension)
<< return - error in model prediction with respect to gold standard
*/
DTYPE CudaLossComputeForLogScale(XTensor * gold, XTensor * y,
DTYPE _CudaLossComputeForLogScale(XTensor * gold, XTensor * y,
LOSS_FUNCTION_NAME LFName,
int leadDim, int gBeg, int gLen, int yBeg)
{
......@@ -209,9 +209,9 @@ with respect to gold standard, and y is the model output
>> LFName - name of loss function
<< return dE/dy
*/
DTYPE CudaLossBackward(DTYPE t, DTYPE y, LOSS_FUNCTION_NAME LFName)
DTYPE _CudaLossBackward(DTYPE t, DTYPE y, LOSS_FUNCTION_NAME LFName)
{
return LossBackwardPoint(t, y, LFName);
return _LossBackwardPoint(t, y, LFName);
// TODO: call cuda kernels for computing the errors
}
......@@ -328,7 +328,7 @@ with respect to gold standard, and y is the model output
>> tLen - segment length from oBeg (along the leading dimension)
>> yBeg - where to start in the model output (along the leading dimension)
*/
void CudaLossBackward(XTensor * dedy, XTensor * t, XTensor * y,
void _CudaLossBackward(XTensor * dedy, XTensor * t, XTensor * y,
LOSS_FUNCTION_NAME LFName,
int leadDim, int tBeg, int tLen, int yBeg)
{
......
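The op chains above compose the textbook losses directly: _Sum(gold, y, diff, -1.0F) followed by _PowerMe(diff, 2.0F) and _ScaleAndShiftMe(diff, 0.5F, 0.0F) builds the squared error, while _LogMe, _Multiply and _NegateMe build the cross entropy. With t the gold standard and y the model output:

squared error:  E = 1/2 * \sum_{i} (t_i - y_i)^2
cross entropy:  E = -\sum_{i} t_i * log(y_i)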
......@@ -31,21 +31,21 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/* compute the loss (cuda version) */
extern "C"
DTYPE CudaLossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
DTYPE _CudaLossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
bool isLogOutput, int leadDim, int gBeg, int gLen, int oBeg);
/* compute the loss in log scale (cuda version) */
extern "C"
DTYPE CudaLossComputeForLogScale(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
DTYPE _CudaLossComputeForLogScale(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
int leadDim, int gBeg, int gLen, int oBeg);
/* backward computation for a single element (cuda version) */
extern "C"
DTYPE CudaLossBackwardPoint(DTYPE t, DTYPE y, LOSS_FUNCTION_NAME LFName);
DTYPE _CudaLossBackwardPoint(DTYPE t, DTYPE y, LOSS_FUNCTION_NAME LFName);
/* backward computation for (dense) vectors (cuda version) */
extern "C"
void CudaLossBackward(XTensor * dedy, XTensor * t, XTensor * y,
void _CudaLossBackward(XTensor * dedy, XTensor * t, XTensor * y,
LOSS_FUNCTION_NAME LFName,
int leadDim = -1, int tBeg = 0, int tLen = -1, int yBeg = 0);
......
......@@ -48,21 +48,21 @@ loss function to measure the "number" of errors
/* compute the loss */
extern "C"
DTYPE LossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
DTYPE _LossCompute(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
bool isLogOutput, int leadDim, int gBeg, int gLen, int oBeg);
/* compute the loss (log version) */
extern "C"
DTYPE LossComputeForLogScale(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
DTYPE _LossComputeForLogScale(XTensor * gold, XTensor * output, LOSS_FUNCTION_NAME LFName,
int leadDim, int gBeg, int gLen, int oBeg);
/* backward computation for a single element */
extern "C"
DTYPE LossBackwardPoint(DTYPE t, DTYPE y, LOSS_FUNCTION_NAME LFName);
DTYPE _LossBackwardPoint(DTYPE t, DTYPE y, LOSS_FUNCTION_NAME LFName);
/* backward computation for (dense) vectors */
extern "C"
void LossBackward(XTensor * dEdY, XTensor * t, XTensor * y,
void _LossBackward(XTensor * dEdY, XTensor * t, XTensor * y,
LOSS_FUNCTION_NAME LFName,
int leadDim = -1, int tBeg = 0, int tLen = -1, int yBeg = 0);
......
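A usage sketch of the renamed entry point, mirroring the calls in the updated tests below (gold and output assumed allocated with matching shapes):

DTYPE error = _LossCompute(gold, output, SQUAREDERROR,
                           false,                      /* isLogOutput */
                           0,                          /* leadDim */
                           0, output->dimSize[0], 0);  /* gBeg, gLen, oBeg */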
......@@ -19,6 +19,7 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../XName.h"
#include "Rectify.h"
#include "Rectify.cuh"
......@@ -55,6 +56,28 @@ void _Rectify(const XTensor * x, XTensor * y)
}
/*
rectify function y = max(0, x) (return a XTensor structure)
make a new tensor to keep the result and return it
>> x - input tensor
<< return - y
*/
XTensor Rectify(const XTensor &x)
{
XTensor y(&x);
y.SetTMP();
/* call _Rectify function */
_Rectify(&x, &y);
/* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_RECTIFY);
return y;
}
/*
backward computation
dE/dx = dE/dy * dy/dx
......@@ -94,7 +117,7 @@ void _RectifyBackward(XTensor * gold, XTensor * y, XTensor * x,
{
/* calculate dE/dy */
if(lossName != NOLOSS)
LossBackward(dedy, gold, y, lossName);
_LossBackward(dedy, gold, y, lossName);
DTYPE * dedyp = (DTYPE*)dedy->data;
DTYPE * dedxp = (DTYPE*)dedx->data;
......
......@@ -134,7 +134,7 @@ void _CudaRectifyBackward(XTensor * gold, XTensor * y, XTensor * x,
/* calculate dE/dy */
if(lossName != NOLOSS)
CudaLossBackward(dedy, gold, y, lossName);
_CudaLossBackward(dedy, gold, y, lossName);
int gridSize[3], blockSize[3];
......
......@@ -28,11 +28,12 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* rectify function y = max(0, x) */
extern "C"
void _Rectify(const XTensor * x, XTensor * y);
/* rectify function y = max(0, x) (return a XTensor structure) */
XTensor Rectify(const XTensor &x);
/* de/dx */
extern "C"
void _RectifyBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
LOSS_FUNCTION_NAME lossName);
......
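A worked example of y = max(0, x) with the new user-level call (values illustrative):

float xData[4] = {-1.5F, 0.0F, 0.5F, 2.3F};
/* expected y:      0.0F, 0.0F, 0.5F, 2.3F */
int dimSize[1] = {4};
XTensor * x = NewTensor(1, dimSize);
x->SetData(xData, 4);
XTensor y = Rectify(*x);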
......@@ -19,6 +19,7 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-25
*/
#include "../XName.h"
#include <math.h>
#include "Sigmoid.h"
#include "Sigmoid.cuh"
......@@ -53,6 +54,27 @@ void _Sigmoid(const XTensor * x, XTensor * y)
}
/*
sigmoid function y = 1/(1+exp(-x)) (return a XTensor structure)
make a new tensor to keep the result and return it
>> x - input tensor
<< return - y
*/
XTensor Sigmoid(const XTensor &x)
{
XTensor y(&x);
y.SetTMP();
/* call _Sigmoid function */
_Sigmoid(&x, &y);
/* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_SIGMOID);
return y;
}
/*
backward computation
dE/dx = dE/dy * dy/dx
......@@ -86,7 +108,7 @@ void _SigmoidBackward(XTensor * gold, XTensor * y, XTensor * x,
{
/* calculate dE/dy */
if(lossName != NOLOSS)
LossBackward(dedy, gold, y, lossName);
_LossBackward(dedy, gold, y, lossName);
DTYPE * dedyp = (DTYPE*)dedy->data;
DTYPE * dedxp = (DTYPE*)dedx->data;
......
......@@ -129,7 +129,7 @@ void _CudaSigmoidBackward(XTensor * gold, XTensor * y, XTensor * x,
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
/* calculate dE/dy */
if(lossName != NOLOSS)
LossBackward(dedy, gold, y, lossName);
_LossBackward(dedy, gold, y, lossName);
int gridSize[3], blockSize[3];
......
......@@ -28,11 +28,12 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* sigmoid function y = 1/(1+exp(-x)) */
extern "C"
void _Sigmoid(const XTensor * x, XTensor * y);
/* sigmoid function y = 1/(1+exp(-x)) (return a XTensor structure) */
XTensor Sigmoid(const XTensor &x);
/* de/dx */
extern "C"
void _SigmoidBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
LOSS_FUNCTION_NAME lossName);
......
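For the backward declaration above, the standard identity for y = 1/(1+exp(-x)) gives the gradient without re-evaluating the exponential; the kernel body is not shown here, but this is the usual computation:

dy/dx = y * (1 - y)
dE/dx = dE/dy * y * (1 - y)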
......@@ -22,6 +22,7 @@
#include <math.h>
#include "Softmax.h"
#include "Softmax.cuh"
#include "../XName.h"
#include "../XUtility.h"
#include "../core/reduce/ReduceSum.h"
#include "../core/reduce/ReduceMax.h"
......@@ -130,6 +131,28 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
}
/*
softmax y = e^x / \sum_{i} e^{x_i} (return a XTensor structure)
make a new tensor to keep the result and return it
>> x - input vector
>> leadDim - leading dimension (along which we perform reduction)
<< return - y
*/
XTensor Softmax(const XTensor &x, int leadDim)
{
XTensor y(&x);
y.SetTMP();
/* call _Softmax function */
_Softmax(&x, &y, leadDim);
/* tensor connection */
XLink::MakeLink(&x, NULL, &y, FUNC_SOFTMAX);
return y;
}
/*
backward computation for dense tensors
dE/dx = dE/dy * dy/dx
......
......@@ -31,7 +31,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* softmax y = e^x / \sum_{i} e^{x_i} (Cuda version) */
extern "C"
void _CudaSotmax(const XTensor * input, XTensor * output, int leadDim);
void _CudaSoftmax(const XTensor * input, XTensor * output, int leadDim);
/* softmax y = e^x / \sum_{i} e^{x_i} (Cuda version) */
extern "C"
......
......@@ -28,11 +28,12 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* softmax y = e^x / \sum_{i} e^{x_i} */
extern "C"
void _Softmax(const XTensor * x, XTensor * y, int leadDim);
/* softmax y = e^x / \sum_{i} e^{x_i} (return a XTensor structure) */
XTensor Softmax(const XTensor &x, int leadDim);
/* de/dx */
extern "C"
void _SoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
XTensor * dedy, XTensor * dedx,
int leadDim,
......
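A usage sketch of the user-level Softmax (mirrors the LogSoftmax test calls below; leadDim selects the dimension that is normalized):

int dimSize[2] = {2, 3};
XTensor * x = NewTensor(2, dimSize);
x->SetData(xData, 6);                  /* xData assumed prepared */
XTensor y = Softmax(*x, 1);            /* each row of y now sums to 1 */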
......@@ -51,15 +51,21 @@ bool TestAbsolute1()
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(aOrder, aDimSize);
XTensor * aMe = NewTensor(aOrder, aDimSize);
XTensor bUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
aMe->SetData(aData, aUnitNum);
/* call Absolute function */
_Absolute(a);
_Absolute(a, b);
_AbsoluteMe(aMe);
bUser = Absolute(*a);
/* check results */
cpuTest = a->CheckData(answer, aUnitNum, 1e-4F);
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && aMe->CheckData(answer, aUnitNum, 1e-4F) && bUser.CheckData(answer, aUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
......@@ -67,25 +73,37 @@ bool TestAbsolute1()
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor bUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
aMeGPU->SetData(aData, aUnitNum);
/* call Absolute function */
_Absolute(aGPU);
_Absolute(aGPU, bGPU);
_AbsoluteMe(aMeGPU);
bUserGPU = Absolute(*aGPU);
/* check results */
gpuTest = aGPU->CheckData(answer, aUnitNum, 1e-4F);
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete a;
delete b;
delete aMe;
delete aGPU;
delete bGPU;
delete aMeGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete aMe;
delete[] aDimSize;
return cpuTest;
......
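The Absolute test above is the template the rest of the updated tests follow: each op is exercised in its three forms and all results are checked against the same answer. Schematically, with _Op, _OpMe and Op as placeholders rather than library names:

_Op(a, b);                 /* explicit output: read a, write b  */
_OpMe(aMe);                /* in-place: overwrite aMe           */
XTensor bUser = Op(*a);    /* user-level: returns a TMP tensor  */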
......@@ -76,6 +76,7 @@ bool TestConcatenate1()
XTensor * s1 = NewTensor(sOrder1, sDimSize1);
XTensor * s2 = NewTensor(sOrder2, sDimSize2);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
s1->SetData(sData1, sUnitNum1);
......@@ -88,9 +89,10 @@ bool TestConcatenate1()
/* call Concatenate function */
_Concatenate(sList, t, 1);
tUser = Concatenate(*sList, 1);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -100,6 +102,7 @@ bool TestConcatenate1()
XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* Initialize variables */
sGPU1->SetData(sData1, sUnitNum1);
......@@ -115,9 +118,10 @@ bool TestConcatenate1()
/* call Concatenate function */
_Concatenate(sList, tGPU, 1);
tUserGPU = Concatenate(*sList, 1);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete sList;
......@@ -201,6 +205,7 @@ bool TestConcatenate2()
XTensor * s1 = NewTensor(sOrder1, sDimSize1);
XTensor * s2 = NewTensor(sOrder2, sDimSize2);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
s1->SetData(sData1, sUnitNum1);
......@@ -213,9 +218,10 @@ bool TestConcatenate2()
/* call Concatenate function */
_Concatenate(sList, t, 0);
tUser = Concatenate(*sList, 0);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -225,6 +231,7 @@ bool TestConcatenate2()
XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* Initialize variables */
sGPU1->SetData(sData1, sUnitNum1);
......@@ -240,9 +247,10 @@ bool TestConcatenate2()
/* call Concatenate function */
_Concatenate(sList, tGPU, 0);
tUserGPU = Concatenate(*sList, 0);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete sList;
......@@ -324,6 +332,7 @@ bool TestConcatenate3()
XTensor * s1 = NewTensor(sOrder1, sDimSize1);
XTensor * s2 = NewTensor(sOrder2, sDimSize2);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
s1->SetData(sData1, sUnitNum1);
......@@ -336,9 +345,10 @@ bool TestConcatenate3()
/* call Concatenate function */
_Concatenate(sList, t, 1);
tUser = Concatenate(*sList, 1);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -348,6 +358,7 @@ bool TestConcatenate3()
XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* Initialize variables */
sGPU1->SetData(sData1, sUnitNum1);
......@@ -363,9 +374,10 @@ bool TestConcatenate3()
/* call Concatenate function */
_Concatenate(sList, tGPU, 1);
tUserGPU = Concatenate(*sList, 1);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete sList;
......@@ -444,6 +456,7 @@ bool TestConcatenate4()
XTensor * s1 = NewTensor(sOrder1, sDimSize1);
XTensor * s2 = NewTensor(sOrder2, sDimSize2);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
s1->SetData(sData1, sUnitNum1);
......@@ -452,9 +465,10 @@ bool TestConcatenate4()
/* call Concatenate function */
_Concatenate(s1, s2, t, 1);
tUser = Concatenate(*s1, *s2, 1);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -464,6 +478,7 @@ bool TestConcatenate4()
XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* Initialize variables */
sGPU1->SetData(sData1, sUnitNum1);
......@@ -472,9 +487,10 @@ bool TestConcatenate4()
/* call Concatenate function */
_Concatenate(sGPU1, sGPU2, tGPU, 1);
tUserGPU = Concatenate(*sGPU1, *sGPU2, 1);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete s1;
......
......@@ -53,6 +53,7 @@ bool TestHardTanH1()
/* create tensors */
XTensor * x = NewTensor(order, dimSize);
XTensor * y = NewTensor(order, dimSize);
XTensor yUser;
/* initialize variables */
x->SetData(xData, unitNum);
......@@ -60,9 +61,10 @@ bool TestHardTanH1()
/* call hardtanh function */
_HardTanH(x, y);
yUser = HardTanH(*x);
/* check results */
cpuTest = y->CheckData(answer, unitNum, 1e-4F);
cpuTest = y->CheckData(answer, unitNum, 1e-4F) && yUser.CheckData(answer, unitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
......@@ -71,6 +73,7 @@ bool TestHardTanH1()
/* create tensor */
XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor yUserGPU;
/* Initialize variables */
xGPU->SetData(xData, unitNum);
......@@ -78,9 +81,10 @@ bool TestHardTanH1()
/* call hardtanh function */
_HardTanH(xGPU, yGPU);
yUserGPU = HardTanH(*xGPU);
/* check results */
gpuTest = yGPU->CheckData(answer, unitNum, 1e-4F);
gpuTest = yGPU->CheckData(answer, unitNum, 1e-4F) && yUserGPU.CheckData(answer, unitNum, 1e-4F);
/* destroy variables */
delete x;
......
......@@ -51,6 +51,7 @@ bool TestIdentity1()
/* create tensors */
XTensor * x = NewTensor(order, dimSize);
XTensor * y = NewTensor(order, dimSize);
XTensor yUser;
/* initialize variables */
x->SetData(xData, unitNum);
......@@ -58,9 +59,10 @@ bool TestIdentity1()
/* call Identity function */
_Identity(x, y);
yUser = Identity(*x);
/* check result */
cpuTest = y->CheckData(answer, unitNum);
cpuTest = y->CheckData(answer, unitNum, 1e-4F) && yUser.CheckData(answer, unitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
......@@ -69,6 +71,7 @@ bool TestIdentity1()
/* create tensors */
XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor yUserGPU;
/* initialize variables */
xGPU->SetData(xData, unitNum);
......@@ -76,9 +79,10 @@ bool TestIdentity1()
/* call Identity function */
_Identity(xGPU, yGPU);
yUserGPU = Identity(*xGPU);
/* check result */
gpuTest = yGPU->CheckData(answer, unitNum);
gpuTest = yGPU->CheckData(answer, unitNum, 1e-4F) && yUserGPU.CheckData(answer, unitNum, 1e-4F);
/* destroy variables */
delete x;
......
......@@ -51,15 +51,21 @@ bool TestLog1()
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(aOrder, aDimSize);
XTensor * aMe = NewTensor(aOrder, aDimSize);
XTensor bUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
aMe->SetData(aData, aUnitNum);
/* call Log function */
_Log(a);
_Log(a, b);
_LogMe(aMe);
bUser = Log(*a);
/* check results */
cpuTest = a->CheckData(answer, aUnitNum, 1e-4F);
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && aMe->CheckData(answer, aUnitNum, 1e-4F) && bUser.CheckData(answer, aUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
......@@ -67,25 +73,37 @@ bool TestLog1()
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor bUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
aMeGPU->SetData(aData, aUnitNum);
/* call Log function */
_Log(aGPU);
_Log(aGPU, bGPU);
_LogMe(aMeGPU);
bUserGPU = Log(*aGPU);
/* check results */
gpuTest = aGPU->CheckData(answer, aUnitNum, 1e-4F);
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete a;
delete b;
delete aMe;
delete aGPU;
delete bGPU;
delete aMeGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete aMe;
delete[] aDimSize;
return cpuTest;
......
......@@ -51,6 +51,7 @@ bool TestLogSoftmax1()
/* create tensors */
XTensor * x = NewTensor(order, dimSize);
XTensor * y = NewTensor(order, dimSize);
XTensor yUser;
/* initialize variables */
x->SetData(xData, unitNum);
......@@ -58,9 +59,10 @@ bool TestLogSoftmax1()
/* call LogSoftmax function */
_LogSoftmax(x, y, 1);
yUser = LogSoftmax(*x, 1);
/* check result */
cpuTest = y->CheckData(answer, unitNum, 1e-4F);
cpuTest = y->CheckData(answer, unitNum, 1e-4F) && yUser.CheckData(answer, unitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
......@@ -69,6 +71,7 @@ bool TestLogSoftmax1()
/* create tensors */
XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor yUserGPU;
/* initialize variables */
xGPU->SetData(xData, unitNum);
......@@ -76,9 +79,10 @@ bool TestLogSoftmax1()
/* call LogSoftmax function */
_LogSoftmax(xGPU, yGPU, 1);
yUserGPU = LogSoftmax(*xGPU, 1);
/* check result */
gpuTest = yGPU->CheckData(answer, unitNum, 1e-4F);
gpuTest = yGPU->CheckData(answer, unitNum, 1e-4F) && yUserGPU.CheckData(answer, unitNum, 1e-4F);
/* destroy variables */
delete x;
......
......@@ -46,6 +46,7 @@ bool TestLoss1()
bool cpuTest = true;
DTYPE answer = 5.0F;
DTYPE error;
/* create tensors */
XTensor * output = NewTensor(order, dimSize);
......@@ -57,8 +58,8 @@ bool TestLoss1()
_ScaleAndShiftMe(output, 1, 1);
_ScaleAndShiftMe(gold, 1, 2);
DTYPE error;
error = LossCompute(gold, output, SQUAREDERROR, false, 0, 0, dimSize[0], 0);
/* call LossCompute function */
error = _LossCompute(gold, output, SQUAREDERROR, false, 0, 0, dimSize[0], 0);
/* check results */
cpuTest = (error == answer);
......@@ -78,7 +79,7 @@ bool TestLoss1()
_ScaleAndShiftMe(goldGPU, 1, 2);
/* call LossCompute function */
error = LossCompute(goldGPU, outputGPU, SQUAREDERROR, false, 0, 0, dimSize[0], 0);
error = _LossCompute(goldGPU, outputGPU, SQUAREDERROR, false, 0, 0, dimSize[0], 0);
/* check results */
gpuTest = (error == answer);
......@@ -123,6 +124,7 @@ bool TestLoss2()
bool cpuTest = true;
DTYPE answer = 0.0F;
DTYPE error;
/* create tensors */
XTensor * output = NewTensor(order, dimSize);
......@@ -134,8 +136,8 @@ bool TestLoss2()
_ScaleAndShiftMe(output, 1, 1);
_ScaleAndShiftMe(gold, 1, 2);
DTYPE error;
error = LossCompute(gold, output, CROSSENTROPY, false, 0, 0, dimSize[0], 0);
/* call LossCompute function */
error = _LossCompute(gold, output, CROSSENTROPY, false, 0, 0, dimSize[0], 0);
/* check results */
cpuTest = (error == answer);
......@@ -155,7 +157,7 @@ bool TestLoss2()
_ScaleAndShiftMe(goldGPU, 1, 2);
/* call LossCompute function */
error = LossCompute(goldGPU, outputGPU, CROSSENTROPY, false, 0, 0, dimSize[0], 0);
error = _LossCompute(goldGPU, outputGPU, CROSSENTROPY, false, 0, 0, dimSize[0], 0);
/* check results */
gpuTest = (error == answer);
......@@ -210,6 +212,7 @@ bool TestLoss3()
bool cpuTest = true;
DTYPE answer = 0.25F;
DTYPE error;
/* create tensors */
XTensor * output = NewTensor(order, dimSize);
......@@ -219,8 +222,8 @@ bool TestLoss3()
output->SetData(outputData, unitNum);
gold->SetData(goldData, unitNum);
DTYPE error;
error = LossCompute(gold, output, ONEHOTERROR, false, 0, 0, dimSize[0], 0);
/* call LossCompute function */
error = _LossCompute(gold, output, ONEHOTERROR, false, 0, 0, dimSize[0], 0);
/* check results */
cpuTest = (error == answer);
......@@ -238,7 +241,7 @@ bool TestLoss3()
goldGPU->SetData(goldData, unitNum);
/* call LossCompute function */
error = LossCompute(goldGPU, outputGPU, ONEHOTERROR, false, 0, 0, dimSize[0], 0);
error = _LossCompute(goldGPU, outputGPU, ONEHOTERROR, false, 0, 0, dimSize[0], 0);
/* check results */
gpuTest = (error == answer);
......
......@@ -75,6 +75,7 @@ bool TestMatrixMulBatched1()
XTensor * s1 = NewTensor(sOrder1, sDimSize1);
XTensor * s2 = NewTensor(sOrder2, sDimSize2);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
s1->SetData(sData1, sUnitNum1);
......@@ -83,9 +84,10 @@ bool TestMatrixMulBatched1()
/* call MatrixMulBatched function */
_MatrixMulBatched(s1, X_NOTRANS, s2, X_NOTRANS, t);
tUser = MatrixMulBatched(*s1, X_NOTRANS, *s2, X_NOTRANS);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -95,6 +97,7 @@ bool TestMatrixMulBatched1()
XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* Initialize variables */
sGPU1->SetData(sData1, sUnitNum1);
......@@ -103,9 +106,10 @@ bool TestMatrixMulBatched1()
/* call MatrixMulBatched function */
_MatrixMulBatched(sGPU1, X_NOTRANS, sGPU2, X_NOTRANS, tGPU);
tUserGPU = MatrixMulBatched(*sGPU1, X_NOTRANS, *sGPU2, X_NOTRANS);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete s1;
......@@ -193,6 +197,7 @@ bool TestMatrixMulBatched2()
XTensor * s1 = NewTensor(sOrder1, sDimSize1);
XTensor * s2 = NewTensor(sOrder2, sDimSize2);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
s1->SetData(sData1, sUnitNum1);
......@@ -201,10 +206,11 @@ bool TestMatrixMulBatched2()
/* call MatrixMulBatched function */
_MatrixMulBatched(s1, X_NOTRANS, s2, X_NOTRANS, t);
tUser = MatrixMulBatched(*s1, X_NOTRANS, *s2, X_NOTRANS);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
......@@ -213,6 +219,7 @@ bool TestMatrixMulBatched2()
XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* Initialize variables */
sGPU1->SetData(sData1, sUnitNum1);
......@@ -221,9 +228,10 @@ bool TestMatrixMulBatched2()
/* call MatrixMulBatched function */
_MatrixMulBatched(sGPU1, X_NOTRANS, sGPU2, X_NOTRANS, tGPU);
tUserGPU = MatrixMulBatched(*sGPU1, X_NOTRANS, *sGPU2, X_NOTRANS);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete s1;
......
......@@ -60,16 +60,18 @@ bool TestMerge1()
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
s->SetData(sData, sUnitNum);
t->SetZeroAll();
/* call merge function */
/* call Merge function */
_Merge(s, t, 1, 0);
tUser = Merge(*s, 1, 0);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -78,16 +80,18 @@ bool TestMerge1()
/* create tensor */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* Initialize variables */
sGPU->SetData(sData, sUnitNum);
tGPU->SetZeroAll();
/* call merge function */
/* call Merge function */
_Merge(sGPU, tGPU, 1, 0);
tUserGPU = Merge(*sGPU, 1, 0);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete s;
......@@ -166,18 +170,23 @@ bool TestMerge2()
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t1 = NewTensor(tOrder1, tDimSize1);
XTensor * t2 = NewTensor(tOrder2, tDimSize2);
XTensor tUser1;
XTensor tUser2;
/* initialize variables */
s->SetData(sData, sUnitNum);
t1->SetZeroAll();
t2->SetZeroAll();
/* call merge function */
/* call Merge function */
_Merge(s, t1, 1, 0);
_Merge(s, t2, 2, 0);
tUser1 = Merge(*s, 1, 0);
tUser2 = Merge(*s, 2, 0);
/* check results */
cpuTest = t1->CheckData(answer1, tUnitNum1) && t2->CheckData(answer2, tUnitNum2);
cpuTest = t1->CheckData(answer1, tUnitNum1) && tUser1.CheckData(answer1, tUnitNum1)
&& t2->CheckData(answer2, tUnitNum2) && tUser2.CheckData(answer2, tUnitNum2);
#ifdef USE_CUDA
/* GPU test */
......@@ -187,18 +196,23 @@ bool TestMerge2()
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU1 = NewTensor(tOrder1, tDimSize1, X_FLOAT, 1.0F, 0);
XTensor * tGPU2 = NewTensor(tOrder2, tDimSize2, X_FLOAT, 1.0F, 0);
XTensor tUserGPU1;
XTensor tUserGPU2;
/* Initialize variables */
sGPU->SetData(sData, sUnitNum);
tGPU1->SetZeroAll();
tGPU2->SetZeroAll();
/* call merge function */
/* call Merge function */
_Merge(sGPU, tGPU1, 1, 0);
_Merge(sGPU, tGPU2, 2, 0);
tUserGPU1 = Merge(*sGPU, 1, 0);
tUserGPU2 = Merge(*sGPU, 2, 0);
/* check results */
gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tGPU2->CheckData(answer2, tUnitNum2);
gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tUserGPU1.CheckData(answer1, tUnitNum1)
&& tGPU2->CheckData(answer2, tUnitNum2) && tUserGPU2.CheckData(answer2, tUnitNum2);
/* destroy variables */
delete s;
......@@ -271,6 +285,7 @@ bool TestMerge3()
XTensor * s1 = NewTensor(sOrder, sDimSize);
XTensor * s2 = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
s1->SetData(sData1, sUnitNum);
......@@ -281,11 +296,12 @@ bool TestMerge3()
smallList->Add(s1);
smallList->Add(s2);
/* call merge function */
/* call Merge function */
_Merge(smallList, t, 0);
tUser = Merge(*smallList, 0);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -298,6 +314,7 @@ bool TestMerge3()
XTensor * sGPU1 = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize);
XTensor tUserGPU;
/* initialize variables */
sGPU1->SetData(sData1, sUnitNum);
......@@ -308,11 +325,12 @@ bool TestMerge3()
smallList->Add(sGPU1);
smallList->Add(sGPU2);
/* call merge function */
/* call Merge function */
_Merge(smallList, tGPU, 0);
tUserGPU = Merge(*smallList, 0);
/* check results */
cpuTest = tGPU->CheckData(answer, tUnitNum);
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete s1;
......@@ -383,6 +401,7 @@ bool TestMerge4()
XTensor * s1 = NewTensor(sOrder, sDimSize);
XTensor * s2 = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
s1->SetData(sData1, sUnitNum);
......@@ -393,11 +412,12 @@ bool TestMerge4()
smallList->Add(s1);
smallList->Add(s2);
/* call merge function */
/* call Merge function */
_Merge(smallList, t, 1);
tUser = Merge(*smallList, 1);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -410,6 +430,7 @@ bool TestMerge4()
XTensor * sGPU1 = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize);
XTensor tUserGPU;
/* initialize variables */
sGPU1->SetData(sData1, sUnitNum);
......@@ -420,11 +441,12 @@ bool TestMerge4()
smallList->Add(sGPU1);
smallList->Add(sGPU2);
/* call merge function */
/* call Merge function */
_Merge(smallList, tGPU, 1);
tUserGPU = Merge(*smallList, 1);
/* check results */
cpuTest = tGPU->CheckData(answer, tUnitNum);
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete s1;
......
......@@ -48,15 +48,21 @@ bool TestNegate1()
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(aOrder, aDimSize);
XTensor * aMe = NewTensor(aOrder, aDimSize);
XTensor bUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
aMe->SetData(aData, aUnitNum);
/* call Negate function */
_Negate(a);
_Negate(a, b);
_NegateMe(aMe);
bUser = Negate(*a);
/* check results */
cpuTest = a->CheckData(answer, aUnitNum);
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && aMe->CheckData(answer, aUnitNum, 1e-4F) && bUser.CheckData(answer, aUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
......@@ -64,25 +70,37 @@ bool TestNegate1()
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor bUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
aMeGPU->SetData(aData, aUnitNum);
/* call Negate function */
_Negate(aGPU);
_Negate(aGPU, bGPU);
_NegateMe(aMeGPU);
bUserGPU = Negate(*aGPU);
/* check results */
gpuTest = aGPU->CheckData(answer, aUnitNum);
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete a;
delete b;
delete aMe;
delete aGPU;
delete bGPU;
delete aMeGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete aMe;
delete[] aDimSize;
return cpuTest;
......@@ -114,41 +132,59 @@ bool TestNegate2()
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(aOrder, aDimSize);
XTensor * aMe = NewTensor(aOrder, aDimSize);
XTensor bUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
aMe->SetData(aData, aUnitNum);
/* call Negate function */
_Negate(a);
_Negate(a, b);
_NegateMe(aMe);
bUser = Negate(*a);
/* check results */
cpuTest = a->CheckData(answer, aUnitNum);
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && aMe->CheckData(answer, aUnitNum, 1e-4F) && bUser.CheckData(answer, aUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor bUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
aMeGPU->SetData(aData, aUnitNum);
/* call Negate function */
_Negate(aGPU);
_Negate(aGPU, bGPU);
_NegateMe(aMeGPU);
bUserGPU = Negate(*aGPU);
/* check results */
gpuTest = aGPU->CheckData(answer, aUnitNum);
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete a;
delete b;
delete aMe;
delete aGPU;
delete bGPU;
delete aMeGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete aMe;
delete[] aDimSize;
return cpuTest;
......
......@@ -52,41 +52,59 @@ bool TestPower1()
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(aOrder, aDimSize);
XTensor * aMe = NewTensor(aOrder, aDimSize);
XTensor bUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
aMe->SetData(aData, aUnitNum);
/* call Power function */
_Power(a, 2.0F);
_Power(a, b, 2.0F);
_PowerMe(aMe, 2.0F);
bUser = Power(*a, 2.0F);
/* check results */
cpuTest = a->CheckData(answer, aUnitNum, 1e-4F);
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && aMe->CheckData(answer, aUnitNum, 1e-4F) && bUser.CheckData(answer, aUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor bUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
aMeGPU->SetData(aData, aUnitNum);
/* call power function */
_Power(aGPU, 2.0F);
_Power(aGPU, bGPU, 2.0F);
_PowerMe(aMeGPU, 2.0F);
bUserGPU = Power(*aGPU, 2.0F);
/* check results */
gpuTest = aGPU->CheckData(answer, aUnitNum, 1e-4F);
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete a;
delete b;
delete aMe;
delete aGPU;
delete bGPU;
delete aMeGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete aMe;
delete[] aDimSize;
return cpuTest;
......@@ -121,41 +139,59 @@ bool TestPower2()
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(aOrder, aDimSize);
XTensor * aMe = NewTensor(aOrder, aDimSize);
XTensor bUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
aMe->SetData(aData, aUnitNum);
/* call Power function */
_Power(a, 1.0F);
_Power(a, b, 1.0F);
_PowerMe(aMe, 1.0F);
bUser = Power(*a, 1.0F);
/* check results */
cpuTest = a->CheckData(answer, aUnitNum, 1e-4F);
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && aMe->CheckData(answer, aUnitNum, 1e-4F) && bUser.CheckData(answer, aUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor bUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
aMeGPU->SetData(aData, aUnitNum);
/* call Power function */
_Power(aGPU, 1.0F);
_Power(aGPU, bGPU, 1.0F);
_PowerMe(aMeGPU, 1.0F);
bUserGPU = Power(*aGPU, 1.0F);
/* check results */
gpuTest = aGPU->CheckData(answer, aUnitNum, 1e-4F);
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete a;
delete b;
delete aMe;
delete aGPU;
delete bGPU;
delete aMeGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete aMe;
delete[] aDimSize;
return cpuTest;
......@@ -190,41 +226,59 @@ bool TestPower3()
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(aOrder, aDimSize);
XTensor * aMe = NewTensor(aOrder, aDimSize);
XTensor bUser;
/* initialize variables */
a->SetData(aData, aUnitNum);
aMe->SetData(aData, aUnitNum);
/* call Power function */
_Power(a, 0.0F);
_Power(a, b, 0.0F);
_PowerMe(aMe, 0.0F);
bUser = Power(*a, 0.0F);
/* check results */
cpuTest = a->CheckData(answer, aUnitNum, 1e-4F);
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F) && aMe->CheckData(answer, aUnitNum, 1e-4F) && bUser.CheckData(answer, aUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * aMeGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor bUserGPU;
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
aMeGPU->SetData(aData, aUnitNum);
/* call Power function */
_Power(aGPU, 0.0F);
_Power(aGPU, bGPU, 0.0F);
_PowerMe(aMeGPU, 0.0F);
bUserGPU = Power(*aGPU, 0.0F);
/* check results */
gpuTest = aGPU->CheckData(answer, aUnitNum, 1e-4F);
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F) && aMeGPU->CheckData(answer, aUnitNum, 1e-4F) && bUserGPU.CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete a;
delete b;
delete aMe;
delete aGPU;
delete bGPU;
delete aMeGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete aMe;
delete[] aDimSize;
return cpuTest;
......
......@@ -50,6 +50,7 @@ bool TestRectify1()
/* create tensors */
XTensor * x = NewTensor(order, dimSize);
XTensor * y = NewTensor(order, dimSize);
XTensor yUser;
/* initialize variables */
x->SetData(xData, unitNum);
......@@ -57,9 +58,10 @@ bool TestRectify1()
/* call Rectify function */
_Rectify(x, y);
yUser = Rectify(*x);
/* check results */
cpuTest = y->CheckData(answer, unitNum);
cpuTest = y->CheckData(answer, unitNum, 1e-4F) && yUser.CheckData(answer, unitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
......@@ -68,6 +70,7 @@ bool TestRectify1()
/* create tensors */
XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor yUserGPU;
/* initialize variables */
xGPU->SetData(xData, unitNum);
......@@ -75,9 +78,10 @@ bool TestRectify1()
/* call Rectify function */
_Rectify(xGPU, yGPU);
yUserGPU = Rectify(*xGPU);
/* check results */
gpuTest = yGPU->CheckData(answer, unitNum);
gpuTest = yGPU->CheckData(answer, unitNum, 1e-4F) && yUserGPU.CheckData(answer, unitNum, 1e-4F);
/* destroy variables */
delete x;
......
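Rectify is elementwise max(0, x); the change above adds the by-value form yUser = Rectify(*x) next to the pointer-based _Rectify(x, y). A short sketch with assumed input values:

/* sketch: Rectify clips negatives to zero, i.e. y_i = max(0, x_i);
   the data values are illustrative assumptions */
int order = 2;
int dimSize[2] = {1, 3};
float xData[3] = {-1.0F, 0.0F, 2.5F};
float answer[3] = {0.0F, 0.0F, 2.5F};

XTensor * x = NewTensor(order, dimSize);
XTensor y;
x->SetData(xData, 3);

y = Rectify(*x);
bool ok = y.CheckData(answer, 3, 1e-4F);

delete x;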
......@@ -71,6 +71,8 @@ bool TestReduceMax1()
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t1 = NewTensor(tOrder1, tDimSize1);
XTensor * t2 = NewTensor(tOrder2, tDimSize2);
XTensor tUser1;
XTensor tUser2;
/* initialize variables */
s->SetData(sData, sUnitNum);
......@@ -80,9 +82,12 @@ bool TestReduceMax1()
/* call ReduceMax function */
_ReduceMax(s, t1, 0);
_ReduceMax(s, t2, 1);
tUser1 = ReduceMax(*s, 0);
tUser2 = ReduceMax(*s, 1);
/* check results */
cpuTest = t1->CheckData(answer1, tUnitNum1) && t2->CheckData(answer2, tUnitNum2);
cpuTest = t1->CheckData(answer1, tUnitNum1) && tUser1.CheckData(answer1, tUnitNum1)
&& t2->CheckData(answer2, tUnitNum2) && tUser2.CheckData(answer2, tUnitNum2);
#ifdef USE_CUDA
/* GPU test */
......@@ -92,6 +97,8 @@ bool TestReduceMax1()
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU1 = NewTensor(tOrder1, tDimSize1, X_FLOAT, 1.0F, 0);
XTensor * tGPU2 = NewTensor(tOrder2, tDimSize2, X_FLOAT, 1.0F, 0);
XTensor tUserGPU1;
XTensor tUserGPU2;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
......@@ -101,9 +108,12 @@ bool TestReduceMax1()
/* call ReduceMax function */
_ReduceMax(sGPU, tGPU1, 0);
_ReduceMax(sGPU, tGPU2, 1);
tUserGPU1 = ReduceMax(*sGPU, 0);
tUserGPU2 = ReduceMax(*sGPU, 1);
/* check results */
gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tGPU2->CheckData(answer2, tUnitNum2);
gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tUserGPU1.CheckData(answer1, tUnitNum1)
&& tGPU2->CheckData(answer2, tUnitNum2) && tUserGPU2.CheckData(answer2, tUnitNum2);
/* destroy variables */
delete s;
......
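ReduceMax(*s, dim) collapses one dimension of s by taking the maximum along it: reducing a 2 x 3 tensor along dimension 0 yields the three column maxima, along dimension 1 the two row maxima. A worked sketch with assumed values:

/* sketch: ReduceMax over each dimension of a 2 x 3 tensor;
   the data is an illustrative assumption */
int sDimSize[2] = {2, 3};
float sData[6] = {1.0F, 5.0F, 2.0F,
                  4.0F, 0.0F, 6.0F};
float answer0[3] = {4.0F, 5.0F, 6.0F};   /* column maxima (dim 0) */
float answer1[2] = {5.0F, 6.0F};         /* row maxima (dim 1)    */

XTensor * s = NewTensor(2, sDimSize);
XTensor t0, t1;
s->SetData(sData, 6);

t0 = ReduceMax(*s, 0);
t1 = ReduceMax(*s, 1);
bool ok = t0.CheckData(answer0, 3) && t1.CheckData(answer1, 2);

delete s;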
......@@ -66,6 +66,8 @@ bool TestReduceMean1()
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t1 = NewTensor(tOrder1, tDimSize1);
XTensor * t2 = NewTensor(tOrder2, tDimSize2);
XTensor tUser1;
XTensor tUser2;
/* initialize variables */
s->SetData(sData, sUnitNum);
......@@ -75,9 +77,12 @@ bool TestReduceMean1()
/* call ReduceMean function */
_ReduceMean(s, t1, 0);
_ReduceMean(s, t2, 1);
tUser1 = ReduceMean(*s, 0);
tUser2 = ReduceMean(*s, 1);
/* check results */
cpuTest = t1->CheckData(answer1, tUnitNum1) && t2->CheckData(answer2, tUnitNum2);
cpuTest = t1->CheckData(answer1, tUnitNum1) && tUser1.CheckData(answer1, tUnitNum1)
&& t2->CheckData(answer2, tUnitNum2) && tUser2.CheckData(answer2, tUnitNum2);
#ifdef USE_CUDA
/* GPU test */
......@@ -87,6 +92,8 @@ bool TestReduceMean1()
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU1 = NewTensor(tOrder1, tDimSize1, X_FLOAT, 1.0F, 0);
XTensor * tGPU2 = NewTensor(tOrder2, tDimSize2, X_FLOAT, 1.0F, 0);
XTensor tUserGPU1;
XTensor tUserGPU2;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
......@@ -96,9 +103,12 @@ bool TestReduceMean1()
/* call ReduceMean function */
_ReduceMean(sGPU, tGPU1, 0);
_ReduceMean(sGPU, tGPU2, 1);
tUserGPU1 = ReduceMean(*sGPU, 0);
tUserGPU2 = ReduceMean(*sGPU, 1);
/* check results */
cpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tGPU2->CheckData(answer2, tUnitNum2);
gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tUserGPU1.CheckData(answer1, tUnitNum1)
&& tGPU2->CheckData(answer2, tUnitNum2) && tUserGPU2.CheckData(answer2, tUnitNum2);
/* destroy variables */
delete s;
......
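ReduceMean is the same reduction divided by the length of the reduced dimension, i.e. t_j = (1/n) * sum_i s_ij when reducing dimension 0. A sketch with assumed values:

/* sketch: ReduceMean divides the sum by the reduced dimension's
   length; data values are illustrative assumptions */
int sDimSize[2] = {2, 2};
float sData[4] = {1.0F, 3.0F,
                  5.0F, 7.0F};
float answer0[2] = {3.0F, 5.0F};   /* (1+5)/2, (3+7)/2 */
float answer1[2] = {2.0F, 6.0F};   /* (1+3)/2, (5+7)/2 */

XTensor * s = NewTensor(2, sDimSize);
XTensor m0, m1;
s->SetData(sData, 4);

m0 = ReduceMean(*s, 0);
m1 = ReduceMean(*s, 1);
bool ok = m0.CheckData(answer0, 2) && m1.CheckData(answer1, 2);

delete s;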
......@@ -69,20 +69,29 @@ bool TestReduceSum1()
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * shift1 = NewTensor(tOrder1, tDimSize1);
XTensor * shift2 = NewTensor(tOrder2, tDimSize2);
XTensor * t1 = NewTensor(tOrder1, tDimSize1);
XTensor * t2 = NewTensor(tOrder2, tDimSize2);
XTensor tUser1;
XTensor tUser2;
/* initialize variables */
s->SetData(sData, sUnitNum);
shift1->SetZeroAll();
shift2->SetZeroAll();
t1->SetZeroAll();
t2->SetZeroAll();
/* call ReduceSum function */
_ReduceSum(s, t1, 0);
_ReduceSum(s, t2, 1);
tUser1 = ReduceSum(*s, 0, *shift1);
tUser2 = ReduceSum(*s, 1, *shift2);
/* check results */
cpuTest = t1->CheckData(answer1, tUnitNum1) && t2->CheckData(answer2, tUnitNum2);
cpuTest = t1->CheckData(answer1, tUnitNum1) && tUser1.CheckData(answer1, tUnitNum1)
&& t2->CheckData(answer2, tUnitNum2) && tUser2.CheckData(answer2, tUnitNum2);
#ifdef USE_CUDA
/* GPU test */
......@@ -90,26 +99,39 @@ bool TestReduceSum1()
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * shiftGPU1 = NewTensor(tOrder1, tDimSize1, X_FLOAT, 1.0F, 0);
XTensor * shiftGPU2 = NewTensor(tOrder2, tDimSize2, X_FLOAT, 1.0F, 0);
XTensor * tGPU1 = NewTensor(tOrder1, tDimSize1, X_FLOAT, 1.0F, 0);
XTensor * tGPU2 = NewTensor(tOrder2, tDimSize2, X_FLOAT, 1.0F, 0);
XTensor tUserGPU1;
XTensor tUserGPU2;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
shiftGPU1->SetZeroAll();
shiftGPU2->SetZeroAll();
tGPU1->SetZeroAll();
tGPU2->SetZeroAll();
/* call ReduceSum function */
_ReduceSum(sGPU, tGPU1, 0);
_ReduceSum(sGPU, tGPU2, 1);
tUserGPU1 = ReduceSum(*sGPU, 0, *shiftGPU1);
tUserGPU2 = ReduceSum(*sGPU, 1, *shiftGPU2);
/* check results */
cpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tGPU2->CheckData(answer2, tUnitNum2);
gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tUserGPU1.CheckData(answer1, tUnitNum1)
&& tGPU2->CheckData(answer2, tUnitNum2) && tUserGPU2.CheckData(answer2, tUnitNum2);
/* destroy variables */
delete s;
delete shift1;
delete shift2;
delete t1;
delete t2;
delete sGPU;
delete shiftGPU1;
delete shiftGPU2;
delete tGPU1;
delete tGPU2;
delete[] sDimSize;
......@@ -120,6 +142,8 @@ bool TestReduceSum1()
#else
/* destroy variables */
delete s;
delete shift1;
delete shift2;
delete t1;
delete t2;
delete[] sDimSize;
......
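The new user-level ReduceSum takes a shift tensor shaped like the output, which is why the test now allocates and zeroes shift1/shift2; with a zero shift it reduces to a plain sum. A sketch with a non-zero shift, assuming each output element is the sum of (input - shift) along the reduced dimension, with the shift broadcast over that dimension; the values are illustrative:

/* sketch: ReduceSum with a shift computes t_j = sum_i (s_ij - shift_j);
   shapes and values are illustrative assumptions */
int sDimSize[2] = {2, 2};
int tDimSize[1] = {2};
float sData[4] = {1.0F, 2.0F,
                  3.0F, 4.0F};
float shiftData[2] = {1.0F, 1.0F};
float answer[2] = {2.0F, 4.0F};   /* (1-1)+(3-1), (2-1)+(4-1) */

XTensor * s = NewTensor(2, sDimSize);
XTensor * shift = NewTensor(1, tDimSize);
XTensor t;
s->SetData(sData, 4);
shift->SetData(shiftData, 2);

t = ReduceSum(*s, 0, *shift);
bool ok = t.CheckData(answer, 2);

delete s;
delete shift;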
......@@ -70,6 +70,7 @@ bool TestReduceSumSquared1()
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * shift = NewTensor(shiftOrder, shiftDimSize);
XTensor tUser;
/* initialize variables */
s->SetData(sData, sUnitNum);
......@@ -78,9 +79,10 @@ bool TestReduceSumSquared1()
/* call ReduceSumSquared function */
_ReduceSumSquared(s, t, 0, shift);
tUser = ReduceSumSquared(*s, 0, *shift);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -90,6 +92,7 @@ bool TestReduceSumSquared1()
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * shiftGPU = NewTensor(shiftOrder, shiftDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
......@@ -98,9 +101,10 @@ bool TestReduceSumSquared1()
/* call ReduceSumSquared function */
_ReduceSumSquared(sGPU, tGPU, 0, shiftGPU);
tUserGPU = ReduceSumSquared(*sGPU, 0, *shiftGPU);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete s;
......@@ -174,6 +178,7 @@ bool TestReduceSumSquared2()
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * shift = NewTensor(shiftOrder, shiftDimSize);
XTensor tUser;
/* initialize variables */
s->SetData(sData, sUnitNum);
......@@ -182,9 +187,10 @@ bool TestReduceSumSquared2()
/* call ReduceSumSquared function */
_ReduceSumSquared(s, t, 1, shift);
tUser = ReduceSumSquared(*s, 1, *shift);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -194,6 +200,7 @@ bool TestReduceSumSquared2()
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * shiftGPU = NewTensor(shiftOrder, shiftDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
......@@ -202,9 +209,10 @@ bool TestReduceSumSquared2()
/* call ReduceSumSquared function */
_ReduceSumSquared(sGPU, tGPU, 1, shiftGPU);
tUserGPU = ReduceSumSquared(*sGPU, 1, *shiftGPU);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete s;
......
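ReduceSumSquared sums squared deviations from the shift tensor, t_j = sum_i (s_ij - shift_j)^2, the unnormalized building block of the variance below. A sketch with assumed values:

/* sketch: ReduceSumSquared computes t_j = sum_i (s_ij - shift_j)^2;
   values are illustrative assumptions */
int sDimSize[2] = {2, 2};
int shiftDimSize[1] = {2};
float sData[4] = {1.0F, 2.0F,
                  3.0F, 4.0F};
float shiftData[2] = {2.0F, 3.0F};
float answer[2] = {2.0F, 2.0F};   /* (1-2)^2+(3-2)^2, (2-3)^2+(4-3)^2 */

XTensor * s = NewTensor(2, sDimSize);
XTensor * shift = NewTensor(1, shiftDimSize);
XTensor t;
s->SetData(sData, 4);
shift->SetData(shiftData, 2);

t = ReduceSumSquared(*s, 0, *shift);
bool ok = t.CheckData(answer, 2);

delete s;
delete shift;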
......@@ -70,6 +70,7 @@ bool TestReduceVariance1()
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * mean = NewTensor(meanOrder, meanDimSize);
XTensor tUser;
/* initialize variables */
s->SetData(sData, sUnitNum);
......@@ -78,9 +79,10 @@ bool TestReduceVariance1()
/* call ReduceVariance function */
_ReduceVariance(s, t, 0, mean);
tUser = ReduceVariance(*s, 0, *mean);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -90,6 +92,7 @@ bool TestReduceVariance1()
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * meanGPU = NewTensor(meanOrder, meanDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
......@@ -98,9 +101,10 @@ bool TestReduceVariance1()
/* call ReduceVariance function */
_ReduceVariance(sGPU, tGPU, 0, meanGPU);
tUserGPU = ReduceVariance(*sGPU, 0, *meanGPU);
/* check results */
gpuTest = t->CheckData(answer, tUnitNum);
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete s;
......
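ReduceVariance takes the mean tensor explicitly and averages the squared deviations: t_j = (1/n) * sum_i (s_ij - mean_j)^2, where n is the length of the reduced dimension. A sketch with assumed values:

/* sketch: ReduceVariance computes t_j = (1/n) * sum_i (s_ij - mean_j)^2;
   values are illustrative assumptions */
int sDimSize[2] = {2, 2};
int meanDimSize[1] = {2};
float sData[4] = {1.0F, 2.0F,
                  3.0F, 6.0F};
float meanData[2] = {2.0F, 4.0F};
float answer[2] = {1.0F, 4.0F};   /* ((1-2)^2+(3-2)^2)/2, ((2-4)^2+(6-4)^2)/2 */

XTensor * s = NewTensor(2, sDimSize);
XTensor * mean = NewTensor(1, meanDimSize);
XTensor t;
s->SetData(sData, 4);
mean->SetData(meanData, 2);

t = ReduceVariance(*s, 0, *mean);
bool ok = t.CheckData(answer, 2);

delete s;
delete mean;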
......@@ -48,6 +48,7 @@ bool TestSigmoid1()
/* create tensors */
XTensor * x = NewTensor(order, dimSize);
XTensor * y = NewTensor(order, dimSize);
XTensor yUser;
/* initialize variables */
x->SetData(xData, unitNum);
......@@ -55,9 +56,10 @@ bool TestSigmoid1()
/* call Sigmoid function */
_Sigmoid(x, y);
yUser = Sigmoid(*x);
/* check result */
cpuTest = y->CheckData(answer, unitNum, 1e-4F);
cpuTest = y->CheckData(answer, unitNum, 1e-4F) && yUser.CheckData(answer, unitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
......@@ -66,6 +68,7 @@ bool TestSigmoid1()
/* create tensors */
XTensor * xGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * yGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor yUserGPU;
/* initialize variables */
xGPU->SetData(xData, unitNum);
......@@ -73,9 +76,10 @@ bool TestSigmoid1()
/* call Sigmoid function */
_Sigmoid(xGPU, yGPU);
yUserGPU = Sigmoid(*xGPU);
/* check result */
gpuTest = yGPU->CheckData(answer, unitNum, 1e-4F);
gpuTest = yGPU->CheckData(answer, unitNum, 1e-4F) && yUserGPU.CheckData(answer, unitNum, 1e-4F);
/* destroy variables */
delete x;
......
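Sigmoid is elementwise y = 1 / (1 + exp(-x)), so sigmoid(0) = 0.5 and sigmoid(1) ≈ 0.7311. A sketch of the by-value form added above, with assumed input values:

/* sketch: Sigmoid computes y_i = 1 / (1 + exp(-x_i));
   values are illustrative assumptions */
int order = 2;
int dimSize[2] = {1, 3};
float xData[3] = {0.0F, 1.0F, -1.0F};
float answer[3] = {0.5F, 0.731059F, 0.268941F};

XTensor * x = NewTensor(order, dimSize);
XTensor y;
x->SetData(xData, 3);

y = Sigmoid(*x);
bool ok = y.CheckData(answer, 3, 1e-4F);

delete x;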