Commit 394e8340 by xuchen

1. redefine the interfaces 2. update the test 3. update the manual 4. merged with xiao

parent 9b11391e
......@@ -195,26 +195,21 @@ int main(int argc, const char ** argv)
## Accessing the contents of a tensor
In C/C++, the contents of a tensor are accessed through XTensor.h; including the XTensor.h header in the source program is all that is needed to define a tensor.
This section describes the member variables and methods that users deal with when accessing tensor contents; more detailed descriptions are given in the appendix.
Member variables defined in the XTensor.h header:
| Member variable | Description |
| - | - |
| XMem * mem | memory pool used by the tensor |
| void * data | data array that stores the elements |
| void * dataHost | copy of the data in host memory, used only when running on GPUs |
| int devID | device ID, i.e., the index of the CPU or GPU device on which the tensor's memory is allocated; -1 means CPU |
| int order | order (number of dimensions) of the tensor, e.g., a matrix (order 2) is a 2-dimensional tensor |
| int dimSize<br> [MAX_TENSOR_DIM_NUM] | size of each dimension of the tensor; index 0 is the first dimension |
| int dimSizeRDI<br> [MAX_TENSOR_DIM_NUM] | size of each dimension of the tensor in reversed (transposed) order; index 0 is the first dimension |
| TENSOR_DATA_TYPE dataType | data type of each data unit |
| int unitSize | size of a data unit, similar to sizeof() |
| int unitNum | number of data units |
| bool isSparse | whether the tensor is sparse: a dense n * m matrix stores n * m entries, while the storage of a sparse (non-dense) matrix depends on the number of non-zero elements. |
| int unitNumNonZero | number of non-zero elements in a sparse matrix |
| float denseRatio | density, i.e., the proportion of non-zero units: a real number between 0 and 1, where 0 means all units are zero and 1 means all units are non-zero. |
| bool isShared | flag indicating whether the data array is shared with other tensors |
| bool isInGlobalMem | flag indicating whether the data is kept in global memory rather than in a memory pool |
| bool isAllValued<br> [MAX_TENSOR_DIM_NUM] | flags indicating whether every dimension of the sparse matrix has non-zero elements |
Methods defined in the XTensor.h header are listed below; a short usage sketch follows the table:
......@@ -226,33 +221,19 @@ int main(int argc, const char ** argv)
| Get the size of a given dimension of the tensor | int GetDim(const int dim) | dim - the dimension index |
| Reshape the tensor | void Reshape(<br> const int order, const int * myDimSize) | order - order of the tensor <br> myDimSize - size of each dimension |
| Get the number of elements in the tensor | int GetSize() | N/A |
| Get the memory usage (in chars) | int GetDataSizeInChar() | N/A |
| Get the data unit size of<br> a given data type | int GetUnitSize(<br> TENSOR_DATA_TYPE myDataType) | myDataType - the given data type |
| Set all elements of the tensor to 0 | void SetZeroAll(XStream * stream = NULL) | stream - multi-thread stream |
| Assign the tensor from an array | void SetData(<br> const void * d, int num, int beg = 0) | d - source array <br> num - array size <br> beg - position in the tensor at which the assignment starts |
| Fill the tensor from a uniform distribution | void SetDataRand(<br> DTYPE lower, DTYPE upper) | lower - lower bound <br> upper - upper bound |
| Fill the tensor from a normal distribution | void SetDataRandn(<br> DTYPE mean, DTYPE standardDeviation) | mean - mean <br> standardDeviation - standard deviation |
| Check whether the tensor elements<br> match a given array | bool CheckData(<br> const void * answer, int num, int beg = 0) | answer - the reference array <br> num - array size <br> beg - position in the tensor at which the check starts |
| Set the elements along a given<br> dimension in ascending order | void SetAscendingOrder(int dim) | dim - the given dimension |
| Get a pointer to a cell of the tensor | void * GetCell(int * index, int size) | index - position of the cell <br> size - size of the index |
| Get a pointer to a cell of a 2-d tensor | void * GetCell2D(int ni, int mi = 0) | ni - row index <br> mi - column index |
| Get the value of a cell in a 2-d tensor | DTYPE Get2D(int ni, int mi = 0) | ni - row index <br> mi - column index |
| Get the value of a cell in a sparse tensor | DTYPE GetInSparse(int i) | i - position among the non-zero elements of the sparse matrix |
| Get the key of a tuple<br> in a sparse tensor | int GetKeyInSparse(int i) | i - position among the non-zero elements of the sparse matrix |
| Set the value of a cell<br> in a 2-d tensor | bool Set2D(DTYPE value, int ni, int mi = 0) | value - cell value <br> ni - row index <br> mi - column index |
| Add to the value of a cell<br> in a 2-d tensor | bool Add2D(DTYPE value, int ni, int mi = 0) | value - value to add <br> ni - row index <br> mi - column index |
| Get the number of non-zero<br> elements in a sparse matrix | int GetNonzeroSize() | N/A |
| Resize the tensor to a given shape | bool Resize(<br> const int myOrder, <br> const int * myDimSize, <br> const TENSOR_DATA_TYPE myDataType = DEFAULT_DTYPE, <br> const float myDenseRatio = 1.0F) | myOrder - order of the tensor <br> myDimSize - size of each dimension, index 0 is the first dimension <br> myDataType - data type of the tensor <br> myDenseRatio - density of the tensor, 1 means dense |
| Resize the tensor to a given shape<br>without allocating new memory | bool ResizeWithNoData(<br> const int myOrder, <br> const int * myDimSize, <br> const TENSOR_DATA_TYPE myDataType = DEFAULT_DTYPE, <br> const float myDenseRatio = 1.0F) | myOrder - order of the tensor <br> myDimSize - size of each dimension, index 0 is the first dimension <br> myDataType - data type of the tensor <br> myDenseRatio - density of the tensor, 1 means dense |
| Resize the tensor to the shape<br> of another tensor | bool Resize(<br> const XTensor * myTensor) | myTensor - the reference tensor |
| Find an element in a sparse matrix<br> by binary search | bool BinarySearch(<br> int key, DTYPE &value, void * &position) | key - key of the element in the sparse matrix <br> value - value of the element <br> position - position of the element |
| Flush the data to<br> a target device | void FlushToMem(XMem * targetMem) | targetMem - the target memory pool |
| Allocate memory for a tensor<br> in global memory | static void AllocateData(<br> XTensor * matrix, <br> XMem * myMem = NULL, <br> bool useBuf = false) | matrix - the tensor to allocate memory for <br> myMem - memory pool to allocate from (if any) <br> useBuf - whether to use the buffer |
| Free the memory of a tensor<br> in global memory | static void FreeData(<br> XTensor * matrix, <br> XMem * myMem = NULL, <br> bool useBuf = false) | matrix - the tensor whose memory is freed <br> myMem - memory pool the memory belongs to (if any) <br> useBuf - whether the buffer was used |
| Create a tensor in the buffer | XTensor * NewTensorBuf( <br> const int myOrder, <br> const int * myDimSize, XMem * myMem, <br> const TENSOR_DATA_TYPE myDataType = <br> X_FLOAT, const float myDenseRatio = 1.0F) | myOrder - order of the tensor <br> myDimSize - size of each dimension, index 0 is the first dimension <br> myMem - memory pool used by the tensor <br> myDataType - data type of the tensor <br> myDenseRatio - density of the tensor, 1 means dense |
| Create a new tensor by<br>copying a given tensor | XTensor * NewTensor(<br>XTensor * a, bool isFilledData = true) | a - the given tensor <br> isFilledData - whether to allocate the data array of the new tensor |
| Delete a given tensor | void DelTensor(<br>const XTensor * tensor) | tensor - the given tensor |
| Delete a given tensor<br>created in the buffer | void DelTensorBuf(<br>const XTensor * tensor) | tensor - the given tensor |
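A minimal usage sketch (not part of this commit) based only on the methods listed above; it assumes XTensor.h is on the include path and that the default data type is float:

```cpp
#include <cstdio>
#include "XTensor.h"

using namespace nts;

int main()
{
    int dims[2] = {2, 3};

    XTensor a;
    a.Resize(2, dims);              /* a 2 x 3 dense tensor of the default data type */
    a.SetDataRand(-1.0F, 1.0F);     /* fill with values drawn uniformly from [-1, 1] */

    printf("a has %d items, a(0, 1) = %.4f\n", a.GetSize(), a.Get2D(0, 1));
    return 0;
}
```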
## Tensor computation
......@@ -1344,8 +1325,8 @@ NiuTrans.Tensor/Tensor/test/TRectify.cpp
##### What is HardTanH?
HardTanH is an activation function; a small scalar sketch is given after the definition below. The HardTanH function is defined as:
>y = 1 &nbsp;&nbsp;if x > 1 <br />
&nbsp;&nbsp; &nbsp;&nbsp;&nbsp; x &nbsp;&nbsp;if -1 <= x <= 1 <br />
&nbsp;&nbsp; &nbsp; -1 &nbsp;&nbsp;if x < -1
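The piecewise definition above reads directly as scalar code; this is only an illustration of the rule, not the library's HardTanH kernel:

```cpp
/* scalar HardTanH: y = 1 if x > 1, y = x if -1 <= x <= 1, y = -1 if x < -1 */
float HardTanH(float x)
{
    if (x > 1.0F)
        return 1.0F;
    if (x < -1.0F)
        return -1.0F;
    return x;
}
```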
##### Calling HardTanH
......@@ -1493,10 +1474,10 @@ NiuTrans.Tensor/Tensor/test/TSoftmax.cpp
##### What is Loss?
A loss function measures how well a neural network model fits the data and defines the optimization objective; a small scalar sketch follows the formulas below. The supported losses are defined as:
>squared error : loss = sum_{i} 0.5*(gold_i - output_i)^2 <br />
cross entropy : loss = sum_{i} (-gold_i * log(output_i)) <br />
one hot error : loss = sum_{i} e_i <br />
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; where e_i = 0.5*(t_i - y_i)^2 &nbsp;&nbsp;if t_i = 1, <br />
&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;e_i = 0 &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; otherwise
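For reference, the first two formulas translate directly into the following scalar sketches over plain arrays (illustration only; the library computes these losses on tensors):

```cpp
#include <cmath>

/* squared error: loss = sum_{i} 0.5 * (gold_i - output_i)^2 */
float SquaredError(const float * gold, const float * output, int n)
{
    float loss = 0.0F;
    for (int i = 0; i < n; i++) {
        float d = gold[i] - output[i];
        loss += 0.5F * d * d;
    }
    return loss;
}

/* cross entropy: loss = sum_{i} (-gold_i * log(output_i)) */
float CrossEntropy(const float * gold, const float * output, int n)
{
    float loss = 0.0F;
    for (int i = 0; i < n; i++)
        loss += -gold[i] * std::log(output[i]);
    return loss;
}
```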
......@@ -1539,3 +1520,64 @@ NiuTrans.Tensor/Tensor/test/TLoss.cpp
## Example 3: Recurrent neural network
## Acknowledgements
## Appendix
Member variables defined in the XTensor.h header:
| Member variable | Description |
| - | - |
| XMem * mem | memory pool used by the tensor |
| void * data | data array that stores the elements |
| void * dataHost | copy of the data in host memory, used only when running on GPUs |
| int devID | device ID, i.e., the index of the CPU or GPU device on which the tensor's memory is allocated; -1 means CPU |
| int order | order (number of dimensions) of the tensor, e.g., a matrix (order 2) is a 2-dimensional tensor |
| int dimSize<br> [MAX_TENSOR_DIM_NUM] | size of each dimension of the tensor; index 0 is the first dimension |
| int dimSizeRDI<br> [MAX_TENSOR_DIM_NUM] | size of each dimension of the tensor in reversed (transposed) order; index 0 is the first dimension |
| TENSOR_DATA_TYPE dataType | data type of each data unit |
| int unitSize | size of a data unit, similar to sizeof() |
| int unitNum | number of data units |
| bool isSparse | whether the tensor is sparse: a dense n * m matrix stores n * m entries, while the storage of a sparse (non-dense) matrix depends on the number of non-zero elements. |
| int unitNumNonZero | number of non-zero elements in a sparse matrix |
| float denseRatio | density, i.e., the proportion of non-zero units: a real number between 0 and 1, where 0 means all units are zero and 1 means all units are non-zero. |
| bool isShared | flag indicating whether the data array is shared with other tensors |
| bool isInGlobalMem | flag indicating whether the data is kept in global memory rather than in a memory pool |
| bool isAllValued<br> [MAX_TENSOR_DIM_NUM] | flags indicating whether every dimension of the sparse matrix has non-zero elements |
Methods defined in the XTensor.h header are listed below; a second usage sketch follows the table:
| Functionality | Method | Parameters |
| - | - | - |
| Check whether two tensors have the<br>same data type and size | static bool IsIdentical(<br> XTensor * a, XTensor * b) | a - the first tensor to compare <br> b - the second tensor to compare |
| Check whether three tensors have the<br>same data type and size | static bool IsIdentical(<br> XTensor * a, XTensor * b, XTensor * c) | a - the first tensor to compare <br> b - the second tensor to compare <br> c - the third tensor to compare |
| Set the size of each dimension of the tensor | void SetDim(int * myDimSize) | myDimSize - size of each dimension |
| Get the size of a given dimension of the tensor | int GetDim(const int dim) | dim - the dimension index |
| Reshape the tensor | void Reshape(<br> const int order, const int * myDimSize) | order - order of the tensor <br> myDimSize - size of each dimension |
| Get the number of elements in the tensor | int GetSize() | N/A |
| Get the memory usage (in chars) | int GetDataSizeInChar() | N/A |
| Get the data unit size of<br> a given data type | int GetUnitSize(<br> TENSOR_DATA_TYPE myDataType) | myDataType - the given data type |
| Set all elements of the tensor to 0 | void SetZeroAll(XStream * stream = NULL) | stream - multi-thread stream |
| Assign the tensor from an array | void SetData(<br> const void * d, int num, int beg = 0) | d - source array <br> num - array size <br> beg - position in the tensor at which the assignment starts |
| Fill the tensor from a uniform distribution | void SetDataRand(<br> DTYPE lower, DTYPE upper) | lower - lower bound <br> upper - upper bound |
| Fill the tensor from a normal distribution | void SetDataRandn(<br> DTYPE mean, DTYPE standardDeviation) | mean - mean <br> standardDeviation - standard deviation |
| Check whether the tensor elements<br> match a given array | bool CheckData(<br> const void * answer, int num, int beg = 0) | answer - the reference array <br> num - array size <br> beg - position in the tensor at which the check starts |
| Set the elements along a given<br> dimension in ascending order | void SetAscendingOrder(int dim) | dim - the given dimension |
| Get a pointer to a cell of the tensor | void * GetCell(int * index, int size) | index - position of the cell <br> size - size of the index |
| Get a pointer to a cell of a 2-d tensor | void * GetCell2D(int ni, int mi = 0) | ni - row index <br> mi - column index |
| Get the value of a cell in a 2-d tensor | DTYPE Get2D(int ni, int mi = 0) | ni - row index <br> mi - column index |
| Get the value of a cell in a sparse tensor | DTYPE GetInSparse(int i) | i - position among the non-zero elements of the sparse matrix |
| Get the key of a tuple<br> in a sparse tensor | int GetKeyInSparse(int i) | i - position among the non-zero elements of the sparse matrix |
| Set the value of a cell<br> in a 2-d tensor | bool Set2D(DTYPE value, int ni, int mi = 0) | value - cell value <br> ni - row index <br> mi - column index |
| Add to the value of a cell<br> in a 2-d tensor | bool Add2D(DTYPE value, int ni, int mi = 0) | value - value to add <br> ni - row index <br> mi - column index |
| Get the number of non-zero<br> elements in a sparse matrix | int GetNonzeroSize() | N/A |
| Resize the tensor to a given shape | bool Resize(<br> const int myOrder, <br> const int * myDimSize, <br> const TENSOR_DATA_TYPE myDataType = DEFAULT_DTYPE, <br> const float myDenseRatio = 1.0F) | myOrder - order of the tensor <br> myDimSize - size of each dimension, index 0 is the first dimension <br> myDataType - data type of the tensor <br> myDenseRatio - density of the tensor, 1 means dense |
| Resize the tensor to a given shape<br>without allocating new memory | bool ResizeWithNoData(<br> const int myOrder, <br> const int * myDimSize, <br> const TENSOR_DATA_TYPE myDataType = DEFAULT_DTYPE, <br> const float myDenseRatio = 1.0F) | myOrder - order of the tensor <br> myDimSize - size of each dimension, index 0 is the first dimension <br> myDataType - data type of the tensor <br> myDenseRatio - density of the tensor, 1 means dense |
| Resize the tensor to the shape<br> of another tensor | bool Resize(<br> const XTensor * myTensor) | myTensor - the reference tensor |
| Find an element in a sparse matrix<br> by binary search | bool BinarySearch(<br> int key, DTYPE &value, void * &position) | key - key of the element in the sparse matrix <br> value - value of the element <br> position - position of the element |
| Flush the data to<br> a target device | void FlushToMem(XMem * targetMem) | targetMem - the target memory pool |
| Allocate memory for a tensor<br> in global memory | static void AllocateData(<br> XTensor * matrix, <br> XMem * myMem = NULL, <br> bool useBuf = false) | matrix - the tensor to allocate memory for <br> myMem - memory pool to allocate from (if any) <br> useBuf - whether to use the buffer |
| Free the memory of a tensor<br> in global memory | static void FreeData(<br> XTensor * matrix, <br> XMem * myMem = NULL, <br> bool useBuf = false) | matrix - the tensor whose memory is freed <br> myMem - memory pool the memory belongs to (if any) <br> useBuf - whether the buffer was used |
| Create a tensor in the buffer | XTensor * NewTensorBuf( <br> const int myOrder, <br> const int * myDimSize, XMem * myMem, <br> const TENSOR_DATA_TYPE myDataType = <br> X_FLOAT, const float myDenseRatio = 1.0F) | myOrder - order of the tensor <br> myDimSize - size of each dimension, index 0 is the first dimension <br> myMem - memory pool used by the tensor <br> myDataType - data type of the tensor <br> myDenseRatio - density of the tensor, 1 means dense |
| Create a new tensor by<br>copying a given tensor | XTensor * NewTensor(<br>XTensor * a, bool isFilledData = true) | a - the given tensor <br> isFilledData - whether to allocate the data array of the new tensor |
| Delete a given tensor | void DelTensor(<br>const XTensor * tensor) | tensor - the given tensor |
| Delete a given tensor<br>created in the buffer | void DelTensorBuf(<br>const XTensor * tensor) | tensor - the given tensor |
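A second small sketch (again not part of the commit) exercising the cell-level accessors listed above; the include path and the Dump output format are assumptions:

```cpp
#include <cstdio>
#include "XTensor.h"

using namespace nts;

void CellDemo()
{
    int dims[2] = {2, 2};

    XTensor a;
    a.Resize(2, dims);          /* a 2 x 2 dense tensor of the default data type */
    a.SetZeroAll();             /* all cells set to 0 */

    a.Set2D(1.0F, 0, 0);        /* a(0, 0)  = 1.0 */
    a.Add2D(0.5F, 0, 0);        /* a(0, 0) += 0.5 */

    a.Dump(stderr, "a");        /* print the tensor with the label "a" */
}
```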
......@@ -40,6 +40,10 @@ int main( int argc, const char ** argv )
fprintf(stderr, "Run this program with \"-test\" for unit test!\n");
}
XNet net;
XTensor a;
net.Backward(a);
//_CrtDumpMemoryLeaks();
return 0;
......
......@@ -23,4 +23,126 @@
namespace nts{
unsigned int netIDGlobal = 0;
MUTEX_HANDLE netMutex;
/* generate a network id */
unsigned int MakeNetID()
{
if(netIDGlobal == 0)
MUTEX_INIT(netMutex);
MUTEX_LOCK(netMutex);
netIDGlobal += 3;
unsigned int id = netIDGlobal;
MUTEX_UNLOCK(netMutex);
return id;
}
/* constructor */
XNet::XNet()
{
nodes.Clear();
}
/* de-constructor */
XNet::~XNet()
{
}
/* clear the network */
void XNet::Clear()
{
nodes.Clear();
gradNodes.Clear();
outputs.Clear();
inputs.Clear();
}
/*
backward propagation to obtain gradient wrt. the loss/error function
>> root - root node (output) of the network
>> gold - gold standard for the output
>> loss - name of loss function
*/
void XNet::Backward(XTensor &root, XTensor &gold, LOSS_FUNCTION_NAME loss)
{
XList roots(1);
roots.Add(&root);
XList golds(1);
golds.Add(&gold);
Backward(roots, golds, loss);
}
/*
backward propagation to obtain gradient wrt. the loss/error function
with a number of root nodes
>> root - a list of root nodes (output) of the network
>> gold - a list of gold standard for the output
>> loss - name of loss function
*/
void XNet::Backward(XList &roots, XList &golds, LOSS_FUNCTION_NAME loss)
{
Traverse(roots);
}
/*
traverse the net and find the topological order by
depth-first search (Tarjan's algorithm)
>> root - root node (or output of the net)
*/
void XNet::Traverse(XTensor &root)
{
XList roots(1);
roots.Add(&root);
Traverse(roots);
}
/*
traverse the net and find the topological order by
depth-first search (Tarjan's algorithm)
>> roots - a list of roots (or output nodes)
*/
void XNet::Traverse(XList &roots)
{
id = MakeNetID();
nodes.Clear();
for (int i = 0; i < roots.count; i++)
TarjanVisit((XTensor*)roots.Get(i), nodes, id);
}
/*
depth-first search given a node (Tarjan's algorithm for topological ordering)
>> node - the node to visit (mark 0:unvisited, 1:visiting, 2:done)
>> orders - topological order of the nodes
>> code - code of the network
*/
void XNet::TarjanVisit(XTensor * node, XList &orders, const unsigned int code)
{
if(node == NULL)
return;
if(node->visitMark == code + 1){
ShowNTErrors("There is a circle in the network\n");
}
else if(node->visitMark <= code || node->visitMark >= code + 2){
node->visitMark = code + 1;
XLink &income = node->income;
for(int i = 0; i < income.tailNum; i++){
XTensor * child = income.tails[i];
if(child == NULL)
continue;
TarjanVisit(child, orders, code);
}
node->visitMark = code + 2;
orders.Add(node);
}
}
}
\ No newline at end of file
......@@ -30,9 +30,31 @@
namespace nts{
/* management of tensor net (or graph) */
class XNet
struct XNet
{
public:
/* id of the network */
unsigned int id;
/* tensor nodes of the network (in order) */
XList nodes;
/* tensor nodes to keep gradient for output (e.g., SGD)*/
XList gradNodes;
/* output nodes of the network */
XList outputs;
/* input nodes of the network */
XList inputs;
/* constructor */
XNet();
/* de-constructor */
~XNet();
/* clear the network */
void Clear();
/* backward propagation to obtain gradient wrt. the loss/error function */
void Backward(XTensor &root, XTensor &gold = NULLTensor, LOSS_FUNCTION_NAME loss = NOLOSS);
......@@ -40,8 +62,24 @@ public:
/* backward propagation to obtain gradient wrt. the loss/error function
with a number of root nodes */
void Backward(XList &roots, XList &golds = NULLList, LOSS_FUNCTION_NAME loss = NOLOSS);
/* traverse the net and find the topological order by
depth-first search (Tarjan's algorithm) */
void Traverse(XTensor &root);
/* traverse the net and find the topological order by
depth-first search (Tarjan's algorithm) */
void Traverse(XList &roots);
/* depth-first search given a node (Tarjan's algorithm for topological ordering) */
void TarjanVisit(XTensor * node, XList &orders, const unsigned int code);
};
/* we make a unique id for every tensor */
extern unsigned int netIDGlobal;
extern MUTEX_HANDLE netMutex;
extern unsigned int MakeNetID();
}
#endif
\ No newline at end of file
......@@ -53,8 +53,8 @@ int main( int argc, const char ** argv )
if(argc > 1 && !strcmp(argv[1], "-test"))
Test();
else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
FNNLMMain(argc - 1, argv + 1);
//else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
// FNNLMMain(argc - 1, argv + 1);
else{
fprintf(stderr, "Thanks for using NiuTrans.Tensor! This is a library that eases the\n");
fprintf(stderr, "use of tensors. All you need is to ... \n\n");
......
......@@ -82,7 +82,7 @@ _XINLINE_ float Float16ToFloat(unsigned short h)
}
/*
data conversion
data type conversion
>> devID - device id
>> s - source data array
>> typeS - source data type
......@@ -92,7 +92,7 @@ data conversion
*/
void ConvertDataType(int devID, void * s, TENSOR_DATA_TYPE typeS, void * t, TENSOR_DATA_TYPE typeT, int size)
{
CheckNTErrors((devID < 0), "This code must be run on GPUs!");
CheckNTErrors((devID < 0), "This code must be run on CPUs!");
if(typeS == typeT)
return;
......
......@@ -37,6 +37,7 @@ XLink::XLink()
paramNum = 0;
type[0] = 0;
typeID = 0;
caculator = NULL;
}
/* deconstructor */
......@@ -59,6 +60,8 @@ void XLink::Reset()
tailNum = 0;
paramNum = 0;
type[0] = 0;
typeID = 0;
caculator = NULL;
}
/* clear it */
......@@ -68,6 +71,8 @@ void XLink::Clear()
tailNum = 0;
paramNum = 0;
type[0] = 0;
typeID = 0;
caculator = NULL;
}
/* reset tails */
......@@ -224,6 +229,7 @@ void XLink::AddParam(void * param, int size)
paramNum++;
delete[] (char*)ps;
}
/*
create a hyperedge with two input tensors and a output tensor
>> t1 - a tail tensor
......@@ -249,7 +255,7 @@ create a hyper edge with a list of tensors and a output tensor
>> h - head tensor
>> id - id of the edge type
*/
void XLink::MakeLink(XList * list, XTensor * h, int id)
void XLink::MakeLink(const XList * list, XTensor * h, int id)
{
/* forward */
XLink &income = h->income;
......@@ -302,6 +308,43 @@ void XLink::AddParamToHeadInt(XTensor * h, int param)
}
/*
add a MATRIX_TRANS_TYPE parameter
>> h - head
>> param - parameter we want introduce
*/
void XLink::AddParamToHeadTrans(XTensor * h, MATRIX_TRANS_TYPE param)
{
if(h == NULL)
return;
h->income.AddParam(&param, sizeof(MATRIX_TRANS_TYPE));
}
/*
add a boolean parameter
>> h - head
>> param - parameter we want introduce
*/
void XLink::AddParamToHeadBool(XTensor * h, bool param)
{
if(h == NULL)
return;
h->income.AddParam(&param, sizeof(bool));
}
/*
add a pointer parameter
>> h - head
>> param - parameter we want introduce
*/
void XLink::AddParamToHeadPointer(XTensor * h, void * param)
{
if(h == NULL)
return;
h->income.AddParam(&param, sizeof(param));
}
/*
replace a node with another, i.e., we redirect the links to the new node
>> oldOne - the node to be replaced
>> newOne - the new node
......
......@@ -77,6 +77,9 @@ struct XLink
/* type id */
int typeID;
/* caculator (pointer to the class for computation) */
void * caculator;
/* constuctor */
XLink();
......@@ -124,7 +127,7 @@ struct XLink
/* create a hyper edge with a list of input tensors and a output tensor */
static
void MakeLink(XList * list, XTensor * h, int id);
void MakeLink(const XList * list, XTensor * h, int id);
/* add a parameter */
static
......@@ -134,6 +137,18 @@ struct XLink
static
void AddParamToHeadInt(XTensor * h, int param);
/* add a MATRIX_TRANS_TYPE parameter */
static
void AddParamToHeadTrans(XTensor * h, MATRIX_TRANS_TYPE param);
/* add a boolean parameter */
static
void AddParamToHeadBool(XTensor * h, bool param);
/* add a pointer parameter */
static
void AddParamToHeadPointer(XTensor * h, void * param);
/* replace a node with another, i.e., we redirect the links to the new node */
static
void Replace(const XTensor * oldOne, XTensor * newOne);
......
......@@ -206,7 +206,7 @@ void XList::Insert(int pos, void * item)
}
/* get the item at position i */
void * XList::GetItem(int i)
void * XList::GetItem(int i) const
{
if( i >= 0 && i < count )
return items[i];
......
......@@ -74,7 +74,7 @@ public:
void AddList(XList * l);
void AddInt(int i);
void Insert(int pos, void * item);
void * GetItem(int i);
void * GetItem(int i) const;
int GetItemInt(int i);
void SetItem(int i, void * item);
void SetItemInt(int i, int item);
......
......@@ -27,12 +27,56 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
const char * GetOPName(int type)
{
if((type & MATH_ARITHMETIC) != 0){
if(type == MATH_SUM)
return "M_SUM";
if(type == MATH_ABSOLUTE)
return "M_ABSOLUTE";
else if(type == MATH_MATRIXMUL)
return "M_MATRIXMUL";
else if(type == MATH_MATRIXMULBATCHED)
return "M_MATRIXMULBATCHED";
else if(type == MATH_MULTIPLY)
return "M_MULTIPLY";
else if(type == MATH_NEGATE)
return "M_NEGATE";
else if(type == MATH_SIGN)
return "M_SIGN";
else if(type == MATH_SUM)
return "M_SUM";
else if(type == MATH_LOG)
return "M_LOG";
else if(type == MATH_NORMALIZE)
return "M_NORMALIZE";
else if(type == MATH_POWER)
return "M_POWER";
else if(type == MATH_SCALEANDSHIFT)
return "M_SCALEANDSHIFT";
else if(type == GETANDSET_SELECT)
return "G_SELECT";
else if(type == MOVEMENT_COPYINDEXED)
return "M_COPYINDEXED";
else if(type == MOVEMENT_COPYVALUES)
return "M_COPYVALUES";
else if(type == REDUCE_REDUCEMAX)
return "R_REDUCEMAX";
else if(type == REDUCE_REDUCEMEAN)
return "R_REDUCEMEAN";
else if(type == REDUCE_REDUCESUM)
return "R_REDUCESUM";
else if(type == REDUCE_REDUCESUMSQUARED)
return "R_REDUCESUMSQUARED";
else if(type == REDUCE_REDUCEVARIANCE)
return "R_REDUCEVARIANCE";
else if(type == SHAPE_CONCATENATE)
return "S_CONCATENATE";
else if(type == SHAPE_MERGE)
return "S_MERGE";
else if(type == SHAPE_PERMUTE)
return "S_PERMUTE";
else if(type == SHAPE_SPLIT)
return "S_SPLIT";
else if(type == SHAPE_TRANSPOSE)
return "S_TRANSPOSE";
else if(type == SHAPE_UNSQUEEZE)
return "S_UNSQUEEZE";
}
return "NULL";
......
......@@ -29,9 +29,40 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
#define MATH_ARITHMETIC 0x00001000
#define MATH_SUM MATH_ARITHMETIC + 1
#define MATH_MULTIPLY MATH_SUM + 1
#define MATH_SCALEANDSHIFT MATH_MULTIPLY + 1
#define MATH_ABSOLUTE MATH_ARITHMETIC + 1
#define MATH_MATRIXMUL MATH_ABSOLUTE + 1
#define MATH_MATRIXMULBATCHED MATH_MATRIXMUL + 1
#define MATH_MULTIPLY MATH_MATRIXMULBATCHED + 1
#define MATH_NEGATE MATH_MULTIPLY + 1
#define MATH_SIGN MATH_NEGATE + 1
#define MATH_SUM MATH_SIGN + 1
#define MATH_LOG MATH_SUM + 1
#define MATH_NORMALIZE MATH_LOG + 1
#define MATH_POWER MATH_NORMALIZE + 1
#define MATH_SCALEANDSHIFT MATH_POWER + 1
#define GETANDSET MATH_SCALEANDSHIFT + 1
#define GETANDSET_SELECT GETANDSET + 1
#define MOVEMENT GETANDSET_SELECT + 1
#define MOVEMENT_COPYINDEXED MOVEMENT + 1
#define MOVEMENT_COPYVALUES MOVEMENT_COPYINDEXED + 1
#define REDUCE MOVEMENT_COPYVALUES + 1
#define REDUCE_REDUCEMAX REDUCE + 1
#define REDUCE_REDUCEMEAN REDUCE_REDUCEMAX + 1
#define REDUCE_REDUCESUM REDUCE_REDUCEMEAN + 1
#define REDUCE_REDUCESUMSQUARED REDUCE_REDUCESUM + 1
#define REDUCE_REDUCEVARIANCE REDUCE_REDUCESUMSQUARED + 1
#define SHAPE REDUCE_REDUCEVARIANCE + 1
#define SHAPE_CONCATENATE SHAPE + 1
#define SHAPE_MERGE SHAPE_CONCATENATE + 1
#define SHAPE_PERMUTE SHAPE_MERGE + 1
#define SHAPE_SPLIT SHAPE_PERMUTE + 1
#define SHAPE_TRANSPOSE SHAPE_SPLIT + 1
#define SHAPE_UNSQUEEZE SHAPE_TRANSPOSE + 1
/* get operator name */
const char * GetOPName(int type);
......
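As a quick sanity check of the extended operator table above (an illustration only; the header name XName.h is an assumption), every code maps to a printable name through GetOPName:

```cpp
#include <cstdio>
#include "XName.h"   /* assumed header for the operator codes and GetOPName */

using namespace nts;

int main()
{
    printf("%s\n", GetOPName(MATH_SUM));          /* expected: M_SUM */
    printf("%s\n", GetOPName(REDUCE_REDUCEMAX));  /* expected: R_REDUCEMAX */
    printf("%s\n", GetOPName(SHAPE_TRANSPOSE));   /* expected: S_TRANSPOSE */
    return 0;
}
```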
......@@ -173,7 +173,7 @@ XTensor::XTensor(const XTensor &reference)
devID = reference.devID;
mem = reference.mem;
InitTensor(this, &reference);
CopyValues(&reference, this);
_CopyValues(&reference, this);
}
if(reference.isTmp)
......@@ -237,6 +237,7 @@ void XTensor::Init()
memset(isAllValued, 0, sizeof(bool) * MAX_TENSOR_DIM_NUM);
isInit = false;
isTmp = false;
visitMark = 0;
}
/* delete data arrays */
......@@ -299,7 +300,7 @@ XTensor& XTensor::operator= (const XTensor& tensor)
}
Resize(tensor.order, tensor.dimSize, tensor.dataType, tensor.denseRatio);
CopyValues(&tensor, this);
_CopyValues(&tensor, this);
}
/* copy member variables */
......@@ -344,7 +345,7 @@ judge whether the two matrices are in the same type and size
>> b - anther tensor to compare with
<< return - whether the two input tensors are identical
*/
bool XTensor::IsIdentical(XTensor * a, XTensor * b)
bool XTensor::IsIdentical(const XTensor * a, const XTensor * b)
{
if(a->order != b->order)
return false;
......@@ -426,7 +427,7 @@ void XTensor::Reshape(const int myOrder, const int * myDimSize)
}
/* get the number of items in the data array */
int XTensor::GetSize()
int XTensor::GetSize() const
{
if(isSparse)
return unitNumNonZero;
......@@ -742,7 +743,7 @@ get the pointer to a cell
>> size - size of index
<< return - pointer to the cell
*/
void * XTensor::GetCell(int index[], int size)
void * XTensor::GetCell(int index[], int size) const
{
CheckNTErrors((size == order), "Illegal index!");
......@@ -794,7 +795,7 @@ get the value of a cell in a 2d tensor in default type
>> mi - column index
<< return - value of cell(ni, mi) in float
*/
DTYPE XTensor::Get2D(int ni, int mi)
DTYPE XTensor::Get2D(int ni, int mi) const
{
CheckNTErrors((order == 2), "Cannot get a 2d cell for a tensor whose order is not 2!");
CheckNTErrors((ni >= 0 && ni < dimSize[0]), "dimension 0 is out of range!");
......@@ -1242,7 +1243,7 @@ binary search to find an element in a sparse tensor
it is the previous one if there is no hit
<< return - find it or not?
*/
bool XTensor::BinarySearch(int key, DTYPE &value, void * &position)
bool XTensor::BinarySearch(int key, DTYPE &value, void * &position) const
{
CheckNTErrors((isSparse), "A sparse tensor is required!");
CheckNTErrors((dataType == DEFAULT_DTYPE), "The tensor is not in the default type.");
......
......@@ -139,6 +139,9 @@ public:
/* indicates whether the tensor is created temporarily */
bool isTmp;
/* mark for traversing the gragh */
unsigned int visitMark;
/*
the link used to form networks. Note that when we compute on tensors, we actually create a
network where nodes are tensors and edges the connections among them. Each connection is
......@@ -198,7 +201,7 @@ public:
/* judge whether the two matrices are in the same type and size */
static
bool IsIdentical(XTensor * a, XTensor * b);
bool IsIdentical(const XTensor * a, const XTensor * b);
/* judge whether the three matrices are in the same type and size */
static
......@@ -214,7 +217,7 @@ public:
void Reshape(const int order, const int * myDimSize);
/* get the number of items in the data array */
int GetSize();
int GetSize() const;
/* get size of the memory used */
int GetDataSizeInChar();
......@@ -250,13 +253,13 @@ public:
DTYPE Get(int index[], int size = -1);
/* get the pointer to a cell */
void * GetCell(int index[], int size = -1);
void * GetCell(int index[], int size = -1) const;
/* get the default type value of a cell in a 1d tensor */
DTYPE Get1D(int i);
/* get the default type value of a cell in a 2d tensor */
DTYPE Get2D(int ni, int mi);
DTYPE Get2D(int ni, int mi) const;
/* get the default type value of a cell in a 3d tensor */
DTYPE Get3D(int d0, int d1, int d2);
......@@ -311,7 +314,7 @@ public:
bool Resize(const XTensor * myTensor);
/* binary search to find an element in a sparse matrix*/
bool BinarySearch(int key, DTYPE &value, void * &position);
bool BinarySearch(int key, DTYPE &value, void * &position) const;
/* dump data to a file */
void Dump(FILE * file, const char * label = NULL, const int n = -1, const int verbose = 0);
......
......@@ -19,6 +19,7 @@
* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
*/
#include <math.h>
#include "../../XTensor.h"
#include "Absolute.h"
#include "Absolute.cuh"
......@@ -29,12 +30,12 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
set every entry to its absolute value
>> a - the tensor we are processing
*/
void Absolute(XTensor * a)
void _Absolute(XTensor * a)
{
#ifdef USE_CUDA
/* run it on GPUs */
if (a->devID >= 0) {
CudaAbsolute(a);
_CudaAbsolute(a);
return;
}
#endif
......
......@@ -58,7 +58,7 @@ set each entry to its with float16 data type value
>> a - the tensor
*/
extern "C"
void CudaAbsolute(XTensor * a)
void _CudaAbsolute(XTensor * a)
{
CheckNTErrors((a->isSparse == false), "TODO!");
......
......@@ -35,7 +35,7 @@ void KernelAbsolute(__half * d, int size);
/* set each entry to its absolute value */
extern "C"
void CudaAbsolute(XTensor * a);
void _CudaAbsolute(XTensor * a);
#endif // USE_CUDA
......
......@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* set every entry to its absolute value */
extern "C"
void Absolute(XTensor * a);
void _Absolute(XTensor * a);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -37,8 +37,8 @@ c_i = trans(a_i) * trans(b_i) * \alpha + c_i * \beta for each i in [0,count-1]
>> alpha - scalar
>> beta - scalar
*/
void MatrixMULBatchedCPU(XList * a, MATRIX_TRANS_TYPE transposedA,
XList * b, MATRIX_TRANS_TYPE transposedB,
void _MatrixMULBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA,
const XList * b, MATRIX_TRANS_TYPE transposedB,
XList * c, DTYPE alpha, DTYPE beta)
{
CheckNTErrors((a && b && c), "Empty input lists!");
......@@ -73,11 +73,11 @@ void MatrixMULBatchedCPU(XList * a, MATRIX_TRANS_TYPE transposedA,
CheckNTErrors((ci->order == 2), "2d tensor (i.e., matrix) is required!");
#ifdef USE_BLAS
if (useBLAS)
MatrixMULCPU(ai, transposedA, bi, transposedB, ci, alpha, beta);
_MatrixMULCPU(ai, transposedA, bi, transposedB, ci, alpha, beta);
else
MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
_MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
#else
MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
_MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta);
#endif
}
//}
......
......@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* matrix multiplication in batch mode (CPU code) */
extern "C"
void MatrixMULBatchedCPU(XList * a, MATRIX_TRANS_TYPE transposedA, XList * b, MATRIX_TRANS_TYPE transposedB, XList * c,
void _MatrixMULBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA, const XList * b, MATRIX_TRANS_TYPE transposedB, XList * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -30,34 +30,34 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
matrix multiplication. For the input tensors a and b, we perform matrix multiplication
on the first two dimensions. E.g., let A be a tensor of size y * z * m and B be
a tensor of size x * y * n. For A * B, we go over each order-2 tensor of A (of size x * y)
and each order-2 tensor B (of size z * x), like this
c_{i,j} = trans(ai) * trans(bj) * alpha + c_{i,j} * beta
where trans() returns the transposed matrix if the flag is fired, ai is the i-th
element tensor of A, bj is the j-th element tensor of B, and c_{i,j} is the (i,j) element
tensor of the result C. C should be a tensor of z * x * n * m. Obviously C = A * B performs
normal matrix multiplication if A = y * z and B = x * y.
matrix multiplication
For the input tensors a and b, we perform matrix multiplication on the first two dimensions.
E.g., let A be a tensor of size y * z * m and B be a tensor of size x * y * n.
For A * B, we go over each order-2 tensor of A (of size x * y) and each order-2 tensor B (of size z * x),
like this c_{i,j} = trans(ai) * trans(bj) * alpha + c_{i,j} * beta
where trans() returns the transposed matrix if the flag is fired, ai is the i-th element tensor of A,
bj is the j-th element tensor of B, and c_{i,j} is the (i,j) element tensor of the result C.
C should be a tensor of z * x * n * m.
Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x * y.
>> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b
>> transposedB - indicates whether the matrices in b are transposed
>> c - where we keep a*b
>> alpha - a coefficient
>> beta - another coefficient
>> parallelRunner - parallel processing module
*/
void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA,
XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta,
XPRunner * parallelRunner)
void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta, XPRunner * parallelRunner)
{
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((a->dataType == b->dataType && a->dataType == c->dataType),
"Input tensors should have the same data type!");
CheckNTErrors((a->order >= 2 && b->order >= 2 && c->order >= 2),
"Input tensors must have a order > 2!");
"Input tensors must have a order >= 2!");
int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
......@@ -132,7 +132,7 @@ void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA,
XTensor * ai = (XTensor*)aList->GetItem(i);
XTensor * bi = (XTensor*)bList->GetItem(i);
XTensor * ci = (XTensor*)cList->GetItem(i);
MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta, parallelRunner);
_MatrixMul2D(ai, transposedA, bi, transposedB, ci, alpha, beta, parallelRunner);
}
}
else if (a->devID >= 0 && b->devID >= 0 && c->devID >= 0) {
......@@ -144,7 +144,7 @@ void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA,
ProtectCudaDev(a->devID, devIDBackup);
cublasHandle_t * handle = a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID);
CudaBLASMatrixMULList(handle,
_CudaBLASMatrixMULList(handle,
aList, transposedA,
bList, transposedB,
cList, aList->count,
......@@ -157,7 +157,7 @@ void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA,
}
else {
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
MatrixMULBatchedCPU(aList, transposedA,
_MatrixMULBatchedCPU(aList, transposedA,
bList, transposedB,
cList, alpha, beta);
}
......@@ -184,4 +184,74 @@ void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA,
delete bList;
delete cList;
}
/*
matrix multiplication (return a XTensor structure)
make a new tensor to keep the result and return it
For the input tensors a and b, we perform matrix multiplication on the first two dimensions.
E.g., let A be a tensor of size y * z * m and B be a tensor of size x * y * n.
For A * B, we go over each order-2 tensor of A (of size x * y) and each order-2 tensor B (of size z * x),
like this c_{i,j} = trans(ai) * trans(bj) * alpha + c_{i,j} * beta
where trans() returns the transposed matrix if the flag is fired, ai is the i-th element tensor of A,
bj is the j-th element tensor of B, and c_{i,j} is the (i,j) element tensor of the result C.
The result C should be a tensor of z * x * n * m.
Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x * y.
>> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b
>> transposedB - indicates whether the matrices in b are transposed
>> alpha - a coefficient
>> beta - another coefficient
>> parallelRunner - parallel processing module
<< return - the result of matrix multiplication
*/
XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
DTYPE alpha, DTYPE beta, XPRunner * parallelRunner)
{
CheckNTErrors((&a && &b), "Empty input tensors!");
CheckNTErrors((a.dataType == b.dataType), "Input tensors should have the same data type!");
CheckNTErrors((a.order >= 2 && b.order >= 2), "Input tensors must have a order >= 2!");
int an = transposedA == X_TRANS ? a.dimSizeRDI[0] : a.dimSizeRDI[1];
int am = transposedA == X_TRANS ? a.dimSizeRDI[1] : a.dimSizeRDI[0];
int bn = transposedB == X_TRANS ? b.dimSizeRDI[0] : b.dimSizeRDI[1];
int bm = transposedB == X_TRANS ? b.dimSizeRDI[1] : b.dimSizeRDI[0];
CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
int order = a.order + b.order - 2;
int sub = 0;
int * dimSize = new int[order];
for (int i = 2; i < a.order; i++)
dimSize[sub++] = a.dimSizeRDI[i];
for (int i = 2; i < b.order; i++)
dimSize[sub++] = b.dimSizeRDI[i];
dimSize[sub++] = an;
dimSize[sub++] = bm;
XTensor c = NewTensor(order, dimSize, a.dataType, a.denseRatio, a.devID, a.mem);
c.SetZeroAll();
c.SetTMP();
/* call _MatrixMul function */
_MatrixMul(&a, transposedA, &b, transposedB, &c, alpha, beta, parallelRunner);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MATRIXMUL);
XLink::AddParamToHeadTrans(&c, transposedA);
XLink::AddParamToHeadTrans(&c, transposedB);
XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHead(&c, beta);
/* destroy variables */
delete[] dimSize;
return c;
}
} // namespace nts(NiuTrans.Tensor)
......@@ -27,18 +27,34 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
matrix multiplication. For the input tensors a and b, we perform matrix multiplication
on the first two dimensions. E.g., let A be a tensor of size y * z * m and B be
a tensor of size x * y * n. For A * B, we go over each order-2 tensor of A (of size x * y)
and each order-2 tensor B (of size z * x), like this
c_{i,j} = trans(ai) * trans(bj) * alpha + c_{i,j} * beta
where trans() returns the transposed matrix if the flag is fired, ai is the i-th
element tensor of A, bj is the j-th element tensor of B, and c_{i,j} is the (i,j) element
tensor of the result C. C should be a tensor of z * x * n * m. Obviously C = A * B performs
normal matrix multiplication if A = y * z and B = x * y.
matrix multiplication
For the input tensors a and b, we perform matrix multiplication on the first two dimensions.
E.g., let A be a tensor of size y * z * m and B be a tensor of size x * y * n.
For A * B, we go over each order-2 tensor of A (of size x * y) and each order-2 tensor B (of size z * x),
like this c_{i,j} = trans(ai) * trans(bj) * alpha + c_{i,j} * beta
where trans() returns the transposed matrix if the flag is fired, ai is the i-th element tensor of A,
bj is the j-th element tensor of B, and c_{i,j} is the (i,j) element tensor of the result C.
C should be a tensor of z * x * n * m.
Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x * y.
*/
void _MatrixMul(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
/*
matrix multiplication (return a XTensor structure)
make a new tensor c to keep the result and return it
For the input tensors a and b, we perform matrix multiplication on the first two dimensions.
E.g., let A be a tensor of size y * z * m and B be a tensor of size x * y * n.
For A * B, we go over each order-2 tensor of A (of size x * y) and each order-2 tensor B (of size z * x),
like this c_{i,j} = trans(ai) * trans(bj) * alpha + c_{i,j} * beta
where trans() returns the transposed matrix if the flag is fired, ai is the i-th element tensor of A,
bj is the j-th element tensor of B, and c_{i,j} is the (i,j) element tensor of the result C.
C should be a tensor of z * x * n * m.
Obviously C = A * B performs normal matrix multiplication if A = y * z and B = x * y.
*/
extern "C"
void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
XTensor MatrixMul(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
} // namespace nts(NiuTrans.Tensor)
......
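A sketch of how the old pointer-based interface and the new value-returning interface declared above might be called (not part of this commit; the include path and the tensor setup calls are assumptions taken from the manual tables):

```cpp
#include "XTensor.h"
#include "MatrixMul.h"   /* assumed include path for the declarations above */

using namespace nts;

void MatrixMulDemo()
{
    int dimsA[2] = {2, 3};
    int dimsB[2] = {3, 4};
    int dimsC[2] = {2, 4};

    XTensor a, b, c;
    a.Resize(2, dimsA);
    b.Resize(2, dimsB);
    c.Resize(2, dimsC);
    a.SetDataRand(-1.0F, 1.0F);
    b.SetDataRand(-1.0F, 1.0F);

    /* old style: the caller owns c and passes pointers */
    _MatrixMul(&a, X_NOTRANS, &b, X_NOTRANS, &c);

    /* new style: the result tensor is created, linked into the network and returned by value */
    XTensor d = MatrixMul(a, X_NOTRANS, b, X_NOTRANS);
}
```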
......@@ -30,8 +30,10 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
matrix multiplication (for 2d tensors)
c = trans(a) * trans(b) * alpha + c * beta
where trans() returns the transposed matrix if the flag is fired
>> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b
......@@ -42,8 +44,8 @@ where trans() return the transposed matrix if the flag is fired
>> parallelRunner - parallel processing module
>> stream - the string for creating the job pipeline
*/
void MatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA,
XTensor * b, MATRIX_TRANS_TYPE transposedB,
void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta,
XPRunner * parallelRunner, XStream * stream)
{
......@@ -67,7 +69,7 @@ void MatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA,
#ifdef USE_CUDA
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
CudaMatrixMul2D(a, transposedA, b, transposedB, c, alpha, beta, stream);
_CudaMatrixMul2D(a, transposedA, b, transposedB, c, alpha, beta, stream);
return;
}
#endif
......@@ -81,9 +83,9 @@ void MatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA,
c->dataType == DEFAULT_DTYPE)
{
if (useBLAS)
MatrixMULCPU(a, transposedA, b, transposedB, c, alpha, beta);
_MatrixMULCPU(a, transposedA, b, transposedB, c, alpha, beta);
else
MatrixMul2DParallel(a, transposedA, b, transposedB, c, alpha, beta, parallelRunner);
_MatrixMul2DParallel(a, transposedA, b, transposedB, c, alpha, beta, parallelRunner);
}
else {
// TODO!!
......
......@@ -108,8 +108,10 @@ void KernelMatrixMulDenseMSparseMV2(DTYPE * a, MATRIX_TRANS_TYPE transposedA, in
/*
matrix multiplication (for 2d tensors) (cuda version)
c = trans(a) * trans(b) * alpha + c * beta
where trans() returns the transposed matrix if the flag is fired
>> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b
......@@ -119,8 +121,8 @@ where trans() return the transposed matrix if the flag is fired
>> beta - another coefficient
>> stream - the string for creating the job pipeline
*/
void CudaMatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA,
XTensor * b, MATRIX_TRANS_TYPE transposedB,
void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c,
DTYPE alpha, DTYPE beta, XStream * stream)
{
......@@ -156,7 +158,7 @@ void CudaMatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA,
cublasSetStream(*handle, stream->stream);
if (a->dataType == X_FLOAT && b->dataType == X_FLOAT && c->dataType == X_FLOAT) {
CudaBLASMatrixMUL(handle, a->data, transposedA, a->dataType, b->data, transposedB, a->dataType, c->data, c->dataType,
_CudaBLASMatrixMUL(handle, a->data, transposedA, a->dataType, b->data, transposedB, a->dataType, c->data, c->dataType,
a->dimSize[0], a->dimSize[1], b->dimSize[0], b->dimSize[1], c->dimSize[0], c->dimSize[1],
alpha, beta);
}
......
......@@ -43,7 +43,7 @@ c = trans(a) * trans(b) * alpha + c * beta
where trans() returns the transposed matrix if the flag is fired
*/
extern "C"
void CudaMatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XStream * stream = NULL);
#endif // USE_CUDA
......
......@@ -31,8 +31,7 @@ matrix multiplication (for 2d tensors)
c = trans(a) * trans(b) * alpha + c * beta
where trans() returns the transposed matrix if the flag is fired
*/
extern "C"
void MatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL, XStream * stream = NULL);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -38,7 +38,7 @@ argument5: matrix a
argument6: matrix b
argument7: matrix c (c=a*b*\alpha + c*beta)
*/
void MatrixMul2DMultiTheading(XList * args)
void _MatrixMul2DMultiTheading(XList * args)
{
int x1 = *(int*)args->GetItem(0);
int y1 = *(int*)args->GetItem(1);
......
......@@ -31,7 +31,7 @@ matrix multiplication for a block (x1,y1) - (x2,y2)
where (x1,y1) is the upper-left corner and (x2,y2) is the bottom-right corner
*/
extern "C"
void MatrixMul2DMultiTheading(XList * args);
void _MatrixMul2DMultiTheading(XList * args);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -30,6 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
matrix multiplication (for 2d tensors) with multi-threading
c = trans(a) * trans(b) * alpha + c * beta
where trans() returns the transposed matrix if the flag is fired
>> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b
......@@ -39,10 +40,9 @@ where trans() return the transposed matrix if the flag is fired
>> beta - another coefficient
>> parallelRunner - parallel processing module
*/
void MatrixMul2DParallel(XTensor * a, MATRIX_TRANS_TYPE transposedA,
XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta,
XPRunner * parallelRunner)
void _MatrixMul2DParallel(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta, XPRunner * parallelRunner)
{
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((a->order == 2 && b->order == 2 && c->order == 2),
......@@ -56,7 +56,7 @@ void MatrixMul2DParallel(XTensor * a, MATRIX_TRANS_TYPE transposedA,
/* a * b */
if (transposedA == X_NOTRANS && transposedB == X_NOTRANS) {
RunParallel2D(parallelRunner, (void*)MatrixMul2DMultiTheading, an * am * bm,
RunParallel2D(parallelRunner, (void*)_MatrixMul2DMultiTheading, an * am * bm,
cn, cm, 5,
a, b, c, &alpha, &beta);
}
......
......@@ -27,12 +27,12 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
matrix multiplication (for 2d tensors) with multi-threading
matrix multiplication (for 2d tensors) with multi-threading.
c = trans(a) * trans(b) * alpha + c * beta
where trans() returns the transposed matrix if the flag is fired
where trans() returns the transposed matrix if the flag is fired.
*/
extern "C"
void MatrixMul2DParallel(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
void _MatrixMul2DParallel(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -30,10 +30,12 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
matrix multiplication of the two tensors
for each 2-dimensional data array in a (denoted as ai) and
each 2-dimensional data array in b (denoted as bi), we have
ci = trans(ai) * trans(bi) * alpha + cm * beta
where trans() returns the transposed matrix if the flag is fired
>> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b
......@@ -43,8 +45,8 @@ where trans() returns the transposed matrix if the flag is fired
>> beta - another coefficient
>> parallelRunner - parallel processing module
*/
void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
XTensor * b, MATRIX_TRANS_TYPE transposedB,
void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta,
XPRunner * parallelRunner)
{
......@@ -52,7 +54,9 @@ void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
CheckNTErrors((a->dataType == b->dataType && a->dataType == c->dataType),
"Input tensors should have the same data type!");
CheckNTErrors((a->order >= 2 && b->order >= 2 && c->order >= 2),
"Input tensors must have a order > 2!");
"Input tensors must have a order >= 2!");
CheckNTErrors((a->order == b->order && a->order == c->order),
"Input tensor and output tensor must have same order!");
int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
......@@ -109,7 +113,7 @@ void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
ProtectCudaDev(a->devID, devIDBackup);
cublasHandle_t * handle = a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID);
CudaBLASMatrixMULList(handle,
_CudaBLASMatrixMULList(handle,
aList, transposedA,
bList, transposedB,
cList, aList->count,
......@@ -122,7 +126,7 @@ void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
}
else {
CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
MatrixMULBatchedCPU(aList, transposedA,
_MatrixMULBatchedCPU(aList, transposedA,
bList, transposedB,
cList, alpha, beta);
}
......@@ -150,4 +154,65 @@ void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
delete cList;
}
/*
matrix multiplication of the two tensors (return a XTensor structure)
make a new tensor to keep the result and return it
for each 2-dimensional data array in a (denoted as ai) and
each 2-dimensional data array in b (denoted as bi), we have
ci = trans(ai) * trans(bi) * alpha + cm * beta
where trans() returns the transposed matrix if the flag is fired.
>> a - tensor a
>> transposedA - indicates whether the matrices in a are transposed
>> b - tensor b
>> transposedB - indicates whether the matrices in b are transposed
>> alpha - a coefficient
>> beta - another coefficient
>> parallelRunner - parallel processing module
<< return - the result of matrix multiplication of the two tensors
*/
XTensor MatrixMulBatched(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
DTYPE alpha, DTYPE beta, XPRunner * parallelRunner)
{
CheckNTErrors((&a && &b), "Empty input tensors!");
CheckNTErrors(a.dataType == b.dataType, "Input tensors should have the same data type!");
CheckNTErrors((a.order >= 2 && b.order >= 2), "Input tensors must have a order >= 2!");
CheckNTErrors(a.order == b.order, "Input tensor and output tensor must have same order!");
int an = transposedA == X_TRANS ? a.dimSizeRDI[0] : a.dimSizeRDI[1];
int am = transposedA == X_TRANS ? a.dimSizeRDI[1] : a.dimSizeRDI[0];
int bn = transposedB == X_TRANS ? b.dimSizeRDI[0] : b.dimSizeRDI[1];
int bm = transposedB == X_TRANS ? b.dimSizeRDI[1] : b.dimSizeRDI[0];
CheckNTErrors(am == bn, "Unmatched tensors in multiplication!");
int order = a.order;
int sub = 0;
int * dimSize = new int[order];
for (int i = 2; i < a.order; i++)
dimSize[sub++] = a.dimSizeRDI[i];
dimSize[sub++] = an;
dimSize[sub++] = bm;
XTensor c = NewTensor(order, dimSize, a.dataType, a.denseRatio, a.devID, a.mem);
c.SetZeroAll();
c.SetTMP();
/* call _MatrixMulBatched function */
_MatrixMulBatched(&a, transposedA, &b, transposedB, &c, alpha, beta, parallelRunner);
/* tensor connections */
XLink::MakeLink(&a, &b, &c, MATH_MATRIXMULBATCHED);
XLink::AddParamToHeadTrans(&c, transposedA);
XLink::AddParamToHeadTrans(&c, transposedB);
XLink::AddParamToHead(&c, alpha);
XLink::AddParamToHead(&c, beta);
/* destroy variables */
delete[] dimSize;
return c;
}
} // namespace nts(NiuTrans.Tensor)
......@@ -28,13 +28,25 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
matrix multiplication of the two tensors
for each 2-dimensional data array in a (denoted as ai) and
each 2-dimensional data array in b (denoted as bi), we have
ci = trans(ai) * trans(bi) * alpha + cm * beta
where trans() returns the transposed matrix if the flag is fired
*/
void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
/*
matrix multiplication of the two tensors (return a XTensor structure)
make a new tensor to keep the result and return it
for each 2-dimensional data array in a (denoted as ai) and
each 2-dimensional data array in b (denoted as bi), we have
ci = trans(ai) * trans(bi) * alpha + cm * beta
where trans() returns the transposed matrix if the flag is fired
*/
extern "C"
void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
XTensor MatrixMulBatched(const XTensor &a, MATRIX_TRANS_TYPE transposedA, const XTensor &b, MATRIX_TRANS_TYPE transposedB,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -28,14 +28,15 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
element-wise product of two tensors
c(i) = a(i)*b(i) + \alpha * c(i)
where i is the index of the item
>> a - matrix a
>> b - matrix b
>> c - result matrix
>> alpha - the coefficient
>> leadingDim - the dimension along which we perform broadcasting
>>
*/
void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
{
......@@ -121,9 +122,12 @@ void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, i
}
/*
element-wise product of two tensors and keep the result in the input
element-wise product of two tensors (do it on site)
keep the result in the input tensor a and return nothing
a(i) = a(i)*b(i) + \alpha * a(i)
where i is the index of the item
>> a - tensor a (where keep the result)
>> b - tensor b
>> alpha - the coefficient
......@@ -135,9 +139,12 @@ void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha, int leadingDim)
}
/*
make a tensor of the element-wise product for two input tensors:
element-wise product of two tensors (return a XTensor structure)
make a new tensor c to keep the result and return it
c(i) = a(i)*b(i) + \alpha * c(i)
where i is the index of the item
>> a - tensor a
>> b - tensor b
>> alpha - the coefficient
......@@ -151,7 +158,7 @@ XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha, int leadingDim
XTensor c(&a);
c.SetTMP();
/* computation */
/* call _Multiply function */
_Multiply(&a, &b, &c, alpha, leadingDim);
/* tensor connections */
......
......@@ -26,19 +26,27 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* element-wise product of two tensors:
c(i) = a(i)*b(i) + \alpha * c(i)
where i is the index of the element */
/*
element-wise product of two tensors:
c(i) = a(i)*b(i) + \alpha * c(i)
where i is the index of the element
*/
void _Multiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha = 0, int leadingDim = 0);
/* element-wise product of two tensors and keep the result in the input tensor:
a(i) = a(i)*b(i) + \alpha * a(i)
where i is the index of the element */
/*
element-wise product of two tensors (do it on site)
keep the result in the input tensor a and return nothing
a(i) = a(i)*b(i) + \alpha * a(i)
where i is the index of the element
*/
void _MultiplyMe(XTensor * a, const XTensor * b, DTYPE alpha = 0, int leadingDim = 0);
/* make a tensor of the element-wise product for two input tensors:
c(i) = a(i)*b(i) + \alpha * c(i)
where i is the index of the element */
/*
element-wise product of two tensors (return a XTensor structure)
make a new tensor to keep the result and return it
c(i) = a(i)*b(i) + \alpha * c(i)
where i is the index of the element
*/
XTensor Multiply(const XTensor &a, const XTensor &b, DTYPE alpha = 0, int leadingDim = 0);
} // namespace nts(NiuTrans.Tensor)
......
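The three element-wise product interfaces declared above can be called as follows (a sketch; the tensor setup and include path are assumptions):

```cpp
#include "XTensor.h"
#include "Multiply.h"   /* assumed include path for the declarations above */

using namespace nts;

void MultiplyDemo()
{
    int dims[2] = {2, 3};

    XTensor a, b;
    a.Resize(2, dims);
    b.Resize(2, dims);
    a.SetDataRand(-1.0F, 1.0F);
    b.SetDataRand(-1.0F, 1.0F);

    XTensor c = Multiply(a, b);   /* new tensor: c(i) = a(i) * b(i)              */
    _Multiply(&a, &b, &c);        /* overwrite an existing c: c(i) = a(i) * b(i) */
    _MultiplyMe(&a, &b);          /* in place: a(i) = a(i) * b(i)                */
}
```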
......@@ -29,12 +29,12 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
set every entry to its minus value
>> a - the tensor we are processing
*/
void Negate(XTensor * a)
void _Negate(XTensor * a)
{
#ifdef USE_CUDA
/* run it on GPUs */
if (a->devID >= 0) {
CudaNegate(a);
_CudaNegate(a);
return;
}
#endif
......
......@@ -66,7 +66,7 @@ set each entry to its negtive value
>> a - the tensor
*/
extern "C"
void CudaNegate(XTensor * a)
void _CudaNegate(XTensor * a)
{
CheckNTErrors((a->isSparse == false), "TODO!");
......
......@@ -19,6 +19,9 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#ifndef __NEGATE_CUH__
#define __NEGATE_CUH__
#include "Negate.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -35,8 +38,10 @@ void KernelNegate(__half * d, int size);
/* set each entry to its negtive value */
extern "C"
void CudaNegate(XTensor * a);
void _CudaNegate(XTensor * a);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif // __NEGATE_CUH__
\ No newline at end of file
......@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* set every entry to its minus value */
extern "C"
void Negate(XTensor * a);
void _Negate(XTensor * a);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -29,12 +29,12 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
set every entry to its sign value
>> a - the tensor we are processing
*/
void Sign(XTensor * a)
void _Sign(XTensor * a)
{
#ifdef USE_CUDA
/* run it on GPUs */
if (a->devID >= 0) {
CudaSign(a);
_CudaSign(a);
return;
}
#endif
......
......@@ -64,7 +64,7 @@ set each entry to its with float16 data type value
>> a - the tensor
*/
extern "C"
void CudaSign(XTensor * a)
void _CudaSign(XTensor * a)
{
CheckNTErrors((a->isSparse == false), "TODO!");
......
......@@ -19,6 +19,9 @@
* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
*/
#ifndef __SIGN_CUH__
#define __SIGN_CUH__
#include "Sign.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -35,8 +38,10 @@ void KernelSign(__half * d, int size);
/* set each entry to its sign value */
extern "C"
void CudaSign(XTensor * a);
void _CudaSign(XTensor * a);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif // __SIGN_CUH__
\ No newline at end of file
......@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* set every entry to its sign value */
extern "C"
void Sign(XTensor * a);
void _Sign(XTensor * a);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -29,7 +29,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
tensor summation c = a + b * \beta
return a pointer
>> a - a tensor
>> b - another tensor
>> c - where we put a+b*\beta. we save it in a if c is NULL
......@@ -112,8 +112,9 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
}
/*
tensor summation a = a + b * \beta
do it on site
tensor summation a = a + b * \beta (do it on site)
keep the result in the tensor a and return nothing
>> a - a tensor
>> b - another tensor
>> beta - the scaling factor
......@@ -124,18 +125,20 @@ void _SumMe(XTensor * a, const XTensor * b, DTYPE beta)
}
/*
tensor summation a = a + b * \beta
return a XTensor structure
tensor summation c = a + b * \beta (return a XTensor structure)
make a new tensor c to keep the result and return it
>> a - a tensor
>> b - another tensor
>> beta - the scaling factor
<< return - the result of tensor summation
*/
XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta)
{
XTensor c(&a);
c.SetTMP();
/* computation */
/* call _Sum function */
_Sum(&a, &b, &c, beta);
/* tensor connections */
......
......@@ -29,10 +29,16 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* tensor summation c = a + b * \beta */
void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
/* tensor summation a = a + b * \beta (return a pointer) */
/*
tensor summation a = a + b * \beta
keep the result in the input tensor a and return nothing
*/
void _SumMe(XTensor * a, const XTensor * b, DTYPE beta = (DTYPE)1.0);
/* tensor summation c = a + b * \beta (return a structure) */
/*
tensor summation c = a + b * \beta
make a new tensor c to keep the result and return it
*/
XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta = (DTYPE)1.0);
} // namespace nts(NiuTrans.Tensor)
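A small sketch (not part of the commit) showing the three entry points declared above; it assumes a, b and c are dense tensors of the same shape:

XTensor c = Sum(a, b, 0.5F);   /* new tensor:  c = a + b * 0.5 */
_Sum(&a, &b, &c);              /* write a + b into an existing tensor c */
_SumMe(&a, &b);                /* in place:    a = a + b */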
......
......@@ -37,11 +37,8 @@ where b is a vector.
>> c - where we put a+b. we save it in a if c is NULL
>> beta - the scaling factor
*/
void SumByColumnTV(XTensor * a, XTensor * b, XTensor * c, DTYPE beta)
void _SumByColumnTV(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{
if (c == NULL)
c = a;
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((XTensor::IsIdentical(a, c)), "Unmatched tensors in addition!");
CheckNTErrors((b->order == 2 && b->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]),
......@@ -56,7 +53,7 @@ void SumByColumnTV(XTensor * a, XTensor * b, XTensor * c, DTYPE beta)
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
#ifdef USE_CUDA
CudaSumByColumnTV(a, b, c, beta);
_CudaSumByColumnTV(a, b, c, beta);
#endif
}
else {
......
......@@ -64,11 +64,8 @@ where b is a vector.
>> c - where we put a+b. we save it in a if c is NULL
>> beta - the scaling factor
*/
void CudaSumByColumnTV(XTensor * a, XTensor * b, XTensor * c, DTYPE beta)
void _CudaSumByColumnTV(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{
if (c == NULL)
c = a;
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((XTensor::IsIdentical(a, c)), "Unmatched tensors in addition!");
CheckNTErrors((b->order == 2 && b->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]),
......
......@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* summation of a tensor and a vector (column vector) */
extern "C"
void CudaSumByColumnTV(XTensor * a, XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
void _CudaSumByColumnTV(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
#endif // USE_CUDA
......
......@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* sum of a tensor and a (column) vector */
extern "C"
void SumByColumnTV(XTensor * a, XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
void _SumByColumnTV(const XTensor * a, const XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -37,11 +37,8 @@ where c and a are vectors, and b_col is a column in b.
>> c - where we put a+b. we save it in a if c is NULL
>> beta - the scaling factor
*/
void SumByColumnVT(XTensor * a, XTensor * b, XTensor * c, DTYPE beta)
void _SumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{
if (c == NULL)
c = a;
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((XTensor::IsIdentical(a, c)), "Unmatched tensors in addition!");
CheckNTErrors((a->order == 2 && a->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]),
......@@ -49,7 +46,7 @@ void SumByColumnVT(XTensor * a, XTensor * b, XTensor * c, DTYPE beta)
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
#ifdef USE_CUDA
CudaSumByColumnVT(a, b, c, beta);
_CudaSumByColumnVT(a, b, c, beta);
#endif
}
else {
......
......@@ -80,11 +80,8 @@ where c and a are vectors, and b_col is a column in b.
>> c - where we put a+b. we save it in a if c is NULL
>> beta - the scaling factor
*/
void CudaSumByColumnVT(XTensor * a, XTensor * b, XTensor * c, DTYPE beta)
void _CudaSumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
{
if (c == NULL)
c = a;
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((XTensor::IsIdentical(a, c)), "Unmatched tensors in addition!");
CheckNTErrors((a->order == 2 && a->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]),
......
......@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* summation of a vector (column vector) and a tensor */
extern "C"
void CudaSumByColumnVT(XTensor * a, XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
void _CudaSumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
#endif // USE_CUDA
......
......@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* sum of a (column) vector and a tensor */
extern "C"
void SumByColumnVT(XTensor * a, XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
void _SumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -36,8 +36,8 @@ c = trans(a) * trans(b) * \alpha + c * \beta
>> beta - scalar
>> c - output matrix (2d tensor)
*/
void MatrixMULCPU(XTensor * a, MATRIX_TRANS_TYPE transposedA,
XTensor * b, MATRIX_TRANS_TYPE transposedB,
void _MatrixMULCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta)
{
CheckNTErrors((a && b && c), "Empty input tensors!");
......
......@@ -31,9 +31,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
matrix multiplication via cuda version BLAS
*/
void CudaBLASMatrixMUL(cublasHandle_t * handle,
void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
void _CudaBLASMatrixMUL(cublasHandle_t * handle,
const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
void * c, TENSOR_DATA_TYPE dataTypeC,
int na, int ma, int nb, int mb, int nc, int mc,
DTYPE alpha, DTYPE beta)
......@@ -88,7 +88,7 @@ void CudaBLASMatrixMUL(cublasHandle_t * handle,
/*
matrix multiplication via cuda version BLAS
*/
void CudaBLASMatrixMULBatched(cublasHandle_t * handle,
void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
const void ** a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
const void ** b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
void ** c, TENSOR_DATA_TYPE dataTypeC,
......@@ -144,7 +144,7 @@ void CudaBLASMatrixMULBatched(cublasHandle_t * handle,
/* matrix multiplication in batch and strided mode via cuda version BLAS */
extern "C"
void CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, long long int strideA,
const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, long long int strideB,
void * c, TENSOR_DATA_TYPE dataTypeC, long long int strideC,
......@@ -201,9 +201,9 @@ void CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
/*
matrix multiplication via cuda version BLAS
*/
void CudaBLASMatrixMULList(cublasHandle_t * handle,
XList * a, MATRIX_TRANS_TYPE transposedA,
XList * b, MATRIX_TRANS_TYPE transposedB,
void _CudaBLASMatrixMULList(cublasHandle_t * handle,
const XList * a, MATRIX_TRANS_TYPE transposedA,
const XList * b, MATRIX_TRANS_TYPE transposedB,
XList * c,
int count, DTYPE alpha, DTYPE beta)
{
......@@ -255,7 +255,7 @@ void CudaBLASMatrixMULList(cublasHandle_t * handle,
if (isUniform) {
XMem * mem = a0->mem;
if (isStrided && a->count > 1) {
CudaBLASMatrixMULBatchedStrided(handle,
_CudaBLASMatrixMULBatchedStrided(handle,
a0->data, transposedA, a0->dataType, strideA / a0->unitSize,
b0->data, transposedB, b0->dataType, strideB / b0->unitSize,
c0->data, c0->dataType, strideC / c0->unitSize, a->count,
......@@ -297,7 +297,7 @@ void CudaBLASMatrixMULList(cublasHandle_t * handle,
cudaMemcpy(bpGPU, bp, sizeof(DTYPE*) * b->count, cudaMemcpyHostToDevice);
cudaMemcpy(cpGPU, cp, sizeof(DTYPE*) * c->count, cudaMemcpyHostToDevice);
CudaBLASMatrixMULBatched(handle,
_CudaBLASMatrixMULBatched(handle,
(const void**)apGPU, transposedA, a0->dataType,
(const void**)bpGPU, transposedB, b0->dataType,
(void**)cpGPU, c0->dataType, a->count,
......@@ -324,7 +324,7 @@ void CudaBLASMatrixMULList(cublasHandle_t * handle,
XTensor * bi = (XTensor*)b->GetItem(i);
XTensor * ci = (XTensor*)c->GetItem(i);
CudaBLASMatrixMUL(handle,
_CudaBLASMatrixMUL(handle,
ai->data, transposedA, ai->dataType,
bi->data, transposedB, bi->dataType,
ci->data, ci->dataType,
......
......@@ -28,21 +28,21 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* matrix multiplication (BLAS) */
extern "C"
void MatrixMULCPU(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
void _MatrixMULCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
#ifdef USE_CUDA
/* matrix multiplication via cuda version BLAS */
extern "C"
void CudaBLASMatrixMUL(cublasHandle_t * handle,
void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
void _CudaBLASMatrixMUL(cublasHandle_t * handle,
const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
void * c, TENSOR_DATA_TYPE dataTypeC,
int na, int ma, int nb, int mb, int nc, int mc, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
/* matrix multiplication in batch mode via cuda version BLAS */
extern "C"
void CudaBLASMatrixMULBatched(cublasHandle_t * handle,
void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
const void ** a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
const void ** b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
void ** c, TENSOR_DATA_TYPE dataTypeC,
......@@ -50,7 +50,7 @@ void CudaBLASMatrixMULBatched(cublasHandle_t * handle,
/* matrix multiplication in batch and strided mode via cuda version BLAS */
extern "C"
void CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, long long int strideA,
const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, long long int strideB,
void * c, TENSOR_DATA_TYPE dataTypeC, long long int strideC,
......@@ -58,7 +58,7 @@ void CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
/* matrix multiplication in batch mode via cuda version BLAS */
extern "C"
void CudaBLASMatrixMULList(cublasHandle_t * handle, XList * a, MATRIX_TRANS_TYPE transposedA, XList * b, MATRIX_TRANS_TYPE transposedB, XList * c,
void _CudaBLASMatrixMULList(cublasHandle_t * handle, const XList * a, MATRIX_TRANS_TYPE transposedA, const XList * b, MATRIX_TRANS_TYPE transposedB, XList * c,
int count, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
#endif
......
......@@ -30,15 +30,15 @@ convert data type
>> input - input tensor
>> output - output tensor
*/
void ConvertTensorDataType(XTensor * input, XTensor * output)
void _ConvertDataType(const XTensor * input, XTensor * output)
{
CheckNTErrors(XTensor::IsIdentical(input, output), "Input and Output are different in type or size!");
CheckNTErrors((input->unitSize == output->unitSize), "Input and Output must be same in size!");
if (input->dataType == output->dataType)
return;
#ifdef USE_CUDA
/* run it on GPUs */
if (input->devID >= 0) {
CudaConvertDataType(input, output);
_CudaConvertDataType(input, output);
return;
}
#endif
......
......@@ -78,7 +78,7 @@ data conversion (cuda code)
>> typeT - target data type
>> size - number of the items in s (and t)
*/
void CudaConvertDataType(int devID, void * s, TENSOR_DATA_TYPE typeS, void * t, TENSOR_DATA_TYPE typeT, int size)
void _CudaConvertDataType(int devID, void * s, TENSOR_DATA_TYPE typeS, void * t, TENSOR_DATA_TYPE typeT, int size)
{
CheckNTErrors((devID >= 0), "This code must be run on GPUs!");
......@@ -112,9 +112,9 @@ convert data type (cuda code)
>> input - input tensor
>> output - output tensor
*/
void CudaConvertDataType(XTensor * input, XTensor * output)
void _CudaConvertDataType(const XTensor * input, XTensor * output)
{
CheckNTErrors(XTensor::IsIdentical(input, output), "Input and Output are different in type or size!");
CheckNTErrors((input->unitSize == output->unitSize), "Input and Output must be same in size!");
if (input->dataType == output->dataType)
return;
......
......@@ -19,6 +19,9 @@
* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
*/
#ifndef __CONVERTDATATYPE_CUH__
#define __CONVERTDATATYPE_CUH__
#include "ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -42,8 +45,10 @@ __global__
void KernelIntToFloat(int * inputData, float * outputData, int size);
/* convert data type */
void CudaConvertDataType(XTensor * input, XTensor * output);
void _CudaConvertDataType(const XTensor * input, XTensor * output);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif // __CONVERTDATATYPE_CUH__
\ No newline at end of file
......@@ -27,7 +27,7 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* convert data type */
void ConvertDataType(XTensor * input, XTensor * output);
void _ConvertDataType(const XTensor * input, XTensor * output);
} // namespace nts(NiuTrans.Tensor)
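A hedged sketch of how the renamed _ConvertDataType might be called; the X_FLOAT/X_INT enum values and the Resize overload taking a data type are assumptions, and note that the function requires matching unit sizes (float and int are both 4 bytes):

/* hypothetical example: convert a float tensor into an int tensor of the same shape */
int dims[2] = {2, 3};
XTensor a, b;
a.Resize(2, dims, X_FLOAT);    /* X_FLOAT/X_INT are assumed enum values of TENSOR_DATA_TYPE */
b.Resize(2, dims, X_INT);
a.SetDataRand(0.0F, 10.0F);
_ConvertDataType(&a, &b);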
......
......@@ -26,8 +26,10 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
generate a tensor with seleccted data in range[low,high] along the given dimension
generate a tensor with selected data in range[low,high] along the given dimension
c = select(a)
>> a - input tensor
>> c - result tensor
>> dim - the dimension along with which we do the job
......@@ -35,7 +37,7 @@ c = select(a)
>> high - higher bound.
Note that range [1,3] means that we select 1 and 2.
*/
void SelectRange(XTensor * a, XTensor * c, int dim, int low, int high)
void _SelectRange(const XTensor * a, XTensor * c, int dim, int low, int high)
{
CheckNTErrors(a != NULL && c != NULL, "empty tensors!");
CheckNTErrors(a->order == c->order, "The input and output tensors must in the same order!");
......@@ -76,4 +78,55 @@ void SelectRange(XTensor * a, XTensor * c, int dim, int low, int high)
}
}
/*
generate a tensor with selected data in range[low,high] along the given dimension (return a XTensor structure)
make a new tensor to keep the result and return it
c = select(a)
>> a - input tensor
>> dim - the dimension along with which we do the job
>> low - lower bound
>> high - higher bound.
Note that range [1,3] means that we select 1 and 2.
<< return - the result of the generated tensor with selected data
*/
XTensor SelectRange(const XTensor &a, int dim, int low, int high)
{
int order = a.order;
int * dimSize = new int[order];
CheckNTErrors(&a != NULL, "Empty input tensors!");
CheckNTErrors(dim >= 0 && dim < a.order, "The input dimension is out of bounds!");
CheckNTErrors(low < high, "Illegal range specified!");
for(int i = 0; i < a.order; i++){
if(i == dim){
CheckNTErrors(low > 0 && low < a.dimSize[dim], "Illegal range specified!");
CheckNTErrors(high > 0 && high <= a.dimSize[dim], "Illegal range specified!");
dimSize[i] = high - low;
}
else
dimSize[i] = a.dimSize[i];
}
XTensor c = NewTensor(order, dimSize, a.dataType, a.denseRatio, a.devID, a.mem);
c.SetZeroAll();
c.SetTMP();
/* call _SelectRange function */
_SelectRange(&a, &c, dim, low, high);
/* tensor connection */
XLink::MakeLink(&a, NULL, &c, GETANDSET_SELECT);
XLink::AddParamToHead(&c, low);
XLink::AddParamToHead(&c, high);
/* destroy variables */
delete dimSize;
return c;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-04
*/
#ifndef __SELECT_CUH__
#define __SELECT_CUH__
#include "Select.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* generate a tensor with selected data c = select(a) */
extern "C"
void _CudaSelect(const XTensor * a, XTensor * c, XTensor * indexCPU);
/*
generate a tensor with selected data in range[low,high] along the given dimension
c = select(a)
*/
extern "C"
void _CudaSelectRange(const XTensor * a, XTensor * c, int dim, int low, int high);
} // namespace nts(NiuTrans.Tensor)
#endif // __SELECT_CUH__
\ No newline at end of file
......@@ -26,14 +26,29 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* generate a tensor with seleccted data c = select(a) */
/* generate a tensor with selected data c = select(a) */
extern "C"
void Select(XTensor * a, XTensor * c, XTensor * indexCPU);
void _Select(const XTensor * a, XTensor * c, XTensor * indexCPU);
/* generate a tensor with seleccted data in range[low,high] along the given dimension
c = select(a) */
/*
generate a tensor with selected data c = select(a) (return a XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor Select(const XTensor &a, XTensor &indexCPU);
/*
generate a tensor with selected data in range[low,high] along the given dimension
c = select(a)
*/
extern "C"
void SelectRange(XTensor * a, XTensor * c, int dim, int low, int high);
void _SelectRange(const XTensor * a, XTensor * c, int dim, int low, int high);
/*
generate a tensor with selected data in range[low,high] along the given dimension (return a XTensor structure)
make a new tensor to keep the result and return it
c = select(a)
*/
XTensor SelectRange(const XTensor &a, int dim, int low, int high);
} // namespace nts(NiuTrans.Tensor)
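For SelectRange, the half-open behaviour of the [low, high) range is easy to get wrong, so here is a hedged sketch (not from the repository) on a dense 2-D tensor:

/* hypothetical example: a is (3, 4); keep columns 1 and 2 along dim 1 */
int dims[2] = {3, 4};
XTensor a;
a.Resize(2, dims);
a.SetDataRand(-1.0F, 1.0F);
XTensor c = SelectRange(a, 1, 1, 3);   /* c has size (3, 2) */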
......
......@@ -77,7 +77,7 @@ void SetDataRand(XTensor * tensor, DTYPE low, DTYPE high)
else{
XTensor * t2 = NewTensor(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, -1);
SetDataRand(t2, low, high);
CopyValues(t2, tensor);
_CopyValues(t2, tensor);
delete t2;
}
}
......
......@@ -22,6 +22,7 @@
#include "../../XTensor.h"
#include "Log.h"
#include "Log.cuh"
#include <math.h>
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -29,12 +30,12 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
set every entry to its log value
>> a - the tensor we are processing
*/
void Log(XTensor * a)
void _Log(XTensor * a)
{
#ifdef USE_CUDA
/* run it on GPUs */
if (a->devID >= 0) {
CudaLog(a);
_CudaLog(a);
return;
}
#endif
......
......@@ -58,7 +58,7 @@ set each entry to its log value
>> a - the tensor
*/
extern "C"
void CudaLog(XTensor * a)
void _CudaLog(XTensor * a)
{
CheckNTErrors((a->isSparse == false), "TODO!");
......
......@@ -19,6 +19,9 @@
* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-7-11
*/
#ifndef __LOG_CUH__
#define __LOG_CUH__
#include "Log.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
......@@ -35,8 +38,10 @@ void KernelLog(__half * d, int size);
/* set each entry to its log value */
extern "C"
void CudaLog(XTensor * a);
void _CudaLog(XTensor * a);
#endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor)
#endif // __LOG_CUH__
\ No newline at end of file
......@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* set every entry to its log value */
extern "C"
void Log(XTensor * a);
void _Log(XTensor * a);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -21,15 +21,18 @@
#include <math.h>
#include "../../XTensor.h"
#include "../../XName.h"
#include "Normalize.h"
#include "Normalize.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
normalized the data with normal distribution. For an input x,
y = a * (x-mean)/sqrt(variance+\epsilon) + b
normalize the data with normal distribution
For an input x, y = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
>> input - the input tensor
>> output - the output tensor
>> dim - dimension along which we generate the mean and variance
......@@ -39,7 +42,7 @@ where a and b are the scalar and bias respectively, and \epsilon is the adjustme
>> b - the bias
>> epsilon - a parameter
*/
void Normalize(XTensor * input, XTensor * output, int dim, XTensor * mean, XTensor * var, XTensor * a, XTensor * b, DTYPE epsilon)
void _Normalize(const XTensor * input, XTensor * output, int dim, const XTensor * mean, const XTensor * var, const XTensor * a, const XTensor * b, DTYPE epsilon)
{
int dimRDI = input->order - dim - 1;
CheckNTErrors((XTensor::IsIdentical(input, output)), "Unmatched input tensors!");
......@@ -68,7 +71,7 @@ void Normalize(XTensor * input, XTensor * output, int dim, XTensor * mean, XTens
if (input->devID >= 0 || output->devID >= 0) {
#ifdef USE_CUDA
CudaNormalize(input, output, dim, mean, var, a, b, epsilon);
_CudaNormalize(input, output, dim, mean, var, a, b, epsilon);
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
......@@ -91,4 +94,61 @@ void Normalize(XTensor * input, XTensor * output, int dim, XTensor * mean, XTens
}
}
}
/*
normalize the data with normal distribution (do it on site)
keep the result in the input tensor and return nothing
For an input x, x = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
>> input - the input tensor
>> dim - dimension along which we generate the mean and variance
>> mean - the mean of the input
>> var - the variance of the input
>> a - the scalar
>> b - the bias
>> epsilon - a parameter
*/
void _NormalizeMe(XTensor * input, int dim, const XTensor * mean, const XTensor * var, const XTensor * a, const XTensor * b, DTYPE epsilon)
{
_Normalize(input, input, dim, mean, var, a, b, epsilon);
}
/*
normalize the data with normal distribution (return a XTensor structure)
make a new tensor to keep the result and return it
For an input x, y = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
>> input - the input tensor
>> dim - dimension along which we generate the mean and variance
>> mean - the mean of the input
>> var - the variance of the input
>> a - the scalar
>> b - the bias
>> epsilon - a parameter
<< return - the result of normalizing the data with normal distribution
*/
XTensor Normalize(const XTensor &input, int dim, const XTensor &mean, const XTensor &var, const XTensor &a, const XTensor &b, DTYPE epsilon)
{
XTensor output(&input);
output.SetTMP();
/* call _Normalize function */
_Normalize(&input, &output, dim, &mean, &var, &a, &b, epsilon);
/* tensor connections */
XList list(5);
list.Add(&input);
list.Add(&mean);
list.Add(&var);
list.Add(&a);
list.Add(&b);
XLink::MakeLink(&list, &output, MATH_NORMALIZE);
XLink::AddParamToHeadInt(&output, dim);
XLink::AddParamToHead(&output, epsilon);
return output;
}
} // namespace nts(NiuTrans.Tensor)
......@@ -89,9 +89,9 @@ where a and b are the scalar and bias respectively, and \epsilon is the adjustme
>> epsilon - a parameter
*/
extern "C"
void CudaNormalize(XTensor * input, XTensor * output, int dim,
XTensor * mean, XTensor * var,
XTensor * a, XTensor * b,
void _CudaNormalize(const XTensor * input, XTensor * output, int dim,
const XTensor * mean, const XTensor * var,
const XTensor * a, const XTensor * b,
DTYPE epsilon)
{
CheckNTErrors((input->dataType == DEFAULT_DTYPE), "TODO!");
......
......@@ -44,9 +44,9 @@ y = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter
*/
extern "C"
void CudaNormalize(XTensor * input, XTensor * output, int dim,
XTensor * mean, XTensor * var,
XTensor * a, XTensor * b, DTYPE epsilon);
void _CudaNormalize(const XTensor * input, XTensor * output, int dim,
const XTensor * mean, const XTensor * var,
const XTensor * a, const XTensor * b, DTYPE epsilon);
#endif // USE_CUDA
......
......@@ -27,12 +27,29 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
normalized the data with normal distribution. For an input x,
y = a * (x-mean)/sqrt(variance+\epsilon) + b
normalize the data with normal distribution.
For an input x, y = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
*/
extern "C"
void Normalize(XTensor * input, XTensor * output, int dim, XTensor * mean, XTensor * var, XTensor * a, XTensor * b, DTYPE epsilon);
void _Normalize(const XTensor * input, XTensor * output, int dim, const XTensor * mean, const XTensor * var, const XTensor * a, const XTensor * b, DTYPE epsilon);
/*
normalize the data with normal distribution (do it on site)
keep the result in the input tensor and return nothing
For an input x, x = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
*/
extern "C"
void _NormalizeMe(XTensor * input, int dim, const XTensor * mean, const XTensor * var, const XTensor * a, const XTensor * b, DTYPE epsilon);
/*
normalize the data with normal distribution (return a XTensor structure)
make a new tensor to keep the result and return it
For an input x, y = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
*/
XTensor Normalize(const XTensor &input, int dim, const XTensor &mean, const XTensor &var, const XTensor &a, const XTensor &b, DTYPE epsilon);
} // namespace nts(NiuTrans.Tensor)
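A hedged usage sketch (not part of the commit): it assumes x is a (batch, hidden) tensor and that mean, var, a and b have already been computed with the shapes the function expects, which the sketch does not verify:

XTensor y = Normalize(x, 1, mean, var, a, b, 1e-6F);   /* y = a * (x - mean) / sqrt(var + 1e-6) + b */
_NormalizeMe(&x, 1, &mean, &var, &a, &b, 1e-6F);       /* same computation, in place on x */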
......
......@@ -31,12 +31,12 @@ get the power(a, p)
>> a - the tensor
>> p - as it is
*/
void Power(XTensor * a, DTYPE p)
void _Power(XTensor * a, DTYPE p)
{
#ifdef USE_CUDA
/* run it on GPUs */
if (a->devID >= 0) {
CudaPower(a, p);
_CudaPower(a, p);
return;
}
#endif
......
......@@ -96,7 +96,7 @@ void KernelPower(__half * d, __half p, int size)
/* get the power of the entries */
extern "C"
void CudaPower(XTensor * a, DTYPE p)
void _CudaPower(XTensor * a, DTYPE p)
{
int gridSize[3];
int blockSize[3];
......
......@@ -38,7 +38,7 @@ void KernelSqrtV2(__half * d, int size);
/* get the power of the entries */
extern "C"
void CudaPower(XTensor * a, DTYPE p);
void _CudaPower(XTensor * a, DTYPE p);
#endif // USE_CUDA
......
......@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* get the power(x, y) */
extern "C"
void Power(XTensor * a, DTYPE p);
void _Power(XTensor * a, DTYPE p);
} // namespace nts(NiuTrans.Tensor)
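The unary operations renamed above (_Negate, _Sign, _Log, _Power) all follow the same in-place pattern; a minimal sketch, with each call shown independently on a dense float tensor a:

_Negate(&a);        /* a(i) = -a(i) */
_Sign(&a);          /* a(i) = sign(a(i)) */
_Log(&a);           /* a(i) = log(a(i)), entries assumed positive */
_Power(&a, 2.0F);   /* a(i) = a(i)^2 */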
......
......@@ -28,8 +28,10 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
scale and shift all tensor entries b = a * scale + shift
scale and shift all tensor entries
b = a * scale + shift
>> a - the input tensor
>> b - the output tensor
>> scale - the scale factor
......@@ -76,8 +78,11 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
}
/*
scale and shift all tensor entries on site b = a * scale + shift
b = a * scale + shift
scale and shift all tensor entries (do it on site)
keep the result in the input tensor a and return nothing
a = a * scale + shift
>> a - the input/output tensor
>> scale - the scale factor
>> shift - the shift factor
......@@ -88,19 +93,22 @@ void _ScaleAndShiftMe(XTensor * a, DTYPE scale, DTYPE shift)
}
/*
scale and shift all tensor entries b = a * scale + shift
scale and shift all tensor entries (return a XTensor structure)
make a new tensor to keep the result and return it
b = a * scale + shift
>> a - the input tensor
>> b - the output tensor
>> scale - the scale factor
>> shift - the shift factor
<< return - the result of scaling and shifting all tensor entries
*/
XTensor ScaleAndShift(const XTensor &a, DTYPE scale, DTYPE shift)
{
XTensor b(&a);
b.SetTMP();
/* computation */
/* call _ScaleAndShift function */
_ScaleAndShift(&a, &b, scale, shift);
/* tensor connections */
......
......@@ -30,13 +30,24 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
#define _LinearMe _ScaleAndShiftMe
#define Linear ScaleAndShift
/* scale and shift all tensor entries b = a * scale + shift */
/*
scale and shift all tensor entries
b = a * scale + shift
*/
void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift = 0);
/* scale and shift all tensor entries on site a = a * scale + shift */
/*
scale and shift all tensor entries
keep the result in the input tensor a and return nothing
a = a * scale + shift
*/
void _ScaleAndShiftMe(XTensor * a, DTYPE scale, DTYPE shift = 0);
/* scale and shift all tensor entries b = a * scale + shift, and return the result tensor b */
/*
scale and shift all tensor entries
make a new tensor to keep the result and return it
b = a * scale + shift
*/
XTensor ScaleAndShift(const XTensor &a, DTYPE scale, DTYPE shift = 0);
} // namespace nts(NiuTrans.Tensor)
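A brief sketch (not from the repository) of the three forms, including the Linear alias defined in this header:

XTensor b = ScaleAndShift(a, 2.0F, 1.0F);   /* b = a * 2 + 1 */
_ScaleAndShiftMe(&a, 0.5F);                 /* a = a * 0.5 (shift defaults to 0) */
XTensor c = Linear(a, 2.0F, 1.0F);          /* Linear is a #define alias of ScaleAndShift, see above */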
......
......@@ -36,7 +36,7 @@ copy a number of blocks to target positions
>> targetBlocks - target positions of the copy
>> myMem - the memory pool
*/
void CopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem)
void _CopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem)
{
if (myMem != NULL && myMem->devID >= 0) {
#ifdef USE_CUDA
......@@ -44,7 +44,7 @@ void CopyBlocks(void * source, int blockSize, int blockNum, void * target, int *
int * targetBlocksTMP = (int*)myMem->AllocBuf(myMem->devID, blockNum * sizeof(int));
XMemCopy(targetBlocksTMP, myMem->devID, targetBlocks, -1, blockNum * sizeof(int));
CopyBlocksOnSite(source, blockSize, blockNum, target, targetBlocksTMP, myMem);
_CopyBlocksOnSite(source, blockSize, blockNum, target, targetBlocksTMP, myMem);
myMem->ReleaseBuf(myMem->devID, blockNum * sizeof(int));
#else
......@@ -52,7 +52,7 @@ void CopyBlocks(void * source, int blockSize, int blockNum, void * target, int *
#endif
}
else {
CopyBlocksOnSite(source, blockSize, blockNum, target, targetBlocks, myMem);
_CopyBlocksOnSite(source, blockSize, blockNum, target, targetBlocks, myMem);
}
}
......@@ -66,14 +66,14 @@ copy a number of blocks from source positions to target positions
>> targetBlocks - target positions of the copy
>> myMem - the memory pool
*/
void CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID)
void _CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID)
{
if (myMem != NULL)
CheckNTErrors((myMem->devID == devID), "DevIDs are different between memory pool and input devID!");
if (devID >= 0) {
#ifdef USE_CUDA
CudaCopyBlocksSelected(source, blockSize, sourceBlocks, blockNum, target, targetBlocks, myMem, devID);
_CudaCopyBlocksSelected(source, blockSize, sourceBlocks, blockNum, target, targetBlocks, myMem, devID);
#else
ShowNTErrors("Plesae specify USE_CUDA and recompile the code!");
#endif
......
......@@ -27,10 +27,10 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy a number of blocks to target positions */
void CopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem);
void _CopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem);
/* copy a number of blocks from source positions to target positions */
void CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID);
void _CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -38,7 +38,7 @@ Note that a grid may have a number of blocks
>> myMem - the memory pool
>> isIndexOnDev - indicates whether the index is on the device already
*/
void CopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, void * target,
void _CopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, void * target,
int * index, int unitSize, bool isIndexOnDev, XMem * myMem)
{
CheckNTErrors((unitSize == sizeof(int)), "TODO!");
......@@ -51,7 +51,7 @@ void CopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, v
XMemCopy(indexGPU, myMem->devID, index, -1, blockNum * gridNum * sizeof(int));
}
CudaCopyBlocksInGrid(source, blockSize, blockNum, gridNum, target, indexGPU, unitSize, myMem);
_CudaCopyBlocksInGrid(source, blockSize, blockNum, gridNum, target, indexGPU, unitSize, myMem);
if (!isIndexOnDev)
myMem->ReleaseBuf(myMem->devID, blockNum * gridNum * sizeof(int));
......
......@@ -216,7 +216,7 @@ Note that a grid may have a number of blocks
>> itemSize - size of each data item
>> myMem - the memory pool
*/
void CudaCopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, void * target, int * index, int itemSize, XMem * myMem)
void _CudaCopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, void * target, int * index, int itemSize, XMem * myMem)
{
CheckNTErrors((myMem != NULL && myMem->devID >= 0), "This code must be run on GPUs!");
CheckNTErrors((itemSize == sizeof(int)), "TODO!");
......
......@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy data by index */
extern "C"
void CudaCopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, void * target, int * index, int unitSize, XMem * myMem);
void _CudaCopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, void * target, int * index, int unitSize, XMem * myMem);
#endif // USE_CUDA
......
......@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy a number of blocks in grid */
extern "C"
void CopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, void * target, int * index, int unitSize, bool isIndexOnDev, XMem * myMem);
void _CopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, void * target, int * index, int unitSize, bool isIndexOnDev, XMem * myMem);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -36,11 +36,11 @@ all the data has been on the device (CPU/GPU) already.
>> targetBlocks - target positions of the copy
>> myMem - the memory pool
*/
void CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem)
void _CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem)
{
if (myMem != NULL && myMem->devID >= 0) {
#ifdef USE_CUDA
CudaCopyBlocks(source, blockSize, blockNum, target, targetBlocks, myMem);
_CudaCopyBlocks(source, blockSize, blockNum, target, targetBlocks, myMem);
#else
ShowNTErrors("Plesae specify USE_CUDA and recompile the code!");
#endif
......
......@@ -80,7 +80,7 @@ copy a number of blocks to target positions (cuda version)
>> targetBlocks - target positions of the copy (on the device)
>> myMem - memory pool
*/
void CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem)
void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem)
{
CheckNTErrors((myMem != NULL), "No memory pool!");
CheckNTErrors((myMem->devID >= 0), "Wrong device to run!");
......
......@@ -34,7 +34,7 @@ void KernelCopyBlocks(DTYPE * source, int blockSize, int blockNum, DTYPE * targe
/* copy a number of blocks to target positions (cuda version) */
extern "C"
void CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem);
void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem);
#endif // USE_CUDA
......
......@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy a number of blocks to target positions (on site) */
extern "C"
void CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem);
void _CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -70,7 +70,7 @@ copy a number of blocks from source positions to target positions (cuda version)
>> targetBlocks - target positions of the copy
>> myMem - memory pool
*/
void CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID)
void _CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID)
{
CheckNTErrors((devID >= 0), "Wrong device to run!");
CheckNTErrors((blockSize % sizeof(DTYPE) == 0), "Unsupported block size!");
......
......@@ -34,7 +34,7 @@ void KernelCopyBlocksSelected(DTYPE * source, int blockSize, int * sourceBlocks,
/* copy a number of blocks from source positions to target positions (cuda version) */
extern "C"
void CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID);
void _CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID);
#endif // USE_CUDA
......
......@@ -36,7 +36,7 @@ copy data blocks by 2d layout
>> n - height of each block
>> myMem - the memory pool
*/
void CopyData2D(void ** s, int sPitch, void ** t, int tPitch, int blockNum, int mSize, int n, XMem * myMem)
void _CopyData2D(void ** s, int sPitch, void ** t, int tPitch, int blockNum, int mSize, int n, XMem * myMem)
{
int devID = myMem != NULL ? myMem->devID : -1;
......
......@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy data blocks by 2d layout */
extern "C"
void CopyData2D(void ** s, int sPitch, void ** t, int tPitch, int count, int mSize, int n, XMem * myMem);
void _CopyData2D(void ** s, int sPitch, void ** t, int tPitch, int count, int mSize, int n, XMem * myMem);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -36,7 +36,7 @@ in the k-th grid
>> blockNumInGrid - number of blocks in each grid
>> isIndexOnDev - indicates whether the index is on the device already
*/
void CopyInGrid(XTensor * s, XTensor * t, int * index, int blockDim, int blockNumInGrid, bool isIndexOnDev)
void _CopyInGrid(const XTensor * s, XTensor * t, int * index, int blockDim, int blockNumInGrid, bool isIndexOnDev)
{
CheckNTErrors((XTensor::IsIdentical(s, t)), "Unmatched tensors!");
......@@ -50,7 +50,7 @@ void CopyInGrid(XTensor * s, XTensor * t, int * index, int blockDim, int blockNu
CheckNTErrors((s->unitNum % (blockSize * blockNum) == 0), "Illegal block number!");
gridNum = s->unitNum / (blockSize * blockNum);
CopyBlocksInGrid(s->data, blockSize, blockNum, gridNum, t->data, index, s->unitSize, isIndexOnDev, s->mem);
_CopyBlocksInGrid(s->data, blockSize, blockNum, gridNum, t->data, index, s->unitSize, isIndexOnDev, s->mem);
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy a number of blocks in grid. i.e., reorder the data blocks in the same memory piece*/
extern "C"
void CopyInGrid(XTensor * s, XTensor * t, int * index, int blockDim, int blockNumInGrid, bool isIndexOnDev = false);
void _CopyInGrid(const XTensor * s, XTensor * t, int * index, int blockDim, int blockNumInGrid, bool isIndexOnDev = false);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -21,11 +21,13 @@
#include "CopyIndexed.h"
#include "CopyBlocks.h"
#include "../../XName.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
copy indexed sub-tensors
>> s - the source tensor
>> t - the target tensor
>> dim - the leading dimension to define "sub-tensors"
......@@ -34,11 +36,11 @@ copy indexed sub-tensors
>> srcIndex - index of the source sub-tensors
>> indexSize - length of srcIndex (and tgtIndex)
>> tgtIndex - index of the target sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index, e.g.,
for srcIndex = [1,4] and copyNum = 2, we actually copy the source sub-tensors 1, 2, 4, 5
<< return - whether copy indexed operation was successful
>> copyNum - number of the sub-tensors we copy for each source index,
e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5
*/
bool CopyIndexed(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum)
void _CopyIndexed(const XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum)
{
CheckNTErrors((s && t), "Invalid tensors!");
CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)),
......@@ -84,12 +86,62 @@ bool CopyIndexed(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSiz
CheckNTErrors((tgtIndex[i] < blockNumTgt), "Index is out of range!");
}
CopyBlocks(s->data, blockSizeSrc * s->unitSize, realSrcIndex, realIndexSize, t->data, realTgtIndex, s->mem, s->devID);
_CopyBlocks(s->data, blockSizeSrc * s->unitSize, realSrcIndex, realIndexSize, t->data, realTgtIndex, s->mem, s->devID);
delete[] realSrcIndex;
delete[] realTgtIndex;
}
/*
copy indexed sub-tensors (return a XTensor structure)
make a new tensor to keep the result and return it
>> s - the source tensor
>> dim - the leading dimension to define "sub-tensors"
e.g., for a tensor of size (3, 2, 4) and dim = 2,
we have 4 sub-tensors of size (3,2)
>> srcIndex - index of the source sub-tensors
>> indexSize - length of srcIndex (and tgtIndex)
>> tgtIndex - index of the target sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index,
e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5
<< return - the result of copying indexed sub-tensors
*/
XTensor CopyIndexed(const XTensor &s, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum)
{
CheckNTErrors(&s, "Empty input tensor!");
CheckNTErrors((dim >= 0 && dim < s.order), "A too large dimension specified!");
int order = s.order;
int * dimSize = new int[order];
for (int i = 0; i < s.order; i++) {
if (i == dim)
dimSize[i] = indexSize * copyNum;
else
dimSize[i] = s.dimSize[i];
}
XTensor t = NewTensor(order, dimSize, s.dataType, s.denseRatio, s.devID, s.mem);
t.SetZeroAll();
t.SetTMP();
/* call _CopyIndexed function */
_CopyIndexed(&s, &t, dim, srcIndex, indexSize, tgtIndex, copyNum);
/* destroy variables */
delete dimSize;
/* tensor connection */
XLink::MakeLink(&s, NULL, &t, MOVEMENT_COPYINDEXED);
XLink::AddParamToHead(&t, dim);
XLink::AddParamToHeadPointer(&t, srcIndex);
XLink::AddParamToHead(&t, indexSize);
XLink::AddParamToHeadPointer(&t, tgtIndex);
XLink::AddParamToHead(&t, copyNum);
return true;
return t;
}
} // namespace nts(NiuTrans.Tensor)
......@@ -28,7 +28,13 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy selected sub-tensors */
extern "C"
bool CopyIndexed(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum);
void _CopyIndexed(const XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum);
/*
copy selected sub-tensors (return a XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor CopyIndexed(const XTensor &s, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum);
} // namespace nts(NiuTrans.Tensor)
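A hedged example of the new tensor-returning CopyIndexed, following the parameter description above; the shapes are illustrative only:

/* hypothetical: s is (3, 2, 4); copy sub-tensors 0 and 2 along dim 2 into a new tensor */
int srcIndex[2] = {0, 2};
int tgtIndex[2] = {0, 1};
XTensor t = CopyIndexed(s, 2, srcIndex, 2, tgtIndex, 1);   /* t has size (3, 2, 2) */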
......
......@@ -27,18 +27,15 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
copy s to t
>> s - source
>> t - target
>> stream - the stream for creating the job pipeline
<< return - succeeded or not
*/
bool CopyValues(const XTensor * s, XTensor * t, XStream * stream)
void _CopyValues(const XTensor * s, XTensor * t, XStream * stream)
{
if (s == NULL || t == NULL)
return false;
if (s->data == NULL || t->data == NULL)
return false;
CheckNTErrors((s != NULL && t != NULL), "The input tensor and output tensor must be nonempty!");
CheckNTErrors((s->data != NULL), "Cannot copy from an empty data array!");
CheckNTErrors((t->data != NULL), "Cannot copy to an empty data array!");
CheckNTErrors((s->unitNum == t->unitNum), "Unmatched data item number!");
......@@ -48,12 +45,13 @@ bool CopyValues(const XTensor * s, XTensor * t, XStream * stream)
"The code must be run on the same device!");
CheckNTErrors((s->isSparse || t->isSparse), "TODO!");
ConvertDataType(s->devID, s->data, s->dataType, t->data, t->dataType, s->unitNum);
return true;
}
#ifdef USE_CUDA
if (s->devID >= 0 || t->devID >= 0)
return CudaCopyValues(s, t, stream);
if (s->devID >= 0 || t->devID >= 0) {
_CudaCopyValues(s, t, stream);
return;
}
#endif
if (!s->isSparse && !t->isSparse) {
......@@ -68,8 +66,28 @@ bool CopyValues(const XTensor * s, XTensor * t, XStream * stream)
else {
ShowNTErrors("TODO!");
}
}
/*
copy s to t (return a XTensor structure)
make a new tensor to keep the result and return it
>> s - source
>> stream - the stream for creating the job pipeline
<< return - the copied tensor t
*/
XTensor CopyValues(const XTensor &s, XStream * stream)
{
XTensor t(&s);
t.SetTMP();
/* call _CopyValues function */
_CopyValues(&s, &t, stream);
/* tensor connection */
XLink::MakeLink(&s, NULL, &t, MOVEMENT_COPYVALUES);
return true;
return t;
}
} // namespace nts(NiuTrans.Tensor)
......@@ -35,11 +35,9 @@ copy a range of elements from a source vector to a target vector
>> stream - the stream for creating the job pipeline
<< return - succeed or not
*/
bool CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream)
void _CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream)
{
if (s == NULL || t == NULL)
return false;
CheckNTErrors((s != NULL && t != NULL), "The input tensor and output tensor must be nonempty!");
CheckNTErrors(s->dataType == t->dataType, "Unmatched data type!");
CheckNTErrors((s->unitSize == t->unitSize), "Incompatible vectors in value copy.");
CheckNTErrors((s->denseRatio <= s->denseRatio), "Incompatible vectors in value copy.");
......@@ -83,8 +81,6 @@ bool CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream)
else {
ShowNTErrors("TODO!");
}
return true;
}
......
......@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy all elements from a source matrix to a target matrix */
extern "C"
bool CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream = NULL);
void _CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream = NULL);
#endif // USE_CUDA
......
......@@ -28,7 +28,13 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy s to t */
extern "C"
bool CopyValues(const XTensor * s, XTensor * t, XStream * stream = NULL);
void _CopyValues(const XTensor * s, XTensor * t, XStream * stream = NULL);
/*
copy s to t (return a XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor CopyValues(const XTensor &s, XStream * stream = NULL);
} // namespace nts(NiuTrans.Tensor)
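A short sketch contrasting the two copy interfaces declared above; u is built from s with the reference constructor used elsewhere in this commit:

XTensor t = CopyValues(s);   /* new tensor holding a copy of s */
XTensor u(&s);               /* allocate a tensor shaped like s */
_CopyValues(&s, &u);         /* copy into an existing tensor */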
......
......@@ -27,12 +27,13 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
get the max value of the items along a dimension of the tensor.
get the max value of the items along a dimension of the tensor
>> input - the input tensor
>> output - the output tensor
>> dim - the dimension where the reduction is performed on
*/
void ReduceMax(XTensor * input, XTensor * output, int dim)
void _ReduceMax(const XTensor * input, XTensor * output, int dim)
{
CheckNTErrors((input->devID == output->devID || (input->devID < 0 && output->devID < 0)),
"This code must be run on the same device!");
......@@ -55,7 +56,7 @@ void ReduceMax(XTensor * input, XTensor * output, int dim)
if(input->devID >= 0){
#ifdef USE_CUDA
CudaReduceMax(input, output, dim);
_CudaReduceMax(input, output, dim);
#endif
}
else{
......@@ -90,4 +91,43 @@ void ReduceMax(XTensor * input, XTensor * output, int dim)
}
}
/*
get the max value of the items along a dimension of the tensor (return a XTensor structure).
make a new tensor to keep the result and return it
>> input - the input tensor
>> dim - the dimension where the reduction is performed on
<< return - the max value of the items along a dimension of the tensor
*/
XTensor ReduceMax(const XTensor &input, int dim)
{
CheckNTErrors(&input, "Empty input or output tensors!");
CheckNTErrors((dim >= 0 && dim < input.order), "Illegal dimension to reduce!");
int order = input.order - 1;
int * dimSize = new int[order];
for(int i = 0; i < input.order; i++){
if(i < dim)
dimSize[i] = input.dimSize[i];
else if(i > dim)
dimSize[i] = input.dimSize[i + 1];
}
XTensor output = NewTensor(order, dimSize, input.dataType, input.denseRatio, input.devID, input.mem);
output.SetZeroAll();
output.SetTMP();
/* call _ReduceMax function */
_ReduceMax(&input, &output, dim);
/* destroy variables */
delete dimSize;
/* tensor connection */
XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCEMAX);
XLink::AddParamToHead(&output, dim);
return output;
}
} // namespace nts(NiuTrans.Tensor)
......@@ -334,7 +334,7 @@ sum_i = max_{0<=j<strideNum} input_{i,j}
>> output - the output tensor
>> dim - which dimension to reduce
*/
void CudaReduceMax(XTensor * input, XTensor * output, int dim)
void _CudaReduceMax(const XTensor * input, XTensor * output, int dim)
{
CheckNTErrors((input && output), "Empty input or output tensors!");
CheckNTErrors((input->order == output->order + 1), "Incorrect tensor sizes!");
......
......@@ -30,7 +30,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/* get the max-valued items along a dimension of the tensor (cuda version) */
extern "C"
void CudaReduceMax(XTensor * input, XTensor * output, int dim);
void _CudaReduceMax(const XTensor * input, XTensor * output, int dim);
#endif // USE_CUDA
......
......@@ -28,7 +28,13 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/* get the max value of the items along a dimension of the tensor. */
extern "C"
void ReduceMax(XTensor * input, XTensor * output, int dim);
void _ReduceMax(const XTensor * input, XTensor * output, int dim);
/*
get the max value of the items along a dimension of the tensor (return a XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor ReduceMax(const XTensor &input, int dim);
} // namespace nts(NiuTrans.Tensor)
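A minimal sketch (not part of the commit): reducing removes the given dimension, so the output has order input.order - 1:

/* hypothetical: a is (2, 4); the maxima along dim 1 form a tensor of size (2) */
XTensor m = ReduceMax(a, 1);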
......
......@@ -20,27 +20,71 @@
*/
#include "../math/ScaleAndShift.h"
#include "../../XName.h"
#include "ReduceSum.h"
#include "ReduceMean.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
get the mean value along a dimension of the tensor. For a 1-dimensional data array a,
mean = (1/n) * sum_i input_i
get the mean value along a dimension of the tensor
For a 1-dimensional data array a, mean = (1/n) * sum_i input_i
>> input - the input tensor
>> output - the output tensor
>> dim - the dimension where the reduction is performed on
*/
void ReduceMean(XTensor * input, XTensor * output, int dim)
void _ReduceMean(const XTensor * input, XTensor * output, int dim)
{
CheckNTErrors((input->order > dim), "Illegal dimension specified!");
int dimRDI = input->order - dim - 1;
int num = input->dimSizeRDI[dimRDI];
ReduceSum(input, output, dim);
_ReduceSum(input, output, dim);
_ScaleAndShiftMe(output, (DTYPE)1/num, 0);
}
/*
get the mean value along a dimension of the tensor (return a XTensor structure)
make a new tensor to keep the result and return it
For a 1-dimensional data array a, mean = (1/n) * sum_i input_i
>> input - the input tensor
>> dim - the dimension where the reduction is performed on
<< return - the mean value along a dimension of the tensor
*/
XTensor ReduceMean(const XTensor &input, int dim)
{
CheckNTErrors(&input, "Empty input or output tensors!");
CheckNTErrors((dim >= 0 && dim < input.order), "Illegal dimension to reduce!");
int order = input.order - 1;
int * dimSize = new int[order];
for(int i = 0; i < input.order; i++){
if(i < dim)
dimSize[i] = input.dimSize[i];
else if(i > dim)
dimSize[i] = input.dimSize[i + 1];
}
XTensor output = NewTensor(order, dimSize, input.dataType, input.denseRatio, input.devID, input.mem);
output.SetZeroAll();
output.SetTMP();
/* call _ReduceMean function */
_ReduceMean(&input, &output, dim);
/* tensor connection */
XLink::MakeLink(&input, NULL, &output, REDUCE_REDUCEMEAN);
XLink::AddParamToHead(&output, dim);
/* destroy variables */
delete dimSize;
return output;
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -27,11 +27,18 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
get the mean value along a dimension of the tensor. For a 1-dimensional data array a,
mean = (1/n) * sum_i input_i
get the mean value along a dimension of the tensor
For a 1-dimensional data array a, mean = (1/n) * sum_i input_i
*/
extern "C"
void ReduceMean(XTensor * input, XTensor * output, int dim);
void _ReduceMean(const XTensor * input, XTensor * output, int dim);
/*
get the mean value along a dimension of the tensor (return a XTensor structure)
make a new tensor to keep the result and return it
For a 1-dimensional data array a, mean = (1/n) * sum_i input_i
*/
XTensor ReduceMean(const XTensor &input, int dim);
} // namespace nts(NiuTrans.Tensor)
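The corresponding sketch for ReduceMean (again an illustration, not repository code):

/* hypothetical: a is (2, 4); the means along dim 1 form a tensor of size (2) */
XTensor m = ReduceMean(a, 1);   /* equivalent to ReduceSum(a, 1) scaled by 1/4 */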
......
......@@ -27,11 +27,12 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
standard variance of the items along a dimension of the tensor. For a 1-dimensional data array a,
standard variance of the items along a dimension of the tensor
For a 1-dimensional data array a,
variance = (1/n * \sum_i (a_i - mean)^2)^0.5
*/
extern "C"
void ReduceStandardVariance(XTensor * input, XTensor * output, int dim, XTensor * mean);
void _ReduceStandardVariance(XTensor * input, XTensor * output, int dim, XTensor * mean);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -27,9 +27,12 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
sum the items along a dimension of the tensor. For a 1-dimensional data array a,
sum the items along a dimension of the tensor
For a 1-dimensional data array a,
sum = \sum_i (a_i - shift)^power if isExp == false
sum = \sum_i exp((a_i - shift)^power) if isExp == true
>> input - the input tensor
>> output - the output tensor
>> dim - the dimension where the reduction is performed on
......@@ -37,7 +40,7 @@ sum = \sum_i exp((a_i - shift)^power) if isExp == true
>> isExp - specify if the exp() is performed
>> power - we perform pow(item_i, power) on each item in the array
*/
void ReduceSum(XTensor * input, XTensor * output, int dim, XTensor * shift, DTYPE power, bool isExp)
void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor * shift, DTYPE power, bool isExp)
{
CheckNTErrors((input->devID == output->devID || (input->devID < 0 && output->devID < 0)),
"This code must be run on the same device!");
......@@ -61,7 +64,7 @@ void ReduceSum(XTensor * input, XTensor * output, int dim, XTensor * shift, DTYP
if(input->devID >= 0){
#ifdef USE_CUDA
CudaReduceSum(input, output, dim, shift, power, isExp);
_CudaReduceSum(input, output, dim, shift, power, isExp);
#endif
}
else{
......@@ -194,4 +197,51 @@ void ReduceSum(XTensor * input, XTensor * output, int dim, XTensor * shift, DTYP
}
}
/*
sum the items along a dimension of the tensor (return a XTensor structure)
make a new tensor to keep the result and return it
For a 1-dimensional data array a,
sum = \sum_i (a_i - shift)^power if isExp == false
sum = \sum_i exp((a_i - shift)^power) if isExp == true
>> input - the input tensor
>> dim - the dimension where the reduction is performed on
>> shift - shift the input
>> isExp - specify if the exp() is performed
>> power - we perform pow(item_i, power) on each item in the array
<< return - the sum along a dimension of the tensor
*/
XTensor ReduceSum(const XTensor &input, int dim, const XTensor &shift, DTYPE power, bool isExp)
{
CheckNTErrors(&input, "Empty input or output tensors!");
CheckNTErrors((dim >= 0 && dim < input.order), "Illegal dimension to reduce!");
int order = input.order - 1;
int * dimSize = new int[order];
for(int i = 0; i < order; i++){
if(i < dim)
dimSize[i] = input.dimSize[i];
else
dimSize[i] = input.dimSize[i + 1];
}
XTensor output = NewTensor(order, dimSize, input.dataType, input.denseRatio, input.devID, input.mem);
output.SetZeroAll();
output.SetTMP();
/* call _ReduceSum function */
_ReduceSum(&input, &output, dim, &shift, power, isExp);
/* tensor connection */
XLink::MakeLink(&input, &shift, &output, REDUCE_REDUCESUM);
XLink::AddParamToHead(&output, dim);
XLink::AddParamToHead(&output, power);
/* destroy variables */
delete[] dimSize;
return output;
}
} // namespace nts(NiuTrans.Tensor)
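As a concrete illustration of the shape logic above (hypothetical sizes): reducing dim = 1 of an input with dimSize = {2, 3, 4} drops the reduced dimension, so the result has order = 2 and dimSize = {2, 4}. The shift tensor, when given, has the same shape as the result:

    XTensor * a = NewTensor3D(2, 3, 4);
    XTensor * shift = NewTensor2D(2, 4);               /* same shape as the reduced result */
    a->SetDataRand(0.0F, 1.0F);
    shift->SetZeroAll();
    XTensor s = ReduceSum(*a, 1, *shift, 1.0F, false); /* s.order == 2, s.dimSize == {2, 4} */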
......@@ -442,7 +442,7 @@ sum = \sum_i exp((a_i - shift)^power) if isExp == true
>> power - we perform pow(item_i, power) on each item
>> isExp - specify if the exp() is performed
*/
void CudaReduceSum(XTensor * input, XTensor * output, int dim, XTensor * shift, DTYPE power, bool isExp)
void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor * shift, DTYPE power, bool isExp)
{
CheckNTErrors((input && output), "Empty input or output tensors!");
CheckNTErrors((input->order == output->order + 1), "Incorrect tensor sizes!");
......
......@@ -29,13 +29,13 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/*
sum the items along a dimension of the tensor (cuda version)
sum the items along a dimension of the tensor (cuda version).
For a 1-dimensional data array a,
sum = \sum_i ((a_i - shift)^power) if isExp == false
sum = \sum_i exp((a_i - shift)^power) if isExp == true
*/
extern "C"
void CudaReduceSum(XTensor * input, XTensor * output, int dim, XTensor * shift, DTYPE power, bool isExp);
void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor * shift, DTYPE power, bool isExp);
#endif // USE_CUDA
......
......@@ -27,12 +27,23 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
sum the items along a dimension of the tensor. For a 1-dimensional data array a,
sum the items along a dimension of the tensor
For a 1-dimensional data array a,
sum = \sum_i (a_i - shift) if isExp == false
sum = \sum_i exp(a_i - shift) if isExp == true
*/
extern "C"
void ReduceSum(XTensor * input, XTensor * output, int dim, XTensor * shift = NULL, DTYPE power = (DTYPE)1.0F, bool isExp = false);
void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor * shift = NULL,
DTYPE power = (DTYPE)1.0F, bool isExp = false);
/*
sum the items along a dimension of the tensor (return a XTensor structure)
make a new tensor to keep the result and return it
For a 1-dimensional data array a,
sum = \sum_i (a_i - shift) if isExp == false
sum = \sum_i exp(a_i - shift) if isExp == true
*/
XTensor ReduceSum(const XTensor &input, int dim, const XTensor &shift = NULL, DTYPE power = (DTYPE)1.0F, bool isExp = false);
} // namespace nts(NiuTrans.Tensor)
......
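With the default arguments (shift = NULL, power = 1.0F, isExp = false) the call reduces to a plain sum along the chosen dimension. A minimal sketch of both interfaces (hypothetical sizes):

    /* pointer interface: the caller allocates the output */
    XTensor * a = NewTensor2D(3, 5);
    XTensor * b = NewTensor1D(5);
    a->SetDataRand(-1.0F, 1.0F);
    _ReduceSum(a, b, 0);                 /* column sums of a, written into b */

    /* value interface: the result tensor is created internally and returned */
    XTensor c = ReduceSum(*a, 0);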
......@@ -19,23 +19,68 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/
#include "../../XName.h"
#include "ReduceSum.h"
#include "ReduceSumSquared.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
squared sum of the items along a dimension of the tensor.
For a 1-dimensional data array a,
sum = \sum_i (a_i - shift)^2
squared sum of the items along a dimension of the tensor
For a 1-dimensional data array a, sum = \sum_i (a_i - shift)^2
>> input - the input tensor
>> output - the output tensor
>> dim - the dimension where the reduction is performed on
>> shift - bias on the input
*/
void ReduceSumSquared(XTensor * input, XTensor * output, int dim, XTensor * shift)
void _ReduceSumSquared(const XTensor * input, XTensor * output, int dim, const XTensor * shift)
{
ReduceSum(input, output, dim, shift, 2.0F);
_ReduceSum(input, output, dim, shift, 2.0F);
}
/*
squared sum of the items along a dimension of the tensor (return a XTensor structure)
make a new tensor to keep the result and return it
For a 1-dimensional data array a, sum = \sum_i (a_i - shift)^2
>> input - the input tensor
>> dim - the dimension where the reduction is performed on
>> shift - bias on the input
<< return - the squared sum of the items along a dimension of the tensor
*/
XTensor ReduceSumSquared(const XTensor &input, int dim, const XTensor &shift)
{
CheckNTErrors(&input, "Empty input or output tensors!");
CheckNTErrors((dim >= 0 && dim < input.order), "Illegal dimension to reduce!");
int order = input.order - 1;
int * dimSize = new int[order];
for(int i = 0; i < order; i++){
if(i < dim)
dimSize[i] = input.dimSize[i];
else
dimSize[i] = input.dimSize[i + 1];
}
XTensor output = NewTensor(order, dimSize, input.dataType, input.denseRatio, input.devID, input.mem);
output.SetZeroAll();
output.SetTMP();
/* call _ReduceSumSquared function */
_ReduceSumSquared(&input, &output, dim, &shift);
/* tensor connection */
XLink::MakeLink(&input, &shift, &output, REDUCE_REDUCESUMSQUARED);
XLink::AddParamToHead(&output, dim);
/* destroy variables */
delete[] dimSize;
return output;
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -27,11 +27,19 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
squared sum of the items along a dimension of the tensor. For a 1-dimensional data array a,
squared sum of the items along a dimension of the tensor
For a 1-dimensional data array a,
sum = \sum_i (a_i - shift)^2
*/
extern "C"
void ReduceSumSquared(XTensor * input, XTensor * output, int dim, XTensor * shift);
void _ReduceSumSquared(const XTensor * input, XTensor * output, int dim, const XTensor * shift);
/*
squared sum of the items along a dimension of the tensor (return a XTensor structure)
make a new tensor to keep the result and return it
For a 1-dimensional data array a, sum = \sum_i (a_i - shift)^2
*/
XTensor ReduceSumSquared(const XTensor &input, int dim, const XTensor &shift);
} // namespace nts(NiuTrans.Tensor)
......
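A minimal sketch of the value-returning form (hypothetical sizes): given a shift tensor that holds the per-column means, the call returns the per-column sums of squared deviations.

    XTensor * a = NewTensor2D(4, 3);
    a->SetDataRand(0.0F, 1.0F);
    XTensor mean = ReduceMean(*a, 0);                 /* per-column means, a 1-D tensor of size 3 */
    XTensor ssq  = ReduceSumSquared(*a, 0, mean);     /* \sum_i (a_{i,j} - mean_j)^2 for each column j */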
......@@ -26,21 +26,58 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
variance of the items along a dimension of the tensor.
For a 1-dimensional data array a,
variance = 1/n * \sum_i (a_i - mean)^2
variance of the items along a dimension of the tensor
For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2
>> input - the input tensor
>> output - the output tensor
>> dim - the dimension where the reduction is performed on
>> mean - the mean value
*/
void ReduceVariance(XTensor * input, XTensor * output, int dim, XTensor * mean)
void _ReduceVariance(const XTensor * input, XTensor * output, int dim, const XTensor * mean)
{
int dimRDI = input->order - dim - 1;
int num = input->dimSizeRDI[dimRDI];
ReduceSum(input, output, dim, mean, 2.0F);
_ReduceSum(input, output, dim, mean, 2.0F);
_ScaleAndShiftMe(output, (DTYPE)1 / num, 0);
}
} // namespace nts(NiuTrans.Tensor)
/*
variance of the items along a dimension of the tensor (return a XTensor structure)
make a new tensor to keep the result and return it
For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2
>> input - the input tensor
>> dim - the dimension where the reduction is performed on
>> mean - the mean value
<< return - the variance of the items along a dimension of the tensor
*/
XTensor ReduceVariance(const XTensor &input, int dim, const XTensor &mean)
{
CheckNTErrors(&input, "Empty input or output tensors!");
CheckNTErrors((dim >= 0 && dim < input.order), "Illegal dimension to reduce!");
int order = input.order - 1;
int * dimSize = new int[order];
for(int i = 0; i < order; i++){
if(i < dim)
dimSize[i] = input.dimSize[i];
else
dimSize[i] = input.dimSize[i + 1];
}
XTensor output = NewTensor(order, dimSize, input.dataType, input.denseRatio, input.devID, input.mem);
output.SetZeroAll();
output.SetTMP();
/* call _ReduceVariance function */
_ReduceVariance(&input, &output, dim, &mean);
/* destroy variables */
delete[] dimSize;
return output;
}
} // namespace nts(NiuTrans.Tensor)
......@@ -27,11 +27,18 @@
namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
variance of the items along a dimension of the tensor. For a 1-dimensional data array a,
variance = 1/n * \sum_i (a_i - mean)^2
variance of the items along a dimension of the tensor
For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2
*/
extern "C"
void ReduceVariance(XTensor * input, XTensor * output, int dim, XTensor * mean);
void _ReduceVariance(const XTensor * input, XTensor * output, int dim, const XTensor * mean);
/*
variance of the items along a dimension of the tensor (return a XTensor structure)
make a new tensor to keep the result and return it
For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2
*/
XTensor ReduceVariance(const XTensor &input, int dim, const XTensor &mean);
} // namespace nts(NiuTrans.Tensor)
......
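A minimal sketch of the value-returning form (hypothetical sizes), combining it with ReduceMean to get per-column variances:

    XTensor * a = NewTensor2D(4, 3);
    a->SetDataRand(0.0F, 1.0F);
    XTensor mean = ReduceMean(*a, 0);                 /* per-column means */
    XTensor var  = ReduceVariance(*a, 0, mean);       /* 1/n * \sum_i (a_{i,j} - mean_j)^2 per column */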
......@@ -20,6 +20,7 @@
*/
#include "../../XTensor.h"
#include "../../XName.h"
#include "Concatenate.h"
#include "Merge.h"
#include "ConcatenateSolely.h"
......@@ -28,13 +29,15 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
concatenate a list of tensors along a given dimension
Note that this is actually a wrapper that selects "ConcatenateSolely"
or "Merge" by means of the tensor shapes
>> smalls - a list of tensors for concatenation
>> big - the resulting tensor
>> dim - which dimension we perform the concatenation
*/
void Concatenate(XList * smalls, XTensor * big, int dim)
void _Concatenate(const XList * smalls, XTensor * big, int dim)
{
bool uniform = true;
for (int i = 1; i < smalls->count; i++) {
......@@ -46,25 +49,131 @@ void Concatenate(XList * smalls, XTensor * big, int dim)
}
if (uniform)
Merge(smalls, big, dim);
_Merge(smalls, big, dim);
else
ConcatenateSolely(smalls, big, dim);
_ConcatenateSolely(smalls, big, dim);
}
/*
concatenate a list of tensors along a given dimension (return a XTensor structure)
make a new tensor to keep the result and return it
Note that this is actually a wrapper that selects "ConcatenateSolely"
or "Merge" by means of the tensor shapes
>> smalls - a list of tensors for concatenation
>> big - the resulting tensor
>> dim - which dimension we perform the concatenation
<< return - the tensor of concatenating a list of tensors along a given dimension
*/
XTensor Concatenate(const XList &smalls, int dim)
{
CheckNTErrors(&smalls != NULL, "Invalid list!");
CheckNTErrors((smalls.count > 0), "Empty list!");
CheckNTErrors(dim >= 0, "Illegal dimension to concatenate!");
bool uniform = true;
for (int i = 1; i < smalls.count; i++) {
XTensor * a = (XTensor*)smalls.GetItem(i - 1);
XTensor * b = (XTensor*)smalls.GetItem(i);
CheckNTErrors((a && b), "Empty input tensors!");
if (!XTensor::IsIdentical(a, b))
uniform = false;
}
int * dimSize;
if (uniform) {
XTensor * tensor = (XTensor*)smalls.GetItem(0);
int order = tensor->order;
dimSize = new int[order];
for (int i = 0; i < tensor->order; i++) {
if (i != dim)
dimSize[i] = tensor->dimSize[i];
else
dimSize[i] = tensor->dimSize[dim] * smalls.count;
}
XTensor big = XTensor(order, dimSize, tensor->dataType, tensor->denseRatio, tensor->devID, tensor->mem);
big.SetZeroAll();
big.SetTMP();
/* call _Merge function */
_Merge(&smalls, &big, dim);
///* tensor connection */
//XLink::MakeLink(&smalls, &big, SHAPE_CONCATENATE);
//XLink::AddParamToHead(&big, dim);
/* destroy variables */
delete[] dimSize;
return big;
}
else {
XTensor * tensor = (XTensor*)smalls.GetItem(0);
int order = tensor->order;
dimSize = new int[order];
for (int i = 0; i < tensor->order; i++)
if (i != dim)
dimSize[i] = tensor->dimSize[i];
int catDimSize = 0;
for (int i = 0; i < smalls.count; i++) {
XTensor * tensor = (XTensor*)smalls.GetItem(i);
catDimSize += tensor->dimSize[dim];
}
dimSize[dim] = catDimSize;
XTensor big = NewTensor(order, dimSize, tensor->dataType, tensor->denseRatio, tensor->devID, tensor->mem);
big.SetZeroAll();
big.SetTMP();
/* call _ConcatenateSolely function */
_ConcatenateSolely(&smalls, &big, dim);
/* destroy variables */
delete[] dimSize;
return big;
}
}
/*
concatenate two tensors along a given dimension
>> smallA - one tensor for concatenation
>> smallB - the other tensor for concatenation
>> big - the resulting tensor
>> dim - which dimension we perform the concatenation
*/
void Concatenate(XTensor * smallA, XTensor * smallB, XTensor * big, int dim)
void _Concatenate(const XTensor * smallA, const XTensor * smallB, XTensor * big, int dim)
{
XList smalls(2);
smalls.Add(smallA);
smalls.Add(smallB);
Concatenate(&smalls, big, dim);
_Concatenate(&smalls, big, dim);
}
/*
concatenate two tensors along a given dimension (return a XTensor structure).
make a new tensor to keep the result and return it.
>> smallA - one tensor for concatenation
>> smallB - the other tensor for concatenation
>> big - the resulting tensor
>> dim - which dimension we perform the concatenation
<< return - the tensor of concatenating two tensors along a given dimension
*/
XTensor Concatenate(const XTensor &smallA, const XTensor &smallB, int dim)
{
XList smalls(2);
smalls.Add(&smallA);
smalls.Add(&smallB);
/* call Concatenate function */
return Concatenate(smalls, dim);
}
} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
......@@ -28,13 +28,27 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
concatenate a list of tensors along a given dimension
Note that this is actually a wrapper that selects "ConcatenateSolely"
or "Merge" by means of the tensor shapes
Note that this is actually a wrapper that selects
"ConcatenateSolely" or "Merge" by means of the tensor shapes
*/
void Concatenate(XList * smalls, XTensor * big, int dim);
void _Concatenate(const XList * smalls, XTensor * big, int dim);
/*
concatenate a list of tensors along a given dimension (return a XTensor structure)
make a new tensor to keep the result and return it
Note that this is actually a wrapper that selects
"ConcatenateSolely" or "Merge" by means of the tensor shapes
*/
XTensor Concatenate(const XList &smalls, int dim);
/* concatenate two tensors along a given dimension */
void Concatenate(XTensor * smallA, XTensor * smallB, XTensor * big, int dim);
void _Concatenate(const XTensor * smallA, const XTensor * smallB, XTensor * big, int dim);
/*
concatenate two tensors along a given dimension (return a XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor Concatenate(const XTensor &smallA, const XTensor &smallB, int dim);
} // namespace nts(NiuTrans.Tensor)
......
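A minimal sketch of the two new value-returning forms (hypothetical sizes; both concatenate along dimension 1, turning two (2, 3) tensors into one (2, 6) tensor):

    XTensor * s1 = NewTensor2D(2, 3);
    XTensor * s2 = NewTensor2D(2, 3);
    s1->SetDataRand(0.0F, 1.0F);
    s2->SetDataRand(0.0F, 1.0F);

    XTensor t = Concatenate(*s1, *s2, 1);   /* two-tensor form */

    XList list(2);                          /* list form: takes any number of tensors */
    list.Add(s1);
    list.Add(s2);
    XTensor t2 = Concatenate(list, 1);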
......@@ -29,11 +29,12 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
concatenate a list of tensors along a given dimension
>> smalls - a list of tensors for concatenation
>> big - the resulting tensor
>> dim - which dimension we perform the concatenation
*/
void ConcatenateSolely(XList * smalls, XTensor * big, int dim)
void _ConcatenateSolely(const XList * smalls, XTensor * big, int dim)
{
CheckNTErrors((big->order > dim && dim >= 0), "Illegal dimension to concatenate!");
......@@ -92,7 +93,7 @@ void ConcatenateSolely(XList * smalls, XTensor * big, int dim)
sourceArrays->Add(tensor->data);
}
MergeBlockLists(sourceArrays, blockSizes, blockNum, big->data, big->mem);
_MergeBlockLists(sourceArrays, blockSizes, blockNum, big->data, big->mem);
delete[] blockSizes;
delete sourceArrays;
......
......@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* concatenate a list of tensors along a given dimension */
extern "C"
void ConcatenateSolely(XList * smalls, XTensor * big, int dim);
void _ConcatenateSolely(const XList * smalls, XTensor * big, int dim);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -35,12 +35,12 @@ set target data block index for the data movement in merge
>> gridNum - number of grids
>> mem - the memory pool
*/
void MakeMergeBlockIndex(int * blockIndex, int blockNum, int blockNumInMerge,
void _MakeMergeBlockIndex(int * blockIndex, int blockNum, int blockNumInMerge,
int splitSizeInGrid, int gridSize, int gridNum, XMem * mem)
{
if (mem != NULL && mem->devID >= 0) {
#ifdef USE_CUDA
CudaMakeMergeBlockIndex(mem->devID, blockIndex, blockNum, blockNumInMerge, splitSizeInGrid, gridSize, gridNum);
_CudaMakeMergeBlockIndex(mem->devID, blockIndex, blockNum, blockNumInMerge, splitSizeInGrid, gridSize, gridNum);
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
......
......@@ -70,7 +70,7 @@ set target data block index for the data movement in split
>> mem - the memory pool
*/
extern "C"
void CudaMakeMergeBlockIndex(int devID,
void _CudaMakeMergeBlockIndex(int devID,
int * blockIndex, int blockNum, int blockNumInMerge,
int splitSizeInGrid, int gridSize, int gridNum)
{
......
......@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* set target data block index for the data movement in split */
extern "C"
void CudaMakeMergeBlockIndex(int devID,
void _CudaMakeMergeBlockIndex(int devID,
int * blockIndex, int blockNum, int blockNumInMerge,
int splitSizeInGrid, int gridSize, int gridNum);
......
......@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* set target data block index for the data movement in merge */
extern "C"
void MakeMergeBlockIndex(int * blockIndex, int blockNum, int blockNumInMerge,
void _MakeMergeBlockIndex(int * blockIndex, int blockNum, int blockNumInMerge,
int splitSizeInGrid, int gridSize, int gridNum, XMem * mem);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -33,11 +33,11 @@ set target data block index for the data movement in split
>> blockNum - number of data blocks
>> mem - the memory pool
*/
void MakeSplitBlockIndex(int * blockIndex, int splitNum, int blockSplitSize, int blockNum, XMem * mem)
void _MakeSplitBlockIndex(int * blockIndex, int splitNum, int blockSplitSize, int blockNum, XMem * mem)
{
if (mem != NULL && mem->devID >= 0) {
#ifdef USE_CUDA
CudaMakeSplitBlockIndex(mem->devID, blockIndex, splitNum, blockSplitSize, blockNum);
_CudaMakeSplitBlockIndex(mem->devID, blockIndex, splitNum, blockSplitSize, blockNum);
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
......
......@@ -58,7 +58,7 @@ set target data block index for the data movement in split
>> blockNum - number of data blocks
*/
extern "C"
void CudaMakeSplitBlockIndex(int devID, int * blockIndex, int splitNum, int blockSplitSize, int blockNum)
void _CudaMakeSplitBlockIndex(int devID, int * blockIndex, int splitNum, int blockSplitSize, int blockNum)
{
int cudaGrids[3];
int cudaBlocks[3];
......
......@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* set target data block index for the data movement in split */
extern "C"
void CudaMakeSplitBlockIndex(int devID, int * blockIndex, int splitNum, int blockSplitSize, int blockNum);
void _CudaMakeSplitBlockIndex(int devID, int * blockIndex, int splitNum, int blockSplitSize, int blockNum);
#endif // USE_CUDA
......
......@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* set target data block index for the data movement in split */
extern "C"
void MakeSplitBlockIndex(int * blockIndex, int splitNum, int blockSplitSize, int blockNum, XMem * mem);
void _MakeSplitBlockIndex(int * blockIndex, int splitNum, int blockSplitSize, int blockNum, XMem * mem);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -29,15 +29,18 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
transform a tensor by merging it alone with a dimension, e.g., (N/3, M, 3) -> (N, M)
transform a tensor by merging it along with a dimension.
e.g., (N/3, M, 3) -> (N, M)
>> s - the source tensor
>> t - the target tensor (for return)
>> whereToMerge - the merging operation is along with which dimension
>> leadingDim - the leading dimension of merging, take (N/3, M, 3) -> (N, M) for example
whereToMerge = 0 (i.e., the dimension for "N/3")
leadingDim = 2 (i.e., the dimension for "3")
>> leadingDim - the leading dimension of merging, take (N/3, M, 3) -> (N, M)
for example, whereToMerge = 0 (i.e., the dimension for "N/3")
leadingDim = 2 (i.e., the dimension for "3")
*/
void Merge(XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
{
int whereToMergeRDI = s->order - whereToMerge - 1;
int leadingDimRDI = s->order - leadingDim - 1;
......@@ -120,9 +123,9 @@ void Merge(XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
mem->AllocBuf(mem->devID, blockNum * gridNum * sizeof(int)) :
XMemAlloc(mem->devID, blockNum * gridNum * sizeof(int)));
MakeMergeBlockIndex(blockIndex, blockNum, blockNumInMerge, splitSizeInGrid, gridSize, gridNum, mem);
_MakeMergeBlockIndex(blockIndex, blockNum, blockNumInMerge, splitSizeInGrid, gridSize, gridNum, mem);
CopyBlocksOnSite(s->data, realBlockSize, blockNum, dataTMP, blockIndex, mem);
_CopyBlocksOnSite(s->data, realBlockSize, blockNum, dataTMP, blockIndex, mem);
if (mem != NULL)
mem->ReleaseBuf(mem->devID, blockNum * gridNum * sizeof(int));
......@@ -144,12 +147,60 @@ void Merge(XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
}
/*
transform a tensor by merging it along with a dimension (return a XTensor structure)
make a new tensor to keep the result and return it
e.g., (N/3, M, 3) -> (N, M)
>> s - the source tensor
>> whereToMerge - the merging operation is along with which dimension
>> leadingDim - the leading dimension of merging, take (N/3, M, 3) -> (N, M)
for example, whereToMerge = 0 (i.e., the dimension for "N/3")
leadingDim = 2 (i.e., the dimension for "3")
<< return - the transformed tensor by merging along with a dimension
*/
XTensor Merge(const XTensor &s, int whereToMerge, int leadingDim)
{
CheckNTErrors(&s != NULL, "Invalid tensors!");
CheckNTErrors((leadingDim < whereToMerge), "Invalid leading dimension!");
if (leadingDim < 0)
leadingDim = 0;
int order = s.order - 1;
int * dimSize = new int[order];
for (int i = 0; i < s.order; i++) {
if (i < leadingDim)
dimSize[i] = s.dimSize[i];
else if (i > leadingDim) {
if (i != whereToMerge)
dimSize[i - 1] = s.dimSize[i];
else
dimSize[i - 1] = s.dimSize[i] * s.dimSize[leadingDim];
}
}
XTensor t = NewTensor(order, dimSize, s.dataType, s.denseRatio, s.devID, s.mem);
t.SetZeroAll();
t.SetTMP();
/* call _Merge function */
_Merge(&s, &t, whereToMerge, leadingDim);
/* destroy variables */
delete[] dimSize;
return t;
}
/*
merge small tensors into a big tensor
>> smalls - the list of the small tensors
>> big - the merged tensor (for return)
>> whereToMerge - the merging operation is along with which dimension
*/
void Merge(XList * smalls, XTensor * big, int whereToMerge)
void _Merge(const XList * smalls, XTensor * big, int whereToMerge)
{
CheckNTErrors((smalls != NULL), "Invalid list!");
CheckNTErrors((smalls->count > 0), "Empty list!");
......@@ -241,7 +292,7 @@ void Merge(XList * smalls, XTensor * big, int whereToMerge)
}
}
Merge(tensorTMP, big, whereToMerge);
_Merge(tensorTMP, big, whereToMerge);
delete[] dimSizeTMP;
tensorTMP->data = NULL;
......@@ -255,4 +306,38 @@ void Merge(XList * smalls, XTensor * big, int whereToMerge)
XMemFree(mem->devID, dataTMP);
}
}
/*
merge small tensors into a big tensor (return a XTensor structure)
make a new tensor to keep the result and return it
>> smalls - the list of the small tensors
>> whereToMerge - the merging operation is along with which dimension
<< return - the big tensor merged by small tensors
*/
XTensor Merge(const XList &smalls, int whereToMerge)
{
XTensor * tensor = (XTensor*)smalls.GetItem(0);
int order = tensor->order;
int * dimSize = new int[order];
for (int i = 0; i < tensor->order; i++) {
if (i != whereToMerge)
dimSize[i] = tensor->dimSize[i];
else
dimSize[i] = tensor->dimSize[whereToMerge] * smalls.count;
}
XTensor big = NewTensor(order, dimSize, tensor->dataType, tensor->denseRatio, tensor->devID, tensor->mem);
big.SetZeroAll();
big.SetTMP();
/* call _Merge function */
_Merge(&smalls, &big, whereToMerge);
/* destroy variables */
delete[] dimSize;
return big;
}
} // namespace nts(NiuTrans.Tensor)
......@@ -27,10 +27,23 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* transform a tensor by merging it along with a dimension, e.g., (M, N/3, 3) -> (M, N) */
void Merge(XTensor * s, XTensor * t, int whereToMerge, int leadingDim = -1);
void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim = -1);
/*
transform a tensor by merging it along with a dimension (return a XTensor structure).
make a new tensor to keep the result and return it.
e.g., (M, N/3, 3) -> (M, N)
*/
XTensor Merge(const XTensor &s, int whereToMerge, int leadingDim = -1);
/* merge small tensors into a big tensor */
void Merge(XList * smalls, XTensor * big, int whereToMerge);
void _Merge(const XList * smalls, XTensor * big, int whereToMerge);
/*
merge small tensors into a big tensor (return a XTensor structure).
make a new tensor to keep the result and return it.
*/
XTensor Merge(const XList &smalls, int whereToMerge);
} // namespace nts(NiuTrans.Tensor)
......
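A minimal sketch of the value-returning list form (hypothetical sizes; the single-tensor form works analogously for folding a trailing dimension back into another one):

    XTensor * s1 = NewTensor2D(2, 3);
    XTensor * s2 = NewTensor2D(2, 3);
    s1->SetDataRand(0.0F, 1.0F);
    s2->SetDataRand(0.0F, 1.0F);

    XList parts(2);
    parts.Add(s1);
    parts.Add(s2);
    XTensor big = Merge(parts, 0);   /* two (2, 3) tensors merged along dimension 0 -> (4, 3) */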
......@@ -34,11 +34,11 @@ merge data by blocks
>> target - target data array
>> myMem - memory pool
*/
void MergeBlockLists(XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem)
void _MergeBlockLists(const XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem)
{
if (myMem != NULL && myMem->devID >= 0) {
#ifdef USE_CUDA
CudaMergeBlockLists(sourceList, blockSizes, blockNum, target, myMem);
_CudaMergeBlockLists(sourceList, blockSizes, blockNum, target, myMem);
#else
ShowNTErrors("Plesae specify USE_CUDA and recompile the code!");
#endif
......
......@@ -72,7 +72,7 @@ merge data by blocks (cuda version)
>> myMem - the memory pool
*/
extern "C"
void CudaMergeBlockLists(XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem)
void _CudaMergeBlockLists(const XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem)
{
CheckNTErrors((myMem != NULL), "No memory pool!");
CheckNTErrors((myMem->devID >= 0), "Wrong device to run!");
......
......@@ -34,7 +34,7 @@ void KernelCopyBlockLists(DTYPE ** sourceList, int * sourceBlockSizes, int sourc
/* merge data by blocks (cuda version) */
extern "C"
void CudaMergeBlockLists(XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem);
void _CudaMergeBlockLists(const XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem);
#endif // USE_CUDA
......
......@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* merge data by blocks */
extern "C"
void MergeBlockLists(XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem);
void _MergeBlockLists(const XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -28,17 +28,25 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define permute _Permute_
/* generate the tensor with permuted dimensions: b = permuted(a) */
void Permute(XTensor * a, XTensor * b, int * dimPermute);
/* permute the tensor dimensions on site: a = permuted(a) */
void Permute_(XTensor * a, int * dimPermute);
/*
generate the tensor with permuted dimensions.
b = permuted(a)
*/
void _Permute(XTensor * a, XTensor * b, int * dimPermute);
/* make a tensor with permuted dimensions: b = permuted(a) and return its pointer */
XTensor * _Permute(XTensor *a, int * dimPermute);
/*
permute the tensor dimensions (do it on site).
keep the result in the input tensor and return nothing.
a = permuted(a)
*/
void _PermuteMe(XTensor * a, int * dimPermute);
/* make a tensor with permuted dimensions: b = permuted(a) and return its body */
XTensor& _Permute_(XTensor &a, int * dimPermute);
/*
make a tensor with permuted dimensions (return a XTensor structure).
make a new tensor to keep the result and return it.
b = permuted(a)
*/
XTensor Permute(XTensor &a, int * dimPermute);
} // namespace nts(NiuTrans.Tensor)
......
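A minimal sketch of the permutation interfaces (hypothetical sizes; dimPermute specifies the new ordering of the dimensions):

    int dims[2] = {3, 4};
    int perm[2] = {1, 0};              /* swap the two dimensions */
    XTensor * a = NewTensor(2, dims);
    a->SetDataRand(0.0F, 1.0F);
    XTensor b = Permute(*a, perm);     /* b has size (4, 3) */
    _PermuteMe(a, perm);               /* in-place form: a itself becomes (4, 3) */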
......@@ -29,12 +29,13 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
transform a tensor by splitting it, e.g., (N, M) -> (N/3, M, 3)
>> s - the source tensor
>> t - the target tensor (for return)
>> whereToSplit - which dimension of the tensor is to split
>> splitNum - how many splits
*/
void Split(XTensor * s, XTensor * t, int whereToSplit, int splitNum)
void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
{
CheckNTErrors((s && t), "Invalid tensors!");
CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)),
......@@ -108,9 +109,9 @@ void Split(XTensor * s, XTensor * t, int whereToSplit, int splitNum)
mem->AllocBuf(mem->devID, blockNum * sizeof(int)) :
XMemAlloc(mem->devID, blockNum * sizeof(int)));
MakeSplitBlockIndex(blockIndex, splitNum, blockSplitSize, blockNum, mem);
_MakeSplitBlockIndex(blockIndex, splitNum, blockSplitSize, blockNum, mem);
CopyBlocksOnSite(s->data, realBlockSize, blockNum, dataTMP, blockIndex, mem);
_CopyBlocksOnSite(s->data, realBlockSize, blockNum, dataTMP, blockIndex, mem);
if (mem != NULL)
mem->ReleaseBuf(mem->devID, blockNum * sizeof(int));
......@@ -130,15 +131,52 @@ void Split(XTensor * s, XTensor * t, int whereToSplit, int splitNum)
}
/*
split a big tensor into small tensors
transform a tensor by splitting it, e.g., (N, M) -> (N/3, M, 3) (return a XTensor structure)
make a new tensor to keep the result and return it
>> s - the source tensor
>> whereToSplit - which dimension of the tensor is to split
>> splitNum - how many splits
<< return - the transformed tensor by splitting it
*/
XTensor Split(const XTensor &s, int whereToSplit, int splitNum)
{
CheckNTErrors(&s, "Invalid tensors!");
int order = s.order + 1;
int * dimSize = new int[order];
for (int i = 0; i < s.order; i++) {
if (i == whereToSplit)
dimSize[i] = s.dimSize[i] / splitNum;
else
dimSize[i] = s.dimSize[i];
}
dimSize[order - 1] = splitNum;
XTensor t = NewTensor(order, dimSize, s.dataType, s.denseRatio, s.devID, s.mem);
t.SetZeroAll();
t.SetTMP();
/* call _Split function */
_Split(&s, &t, whereToSplit, splitNum);
/* destroy variables */
delete[] dimSize;
return t;
}
/*
split a big tensor into small tensors.
>> big - the source tensor
>> smalls - the list that keeps the resulting tensors (for return)
NOTE that all the "small" tensors have already been
placed in the list in advance.
NOTE that all the "small" tensors have already been placed in the list in advance.
>> whereToSplit - which dimension of the tensor is to split
>> splitNum - how many splits
*/
void Split(XTensor * big, XList * smalls, int whereToSplit, int splitNum)
void _Split(const XTensor * big, XList * smalls, int whereToSplit, int splitNum)
{
CheckNTErrors((smalls != NULL), "Invalid list!");
CheckNTErrors((smalls->count == splitNum), "Unmatched tensors!");
......@@ -211,7 +249,7 @@ void Split(XTensor * big, XList * smalls, int whereToSplit, int splitNum)
tensorTMP->data = dataTMP;
Split(big, tensorTMP, whereToSplit, splitNum);
_Split(big, tensorTMP, whereToSplit, splitNum);
/* copy from tmp to target */
if (!uniform) {
......@@ -234,4 +272,45 @@ void Split(XTensor * big, XList * smalls, int whereToSplit, int splitNum)
XMemFree(mem->devID, dataTMP);
}
}
/*
split a big tensor into small tensors (return a XList structure).
make a new list to keep the result and return it.
>> big - the source tensor
>> whereToSplit - which dimension of the tensor is to split
>> splitNum - how many splits
<< return - a list of small tensors by splitting a big tensor
*/
XList SplitList(const XTensor &big, int whereToSplit, int splitNum)
{
CheckNTErrors(&big, "Invalid tensors!");
XList smalls = XList(splitNum);
int order = big.order;
int * dimSize = new int[order];
for (int i = 0; i < big.order; i++) {
if (i != whereToSplit)
dimSize[i] = big.dimSize[i];
else
dimSize[i] = big.dimSize[i] / splitNum;
}
for (int i = 0; i < splitNum; i++) {
XTensor tensor = NewTensor(order, dimSize, big.dataType, big.denseRatio, big.devID, big.mem);
tensor.SetZeroAll();
tensor.SetTMP();
smalls.Add(&tensor);
}
/* call _Split function */
_Split(&big, &smalls, whereToSplit, splitNum);
/* destroy variables */
delete[] dimSize;
return smalls;
}
} // namespace nts(NiuTrans.Tensor)
......@@ -26,12 +26,27 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* transform a tensor by splitting it, e.g., (M, N) -> (M, N/3, 3) */
extern "C"
void Split(XTensor * s, XTensor * t, int whereToSplit, int splitNum);
/*
transform a tensor by splitting it
e.g., (M, N) -> (M, N/3, 3)
*/
void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum);
/*
transform a tensor by splitting it (return a XTensor structure)
make a new tensor to keep the result and return it
e.g., (M, N) -> (M, N/3, 3)
*/
XTensor Split(const XTensor &s, int whereToSplit, int splitNum);
/* split a big tensor into small tensors */
void Split(XTensor * big, XList * smalls, int whereToSplit, int splitNum);
void _Split(const XTensor * big, XList * smalls, int whereToSplit, int splitNum);
/*
split a big tensor into small tensors (return a XList structure)
make a new list to keep the result and return it
*/
XList SplitList(const XTensor &big, int whereToSplit, int splitNum);
} // namespace nts(NiuTrans.Tensor)
......
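A minimal sketch of the value-returning forms (hypothetical sizes):

    XTensor * s = NewTensor2D(6, 4);
    s->SetDataRand(0.0F, 1.0F);
    XTensor t = Split(*s, 0, 3);          /* (6, 4) -> (2, 4, 3): dimension 0 is cut into 3 parts */
    XList parts = SplitList(*s, 0, 3);    /* the same split, returned as a list of 3 small tensors */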
......@@ -29,17 +29,25 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#define transpose _Transpose_
/* generate a transposed 1D/2D tensor: b = transposed(a) */
void Transpose(XTensor * a, XTensor * b);
/* transpose a 1D/2D tensor on site: a = transposed(a) */
void Transpose_(XTensor * a);
/*
generate a transposed 1D/2D tensor
b = transposed(a)
*/
void _Transpose(XTensor * a, XTensor * b);
/* make a transposed 1D/2D tensor: b = transposed(a) and return its pointer */
XTensor * _Transpose(XTensor * a);
/*
transpose a 1D/2D tensor (do it on site).
keep the result in the input tensor and return nothing.
a = transposed(a)
*/
void _TransposeMe(XTensor * a);
/* make a transposed 1D/2D tensor: b = transposed(a) and return its body */
XTensor & _Transpose_(XTensor & a);
/*
make a transposed 1D/2D tensor (return a XTensor structure).
make a new tensor to keep the result and return it.
b = transposed(a)
*/
XTensor Transpose(XTensor &a);
} // namespace nts(NiuTrans.Tensor)
......
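A minimal sketch of the transpose interfaces (hypothetical sizes):

    XTensor * a = NewTensor2D(3, 5);
    a->SetDataRand(0.0F, 1.0F);
    XTensor b = Transpose(*a);     /* b is the (5, 3) transpose of a */
    _TransposeMe(a);               /* in-place form: a itself becomes (5, 3) */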
......@@ -28,13 +28,15 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
insert a dimension by copying the blocks for x times (where x is the size of the inserted dimension)
insert a dimension by copying the blocks for x times
(where x is the size of the inserted dimension)
>> a - input tensor
>> b - output tensor
>> dim - where to insert the dimension
>> dSize - size of the newly-inserted dimension
*/
void Unsqueeze(XTensor * a, XTensor * b, int dim, int dSize)
void _Unsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
{
CheckNTErrors((a && b), "Empty input tensors!");
CheckNTErrors((a->order == b->order - 1), "Unmatched tensors!");
......@@ -70,7 +72,7 @@ void Unsqueeze(XTensor * a, XTensor * b, int dim, int dSize)
if (a->devID >= 0 || b->devID >= 0) {
#ifdef USE_CUDA
CudaUnsqueeze(a, b, dim, dSize);
_CudaUnsqueeze(a, b, dim, dSize);
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
......@@ -87,11 +89,50 @@ void Unsqueeze(XTensor * a, XTensor * b, int dim, int dSize)
}
}
MergeBlockLists(sourceArrays, blockSizes, 1, b->data, b->mem);
_MergeBlockLists(sourceArrays, blockSizes, 1, b->data, b->mem);
delete sourceArrays;
delete[] blockSizes;
}
}
/*
insert a dimension by copying the blocks for x times
(where x is the size of the inserted dimension) (return a XTensor structure)
make a new tensor to keep the result and return it
>> a - input tensor
>> dim - where to insert the dimension
>> dSize - size of the newly-inserted dimension
<< return - a tensor by inserting a dimension by copying the blocks for x times
*/
XTensor Unsqueeze(const XTensor &a, int dim, int dSize)
{
CheckNTErrors(&a, "Empty input tensors!");
int order = a.order + 1;
int * dimSize = new int[order];
for (int i = 0; i < order; i++) {
if (i < dim)
dimSize[i] = a.dimSize[i];
else if (i == dim)
dimSize[i] = dSize;
else
dimSize[i] = a.dimSize[i - 1];
}
XTensor b = NewTensor(order, dimSize, a.dataType, a.denseRatio, a.devID, a.mem);
b.SetZeroAll();
b.SetTMP();
/* call _Unsqueeze function */
_Unsqueeze(&a, &b, dim, dSize);
/* destroy variables */
delete[] dimSize;
return b;
}
} // namespace nts(NiuTrans.Tensor)
......@@ -67,7 +67,7 @@ insert a dimension by copying the blocks for x times (where x is the size of the
>> dSize - size of the newly-inserted dimension
*/
extern "C"
void CudaUnsqueeze(XTensor * a, XTensor * b, int dim, int dSize)
void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
{
int blockSize = 1;
int blockNumA = 1;
......
......@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* duplicate the data along a given dimension */
extern "C"
void CudaUnsqueeze(XTensor * a, XTensor * b, int dim, int dSize);
void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize);
#endif // USE_CUDA
......
......@@ -26,10 +26,16 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* insert a dimension by copying the blocks for x times (where x is the size of the inserted dimension) */
extern "C"
void Unsqueeze(XTensor * a, XTensor * b, int dim, int dSize);
void _Unsqueeze(const XTensor * a, XTensor * b, int dim, int dSize);
/*
insert a dimension by copying the blocks for x times
(where x is the size of the inserted dimension) (return a XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor Unsqueeze(const XTensor &a, int dim, int dSize);
} // namespace nts(NiuTrans.Tensor)
......
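A minimal sketch of the value-returning form (hypothetical sizes):

    XTensor * a = NewTensor2D(2, 3);
    a->SetDataRand(0.0F, 1.0F);
    XTensor b = Unsqueeze(*a, 1, 4);   /* (2, 3) -> (2, 4, 3): a new dimension of size 4 is inserted
                                          at position 1 and each block of a is copied 4 times along it */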
......@@ -29,11 +29,12 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/*
sort the tensor along a given dimension
>> a - the tensor
>> index - index of the items in the resulting tensor
>> dim - the dimension along which the sorting is performed
*/
void Sort(XTensor * a, XTensor * index, int dim)
void _Sort(XTensor * a, XTensor * index, int dim)
{
CheckNTErrors((dim >= 0 && dim < a->order), "Incorrect dimension specified!");
CheckNTErrors((a->order == index->order), "Unmatched input tensors!");
......@@ -45,7 +46,7 @@ void Sort(XTensor * a, XTensor * index, int dim)
if (a->devID >= 0) {
#ifdef USE_CUDA
CudaSortBig(a, a, index, index, dim);
_CudaSortBig(a, a, index, index, dim);
#else
ShowNTErrors("Plesae specify USE_CUDA and recompile the code!");
#endif
......
......@@ -210,7 +210,7 @@ sort the tensor along a given dimension
>> dim - specified dimension
>> k - top-k results are returned
*/
void CudaSortBig(XTensor * a, XTensor * b, XTensor * indexA, XTensor * indexB, int dim, int k)
void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * indexB, int dim, int k)
{
CheckNTErrors((a && b), "Empty input tensor!");
CheckNTErrors((a->unitSize == b->unitSize), "Unmatched tensors!");
......
......@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* sort the tensor along a given dimension */
extern "C"
void CudaSortBig(XTensor * a, XTensor * b, XTensor * indexA, XTensor * indexB, int dim, int k = -1);
void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * indexB, int dim, int k = -1);
#endif // USE_CUDA
......
......@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* sort the data along a given dimension */
extern "C"
void Sort(XTensor * a, XTensor * index, int dim);
void _Sort(XTensor * a, XTensor * index, int dim);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -34,7 +34,7 @@ get the top-k items along a given dimension
>> dim - the dimension along which the sorting is performed
>> k - how many items returned after sorting
*/
void TopK(XTensor * a, XTensor * b, XTensor * index, int dim, int k)
void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
{
CheckNTErrors((a->unitSize == b->unitSize), "Unmatched input tensors!");
CheckNTErrors((a->order == b->order), "Unmatched input tensors!");
......@@ -55,7 +55,7 @@ void TopK(XTensor * a, XTensor * b, XTensor * index, int dim, int k)
if (a->devID >= 0 || b->devID >= 0) {
#ifdef USE_CUDA
CudaTopK(a, b, index, dim, k);
_CudaTopK(a, b, index, dim, k);
#else
ShowNTErrors("Plesae specify USE_CUDA and recompile the code!");
#endif
......
......@@ -370,7 +370,7 @@ get the top-k items along a given dimension
>> dim - the dimension along which the sorting is performed
>> k - how many items returned after sorting
*/
void CudaTopK(XTensor * a, XTensor * b, XTensor * index, int dim, int k)
void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k)
{
CheckNTErrors((a->unitSize == b->unitSize), "Unmatched input tensors!");
CheckNTErrors((a->order == b->order), "Unmatched input tensors!");
......@@ -439,7 +439,7 @@ void CudaTopK(XTensor * a, XTensor * b, XTensor * index, int dim, int k)
/* make the index tensor */
indexA->SetAscendingOrder(dim);
CudaSortBig(a, b, indexA, index, dim, k);
_CudaSortBig(a, b, indexA, index, dim, k);
if (a->mem != NULL)
a->mem->ReleaseBuf(a->devID, a->unitNum * sizeof(int));
......
......@@ -30,7 +30,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* get the top-k items along a given dimension */
extern "C"
void CudaTopK(XTensor * a, XTensor * b, XTensor * index, int dim, int k);
void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k);
#endif // USE_CUDA
......
......@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* get the top-k items along a given dimension */
extern "C"
void TopK(XTensor * a, XTensor * b, XTensor * index, int dim, int k);
void _TopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k);
} // namespace nts(NiuTrans.Tensor)
......
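A minimal sketch of _TopK (hypothetical sizes; b and index must already have the reduced size k along the chosen dimension):

    int dimsA[2] = {2, 5};
    int dimsB[2] = {2, 3};
    XTensor * a = NewTensor(2, dimsA);
    XTensor * b = NewTensor(2, dimsB);
    XTensor * index = NewTensor(2, dimsB, X_INT);
    a->SetDataRand(0.0F, 1.0F);
    _TopK(a, b, index, 1, 3);   /* the 3 largest items of each row of a, plus their positions */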
......@@ -33,11 +33,11 @@ y = 1 if x > 1
>> x - input tensor
>> y - result
*/
void HardTanH(XTensor * x, XTensor * y)
void _HardTanH(const XTensor * x, XTensor * y)
{
#ifdef USE_CUDA
if(x->devID >= 0 || y->devID >= 0){
CudaHardTanH(x, y);
_CudaHardTanH(x, y);
return;
}
#endif
......
......@@ -60,7 +60,7 @@ y = 1 if x > 1
>> x - input tensor
>> y - output tensor
*/
void CudaHardTanH(XTensor * x, XTensor * y)
void _CudaHardTanH(const XTensor * x, XTensor * y)
{
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
......
......@@ -36,7 +36,7 @@ y = 1 if x > 1
-1 if x < -1
*/
extern "C"
void CudaHardTanH(XTensor * input, XTensor * output);
void _CudaHardTanH(const XTensor * input, XTensor * output);
/* de/dx (Cuda version) */
extern "C"
......
......@@ -34,7 +34,7 @@ y = 1 if x > 1
-1 if x < -1
*/
extern "C"
void HardTanH(XTensor * x, XTensor * y);
void _HardTanH(const XTensor * x, XTensor * y);
/* de/dx */
extern "C"
......
......@@ -30,9 +30,9 @@ identity function y = x
>> x - input tensor
>> y - result
*/
void Identity(XTensor * x, XTensor * y)
void _Identity(const XTensor * x, XTensor * y)
{
CopyValues(x, y);
_CopyValues(x, y);
}
/*
......@@ -61,7 +61,7 @@ void IdentityBackward(XTensor * gold, XTensor * y, XTensor * x,
LossBackward(dedy, gold, y, lossName);
if(dedy->data != dedx->data)
CopyValues(dedy, dedx);
_CopyValues(dedy, dedx);
}
else
ShowNTErrors("TODO!");
......
......@@ -29,7 +29,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/* identity function y = x */
extern "C"
void Identity(XTensor * x, XTensor * y);
void _Identity(const XTensor * x, XTensor * y);
/* de/dx */
extern "C"
......
......@@ -35,7 +35,7 @@ log scale softmax y = log(e^x / \sum_{i} e^{x_i})
>> y - result
>> leadDim - leading dimension (along which we perform reduction)
*/
void LogSoftmax(XTensor * x, XTensor * y, int leadDim)
void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim)
{
int leadDimRDI = x->order - leadDim - 1;
if (!x->isSparse && !y->isSparse &&
......@@ -73,8 +73,8 @@ void LogSoftmax(XTensor * x, XTensor * y, int leadDim)
max->data = mem != NULL ? (char*)mem->AllocBuf(mem->devID, max->unitNum * max->unitSize) : XMemAlloc(max->devID, max->unitNum * max->unitSize);
sum->data = mem != NULL ? (char*)mem->AllocBuf(mem->devID, sum->unitNum * sum->unitSize) : XMemAlloc(sum->devID, sum->unitNum * sum->unitSize);
ReduceMax(x, max, leadDim);
ReduceSum(x, sum, leadDim, max, 1.0F, true);
_ReduceMax(x, max, leadDim);
_ReduceSum(x, sum, leadDim, max, 1.0F, true);
if (x->devID >= 0) {
int dims[2];
......
......@@ -36,7 +36,7 @@ log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (Cuda version)
>> y - result
>> leadDim - leading dimension (along which we perform reduction)
*/
void CudaLogSoftmax(XTensor * x, XTensor * y, int leadDim)
void _CudaLogSoftmax(const XTensor * x, XTensor * y, int leadDim)
{
ShowNTErrors("You should call LogSoftmax instead!");
}
......
......@@ -29,10 +29,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (Cuda version) */
extern "C"
void CudaLogSotmax(XTensor * input, XTensor * output, int leadDim);
void _CudaLogSoftmax(const XTensor * input, XTensor * output, int leadDim);
/* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) (Cuda version) */
extern "C"
......
......@@ -29,7 +29,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/* log scale softmax y = log(e^x / \sum_{i} e^{x_i}) */
extern "C"
void LogSoftmax(XTensor * x, XTensor * y, int leadDim);
void _LogSoftmax(const XTensor * x, XTensor * y, int leadDim);
/* de/dx */
extern "C"
......
......@@ -77,7 +77,7 @@ DTYPE CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
if(LFName == SQUAREDERROR){
XTensor * diff = NewTensor(gold->order, gold->dimSize, gold->dataType, gold->denseRatio, gold->devID, gold->mem);
_Sum(gold, y, diff, -1.0F);
Power(diff, 2.0F);
_Power(diff, 2.0F);
_ScaleAndShiftMe(diff, 0.5F, 0.0F);
int reduceTimes = diff->order;
......@@ -87,7 +87,7 @@ DTYPE CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
memcpy(diffDimSize, diff->dimSize + 1, diffOrder * sizeof(int));
XTensor * diffNew = NewTensor(diffOrder, diffDimSize, X_FLOAT, 1.0F, diff->devID, diff->mem);
int reducePlace = diff->dimSize[0] == 1 ? 1 : 0;
ReduceSum(diff, diffNew, reducePlace);
_ReduceSum(diff, diffNew, reducePlace);
if (diffNew->order == 1) {
diffNew->order = 2;
diffNew->dimSize[1] = diffNew->dimSize[0];
......@@ -109,10 +109,10 @@ DTYPE CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
*/
if(LFName == CROSSENTROPY){
XTensor * diff = NewTensor(y->order, y->dimSize, y->dataType, y->denseRatio, y->devID, y->mem);
CopyValues(y, diff);
Log(diff);
_CopyValues(y, diff);
_Log(diff);
_Multiply(gold, diff, diff);
Negate(diff);
_Negate(diff);
int reduceTimes = diff->order;
for (int i = 0; i < reduceTimes; i++) {
......@@ -121,7 +121,7 @@ DTYPE CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
memcpy(diffDimSize, diff->dimSize + 1, diffOrder * sizeof(int));
XTensor * diffNew = NewTensor(diffOrder, diffDimSize, X_FLOAT, 1.0F, diff->devID, diff->mem);
int reducePlace = diff->dimSize[0] == 1 ? 1 : 0;
ReduceSum(diff, diffNew, reducePlace);
_ReduceSum(diff, diffNew, reducePlace);
if (diffNew->order == 1) {
diffNew->order = 2;
diffNew->dimSize[1] = diffNew->dimSize[0];
......@@ -145,10 +145,10 @@ DTYPE CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
if(LFName == ONEHOTERROR){
XTensor * diff = NewTensor(gold->order, gold->dimSize, gold->dataType, gold->denseRatio, gold->devID, gold->mem);
XTensor * yOnehot = NewTensor(y->order, y->dimSize, y->dataType, y->denseRatio, y->devID, y->mem);
CopyValues(y, yOnehot);
_CopyValues(y, yOnehot);
_Multiply(gold, y, yOnehot);
_Sum(gold, yOnehot, diff, -1.0F);
Power(diff, 2.0F);
_Power(diff, 2.0F);
_ScaleAndShiftMe(diff, 0.5F, 0.0F);
int reduceTimes = diff->order;
......@@ -158,7 +158,7 @@ DTYPE CudaLossCompute(XTensor * gold, XTensor * y, LOSS_FUNCTION_NAME LFName,
memcpy(diffDimSize, diff->dimSize + 1, diffOrder * sizeof(int));
XTensor * diffNew = NewTensor(diffOrder, diffDimSize, X_FLOAT, 1.0F, diff->devID, diff->mem);
int reducePlace = diff->dimSize[0] == 1 ? 1 : 0;
ReduceSum(diff, diffNew, reducePlace);
_ReduceSum(diff, diffNew, reducePlace);
if (diffNew->order == 1) {
diffNew->order = 2;
diffNew->dimSize[1] = diffNew->dimSize[0];
......
......@@ -29,11 +29,11 @@ rectify function y = max(0, x)
>> input - input tensor
>> output - result
*/
void Rectify(XTensor * x, XTensor * y)
void _Rectify(const XTensor * x, XTensor * y)
{
#ifdef USE_CUDA
if(x->devID >= 0 || y->devID >= 0){
CudaRectify(x, y);
_CudaRectify(x, y);
return;
}
#endif
......
......@@ -54,7 +54,7 @@ rectify function y = max(0, x)
>> x - input tensor
>> y - result
*/
void CudaRectify(XTensor * x, XTensor * y)
void _CudaRectify(const XTensor * x, XTensor * y)
{
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
......
......@@ -31,7 +31,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/* rectify function y = max(0, x) (Cuda version) */
extern "C"
void CudaRectify(XTensor * input, XTensor * output);
void _CudaRectify(const XTensor * input, XTensor * output);
/* de/dx (Cuda version) */
extern "C"
......
......@@ -29,7 +29,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/* rectify function y = max(0, x) */
extern "C"
void Rectify(XTensor * x, XTensor * y);
void _Rectify(const XTensor * x, XTensor * y);
/* de/dx */
extern "C"
......
......@@ -30,11 +30,11 @@ sigmoid function y = 1/(1+exp(-x))
>> x - input tensor
>> y - result
*/
void Sigmoid(XTensor * x, XTensor * y)
void _Sigmoid(const XTensor * x, XTensor * y)
{
#ifdef USE_CUDA
if(x->devID >= 0 || y->devID >= 0){
CudaSigmoid(x, y);
_CudaSigmoid(x, y);
return;
}
#endif
......
......@@ -58,7 +58,7 @@ sigmoid function y = 1/(1+exp(-x)) (Cuda version)
>> x - input vector
>> y - result
*/
void CudaSigmoid(XTensor * x, XTensor * y)
void _CudaSigmoid(const XTensor * x, XTensor * y)
{
if(x->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE){
......
......@@ -31,7 +31,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/* sigmoid function y = 1/(1+exp(-x)) (Cuda version) */
extern "C"
void CudaSigmoid(XTensor * input, XTensor * output);
void _CudaSigmoid(const XTensor * input, XTensor * output);
/* de/dx (Cuda version) */
extern "C"
......
......@@ -29,7 +29,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/* sigmoid function y = 1/(1+exp(-x)) */
extern "C"
void Sigmoid(XTensor * x, XTensor * y);
void _Sigmoid(const XTensor * x, XTensor * y);
/* de/dx */
extern "C"
......
......@@ -34,7 +34,7 @@ softmax y = e^x / \sum_{i} e^{x_i}
>> y - result
>> leadDim - leading dimension (along which we perform reduction)
*/
void Softmax(XTensor * x, XTensor * y, int leadDim)
void _Softmax(const XTensor * x, XTensor * y, int leadDim)
{
int leadDimRDI = x->order - leadDim - 1;
if(!x->isSparse && !y->isSparse && x->dataType == y->dataType){
......@@ -56,12 +56,12 @@ void Softmax(XTensor * x, XTensor * y, int leadDim)
max->data = mem != NULL ? (char*)mem->AllocBuf(mem->devID, max->unitNum * max->unitSize) : XMemAlloc(max->devID, max->unitNum * max->unitSize);
sum->data = mem != NULL ? (char*)mem->AllocBuf(mem->devID, sum->unitNum * sum->unitSize) : XMemAlloc(sum->devID, sum->unitNum * sum->unitSize);
ReduceMax(x, max, leadDim);
ReduceSum(x, sum, leadDim, max, 1.0F, true);
_ReduceMax(x, max, leadDim);
_ReduceSum(x, sum, leadDim, max, 1.0F, true);
if(x->devID >= 0){
#ifdef USE_CUDA
CudaSoftmaxSumMax(x, y, leadDim, sum, max);
_CudaSoftmaxSumMax(x, y, leadDim, sum, max);
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
......
......@@ -39,7 +39,7 @@ softmax y = e^x / \sum_{i} e^{x_i} (Cuda version)
>> y - result
>> leadDim - leading dimension (along which we perform reduction)
*/
void CudaSoftmax(XTensor * x, XTensor * y, int leadDim)
void _CudaSoftmax(const XTensor * x, XTensor * y, int leadDim)
{
ShowNTErrors("You should call Softmax instead!");
}
......@@ -163,7 +163,7 @@ softmax y = e^x / \sum_{i} e^{x_i} (Cuda version)
>> sum - \sum_{i} e^{x_i}
>> max - \max_{i} e^{x_i}
*/
void CudaSoftmaxSumMax(XTensor * x, XTensor * y, int leadDim, XTensor * sum, XTensor * max)
void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * sum, XTensor * max)
{
CheckNTErrors((x->devID >= 0), "Forward computation of softmax must be run on GPUs.");
CheckNTErrors((x->devID == y->devID), "Tensors used in softmax are not on the same GPU.");
......@@ -289,10 +289,10 @@ void CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
/* \beta = \sum_i (dE/dy_i * y_i) */
_Multiply(dedy, y, ytmp, 0, 0);
ReduceSum(ytmp, beta, leadDim);
_ReduceSum(ytmp, beta, leadDim);
/* ytmp = dE/dy_j - \beta */
Unsqueeze(beta, ytmp, leadDim, y->dimSize[leadDim]);
_Unsqueeze(beta, ytmp, leadDim, y->dimSize[leadDim]);
_Sum(dedy, ytmp, ytmp, -1.0F);
/* dE/ds_j = y_j * ytmp = y_j * (dE/dy_j - \beta) */
......
......@@ -31,11 +31,11 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
/* softmax y = e^x / \sum_{i} e^{x_i} (Cuda version) */
extern "C"
void CudaSotmax(XTensor * input, XTensor * output, int leadDim);
void _CudaSoftmax(const XTensor * input, XTensor * output, int leadDim);
/* softmax y = e^x / \sum_{i} e^{x_i} (Cuda version) */
extern "C"
void CudaSoftmaxSumMax(XTensor * x, XTensor * y, int leadDim, XTensor * sum, XTensor * max);
void _CudaSoftmaxSumMax(const XTensor * x, XTensor * y, int leadDim, XTensor * sum, XTensor * max);
/* de/dx (Cuda version) */
extern "C"
......
......@@ -29,7 +29,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/* softmax y = e^x / \sum_{i} e^{x_i} */
extern "C"
void Softmax(XTensor * x, XTensor * y, int leadDim);
void _Softmax(const XTensor * x, XTensor * y, int leadDim);
/* de/dx */
extern "C"
......
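A minimal sketch of _Softmax (hypothetical sizes; the reduction runs over the leading dimension, so each slice along that dimension sums to 1 in the output):

    XTensor * x = NewTensor2D(2, 5);
    XTensor * y = NewTensor2D(2, 5);
    x->SetDataRand(-1.0F, 1.0F);
    _Softmax(x, y, 1);          /* softmax over dimension 1: every row of y sums to 1 */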
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-12
*/
#include "TAbsolute.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: test Absolute function.
Set every entry to its absolute value.
*/
bool TestAbsolute1()
{
/* a tensor of size (3, 2) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 3;
aDimSize[1] = 2;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
DTYPE aData[3][2] = { {1.0F, -2.0F},
{0.5F, -4.0F},
{0.0F, 6.0F} };
DTYPE answer[3][2] = { {1.0F, 2.0F},
{0.5F, 4.0F},
{0.0F, 6.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
/* initialize variables */
a->SetData(aData, aUnitNum);
/* call Absolute function */
_Absolute(a);
/* check results */
cpuTest = a->CheckData(answer, aUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
/* call Absolute function */
_Absolute(aGPU);
/* check results */
gpuTest = aGPU->CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete a;
delete aGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete[] aDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
*/
/* test for Absolute Function */
bool TestAbsolute()
{
XPRINT(0, stdout, "[TEST Absolute] set every entry to its absolute value \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestAbsolute1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-12
*/
#ifndef __TEST_ABSOLUTE_H__
#define __TEST_ABSOLUTE_H__
#include "../core/arithmetic/Absolute.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for Absolute Function */
extern "C"
bool TestAbsolute();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_ABSOLUTE_H__
......@@ -87,7 +87,7 @@ bool TestConcatenate1()
sList->Add(s2);
/* call Concatenate function */
Concatenate(sList, t, 1);
_Concatenate(sList, t, 1);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
......@@ -114,7 +114,7 @@ bool TestConcatenate1()
sList->Add(sGPU2);
/* call Concatenate function */
Concatenate(sList, tGPU, 1);
_Concatenate(sList, tGPU, 1);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
......@@ -212,7 +212,7 @@ bool TestConcatenate2()
sList->Add(s2);
/* call Concatenate function */
Concatenate(sList, t, 0);
_Concatenate(sList, t, 0);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
......@@ -239,7 +239,7 @@ bool TestConcatenate2()
sList->Add(sGPU2);
/* call Concatenate function */
Concatenate(sList, tGPU, 0);
_Concatenate(sList, tGPU, 0);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
......@@ -335,7 +335,7 @@ bool TestConcatenate3()
sList->Add(s2);
/* call Concatenate function */
Concatenate(sList, t, 1);
_Concatenate(sList, t, 1);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
......@@ -362,7 +362,7 @@ bool TestConcatenate3()
sList->Add(sGPU2);
/* call Concatenate function */
Concatenate(sList, tGPU, 1);
_Concatenate(sList, tGPU, 1);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
......@@ -451,7 +451,7 @@ bool TestConcatenate4()
t->SetZeroAll();
/* call Concatenate function */
Concatenate(s1, s2, t, 1);
_Concatenate(s1, s2, t, 1);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
......@@ -471,7 +471,7 @@ bool TestConcatenate4()
tGPU->SetZeroAll();
/* call Concatenate function */
Concatenate(sGPU1, sGPU2, tGPU, 1);
_Concatenate(sGPU1, sGPU2, tGPU, 1);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
......
......@@ -88,7 +88,7 @@ bool TestConcatenateSolely1()
sList->Add(s2);
/* call ConcatenateSolely function */
ConcatenateSolely(sList, t, 1);
_ConcatenateSolely(sList, t, 1);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
......@@ -115,7 +115,7 @@ bool TestConcatenateSolely1()
sList->Add(sGPU2);
/* call ConcatenateSolely function */
ConcatenateSolely(sList, tGPU, 1);
_ConcatenateSolely(sList, tGPU, 1);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
......@@ -213,7 +213,7 @@ bool TestConcatenateSolely2()
sList->Add(s2);
/* call ConcatenateSolely function */
ConcatenateSolely(sList, t, 0);
_ConcatenateSolely(sList, t, 0);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
......@@ -240,7 +240,7 @@ bool TestConcatenateSolely2()
sList->Add(sGPU2);
/* call ConcatenateSolely function */
ConcatenateSolely(sList, tGPU, 0);
_ConcatenateSolely(sList, tGPU, 0);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
......@@ -336,7 +336,7 @@ bool TestConcatenateSolely3()
sList->Add(s2);
/* call ConcatenateSolely function */
ConcatenateSolely(sList, t, 1);
_ConcatenateSolely(sList, t, 1);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
......@@ -363,7 +363,7 @@ bool TestConcatenateSolely3()
sList->Add(sGPU2);
/* call ConcatenateSolely function */
ConcatenateSolely(sList, tGPU, 1);
_ConcatenateSolely(sList, tGPU, 1);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
......
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-12
*/
#include "TConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: test ConvertDataType function.
In this case, the float32 data type is converted to the int32 data type.
*/
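/* note: judging from the expected answer below, the float-to-int conversion
   truncates the fractional part, e.g. 0.5F becomes 0 */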
bool TestConvertDataType1()
{
/* a tensor of size (3, 2) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 3;
aDimSize[1] = 2;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
DTYPE aData[3][2] = { {1.0F, 2.0F},
{0.5F, 4.0F},
{5.0F, 6.0F} };
int answer[3][2] = { {1, 2},
{0, 4},
{5, 6} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(aOrder, aDimSize, X_INT);
/* initialize variables */
a->SetData(aData, aUnitNum);
b->SetZeroAll();
/* call ConvertDataType function */
_ConvertDataType(a, b);
/* check results */
cpuTest = b->CheckData(answer, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(aOrder, aDimSize, X_INT, 1.0F, 0);
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
/* call ConvertDataType function */
_ConvertDataType(aGPU, bGPU);
/* check results */
gpuTest = bGPU->CheckData(answer, aUnitNum);
/* destroy variables */
delete a;
delete b;
delete aGPU;
delete bGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete[] aDimSize;
return cpuTest;
#endif // USE_CUDA
}
/*
case 2: test ConvertDataType function.
In this case, the int32 data type is converted to the float32 data type.
*/
bool TestConvertDataType2()
{
/* a tensor of size (3, 2) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 3;
aDimSize[1] = 2;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
int aData[3][2] = { {1, 2},
{0, 4},
{5, 6} };
DTYPE answer[3][2] = { {1.0F, 2.0F},
{0.0F, 4.0F},
{5.0F, 6.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize, X_INT);
XTensor * b = NewTensor(aOrder, aDimSize);
/* initialize variables */
a->SetData(aData, aUnitNum);
b->SetZeroAll();
/* call ConvertDataType function */
_ConvertDataType(a, b);
/* check results */
cpuTest = b->CheckData(answer, aUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_INT, 1.0F, 0);
XTensor * bGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
/* call ConvertDataType function */
_ConvertDataType(aGPU, bGPU);
/* check results */
gpuTest = bGPU->CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete a;
delete b;
delete aGPU;
delete bGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete[] aDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
*/
/* test for ConvertDataType Function */
bool TestConvertDataType()
{
XPRINT(0, stdout, "[TEST ConvertDataType] convert data type \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestConvertDataType1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestConvertDataType2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-12
*/
#ifndef __TEST_CONVERTDATATYPE_H__
#define __TEST_CONVERTDATATYPE_H__
#include "../core/getandset/ConvertDataType.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for ConvertDataType Function */
extern "C"
bool TestConvertDataType();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_CONVERTDATATYPE_H__
......@@ -71,23 +71,24 @@ bool TestCopyIndexed1()
int tgtIndex[2] = {0, 1};
int copyNum = 1;
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
s->SetData(sData, sUnitNum);
t->SetZeroAll();
/* call CopyIndexed function */
CopyIndexed(s, t, dim, srcIndex, indexSize, tgtIndex, copyNum);
_CopyIndexed(s, t, dim, srcIndex, indexSize, tgtIndex, copyNum);
tUser = CopyIndexed(*s, dim, srcIndex, indexSize, tgtIndex, copyNum);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);
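/* both interfaces are exercised here: _CopyIndexed writes into the
   pre-allocated t, while CopyIndexed(*s, ...) returns a fresh tensor (tUser);
   the same answer is expected from both */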
#ifdef USE_CUDA
/* GPU test */
......@@ -96,16 +97,18 @@ bool TestCopyIndexed1()
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tGPU->SetZeroAll();
/* call CopyIndexed function */
CopyIndexed(sGPU, tGPU, dim, srcIndex, indexSize, tgtIndex, copyNum);
_CopyIndexed(sGPU, tGPU, dim, srcIndex, indexSize, tgtIndex, copyNum);
tUserGPU = CopyIndexed(*sGPU, dim, srcIndex, indexSize, tgtIndex, copyNum);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete s;
......@@ -181,16 +184,18 @@ bool TestCopyIndexed2()
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
s->SetData(sData, sUnitNum);
t->SetZeroAll();
/* call CopyIndexed function */
CopyIndexed(s, t, dim, srcIndex, indexSize, tgtIndex, copyNum);
_CopyIndexed(s, t, dim, srcIndex, indexSize, tgtIndex, copyNum);
tUser = CopyIndexed(*s, dim, srcIndex, indexSize, tgtIndex, copyNum);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -199,16 +204,18 @@ bool TestCopyIndexed2()
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tGPU->SetZeroAll();
/* call CopyIndexed function */
CopyIndexed(sGPU, tGPU, dim, srcIndex, indexSize, tgtIndex, copyNum);
_CopyIndexed(sGPU, tGPU, dim, srcIndex, indexSize, tgtIndex, copyNum);
tUserGPU = CopyIndexed(*sGPU, dim, srcIndex, indexSize, tgtIndex, copyNum);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete s;
......@@ -284,16 +291,18 @@ bool TestCopyIndexed3()
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
s->SetData(sData, sUnitNum);
t->SetZeroAll();
/* call CopyIndexed function */
CopyIndexed(s, t, dim, srcIndex, indexSize, tgtIndex, copyNum);
_CopyIndexed(s, t, dim, srcIndex, indexSize, tgtIndex, copyNum);
tUser = CopyIndexed(*s, dim, srcIndex, indexSize, tgtIndex, copyNum);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -302,16 +311,18 @@ bool TestCopyIndexed3()
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tGPU->SetZeroAll();
/* call CopyIndexed function */
CopyIndexed(sGPU, tGPU, dim, srcIndex, indexSize, tgtIndex, copyNum);
_CopyIndexed(sGPU, tGPU, dim, srcIndex, indexSize, tgtIndex, copyNum);
tUserGPU = CopyIndexed(*sGPU, dim, srcIndex, indexSize, tgtIndex, copyNum);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete s;
......
......@@ -49,16 +49,19 @@ bool TestCopyValues1()
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(sOrder, sDimSize);
XTensor tUser;
/* initialize variables */
s->SetData(sData, sUnitNum);
t->SetZeroAll();
/* call CopyValues function */
CopyValues(s, t);
_CopyValues(s, t);
tUser = CopyValues(*s);
/* check results */
cpuTest = t->CheckData(s->data, sUnitNum);
cpuTest = t->CheckData(sData, sUnitNum) && tUser.CheckData(sData, sUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
......@@ -66,21 +69,18 @@ bool TestCopyValues1()
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tGPU->SetData(sData, sUnitNum);
/* call CopyValues function */
CopyValues(sGPU, tGPU);
_CopyValues(sGPU, tGPU);
tUserGPU = CopyValues(*sGPU);
/* check results */
DTYPE * dataGPU = (DTYPE*)sGPU->data;
int size = sUnitNum * sGPU->unitSize;
char * dataCPU = new char[size];
XMemCopy(dataCPU, -1, dataGPU, sGPU->devID, size);
gpuTest = tGPU->CheckData(dataCPU, sUnitNum);
gpuTest = tGPU->CheckData(sData, sUnitNum) && tUserGPU.CheckData(sData, sUnitNum);
/* destroy variables */
delete s;
......
......@@ -59,7 +59,7 @@ bool TestHardTanH1()
y->SetZeroAll();
/* call hardtanh function */
HardTanH(x, y);
_HardTanH(x, y);
/* check results */
cpuTest = y->CheckData(answer, unitNum, 1e-4F);
......@@ -77,7 +77,7 @@ bool TestHardTanH1()
yGPU->SetZeroAll();
/* call hardtanh function */
HardTanH(xGPU, yGPU);
_HardTanH(xGPU, yGPU);
/* check results */
gpuTest = yGPU->CheckData(answer, unitNum, 1e-4F);
......@@ -152,7 +152,7 @@ bool TestHardTanH2()
dedx->SetZeroAll();
/* call HardTanH function */
HardTanH(x, y);
_HardTanH(x, y);
/* call HardTanHBackward function */
HardTanHBackward(gold, y, x, dedy, dedx, SQUAREDERROR);
......@@ -181,7 +181,7 @@ bool TestHardTanH2()
dedxGPU->SetZeroAll();
/* call HardTanH function */
HardTanH(xGPU, yGPU);
_HardTanH(xGPU, yGPU);
/* call HardTanHBackward function */
HardTanHBackward(goldGPU, yGPU, xGPU, dedyGPU, dedxGPU, SQUAREDERROR);
......
......@@ -57,7 +57,7 @@ bool TestIdentity1()
y->SetZeroAll();
/* call Identity function */
Identity(x, y);
_Identity(x, y);
/* check result */
cpuTest = y->CheckData(answer, unitNum);
......@@ -75,7 +75,7 @@ bool TestIdentity1()
yGPU->SetZeroAll();
/* call Identity function */
Identity(xGPU, yGPU);
_Identity(xGPU, yGPU);
/* check result */
gpuTest = yGPU->CheckData(answer, unitNum);
......@@ -139,7 +139,7 @@ bool TestIdentity2()
dedy->SetZeroAll();
/* call Identity function */
Identity(x, y);
_Identity(x, y);
/* call IdentityBackward function */
IdentityBackward(g, y, x, dedy, dedx, CROSSENTROPY);
......@@ -168,7 +168,7 @@ bool TestIdentity2()
dedyGPU->SetZeroAll();
/* call Identity function */
Identity(xGPU, yGPU);
_Identity(xGPU, yGPU);
/* call IdentityBackward function */
IdentityBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, CROSSENTROPY);
......
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-12
*/
#include "TLog.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: test Log function.
Set every entry to its log value.
*/
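/* note: the expected values below are natural logarithms, e.g. ln(1) = 0,
   ln(2) is about 0.6931, ln(0.5) about -0.6931, ln(4) about 1.3863 */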
bool TestLog1()
{
/* a tensor of size (3, 2) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 3;
aDimSize[1] = 2;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
DTYPE aData[3][2] = { {1.0F, 2.0F},
{0.5F, 4.0F},
{5.0F, 6.0F} };
DTYPE answer[3][2] = { {0.0F, 0.6931F},
{-0.6931F, 1.3863F},
{1.6094F, 1.7918F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
/* initialize variables */
a->SetData(aData, aUnitNum);
/* call Log function */
_Log(a);
/* check results */
cpuTest = a->CheckData(answer, aUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
/* call Log function */
_Log(aGPU);
/* check results */
gpuTest = aGPU->CheckData(answer, aUnitNum, 1e-4F);
/* destroy variables */
delete a;
delete aGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete[] aDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
*/
/* test for Log Function */
bool TestLog()
{
XPRINT(0, stdout, "[TEST Log] set every entry to its log value \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestLog1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-12
*/
#ifndef __TEST_LOG_H__
#define __TEST_LOG_H__
#include "../core/math/Log.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for Log Function */
extern "C"
bool TestLog();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_LOG_H__
......@@ -57,7 +57,7 @@ bool TestLogSoftmax1()
y->SetZeroAll();
/* call LogSoftmax function */
LogSoftmax(x, y, 1);
_LogSoftmax(x, y, 1);
/* check result */
cpuTest = y->CheckData(answer, unitNum, 1e-4F);
......@@ -75,7 +75,7 @@ bool TestLogSoftmax1()
yGPU->SetZeroAll();
/* call LogSoftmax function */
LogSoftmax(xGPU, yGPU, 1);
_LogSoftmax(xGPU, yGPU, 1);
/* check result */
gpuTest = yGPU->CheckData(answer, unitNum, 1e-4F);
......@@ -139,7 +139,7 @@ bool TestLogSoftmax2()
dedy->SetZeroAll();
/* call LogSoftmax function */
LogSoftmax(x, y, 1);
_LogSoftmax(x, y, 1);
/* call LogSoftmaxBackward function */
LogSoftmaxBackward(g, y, x, dedy, dedx, 1, CROSSENTROPY);
......@@ -167,7 +167,7 @@ bool TestLogSoftmax2()
dedyGPU->SetZeroAll();
/* call LogSoftmax function */
LogSoftmax(xGPU, yGPU, 1);
_LogSoftmax(xGPU, yGPU, 1);
/* call LogSoftmaxBackward function */
LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 1, CROSSENTROPY);
......@@ -243,7 +243,7 @@ bool TestLogSoftmax3()
dedy->SetZeroAll();
/* call LogSoftmax function */
LogSoftmax(x, y, 1);
_LogSoftmax(x, y, 1);
/* call LogSoftmaxBackward function */
LogSoftmaxBackward(g, y, x, dedy, dedx, 1, SQUAREDERROR);
......@@ -271,7 +271,7 @@ bool TestLogSoftmax3()
dedyGPU->SetZeroAll();
/* call LogSoftmax function */
LogSoftmax(xGPU, yGPU, 1);
_LogSoftmax(xGPU, yGPU, 1);
/* call LogSoftmaxBackward function */
LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 1, SQUAREDERROR);
......
......@@ -107,7 +107,7 @@ bool TestMatrixMulBatchedCPU1()
cList->Add(c2);
/* call MatrixMULBatchedCPU function */
MatrixMULBatchedCPU(aList, X_NOTRANS, bList, X_NOTRANS, cList);
_MatrixMULBatchedCPU(aList, X_NOTRANS, bList, X_NOTRANS, cList);
/* check results */
cpuTest = c1->CheckData(answer1, cUnitNum) && c2->CheckData(answer2, cUnitNum);
......@@ -146,7 +146,7 @@ bool TestMatrixMulBatchedCPU1()
cList->Add(cGPU2);
/* call MatrixMULBatchedCPU function */
MatrixMULBatchedCPU(aList, X_NOTRANS, bList, X_NOTRANS, cList);
_MatrixMULBatchedCPU(aList, X_NOTRANS, bList, X_NOTRANS, cList);
/* check results */
gpuTest = cGPU1->CheckData(answer1, cUnitNum) && gpuTest;
......
......@@ -82,7 +82,7 @@ bool TestMatrixMul1()
t->SetZeroAll();
/* call MatrixMul function */
MatrixMul(s1, X_NOTRANS, s2, X_NOTRANS, t);
_MatrixMul(s1, X_NOTRANS, s2, X_NOTRANS, t);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
......@@ -102,7 +102,7 @@ bool TestMatrixMul1()
tGPU->SetZeroAll();
/* call MatrixMul function */
MatrixMul(sGPU1, X_NOTRANS, sGPU2, X_NOTRANS, tGPU);
_MatrixMul(sGPU1, X_NOTRANS, sGPU2, X_NOTRANS, tGPU);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
......@@ -192,7 +192,7 @@ bool TestMatrixMul2()
t->SetZeroAll();
/* call MatrixMul function */
MatrixMul(s1, X_TRANS, s2, X_NOTRANS, t);
_MatrixMul(s1, X_TRANS, s2, X_NOTRANS, t);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
......@@ -212,7 +212,7 @@ bool TestMatrixMul2()
tGPU->SetZeroAll();
/* call MatrixMul function */
MatrixMul(sGPU1, X_TRANS, sGPU2, X_NOTRANS, tGPU);
_MatrixMul(sGPU1, X_TRANS, sGPU2, X_NOTRANS, tGPU);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
......@@ -322,7 +322,7 @@ bool TestMatrixMul3()
t->SetZeroAll();
/* call MatrixMul function */
MatrixMul(s1, X_NOTRANS, s2, X_NOTRANS, t);
_MatrixMul(s1, X_NOTRANS, s2, X_NOTRANS, t);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
......@@ -342,7 +342,7 @@ bool TestMatrixMul3()
tGPU->SetZeroAll();
/* call MatrixMul function */
MatrixMul(sGPU1, X_NOTRANS, sGPU2, X_NOTRANS, tGPU);
_MatrixMul(sGPU1, X_NOTRANS, sGPU2, X_NOTRANS, tGPU);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
......@@ -441,7 +441,7 @@ bool TestMatrixMul4()
t->SetZeroAll();
/* call MatrixMul function */
MatrixMul(s1, X_NOTRANS, s2, X_NOTRANS, t);
_MatrixMul(s1, X_NOTRANS, s2, X_NOTRANS, t);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
......@@ -461,7 +461,7 @@ bool TestMatrixMul4()
tGPU->SetZeroAll();
/* call MatrixMul function */
MatrixMul(sGPU1, X_NOTRANS, sGPU2, X_NOTRANS, tGPU);
_MatrixMul(sGPU1, X_NOTRANS, sGPU2, X_NOTRANS, tGPU);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
......
......@@ -82,7 +82,7 @@ bool TestMatrixMul2D1()
t->SetZeroAll();
/* call MatrixMul2D function */
MatrixMul2D(s1, X_NOTRANS, s2, X_NOTRANS, t);
_MatrixMul2D(s1, X_NOTRANS, s2, X_NOTRANS, t);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
......@@ -102,7 +102,7 @@ bool TestMatrixMul2D1()
tGPU->SetZeroAll();
/* call MatrixMul2D function */
MatrixMul2D(sGPU1, X_NOTRANS, sGPU2, X_NOTRANS, tGPU);
_MatrixMul2D(sGPU1, X_NOTRANS, sGPU2, X_NOTRANS, tGPU);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
......@@ -192,7 +192,7 @@ bool TestMatrixMul2D2()
t->SetZeroAll();
/* call MatrixMul2D function */
MatrixMul2D(s1, X_TRANS, s2, X_NOTRANS, t);
_MatrixMul2D(s1, X_TRANS, s2, X_NOTRANS, t);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
......@@ -212,7 +212,7 @@ bool TestMatrixMul2D2()
tGPU->SetZeroAll();
/* call MatrixMul2D function */
MatrixMul2D(sGPU1, X_TRANS, sGPU2, X_NOTRANS, tGPU);
_MatrixMul2D(sGPU1, X_TRANS, sGPU2, X_NOTRANS, tGPU);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
......
......@@ -82,7 +82,7 @@ bool TestMatrixMul2DParallel1()
t->SetZeroAll();
/* call MatrixMul2DParallel function */
MatrixMul2DParallel(s1, X_NOTRANS, s2, X_NOTRANS, t);
_MatrixMul2DParallel(s1, X_NOTRANS, s2, X_NOTRANS, t);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
......@@ -158,7 +158,7 @@ bool TestMatrixMul2DParallel2()
t->SetZeroAll();
/* call MatrixMul2DParallel function */
MatrixMul2DParallel(s1, X_TRANS, s2, X_NOTRANS, t);
_MatrixMul2DParallel(s1, X_TRANS, s2, X_NOTRANS, t);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
......
......@@ -82,7 +82,7 @@ bool TestMatrixMulBatched1()
t->SetZeroAll();
/* call MatrixMulBatched function */
MatrixMulBatched(s1, X_NOTRANS, s2, X_NOTRANS, t);
_MatrixMulBatched(s1, X_NOTRANS, s2, X_NOTRANS, t);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
......@@ -102,7 +102,7 @@ bool TestMatrixMulBatched1()
tGPU->SetZeroAll();
/* call MatrixMulBatched function */
MatrixMulBatched(sGPU1, X_NOTRANS, sGPU2, X_NOTRANS, tGPU);
_MatrixMulBatched(sGPU1, X_NOTRANS, sGPU2, X_NOTRANS, tGPU);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
......@@ -200,7 +200,7 @@ bool TestMatrixMulBatched2()
t->SetZeroAll();
/* call MatrixMulBatched function */
MatrixMulBatched(s1, X_NOTRANS, s2, X_NOTRANS, t);
_MatrixMulBatched(s1, X_NOTRANS, s2, X_NOTRANS, t);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
......@@ -220,7 +220,7 @@ bool TestMatrixMulBatched2()
tGPU->SetZeroAll();
/* call MatrixMulBatched function */
MatrixMulBatched(sGPU1, X_NOTRANS, sGPU2, X_NOTRANS, tGPU);
_MatrixMulBatched(sGPU1, X_NOTRANS, sGPU2, X_NOTRANS, tGPU);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
......
......@@ -66,7 +66,7 @@ bool TestMerge1()
t->SetZeroAll();
/* call merge function */
Merge(s, t, 1, 0);
_Merge(s, t, 1, 0);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
......@@ -84,7 +84,7 @@ bool TestMerge1()
tGPU->SetZeroAll();
/* call merge function */
Merge(sGPU, tGPU, 1, 0);
_Merge(sGPU, tGPU, 1, 0);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
......@@ -173,8 +173,8 @@ bool TestMerge2()
t2->SetZeroAll();
/* call merge function */
Merge(s, t1, 1, 0);
Merge(s, t2, 2, 0);
_Merge(s, t1, 1, 0);
_Merge(s, t2, 2, 0);
/* check results */
cpuTest = t1->CheckData(answer1, tUnitNum1) && t2->CheckData(answer2, tUnitNum2);
......@@ -194,8 +194,8 @@ bool TestMerge2()
tGPU2->SetZeroAll();
/* call merge function */
Merge(sGPU, tGPU1, 1, 0);
Merge(sGPU, tGPU2, 2, 0);
_Merge(sGPU, tGPU1, 1, 0);
_Merge(sGPU, tGPU2, 2, 0);
/* check results */
gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tGPU2->CheckData(answer2, tUnitNum2);
......@@ -282,7 +282,7 @@ bool TestMerge3()
smallList->Add(s2);
/* call merge function */
Merge(smallList, t, 0);
_Merge(smallList, t, 0);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
......@@ -309,7 +309,7 @@ bool TestMerge3()
smallList->Add(sGPU2);
/* call merge function */
Merge(smallList, tGPU, 0);
_Merge(smallList, tGPU, 0);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
......@@ -394,7 +394,7 @@ bool TestMerge4()
smallList->Add(s2);
/* call merge function */
Merge(smallList, t, 1);
_Merge(smallList, t, 1);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
......@@ -421,7 +421,7 @@ bool TestMerge4()
smallList->Add(sGPU2);
/* call merge function */
Merge(smallList, tGPU, 1);
_Merge(smallList, tGPU, 1);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
......
......@@ -74,17 +74,23 @@ bool TestMultiply1()
XTensor * s1 = NewTensor(sOrder1, sDimSize1);
XTensor * s2 = NewTensor(sOrder2, sDimSize2);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * tMe = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
s1->SetData(sData1, sUnitNum1);
tMe->SetData(sData1, sUnitNum1);
s2->SetData(sData2, sUnitNum2);
t->SetZeroAll();
/* call MultiplyElementWise function */
/* call Multiply function */
_Multiply(s1, s2, t, 0, 0);
_MultiplyMe(tMe, s2, 0, 0);
tUser = Multiply(*s1, *s2, 0, 0);
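/* three code paths are checked below: _Multiply writes into t, _MultiplyMe
   multiplies tMe (pre-loaded with sData1) in place by s2, and Multiply
   returns a new tensor (tUser); all should hold the element-wise product */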
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
cpuTest = t->CheckData(answer, tUnitNum)
&& tMe->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -94,25 +100,33 @@ bool TestMultiply1()
XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * tMeGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* Initialize variables */
sGPU1->SetData(sData1, sUnitNum1);
tMeGPU->SetData(sData1, sUnitNum1);
sGPU2->SetData(sData2, sUnitNum2);
tGPU->SetZeroAll();
/* call MultiplyElementWise function */
/* call Multiply function */
_Multiply(sGPU1, sGPU2, tGPU, 0, 0);
_MultiplyMe(tMeGPU, sGPU2, 0, 0);
tUserGPU = Multiply(*sGPU1, *sGPU2, 0, 0);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
gpuTest = tGPU->CheckData(answer, tUnitNum)
&& tMeGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete s1;
delete s2;
delete t;
delete tMe;
delete sGPU1;
delete sGPU2;
delete tGPU;
delete tMeGPU;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
......@@ -123,6 +137,7 @@ bool TestMultiply1()
delete s1;
delete s2;
delete t;
delete tMe;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
......@@ -182,17 +197,23 @@ bool TestMultiply2()
XTensor * s1 = NewTensor(sOrder1, sDimSize1);
XTensor * s2 = NewTensor(sOrder2, sDimSize2);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor * tMe = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
s1->SetData(sData1, sUnitNum1);
tMe->SetData(sData1, sUnitNum1);
s2->SetData(sData2, sUnitNum2);
t->SetZeroAll();
/* call MultiplyElementWise function */
/* call Multiply function */
_Multiply(s1, s2, t, 0, 0);
_MultiplyMe(tMe, s2, 0, 0);
tUser = Multiply(*s1, *s2, 0, 0);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
cpuTest = t->CheckData(answer, tUnitNum)
&& tMe->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -202,25 +223,33 @@ bool TestMultiply2()
XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * tMeGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* Initialize variables */
sGPU1->SetData(sData1, sUnitNum1);
tMeGPU->SetData(sData1, sUnitNum1);
sGPU2->SetData(sData2, sUnitNum2);
tGPU->SetZeroAll();
/* call MultiplyElementWise function */
/* call Multiply function */
_Multiply(sGPU1, sGPU2, tGPU, 0, 0);
_MultiplyMe(tMeGPU, sGPU2, 0, 0);
tUserGPU = Multiply(*sGPU1, *sGPU2, 0, 0);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
gpuTest = tGPU->CheckData(answer, tUnitNum)
&& tMeGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete s1;
delete s2;
delete t;
delete tMe;
delete sGPU1;
delete sGPU2;
delete tGPU;
delete tMeGPU;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
......@@ -231,6 +260,7 @@ bool TestMultiply2()
delete s1;
delete s2;
delete t;
delete tMe;
delete[] sDimSize1;
delete[] sDimSize2;
delete[] tDimSize;
......@@ -351,10 +381,10 @@ bool TestMultiply3()
TODO!!
*/
/* test for MultiplyElementWise Function */
/* test for Multiply Function */
bool TestMultiply()
{
XPRINT(0, stdout, "[TEST MULTIPLYELEMENTWISE] element-wise product of two tensors \n");
XPRINT(0, stdout, "[TEST Multiply] element-wise product of two tensors \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
......
......@@ -53,7 +53,7 @@ bool TestNegate1()
a->SetData(aData, aUnitNum);
/* call Negate function */
Negate(a);
_Negate(a);
/* check results */
cpuTest = a->CheckData(answer, aUnitNum);
......@@ -69,7 +69,7 @@ bool TestNegate1()
aGPU->SetData(aData, aUnitNum);
/* call Negate function */
Negate(aGPU);
_Negate(aGPU);
/* check results */
gpuTest = aGPU->CheckData(answer, aUnitNum);
......@@ -119,7 +119,7 @@ bool TestNegate2()
a->SetData(aData, aUnitNum);
/* call Negate function */
Negate(a);
_Negate(a);
/* check results */
cpuTest = a->CheckData(answer, aUnitNum);
......@@ -135,7 +135,7 @@ bool TestNegate2()
aGPU->SetData(aData, aUnitNum);
/* call Negate function */
Negate(aGPU);
_Negate(aGPU);
/* check results */
gpuTest = aGPU->CheckData(answer, aUnitNum);
......
......@@ -108,9 +108,12 @@ bool TestNormalize1()
XTensor * var = NewTensor(varOrder, varDimSize);
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(bOrder, bDimSize);
XTensor * tMe = NewTensor(sOrder, sDimSize);
XTensor tUser;
/* initialize variables */
s->SetData(sData, sUnitNum);
tMe->SetData(sData, sUnitNum);
mean->SetData(meanData, meanUnitNum);
var->SetData(varData, varUnitNum);
a->SetData(aData, aUnitNum);
......@@ -118,10 +121,13 @@ bool TestNormalize1()
t->SetZeroAll();
/* call normalize function */
Normalize(s, t, 0, mean, var, a, b, 0.0F);
_Normalize(s, t, 0, mean, var, a, b, 0.0F);
_NormalizeMe(tMe, 0, mean, var, a, b, 0.0F);
tUser = Normalize(*s, 0, *mean, *var, *a, *b, 0.0F);
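/* assumed semantics of the normalize calls above (matching the argument
   names, not confirmed here): along dimension 0,
   t = a * (s - mean) / sqrt(var + epsilon) + b, with epsilon passed as 0.0F */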
/* check results */
cpuTest = t->CheckData(answer, tUnitNum, 1e-4F, 0);
cpuTest = t->CheckData(answer, tUnitNum, 1e-4F)
&& tMe->CheckData(answer, tUnitNum, 1e-4F) && tUser.CheckData(answer, tUnitNum, 1e-4F);
#ifdef USE_CUDA
/* GPU test */
......@@ -134,9 +140,12 @@ bool TestNormalize1()
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor * tMeGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tMeGPU->SetData(sData, sUnitNum);
meanGPU->SetData(meanData, meanUnitNum);
varGPU->SetData(varData, varUnitNum);
aGPU->SetData(aData, aUnitNum);
......@@ -144,19 +153,24 @@ bool TestNormalize1()
tGPU->SetZeroAll();
/* call Normalize function */
Normalize(sGPU, tGPU, 0, meanGPU, varGPU, aGPU, bGPU, 0.0F);
_Normalize(sGPU, tGPU, 0, meanGPU, varGPU, aGPU, bGPU, 0.0F);
_NormalizeMe(tMeGPU, 0, meanGPU, varGPU, aGPU, bGPU, 0.0F);
tUserGPU = Normalize(*sGPU, 0, *meanGPU, *varGPU, *aGPU, *bGPU, 0.0F);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum, 1e-4F, 0);
gpuTest = tGPU->CheckData(answer, tUnitNum, 1e-4F)
&& tMeGPU->CheckData(answer, tUnitNum, 1e-4F) && tUserGPU.CheckData(answer, tUnitNum, 1e-4F);
/* destroy variables */
delete s;
delete tMe;
delete t;
delete mean;
delete var;
delete a;
delete b;
delete sGPU;
delete tMeGPU;
delete tGPU;
delete meanGPU;
delete varGPU;
......@@ -173,6 +187,7 @@ bool TestNormalize1()
#else
/* destroy variables */
delete s;
delete tMe;
delete t;
delete mean;
delete var;
......
......@@ -57,7 +57,7 @@ bool TestPower1()
a->SetData(aData, aUnitNum);
/* call Power function */
Power(a, 2.0F);
_Power(a, 2.0F);
/* check results */
cpuTest = a->CheckData(answer, aUnitNum, 1e-4F);
......@@ -73,7 +73,7 @@ bool TestPower1()
aGPU->SetData(aData, aUnitNum);
/* call power function */
Power(aGPU, 2.0F);
_Power(aGPU, 2.0F);
/* check results */
gpuTest = aGPU->CheckData(answer, aUnitNum, 1e-4F);
......@@ -126,7 +126,7 @@ bool TestPower2()
a->SetData(aData, aUnitNum);
/* call Power function */
Power(a, 1.0F);
_Power(a, 1.0F);
/* check results */
cpuTest = a->CheckData(answer, aUnitNum, 1e-4F);
......@@ -142,7 +142,7 @@ bool TestPower2()
aGPU->SetData(aData, aUnitNum);
/* call Power function */
Power(aGPU, 1.0F);
_Power(aGPU, 1.0F);
/* check results */
gpuTest = aGPU->CheckData(answer, aUnitNum, 1e-4F);
......@@ -195,7 +195,7 @@ bool TestPower3()
a->SetData(aData, aUnitNum);
/* call Power function */
Power(a, 0.0F);
_Power(a, 0.0F);
/* check results */
cpuTest = a->CheckData(answer, aUnitNum, 1e-4F);
......@@ -211,7 +211,7 @@ bool TestPower3()
aGPU->SetData(aData, aUnitNum);
/* call Power function */
Power(aGPU, 0.0F);
_Power(aGPU, 0.0F);
/* check results */
gpuTest = aGPU->CheckData(answer, aUnitNum, 1e-4F);
......
......@@ -56,7 +56,7 @@ bool TestRectify1()
y->SetZeroAll();
/* call Rectify function */
Rectify(x, y);
_Rectify(x, y);
/* check results */
cpuTest = y->CheckData(answer, unitNum);
......@@ -74,7 +74,7 @@ bool TestRectify1()
yGPU->SetZeroAll();
/* call Rectify function */
Rectify(xGPU, yGPU);
_Rectify(xGPU, yGPU);
/* check results */
gpuTest = yGPU->CheckData(answer, unitNum);
......@@ -144,7 +144,7 @@ bool TestRectify2()
dedx->SetZeroAll();
/* call Rectify function */
Rectify(x, y);
_Rectify(x, y);
/* call RectifyBackward function */
RectifyBackward(gold, y, x, dedy, dedx, CROSSENTROPY);
......@@ -173,7 +173,7 @@ bool TestRectify2()
dedxGPU->SetZeroAll();
/* call Rectify function */
Rectify(xGPU, yGPU);
_Rectify(xGPU, yGPU);
/* call RectifyBackward function */
RectifyBackward(goldGPU, yGPU, xGPU, dedyGPU, dedxGPU, CROSSENTROPY);
......
......@@ -78,8 +78,8 @@ bool TestReduceMax1()
t2->SetZeroAll();
/* call ReduceMax function */
ReduceMax(s, t1, 0);
ReduceMax(s, t2, 1);
_ReduceMax(s, t1, 0);
_ReduceMax(s, t2, 1);
/* check results */
cpuTest = t1->CheckData(answer1, tUnitNum1) && t2->CheckData(answer2, tUnitNum2);
......@@ -99,8 +99,8 @@ bool TestReduceMax1()
tGPU2->SetZeroAll();
/* call ReduceMax function */
ReduceMax(sGPU, tGPU1, 0);
ReduceMax(sGPU, tGPU2, 1);
_ReduceMax(sGPU, tGPU1, 0);
_ReduceMax(sGPU, tGPU2, 1);
/* check results */
gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tGPU2->CheckData(answer2, tUnitNum2);
......
......@@ -73,8 +73,8 @@ bool TestReduceMean1()
t2->SetZeroAll();
/* call ReduceMean function */
ReduceMean(s, t1, 0);
ReduceMean(s, t2, 1);
_ReduceMean(s, t1, 0);
_ReduceMean(s, t2, 1);
/* check results */
cpuTest = t1->CheckData(answer1, tUnitNum1) && t2->CheckData(answer2, tUnitNum2);
......@@ -94,8 +94,8 @@ bool TestReduceMean1()
tGPU2->SetZeroAll();
/* call ReduceMean function */
ReduceMean(sGPU, tGPU1, 0);
ReduceMean(sGPU, tGPU2, 1);
_ReduceMean(sGPU, tGPU1, 0);
_ReduceMean(sGPU, tGPU2, 1);
/* check results */
gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tGPU2->CheckData(answer2, tUnitNum2);
......
......@@ -78,8 +78,8 @@ bool TestReduceSum1()
t2->SetZeroAll();
/* call ReduceSum function */
ReduceSum(s, t1, 0);
ReduceSum(s, t2, 1);
_ReduceSum(s, t1, 0);
_ReduceSum(s, t2, 1);
/* check results */
cpuTest = t1->CheckData(answer1, tUnitNum1) && t2->CheckData(answer2, tUnitNum2);
......@@ -99,8 +99,8 @@ bool TestReduceSum1()
tGPU2->SetZeroAll();
/* call ReduceSum function */
ReduceSum(sGPU, tGPU1, 0);
ReduceSum(sGPU, tGPU2, 1);
_ReduceSum(sGPU, tGPU1, 0);
_ReduceSum(sGPU, tGPU2, 1);
/* check results */
gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tGPU2->CheckData(answer2, tUnitNum2);
......
......@@ -77,7 +77,7 @@ bool TestReduceSumSquared1()
t->SetZeroAll();
/* call ReduceSumSquared function */
ReduceSumSquared(s, t, 0, shift);
_ReduceSumSquared(s, t, 0, shift);
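/* assumed semantics (matching the name, not verified here): along
   dimension 0, each output entry is the sum of (s - shift)^2 */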
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
......@@ -97,7 +97,7 @@ bool TestReduceSumSquared1()
tGPU->SetZeroAll();
/* call ReduceSumSquared function */
ReduceSumSquared(sGPU, tGPU, 0, shiftGPU);
_ReduceSumSquared(sGPU, tGPU, 0, shiftGPU);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
......@@ -181,7 +181,7 @@ bool TestReduceSumSquared2()
t->SetZeroAll();
/* call ReduceSumSquared function */
ReduceSumSquared(s, t, 1, shift);
_ReduceSumSquared(s, t, 1, shift);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
......@@ -201,7 +201,7 @@ bool TestReduceSumSquared2()
tGPU->SetZeroAll();
/* call ReduceSumSquared function */
ReduceSumSquared(sGPU, tGPU, 1, shiftGPU);
_ReduceSumSquared(sGPU, tGPU, 1, shiftGPU);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
......
......@@ -77,7 +77,7 @@ bool TestReduceVariance1()
t->SetZeroAll();
/* call ReduceVariance function */
ReduceVariance(s, t, 0, mean);
_ReduceVariance(s, t, 0, mean);
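/* assumed semantics (matching the name, not verified here): along
   dimension 0, each output entry is the mean of (s - mean)^2 */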
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
......@@ -97,7 +97,7 @@ bool TestReduceVariance1()
tGPU->SetZeroAll();
/* call ReduceVariance function */
ReduceVariance(sGPU, tGPU, 0, meanGPU);
_ReduceVariance(sGPU, tGPU, 0, meanGPU);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
......
......@@ -52,15 +52,22 @@ bool TestScaleAndShift1()
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(sOrder, sDimSize);
XTensor * tMe = NewTensor(sOrder, sDimSize);
XTensor tUser;
/* initialize variables */
s->SetData(sData, sUnitNum);
tMe->SetData(sData, sUnitNum);
/* call ScaleAndShift function */
_ScaleAndShift(s, s, scaleFactor, shiftFactor);
_ScaleAndShift(s, t, scaleFactor, shiftFactor);
_ScaleAndShiftMe(tMe, scaleFactor, shiftFactor);
tUser = ScaleAndShift(*s, scaleFactor, shiftFactor);
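/* assumed semantics of ScaleAndShift (matching the name, not verified here):
   t = s * scaleFactor + shiftFactor */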
/* check results */
cpuTest = s->CheckData(answer, sUnitNum);
cpuTest = t->CheckData(answer, sUnitNum) &&
tMe->CheckData(answer, sUnitNum) && tUser.CheckData(answer, sUnitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -68,25 +75,38 @@ bool TestScaleAndShift1()
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tMeGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tMeGPU->SetData(sData, sUnitNum);
/* call ScaleAndShift function */
_ScaleAndShift(sGPU, sGPU, scaleFactor, shiftFactor);
_ScaleAndShift(sGPU, tGPU, scaleFactor, shiftFactor);
_ScaleAndShiftMe(tMeGPU, scaleFactor, shiftFactor);
tUserGPU = ScaleAndShift(*sGPU, scaleFactor, shiftFactor);
/* check results */
gpuTest = sGPU->CheckData(answer, sUnitNum);
gpuTest = tGPU->CheckData(answer, sUnitNum) &&
tMeGPU->CheckData(answer, sUnitNum) && tUserGPU.CheckData(answer, sUnitNum);
/* destroy variables */
delete s;
delete t;
delete tMe;
delete sGPU;
delete tGPU;
delete tMeGPU;
delete[] sDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete s;
delete t;
delete tMe;
delete[] sDimSize;
return cpuTest;
......
......@@ -67,16 +67,18 @@ bool TestSelect1()
/* create tensors */
XTensor * s = NewTensor(sOrder, sDimSize);
XTensor * t = NewTensor(tOrder, tDimSize);
XTensor tUser;
/* initialize variables */
s->SetData(sData, sUnitNum);
t->SetZeroAll();
/* call SelectRange function */
SelectRange(s, t, 2, 1, 3);
_SelectRange(s, t, 2, 1, 3);
tUser = SelectRange(*s, 2, 1, 3);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -85,16 +87,18 @@ bool TestSelect1()
/* create tensors */
XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
XTensor tUserGPU;
/* initialize variables */
sGPU->SetData(sData, sUnitNum);
tGPU->SetZeroAll();
/* call Select function */
SelectRange(sGPU, tGPU, 2, 1, 3);
/* call SelectRange function */
_SelectRange(sGPU, tGPU, 2, 1, 3);
tUserGPU = SelectRange(*sGPU, 2, 1, 3);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
/* destroy variables */
delete s;
......
......@@ -54,7 +54,7 @@ bool TestSigmoid1()
y->SetZeroAll();
/* call Sigmoid function */
Sigmoid(x, y);
_Sigmoid(x, y);
/* check result */
cpuTest = y->CheckData(answer, unitNum, 1e-4F);
......@@ -72,7 +72,7 @@ bool TestSigmoid1()
yGPU->SetZeroAll();
/* call Sigmoid function */
Sigmoid(xGPU, yGPU);
_Sigmoid(xGPU, yGPU);
/* check result */
gpuTest = yGPU->CheckData(answer, unitNum, 1e-4F);
......@@ -138,7 +138,7 @@ bool TestSigmoid2()
dedx->SetZeroAll();
/* call Sigmoid function */
Sigmoid(x, y);
_Sigmoid(x, y);
/* call SigmoidBackward function */
SigmoidBackward(g, y, x, dedy, dedx, CROSSENTROPY);
......@@ -167,7 +167,7 @@ bool TestSigmoid2()
dedxGPU->SetZeroAll();
/* call Sigmoid function */
Sigmoid(xGPU, yGPU);
_Sigmoid(xGPU, yGPU);
/* call SigmoidBackward function */
SigmoidBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, CROSSENTROPY);
......
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-12
*/
#include "TSign.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
case 1: test Sign function.
Set every entry to its sign value.
*/
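/* note: as the expected answer below shows, sign(x) is 1 for x > 0,
   0 for x == 0, and -1 for x < 0 */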
bool TestSign1()
{
/* a tensor of size (3, 2) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 3;
aDimSize[1] = 2;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
DTYPE aData[3][2] = { {1.0F, -2.0F},
{0.0F, 4.0F},
{5.0F, -6.0F} };
DTYPE answer[3][2] = { {1.0F, -1.0F},
{0.0F, 1.0F},
{1.0F, -1.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
/* initialize variables */
a->SetData(aData, aUnitNum);
/* call Sign function */
_Sign(a);
/* check results */
cpuTest = a->CheckData(answer, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
/* call Sign function */
_Sign(aGPU);
/* check results */
gpuTest = aGPU->CheckData(answer, aUnitNum);
/* destroy variables */
delete a;
delete aGPU;
delete[] aDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete[] aDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
*/
/* test for Sign Function */
bool TestSign()
{
XPRINT(0, stdout, "[TEST Sign] set every entry to its sign value \n");
bool returnFlag = true, caseFlag = true;
/* case 1 test */
caseFlag = TestSign1();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 1 failed!\n");
}
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* other cases test */
/*
TODO!!
*/
if (returnFlag) {
XPRINT(0, stdout, ">> All Passed!\n");
}
else
XPRINT(0, stdout, ">> Failed!\n");
XPRINT(0, stdout, "\n");
return returnFlag;
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-12
*/
#ifndef __TEST_SIGN_H__
#define __TEST_SIGN_H__
#include "../core/arithmetic/Sign.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for Sign Function */
extern "C"
bool TestSign();
} // namespace nts(NiuTrans.Tensor)
#endif // __TEST_SIGN_H__
......@@ -58,7 +58,7 @@ bool TestSoftmax1()
y->SetZeroAll();
/* call Softmax function */
Softmax(x, y, 1);
_Softmax(x, y, 1);
/* check result */
cpuTest = y->CheckData(answer, unitNum, 1e-4F);
......@@ -76,7 +76,7 @@ bool TestSoftmax1()
yGPU->SetZeroAll();
/* call Softmax function */
Softmax(xGPU, yGPU, 1);
_Softmax(xGPU, yGPU, 1);
/* check result */
gpuTest = yGPU->CheckData(answer, unitNum, 1e-4F);
......@@ -139,7 +139,7 @@ bool TestSoftmax2()
dedy->SetZeroAll();
/* call Softmax function */
Softmax(x, y, 1);
_Softmax(x, y, 1);
/* call SoftmaxBackward function */
SoftmaxBackward(g, y, x, dedy, dedx, 1, CROSSENTROPY);
......@@ -167,7 +167,7 @@ bool TestSoftmax2()
dedyGPU->SetZeroAll();
/* call Softmax function */
Softmax(xGPU, yGPU, 1);
_Softmax(xGPU, yGPU, 1);
/* call SoftmaxBackward function */
SoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 1, CROSSENTROPY);
......
......@@ -53,7 +53,7 @@ bool TestSort1()
b->SetZeroAll();
/* call Sort function */
Sort(a, b, 0);
_Sort(a, b, 0);
cpuTest = a->CheckData(answer, unitNum);
......@@ -70,7 +70,7 @@ bool TestSort1()
bGPU->SetZeroAll();
/* call Sort function */
Sort(aGPU, bGPU, 0);
_Sort(aGPU, bGPU, 0);
/* check results */
gpuTest = aGPU->CheckData(answer, unitNum);
......@@ -121,7 +121,7 @@ bool TestSort2()
a->SetData(aData, unitNum);
/* call Sort function */
Sort(a, b, 1);
_Sort(a, b, 1);
/* check results */
cpuTest = a->CheckData(answer, unitNum);
......@@ -138,7 +138,7 @@ bool TestSort2()
aGPU->SetData(aData, unitNum);
/* call Sort function */
Sort(aGPU, bGPU, 1);
_Sort(aGPU, bGPU, 1);
/* check results */
gpuTest = aGPU->CheckData(answer, unitNum);
......
......@@ -71,7 +71,7 @@ bool TestSplit1()
t->SetZeroAll();
/* call split function */
Split(s, t, 0, 2);
_Split(s, t, 0, 2);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
......@@ -89,7 +89,7 @@ bool TestSplit1()
tGPU->SetZeroAll();
/* call Split function */
Split(sGPU, tGPU, 0, 2);
_Split(sGPU, tGPU, 0, 2);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
......@@ -157,7 +157,7 @@ bool TestSplit2()
t->SetZeroAll();
/* call split function */
Split(s, t, 1, 2);
_Split(s, t, 1, 2);
/* check results */
cpuTest = t->CheckData(answer, tUnitNum);
......@@ -175,7 +175,7 @@ bool TestSplit2()
tGPU->SetZeroAll();
/* call Split function */
Split(sGPU, tGPU, 1, 2);
_Split(sGPU, tGPU, 1, 2);
/* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum);
......@@ -268,7 +268,7 @@ bool TestSplit3()
tList.Add(t2);
/* call split function */
Split(s, &tList, 1, 2);
_Split(s, &tList, 1, 2);
/* check results */
cpuTest = t1->CheckData(answer1, tUnitNum1) && t2->CheckData(answer2, tUnitNum2);
......@@ -295,7 +295,7 @@ bool TestSplit3()
tList.Add(tGPU2);
/* call split function */
Split(s, &tList, 1, 2);
_Split(s, &tList, 1, 2);
/* check results */
gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tGPU2->CheckData(answer2, tUnitNum2);
......
......@@ -49,16 +49,24 @@ bool TestSum1()
/* create tensors */
XTensor * a = NewTensor(order, dimSize);
XTensor * b = NewTensor(order, dimSize);
XTensor * c = NewTensor(order, dimSize);
XTensor * cMe = NewTensor(order, dimSize);
XTensor cUser;
/* initialize variables */
a->SetData(aData, unitNum);
cMe->SetData(aData, unitNum);
b->SetData(bData, unitNum);
c->SetZeroAll();
/* call sum function */
_Sum(a, b, a);
_Sum(a, b, c);
_SumMe(cMe, b);
cUser = Sum(*a, *b);
/* check results */
cpuTest = a->CheckData(answer, unitNum);
cpuTest = c->CheckData(answer, unitNum)
&& cMe->CheckData(answer, unitNum) && cUser.CheckData(answer, unitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -67,22 +75,34 @@ bool TestSum1()
/* create tensor */
XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initialize variables */
aGPU->SetData(aData, unitNum);
cMeGPU->SetData(aData, unitNum);
bGPU->SetData(bData, unitNum);
cGPU->SetZeroAll();
/* call sum function */
_Sum(aGPU, bGPU, aGPU);
_Sum(aGPU, bGPU, cGPU);
_SumMe(cMeGPU, bGPU);
cUserGPU = Sum(*aGPU, *bGPU);
/* check results */
gpuTest = aGPU->CheckData(answer, unitNum);
gpuTest = cGPU->CheckData(answer, unitNum)
&& cMeGPU->CheckData(answer, unitNum) && cUserGPU.CheckData(answer, unitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] dimSize;
return cpuTest && gpuTest;
......@@ -90,6 +110,8 @@ bool TestSum1()
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete[] dimSize;
return cpuTest;
......@@ -124,17 +146,23 @@ bool TestSum2()
XTensor * a = NewTensor(order, dimSize);
XTensor * b = NewTensor(order, dimSize);
XTensor * c = NewTensor(order, dimSize);
XTensor * cMe = NewTensor(order, dimSize);
XTensor cUser;
/* Initalize variables */
/* initialize variables */
a->SetData(aData, unitNum);
cMe->SetData(aData, unitNum);
b->SetData(bData, unitNum);
c->SetZeroAll();
/* call Sum function */
/* call sum function */
_Sum(a, b, c, beta);
_SumMe(cMe, b, beta);
cUser = Sum(*a, *b, beta);
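/* with the extra beta factor, the expected result is presumably
   c = a + b * beta */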
/* check results */
cpuTest = c->CheckData(answer, unitNum);
cpuTest = c->CheckData(answer, unitNum)
&& cMe->CheckData(answer, unitNum) && cUser.CheckData(answer, unitNum);
#ifdef USE_CUDA
/* GPU test */
......@@ -144,25 +172,33 @@ bool TestSum2()
XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * cGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor * cMeGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
XTensor cUserGPU;
/* Initalize variables */
/* Initialize variables */
aGPU->SetData(aData, unitNum);
cMeGPU->SetData(aData, unitNum);
bGPU->SetData(bData, unitNum);
cGPU->SetZeroAll();
/* call Sum function */
/* call sum function */
_Sum(aGPU, bGPU, cGPU, beta);
_SumMe(cMeGPU, bGPU, beta);
cUserGPU = Sum(*aGPU, *bGPU, beta);
/* check results */
gpuTest = cGPU->CheckData(answer, unitNum);
gpuTest = cGPU->CheckData(answer, unitNum)
&& cMeGPU->CheckData(answer, unitNum) && cUserGPU.CheckData(answer, unitNum);
/* destroy variables */
delete a;
delete b;
delete c;
delete cMe;
delete aGPU;
delete bGPU;
delete cGPU;
delete cMeGPU;
delete[] dimSize;
return cpuTest && gpuTest;
......@@ -171,6 +207,7 @@ bool TestSum2()
delete a;
delete b;
delete c;
delete cMe;
delete[] dimSize;
return cpuTest;
......
......@@ -79,7 +79,7 @@ bool TestSumByColumnTV1()
b->SetData(bData, bUnitNum);
/* call SumByColumnTV function */
SumByColumnTV(a, b, c);
_SumByColumnTV(a, b, c);
/* check results */
cpuTest = c->CheckData(answer, cUnitNum);
......@@ -99,7 +99,7 @@ bool TestSumByColumnTV1()
cGPU->SetZeroAll();
/* call SumByColumnTV function */
SumByColumnTV(aGPU, bGPU, cGPU);
_SumByColumnTV(aGPU, bGPU, cGPU);
/* check results */
gpuTest = cGPU->CheckData(answer, cUnitNum);
......@@ -129,94 +129,6 @@ bool TestSumByColumnTV1()
#endif // USE_CUDA
}
/*
case 2: test SumByColumnTV function
sum of a tensor and a vector (column vector) in a column by column manner
*/
bool TestSumByColumnTV2()
{
/* a tensor of size (2, 4) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 4;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2, 1) */
int bOrder = 2;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
bDimSize[1] = 1;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE bData[2][1] = { {1.0F},
{0.0F} };
DTYPE answer[2][4] = { {1.0F, 2.0F, 3.0F, 4.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(bOrder, bDimSize);
/* initialize variables */
a->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
/* call SumByColumnTV function */
SumByColumnTV(a, b);
/* check results */
cpuTest = a->CheckData(answer, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
/* call SumByColumnTV function */
SumByColumnTV(aGPU, bGPU);
/* check results */
gpuTest = aGPU->CheckData(answer, aUnitNum);
/* destroy variables */
delete a;
delete b;
delete aGPU;
delete bGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
......@@ -237,15 +149,6 @@ bool TestSumByColumnTV()
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestSumByColumnTV2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */
/*
TODO!!
......
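Since case 2 (the two-argument, in-place form) is removed here, a minimal sketch of the surviving three-argument `_SumByColumnTV` may help. It reuses the data from the removed case above, under the assumption that the out-of-place call writes a plus the column-broadcast b into c; the namespace and include are also assumptions.

```cpp
/* minimal sketch of the out-of-place _SumByColumnTV; the broadcast
   semantics are inferred from the removed in-place case above */
#include <cstdio>
#include "XTensor.h"

using namespace nts;   /* namespace is an assumption */

void SumByColumnTVSketch()
{
    int aDimSize[2] = {2, 4};          /* tensor of size (2, 4) */
    int bDimSize[2] = {2, 1};          /* column vector of size (2, 1) */

    DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
                          {4.0F, 5.0F, 6.0F, 7.0F} };
    DTYPE bData[2][1] = { {1.0F},
                          {0.0F} };
    DTYPE answer[2][4] = { {1.0F, 2.0F, 3.0F, 4.0F},
                           {4.0F, 5.0F, 6.0F, 7.0F} };

    XTensor * a = NewTensor(2, aDimSize);
    XTensor * b = NewTensor(2, bDimSize);
    XTensor * c = NewTensor(2, aDimSize);

    a->SetData(aData, 8);
    b->SetData(bData, 2);
    c->SetZeroAll();

    _SumByColumnTV(a, b, c);           /* c[i][j] = a[i][j] + b[i][0] (assumed) */

    printf(c->CheckData(answer, 8) ? "match\n" : "mismatch\n");

    delete a;
    delete b;
    delete c;
}
```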
......@@ -80,7 +80,7 @@ bool TestSumByColumnVT1()
c->SetZeroAll();
/* call SumByColumnVT function */
SumByColumnVT(a, b, c);
_SumByColumnVT(a, b, c);
/* check results */
cpuTest = c->CheckData(answer, cUnitNum);
......@@ -100,7 +100,7 @@ bool TestSumByColumnVT1()
cGPU->SetZeroAll();
/* call SumByColumnVT function */
SumByColumnVT(aGPU, bGPU, cGPU);
_SumByColumnVT(aGPU, bGPU, cGPU);
/* check results */
gpuTest = cGPU->CheckData(answer, cUnitNum);
......@@ -130,94 +130,6 @@ bool TestSumByColumnVT1()
#endif // USE_CUDA
}
/*
case 2: test SumByColumnVT function
sum of a vector (column vector) and a tensor in a column by column manner
*/
bool TestSumByColumnVT2()
{
/* a tensor of size (2, 1) */
int aOrder = 2;
int * aDimSize = new int[aOrder];
aDimSize[0] = 2;
aDimSize[1] = 1;
int aUnitNum = 1;
for (int i = 0; i < aOrder; i++)
aUnitNum *= aDimSize[i];
/* a tensor of size (2, 4) */
int bOrder = 2;
int * bDimSize = new int[bOrder];
bDimSize[0] = 2;
bDimSize[1] = 4;
int bUnitNum = 1;
for (int i = 0; i < bOrder; i++)
bUnitNum *= bDimSize[i];
DTYPE aData[2][1] = { {1.0F},
{0.0F} };
DTYPE bData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
{4.0F, 5.0F, 6.0F, 7.0F} };
DTYPE answer[2][1] = { {7.0F},
{22.0F} };
/* CPU test */
bool cpuTest = true;
/* create tensors */
XTensor * a = NewTensor(aOrder, aDimSize);
XTensor * b = NewTensor(bOrder, bDimSize);
/* initialize variables */
a->SetData(aData, aUnitNum);
b->SetData(bData, bUnitNum);
/* call SumByColumnVT function */
SumByColumnVT(a, b);
/* check results */
cpuTest = a->CheckData(answer, aUnitNum);
#ifdef USE_CUDA
/* GPU test */
bool gpuTest = true;
/* create tensor */
XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
/* Initialize variables */
aGPU->SetData(aData, aUnitNum);
bGPU->SetData(bData, bUnitNum);
/* call SumByColumnVT function */
SumByColumnVT(aGPU, bGPU);
/* check results */
gpuTest = aGPU->CheckData(answer, aUnitNum);
/* destroy variables */
delete a;
delete b;
delete aGPU;
delete bGPU;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest && gpuTest;
#else
/* destroy variables */
delete a;
delete b;
delete[] aDimSize;
delete[] bDimSize;
return cpuTest;
#endif // USE_CUDA
}
/* other cases */
/*
TODO!!
......@@ -238,15 +150,6 @@ bool TestSumByColumnVT()
else
XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */
caseFlag = TestSumByColumnVT2();
if (!caseFlag) {
returnFlag = false;
XPRINT(0, stdout, ">> case 2 failed!\n");
}
else
XPRINT(0, stdout, ">> case 2 passed!\n");
/* other cases test */
/*
TODO!!
......
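The same restructuring applies here: the in-place case 2 is dropped, and only the three-argument `_SumByColumnVT(a, b, c)` remains. Below is a minimal sketch reusing the removed case's data, under the assumption that each entry of c receives the corresponding vector entry plus the row sum of the tensor; namespace and include are assumptions.

```cpp
/* minimal sketch of the out-of-place _SumByColumnVT; the per-row reduction
   semantics are inferred from the removed case 2 above */
#include <cstdio>
#include "XTensor.h"

using namespace nts;   /* namespace is an assumption */

void SumByColumnVTSketch()
{
    int aDimSize[2] = {2, 1};          /* column vector of size (2, 1) */
    int bDimSize[2] = {2, 4};          /* tensor of size (2, 4) */

    DTYPE aData[2][1] = { {1.0F},
                          {0.0F} };
    DTYPE bData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
                          {4.0F, 5.0F, 6.0F, 7.0F} };
    DTYPE answer[2][1] = { {7.0F},     /* 1 + (0 + 1 + 2 + 3) */
                           {22.0F} };  /* 0 + (4 + 5 + 6 + 7) */

    XTensor * a = NewTensor(2, aDimSize);
    XTensor * b = NewTensor(2, bDimSize);
    XTensor * c = NewTensor(2, aDimSize);

    a->SetData(aData, 2);
    b->SetData(bData, 8);
    c->SetZeroAll();

    _SumByColumnVT(a, b, c);           /* c[i] = a[i] + sum over row i of b (assumed) */

    printf(c->CheckData(answer, 2) ? "match\n" : "mismatch\n");

    delete a;
    delete b;
    delete c;
}
```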
......@@ -84,11 +84,11 @@ bool TestTopK1()
/* call TopK function */
int dim = 0;
int k = sDimSize[dim];
TopK(s, t1, index1, dim, k);
_TopK(s, t1, index1, dim, k);
dim = 1;
k = sDimSize[dim];
TopK(s, t2, index2, dim, k);
_TopK(s, t2, index2, dim, k);
/* check results */
cpuTest = t1->CheckData(tAnswer1, tUnitNum) &&
......@@ -117,11 +117,11 @@ bool TestTopK1()
/* call TopK function */
dim = 0;
k = sDimSize[dim];
TopK(sGPU, tGPU1, indexGPU1, dim, k);
_TopK(sGPU, tGPU1, indexGPU1, dim, k);
dim = 1;
k = sDimSize[dim];
TopK(sGPU, tGPU2, indexGPU2, dim, k);
_TopK(sGPU, tGPU2, indexGPU2, dim, k);
/* check results */
gpuTest = tGPU1->CheckData(tAnswer1, tUnitNum) &&
......@@ -207,7 +207,7 @@ bool TestTopK2()
/* call TopK function */
int dim = 1;
int k = tDimSize[dim];
TopK(s, t, index, dim, k);
_TopK(s, t, index, dim, k);
/* check results */
cpuTest = t->CheckData(tAnswer, tUnitNum) && index->CheckData(indexAnswer, tUnitNum);
......@@ -229,7 +229,7 @@ bool TestTopK2()
/* call TopK function */
dim = 1;
k = tDimSize[dim];
TopK(sGPU, tGPU, indexGPU, dim, k);
_TopK(sGPU, tGPU, indexGPU, dim, k);
/* check results */
gpuTest = tGPU->CheckData(tAnswer, tUnitNum) && indexGPU->CheckData(indexAnswer, tUnitNum);
......
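A minimal sketch of the renamed `_TopK` call, following the five-argument signature used in the tests above. The X_INT data type for the index tensor and the ordering of the returned values are assumptions, so no expected answer is asserted here.

```cpp
/* minimal sketch of _TopK(s, t, index, dim, k); X_INT for the index tensor
   and the output ordering are assumptions */
#include "XTensor.h"

using namespace nts;   /* namespace is an assumption */

void TopKSketch()
{
    int sDimSize[2] = {2, 4};
    int tDimSize[2] = {2, 2};          /* keep the top 2 entries along dim 1 */

    DTYPE sData[2][4] = { {5.0F, 1.0F, 4.0F, 2.0F},
                          {0.0F, 3.0F, 6.0F, 7.0F} };

    XTensor * s = NewTensor(2, sDimSize);
    XTensor * t = NewTensor(2, tDimSize);
    XTensor * index = NewTensor(2, tDimSize, X_INT, 1.0F, -1);

    s->SetData(sData, 8);
    t->SetZeroAll();
    index->SetZeroAll();

    int dim = 1;
    int k = 2;
    _TopK(s, t, index, dim, k);        /* t gets the k largest values along dim,
                                          index their positions in s */

    delete s;
    delete t;
    delete index;
}
```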
......@@ -91,8 +91,8 @@ bool TestUnsqueeze1()
t2->SetZeroAll();
/* call Unsqueeze function */
Unsqueeze(s, t1, 1, 2);
Unsqueeze(s, t2, 2, 2);
_Unsqueeze(s, t1, 1, 2);
_Unsqueeze(s, t2, 2, 2);
/* check results */
cpuTest = t1->CheckData(answer1, tUnitNum1) && t2->CheckData(answer2, tUnitNum2);
......@@ -112,8 +112,8 @@ bool TestUnsqueeze1()
tGPU2->SetZeroAll();
/* call Unsqueeze function */
Unsqueeze(sGPU, tGPU1, 1, 2);
Unsqueeze(sGPU, tGPU2, 2, 2);
_Unsqueeze(sGPU, tGPU1, 1, 2);
_Unsqueeze(sGPU, tGPU2, 2, 2);
/* check results */
gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tGPU2->CheckData(answer2, tUnitNum2);
......
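A minimal sketch of the renamed `_Unsqueeze(s, t, dim, dSize)`, assuming (as the name and the test setup suggest) that it inserts a new dimension of size dSize at position dim and copies the source data along it; namespace and include are assumptions.

```cpp
/* minimal sketch of _Unsqueeze; the duplication semantics along the
   inserted dimension are an assumption */
#include "XTensor.h"

using namespace nts;   /* namespace is an assumption */

void UnsqueezeSketch()
{
    int sDimSize[2] = {2, 3};
    int tDimSize[3] = {2, 2, 3};       /* a new dimension of size 2 inserted at position 1 */

    DTYPE sData[2][3] = { {0.0F, 1.0F, 2.0F},
                          {3.0F, 4.0F, 5.0F} };

    XTensor * s = NewTensor(2, sDimSize);
    XTensor * t = NewTensor(3, tDimSize);

    s->SetData(sData, 6);
    t->SetZeroAll();

    _Unsqueeze(s, t, 1, 2);            /* t[i][j][k] = s[i][k] for j = 0, 1 (assumed) */

    delete s;
    delete t;
}
```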
......@@ -29,10 +29,13 @@ bool Test()
bool wrong = false;
XPRINT(0, stdout, "Testing the XTensor utilities ... \n\n");
wrong = !TestAbsolute() || wrong;
wrong = !TestConcatenate() || wrong;
wrong = !TestConcatenateSolely() || wrong;
wrong = !TestConvertDataType() || wrong;
wrong = !TestCopyIndexed() || wrong;
wrong = !TestCopyValues() || wrong;
wrong = !TestLog() || wrong;
wrong = !TestMatrixMul() || wrong;
wrong = !TestMatrixMul2D() || wrong;
wrong = !TestMatrixMul2DParallel() || wrong;
......@@ -52,6 +55,7 @@ bool Test()
wrong = !TestSelect() || wrong;
wrong = !TestSetAscendingOrder() || wrong;
wrong = !TestSetData() || wrong;
wrong = !TestSign() || wrong;
wrong = !TestSort() || wrong;
wrong = !TestSplit() || wrong;
wrong = !TestSum() || wrong;
......
......@@ -22,10 +22,13 @@
#ifndef __TEST_H__
#define __TEST_H__
#include "TAbsolute.h"
#include "TConcatenate.h"
#include "TConcatenateSolely.h"
#include "TConvertDataType.h"
#include "TCopyIndexed.h"
#include "TCopyValues.h"
#include "TLog.h"
#include "TMatrixMul.h"
#include "TMatrixMul2D.h"
#include "TMatrixMul2DParallel.h"
......@@ -45,6 +48,7 @@
#include "TSelect.h"
#include "TSetAscendingOrder.h"
#include "TSetData.h"
#include "TSign.h"
#include "TSort.h"
#include "TSplit.h"
#include "TSum.h"
......
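The two hunks above show the registration pattern this commit follows for each new operation: an include added to Test.h and a chained call added to Test(). The self-contained sketch below mirrors that pattern; TestNewOp is a hypothetical placeholder, not part of the library.

```cpp
/* minimal sketch of the test registration pattern used by Test()/Test.h;
   TestNewOp is a hypothetical placeholder */
#include <cstdio>

bool TestNewOp()                /* would normally live in TNewOp.h / TNewOp.cpp */
{
    return true;                /* run the CPU/GPU cases and return the flag */
}

bool RunAll()
{
    bool wrong = false;
    /* each test is chained the same way the commit adds TestSign and the others */
    wrong = !TestNewOp() || wrong;
    return !wrong;
}

int main()
{
    printf(RunAll() ? "all cases passed\n" : "some case failed\n");
    return 0;
}
```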