merged

abeb3e64 · liyinqiao · dcabc2b0 · 414ff54f · abeb3e64 · abeb3e64
Commit abeb3e64 authored Jul 08, 2018 by liyinqiao
--- a/source/XTensor.cpp
+++ b/source/XTensor.cpp
@@ -38,7 +38,7 @@
 #include "XMem.h"
 #include "XHeap.h"
 #include "XBLAS.h"
-#include "core/MergeBlockLists.h"
+#include "core/shape/MergeBlockLists.h"
 #ifdef USE_CUDA
@@ -47,8 +47,8 @@
 #include <cublas_v2.h>
 #include <cuda.h>
 #include <curand.h>
-#include "core/FlushToMem.cuh"
+#include "core/utilities/FlushToMem.cuh"
-#include "core/SetAscendingOrder.cuh"
+#include "core/utilities/SetAscendingOrder.cuh"
 #endif
@@ -555,6 +555,27 @@ bool XTensor::CheckData(const void * d, int num, int beg)
    return true;
 }
+/*
+check whether the data array matches the answer within a tolerance
+>> d - the answer array (read as DTYPE values)
+>> num - number of items to compare, must be unitNum - beg
+>> tolerance - maximum absolute difference allowed for each item
+>> beg - offset of the first item of the tensor to compare
+<< return - true if every compared item is within the tolerance
+*/
+bool XTensor::CheckData(const void * d, int num, float tolerance, int beg)
+{
+    if (data == NULL || d == NULL)
+        return false;
+    CheckNTErrors(!isSparse, "TODO");
+    /* a negative offset would make the loop below run outside the data array */
+    CheckNTErrors(beg >= 0 && num == unitNum - beg, "Illegal size!");
+    /* the size check implies that the num items starting at offset beg are compared;
+       NOTE(review): assumes answer[0] lines up with data[beg] - confirm against callers */
+    DTYPE * valuePtr = (DTYPE*)data + beg;
+    const DTYPE * answerPtr = (const DTYPE*)d;
+    for (int i = 0; i < num; i++) {
+        /* every item is read through ToCPU with the tensor's devID */
+        DTYPE value = ToCPU(devID, valuePtr + i);
+        /* written as !(diff <= tolerance) so that a NaN difference counts as a mismatch */
+        if (!(fabs(value - answerPtr[i]) <= tolerance))
+            return false;
+    }
+    return true;
+}
 /* 
 set the cell to the ascending order along a given dimension 
 >> dim - the dimension specified
@@ -697,6 +718,63 @@ DTYPE XTensor::Get3D(int d0, int d1, int d2)
 }
 /*
+get the value of a cell in a 1d tensor in int type
+>> i - index
+<< return - value of cell(i) in int
+*/
+int XTensor::Get1DInt(int i)
+{
+    /* the tensor must be 1d, the index in range, and the data of int type */
+    CheckNTErrors((order == 1), "Cannot get a 1d cell for a tensor whose order is not 1!");
+    CheckNTErrors((i >= 0 && i < dimSize[0]), "dimension 0 is out of range!");
+    CheckNTErrors((dataType == X_INT), "The tensor is not in int type.");
+    /* named dims (not dimSize) so that the member dimSize is not shadowed */
+    int dims[1] = {i};
+    void * value = GetCell(dims, 1);
+    /* the cell is read as an int through ToCPUInt with the tensor's devID */
+    return ToCPUInt(devID, value);
+}
+/* 
+get the value of a cell in a 2d tensor in int type
+>> ni - row index
+>> mi - column index
+<< return - value of cell(ni, mi) in int
+*/
+int XTensor::Get2DInt(int ni, int mi)
+{
+    /* the tensor must be 2d, both indices in range, and the data of int type */
+    CheckNTErrors((order == 2), "Cannot get a 2d cell for a tensor whose order is not 2!");
+    CheckNTErrors((ni >= 0 && ni < dimSize[0]), "dimension 0 is out of range!");
+    CheckNTErrors((mi >= 0 && mi < dimSize[1]), "dimension 1 is out of range!");
+    /* the message names the type that is actually checked (X_INT) */
+    CheckNTErrors((dataType == X_INT), "The tensor is not in int type.");
+    int dims[2] = {ni, mi};
+    void * value = GetCell(dims, 2);
+    /* the cell is read as an int through ToCPUInt with the tensor's devID */
+    return ToCPUInt(devID, value);
+}
+/* 
+get the value of a cell in a 3d tensor in int type
+>> d0 - index of dimension 0
+>> d1 - index of dimension 1
+>> d2 - index of dimension 2
+<< return - value of cell(d0, d1, d2) in int
+*/
+int XTensor::Get3DInt(int d0, int d1, int d2)
+{
+    /* the tensor must be 3d, all indices in range, and the data of int type */
+    CheckNTErrors((order == 3), "Cannot get a 3d cell for a tensor whose order is not 3!");
+    CheckNTErrors((d0 >= 0 && d0 < dimSize[0]), "dimension 0 is out of range!");
+    CheckNTErrors((d1 >= 0 && d1 < dimSize[1]), "dimension 1 is out of range!");
+    CheckNTErrors((d2 >= 0 && d2 < dimSize[2]), "dimension 2 is out of range!");
+    /* the message names the type that is actually checked (X_INT) */
+    CheckNTErrors((dataType == X_INT), "The tensor is not in int type.");
+    int dims[3] = {d0, d1, d2};
+    void * value = GetCell(dims, 3);
+    /* the cell is read as an int through ToCPUInt with the tensor's devID */
+    return ToCPUInt(devID, value);
+}
+/* 
 get the value of a cell in the sparse tensor 
 >> i - i-th tuple in the tuple list of the sparse tensor
 << return - value of the tuple

--- a/source/XTensor.h
+++ b/source/XTensor.h
@@ -211,6 +211,9 @@ struct XTensor
    /* check whether the data array is the same as the answer */
    bool CheckData(const void * answer, int num, int beg = 0);
+    /* check whether the data array matches the answer within a given tolerance */
+    bool CheckData(const void * answer, int num, float tolerance, int beg = 0);
    /* set the cell to the ascending order along a given dimension */
    void SetAscendingOrder(int dim);
@@ -220,15 +223,24 @@ struct XTensor
    /* get the pointer to a cell */
    void * GetCell(int index[], int size = -1);
-    /* get the value of a cell in a 1d tensor */
+    /* get the default type value of a cell in a 1d tensor */
    DTYPE Get1D(int i);
-    /* get the value of a cell in a 2d tensor */
+    /* get the default type value of a cell in a 2d tensor */
    DTYPE Get2D(int ni, int mi);
-    /* get the value of a cell in a 3d tensor */
+    /* get the default type value of a cell in a 3d tensor */
    DTYPE Get3D(int d0, int d1, int d2);
+    /* get the int value of a cell in a 1d tensor */
+    int Get1DInt(int i);
+    /* get the int value of a cell in a 2d tensor */
+    int Get2DInt(int ni, int mi);
+    /* get the int value of a cell in a 3d tensor */
+    int Get3DInt(int d0, int d1, int d2);
    /* get the value of a cell in a sparse tensor */
    DTYPE GetInSparse(int i);

--- a/source/core/CHeader.h
+++ b/source/core/CHeader.h
@@ -26,43 +26,49 @@
 #include "../XTensor.h"
-#include "Concatenate.h"
+#include "shape/Concatenate.h"
-#include "ConcatenateSolely.h"
+#include "shape/ConcatenateSolely.h"
-#include "CopyIndexed.h"
+#include "movement/CopyBlocks.h"
-#include "CopyInGrid.h"
+#include "movement/CopyBlocksInGrid.h"
-#include "CopyValues.h"
+#include "movement/CopyBlocksOnSite.h"
-#include "FlushToMem.h"
+#include "movement/CopyData2D.h"
-#include "MakeMergeBlockIndex.h"
+#include "movement/CopyIndexed.h"
-#include "MakeSplitBlockIndex.h"
+#include "movement/CopyInGrid.h"
-#include "MatrixMul.h"
+#include "movement/CopyValues.h"
-#include "MatrixMul2D.h"
+#include "utilities/FlushToMem.h"
-#include "MatrixMul2DMultiTheading.h"
+#include "shape/MakeMergeBlockIndex.h"
-#include "MatrixMul2DParallel.h"
+#include "shape/MakeSplitBlockIndex.h"
-#include "MatrixMulBatched.h"
+#include "arithmetic/MatrixMul.h"
-#include "MatrixMULBatchedCPU.h"
+#include "arithmetic/MatrixMul2D.h"
-#include "Merge.h"
+#include "arithmetic/MatrixMul2DMultiTheading.h"
-#include "MergeBlockLists.h"
+#include "arithmetic/MatrixMul2DParallel.h"
-#include "Multiply.h"
+#include "arithmetic/MatrixMulBatched.h"
-#include "Negate.h"
+#include "arithmetic/MatrixMULBatchedCPU.h"
-#include "Normalize.h"
+#include "shape/Merge.h"
-#include "Permute.h"
+#include "shape/MergeBlockLists.h"
-#include "Power.h"
+#include "arithmetic/Multiply.h"
-#include "ReduceMax.h"
+#include "arithmetic/Negate.h"
-#include "ReduceMean.h"
+#include "math/Normalize.h"
-#include "ReduceStandardVariance.h"
+#include "shape/Permute.h"
-#include "ReduceSum.h"
+#include "math/Power.h"
-#include "ReduceSumSquared.h"
+#include "reduce/ReduceMax.h"
-#include "ReduceVariance.h"
+#include "reduce/ReduceMean.h"
-#include "ScaleAndShift.h"
+#include "reduce/ReduceStandardVariance.h"
-#include "SetData.h"
+#include "reduce/ReduceSum.h"
-#include "Sort.h"
+#include "reduce/ReduceSumSquared.h"
-#include "Split.h"
+#include "reduce/ReduceVariance.h"
-#include "Sum.h"
+#include "math/ScaleAndShift.h"
-#include "SumByColumnTV.h"
+#include "getandset/Select.h"
-#include "SumByColumnVT.h"
+#include "getandset/SetData.h"
-#include "TopK.h"
+#include "sort/Sort.h"
-#include "Unsqueeze.h"
+#include "shape/Split.h"
-#include "XMatrixSegment.h"
+#include "arithmetic/Sum.h"
-#include "XTensorBLAS.h"
+#include "arithmetic/SumByColumnTV.h"
+#include "arithmetic/SumByColumnVT.h"
+#include "sort/TopK.h"
+#include "shape/Transpose.h"
+#include "shape/Unsqueeze.h"
+#include "utilities/XMatrixSegment.h"
+#include "arithmetic/XTensorBLAS.h"
 #endif // __CHEADER_H__
\ No newline at end of file
--- a/source/core/XTensorCore.h
+++ b/source/core/XTensorCore.h
@@ -219,9 +219,8 @@ public:
    /* insert a dimension by copying the blocks for x times (where x is the size of the inerted dimension) */
    void Unsqueeze(XTensor * a, XTensor * b, int dim, int dSize);
-    /*******************************************************************
+    /* segmentation and parallel processing for 2d tensors (i.e., matrices) */
-    segmentation and parallel processing for 2d tensors (i.e., matrices)
-    */
    /* segment a 2d tensor (i.e., matrix) into blocks and run jobs in parallel */
    static
    void RunParallel2D(XPRunner * parallelRunner, void * job, int opNum, int rowNum, int colNum, int argNum, ...);

--- a/source/core/MatrixMULBatchedCPU.cpp
+++ b/source/core/MatrixMULBatchedCPU.cpp
@@ -19,7 +19,7 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XTensor.h"
+#include "../../XTensor.h"
 #include "MatrixMULBatchedCPU.h"
 #include "MatrixMul2D.h"
 #include "XTensorBLAS.h"
@@ -33,9 +33,9 @@ c_i = trans(a_i) * trans(b_i) * \alpha + c_i * \beta for each i in [0,count-1]
 >> transposedA - indicate whether the matrix a is transposed
 >> b - another list of input matrices (2d tensors)
 >> transposedB - indicate whether the matrix b is transposed
+>> c - output matrix (2d tensor)
 >> alpha - scalar
 >> beta - scalar
->> c - output matrix (2d tensor)
 */
 void MatrixMULBatchedCPU(XList * a, MATRIX_TRANS_TYPE transposedA,
                         XList * b, MATRIX_TRANS_TYPE transposedB,
@@ -64,10 +64,6 @@ void MatrixMULBatchedCPU(XList * a, MATRIX_TRANS_TYPE transposedA,
        }
    }
-    //if(isUniform){
-    //}
-    //else{
    for (int i = 0; i < a->count; i++) {
        XTensor * ai = (XTensor*)a->GetItem(i);
        XTensor * bi = (XTensor*)b->GetItem(i);

--- a/source/core/MatrixMULBatchedCPU.h
+++ b/source/core/MatrixMULBatchedCPU.h
@@ -22,7 +22,7 @@
 #ifndef __MATRIXMULBATCHEDCPU_H__
 #define __MATRIXMULBATCHEDCPU_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/MatrixMul.cpp
+++ b/source/core/MatrixMul.cpp
@@ -19,9 +19,9 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XTensor.h"
+#include "../../XTensor.h"
-#include "../XDevice.h"
+#include "../../XDevice.h"
-#include "../XName.h"
+#include "../../XName.h"
 #include "MatrixMul.h"
 #include "MatrixMul2D.h"
 #include "MatrixMULBatchedCPU.h"
@@ -65,13 +65,12 @@ void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA,
    XLink::AddParamToHeadInt(c, transposedB);
    XLink::AddParamToHead(c, alpha);
    XLink::AddParamToHead(c, beta);
+    int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
-    int an = transposedA == X_TRANS ? a->dimSize[1] : a->dimSize[0];
+    int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
-    int am = transposedA == X_TRANS ? a->dimSize[0] : a->dimSize[1];
+    int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
-    int bn = transposedB == X_TRANS ? b->dimSize[1] : b->dimSize[0];
+    int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
-    int bm = transposedB == X_TRANS ? b->dimSize[0] : b->dimSize[1];
+    int cn = c->dimSizeRDI[1];
-    int cn = c->dimSize[0];
+    int cm = c->dimSizeRDI[0];
-    int cm = c->dimSize[1];
    CheckNTErrors((am == bn && an == cn && bm == cm),
        "Unmatched tensors in multiplication!");
@@ -87,13 +86,13 @@ void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA,
    int cBlockNum = 1;
    for (int i = 2; i < a->order; i++) {
-        CheckNTErrors((a->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
+        CheckNTErrors((a->dimSizeRDI[i] == c->dimSizeRDI[i - 2 + b->order]), "Incorrect tensor sizes!");
        aBlockNum *= a->dimSizeRDI[i];
        cBlockNum *= a->dimSizeRDI[i];
    }
    for (int i = 2; i < b->order; i++) {
-        CheckNTErrors((b->dimSizeRDI[i] == c->dimSizeRDI[i - 2 + a->order]), "Incorrect tensor sizes!");
+        CheckNTErrors((b->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
        bBlockNum *= b->dimSizeRDI[i];
        cBlockNum *= b->dimSizeRDI[i];
    }
@@ -101,9 +100,9 @@ void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA,
    XList * aList = new XList(10);
    XList * bList = new XList(10);
    XList * cList = new XList(10);
-    int aDimSize[2] = { -a->dimSize[0], a->dimSize[1] };
+    int aDimSize[2] = { a->dimSizeRDI[1], a->dimSizeRDI[0] };
-    int bDimSize[2] = { -b->dimSize[0], b->dimSize[1] };
+    int bDimSize[2] = { b->dimSizeRDI[1], b->dimSizeRDI[0] };
-    int cDimSize[2] = { -c->dimSize[0], c->dimSize[1] };
+    int cDimSize[2] = { c->dimSizeRDI[1], c->dimSizeRDI[0] };
    bool isSparseMul = false;

--- a/source/core/MatrixMul.h
+++ b/source/core/MatrixMul.h
@@ -22,7 +22,7 @@
 #ifndef __MATRIXMUL_H__
 #define __MATRIXMUL_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/MatrixMul2D.cpp
+++ b/source/core/MatrixMul2D.cpp
@@ -19,8 +19,8 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XTensor.h"
+#include "../../XTensor.h"
-#include "../XName.h"
+#include "../../XName.h"
 #include "MatrixMul2D.h"
 #include "MatrixMul2D.cuh"
 #include "MatrixMul2DParallel.h"

--- a/source/core/MatrixMul2D.cu
+++ b/source/core/MatrixMul2D.cu
@@ -19,8 +19,8 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XDevice.h"
+#include "../../XDevice.h"
-#include "../XTensor.h"
+#include "../../XTensor.h"
 #include "MatrixMul2D.h"
 #include "MatrixMul2D.cuh"
 #include "XTensorBLAS.h"
@@ -37,11 +37,13 @@ c = a * b * \alpha
 >> aColSize - column size of matrix a
 >> aRowSize - row size of matrix a
 >> b - a sparse matrix
->> transposedA - indicates whether b is transposed
+>> transposedB - indicates whether b is transposed
 >> bNonZeroNum - number of non-zero items in b
 >> bColSize - column size of matrix b
 >> bRowSize - row size of matrix b
 >> c - the resulting (dense) matrix
+>> cColSize - column size of matrix c
+>> cRowSize - row size of matrix c
 >> alpha - the scaling factor
 */
 extern "C" __global__
@@ -147,7 +149,6 @@ void CudaMatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA,
    if (!a->isSparse && !b->isSparse) {
        CheckNTErrors((!c->isSparse), "Illegal use of sparse matrix in multiplication!");
-        //cublasHandle_t * handle = GDevs->GetCudaHandle(a->devID);
        cublasHandle_t * handle = a->mem == NULL ? GDevs.GetCudaHandle(a->devID) : a->mem->GetCublasHandle();
        /* !!!! might have problems */
@@ -183,7 +184,6 @@ void CudaMatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA,
            if (beta == 0)
                c->SetZeroAll();
            else if (beta != 1.0F) {
-                //XTensor::ScaleAndShift(c, beta, 0);
                ShowNTErrors("TODO!");
            }

--- a/source/core/MatrixMul2D.cuh
+++ b/source/core/MatrixMul2D.cuh
--- a/source/core/MatrixMul2D.h
+++ b/source/core/MatrixMul2D.h
@@ -22,7 +22,7 @@
 #ifndef __MATRIXMUL2D_H__
 #define __MATRIXMUL2D_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/MatrixMul2DMultiTheading.cpp
+++ b/source/core/MatrixMul2DMultiTheading.cpp
@@ -19,7 +19,7 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XTensor.h"
+#include "../../XTensor.h"
 #include "MatrixMul2DMultiTheading.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/MatrixMul2DMultiTheading.h
+++ b/source/core/MatrixMul2DMultiTheading.h
@@ -22,7 +22,7 @@
 #ifndef __MATRIXMUL2DMULTITHEADING_H__
 #define __MATRIXMUL2DMULTITHEADING_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/MatrixMul2DParallel.cpp
+++ b/source/core/MatrixMul2DParallel.cpp
@@ -19,10 +19,10 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XTensor.h"
+#include "../../XTensor.h"
 #include "MatrixMul2DParallel.h"
 #include "MatrixMul2DMultiTheading.h"
-#include "XMatrixSegment.h"
+#include "../utilities/XMatrixSegment.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/MatrixMul2DParallel.h
+++ b/source/core/MatrixMul2DParallel.h
@@ -22,7 +22,7 @@
 #ifndef __MATRIXMUL2DPARALLEL_H__
 #define __MATRIXMUL2DPARALLEL_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/MatrixMulBatched.cpp
+++ b/source/core/MatrixMulBatched.cpp
@@ -19,9 +19,9 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XTensor.h"
+#include "../../XTensor.h"
-#include "../XDevice.h"
+#include "../../XDevice.h"
-#include "../XName.h"
+#include "../../XName.h"
 #include "MatrixMulBatched.h"
 #include "MatrixMULBatchedCPU.h"
 #include "XTensorBLAS.h"
@@ -41,6 +41,7 @@ where trans() returns the transposed matrix if the flag is fired
 >> c - where we keep a*b
 >> alpha - a coefficient
 >> beta - another coefficient
+>> parallelRunner - parallel processing module
 */
 void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
                      XTensor * b, MATRIX_TRANS_TYPE transposedB,
@@ -59,13 +60,12 @@ void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
    XLink::AddParamToHeadInt(c, transposedB);
    XLink::AddParamToHead(c, alpha);
    XLink::AddParamToHead(c, beta);
+    int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
-    int an = transposedA == X_TRANS ? a->dimSize[1] : a->dimSize[0];
+    int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
-    int am = transposedA == X_TRANS ? a->dimSize[0] : a->dimSize[1];
+    int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
-    int bn = transposedB == X_TRANS ? b->dimSize[1] : b->dimSize[0];
+    int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
-    int bm = transposedB == X_TRANS ? b->dimSize[0] : b->dimSize[1];
+    int cn = c->dimSizeRDI[1];
-    int cn = c->dimSize[0];
+    int cm = c->dimSizeRDI[0];
-    int cm = c->dimSize[1];
    CheckNTErrors((am == bn && an == cn && bm == cm),
        "Unmatched tensors in multiplication!");
@@ -87,9 +87,9 @@ void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
    XList * aList = new XList(10);
    XList * bList = new XList(10);
    XList * cList = new XList(10);
-    int aDimSize[2] = { -a->dimSizeRDI[0], a->dimSizeRDI[1] };
+    int aDimSize[2] = { -a->dimSizeRDI[1], a->dimSizeRDI[0] };
-    int bDimSize[2] = { -b->dimSizeRDI[0], b->dimSizeRDI[1] };
+    int bDimSize[2] = { -b->dimSizeRDI[1], b->dimSizeRDI[0] };
-    int cDimSize[2] = { -c->dimSizeRDI[0], c->dimSizeRDI[1] };
+    int cDimSize[2] = { -c->dimSizeRDI[1], c->dimSizeRDI[0] };
    for (int p = 0; p < blockNum; p++) {
        void * ap = (char*)a->data + aRealBlockSize * p;
@@ -114,7 +114,8 @@ void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
        int devIDBackup;
        ProtectCudaDev(a->devID, devIDBackup);
-        CudaBLASMatrixMULList(a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID),
+        cublasHandle_t * handle = a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID);
+        CudaBLASMatrixMULList(handle,
 							  aList, transposedA,
                              bList, transposedB,
                              cList, aList->count,

--- a/source/core/MatrixMulBatched.h
+++ b/source/core/MatrixMulBatched.h
@@ -22,7 +22,7 @@
 #ifndef __MATRIXMULBATCHED_H__
 #define __MATRIXMULBATCHED_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/Multiply.cpp
+++ b/source/core/Multiply.cpp
@@ -19,12 +19,13 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XTensor.h"
+#include "../../XTensor.h"
-#include "../XName.h"
+#include "../../XName.h"
 #include "Multiply.h"
 #include "Multiply.cuh"
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /*
 element-wise product of two tensors
 c(i) = a(i)*b(i) + \alpha * c(i)

--- a/source/core/Multiply.cu
+++ b/source/core/Multiply.cu
@@ -19,8 +19,8 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XDevice.h"
+#include "../../XDevice.h"
-#include "../XTensor.h"
+#include "../../XTensor.h"
 #include "Multiply.h"
 #include "Multiply.cuh"
@@ -68,6 +68,7 @@ where |a_lead| means the size of the leading dimension of a
 >> a - tensor a
 >> b - tensor b
 >> c - result tensor
+>> alpha - the coefficient
 >> stride - the number of items we go over when move next along the leading dimension in a block
 >> ldSizeA - size of the leading dimension of a
 >> ldSizeB - size of the leading dimension of b

--- a/source/core/Multiply.cuh
+++ b/source/core/Multiply.cuh
--- a/source/core/Multiply.h
+++ b/source/core/Multiply.h
@@ -22,7 +22,7 @@
 #ifndef __MULTIPLY_H__
 #define __MULTIPLY_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/Negate.cpp
+++ b/source/core/Negate.cpp
@@ -19,15 +19,15 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XTensor.h"
+#include "../../XTensor.h"
 #include "Negate.h"
 #include "Negate.cuh"
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /*
-    set every entry to its minus value
+set every entry to its minus value
-    >> a - the tensor we are processing
+>> a - the tensor we are processing
 */
 void Negate(XTensor * a)
 {

--- a/source/core/Negate.cu
+++ b/source/core/Negate.cu
@@ -19,8 +19,8 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XDevice.h"
+#include "../../XDevice.h"
-#include "../XTensor.h"
+#include "../../XTensor.h"
 #include "Negate.h"
 #include "Negate.cuh"
@@ -42,10 +42,10 @@ void KernelNegate(DTYPE * d, int size)
 }
 /*
-    set each entry to its negtive value (CUDA Kernel)
+set each entry to its negative value (CUDA Kernel)
-    This is for float16 computation
+This is for float16 computation
-    >> d - pointer to the data array
+>> d - pointer to the data array
-    >> size - size of the data array
+>> size - size of the data array
 */
 __global__
 void KernelNegate(__half * d, int size)

--- a/source/core/Negate.cuh
+++ b/source/core/Negate.cuh
--- a/source/core/Negate.h
+++ b/source/core/Negate.h
@@ -22,7 +22,7 @@
 #ifndef __NEGATE_H__
 #define __NEGATE_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/Sum.cpp
+++ b/source/core/Sum.cpp
@@ -19,8 +19,8 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XTensor.h"
+#include "../../XTensor.h"
-#include "../XName.h"
+#include "../../XName.h"
 #include "Sum.h"
 #include "Sum.cuh"

--- a/source/core/Sum.cu
+++ b/source/core/Sum.cu
@@ -19,12 +19,13 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XDevice.h"
+#include "../../XDevice.h"
 #include "Sum.cuh"
 namespace nts { // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA
 /*
 summation of data arrays (CUDA Kernel)
 c = a  + b * \beta

--- a/source/core/Sum.cuh
+++ b/source/core/Sum.cuh
@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA
-		/* summation of data arrays (CUDA Kernel) */
+/* summation of data arrays (CUDA Kernel) */
 extern "C" __global__
 void KernelADD(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta = (DTYPE)1.0);

--- a/source/core/Sum.h
+++ b/source/core/Sum.h
@@ -22,7 +22,7 @@
 #ifndef __SUM_H__
 #define __SUM_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/SumByColumnTV.cpp
+++ b/source/core/SumByColumnTV.cpp
@@ -19,7 +19,7 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XTensor.h"
+#include "../../XTensor.h"
 #include "SumByColumnTV.h"
 #include "SumByColumnTV.cuh"

--- a/source/core/SumByColumnTV.cu
+++ b/source/core/SumByColumnTV.cu
@@ -19,8 +19,8 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XDevice.h"
+#include "../../XDevice.h"
-#include "../XTensor.h"
+#include "../../XTensor.h"
 #include "SumByColumnTV.h"
 #include "SumByColumnTV.cuh"

--- a/source/core/SumByColumnTV.cuh
+++ b/source/core/SumByColumnTV.cuh
@@ -22,7 +22,7 @@
 #ifndef __REDUCEMAX_CUH__
 #define __REDUCEMAX_CUH__
-#include "ReduceMax.h"
+#include "../reduce/ReduceMax.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/SumByColumnTV.h
+++ b/source/core/SumByColumnTV.h
@@ -22,7 +22,7 @@
 #ifndef __SUMBYCOLUMNTV_H__
 #define __SUMBYCOLUMNTV_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/SumByColumnVT.cpp
+++ b/source/core/SumByColumnVT.cpp
@@ -19,7 +19,7 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XTensor.h"
+#include "../../XTensor.h"
 #include "SumByColumnVT.h"
 #include "SumByColumnVT.cuh"

--- a/source/core/SumByColumnVT.cu
+++ b/source/core/SumByColumnVT.cu
@@ -19,14 +19,15 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XDevice.h"
+#include "../../XDevice.h"
-#include "../XTensor.h"
+#include "../../XTensor.h"
 #include "SumByColumnVT.h"
 #include "SumByColumnVT.cuh"
 namespace nts { // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA
 /*
 summation of a vector (column vector) and a tensor
 c = a + \sum{col} b_col * \beta

--- a/source/core/SumByColumnVT.cuh
+++ b/source/core/SumByColumnVT.cuh
--- a/source/core/SumByColumnVT.h
+++ b/source/core/SumByColumnVT.h
@@ -22,11 +22,10 @@
 #ifndef __SUMBYCOLUMNVT_H__
 #define __SUMBYCOLUMNVT_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /* sum of a (column) vector and a tensor */
 extern "C"
 void SumByColumnVT(XTensor * a, XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);

--- a/source/core/XTensorBLAS.cpp
+++ b/source/core/XTensorBLAS.cpp
@@ -20,8 +20,8 @@
 */
 #include "XTensorBLAS.h"
-#include "../XTensor.h"
+#include "../../XTensor.h"
-#include "../XBLAS.h"
+#include "../../XBLAS.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/XTensorBLAS.cu
+++ b/source/core/XTensorBLAS.cu
@@ -19,9 +19,9 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XUtility.h"
+#include "../../XUtility.h"
-#include "../XDevice.h"
+#include "../../XDevice.h"
-#include "../XTensor.h"
+#include "../../XTensor.h"
 #include "XTensorBLAS.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/XTensorBLAS.h
+++ b/source/core/XTensorBLAS.h
@@ -22,7 +22,7 @@
 #ifndef __XTENSORBLAS_H__
 #define __XTENSORBLAS_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/ConvertDataType.cu
+++ b/source/core/ConvertDataType.cu
@@ -19,8 +19,8 @@
 * $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-06-14
 */
-#include "../XTensor.h"
+#include "../../XTensor.h"
-#include "../XDevice.h"
+#include "../../XDevice.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/Select.cpp
+++ b/source/core/Select.cpp
@@ -19,8 +19,8 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-04
 */
-#include "../XUtility.h"
+#include "../../XUtility.h"
-#include "../XName.h"
+#include "../../XName.h"
 #include "Select.h"
 namespace nts{ // namespace nts(NiuTrans.Tensor)
@@ -33,7 +33,7 @@ c = select(a)
 >> dim - the dimension along with which we do the job
 >> low - lower bound
 >> high - higher bound.
-          Note that range [1,3] means that we select 1 and 2.
+Note that range [1,3] means that we select 1 and 2.
 */
 void SelectRange(XTensor * a, XTensor * c, int dim, int low, int high)
 {
@@ -48,7 +48,7 @@ void SelectRange(XTensor * a, XTensor * c, int dim, int low, int high)
    for(int i = 0; i < a->order; i++){
        if(i == dim){
            CheckNTErrors(low > 0 && low < a->dimSize[dim], "Illegal range specified!");
-            CheckNTErrors(high > 0 && high < a->dimSize[dim], "Illegal range specified!");
+            CheckNTErrors(high > 0 && high <= a->dimSize[dim], "Illegal range specified!");
        }
        else{
            CheckNTErrors(a->dimSize[i] == c->dimSize[i], "The size of the dimensions should be same!");
@@ -62,20 +62,24 @@ void SelectRange(XTensor * a, XTensor * c, int dim, int low, int high)
    XLink::AddParamToHeadInt(c, high);
    int stride = 1;
-    for(int i = 0; i < dim; i++)
+    int dimRDI = a->order - dim - 1;
+    for(int i = 0; i < dimRDI; i++)
        stride *= a->dimSizeRDI[i];
+    int copyTimes = 1;
+    for (int i = dimRDI + 1; i < a->order; i++) 
+        copyTimes *= a->dimSizeRDI[i];
    int blockSize = stride * (high - low) * a->unitSize;
    int stepSizeS = stride * a->dimSize[dim] * a->unitSize;
    int stepSizeT = stride * c->dimSize[dim] * a->unitSize;
    char * s = (char*)a->data + stride * low * a->unitSize;
    char * t = (char*)c->data;
-    for(int i = 0; i < high - low; i++){
+    for(int i = 0; i < copyTimes; i++){
        XMemCopy(t, c->devID, s, a->devID, blockSize);
        s += stepSizeS;
        t += stepSizeT;
    }
 }
 } // namespace nts(NiuTrans.Tensor)
--- a/source/core/Select.cu
+++ b/source/core/Select.cu
--- a/source/core/Select.cuh
+++ b/source/core/Select.cuh
--- a/source/core/Select.h
+++ b/source/core/Select.h
@@ -22,7 +22,7 @@
 #ifndef __SELECT_H__
 #define __SELECT_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts{ // namespace nts(NiuTrans.Tensor)

--- a/source/core/SetData.cpp
+++ b/source/core/SetData.cpp
@@ -21,7 +21,7 @@
 */
 #include "SetData.h"
-#include "CopyValues.h"
+#include "../movement/CopyValues.h"
 #if !defined( WIN32 ) && !defined( _WIN32 )
    #include "sys/time.h"
@@ -68,7 +68,8 @@ void SetDataRand(XTensor * tensor, DTYPE low, DTYPE high)
            ShowNTErrors("TODO");
        }
    }
-    /* GPU code
+    /* 
+    GPU code
    The trick here is that initialize the data on a temperary tensor on CPU.
    The CPU data is then copied to GPU.
    TODO: generate data points on GPUs straightforwardly.

--- a/source/core/SetData.cu
+++ b/source/core/SetData.cu
--- a/source/core/SetData.cuh
+++ b/source/core/SetData.cuh
--- a/source/core/SetData.h
+++ b/source/core/SetData.h
@@ -23,7 +23,7 @@
 #ifndef __SETDATA_H__
 #define __SETDATA_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/Normalize.cpp
+++ b/source/core/Normalize.cpp
@@ -20,11 +20,12 @@
 */
 #include <math.h>
-#include "../XTensor.h"
+#include "../../XTensor.h"
 #include "Normalize.h"
 #include "Normalize.cuh"
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /*
 normalized the data with normal distribution. For an input x,
 y = a * (x-mean)/sqrt(variance+\epsilon) + b

--- a/source/core/Normalize.cu
+++ b/source/core/Normalize.cu
@@ -19,12 +19,13 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XDevice.h"
+#include "../../XDevice.h"
-#include "../XTensor.h"
+#include "../../XTensor.h"
 #include "Normalize.h"
 #include "Normalize.cuh"
 namespace nts { // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA
 /*
 normalized the data with normal distribution (kernel code). For an input x,

--- a/source/core/Normalize.cuh
+++ b/source/core/Normalize.cuh
@@ -28,7 +28,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA
-/* normalized the data with normal distribution (Kernel code). For an input x,
+/* 
+normalized the data with normal distribution (Kernel code). For an input x,
 y = a * (x-mean)/sqrt(variance+\epsilon) + b
 where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter
 */
@@ -37,7 +38,8 @@ void KernelNormalize(DTYPE * input, DTYPE * output, DTYPE * mean, DTYPE * var,
    DTYPE * a, DTYPE * b, DTYPE epsilon,
    int stride, int strideNum, int blockNum);
-/* normalized the data with normal distribution. For an input x,
+/* 
+normalized the data with normal distribution. For an input x,
 y = a * (x-mean)/sqrt(variance+\epsilon) + b
 where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter
 */

--- a/source/core/Normalize.h
+++ b/source/core/Normalize.h
@@ -22,7 +22,7 @@
 #ifndef __NORMALIZE_H__
 #define __NORMALIZE_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/Power.cpp
+++ b/source/core/Power.cpp
@@ -20,15 +20,16 @@
 */
 #include <math.h>
-#include "../XTensor.h"
+#include "../../XTensor.h"
 #include "Power.h"
 #include "Power.cuh"
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /*
 get the power(a, p)
 >> a - the tensor
->> power - as it is
+>> p - as it is
 */
 void Power(XTensor * a, DTYPE p)
 {

--- a/source/core/Power.cu
+++ b/source/core/Power.cu
@@ -19,8 +19,8 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XDevice.h"
+#include "../../XDevice.h"
-#include "../XTensor.h"
+#include "../../XTensor.h"
 #include "Power.h"
 #include "Power.cuh"
@@ -87,9 +87,6 @@ __global__
 void KernelPower(__half * d, __half p, int size)
 {
 #if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
-    //int i = blockDim.x * blockIdx.x + threadIdx.x;
-    //if (i < size)
-    //    d[i] = hpow(d[i], p);
 #else
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < size)
@@ -126,9 +123,6 @@ void CudaPower(XTensor * a, DTYPE p)
        }
        else if (p != (DTYPE)1.0) {
            ShowNTErrors("TODO!");
-            //unsigned short p2 = FloatToFloat16(p);
-            //__half * pp = (__half*)&p2;
-            //KernelPower<<<blocks, threads>>>((__half*)a->data, *pp, a->unitNum);
        }
    }
    else {

--- a/source/core/Power.cuh
+++ b/source/core/Power.cuh
--- a/source/core/Power.h
+++ b/source/core/Power.h
@@ -22,7 +22,7 @@
 #ifndef __POWER_H__
 #define __POWER_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/ScaleAndShift.cpp
+++ b/source/core/ScaleAndShift.cpp
@@ -26,9 +26,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
 /* 
 scale and shift all tensor entries
 p = p * scale + shift
 >> a - the tensor
 >> scale - the scaling factor
 >> shift - the shift factor

--- a/source/core/ScaleAndShift.cu
+++ b/source/core/ScaleAndShift.cu
@@ -21,7 +21,7 @@
 #include "ScaleAndShift.h"
 #include "ScaleAndShift.cuh"
-#include "../XDevice.h"
+#include "../../XDevice.h"
 namespace nts{ // namespace nts(NiuTrans.Tensor)
@@ -80,9 +80,7 @@ void KernelScaleAndShift(__half * d, int size, __half scale, __half shift)
 /* 
 scale and shift all matrix entries
 p = p * scale + shift
 >> a - the tensor
 >> scale - the scaling factor
 >> shift - the shift factor

--- a/source/core/ScaleAndShift.cuh
+++ b/source/core/ScaleAndShift.cuh
@@ -22,7 +22,7 @@
 #ifndef __SCALEANDSHIFT_CUH__
 #define __SCALEANDSHIFT_CUH__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts{ // namespace nts(NiuTrans.Tensor)

--- a/source/core/ScaleAndShift.h
+++ b/source/core/ScaleAndShift.h
@@ -22,7 +22,7 @@
 #ifndef __SCALEANDSHIFT_H__
 #define __SCALEANDSHIFT_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts{ // namespace nts(NiuTrans.Tensor)

--- a/source/core/CopyBlocks.cpp
+++ b/source/core/CopyBlocks.cpp
@@ -19,8 +19,8 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XTensor.h"
+#include "../../XTensor.h"
-#include "../XUtility.h"
+#include "../../XUtility.h"
 #include "CopyBlocks.h"
 #include "CopyBlocksOnSite.h"
 #include "CopyBlocksSelected.cuh"
@@ -78,9 +78,11 @@ void CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum, 
    else {
        int devID = myMem != NULL ? myMem->devID : -1;
-        /* The following code should be fine with GPUs, but too many
+        /* 
+        The following code should be fine with GPUs, but too many
        kernel calls would slow down the system. We prefer to use
-        one kernel to do block copy in batch (kernel fusion). */
+        one kernel to do block copy in batch (kernel fusion). 
+        */
        for (int i = 0; i < blockNum; i++) {
            XMemCopy((char*)target + targetBlocks[i] * blockSize, devID,
                (char*)source + sourceBlocks[i] * blockSize, devID, blockSize);

--- a/source/core/CopyBlocks.h
+++ b/source/core/CopyBlocks.h
@@ -22,7 +22,7 @@
 #ifndef __COPYBLOCKS_H__
 #define __COPYBLOCKS_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/CopyBlocksInGrid.cpp
+++ b/source/core/CopyBlocksInGrid.cpp
@@ -19,9 +19,9 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XTensor.h"
+#include "../../XTensor.h"
 #include "CopyBlocksInGrid.h"
-#include "../XUtility.h"
+#include "../../XUtility.h"
 #include "CopyBlocksInGrid.cuh"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/CopyBlocksInGrid.cu
+++ b/source/core/CopyBlocksInGrid.cu
@@ -21,7 +21,7 @@
 #include "CopyBlocksInGrid.h"
 #include "CopyBlocksInGrid.cuh"
-#include "../XDevice.h"
+#include "../../XDevice.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/CopyBlocksInGrid.cuh
+++ b/source/core/CopyBlocksInGrid.cuh
@@ -22,7 +22,7 @@
 #ifndef __COPYBLOCKSINGRID_CUH__
 #define __COPYBLOCKSINGRID_CUH__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/CopyBlocksInGrid.h
+++ b/source/core/CopyBlocksInGrid.h
@@ -22,7 +22,7 @@
 #ifndef __COPYBLOCKSINGRID_H__
 #define __COPYBLOCKSINGRID_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/CopyBlocksOnSite.cpp
+++ b/source/core/CopyBlocksOnSite.cpp
@@ -19,12 +19,13 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XTensor.h"
+#include "../../XTensor.h"
-#include "../XUtility.h"
+#include "../../XUtility.h"
 #include "CopyBlocksOnSite.h"
 #include "CopyBlocksOnSite.cuh"
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /*
 copy a number of blocks to target positions. Here we assume that
 all the data has been on the device (CPU/GPU) already.
@@ -47,9 +48,11 @@ void CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target,
    else {
        int devID = myMem != NULL ? myMem->devID : -1;
-        /* The following code should be fine with GPUs, but too many
+        /* 
+        The following code should be fine with GPUs, but too many
        kernel calls would slow down the system. We prefer to use
-        one kernel to do block copy in batch (kernel fusion). */
+        one kernel to do block copy in batch (kernel fusion). 
+        */
        for (int i = 0, b = 0; i < blockNum; i++, b += blockSize) {
            XMemCopy((char*)target + targetBlocks[i] * blockSize, devID,
                (char*)source + b, devID, blockSize);

--- a/source/core/CopyBlocksOnSite.cu
+++ b/source/core/CopyBlocksOnSite.cu
@@ -21,7 +21,7 @@
 #include "CopyBlocksOnSite.h"
 #include "CopyBlocksOnSite.cuh"
-#include "../XDevice.h"
+#include "../../XDevice.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/CopyBlocksOnSite.cuh
+++ b/source/core/CopyBlocksOnSite.cuh
@@ -22,7 +22,7 @@
 #ifndef __COPYBLOCKS_CUH__
 #define __COPYBLOCKS_CUH__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/CopyBlocksOnSite.h
+++ b/source/core/CopyBlocksOnSite.h
@@ -22,7 +22,7 @@
 #ifndef __COPYBLOCKSONSITE_H__
 #define __COPYBLOCKSONSITE_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/CopyBlocksSelected.cu
+++ b/source/core/CopyBlocksSelected.cu
@@ -21,8 +21,8 @@
 #include "CopyBlocks.h"
 #include "CopyBlocksSelected.cuh"
-#include "../XUtility.h"
+#include "../../XUtility.h"
-#include "../XDevice.h"
+#include "../../XDevice.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/CopyBlocksSelected.cuh
+++ b/source/core/CopyBlocksSelected.cuh
@@ -22,7 +22,7 @@
 #ifndef __COPYBLOCKSSELECTED_CUH__
 #define __COPYBLOCKSSELECTED_CUH__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/CopyData2D.cpp
+++ b/source/core/CopyData2D.cpp
@@ -19,9 +19,9 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XTensor.h"
+#include "../../XTensor.h"
 #include "CopyData2D.h"
-#include "../XUtility.h"
+#include "../../XUtility.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/CopyData2D.h
+++ b/source/core/CopyData2D.h
@@ -22,7 +22,7 @@
 #ifndef __COPYDATA2D_H__
 #define __COPYDATA2D_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/CopyInGrid.cpp
+++ b/source/core/CopyInGrid.cpp
@@ -19,7 +19,7 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XTensor.h"
+#include "../../XTensor.h"
 #include "CopyInGrid.h"
 #include "CopyBlocksInGrid.h"
@@ -34,7 +34,7 @@ i.e., reorder the data blocks in the same memory piece
 in the k-th grid
 >> blockDim - leading dimension of blocks
 >> blockNumInGrid - number of blocks in each grid
->> isOnDev - indicates whether the index is on the device already
+>> isIndexOnDev - indicates whether the index is on the device already
 */
 void CopyInGrid(XTensor * s, XTensor * t, int * index, int blockDim, int blockNumInGrid, bool isIndexOnDev)
 {

--- a/source/core/CopyInGrid.h
+++ b/source/core/CopyInGrid.h
@@ -22,7 +22,7 @@
 #ifndef __COPYINGRID_H__
 #define __COPYINGRID_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/CopyIndexed.cpp
+++ b/source/core/CopyIndexed.cpp
@@ -36,6 +36,7 @@ copy indexed sub-tensors
 >> tgtIndex - index of the target sub-tensors
 >> copyNum - number of the sub-tensors we copy for each source index, e.g.,
 for srcIndex = [1,4] and copyNum = 2, we actually copy the source sub-tensors 1, 2, 4, 5
+<< return - whether copy indexed operation was successful
 */
 bool CopyIndexed(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum)
 {

--- a/source/core/CopyIndexed.h
+++ b/source/core/CopyIndexed.h
@@ -22,7 +22,7 @@
 #ifndef __COPYINDEXED_H__
 #define __COPYINDEXED_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/CopyValues.cpp
+++ b/source/core/CopyValues.cpp
@@ -19,7 +19,7 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XName.h"
+#include "../../XName.h"
 #include "CopyValues.h"
 #include "CopyValues.cuh"

--- a/source/core/CopyValues.cu
+++ b/source/core/CopyValues.cu
@@ -21,8 +21,8 @@
 #include "CopyValues.h"
 #include "CopyValues.cuh"
-#include "../XUtility.h"
+#include "../../XUtility.h"
-#include "../XDevice.h"
+#include "../../XDevice.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/CopyValues.cuh
+++ b/source/core/CopyValues.cuh
@@ -22,13 +22,12 @@
 #ifndef __COPYVALUES_CUH__
 #define __COPYVALUES_CUH__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA
-/**************************************/
 /* copy all elements from a source matrix to a target matrix */
 extern "C"
 bool CudaCopyValues(XTensor * s, XTensor * t, XStream * stream = NULL);

--- a/source/core/CopyValues.h
+++ b/source/core/CopyValues.h
@@ -22,7 +22,7 @@
 #ifndef __COPYVALUES_H__
 #define __COPYVALUES_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/ReduceMax.cpp
+++ b/source/core/ReduceMax.cpp
@@ -19,8 +19,8 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XTensor.h"
+#include "../../XTensor.h"
-#include "../XName.h"
+#include "../../XName.h"
 #include "ReduceMax.h"
 #include "ReduceMax.cuh"

--- a/source/core/ReduceMax.cu
+++ b/source/core/ReduceMax.cu
@@ -19,9 +19,9 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XDevice.h"
+#include "../../XDevice.h"
-#include "../XTensor.h"
+#include "../../XTensor.h"
-#include "../XUtility.h"
+#include "../../XUtility.h"
 #include "ReduceMax.h"
 #include "ReduceMax.cuh"
@@ -31,14 +31,10 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
 /* 
 reduce a tensor to another that keeps the max value along a dimension  - slow version
 Given a block of data, we go over each dimension i in the stride and we have
 sum_i = max_{0<=j<strideNum} input_{i,j}
 where we can view the block as a matrix and input_{i,j} represent the item at the
 crossing of the i-th column and the j-th row.
 >> input - the input array (representing a tensor)
 >> output - the sum over each block. NOTE: output is also an array
 >> stride - stride that we need to move to the next item
@@ -89,29 +85,25 @@ void KernelReduceMax(DTYPE * input, DTYPE * output,
 }
- /*
+/*
- reduce a tensor to another that keeps the max value along a dimension  - slow version
+reduce a tensor to another that keeps the max value along a dimension  - slow version
+Given a block of data, we go over each dimension i in the stride and we have
- Given a block of data, we go over each dimension i in the stride and we have
+sum_i = max_{0<=j<strideNum} input_{i,j}
+where we can view the block as a matrix and input_{i,j} represent the item at the
- sum_i = max_{0<=j<strideNum} input_{i,j}
+crossing of the i-th column and the j-th row.
+>> input - the input array (representing a tensor)
- where we can view the block as a matrix and input_{i,j} represent the item at the
+>> output - the sum over each block. NOTE: output is also an array
- crossing of the i-th columne and the j-th row.
+>> stride - stride that we need to move to the next item
+>> strideNum - how many strides we need to finish the reduce
- >> input - the input array (representing a tensor)
+>> reducedStrideNum - the number of strides after reduction
- >> output - the sum over each block. NOTE: output is also an array
+>> blockSize - size of the block (i.e., stride * strideNum)
- >> stride - stride that we need to move to the next item
+>> blockNum - how many blocks
- >> strideNum - how many strides we need to finish the reduce
+*/
- >> reducedStrideNum - the number of strides after reducation
+__global__
- >> blockSize - size of the block (i.e., stride * strideNum)
+void KernelReduceMax(__half * input, __half * output,
- >> blockNum - how many blocks
- */
- __global__
- void KernelReduceMax(__half * input, __half * output,
        int stride, int strideNum, int reducedStrideNum,
        int blockSize, int blockNum)
- {
+{
    int idx = threadIdx.x * blockDim.y + threadIdx.y;
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
    unsigned int j = blockIdx.y*blockDim.y + threadIdx.y;
@@ -164,7 +156,6 @@ void KernelReduceMax(DTYPE * input, DTYPE * output,
 }
 /* 
 reduce a tensor to another that keeps the max value along a dimension  - fast version
 >> input - the input array (representing a tensor)
@@ -338,9 +329,7 @@ void KernelReduceMaxSimpleFast(DTYPE * input, DTYPE * output,
 /* 
 get the max-valued items along a dimension of the tensor (cuda version). 
 For a 1-dimensional data array a,
 sum_i = max_{0<=j<strideNum} input_{i,j}
 >> input - the input tensor
 >> output - the output tensor
 >> dim - which dimension to reduce

--- a/source/core/ReduceMax.cuh
+++ b/source/core/ReduceMax.cuh
--- a/source/core/ReduceMax.h
+++ b/source/core/ReduceMax.h
@@ -22,7 +22,7 @@
 #ifndef __REDUCEMAX_H__
 #define __REDUCEMAX_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts{ // namespace nts(NiuTrans.Tensor)

--- a/source/core/ReduceMean.cpp
+++ b/source/core/ReduceMean.cpp
@@ -19,7 +19,7 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "ScaleAndShift.h"
+#include "../math/ScaleAndShift.h"
 #include "ReduceSum.h"
 #include "ReduceMean.h"
@@ -28,7 +28,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
 /* 
 get the mean value along a dimension of the tensor. For a 1-dimensional data array a,
 mean = (1/n) * sum_i input_i
 >> input - the input tensor
 >> output - the output tensor
 >> dim - the dimension where the reduction is performed on
@@ -44,5 +43,4 @@ void ReduceMean(XTensor * input, XTensor * output, int dim)
    ScaleAndShift(output, (DTYPE)1/num, 0);
 }
 } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/core/ReduceMean.h
+++ b/source/core/ReduceMean.h
@@ -22,7 +22,7 @@
 #ifndef __REDUCEMEAN_H__
 #define __REDUCEMEAN_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts{ // namespace nts(NiuTrans.Tensor)

--- a/source/core/ReduceStandardVariance.h
+++ b/source/core/ReduceStandardVariance.h
@@ -22,7 +22,7 @@
 #ifndef __REDUCESTANDARDVARIANCE_H__
 #define __REDUCESTANDARDVARIANCE_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/ReduceSum.cpp
+++ b/source/core/ReduceSum.cpp
@@ -22,7 +22,7 @@
 #include <math.h>
 #include "ReduceSum.h"
 #include "ReduceSum.cuh"
-#include "../XName.h"
+#include "../../XName.h"
 namespace nts{ // namespace nts(NiuTrans.Tensor)

--- a/source/core/ReduceSum.cu
+++ b/source/core/ReduceSum.cu
@@ -19,8 +19,8 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XDevice.h"
+#include "../../XDevice.h"
-#include "../XUtility.h"
+#include "../../XUtility.h"
 #include "ReduceSum.cuh"
 namespace nts{ // namespace nts(NiuTrans.Tensor)
@@ -29,13 +29,11 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
 /* 
 reduce a tensor to another that keeps the sum along a dimension  - slow version
 Given a block of data, we go over each dimension i in the stride and we have
 sum_i = sum_{0<=j<strideNum} exp(input_{i,j} - shift) if isExp == true;
      = sum_{0<=j<strideNum} input_{i,j} - shift if isExp == false;
 where we can view the block as a matrix and input_{i,j} represent the item at the
 crossing of the i-th column and the j-th row.
 >> input - the input array (representing a tensor)
 >> output - the sum over each block. NOTE: output is also an array
 >> stride - stride that we need to move to the next item
@@ -107,13 +105,11 @@ void KernelReduceSum(DTYPE * input, DTYPE * output,
 /* 
 reduce a tensor to another that keeps the sum along a dimension  - slow version
 This is for float16 reduction.
 Given a block of data, we go over each dimension i in the stride and we have
 sum_i = sum_{0<=j<strideNum} exp(input_{i,j} - shift) if isExp == true;
      = sum_{0<=j<strideNum} input_{i,j} - shift if isExp == false;
 where we can view the block as a matrix and input_{i,j} represent the item at the
 crossing of the i-th column and the j-th row.
 >> input - the input array (representing a tensor)
 >> output - the sum over each block. NOTE: output is also an array
 >> stride - stride that we need to move to the next item
@@ -304,7 +300,6 @@ void KernelReduceSumFast(DTYPE * input, DTYPE * output,
 /* 
 reduce a tensor to another that keeps the sum along a dimension  - fast version
 This is for float16 reduction
 >> input - the input array (representing a tensor)
 >> output - the sum over each block. NOTE: output is also an array
 >> stride - stride that we need to move to the next item

--- a/source/core/ReduceSum.cuh
+++ b/source/core/ReduceSum.cuh
--- a/source/core/ReduceSum.h
+++ b/source/core/ReduceSum.h
@@ -22,7 +22,7 @@
 #ifndef __REDUCESUM_H__
 #define __REDUCESUM_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts{ // namespace nts(NiuTrans.Tensor)

--- a/source/core/ReduceSumSquared.cpp
+++ b/source/core/ReduceSumSquared.cpp
@@ -28,7 +28,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
 squared sum of the items along a dimension of the tensor. 
 For a 1-dimensional data array a,
 sum = \sum_i (a_i - shift)^2
 >> input - the input tensor
 >> output - the output tensor
 >> dim - the dimension where the reduction is performed on

--- a/source/core/ReduceSumSquared.h
+++ b/source/core/ReduceSumSquared.h
@@ -22,7 +22,7 @@
 #ifndef __REDUCESUMSQUARED_H__
 #define __REDUCESUMSQUARED_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts{ // namespace nts(NiuTrans.Tensor)

--- a/source/core/ReduceVariance.cpp
+++ b/source/core/ReduceVariance.cpp
@@ -19,7 +19,7 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "ScaleAndShift.h"
+#include "../math/ScaleAndShift.h"
 #include "ReduceSum.h"
 #include "ReduceVariance.h"
@@ -29,7 +29,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
 variance of the items along a dimension of the tensor. 
 For a 1-dimensional data array a,
 variance = 1/n * \sum_i (a_i - mean)^2
 >> input - the input tensor
 >> output - the output tensor
 >> dim - the dimension where the reduction is performed on

--- a/source/core/ReduceVariance.h
+++ b/source/core/ReduceVariance.h
@@ -22,7 +22,7 @@
 #ifndef __REDUCEVARIANCE_H__
 #define __REDUCEVARIANCE_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts{ // namespace nts(NiuTrans.Tensor)

--- a/source/core/Concatenate.cpp
+++ b/source/core/Concatenate.cpp
@@ -19,7 +19,7 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XTensor.h"
+#include "../../XTensor.h"
 #include "Concatenate.h"
 #include "Merge.h"
 #include "ConcatenateSolely.h"
@@ -53,6 +53,10 @@ void Concatenate(XList * smalls, XTensor * big, int dim)
 /*
 concatenate two tensors along a given dimension
+>> smallA - one tensor for concatenation
+>> smallB - the other tensor for concatenation
+>> big - the resulting tensor
+>> dim - which dimension we perform the concatenation
 */
 void Concatenate(XTensor * smallA, XTensor * smallB, XTensor * big, int dim)
 {

--- a/source/core/Concatenate.h
+++ b/source/core/Concatenate.h
@@ -22,14 +22,15 @@
 #ifndef __CONCATENATE_H__
 #define __CONCATENATE_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /*
 concatenate a list of tensors along a given dimension
 Note that this is actually a wrapper that selects "ConcatenateSolely"
-or "Merge" by means of the tensor shapes */
+or "Merge" by means of the tensor shapes 
+*/
 void Concatenate(XList * smalls, XTensor * big, int dim);
 /* concatenate two tensors along a given dimension */

--- a/source/core/ConcatenateSolely.cpp
+++ b/source/core/ConcatenateSolely.cpp
@@ -19,9 +19,9 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XTensor.h"
+#include "../../XTensor.h"
-#include "../XUtility.h"
+#include "../../XUtility.h"
-#include "../XName.h"
+#include "../../XName.h"
 #include "ConcatenateSolely.h"
 #include "MergeBlockLists.h"
@@ -69,9 +69,11 @@ void ConcatenateSolely(XList * smalls, XTensor * big, int dim)
    int offset = 0;
-    /* two strategies are used - we can either resort to memcpy2d for the case of
+    /* 
+    two strategies are used - we can either resort to memcpy2d for the case of
    concatenation of a few items, or use MergeBlockLists to merge a large number
-    of data blocks */
+    of data blocks 
+    */
    if (smalls->count <= MIN_TENSOR_CAT_NUM) {
        for (int i = 0; i < smalls->count; i++) {
            XTensor * tensor = (XTensor*)smalls->GetItem(i);

--- a/source/core/ConcatenateSolely.h
+++ b/source/core/ConcatenateSolely.h
@@ -22,11 +22,10 @@
 #ifndef __CONCATENATESOLELY_H__
 #define __CONCATENATESOLELY_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /* concatenate a list of tensors along a given dimension */
 extern "C"
 void ConcatenateSolely(XList * smalls, XTensor * big, int dim);

--- a/source/core/MakeMergeBlockIndex.cpp
+++ b/source/core/MakeMergeBlockIndex.cpp
@@ -19,7 +19,7 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XTensor.h"
+#include "../../XTensor.h"
 #include "MakeMergeBlockIndex.h"
 #include "MakeMergeBlockIndex.cuh"

--- a/source/core/MakeMergeBlockIndex.cu
+++ b/source/core/MakeMergeBlockIndex.cu
@@ -19,8 +19,8 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XDevice.h"
+#include "../../XDevice.h"
-#include "../XTensor.h"
+#include "../../XTensor.h"
 #include "MakeMergeBlockIndex.h"
 #include "MakeMergeBlockIndex.cuh"

--- a/source/core/MakeMergeBlockIndex.cuh
+++ b/source/core/MakeMergeBlockIndex.cuh
@@ -22,7 +22,7 @@
 #ifndef __CUDAMAKEMERGEBLOCKINDEX_CUH__
 #define __CUDAMAKEMERGEBLOCKINDEX_CUH__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/MakeMergeBlockIndex.h
+++ b/source/core/MakeMergeBlockIndex.h
@@ -22,7 +22,7 @@
 #ifndef __MAKEMERGEBLOCKINDEX_H__
 #define __MAKEMERGEBLOCKINDEX_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/MakeSplitBlockIndex.cpp
+++ b/source/core/MakeSplitBlockIndex.cpp
@@ -19,11 +19,12 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XTensor.h"
+#include "../../XTensor.h"
 #include "MakeSplitBlockIndex.h"
 #include "MakeSplitBlockIndex.cuh"
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /*
 set target data block index for the data movement in split
 >> blockIndex - block index

--- a/source/core/MakeSplitBlockIndex.cu
+++ b/source/core/MakeSplitBlockIndex.cu
@@ -19,8 +19,8 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XDevice.h"
+#include "../../XDevice.h"
-#include "../XTensor.h"
+#include "../../XTensor.h"
 #include "MakeSplitBlockIndex.h"
 #include "MakeSplitBlockIndex.cuh"
@@ -51,6 +51,7 @@ void KernelMakeSplitBlockIndex(int * blockIndex, int splitNum, int blockSplitSiz
 /*
 set target data block index for the data movement in split
+>> devID - device id
 >> blockIndex - block index
 >> splitNum - number of splits
 >> blockSplitSize - size of the split block

--- a/source/core/MakeSplitBlockIndex.cuh
+++ b/source/core/MakeSplitBlockIndex.cuh
--- a/source/core/MakeSplitBlockIndex.h
+++ b/source/core/MakeSplitBlockIndex.h
@@ -22,7 +22,7 @@
 #ifndef __MAKESPLITBLOCKINDEX_H__
 #define __MAKESPLITBLOCKINDEX_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/Merge.cpp
+++ b/source/core/Merge.cpp
@@ -19,16 +19,15 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XTensor.h"
+#include "../../XTensor.h"
-#include "../XUtility.h"
+#include "../../XUtility.h"
-#include "../XName.h"
+#include "../../XName.h"
 #include "Merge.h"
 #include "MakeMergeBlockIndex.h"
-#include "CopyBlocksOnSite.h"
+#include "../movement/CopyBlocksOnSite.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /*
 transform a tensor by merging it along a dimension, e.g., (N/3, M, 3) -> (N, M)
 >> s - the source tensor

--- a/source/core/Merge.h
+++ b/source/core/Merge.h
@@ -22,7 +22,7 @@
 #ifndef __MERGE_H__
 #define __MERGE_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/MergeBlockLists.cpp
+++ b/source/core/MergeBlockLists.cpp
@@ -19,8 +19,8 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XTensor.h"
+#include "../../XTensor.h"
-#include "../XUtility.h"
+#include "../../XUtility.h"
 #include "MergeBlockLists.h"
 #include "MergeBlockLists.cuh"

--- a/source/core/MergeBlockLists.cu
+++ b/source/core/MergeBlockLists.cu
@@ -19,9 +19,9 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XDevice.h"
+#include "../../XDevice.h"
-#include "../XUtility.h"
+#include "../../XUtility.h"
-#include "../XTensor.h"
+#include "../../XTensor.h"
 #include "MergeBlockLists.h"
 #include "MergeBlockLists.cuh"
@@ -34,10 +34,9 @@ copy a number of blocks (of different sizes) to target positions
 >> sourceBlockSizes - the size of the block_i
 >> sourceBlockNum - number of blocks to merge
 >> targetList - list of data arrays to copy to
->> target - target data array
 */
 __global__
-    void KernelCopyBlockLists(DTYPE * sourceList[], int * sourceBlockSizes, int sourceBlockNum, DTYPE * targetList[])
+void KernelCopyBlockLists(DTYPE * sourceList[], int * sourceBlockSizes, int sourceBlockNum, DTYPE * targetList[])
 {
    __shared__ int iBlockSizes[MAX_CUDA_THREAD_NUM_PER_BLOCK];
    __shared__ DTYPE * iSourceList[MAX_CUDA_THREAD_NUM_PER_BLOCK];
@@ -82,7 +81,6 @@ void CudaMergeBlockLists(XList * sourceList, int * blockSizes, int blockNum, voi
    int minBlockSize = MAX_INT;
    int maxBlockSize = -MAX_INT;
-    //int realMinBlockSize = 1;
    int realMaxBlockSize = 1;
    DTYPE ** sourceArrays = new DTYPE*[newBlockListSize];
    DTYPE ** targetArrays = new DTYPE*[newBlockListSize];
@@ -110,7 +108,6 @@ void CudaMergeBlockLists(XList * sourceList, int * blockSizes, int blockNum, voi
    CheckNTErrors((minBlockSize % sizeof(DTYPE) == 0), "Unsupported block size!");
    CheckNTErrors((maxBlockSize % sizeof(DTYPE) == 0), "Unsupported block size!");
-    //realMinBlockSize = minBlockSize/sizeof(DTYPE);
    realMaxBlockSize = maxBlockSize / sizeof(DTYPE);
    int cudaGridSizes[3];
@@ -120,31 +117,16 @@ void CudaMergeBlockLists(XList * sourceList, int * blockSizes, int blockNum, voi
                          cudaGridSizes, cudaBlockSizes);
    myMem->SetPinBuf();
-    //MTYPE offset0 = myMem->bufUsed;
    int * sizesGPU = (int*)myMem->AllocBuf(myMem->devID, sizeof(int) * newBlockListSize, 256);
-    //MTYPE offset1 = myMem->bufUsed;
    DTYPE ** sourceArraysGPU = (DTYPE**)myMem->AllocBuf(myMem->devID, sizeof(DTYPE*) * newBlockListSize, 256);
-    //MTYPE offset2 = myMem->bufUsed;
    DTYPE ** targetArraysGPU = (DTYPE**)myMem->AllocBuf(myMem->devID, sizeof(DTYPE*) * newBlockListSize, 256);
-    //MTYPE bufSize = myMem->bufUsed - offset0;
-    //char * CPUBuf = new char[bufSize];
-    //memset(CPUBuf, 0 , bufSize);
-    //memcpy(CPUBuf, sizes, sizeof(int) * newBlockListSize);
-    //memcpy(CPUBuf + (offset1 - offset0), sourceArrays, sizeof(DTYPE*) * newBlockListSize);
-    //memcpy(CPUBuf + (offset2 - offset0), targetArrays, sizeof(DTYPE*) * newBlockListSize);
    XMemCopy(sizesGPU, myMem->devID, sizes, -1, sizeof(int) * newBlockListSize);
    XMemCopy(sourceArraysGPU, myMem->devID, sourceArrays, -1, sizeof(DTYPE*) * newBlockListSize);
    XMemCopy(targetArraysGPU, myMem->devID, targetArrays, -1, sizeof(DTYPE*) * newBlockListSize);
-    /* it is VERY tricky here because we squeeze three data copies into one */
-    //XMemCopy(sizesGPU, myMem->devID, CPUBuf, -1, bufSize);
    KernelCopyBlockLists << <dim3(cudaGridSizes[0], cudaGridSizes[1]), dim3(cudaBlockSizes[0], cudaBlockSizes[1]) >> >
                            (sourceArraysGPU, sizesGPU, newBlockListSize, targetArraysGPU);
@@ -154,7 +136,6 @@ void CudaMergeBlockLists(XList * sourceList, int * blockSizes, int blockNum, voi
    delete[] targetArrays;
    delete[] sizes;
    delete[] offsets;
-    //delete[] CPUBuf;
 }
 #endif // USE_CUDA

--- a/source/core/MergeBlockLists.cuh
+++ b/source/core/MergeBlockLists.cuh
--- a/source/core/MergeBlockLists.h
+++ b/source/core/MergeBlockLists.h
@@ -22,7 +22,7 @@
 #ifndef __MERGEBLOCKLISTS_H__
 #define __MERGEBLOCKLISTS_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/Permute.cpp
+++ b/source/core/Permute.cpp
--- a/source/core/Permute.h
+++ b/source/core/Permute.h
@@ -22,7 +22,7 @@
 #ifndef __PERMUTE_H__
 #define __PERMUTE_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/Split.cpp
+++ b/source/core/Split.cpp
@@ -19,11 +19,11 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XTensor.h"
+#include "../../XTensor.h"
-#include "../XUtility.h"
+#include "../../XUtility.h"
 #include "Split.h"
 #include "MakeSplitBlockIndex.h"
-#include "CopyBlocksOnSite.h"
+#include "../movement/CopyBlocksOnSite.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/Split.h
+++ b/source/core/Split.h
@@ -22,11 +22,12 @@
 #ifndef __SPLIT_H__
 #define __SPLIT_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /* transform a tensor by splitting it, e.g., (M, N) -> (M, N/3, 3) */
+extern "C"
 void Split(XTensor * s, XTensor * t, int whereToSplit, int splitNum);
 /* split a big tensor into small tensors */

--- a/source/core/Transpose.cpp
+++ b/source/core/Transpose.cpp
--- a/source/core/Transpose.h
+++ b/source/core/Transpose.h
@@ -23,7 +23,7 @@
 #ifndef __TRANSPOSE_H__
 #define __TRANSPOSE_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/Unsqueeze.cpp
+++ b/source/core/Unsqueeze.cpp
@@ -19,8 +19,8 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XTensor.h"
+#include "../../XTensor.h"
-#include "../XName.h"
+#include "../../XName.h"
 #include "Unsqueeze.h"
 #include "MergeBlockLists.h"
 #include "Unsqueeze.cuh"

--- a/source/core/Unsqueeze.cu
+++ b/source/core/Unsqueeze.cu
@@ -19,8 +19,8 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XDevice.h"
+#include "../../XDevice.h"
-#include "../XTensor.h"
+#include "../../XTensor.h"
 #include "Unsqueeze.h"
 #include "Unsqueeze.cuh"

--- a/source/core/Unsqueeze.cuh
+++ b/source/core/Unsqueeze.cuh
--- a/source/core/Unsqueeze.h
+++ b/source/core/Unsqueeze.h
@@ -22,7 +22,7 @@
 #ifndef __UNSQUEEZE_H__
 #define __UNSQUEEZE_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/Sort.cpp
+++ b/source/core/Sort.cpp
@@ -19,9 +19,9 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XTensor.h"
+#include "../../XTensor.h"
-#include "../XUtility.h"
+#include "../../XUtility.h"
-#include "../XName.h"
+#include "../../XName.h"
 #include "Sort.h"
 #include "Sort.cuh"

--- a/source/core/Sort.cu
+++ b/source/core/Sort.cu
@@ -19,9 +19,9 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XDevice.h"
+#include "../../XDevice.h"
-#include "../XUtility.h"
+#include "../../XUtility.h"
-#include "../XTensor.h"
+#include "../../XTensor.h"
 #include "Sort.h"
 #include "Sort.cuh"
@@ -235,13 +235,16 @@ void CudaSortBig(XTensor * a, XTensor * b, XTensor * indexA, XTensor * indexB, i
    int m = GetNextPower2(strideNum);
    int n = stride * blockNum;
-    void * buf = mem->AllocBuf(mem->devID, n * m * a->unitSize);
+    void * buf = mem != NULL ? mem->AllocBuf(a->devID, n * m * a->unitSize) : XMemAlloc(a->devID, n * m * a->unitSize);
-    void * bufIndex = (indexA != NULL && indexB != NULL) ? mem->AllocBuf(mem->devID, n * m * sizeof(int)) : NULL;
+    void * bufIndex = NULL;
+    if (indexA != NULL && indexB != NULL) {
+        bufIndex = mem != NULL ? mem->AllocBuf(a->devID, n * m * sizeof(int)) : XMemAlloc(a->devID, n * m * sizeof(int));
+    }
    int cudaGrids[3];
    int cudaBlocks[3];
-    GDevs.GetCudaThread(mem->devID, m * n, cudaGrids, cudaBlocks);
+    GDevs.GetCudaThread(a->devID, m * n, cudaGrids, cudaBlocks);
    int devIDBackup;
    ProtectCudaDev(a->devID, devIDBackup);
@@ -250,7 +253,7 @@ void CudaSortBig(XTensor * a, XTensor * b, XTensor * indexA, XTensor * indexB, i
    KernelSetDataArray<DTYPE> << <dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >> >
                                ((DTYPE*)buf, DTYPE_MIN, m * n);
-    GDevs.GetCudaThread2D(mem->devID, strideNum, n, MAX_INT, cudaGrids, cudaBlocks);
+    GDevs.GetCudaThread2D(a->devID, strideNum, n, MAX_INT, cudaGrids, cudaBlocks);
    /* reorganize the data into a matrix */
    KernelReorganize<DTYPE> << <dim3(cudaGrids[1], cudaGrids[0]), dim3(cudaBlocks[1], cudaBlocks[0]) >> >
@@ -261,7 +264,7 @@ void CudaSortBig(XTensor * a, XTensor * b, XTensor * indexA, XTensor * indexB, i
        KernelReorganize<int> << <dim3(cudaGrids[1], cudaGrids[0]), dim3(cudaBlocks[1], cudaBlocks[0]) >> >
                                      (indexA->data, bufIndex, stride, strideNum, blockNum, m, n);
-    GDevs.GetCudaThread2D(mem->devID, m, n, MAX_INT, cudaGrids, cudaBlocks);
+    GDevs.GetCudaThread2D(a->devID, m, n, MAX_INT, cudaGrids, cudaBlocks);
    /* bitonic sorting */
    for (int i = 2; i <= m; i <<= 1) {
@@ -277,7 +280,7 @@ void CudaSortBig(XTensor * a, XTensor * b, XTensor * indexA, XTensor * indexB, i
        }
    }
-    GDevs.GetCudaThread2D(mem->devID, k, n, MAX_INT, cudaGrids, cudaBlocks);
+    GDevs.GetCudaThread2D(a->devID, k, n, MAX_INT, cudaGrids, cudaBlocks);
    /* copy result to the output tensor */
    KernelReorganizeBack<DTYPE> << <dim3(cudaGrids[1], cudaGrids[0]), dim3(cudaBlocks[1], cudaBlocks[0]) >> >
@@ -287,9 +290,15 @@ void CudaSortBig(XTensor * a, XTensor * b, XTensor * indexA, XTensor * indexB, i
        KernelReorganizeBack<int> << <dim3(cudaGrids[1], cudaGrids[0]), dim3(cudaBlocks[1], cudaBlocks[0]) >> >
                                      (bufIndex, indexB->data, m, n, stride, k, blockNum);
-    mem->ReleaseBuf(mem->devID, n * m * a->unitSize);
+    if (mem != NULL)
+        mem->ReleaseBuf(a->devID, n * m * a->unitSize);
+    else
+        XMemFree(a->devID, buf);
    if (indexA != NULL && indexB != NULL)
-        mem->ReleaseBuf(mem->devID, n * m * sizeof(int));
+        if (mem != NULL)
+            mem->ReleaseBuf(a->devID, n * m * sizeof(int));
+        else
+            XMemFree(a->devID, bufIndex);
    ProtectCudaDev(a->devID, devIDBackup);
 }

--- a/source/core/Sort.cuh
+++ b/source/core/Sort.cuh
@@ -29,6 +29,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA
 /* sort the tensor along a given dimension */
+extern "C"
 void CudaSortBig(XTensor * a, XTensor * b, XTensor * indexA, XTensor * indexB, int dim, int k = -1);
 #endif // USE_CUDA

--- a/source/core/Sort.h
+++ b/source/core/Sort.h
@@ -22,7 +22,7 @@
 #ifndef __SORT_H__
 #define __SORT_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/TopK.cpp
+++ b/source/core/TopK.cpp
@@ -19,12 +19,13 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XTensor.h"
+#include "../../XTensor.h"
-#include "../XName.h"
+#include "../../XName.h"
 #include "TopK.h"
 #include "TopK.cuh"
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /*
 get the top-k items along a given dimension
 >> a - input tensor

--- a/source/core/TopK.cu
+++ b/source/core/TopK.cu
@@ -19,8 +19,9 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XDevice.h"
+#include "../../XDevice.h"
-#include "../XTensor.h"
+#include "../../XUtility.h"
+#include "../../XTensor.h"
 #include "TopK.h"
 #include "TopK.cuh"
 #include "Sort.cuh"
@@ -94,9 +95,6 @@ public:
    /* swap */
    __device__ void Swap(int i, int j)
    {
-        /*CudaHeapNode<T> tmp = items[i];
-        items[i] = items[j];
-        items[j] = tmp;*/
        int tmpIndex = items[i].index;
        T tmpValue = items[i].value;
        items[i] = items[j];
@@ -238,8 +236,10 @@ void KernelTopK(T * input, int stride, int strideNum, int blockNum, int k, T min
    if (threadIdx.x == 0) {
        CudaXHeap<MIN_HEAP, T> heapFinal(k, k, heapData + k * threadIdx.y * blockDim.x);
-        /* merge the result over the workers.
+        /* 
-        This can be improved by parallel merging */
+	merge the result over the workers.
+        This can be improved by parallel merging 
+	*/
        if (blockDim.x > 1) {
            for (int p = 1; p < blockDim.x && p < strideNum; p++) {
                CudaHeapNode<T> * hd = heapData + k * (threadIdx.y * blockDim.x + p);
@@ -393,7 +393,7 @@ void CudaTopK(XTensor * a, XTensor * b, XTensor * index, int dim, int k)
    int cudaGrids[3];
    int cudaBlocks[3];
-    GDevs.GetCudaThread2D(a->mem->devID,
+    GDevs.GetCudaThread2D(a->devID,
        workerNum, stride * blockNum, MAX_INT,
        cudaGrids, cudaBlocks);
@@ -434,13 +434,14 @@ void CudaTopK(XTensor * a, XTensor * b, XTensor * index, int dim, int k)
        memcpy(dimSize, a->dimSize, sizeof(int) * a->order);
        dimSize[0] = -dimSize[0];
        XTensor * indexA = new XTensor(a->order, dimSize, X_INT, 1.0F, a->mem);
-        indexA->data = a->mem->AllocBuf(a->devID, a->unitNum * sizeof(int));
+        indexA->data = a->mem != NULL ? a->mem->AllocBuf(a->devID, a->unitNum * sizeof(int)) : XMemAlloc(a->devID, a->unitNum * sizeof(int));
        /* make the index tensor */
        indexA->SetAscendingOrder(dim);
        CudaSortBig(a, b, indexA, index, dim, k);
+        if (a->mem != NULL)
            a->mem->ReleaseBuf(a->devID, a->unitNum * sizeof(int));
        delete indexA;
    }

--- a/source/core/TopK.cuh
+++ b/source/core/TopK.cuh
--- a/source/core/TopK.h
+++ b/source/core/TopK.h
@@ -22,7 +22,7 @@
 #ifndef __TOPK_H__
 #define __TOPK_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/FlushToMem.cpp
+++ b/source/core/FlushToMem.cpp
@@ -19,7 +19,7 @@
 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-06-22
 */
-#include "../XUtility.h"
+#include "../../XUtility.h"
 #include "FlushToMem.h"
 #include "FlushToMem.cuh"

--- a/source/core/FlushToMem.cu
+++ b/source/core/FlushToMem.cu
@@ -20,7 +20,7 @@
 */
 #include "FlushToMem.cuh"
-#include "../XUtility.h"
+#include "../../XUtility.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)
@@ -52,7 +52,6 @@ void CudaCPUToGPUFlush(XList * mList, int devID, XMem * GPUMem)
        else
            reqiredSize = m->unitSize * m->unitNum;
-        //reqiredSize = (int)GPUMem->GetPitch(GPUMem->devID, (MTYPE)GPUMem->GetAddress() + size, reqiredSize);
        size += reqiredSize;
    }
@@ -70,7 +69,6 @@ void CudaCPUToGPUFlush(XList * mList, int devID, XMem * GPUMem)
        else
            pSize = m->unitSize * m->unitNum;
-        //reqiredSize = (int)GPUMem->GetPitch(GPUMem->devID, (MTYPE)GPUMem->GetAddress() + p, pSize);
        reqiredSize = pSize;
        memcpy(data + p, m->data, pSize);

--- a/source/core/FlushToMem.cuh
+++ b/source/core/FlushToMem.cuh
@@ -22,7 +22,7 @@
 #ifndef __FLUSHTOMEM_CUH__
 #define __FLUSHTOMEM_CUH__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/FlushToMem.h
+++ b/source/core/FlushToMem.h
@@ -22,7 +22,7 @@
 #ifndef __FLUSHTOMEM_H__
 #define __FLUSHTOMEM_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/SetAscendingOrder.cu
+++ b/source/core/SetAscendingOrder.cu
@@ -20,7 +20,7 @@
 */
 #include "SetAscendingOrder.cuh"
-#include "../XDevice.h"
+#include "../../XDevice.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/SetAscendingOrder.cuh
+++ b/source/core/SetAscendingOrder.cuh
@@ -22,7 +22,7 @@
 #ifndef __SETASCENDINGORDER_CUH__
 #define __SETASCENDINGORDER_CUH__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/core/XMatrixSegment.cpp
+++ b/source/core/XMatrixSegment.cpp
--- a/source/core/XMatrixSegment.h
+++ b/source/core/XMatrixSegment.h
@@ -22,13 +22,11 @@
 #ifndef __XMATRIXSEGMENT_H__
 #define __XMATRIXSEGMENT_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)
-/*******************************************************************
+/* segmentation and parallel processing for 2d tensors (i.e., matrices) */
-segmentation and parallel processing for 2d tensors (i.e., matrices)
-*/
 /* segment a 2d tensor (i.e., matrix) into blocks and run jobs in parallel */
 extern "C"
 void RunParallel2D(XPRunner * parallelRunner, void * job, int opNum, int rowNum, int colNum, int argNum, ...);

--- a/source/function/HardTanH.cpp
+++ b/source/function/HardTanH.cpp
@@ -25,7 +25,6 @@
 namespace nts{ // namespace nts(NiuTrans.Tensor)
 /*
 hard tanh function 
 y =  1    if x > 1

--- a/source/function/HardTanH.cu
+++ b/source/function/HardTanH.cu
@@ -95,7 +95,6 @@ dy/dx = 1     if -1 <= x <= 1
 >> y - y of the function
 >> x - x of the function
 >> size - size of y/x
 */
 __global__ 
 void KernelHardtanhBackward(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y, DTYPE * x, int size)

--- a/source/function/Identity.cpp
+++ b/source/function/Identity.cpp
@@ -21,7 +21,7 @@
 #include "Identity.h"
 #include "../XUtility.h"
-#include "../core/CopyValues.h"
+#include "../core/movement/CopyValues.h"
 namespace nts{ // namespace nts(NiuTrans.Tensor)

--- a/source/function/LogSoftmax.cpp
+++ b/source/function/LogSoftmax.cpp
@@ -23,9 +23,9 @@
 #include "../XUtility.h"
 #include "LogSoftmax.h"
 #include "LogSoftmax.cuh"
-#include "../core/ReduceSum.h"
+#include "../core/reduce/ReduceSum.h"
-#include "../core/ReduceMax.h"
+#include "../core/reduce/ReduceMax.h"
-#include "../core/CopyValues.h"
+#include "../core/movement/CopyValues.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)
@@ -49,7 +49,6 @@ void LogSoftmax(XTensor * x, XTensor * y, int leadDim)
                dimSize[i - 1] = -x->dimSize[i];
        }
        XMem * mem = x->mem;
        XTensor * max = NULL;
        XTensor * sum = NULL;
@@ -168,7 +167,6 @@ dE/dx = dE/dy * dy/dx
 log softmax: y_i = log(e^{x_i} / \sum_{k} e^{x_k})
  dy_i/dx_j 
 = d{log(e^{x_i} / \sum_{k} e^{x_k})}/dx_j
 = d{log(e^{x_i})}/dx_j - d{log(\sum_{k} e^{x_k})}/dx_j

--- a/source/function/LogSoftmax.cu
+++ b/source/function/LogSoftmax.cu
@@ -22,8 +22,8 @@
 #include "LogSoftmax.h"
 #include "LogSoftmax.cuh"
 #include "Loss.cuh"
-#include "../core/ReduceSum.cuh"
+#include "../core/reduce/ReduceSum.cuh"
-#include "../core/ReduceMax.cuh"
+#include "../core/reduce/ReduceMax.cuh"
 #include "../XDevice.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)
@@ -41,7 +41,8 @@ void CudaLogSoftmax(XTensor * x, XTensor * y, int leadDim)
    ShowNTErrors("You should call LogSoftmax instead!");
 }
-/* log softmax forward computation (Cuda kernel)
+/* 
+log softmax forward computation (Cuda kernel)
 for each column j, let y_{i,j} and x_{i,j} are the output
 and state value for the i-th element of column j. We have
@@ -85,7 +86,8 @@ void KernelLogSoftmaxComputeByRow(DTYPE * x, DTYPE * max, DTYPE * sum, DTYPE * y
    }
 }
-/* log softmax forward computation (Cuda kernel)
+/* 
+log softmax forward computation (Cuda kernel)
 for each row i, let y_{i,j} and x_{i,j} are the output
 and state value for the j-th element of row i. We have
@@ -182,7 +184,7 @@ void CudaLogSoftmaxSumMax(XTensor * x, XTensor * y, int leadDim, XTensor * sum, 
 /*
 set dE/dx = exp(y)
->> dedu - dE/dy
+>> dedy - dE/dy
 >> dedx - dE/dx
 >> y - output of the function
 >> size - size of output
@@ -256,7 +258,9 @@ dE/dx_j += -gold_j
 >> gold - gold standard to measure error (or loss)
 >> y - output of the function
 >> x - input of the function
->> size - size of input/output
+>> rowNum - row number of the matrix
+>> colNum - column number of the matrix
+>> gNonZeroNum - 
 >> lossName - name of the loss function
 */
 __global__
@@ -293,7 +297,6 @@ dE/dx = dE/dy * dy/dx
 log softmax: y_i = log(e^{x_i} / \sum_{k} e^{x_k})
 dy_i/dx_j
 = d{log(e^{x_i} / \sum_{k} e^{x_k})}/dx_j
 = d{log(e^{x_i})}/dx_j - d{log(\sum_{k} e^{x_k})}/dx_j

--- a/source/function/Loss.cpp
+++ b/source/function/Loss.cpp
@@ -374,15 +374,15 @@ void LossBackward(XTensor * dedy, XTensor * t, XTensor * y,
                  LOSS_FUNCTION_NAME LFName, 
                  int leadDim, int tBeg, int tLen, int yBeg)
 {
-    CheckNTErrors((tLen >= 0 && tLen < y->unitNum), "Illegal input length!");
+    CheckNTErrors((tLen < y->unitNum), "Illegal input length!");
    CheckNTErrors((XTensor::IsIdentical(t, y)&& XTensor::IsIdentical(dedy, y)), 
                        "The input tensors must be of the same size!");
-    CheckNTErrors((t->dimSizeRDI[0] == 1 && y->dimSizeRDI[0] == 1 && dedy->dimSizeRDI[1] == 1), "TODO!");
+    CheckNTErrors((t->dimSizeRDI[0] == 1 && y->dimSizeRDI[0] == 1 && dedy->dimSizeRDI[0] == 1), "TODO!");
    CheckNTErrors((t->order > leadDim && leadDim >= 0), "Illegal leading dimension!");
    CheckNTErrors((t->dataType == DEFAULT_DTYPE && y->dataType == DEFAULT_DTYPE),
                         "TODO!");
-    int leadDimRDI = y->order - leadDim - 1;
+    int leadDimRDI = leadDim >= 0 ? y->order - leadDim - 1 : -1;
    if(leadDimRDI < 0){
        leadDimRDI = y->dimSizeRDI[y->order - 1];
        tBeg = 0;

--- a/source/function/Loss.cu
+++ b/source/function/Loss.cu
@@ -31,7 +31,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
 loss function to measure the "number" of errors
 */
 /* 
 compute the loss 
 >> gold - gold standard

--- a/source/function/Rectify.cu
+++ b/source/function/Rectify.cu
@@ -88,7 +88,6 @@ dy/dx =  1    if x >= 0
 >> y - output of the function
 >> x - input of the function
 >> size - size of output/input
 */
 __global__ 
 void KernelRectifyBackward(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y, DTYPE * x, int size)

--- a/source/function/Sigmoid.cpp
+++ b/source/function/Sigmoid.cpp
@@ -25,7 +25,6 @@
 namespace nts{ // namespace nts(NiuTrans.Tensor)
 /*
 sigmoid function y = 1/(1+exp(-x))
 >> x - input tensor

--- a/source/function/Sigmoid.cu
+++ b/source/function/Sigmoid.cu
@@ -95,7 +95,6 @@ sigmoid: y = 1/(1+exp(-x))
 >> y - output of the function
 >> x - input of the function
 >> size - size of output/input
 */
 __global__ 
 void KernelSigmoidBackward(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y, DTYPE * x, int size)
@@ -122,7 +121,6 @@ sigmoid: y = 1/(1+exp(-x))
 >> dedy - dE/dy
 >> dedx - dE/dx
 >> lossName - type of loss function, e.g., cross entropy
 */
 void CudaSigmoidBackward(XTensor * gold, XTensor * y, XTensor * x, 
                         XTensor * dedy, XTensor * dedx,

--- a/source/function/Softmax.cpp
+++ b/source/function/Softmax.cpp
@@ -23,8 +23,8 @@
 #include "Softmax.h"
 #include "Softmax.cuh"
 #include "../XUtility.h"
-#include "../core/ReduceSum.h"
+#include "../core/reduce/ReduceSum.h"
-#include "../core/ReduceMax.h"
+#include "../core/reduce/ReduceMax.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/function/Softmax.cu
+++ b/source/function/Softmax.cu
@@ -22,10 +22,10 @@
 #include "Softmax.h"
 #include "Softmax.cuh"
 #include "Loss.cuh"
-#include "../core/ReduceSum.h"
+#include "../core/reduce/ReduceSum.h"
-#include "../core/Multiply.h"
+#include "../core/arithmetic/Multiply.h"
-#include "../core/Unsqueeze.h"
+#include "../core/shape/Unsqueeze.h"
-#include "../core/Sum.h"
+#include "../core/arithmetic/Sum.h"
 #include "../XDevice.h"
 #include "../XUtility.h"

--- a/source/function/Softmax.cuh
+++ b/source/function/Softmax.cuh
@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA
 /* softmax y = e^x / \sum_{i} e^{x_i} (Cuda version) */
 extern "C"
 void CudaSotmax(XTensor * input, XTensor * output, int leadDim);

--- a/source/test/TConcatenate.cpp
+++ b/source/test/TConcatenate.cpp
@@ -19,23 +19,20 @@
 * $Created by: Lin Ye (email: linye2015@outlook.com) 2018-06-14
 */
+#include "TConcatenate.h"
-#include "../XTensor.h"
-#include "../XDevice.h"
-#include "../core/Concatenate.h"
-#include "../XList.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* case 1: concatenate a list of tensors along a given dimension
-* In this case, 2 * (2 * 1) -> (2 * 2), dim=1.
+/* 
+case 1: concatenate a list of tensors along a given dimension.
+In this case, 2 * (2, 1) -> (2, 2), dim=1.
 */
 bool TestConcatenate1()
 {
 	/* create list */
-    XList sList;
+    XList * sList = new XList();
-    sList = XList();
-    /* a source tensor of size (2 * 1) */
+    /* a source tensor of size (2, 1) */
    int sOrder1 = 2;
    int * sDimSize1 = new int[sOrder1];
    sDimSize1[0] = 2;
@@ -45,7 +42,7 @@ bool TestConcatenate1()
    for (int i = 0; i < sOrder1; i++)
        sUnitNum1 *= sDimSize1[i];
-    /* a source tensor of size (2 * 1) */
+    /* a source tensor of size (2, 1) */
    int sOrder2 = 2;
    int * sDimSize2 = new int[sOrder2];
    sDimSize2[0] = 2;
@@ -55,7 +52,7 @@ bool TestConcatenate1()
    for (int i = 0; i < sOrder2; i++)
        sUnitNum2 *= sDimSize2[i];
-    /* a target tensor of size (2 * 2) */
+    /* a target tensor of size (2, 2) */
    int tOrder = 2;
    int * tDimSize = new int[tOrder];
    tDimSize[0] = 2;
@@ -65,12 +62,12 @@ bool TestConcatenate1()
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];
-    DTYPE sData1[2][1] = { {0.0},
+    DTYPE sData1[2][1] = { {0.0F},
-                           {1.0} };
+                           {1.0F} };
-    DTYPE sData2[2][1] = { {2.0},
+    DTYPE sData2[2][1] = { {2.0F},
-                           {3.0} };
+                           {3.0F} };
-    DTYPE answer[2][2] = { {0.0, 2.0},
+    DTYPE answer[2][2] = { {0.0F, 2.0F},
-                           {1.0, 3.0} };
+                           {1.0F, 3.0F} };
    /* CPU test */
    bool cpuTest = true;
@@ -86,11 +83,11 @@ bool TestConcatenate1()
    t->SetZeroAll();
 	/* add tensors to list */
-    sList.Add(s1);
+    sList->Add(s1);
-    sList.Add(s2);
+    sList->Add(s2);
-    /* call concatenate function */
+    /* call Concatenate function */
-    Concatenate(&sList, t, 1);
+    Concatenate(sList, t, 1);
    /* check results */
    cpuTest = t->CheckData(answer, tUnitNum);
@@ -99,9 +96,6 @@ bool TestConcatenate1()
    /* GPU test */
    bool gpuTest = true;
-	/* clear list */
-	sList.Clear();
    /* create tensor */
 	XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
 	XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
@@ -112,40 +106,56 @@ bool TestConcatenate1()
 	sGPU2->SetData(sData2, sUnitNum2);
 	tGPU->SetZeroAll();
+	/* clear list */
+	sList->Clear();
 	/* add tensors to list*/
-	sList.Add(sGPU1);
+	sList->Add(sGPU1);
-	sList.Add(sGPU2);
+	sList->Add(sGPU2);
-	/* call concatenate function */
+	/* call Concatenate function */
-	Concatenate(&sList, tGPU, 1);
+	Concatenate(sList, tGPU, 1);
 	/* check results */
 	gpuTest = tGPU->CheckData(answer, tUnitNum);
    /* destroy variables */
-	delete s1, s2, t, sGPU1, sGPU2, tGPU;
+    delete sList;
-	delete[] sDimSize1, sDimSize2, tDimSize;
+    delete s1;
+    delete s2;
+    delete t;
+    delete sGPU1;
+    delete sGPU2;
+    delete tGPU;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;
 	return cpuTest && gpuTest;
 #else
    /* destroy variables */
-    delete s1, s2, t;
+    delete sList;
-	delete[] sDimSize1, sDimSize2, tDimSize;
+    delete s1;
+    delete s2;
+    delete t;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;
    return cpuTest;
 #endif // USE_CUDA
 }
-/* case 2: concatenate a list of tensors along a given dimension
+/* 
-* In this case, 2 * (2 * 1) -> (4 * 1), dim=0.
+case 2: concatenate a list of tensors along a given dimension.
+In this case, 2 * (2, 1) -> (4, 1), dim=0.
 */
 bool TestConcatenate2()
 {
 	/* create list */
-    XList sList;
+    XList * sList = new XList();
-    sList = XList();
-    /* a source tensor of size (2 * 1) */
+    /* a source tensor of size (2, 1) */
    int sOrder1 = 2;
    int * sDimSize1 = new int[sOrder1];
    sDimSize1[0] = 2;
@@ -155,7 +165,7 @@ bool TestConcatenate2()
    for (int i = 0; i < sOrder1; i++)
        sUnitNum1 *= sDimSize1[i];
-    /* a source tensor of size (2 * 1) */
+    /* a source tensor of size (2, 1) */
    int sOrder2 = 2;
    int * sDimSize2 = new int[sOrder2];
    sDimSize2[0] = 2;
@@ -165,7 +175,7 @@ bool TestConcatenate2()
    for (int i = 0; i < sOrder2; i++)
        sUnitNum2 *= sDimSize2[i];
-    /* a target tensor of size (4 * 1) */
+    /* a target tensor of size (4, 1) */
    int tOrder = 2;
    int * tDimSize = new int[tOrder];
    tDimSize[0] = 4;
@@ -175,14 +185,14 @@ bool TestConcatenate2()
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];
-    DTYPE sData1[2][1] = { {0.0},
+    DTYPE sData1[2][1] = { {0.0F},
-                           {1.0} };
+                           {1.0F} };
-    DTYPE sData2[2][1] = { {2.0},
+    DTYPE sData2[2][1] = { {2.0F},
-                           {3.0} };
+                           {3.0F} };
-    DTYPE answer[4][1] = { {0.0},
+    DTYPE answer[4][1] = { {0.0F},
-                           {1.0},
+                           {1.0F},
-                           {2.0},
+                           {2.0F},
-                           {3.0} };
+                           {3.0F} };
    /* CPU test */
    bool cpuTest = true;
@@ -198,11 +208,11 @@ bool TestConcatenate2()
    t->SetZeroAll();
 	/* add tensors to list */
-    sList.Add(s1);
+    sList->Add(s1);
-    sList.Add(s2);
+    sList->Add(s2);
-    /* call concatenate function */
+    /* call Concatenate function */
-    Concatenate(&sList, t, 0);
+    Concatenate(sList, t, 0);
    /* check results */
    cpuTest = t->CheckData(answer, tUnitNum);
@@ -211,9 +221,6 @@ bool TestConcatenate2()
 	/* GPU test */
 	bool gpuTest = true;
-	/* clear list */
-	sList.Clear();
 	/* create tensor */
 	XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
 	XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
@@ -224,39 +231,56 @@ bool TestConcatenate2()
 	sGPU2->SetData(sData2, sUnitNum2);
 	tGPU->SetZeroAll();
+	/* clear list */
+	sList->Clear();
 	/* add tensors to list*/
-	sList.Add(sGPU1);
+	sList->Add(sGPU1);
-	sList.Add(sGPU2);
+	sList->Add(sGPU2);
-	/* call concatenate function */
+	/* call Concatenate function */
-	Concatenate(&sList, tGPU, 0);
+	Concatenate(sList, tGPU, 0);
 	/* check results */
 	gpuTest = tGPU->CheckData(answer, tUnitNum);
 	/* destroy variables */
-	delete s1, s2, t, sGPU1, sGPU2, tGPU;
+    delete sList;
-	delete[] sDimSize1, sDimSize2, tDimSize;
+    delete s1;
+    delete s2;
+    delete t;
+    delete sGPU1;
+    delete sGPU2;
+    delete tGPU;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;
 	return cpuTest && gpuTest;
 #else
    /* destroy variables */
-	delete s1, s2, t;
+    delete sList;
-	delete[] sDimSize1, sDimSize2, tDimSize;
+    delete s1;
+    delete s2;
+    delete t;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;
    return cpuTest;
 #endif // USE_CUDA
 }
-/* case 3: concatenate a list of tensors along a given dimension
+/* 
-* In this case, (2 * 1) + (2 * 2) -> (2 * 3), dim=1.
+case 3: concatenate a list of tensors along a given dimension.
+In this case, (2, 1) + (2, 2) -> (2, 3), dim=1.
 */
 bool TestConcatenate3()
 {
 	/* create list */
-    XList sList;
+    XList * sList = new XList();
-    sList = XList();
-    /* a source tensor of size (2 * 1) */
+    /* a source tensor of size (2, 1) */
    int sOrder1 = 2;
    int * sDimSize1 = new int[sOrder1];
    sDimSize1[0] = 2;
@@ -266,7 +290,7 @@ bool TestConcatenate3()
    for (int i = 0; i < sOrder1; i++)
        sUnitNum1 *= sDimSize1[i];
-    /* a source tensor of size (2 * 2) */
+    /* a source tensor of size (2, 2) */
    int sOrder2 = 2;
    int * sDimSize2 = new int[sOrder2];
    sDimSize2[0] = 2;
@@ -276,7 +300,7 @@ bool TestConcatenate3()
    for (int i = 0; i < sOrder2; i++)
        sUnitNum2 *= sDimSize2[i];
-    /* a target tensor of size (2 * 3) */
+    /* a target tensor of size (2, 3) */
    int tOrder = 2;
    int * tDimSize = new int[tOrder];
    tDimSize[0] = 2;
@@ -286,12 +310,12 @@ bool TestConcatenate3()
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];
-    DTYPE sData1[2][1] = { {0.0},
+    DTYPE sData1[2][1] = { {0.0F},
-                           {1.0} };
+                           {1.0F} };
-    DTYPE sData2[2][2] = { {2.0, 3.0},
+    DTYPE sData2[2][2] = { {2.0F, 3.0F},
-                           {4.0, 5.0} };
+                           {4.0F, 5.0F} };
-    DTYPE answer[2][3] = { {0.0, 2.0, 3.0},
+    DTYPE answer[2][3] = { {0.0F, 2.0F, 3.0F},
-                           {1.0, 4.0, 5.0} };
+                           {1.0F, 4.0F, 5.0F} };
    /* CPU test */
    bool cpuTest = true;
@@ -307,11 +331,11 @@ bool TestConcatenate3()
    t->SetZeroAll();
 	/* add tensors to list */
-    sList.Add(s1);
+    sList->Add(s1);
-    sList.Add(s2);
+    sList->Add(s2);
-    /* call concatenate function */
+    /* call Concatenate function */
-    Concatenate(&sList, t, 1);
+    Concatenate(sList, t, 1);
    /* check results */
    cpuTest = t->CheckData(answer, tUnitNum);
@@ -320,9 +344,6 @@ bool TestConcatenate3()
 	/* GPU test */
 	bool gpuTest = true;
-	/* clear list */
-	sList.Clear();
 	/* create tensor */
 	XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
 	XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
@@ -333,36 +354,53 @@ bool TestConcatenate3()
 	sGPU2->SetData(sData2, sUnitNum2);
 	tGPU->SetZeroAll();
+	/* clear list */
+	sList->Clear();
 	/* add tensors to list*/
-	sList.Add(sGPU1);
+	sList->Add(sGPU1);
-	sList.Add(sGPU2);
+	sList->Add(sGPU2);
-	/* call concatenate function */
+	/* call Concatenate function */
-	Concatenate(&sList, tGPU, 1);
+	Concatenate(sList, tGPU, 1);
 	/* check results */
 	gpuTest = tGPU->CheckData(answer, tUnitNum);
 	/* destroy variables */
-	delete s1, s2, t, sGPU1, sGPU2, tGPU;
+    delete sList;
-	delete[] sDimSize1, sDimSize2, tDimSize;
+    delete s1;
+    delete s2;
+    delete t;
+    delete sGPU1;
+    delete sGPU2;
+    delete tGPU;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;
 	return cpuTest && gpuTest;
 #else
    /* destroy variables */
-	delete s1, s2, t;
+    delete sList;
-	delete[] sDimSize1, sDimSize2, tDimSize;
+    delete s1;
+    delete s2;
+    delete t;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;
 	return cpuTest;
 #endif // USE_CUDA
 }
-/* case 4: concatenate two tensors along a given dimension
+/* 
-* In this case, (2 * 1) + (2 * 2) -> (2 * 3), dim=1.
+case 4: concatenate two tensors along a given dimension.
+In this case, (2, 1), (2, 2) -> (2, 3), dim=1.
 */
 bool TestConcatenate4()
 {
-    /* a source tensor of size (2 * 1) */
+    /* a source tensor of size (2, 1) */
    int sOrder1 = 2;
    int * sDimSize1 = new int[sOrder1];
    sDimSize1[0] = 2;
@@ -372,7 +410,7 @@ bool TestConcatenate4()
    for (int i = 0; i < sOrder1; i++)
        sUnitNum1 *= sDimSize1[i];
-    /* a source tensor of size (2 * 2) */
+    /* a source tensor of size (2, 2) */
    int sOrder2 = 2;
    int * sDimSize2 = new int[sOrder2];
    sDimSize2[0] = 2;
@@ -382,7 +420,7 @@ bool TestConcatenate4()
    for (int i = 0; i < sOrder2; i++)
        sUnitNum2 *= sDimSize2[i];
-    /* a target tensor of size (2 * 3) */
+    /* a target tensor of size (2, 3) */
    int tOrder = 2;
    int * tDimSize = new int[tOrder];
    tDimSize[0] = 2;
@@ -392,12 +430,12 @@ bool TestConcatenate4()
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];
-    DTYPE sData1[2][1] = { {0.0},
+    DTYPE sData1[2][1] = { {0.0F},
-                           {1.0} };
+                           {1.0F} };
-    DTYPE sData2[2][2] = { {2.0, 3.0},
+    DTYPE sData2[2][2] = { {2.0F, 3.0F},
-                           {4.0, 5.0} };
+                           {4.0F, 5.0F} };
-    DTYPE answer[2][3] = { {0.0, 2.0, 3.0},
+    DTYPE answer[2][3] = { {0.0F, 2.0F, 3.0F},
-                           {1.0, 4.0, 5.0} };
+                           {1.0F, 4.0F, 5.0F} };
    /* CPU test */
    bool cpuTest = true;
@@ -412,7 +450,7 @@ bool TestConcatenate4()
    s2->SetData(sData2, sUnitNum2);
    t->SetZeroAll();
-    /* call concatenate function */
+    /* call Concatenate function */
    Concatenate(s1, s2, t, 1);
    /* check results */
@@ -432,21 +470,32 @@ bool TestConcatenate4()
 	sGPU2->SetData(sData2, sUnitNum2);
 	tGPU->SetZeroAll();
-	/* call concatenate function */
+	/* call Concatenate function */
 	Concatenate(sGPU1, sGPU2, tGPU, 1);
 	/* check results */
 	gpuTest = tGPU->CheckData(answer, tUnitNum);
 	/* destroy variables */
-	delete s1, s2, t, sGPU1, sGPU2, tGPU;
+    delete s1;
-	delete[] sDimSize1, sDimSize2, tDimSize;
+    delete s2;
+    delete t;
+    delete sGPU1;
+    delete sGPU2;
+    delete tGPU;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;
 	return cpuTest && gpuTest;
 #else
    /* destroy variables */
-	delete s1, s2, t;
+    delete s1;
-	delete[] sDimSize1, sDimSize2, tDimSize;
+    delete s2;
+    delete t;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;
 	return cpuTest;
 #endif // USE_CUDA
@@ -458,10 +507,9 @@ TODO!!
 */
 /* test for Concatenate Function */
-extern "C"
 bool TestConcatenate()
 {
-    XPRINT(0, stdout, "[TEST CONCATENATE] -------------\n");
+    XPRINT(0, stdout, "[TEST CONCATENATE] concatenate a list of tensors or two tensors along a given dimension \n");
    bool returnFlag = true, caseFlag = true;
    /* case 1 test */

--- a/source/test/TConcatenate.h
+++ b/source/test/TConcatenate.h
@@ -22,7 +22,7 @@
 #ifndef __TEST_CONCATENATE_H__
 #define __TEST_CONCATENATE_H__
-#include "../core/Concatenate.h"
+#include "../core/shape/Concatenate.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/test/TConcatenateSolely.cpp
+++ b/source/test/TConcatenateSolely.cpp
@@ -19,23 +19,21 @@
 * $Created by: Lin Ye (email: linye2015@outlook.com) 2018-06-14
 */
-#include "../XTensor.h"
-#include "../XDevice.h"
-#include "../core/ConcatenateSolely.h"
 #include "../XList.h"
+#include "TConcatenateSolely.h"
 namespace nts { // namespace nt(NiuTrans.Tensor)
-/* case 1: concatenate a list of tensors along a given dimension
-* In this case, 2 * (2 * 1) -> (2 * 2), dim=1.
+/* 
+case 1: concatenate a list of tensors along a given dimension
+In this case, 2 * (2, 1) -> (2, 2), dim=1.
 */
 bool TestConcatenateSolely1()
 {
 	/* create list */
-    XList sList;
+    XList * sList = new XList();
-    sList = XList();
-    /* a source tensor of size 2 * 1 */
+    /* a source tensor of size (2, 1) */
    int sOrder1 = 2;
    int * sDimSize1 = new int[sOrder1];
    sDimSize1[0] = 2;
@@ -45,7 +43,7 @@ bool TestConcatenateSolely1()
    for (int i = 0; i < sOrder1; i++)
        sUnitNum1 *= sDimSize1[i];
-    /* a source tensor of size 2 * 1 */
+    /* a source tensor of size (2, 1) */
    int sOrder2 = 2;
    int * sDimSize2 = new int[sOrder2];
    sDimSize2[0] = 2;
@@ -55,7 +53,7 @@ bool TestConcatenateSolely1()
    for (int i = 0; i < sOrder2; i++)
        sUnitNum2 *= sDimSize2[i];
-    /* a target tensor of size 2 * 2 */
+    /* a target tensor of size (2, 2) */
    int tOrder = 2;
    int * tDimSize = new int[tOrder];
    tDimSize[0] = 2;
@@ -65,12 +63,12 @@ bool TestConcatenateSolely1()
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];
-    DTYPE sData1[2][1] = { {0.0},
+    DTYPE sData1[2][1] = { {0.0F},
-                           {1.0} };
+                           {1.0F} };
-    DTYPE sData2[2][1] = { {2.0},
+    DTYPE sData2[2][1] = { {2.0F},
-                           {3.0} };
+                           {3.0F} };
-    DTYPE answer[2][2] = { {0.0, 2.0},
+    DTYPE answer[2][2] = { {0.0F, 2.0F},
-                           {1.0, 3.0} };
+                           {1.0F, 3.0F} };
    /* CPU test */
    bool cpuTest = true;
@@ -86,11 +84,11 @@ bool TestConcatenateSolely1()
    t->SetZeroAll();
 	/* add tensors to list */
-    sList.Add(s1);
+    sList->Add(s1);
-    sList.Add(s2);
+    sList->Add(s2);
-	/* call concatenatesolely function */
+	/* call ConcatenateSolely function */
-    ConcatenateSolely(&sList, t, 1);
+    ConcatenateSolely(sList, t, 1);
    /* check results */
    cpuTest = t->CheckData(answer, tUnitNum);
@@ -99,9 +97,6 @@ bool TestConcatenateSolely1()
 	/* GPU test */
 	bool gpuTest = true;
-	/* clear list */
-	sList.Clear();
 	/* create tensor */
 	XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
 	XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
@@ -112,23 +107,35 @@ bool TestConcatenateSolely1()
 	sGPU2->SetData(sData2, sUnitNum2);
 	tGPU->SetZeroAll();
+	/* clear list */
+	sList->Clear();
 	/* add tensors to list*/
-	sList.Add(sGPU1);
+	sList->Add(sGPU1);
-	sList.Add(sGPU2);
+	sList->Add(sGPU2);
-	/* call concatenatesolely function */
+	/* call ConcatenateSolely function */
-	ConcatenateSolely(&sList, tGPU, 1);
+	ConcatenateSolely(sList, tGPU, 1);
 	/* check results */
 	gpuTest = tGPU->CheckData(answer, tUnitNum);
 	/* destroy variables */
-	delete s1, s2, t, sGPU1, sGPU2, tGPU;
+    delete sList;
-	delete[] sDimSize1, sDimSize2, tDimSize;
+    delete s1;
+    delete s2;
+    delete t;
+    delete sGPU1;
+    delete sGPU2;
+    delete tGPU;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;
 	return cpuTest && gpuTest;
 #else
    /* destroy variables */
+    delete sList;
    delete s1;
    delete s2;
    delete t;
@@ -140,15 +147,16 @@ bool TestConcatenateSolely1()
 #endif // USE_CUDA
    }
-/* case 2: concatenate a list of tensors along a given dimension
+/* 
-* In this case, 2 * (2 * 1) -> (4 * 1), dim=0.
+case 2: concatenate a list of tensors along a given dimension
+In this case, 2 * (2, 1) -> (4, 1), dim=0.
 */
 bool TestConcatenateSolely2()
 {
 	/* create list */
-    XList sList;
+    XList * sList = new XList();
-    sList = XList();
-    /* a source tensor of size 2 * 1 */
+    /* a source tensor of size (2, 1) */
    int sOrder1 = 2;
    int * sDimSize1 = new int[sOrder1];
    sDimSize1[0] = 2;
@@ -158,7 +166,7 @@ bool TestConcatenateSolely2()
    for (int i = 0; i < sOrder1; i++)
        sUnitNum1 *= sDimSize1[i];
-    /* a source tensor of size 2 * 1 */
+    /* a source tensor of size (2, 1) */
    int sOrder2 = 2;
    int * sDimSize2 = new int[sOrder2];
    sDimSize2[0] = 2;
@@ -168,7 +176,7 @@ bool TestConcatenateSolely2()
    for (int i = 0; i < sOrder2; i++)
        sUnitNum2 *= sDimSize2[i];
-    /* a target tensor of size 4 * 1 */
+    /* a target tensor of size (4, 1) */
    int tOrder = 2;
    int * tDimSize = new int[tOrder];
    tDimSize[0] = 4;
@@ -178,14 +186,14 @@ bool TestConcatenateSolely2()
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];
-    DTYPE sData1[2][1] = { {0.0},
+    DTYPE sData1[2][1] = { {0.0F},
-                           {1.0} };
+                           {1.0F} };
-    DTYPE sData2[2][1] = { {2.0},
+    DTYPE sData2[2][1] = { {2.0F},
-                           {3.0} };
+                           {3.0F} };
-    DTYPE answer[4][1] = { {0.0},
+    DTYPE answer[4][1] = { {0.0F},
-                           {1.0},
+                           {1.0F},
-                           {2.0},
+                           {2.0F},
-                           {3.0} };
+                           {3.0F} };
    /* CPU test */
    bool cpuTest = true;
@@ -201,11 +209,11 @@ bool TestConcatenateSolely2()
    t->SetZeroAll();
 	/* add tensors to list */
-    sList.Add(s1);
+    sList->Add(s1);
-    sList.Add(s2);
+    sList->Add(s2);
-    /* call concatenatesolely function */
+    /* call ConcatenateSolely function */
-    ConcatenateSolely(&sList, t, 0);
+    ConcatenateSolely(sList, t, 0);
    /* check results */
    cpuTest = t->CheckData(answer, tUnitNum);
@@ -214,9 +222,6 @@ bool TestConcatenateSolely2()
 	/* GPU test */
 	bool gpuTest = true;
-	/* clear list */
-	sList.Clear();
 	/* create tensor */
 	XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
 	XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
@@ -227,39 +232,56 @@ bool TestConcatenateSolely2()
 	sGPU2->SetData(sData2, sUnitNum2);
 	tGPU->SetZeroAll();
+	/* clear list */
+	sList->Clear();
 	/* add tensors to list*/
-	sList.Add(sGPU1);
+	sList->Add(sGPU1);
-	sList.Add(sGPU2);
+	sList->Add(sGPU2);
 	/* call concatenatesolely function */
-	ConcatenateSolely(&sList, tGPU, 0);
+	ConcatenateSolely(sList, tGPU, 0);
 	/* check results */
 	gpuTest = tGPU->CheckData(answer, tUnitNum);
 	/* destroy variables */
-	delete s1, s2, t, sGPU1, sGPU2, tGPU;
+    delete sList;
-	delete[] sDimSize1, sDimSize2, tDimSize;
+    delete s1;
+    delete s2;
+    delete t;
+    delete sGPU1;
+    delete sGPU2;
+    delete tGPU;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;
 	return cpuTest && gpuTest;
 #else
    /* destroy variables */
-	delete s1, s2, t;
+    delete sList;
-	delete[] sDimSize1, sDimSize2, tDimSize;
+    delete s1;
+    delete s2;
+    delete t;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;
 	return cpuTest;
 #endif // USE_CUDA
 }
-/* case 3: concatenate a list of tensors along a given dimension
+/* 
-* In this case, (2 * 1) + (2 * 2) -> (2 * 3), dim=1.
+case 3: concatenate a list of tensors along a given dimension
+In this case, (2, 1) + (2, 2) -> (2, 3), dim=1.
 */
 bool TestConcatenateSolely3()
 {
 	/* create list */
-    XList sList;
+    XList * sList = new XList();
-    sList = XList();
-    /* a source tensor of size (2 * 1) */
+    /* a source tensor of size (2, 1) */
    int sOrder1 = 2;
    int * sDimSize1 = new int[sOrder1];
    sDimSize1[0] = 2;
@@ -269,7 +291,7 @@ bool TestConcatenateSolely3()
    for (int i = 0; i < sOrder1; i++)
        sUnitNum1 *= sDimSize1[i];
-    /* a source tensor of size (2 * 2) */
+    /* a source tensor of size (2, 2) */
    int sOrder2 = 2;
    int * sDimSize2 = new int[sOrder2];
    sDimSize2[0] = 2;
@@ -279,7 +301,7 @@ bool TestConcatenateSolely3()
    for (int i = 0; i < sOrder2; i++)
        sUnitNum2 *= sDimSize2[i];
-    /* a target tensor of size (2 * 3) */
+    /* a target tensor of size (2, 3) */
    int tOrder = 2;
    int * tDimSize = new int[tOrder];
    tDimSize[0] = 2;
@@ -289,12 +311,12 @@ bool TestConcatenateSolely3()
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];
-    DTYPE sData1[2][1] = { {0.0},
+    DTYPE sData1[2][1] = { {0.0F},
-                           {1.0} };
+                           {1.0F} };
-    DTYPE sData2[2][2] = { {2.0, 3.0},
+    DTYPE sData2[2][2] = { {2.0F, 3.0F},
-                           {4.0, 5.0} };
+                           {4.0F, 5.0F} };
-    DTYPE answer[2][3] = { {0.0, 2.0, 3.0},
+    DTYPE answer[2][3] = { {0.0F, 2.0F, 3.0F},
-                           {1.0, 4.0, 5.0} };
+                           {1.0F, 4.0F, 5.0F} };
    /* CPU test */
    bool cpuTest = true;
@@ -310,11 +332,11 @@ bool TestConcatenateSolely3()
    t->SetZeroAll();
 	/* add tensors to list */
-    sList.Add(s1);
+    sList->Add(s1);
-    sList.Add(s2);
+    sList->Add(s2);
-	/* call concatenatesolely function */
+	/* call ConcatenateSolely function */
-    ConcatenateSolely(&sList, t, 1);
+    ConcatenateSolely(sList, t, 1);
    /* check results */
    cpuTest = t->CheckData(answer, tUnitNum);
@@ -323,9 +345,6 @@ bool TestConcatenateSolely3()
 	/* GPU test */
 	bool gpuTest = true;
-	/* clear list */
-	sList.Clear();
 	/* create tensor */
 	XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
 	XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
@@ -336,25 +355,41 @@ bool TestConcatenateSolely3()
 	sGPU2->SetData(sData2, sUnitNum2);
 	tGPU->SetZeroAll();
+	/* clear list */
+	sList->Clear();
 	/* add tensors to list*/
-	sList.Add(sGPU1);
+	sList->Add(sGPU1);
-	sList.Add(sGPU2);
+	sList->Add(sGPU2);
-	/* call concatenatesolely function */
+	/* call ConcatenateSolely function */
-	ConcatenateSolely(&sList, tGPU, 1);
+	ConcatenateSolely(sList, tGPU, 1);
 	/* check results */
 	gpuTest = tGPU->CheckData(answer, tUnitNum);
 	/* destroy variables */
-	delete s1, s2, t, sGPU1, sGPU2, tGPU;
+    delete sList;
-	delete[] sDimSize1, sDimSize2, tDimSize;
+    delete s1;
+    delete s2;
+    delete t;
+    delete sGPU1;
+    delete sGPU2;
+    delete tGPU;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;
 	return cpuTest && gpuTest;
 #else
    /* destroy variables */
-	delete s1, s2, t;
+    delete sList;
-	delete[] sDimSize1, sDimSize2, tDimSize;
+    delete s1;
+    delete s2;
+    delete t;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;
 	return cpuTest;
 #endif // USE_CUDA
@@ -366,10 +401,9 @@ TODO!!
 */
 /* test for ConcatenateSolely Function */
-extern "C"
 bool TestConcatenateSolely()
 {
-    XPRINT(0, stdout, "[TEST CONCATENATESOLELY] -------------\n");
+    XPRINT(0, stdout, "[TEST CONCATENATESOLELY] concatenate a list of tensors along a given dimension \n");
    bool returnFlag = true, caseFlag = true;
    /* case 1 test */

--- a/source/test/TConcatenateSolely.h
+++ b/source/test/TConcatenateSolely.h
@@ -22,7 +22,7 @@
 #ifndef __TEST_CONCATENATESOLELY_H__
 #define __TEST_CONCATENATESOLELY_H__
-#include "../core/ConcatenateSolely.h"
+#include "../core/shape/ConcatenateSolely.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/test/TCopyIndexed.cpp
+++ b/source/test/TCopyIndexed.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-27
+*/
+#include "TCopyIndexed.h"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+/* 
+case 1: copy indexed sub-tensors.
+In this case, (3, 2, 3) -> (3, 2, 2), dim = 2, indexSize = 2, 
+srcIndex = [0, 2], tgtIndex = [0, 1], copyNum = 1.
+The expected answer keeps entries 0 and 2 of the last dimension of the input.
+<< return - true if the CPU result (and, when USE_CUDA is defined, the GPU result) matches the answer
+*/
+bool TestCopyIndexed1()
+{
+    /* an input tensor of size (3, 2, 3) */
+    int sOrder = 3;
+    int * sDimSize = new int[sOrder];
+    sDimSize[0] = 3;
+    sDimSize[1] = 2;
+    sDimSize[2] = 3;
+    int sUnitNum = 1;
+    for (int i = 0; i < sOrder; i++)
+        sUnitNum *= sDimSize[i];
+    /* an output tensor of size (3, 2, 2) */
+    int tOrder = 3;
+    int * tDimSize = new int[tOrder];
+    tDimSize[0] = 3;
+    tDimSize[1] = 2;
+    tDimSize[2] = 2;
+    int tUnitNum = 1;
+    for (int i = 0; i < tOrder; i++)
+        tUnitNum *= tDimSize[i];
+    DTYPE sData[3][2][3] = { { {0.0F, -1.0F, 2.0F},
+                               {2.0F, 1.0F, 3.0F} },
+                             { {1.0F, 2.0F, 4.0F}, 
+                               {3.0F, 1.0F, 2.0F}},
+                             { {-1.0F, 3.0F, 2.0F}, 
+                               {1.0F, -1.0F, 0.0F} } };
+    DTYPE answer[3][2][2] = { { {0.0F, 2.0F},
+                                {2.0F, 3.0F} },
+                              { {1.0F, 4.0F}, 
+                                {3.0F, 2.0F}},
+                              { {-1.0F, 2.0F}, 
+                                {1.0F, 0.0F} } };
+    int dim = 2;
+    int indexSize = 2;
+    int srcIndex[2] = {0, 2};
+    int tgtIndex[2] = {0, 1};
+    int copyNum = 1;
+    /* CPU test */
+    bool cpuTest = true;
+    /* create tensors */
+    XTensor * s = NewTensor(sOrder, sDimSize);
+    XTensor * t = NewTensor(tOrder, tDimSize);
+    /* initialize variables */
+    s->SetData(sData, sUnitNum);
+    t->SetZeroAll();
+    /* call CopyIndexed function */
+    CopyIndexed(s, t, dim, srcIndex, indexSize, tgtIndex, copyNum);
+    /* check results */
+    cpuTest = t->CheckData(answer, tUnitNum);
+#ifdef USE_CUDA
+    /* GPU test */
+    bool gpuTest = true;
+    /* create tensors (the target tensor is described by tOrder/tDimSize, not by the source order) */
+    XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
+    /* initialize variables */
+    sGPU->SetData(sData, sUnitNum);
+    tGPU->SetZeroAll();
+    /* call CopyIndexed function */
+    CopyIndexed(sGPU, tGPU, dim, srcIndex, indexSize, tgtIndex, copyNum);
+    /* check results */
+    gpuTest = tGPU->CheckData(answer, tUnitNum);
+    /* destroy variables */
+    delete s;
+    delete t;
+    delete sGPU;
+    delete tGPU;
+    delete[] sDimSize;
+    delete[] tDimSize;
+    return cpuTest && gpuTest;
+#else
+    /* destroy variables */
+    delete s;
+    delete t;
+    delete[] sDimSize;
+    delete[] tDimSize;
+    return cpuTest;
+#endif // USE_CUDA
+}
+/* other cases */
+/*
+TODO!!
+*/
+/* 
+test for CopyIndexed Function
+<< return - true only if every test case passed
+*/
+bool TestCopyIndexed()
+{
+    XPRINT(0, stdout, "[TEST CopyIndexed] copy indexed sub-tensors \n");
+    /* overall verdict; cleared as soon as one case fails */
+    bool allPassed = true;
+    /* case 1 test */
+    bool case1Passed = TestCopyIndexed1();
+    if (case1Passed) {
+        XPRINT(0, stdout, ">> case 1 passed!\n");
+    }
+    else {
+        XPRINT(0, stdout, ">> case 1 failed!\n");
+        allPassed = false;
+    }
+    /* other cases test */
+    /*
+    TODO!!
+    */
+    /* summary line */
+    if (!allPassed) {
+        XPRINT(0, stdout, ">> Failed!\n");
+    }
+    else {
+        XPRINT(0, stdout, ">> All Passed!\n");
+    }
+    XPRINT(0, stdout, "\n");
+    return allPassed;
+}
+} // namespace nts(NiuTrans.Tensor)
--- a/source/test/TCopyIndexed.h
+++ b/source/test/TCopyIndexed.h
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-27
+*/
+#ifndef __TEST_COPYINDEXED_H__
+#define __TEST_COPYINDEXED_H__
+#include "../core/movement/CopyIndexed.h"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+/* 
+test for CopyIndexed Function
+Declared with C linkage; the definition lives in TCopyIndexed.cpp.
+<< return - true if all CopyIndexed test cases passed
+*/
+extern "C"
+bool TestCopyIndexed();
+} // namespace nts(NiuTrans.Tensor)
+#endif // __TEST_COPYINDEXED_H__
--- a/source/test/TCopyValues.cpp
+++ b/source/test/TCopyValues.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-27
+*/
+#include "../XUtility.h"
+#include "TCopyValues.h"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+/* 
+case 1: copy tensor s to tensor t.
+In this case, a (2, 4) tensor is copied into a zero-initialized tensor of the same size.
+<< return - true if the CPU copy (and, when USE_CUDA is defined, the GPU copy) reproduces the source data
+*/
+bool TestCopyValues1()
+{
+    /* an input tensor of size (2, 4) */
+    int sOrder = 2;
+    int * sDimSize = new int[sOrder];
+    sDimSize[0] = 2;
+    sDimSize[1] = 4;
+    int sUnitNum = 1;
+    for (int i = 0; i < sOrder; i++)
+        sUnitNum *= sDimSize[i];
+    DTYPE sData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
+                          {4.0F, 5.0F, 6.0F, 7.0F} };
+    /* CPU test */
+    bool cpuTest = true;
+    /* create tensors */
+    XTensor * s = NewTensor(sOrder, sDimSize);
+    XTensor * t = NewTensor(sOrder, sDimSize);
+    /* initialize variables */
+    s->SetData(sData, sUnitNum);
+    t->SetZeroAll();
+    /* call CopyValues function */
+    CopyValues(s, t);
+    /* check results */
+    cpuTest = t->CheckData(s->data, sUnitNum);
+#ifdef USE_CUDA
+    /* GPU test */
+    bool gpuTest = true;
+    /* create tensors */
+    XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * tGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+    /* initialize variables; the target starts at zero so that the check
+       below can only succeed if CopyValues really wrote the data */
+    sGPU->SetData(sData, sUnitNum);
+    tGPU->SetZeroAll();
+    /* call CopyValues function */
+    CopyValues(sGPU, tGPU);
+    /* check results: bring the source data back to the host (device id -1)
+       and compare the target tensor against it */
+    DTYPE * dataGPU = (DTYPE*)sGPU->data;
+    int size = sUnitNum * sGPU->unitSize;
+    char * dataCPU = new char[size];
+    XMemCopy(dataCPU, -1, dataGPU, sGPU->devID, size);
+    gpuTest = tGPU->CheckData(dataCPU, sUnitNum);
+    /* destroy variables */
+    delete[] dataCPU;
+    delete s;
+    delete t;
+    delete sGPU;
+    delete tGPU;
+    delete[] sDimSize;
+    return cpuTest && gpuTest;
+#else
+    /* destroy variables */
+    delete s;
+    delete t;
+    delete[] sDimSize;
+    return cpuTest;
+#endif // USE_CUDA
+}
+/* other cases */
+/*
+TODO!!
+*/
+/* 
+test for CopyValues Function
+<< return - true only if every test case passed
+*/
+bool TestCopyValues()
+{
+    XPRINT(0, stdout, "[TEST CopyValues] copy tensor s to tensor t \n");
+    /* overall verdict; cleared as soon as one case fails */
+    bool allPassed = true;
+    /* case 1 test */
+    bool case1Passed = TestCopyValues1();
+    if (case1Passed) {
+        XPRINT(0, stdout, ">> case 1 passed!\n");
+    }
+    else {
+        XPRINT(0, stdout, ">> case 1 failed!\n");
+        allPassed = false;
+    }
+    /* other cases test */
+    /*
+    TODO!!
+    */
+    /* summary line */
+    if (!allPassed) {
+        XPRINT(0, stdout, ">> Failed!\n");
+    }
+    else {
+        XPRINT(0, stdout, ">> All Passed!\n");
+    }
+    XPRINT(0, stdout, "\n");
+    return allPassed;
+}
+} // namespace nts(NiuTrans.Tensor)
--- a/source/test/TCopyValues.h
+++ b/source/test/TCopyValues.h
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-27
+*/
+#ifndef __TEST_COPYVALUES_H__
+#define __TEST_COPYVALUES_H__
+#include "../core/movement/CopyValues.h"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+/* 
+test for CopyValues Function
+Declared with C linkage; the definition lives in TCopyValues.cpp.
+<< return - true if all CopyValues test cases passed
+*/
+extern "C"
+bool TestCopyValues();
+} // namespace nts(NiuTrans.Tensor)
+#endif // __TEST_COPYVALUES_H__
--- a/source/test/THardTanH.cpp
+++ b/source/test/THardTanH.cpp
@@ -19,16 +19,14 @@
 * $Created by: Lin Ye (email: linye2015@outlook.com) 2018-06-20
 */
+#include "THardTanH.h"
-#include "../XTensor.h"
-#include "../XDevice.h"
-#include "../function/HardTanH.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /* case 1: hard tanh function */
 bool TestHardTanH1()
 {
-	/* a x tensor of size 2 * 3 */
+	/* a x tensor of size (2, 3) */
 	int xOrder = 2;
 	int * xDimSize = new int[xOrder];
 	xDimSize[0] = 2;
@@ -38,7 +36,7 @@ bool TestHardTanH1()
 	for (int i = 0; i < xOrder; i++)
 		xUnitNum *= xDimSize[i];
-	/* a y tensor of size 2 * 3 */
+	/* a y tensor of size (2, 3) */
 	int yOrder = 2;
 	int * yDimSize = new int[yOrder];
 	yDimSize[0] = 2;
@@ -48,10 +46,10 @@ bool TestHardTanH1()
 	for (int i = 0; i < yOrder; i++)
 		yUnitNum *= yDimSize[i];
-	DTYPE xData[2][3] = { {0.5, -1.0, 2.0},
+	DTYPE xData[2][3] = { {0.5F, -1.0F, 2.0F},
-	                      {3.5, -4.5, 1.0} };
+	                      {3.5F, -4.5F, 1.0F} };
-	DTYPE answer[2][3] = { {0.5, -1.0, 1.0},
+	DTYPE answer[2][3] = { {0.5F, -1.0F, 1.0F},
-	                       {1.0, -1.0, 1.0} };
+	                       {1.0F, -1.0F, 1.0F} };
 	/* CPU test */
 	bool cpuTest = true;
@@ -68,7 +66,7 @@ bool TestHardTanH1()
 	HardTanH(x, y);
 	/* check results */
-	cpuTest = y->CheckData(answer, yUnitNum);
+	cpuTest = y->CheckData(answer, yUnitNum, 1e-4F);
 #ifdef USE_CUDA
 	/* GPU test */
@@ -86,11 +84,15 @@ bool TestHardTanH1()
 	HardTanH(xGPU, yGPU);
 	/* check results */
-	gpuTest = yGPU->CheckData(answer, yUnitNum);
+	gpuTest = yGPU->CheckData(answer, yUnitNum, 1e-4F);
 	/* destroy variables */
-	delete x, y, xGPU, yGPU;
+	delete x;
-	delete[] xDimSize, yDimSize;
+    delete y;
+    delete xGPU;
+    delete yGPU;
+	delete[] xDimSize;
+    delete[] yDimSize;
 	return cpuTest && gpuTest;
 #else
@@ -104,12 +106,13 @@ bool TestHardTanH1()
 #endif // USE_CUDA
 }
-/* case 2: backward computation 
+/*
-* In this case, lossName=CROSSENTROPY.
+case 2: backward computation 
+In this case, lossName=CROSSENTROPY.
 */
 bool TestHardTanH2()
 {
-	/* a x tensor of size 2 * 3 */
+	/* a x tensor of size (2, 3) */
 	int xOrder = 2;
 	int * xDimSize = new int[xOrder];
 	xDimSize[0] = 2;
@@ -119,7 +122,7 @@ bool TestHardTanH2()
 	for (int i = 0; i < xOrder; i++)
 		xUnitNum *= xDimSize[i];
-	/* a y tensor of size 2 * 3 */
+	/* a y tensor of size (2, 3) */
 	int yOrder = 2;
 	int * yDimSize = new int[yOrder];
 	yDimSize[0] = 2;
@@ -129,7 +132,7 @@ bool TestHardTanH2()
 	for (int i = 0; i < yOrder; i++)
 		yUnitNum *= yDimSize[i];
-	/* a gold tensor of size 2 * 3 */
+	/* a gold tensor of size (2, 3) */
 	int goldOrder = 2;
 	int * goldDimSize = new int[goldOrder];
 	goldDimSize[0] = 2;
@@ -139,7 +142,7 @@ bool TestHardTanH2()
 	for (int i = 0; i < goldOrder; i++)
 		goldUnitNum *= goldDimSize[i];
-	/* a dedy tensor of size 2 * 3 */
+	/* a dedy tensor of size (2, 3) */
 	int dedyOrder = 2;
 	int * dedyDimSize = new int[dedyOrder];
 	dedyDimSize[0] = 2;
@@ -149,7 +152,7 @@ bool TestHardTanH2()
 	for (int i = 0; i < dedyOrder; i++)
 		dedyUnitNum *= dedyDimSize[i];
-	/* a dedx tensor of size 2 * 3 */
+	/* a dedx tensor of size (2, 3) */
 	int dedxOrder = 2;
 	int * dedxDimSize = new int[dedxOrder];
 	dedxDimSize[0] = 2;
@@ -159,16 +162,16 @@ bool TestHardTanH2()
 	for (int i = 0; i < dedxOrder; i++)
 		dedxUnitNum *= dedxDimSize[i];
-	DTYPE xData[2][3] = { {0.5, -1.0, 2.0},
+	DTYPE xData[2][3] = { {0.5F, -1.0F, 2.0F},
-	                      {3.5, -4.5, 1.0} };
+	                      {3.5F, -4.5F, 1.0F} };
-	DTYPE yData[2][3] = { {0.5, -1.0, 1.0},
+	DTYPE yData[2][3] = { {0.5F, -1.0F, 1.0F},
-	                       {1.0, -1.0, 1.0} };
+	                       {1.0F, -1.0F, 1.0F} };
-	DTYPE goldData[2][3] = { {1.0, 1.0, 1.0},
+	DTYPE goldData[2][3] = { {1.0F, 1.0F, 1.0F},
-	                         {1.0, 1.0, 1.0} };
+	                         {1.0F, 1.0F, 1.0F} };
-	DTYPE dedyData[2][3] = { {-2.0, 1.0, -1.0},
+	DTYPE dedyData[2][3] = { {-2.0F, 1.0F, -1.0F},
-	                         {-1.0, 1.0, -1.0} };
+	                         {-1.0F, 1.0F, -1.0F} };
-	DTYPE answer[2][3] = { {-2.0, 1.0, 0.0},
+	DTYPE answer[2][3] = { {-2.0F, 1.0F, 0.0F},
-	                       {0.0, 0.0, -1.0} };
+	                       {0.0F, 0.0F, -1.0F} };
 	/* CPU test */
 	bool cpuTest = true;
@@ -231,12 +234,13 @@ bool TestHardTanH2()
 #endif // USE_CUDA
 }
-/* case 3: backward computation
+/* 
-* In this case, lossName=SQUAREDERROR.
+case 3: backward computation
+In this case, lossName=SQUAREDERROR.
 */
 bool TestHardTanH3()
 {
-	/* a x tensor of size 2 * 3 */
+	/* a x tensor of size (2, 3) */
 	int xOrder = 2;
 	int * xDimSize = new int[xOrder];
 	xDimSize[0] = 2;
@@ -246,7 +250,7 @@ bool TestHardTanH3()
 	for (int i = 0; i < xOrder; i++)
 		xUnitNum *= xDimSize[i];
-	/* a y tensor of size 2 * 3 */
+	/* a y tensor of size (2, 3) */
 	int yOrder = 2;
 	int * yDimSize = new int[yOrder];
 	yDimSize[0] = 2;
@@ -256,7 +260,7 @@ bool TestHardTanH3()
 	for (int i = 0; i < yOrder; i++)
 		yUnitNum *= yDimSize[i];
-	/* a gold tensor of size 2 * 3 */
+	/* a gold tensor of size (2, 3) */
 	int goldOrder = 2;
 	int * goldDimSize = new int[goldOrder];
 	goldDimSize[0] = 2;
@@ -266,7 +270,7 @@ bool TestHardTanH3()
 	for (int i = 0; i < goldOrder; i++)
 		goldUnitNum *= goldDimSize[i];
-	/* a dedy tensor of size 2 * 3 */
+	/* a dedy tensor of size (2, 3) */
 	int dedyOrder = 2;
 	int * dedyDimSize = new int[dedyOrder];
 	dedyDimSize[0] = 2;
@@ -276,7 +280,7 @@ bool TestHardTanH3()
 	for (int i = 0; i < dedyOrder; i++)
 		dedyUnitNum *= dedyDimSize[i];
-	/* a dedx tensor of size 2 * 3 */
+	/* a dedx tensor of size (2, 3) */
 	int dedxOrder = 2;
 	int * dedxDimSize = new int[dedxOrder];
 	dedxDimSize[0] = 2;
@@ -286,16 +290,16 @@ bool TestHardTanH3()
 	for (int i = 0; i < dedxOrder; i++)
 		dedxUnitNum *= dedxDimSize[i];
-	DTYPE xData[2][3] = { {0.5, -1.0, 2.0},
+	DTYPE xData[2][3] = { {0.5F, -1.0F, 2.0F},
-	                      {3.5, -4.5, 1.0} };
+	                      {3.5F, -4.5F, 1.0F} };
-	DTYPE yData[2][3] = { {0.5, -1.0, 1.0},
+	DTYPE yData[2][3] = { {0.5F, -1.0F, 1.0F},
-	                      {1.0, -1.0, 1.0} };
+	                      {1.0F, -1.0F, 1.0F} };
-	DTYPE goldData[2][3] = { {1.0, 1.0, 1.0},
+	DTYPE goldData[2][3] = { {1.0F, 1.0F, 1.0F},
-	                         {1.0, 1.0, 1.0} };
+	                         {1.0F, 1.0F, 1.0F} };
-	DTYPE dedyData[2][3] = { {-0.5, -2.0, 0.0 },
+	DTYPE dedyData[2][3] = { {-0.5F, -2.0F, 0.0F },
-	                         {0.0, -2.0, 0.0 } };
+	                         {0.0F, -2.0F, 0.0F } };
-	DTYPE answer[2][3] = { {-0.5, -2.0, 0.0},
+	DTYPE answer[2][3] = { {-0.5F, -2.0F, 0.0F},
-	                       {0.0, 0.0, 0.0} };
+	                       {0.0F, 0.0F, 0.0F} };
 	/* CPU test */
 	bool cpuTest = true;
@@ -358,12 +362,13 @@ bool TestHardTanH3()
 #endif // USE_CUDA
 }
-/* case 4: backward computation
+/* 
-* In this case, lossName=ONEHOTERROR.
+case 4: backward computation
+In this case, lossName=ONEHOTERROR.
 */
 bool TestHardTanH4()
 {
-	/* a x tensor of size 2 * 3 */
+	/* a x tensor of size (2, 3) */
 	int xOrder = 2;
 	int * xDimSize = new int[xOrder];
 	xDimSize[0] = 2;
@@ -373,7 +378,7 @@ bool TestHardTanH4()
 	for (int i = 0; i < xOrder; i++)
 		xUnitNum *= xDimSize[i];
-	/* a y tensor of size 2 * 3 */
+	/* a y tensor of size (2, 3) */
 	int yOrder = 2;
 	int * yDimSize = new int[yOrder];
 	yDimSize[0] = 2;
@@ -383,7 +388,7 @@ bool TestHardTanH4()
 	for (int i = 0; i < yOrder; i++)
 		yUnitNum *= yDimSize[i];
-	/* a gold tensor of size 2 * 3 */
+	/* a gold tensor of size (2, 3) */
 	int goldOrder = 2;
 	int * goldDimSize = new int[goldOrder];
 	goldDimSize[0] = 2;
@@ -393,7 +398,7 @@ bool TestHardTanH4()
 	for (int i = 0; i < goldOrder; i++)
 		goldUnitNum *= goldDimSize[i];
-	/* a dedy tensor of size 2 * 3 */
+	/* a dedy tensor of size (2, 3) */
 	int dedyOrder = 2;
 	int * dedyDimSize = new int[dedyOrder];
 	dedyDimSize[0] = 2;
@@ -403,7 +408,7 @@ bool TestHardTanH4()
 	for (int i = 0; i < dedyOrder; i++)
 		dedyUnitNum *= dedyDimSize[i];
-	/* a dedx tensor of size 2 * 3 */
+	/* a dedx tensor of size (2, 3) */
 	int dedxOrder = 2;
 	int * dedxDimSize = new int[dedxOrder];
 	dedxDimSize[0] = 2;
@@ -413,16 +418,16 @@ bool TestHardTanH4()
 	for (int i = 0; i < dedxOrder; i++)
 		dedxUnitNum *= dedxDimSize[i];
-	DTYPE xData[2][3] = { {0.5, -1.0, 2.0},
+	DTYPE xData[2][3] = { {0.5F, -1.0F, 2.0F},
-	                      {3.5, -4.5, 1.0} };
+	                      {3.5F, -4.5F, 1.0F} };
-	DTYPE yData[2][3] = { {0.5, -1.0, 1.0},
+	DTYPE yData[2][3] = { {0.5F, -1.0F, 1.0F},
-	                      {1.0, -1.0, 1.0} };
+	                      {1.0F, -1.0F, 1.0F} };
-	DTYPE goldData[2][3] = { {1.0, 0.0, 1.0},
+	DTYPE goldData[2][3] = { {1.0F, 0.0F, 1.0F},
-	                         {0.0, 1.0, 1.0} };
+	                         {0.0F, 1.0F, 1.0F} };
-	DTYPE dedyData[2][3] = { {-0.5, 0.0, 0.0},
+	DTYPE dedyData[2][3] = { {-0.5F, 0.0F, 0.0F},
-	                         {0.0, -2.0, 0.0} };
+	                         {0.0F, -2.0F, 0.0F} };
-	DTYPE answer[2][3] = { {-0.5, 0.0, 0.0},
+	DTYPE answer[2][3] = { {-0.5F, 0.0F, 0.0F},
-	                       {0.0, 0.0, 0.0} };
+	                       {0.0F, 0.0F, 0.0F} };
 	/* CPU test */
 	bool cpuTest = true;
@@ -491,10 +496,9 @@ TODO!!
 */
 /* test for HardTanH Function */
-extern "C"
 bool TestHardTanH()
 {
-	XPRINT(0, stdout, "[TEST HARDTANH] -------------\n");
+	XPRINT(0, stdout, "[TEST HARDTANH] test hardtanh and its backward computation \n");
 	bool returnFlag = true, caseFlag = true;
 	/* case 1 test */

--- a/source/test/TIdentity.cpp
+++ b/source/test/TIdentity.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-29
+*/
+#include "../XUtility.h"
+#include "TIdentity.h"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+/*
+case 1: test Identity function.
+Identity function: y = x, so the output tensor must reproduce the input data.
+<< return - true if the CPU result (and, when USE_CUDA is defined, the GPU result) 
+            matches the expected data
+*/
+bool TestIdentity1()
+{
+    /* an input tensor of size (2, 3) */
+    int sOrder = 2;
+    int * sDimSize = new int[sOrder];
+    sDimSize[0] = 2;
+    sDimSize[1] = 3;
+    /* number of cells = product of all dimension sizes */
+    int sUnitNum = 1;
+    for (int i = 0; i < sOrder; i++)
+        sUnitNum *= sDimSize[i];
+    DTYPE xData[2][3] = { {0.0F, 1.0F, 2.0F}, 
+                          {0.5F, 0.7F, 1.4F} };
+    DTYPE answer[2][3] = { {0.0F, 1.0F, 2.0F}, 
+                           {0.5F, 0.7F, 1.4F} };
+    /* CPU test */
+    bool cpuTest = true;
+    /* create tensors */
+    XTensor * x = NewTensor(sOrder, sDimSize);
+    XTensor * y = NewTensor(sOrder, sDimSize);
+    /* initialize variables */
+    x->SetData(xData, sUnitNum);
+    y->SetZeroAll();
+    /* call Identity function */
+    Identity(x, y);
+    /* check result */
+    cpuTest = y->CheckData(answer, sUnitNum);
+#ifdef USE_CUDA
+    /* GPU test */
+    bool gpuTest = true;
+    /* create tensors (last argument 0 is presumably the GPU device id) */
+    XTensor * xGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * yGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+    /* initialize variables */
+    xGPU->SetData(xData, sUnitNum);
+    yGPU->SetZeroAll();
+    /* call Identity function */
+    Identity(xGPU, yGPU);
+    /* check result */
+    gpuTest = yGPU->CheckData(answer, sUnitNum);
+    /* destroy variables
+       one delete statement per pointer: "delete x, y;" is parsed as
+       "(delete x), y;" and releases only the first tensor */
+    delete x;
+    delete y;
+    delete xGPU;
+    delete yGPU;
+    delete[] sDimSize;
+    return cpuTest && gpuTest;
+#else
+    /* destroy variables (one delete statement per pointer, see above) */
+    delete x;
+    delete y;
+    delete[] sDimSize;
+    return cpuTest;
+#endif // USE_CUDA
+}
+/*
+case 2: test IdentityBackward function.
+IdentityBackward function: dE/dx = dE/dy * dy/dx = dE/dy
+The forward pass is run first, then the backward pass with the CROSSENTROPY loss;
+only dE/dx is compared with the reference values (tolerance 1e-4).
+NOTE(review): the reference dE/dx equals softmax(x) - g for x = (0, 1, 2) and
+g = (0, 0, 1) -- confirm that this is what IdentityBackward is expected to produce.
+<< return - true if dE/dx matches the reference on the CPU (and on the GPU when USE_CUDA is defined)
+*/
+bool TestIdentity2()
+{
+    /* all tensors have size (1, 3) */
+    int sOrder = 2;
+    int * sDimSize = new int[sOrder];
+    sDimSize[0] = 1;
+    sDimSize[1] = 3;
+    /* number of cells */
+    int sUnitNum = 1;
+    for (int dim = 0; dim < sOrder; dim++)
+        sUnitNum *= sDimSize[dim];
+    /* input data, g data and reference gradient */
+    DTYPE xData[1][3] = { {0.0F, 1.0F, 2.0F} };
+    DTYPE gData[1][3] = { {0.0F, 0.0F, 1.0F} };
+    DTYPE dedxAnswer[3] = {0.090031F, 0.244728F, -0.334759F};
+    /* CPU test: create tensors */
+    XTensor * x = NewTensor(sOrder, sDimSize);
+    XTensor * y = NewTensor(sOrder, sDimSize);
+    XTensor * g = NewTensor(sOrder, sDimSize);
+    XTensor * dedy = NewTensor(sOrder, sDimSize);
+    XTensor * dedx = NewTensor(sOrder, sDimSize);
+    /* load the inputs, clear the outputs */
+    x->SetData(xData, sUnitNum);
+    g->SetData(gData, sUnitNum);
+    y->SetZeroAll();
+    dedx->SetZeroAll();
+    dedy->SetZeroAll();
+    /* forward pass followed by backward pass */
+    Identity(x, y);
+    IdentityBackward(g, y, x, dedy, dedx, CROSSENTROPY);
+    /* check result */
+    bool cpuTest = dedx->CheckData(dedxAnswer, sUnitNum, 1e-4F);
+    bool allTest = cpuTest;
+#ifdef USE_CUDA
+    /* GPU test: create tensors */
+    XTensor * xGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * yGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * gGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * dedyGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * dedxGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+    /* load the inputs, clear the outputs */
+    xGPU->SetData(xData, sUnitNum);
+    gGPU->SetData(gData, sUnitNum);
+    yGPU->SetZeroAll();
+    dedxGPU->SetZeroAll();
+    dedyGPU->SetZeroAll();
+    /* forward pass followed by backward pass */
+    Identity(xGPU, yGPU);
+    IdentityBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, CROSSENTROPY);
+    /* check result */
+    bool gpuTest = dedxGPU->CheckData(dedxAnswer, sUnitNum, 1e-4F);
+    allTest = cpuTest && gpuTest;
+    /* destroy GPU variables */
+    delete xGPU;
+    delete yGPU;
+    delete gGPU;
+    delete dedxGPU;
+    delete dedyGPU;
+#endif // USE_CUDA
+    /* destroy CPU variables */
+    delete x;
+    delete y;
+    delete g;
+    delete dedx;
+    delete dedy;
+    delete[] sDimSize;
+    return allTest;
+}
+/* other cases */
+/*
+    TODO!!
+*/
+/* 
+test for Identity Function
+Runs every Identity test case, printing one pass/fail line per case and an overall verdict.
+<< return - true if all cases passed, false otherwise
+*/
+bool TestIdentity()
+{
+    XPRINT(0, stdout, "[TEST Identity] identity function and its backward computation \n");
+    bool allPassed = true;
+    /* case 1 test */
+    if (TestIdentity1()) {
+        XPRINT(0, stdout, ">> case 1 passed!\n");
+    }
+    else {
+        allPassed = false;
+        XPRINT(0, stdout, ">> case 1 failed!\n");
+    }
+    /* case 2 test */
+    if (TestIdentity2()) {
+        XPRINT(0, stdout, ">> case 2 passed!\n");
+    }
+    else {
+        allPassed = false;
+        XPRINT(0, stdout, ">> case 2 failed!\n");
+    }
+    /* other cases test */
+    /*
+    TODO!!
+    */
+    /* overall verdict */
+    if (!allPassed) {
+        XPRINT(0, stdout, ">> Failed!\n");
+    }
+    else {
+        XPRINT(0, stdout, ">> All Passed!\n");
+    }
+    XPRINT(0, stdout, "\n");
+    return allPassed;
+}
+} // namespace nts(NiuTrans.Tensor)
--- a/source/test/TIdentity.h
+++ b/source/test/TIdentity.h
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-29
+*/
+#ifndef __TEST_IDENTITY_H__
+#define __TEST_IDENTITY_H__
+#include "../function/Identity.h"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+/* test for Identity Function
+   entry point of the Identity test suite, declared with C linkage
+   << return - bool result of the test run */
+extern "C"
+bool TestIdentity();
+} // namespace nts(NiuTrans.Tensor)
+#endif // __TEST_IDENTITY_H__
--- a/source/test/TLogSoftmax.cpp
+++ b/source/test/TLogSoftmax.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-02
+*/
+#include "../XUtility.h"
+#include "TLogSoftmax.h"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+/* 
+case 1: test LogSoftmax function.
+LogSoftmax function: y = log(e^x / \sum_{i} e^{x_i})
+The reference values are the log-softmax of each row of the (2, 3) input,
+compared with a tolerance of 1e-4.
+<< return - true if the CPU result (and, when USE_CUDA is defined, the GPU result) 
+            matches the reference
+*/
+bool TestLogSoftmax1()
+{
+    /* an input tensor of size (2, 3) */
+    int sOrder = 2;
+    int * sDimSize = new int[sOrder];
+    sDimSize[0] = 2;
+    sDimSize[1] = 3;
+    /* number of cells = product of all dimension sizes */
+    int sUnitNum = 1;
+    for (int i = 0; i < sOrder; i++)
+        sUnitNum *= sDimSize[i];
+    DTYPE xData[2][3] = { {0.0F, 1.0F, 2.0F}, 
+                          {0.5F, 0.7F, 1.4F} };
+    DTYPE answer[2][3] = { {-2.4076F, -1.4076F, -0.4076F}, 
+                           {-1.5435F, -1.3435F, -0.6435F} };
+    /* CPU test */
+    bool cpuTest = true;
+    /* create tensors */
+    XTensor * x = NewTensor(sOrder, sDimSize);
+    XTensor * y = NewTensor(sOrder, sDimSize);
+    /* initialize variables */
+    x->SetData(xData, sUnitNum);
+    y->SetZeroAll();
+    /* call LogSoftmax function (third argument 1: presumably the dimension to normalize over) */
+    LogSoftmax(x, y, 1);
+    /* check result */
+    cpuTest = y->CheckData(answer, sUnitNum, 1e-4F);
+#ifdef USE_CUDA
+    /* GPU test */
+    bool gpuTest = true;
+    /* create tensors */
+    XTensor * xGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * yGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+    /* initialize variables */
+    xGPU->SetData(xData, sUnitNum);
+    yGPU->SetZeroAll();
+    /* call LogSoftmax function */
+    LogSoftmax(xGPU, yGPU, 1);
+    /* check result */
+    gpuTest = yGPU->CheckData(answer, sUnitNum, 1e-4F);
+    /* destroy variables */
+    delete x;
+    delete y;
+    delete xGPU;
+    delete yGPU;
+    delete[] sDimSize;
+    return cpuTest && gpuTest;
+#else
+    /* destroy variables
+       only x and y exist in this function; there is no tensor z to release */
+    delete x;
+    delete y;
+    delete[] sDimSize;
+    return cpuTest;
+#endif // USE_CUDA
+}
+/* 
+case 2: test LogSoftmaxBackward function on a vector of size (3).
+dE/dx = dE/dy * dy/dx
+log softmax: y_i = log(e^{x_i} / \sum_{k} e^{x_k})
+Both the forward output y and the gradient dE/dx are compared with reference
+values using a tolerance of 1e-4.
+<< return - true if both tensors match on the CPU (and on the GPU when USE_CUDA is defined)
+*/
+bool TestLogSoftmax2()
+{
+    /* an input tensor of size (3) */
+    int sOrder = 1;
+    int * sDimSize = new int[sOrder];
+    sDimSize[0] = 3;
+    /* number of cells */
+    int sUnitNum = 1;
+    for (int dim = 0; dim < sOrder; dim++)
+        sUnitNum *= sDimSize[dim];
+    /* input data, g data and reference results */
+    DTYPE xData[3] = {0.0F, 1.0F, 2.0F};
+    DTYPE gData[3] = {0.5F, 0.8F, 1.5F};
+    DTYPE yAnswer[3] = {-2.4076F, -1.4076F, -0.4076F};
+    DTYPE dedxAnswer[3] = {-0.409969F, -0.555272F, -0.834759F};
+    /* CPU test: create tensors */
+    XTensor * x = NewTensor(sOrder, sDimSize);
+    XTensor * y = NewTensor(sOrder, sDimSize);
+    XTensor * g = NewTensor(sOrder, sDimSize);
+    XTensor * dedy = NewTensor(sOrder, sDimSize);
+    XTensor * dedx = NewTensor(sOrder, sDimSize);
+    /* load the inputs, clear the outputs */
+    x->SetData(xData, sUnitNum);
+    g->SetData(gData, sUnitNum);
+    y->SetZeroAll();
+    dedx->SetZeroAll();
+    dedy->SetZeroAll();
+    /* forward pass followed by backward pass */
+    LogSoftmax(x, y, 0);
+    LogSoftmaxBackward(g, y, x, dedy, dedx, 0, CROSSENTROPY);
+    /* check result */
+    bool cpuTest = y->CheckData(yAnswer, sUnitNum, 1e-4F) &&
+                   dedx->CheckData(dedxAnswer, sUnitNum, 1e-4F);
+    bool allTest = cpuTest;
+#ifdef USE_CUDA
+    /* GPU test: create tensors */
+    XTensor * xGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * yGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * gGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * dedyGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * dedxGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+    /* load the inputs, clear the outputs */
+    xGPU->SetData(xData, sUnitNum);
+    gGPU->SetData(gData, sUnitNum);
+    yGPU->SetZeroAll();
+    dedxGPU->SetZeroAll();
+    dedyGPU->SetZeroAll();
+    /* forward pass followed by backward pass */
+    LogSoftmax(xGPU, yGPU, 0);
+    LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 0, CROSSENTROPY);
+    /* check result */
+    bool gpuTest = yGPU->CheckData(yAnswer, sUnitNum, 1e-4F) &&
+                   dedxGPU->CheckData(dedxAnswer, sUnitNum, 1e-4F);
+    allTest = cpuTest && gpuTest;
+    /* destroy GPU variables */
+    delete xGPU;
+    delete yGPU;
+    delete gGPU;
+    delete dedxGPU;
+    delete dedyGPU;
+#endif // USE_CUDA
+    /* destroy CPU variables */
+    delete x;
+    delete y;
+    delete g;
+    delete dedx;
+    delete dedy;
+    delete[] sDimSize;
+    return allTest;
+}
+/* 
+case 3: test LogSoftmaxBackward function on a matrix of size (1, 3).
+dE/dx = dE/dy * dy/dx
+log softmax: y_i = log(e^{x_i} / \sum_{k} e^{x_k})
+Same data as case 2, but stored as a 2d tensor and processed with 1 as the
+dimension argument; y and dE/dx are compared with a tolerance of 1e-4.
+<< return - true if both tensors match on the CPU (and on the GPU when USE_CUDA is defined)
+*/
+bool TestLogSoftmax3()
+{
+    /* a tensor of size (1, 3) */
+    int sOrder = 2;
+    int * sDimSize = new int[sOrder];
+    sDimSize[0] = 1;
+    sDimSize[1] = 3;
+    /* number of cells */
+    int sUnitNum = 1;
+    for (int dim = 0; dim < sOrder; dim++)
+        sUnitNum *= sDimSize[dim];
+    /* input data, g data and reference results */
+    DTYPE xData[1][3] = { {0.0F, 1.0F, 2.0F} };
+    DTYPE gData[1][3] = { {0.5F, 0.8F, 1.5F} };
+    DTYPE yAnswer[1][3] = { {-2.4076F, -1.4076F, -0.4076F} };
+    DTYPE dedxAnswer[1][3] = { {-0.409969F, -0.555272F, -0.834759F} };
+    /* CPU test: create tensors */
+    XTensor * x = NewTensor(sOrder, sDimSize);
+    XTensor * y = NewTensor(sOrder, sDimSize);
+    XTensor * g = NewTensor(sOrder, sDimSize);
+    XTensor * dedy = NewTensor(sOrder, sDimSize);
+    XTensor * dedx = NewTensor(sOrder, sDimSize);
+    /* load the inputs, clear the outputs */
+    x->SetData(xData, sUnitNum);
+    g->SetData(gData, sUnitNum);
+    y->SetZeroAll();
+    dedx->SetZeroAll();
+    dedy->SetZeroAll();
+    /* forward pass followed by backward pass */
+    LogSoftmax(x, y, 1);
+    LogSoftmaxBackward(g, y, x, dedy, dedx, 1, CROSSENTROPY);
+    /* check result */
+    bool cpuTest = y->CheckData(yAnswer, sUnitNum, 1e-4F) &&
+                   dedx->CheckData(dedxAnswer, sUnitNum, 1e-4F);
+    bool allTest = cpuTest;
+#ifdef USE_CUDA
+    /* GPU test: create tensors */
+    XTensor * xGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * yGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * gGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * dedyGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * dedxGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+    /* load the inputs, clear the outputs */
+    xGPU->SetData(xData, sUnitNum);
+    gGPU->SetData(gData, sUnitNum);
+    yGPU->SetZeroAll();
+    dedxGPU->SetZeroAll();
+    dedyGPU->SetZeroAll();
+    /* forward pass followed by backward pass */
+    LogSoftmax(xGPU, yGPU, 1);
+    LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 1, CROSSENTROPY);
+    /* check result */
+    bool gpuTest = yGPU->CheckData(yAnswer, sUnitNum, 1e-4F) &&
+                   dedxGPU->CheckData(dedxAnswer, sUnitNum, 1e-4F);
+    allTest = cpuTest && gpuTest;
+    /* destroy GPU variables */
+    delete xGPU;
+    delete yGPU;
+    delete gGPU;
+    delete dedxGPU;
+    delete dedyGPU;
+#endif // USE_CUDA
+    /* destroy CPU variables */
+    delete x;
+    delete y;
+    delete g;
+    delete dedx;
+    delete dedy;
+    delete[] sDimSize;
+    return allTest;
+}
+/* other cases */
+/*
+    TODO!!
+*/
+/* 
+test for LogSoftmax Function
+Runs every LogSoftmax test case, printing one pass/fail line per case and an overall verdict.
+<< return - true if all cases passed, false otherwise
+*/
+bool TestLogSoftmax()
+{
+    XPRINT(0, stdout, "[TEST LogSoftmax] test log softmax function and its backward computation \n");
+    bool allPassed = true;
+    /* case 1 test */
+    if (TestLogSoftmax1()) {
+        XPRINT(0, stdout, ">> case 1 passed!\n");
+    }
+    else {
+        allPassed = false;
+        XPRINT(0, stdout, ">> case 1 failed!\n");
+    }
+    /* case 2 test */
+    if (TestLogSoftmax2()) {
+        XPRINT(0, stdout, ">> case 2 passed!\n");
+    }
+    else {
+        allPassed = false;
+        XPRINT(0, stdout, ">> case 2 failed!\n");
+    }
+    /* case 3 test */
+    if (TestLogSoftmax3()) {
+        XPRINT(0, stdout, ">> case 3 passed!\n");
+    }
+    else {
+        allPassed = false;
+        XPRINT(0, stdout, ">> case 3 failed!\n");
+    }
+    /* other cases test */
+    /*
+    TODO!!
+    */
+    /* overall verdict */
+    if (!allPassed) {
+        XPRINT(0, stdout, ">> Failed!\n");
+    }
+    else {
+        XPRINT(0, stdout, ">> All Passed!\n");
+    }
+    XPRINT(0, stdout, "\n");
+    return allPassed;
+}
+} // namespace nts(NiuTrans.Tensor)
--- a/source/test/TLogSoftmax.h
+++ b/source/test/TLogSoftmax.h
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-02
+*/
+#ifndef __TEST_LOGSOFTMAX_H__
+#define __TEST_LOGSOFTMAX_H__
+#include "../function/LogSoftmax.h"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+/* test for LogSoftmax Function
+   entry point of the LogSoftmax test suite, declared with C linkage
+   << return - bool result of the test run */
+extern "C"
+bool TestLogSoftmax();
+} // namespace nts(NiuTrans.Tensor)
+#endif // __TEST_LOGSOFTMAX_H__
--- a/source/test/TLoss.cpp
+++ b/source/test/TLoss.cpp
@@ -19,91 +19,244 @@
 * $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2018-04-30
 */
-#include "../XTensor.h"
+#include "../core/math/ScaleAndShift.h"
-#include "../XDevice.h"
 #include "../function/Loss.h"
+namespace nts { // namespace nts(NiuTrans.Tensor)
-namespace nts { // namespace nt(NiuTrans.Tensor)
+/* 
-/* case 1 */
+case 1: test LossCompute function 
+In this case, Loss function name = SQUAREDERROR.
+loss = sum_{i} 0.5*(t_i - y_i)^2, 
+where t_i is the gold standard and y_i is the model output
+*/
 bool TestLoss1()
 {
-    /* a tensor of size 10000 * 1 */
+    /* a tensor of size (10, 1)
+       NOTE(review): unitNum below is computed but never read in this function */
+    int order = 2;
+    int * dimSize = new int[order];
+    dimSize[0] = 10;
+    dimSize[1] = 1;
+    int unitNum = 1;
+    for (int i = 0; i < order; i++)
+        unitNum *= dimSize[i];
+    /* CPU test
+       expected loss: 10 cells * 0.5 * (2 - 1)^2 = 5, assuming the setup below
+       yields output = 1 and gold = 2 in every cell -- TODO confirm */
+    bool cpuTest = true;
+    DTYPE answer = 5.0F;
+    /* create tensors */
+    XTensor * output = NewTensor(order, dimSize);
+    XTensor * gold = NewTensor(order, dimSize);
+    /* initialize variables: zero both tensors, then apply ScaleAndShift with
+       arguments (1, 1) to output and (1, 2) to gold -- presumably scale and shift */
+    output->SetZeroAll();
+    gold->SetZeroAll();
+    ScaleAndShift(output, 1, 1);
+    ScaleAndShift(gold, 1, 2);
+    DTYPE error;
+    error = LossCompute(gold, output, SQUAREDERROR, false, 0, 0, dimSize[0], 0);
+    /* check results (exact floating-point comparison with the expected loss) */
+    cpuTest = (error == answer);
+#ifdef USE_CUDA
+    /* GPU test */
+    bool gpuTest = true;
+    /* create tensor */
+    XTensor * outputGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+    XTensor * goldGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+    /* Initialize variables (same setup as the CPU test) */
+    outputGPU->SetZeroAll();
+    goldGPU->SetZeroAll();
+    ScaleAndShift(outputGPU, 1, 1);
+    ScaleAndShift(goldGPU, 1, 2);
+    /* call LossCompute function */
+    error = LossCompute(goldGPU, outputGPU, SQUAREDERROR, false, 0, 0, dimSize[0], 0);
+    /* check results (exact floating-point comparison with the expected loss) */
+    gpuTest = (error == answer);
+    /* destroy variables */
+    delete output;
+    delete gold;
+    delete outputGPU;
+    delete goldGPU;
+    delete[] dimSize;
+    return cpuTest && gpuTest;
+#else
+    /* destroy variables */
+    delete output;
+    delete gold;
+    delete[] dimSize;
+    return cpuTest;
+#endif // USE_CUDA
+}
+/* 
+case 2: test LossCompute function 
+In this case, Loss function name = CROSSENTROPY.
+loss = sum_{i} (-t_i * log(y_i))
+where t_i is the gold standard and y_i is the model output
+The computed loss is compared exactly with the expected value 0.
+<< return - true if the loss matches on the CPU (and on the GPU when USE_CUDA is defined)
+*/
+bool TestLoss2()
+{
+    /* a tensor of size (10, 1) */
+    int order = 2;
+    int * dimSize = new int[order];
+    dimSize[0] = 10;
+    dimSize[1] = 1;
+    /* expected loss */
+    DTYPE answer = 0.0F;
+    /* CPU test: create tensors */
+    XTensor * output = NewTensor(order, dimSize);
+    XTensor * gold = NewTensor(order, dimSize);
+    /* initialize variables: start from zero, then apply ScaleAndShift
+       (presumably giving output = 1 and gold = 2 in every cell) */
+    output->SetZeroAll();
+    gold->SetZeroAll();
+    ScaleAndShift(output, 1, 1);
+    ScaleAndShift(gold, 1, 2);
+    /* call LossCompute function */
+    DTYPE cpuError = LossCompute(gold, output, CROSSENTROPY, false, 0, 0, dimSize[0], 0);
+    /* check results */
+    bool cpuTest = (cpuError == answer);
+    bool allTest = cpuTest;
+#ifdef USE_CUDA
+    /* GPU test: create tensors */
+    XTensor * outputGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+    XTensor * goldGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
+    /* initialize variables (same setup as the CPU test) */
+    outputGPU->SetZeroAll();
+    goldGPU->SetZeroAll();
+    ScaleAndShift(outputGPU, 1, 1);
+    ScaleAndShift(goldGPU, 1, 2);
+    /* call LossCompute function */
+    DTYPE gpuError = LossCompute(goldGPU, outputGPU, CROSSENTROPY, false, 0, 0, dimSize[0], 0);
+    /* check results */
+    bool gpuTest = (gpuError == answer);
+    allTest = cpuTest && gpuTest;
+    /* destroy GPU variables */
+    delete outputGPU;
+    delete goldGPU;
+#endif // USE_CUDA
+    /* destroy CPU variables */
+    delete output;
+    delete gold;
+    delete[] dimSize;
+    return allTest;
+}
+/* 
+case 3: test LossCompute function 
+In this case, Loss function name = ONEHOTERROR.
+loss = sum_{i} e_i
+where e_i = 0.5*(t_i - y_i)^2 if t_i = 1, e_i = 0 otherwise
+*/
+bool TestLoss3()
+{
+    /* a tensor of size (5, 1) */
    int order = 2;
-    int order_reduce = 1;
    int * dimSize = new int[order];
-    dimSize[0] = 10000;
+    dimSize[0] = 5;
    dimSize[1] = 1;
    int unitNum = 1;
    for (int i = 0; i < order; i++)
        unitNum *= dimSize[i];
+    DTYPE outputData[5][1] = { {0.5F},
+                               {0.5F},
+                               {0.5F},
+                               {0.5F},
+                               {0.5F} };
+    DTYPE goldData[5][1] = { {1.0F},
+                             {1.0F},
+                             {0.0F},
+                             {0.0F},
+                             {0.0F} };
    /* CPU test */
    bool cpuTest = true;
+    DTYPE answer = 0.25F;
    /* create tensors */
-    XTensor * a = NewTensor(order, dimSize);
+    XTensor * output = NewTensor(order, dimSize);
-    XTensor * b = NewTensor(order, dimSize);
+    XTensor * gold = NewTensor(order, dimSize);
    /* initialize variables */
-    DTYPE* a_data = (DTYPE*)a->data;
+    output->SetData(outputData, unitNum);
-    for (int i = 0; i < unitNum; i++)
+    gold->SetData(goldData, unitNum);
-        *a_data++ = 1;
-    DTYPE* b_data = (DTYPE*)b->data;
+    DTYPE error;
-    for (int i = 0; i < unitNum; i++)
+    error = LossCompute(gold, output, ONEHOTERROR, false, 0, 0, dimSize[0], 0);
-        *b_data++ = 1;
-    DTYPE error = 0.0F;
-    error = LossCompute(a, b, SQUAREDERROR, false, 1, 0, dimSize[0], 0);
-    printf("%d", error);
-    /* call reduce max function */
-    //ReduceMax(a, reduce_a, 0);
-    //ReduceMax(b, reduce_b, 1);
-    //DTYPE* reduce_a_data = (DTYPE*)reduce_a->data;
-    //for (int i = 0; i < unitNum_a; i++)
-    //    printf("%f ", *reduce_a_data++);
-    //printf("\n");
-    //DTYPE* reduce_b_data = (DTYPE*)reduce_b->data;
-    //for (int i = 0; i < unitNum_b; i++)
-    //    printf("%f ", *reduce_b_data++);
    /* check results */
-    cpuTest = true;
+    cpuTest = (error == answer);
 #ifdef USE_CUDA
    /* GPU test */
    bool gpuTest = true;
    /* create tensor */
-    XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT);
+    XTensor * outputGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
-    XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT);
+    XTensor * goldGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
    /* Initialize variables */
-    DTYPE* aGPU_data = (DTYPE*)aGPU->data;
+    outputGPU->SetData(outputData, unitNum);
-    for (int i = 0; i < unitNum; i++)
+    goldGPU->SetData(goldData, unitNum);
-        *aGPU_data++ = 1;
-    DTYPE* bGPU_data = (DTYPE*)bGPU->data;
+    /* call LossCompute function */
-    for (int i = 0; i < unitNum; i++)
+    error = LossCompute(goldGPU, outputGPU, ONEHOTERROR, false, 0, 0, dimSize[0], 0);
-        *bGPU_data++ = 1;
-    error = LossCompute(a, b, SQUAREDERROR, false, 1, 0, dimSize[0], 0);
-    printf("%d", error);
-    /* call reduce max function */
-    //ReduceMax(aGPU, reduce_aGPU, 0);
-    //ReduceMax(bGPU, reduce_bGPU, 1);
    /* check results */
-    gpuTest = true;
+    gpuTest = (error == answer);
    /* destroy variables */
-    delete aGPU, bGPU;
+    delete output;
+    delete gold;
+    delete outputGPU;
+    delete goldGPU;
    delete[] dimSize;
    return cpuTest && gpuTest;
 #else
    /* destroy variables */
-    delete a;
+    delete output;
-    delete b;
+    delete gold;
+    delete[] dimSize;
    return cpuTest;
 #endif // USE_CUDA
 }
@@ -113,11 +266,10 @@ bool TestLoss1()
 TODO!!
 */
-/* test for Sum Function */
+/* test for Loss Function */
-extern "C"
+bool TestLoss()
-    bool TestLoss()
 {
-    XPRINT(0, stdout, "[TEST Loss]\n");
+    XPRINT(0, stdout, "[TEST Loss] compute the loss \n");
    bool returnFlag = true, caseFlag = true;
    /* case 1 test */
@@ -129,6 +281,23 @@ extern "C"
    else
        XPRINT(0, stdout, ">> case 1 passed!\n");
+    /* case 2 test */
+    caseFlag = TestLoss2();
+    if (!caseFlag) {
+        returnFlag = false;
+        XPRINT(0, stdout, ">> case 2 failed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> case 2 passed!\n");
+    caseFlag = TestLoss3();
+    if (!caseFlag) {
+        returnFlag = false;
+        XPRINT(0, stdout, ">> case 3 failed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> case 3 passed!\n");
    ///* other cases test */
    ///*
    //TODO!!
@@ -145,4 +314,4 @@ extern "C"
    return returnFlag;
 }
-} // namespace nt(NiuTrans.Tensor)
+} // namespace nts(NiuTrans.Tensor)
--- a/source/test/TLoss.h
+++ b/source/test/TLoss.h
@@ -26,9 +26,9 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* test for Sum Function */
+/* test for Loss Function */
 extern "C"
 bool TestLoss();
 } // namespace nts(NiuTrans.Tensor)
-#endif // __TEST_SUM_H__
+#endif // __TEST_LOSS_H__
--- a/source/test/TMatrixMULBatchedCPU.cpp
+++ b/source/test/TMatrixMULBatchedCPU.cpp
@@ -19,14 +19,13 @@
 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-15
 */
-#include "../XTensor.h"
 #include "TMatrixMULBatchedCPU.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* case 1: matrix multiplication in batch mode (CPU code). 
-* In this case, aList=2*(2, 3), bList=2*(2, 3) -> c=2*(2, 2), 
+/* 
-  transposedA=X_NOTRANS, transposedB=X_NOTRANS.
+case 1: matrix multiplication in batch mode (CPU code). 
+In this case, aList=2*(2, 3), bList=2*(3, 2) -> c=2*(2, 2), transposedA=X_NOTRANS, transposedB=X_NOTRANS.
 */
 bool TestMatrixMulBatchedCPU1()
 {
@@ -65,20 +64,20 @@ bool TestMatrixMulBatchedCPU1()
    for (int i = 0; i < cOrder; i++)
        cUnitNum *= cDimSize[i];
-    DTYPE aData1[2][3] = { {1.0, 2.0, 3.0},
+    DTYPE aData1[2][3] = { {1.0F, 2.0F, 3.0F},
-                           {-4.0, 5.0, 6.0} };
+                           {-4.0F, 5.0F, 6.0F} };
-    DTYPE aData2[2][3] = { {1.0, -2.0, -3.0},
+    DTYPE aData2[2][3] = { {1.0F, -2.0F, -3.0F},
-                           {-4.0, 3.0, 2.0} };
+                           {-4.0F, 3.0F, 2.0F} };
-    DTYPE bData1[3][2] = { {0.0, -1.0},
+    DTYPE bData1[3][2] = { {0.0F, -1.0F},
-                           {1.0, 2.0}, 
+                           {1.0F, 2.0F}, 
-                           {2.0, 1.0} };
+                           {2.0F, 1.0F} };
-    DTYPE bData2[3][2] = { {0.0, 1.0},
+    DTYPE bData2[3][2] = { {0.0F, 1.0F},
-                           {3.0, 2.0}, 
+                           {3.0F, 2.0F}, 
-                           {2.0, 1.0} };
+                           {2.0F, 1.0F} };
-    DTYPE answer1[2][2] = { {8.0, 6.0}, 
+    DTYPE answer1[2][2] = { {8.0F, 6.0F}, 
-                            {17.0, 20.0} };
+                            {17.0F, 20.0F} };
-    DTYPE answer2[2][2] = { {-12.0, -6.0}, 
+    DTYPE answer2[2][2] = { {-12.0F, -6.0F}, 
-                            {13.0, 4.0} };
+                            {13.0F, 4.0F} };
    /* CPU test */
    bool cpuTest = true;
@@ -111,18 +110,12 @@ bool TestMatrixMulBatchedCPU1()
    MatrixMULBatchedCPU(aList, X_NOTRANS, bList, X_NOTRANS, cList);
    /* check results */
-    cpuTest = c1->CheckData(answer1, cUnitNum) && cpuTest;
+    cpuTest = c1->CheckData(answer1, cUnitNum) && c2->CheckData(answer2, cUnitNum);
-    cpuTest = c2->CheckData(answer2, cUnitNum) && cpuTest;
 #ifdef USE_CUDA
    /* GPU test */
    bool gpuTest = true;
-    /* clear list */
-    aList->Clear();
-    bList->Clear();
-    cList->Clear();
    /* create tensors */
    XTensor * aGPU1 = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
    XTensor * aGPU2 = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
@@ -139,31 +132,55 @@ bool TestMatrixMulBatchedCPU1()
    cGPU1->SetZeroAll();
    cGPU2->SetZeroAll();
+    /* clear list */
+    aList->Clear();
+    bList->Clear();
+    cList->Clear();
    /* add tensors to list */
-    aList->Add(a1);
+    aList->Add(aGPU1);
-    aList->Add(a2);
+    aList->Add(aGPU2);
-    bList->Add(b1);
+    bList->Add(bGPU1);
-    bList->Add(b2);
+    bList->Add(bGPU2);
-    cList->Add(c1);
+    cList->Add(cGPU1);
-    cList->Add(c2);
+    cList->Add(cGPU2);
    /* call MatrixMULBatchedCPU function */
    MatrixMULBatchedCPU(aList, X_NOTRANS, bList, X_NOTRANS, cList);
    /* check results */
-    gpuTest = c1->CheckData(answer1, cUnitNum) && gpuTest;
+    gpuTest = cGPU1->CheckData(answer1, cUnitNum) && gpuTest;
-    gpuTest = c2->CheckData(answer2, cUnitNum) && gpuTest;
+    gpuTest = cGPU2->CheckData(answer2, cUnitNum) && gpuTest;
    /* destroy variables */
-    delete a1, a2, b1, b2, c1, c2;
+    delete a1;
-    delete aGPU1, aGPU2, bGPU1, bGPU2, cGPU1, cGPU2;
+    delete a2;
-    delete[] aDimSize, bDimSize, cDimSize;
+    delete b1;
+    delete b2;
+    delete c1;
+    delete c2;
+    delete aGPU1;
+    delete aGPU2;
+    delete bGPU1;
+    delete bGPU2;
+    delete cGPU1;
+    delete cGPU2;
+    delete[] aDimSize;
+    delete[] bDimSize;
+    delete[] cDimSize;
    return cpuTest && gpuTest;
 #else
    /* destroy variables */
-    delete a1, a2, b1, b2, c1, c2;
+    delete a1;
-    delete[] aDimSize, bDimSize, cDimSize;
+    delete a2;
+    delete b1;
+    delete b2;
+    delete c1;
+    delete c2;
+    delete[] aDimSize;
+    delete[] bDimSize;
+    delete[] cDimSize;
    return cpuTest;
 #endif // USE_CUDA
@@ -178,7 +195,7 @@ bool TestMatrixMulBatchedCPU1()
 extern "C"
 bool TestMatrixMulBatchedCPU()
 {
-    XPRINT(0, stdout, "[TEST MATRIXMULBATCHEDCPU] -------------\n");
+    XPRINT(0, stdout, "[TEST MATRIXMULBATCHEDCPU] matrix multiplication in batch mode (CPU code) \n");
    bool returnFlag = true, caseFlag = true;
    /* case 1 test */
@@ -191,15 +208,6 @@ bool TestMatrixMulBatchedCPU()
    else
        XPRINT(0, stdout, ">> case 1 passed!\n");
-    ///* case 2 test */
-    //caseFlag = TestMatrixMulBatchedCPU2();
-    //if (!caseFlag) {
-    //    returnFlag = false;
-    //    XPRINT(0, stdout, ">> case 2 failed!\n");
-    //}
-    //else
-    //    XPRINT(0, stdout, ">> case 2 passed!\n");
    /* other cases test */
    /*
    TODO!!

--- a/source/test/TMatrixMULBatchedCPU.h
+++ b/source/test/TMatrixMULBatchedCPU.h
@@ -22,7 +22,7 @@
 #ifndef __TEST_MATRIXMULBATCHEDCPU_H__
 #define __TEST_MATRIXMULBATCHEDCPU_H__
-#include "../core/MatrixMULBatchedCPU.h"
+#include "../core/arithmetic/MatrixMULBatchedCPU.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/test/TMatrixMul.cpp
+++ b/source/test/TMatrixMul.cpp
@@ -19,14 +19,14 @@
 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-14
 */
-#include "../XTensor.h"
 #include "TMatrixMul.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* case 1: matrix multiplication. 
-* In this case, a=(2, 3), b=(3, 2) -> c=(2, 2), transposedA=X_NOTRANS,
+/* 
-  transposedB=X_NOTRANS.
+case 1: matrix multiplication. 
+In this case, a=(2, 3), b=(3, 2) -> c=(2, 2), 
+transposedA=X_NOTRANS, transposedB=X_NOTRANS.
 */
 bool TestMatrixMul1()
 {
@@ -60,13 +60,13 @@ bool TestMatrixMul1()
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];
-    DTYPE sData1[2][3] = { {1.0, 2.0, 3.0},
+    DTYPE sData1[2][3] = { {1.0F, 2.0F, 3.0F},
-                           {-4.0, 5.0, 6.0} };
+                           {-4.0F, 5.0F, 6.0F} };
-    DTYPE sData2[3][2] = { {0.0, -1.0},
+    DTYPE sData2[3][2] = { {0.0F, -1.0F},
-                           {1.0, 2.0}, 
+                           {1.0F, 2.0F}, 
-                           {2.0, 1.0} };
+                           {2.0F, 1.0F} };
-    DTYPE answer[2][2] = { {8.0, 6.0}, 
+    DTYPE answer[2][2] = { {8.0F, 6.0F}, 
-                           {17.0, 20.0} };
+                           {17.0F, 20.0F} };
    /* CPU test */
    bool cpuTest = true;
@@ -108,22 +108,34 @@ bool TestMatrixMul1()
    gpuTest = tGPU->CheckData(answer, tUnitNum);
    /* destroy variables */
-    delete s1, s2, t, sGPU1, sGPU2, tGPU;
+    delete s1;
-    delete[] sDimSize1, sDimSize2, tDimSize;
+    delete s2;
+    delete t;
+    delete sGPU1;
+    delete sGPU2;
+    delete tGPU;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;
    return cpuTest && gpuTest;
 #else
    /* destroy variables */
-    delete s1, s2, t;
+    delete s1;
-    delete[] sDimSize1, sDimSize2, tDimSize;
+    delete s2;
+    delete t;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;
    return cpuTest;
 #endif // USE_CUDA
 }
-/* case 2: matrix multiplication. 
+/* 
-* In this case, a=(3, 2), b=(3, 2) -> c=(2, 2), transposedA=X_TRANS,
+case 2: matrix multiplication. 
-  transposedB=X_NOTRANS.
+In this case, a=(3, 2), b=(3, 2) -> c=(2, 2), 
+ transposedA=X_TRANS, transposedB=X_NOTRANS.
 */
 bool TestMatrixMul2()
 {
@@ -137,7 +149,7 @@ bool TestMatrixMul2()
    for (int i = 0; i < sOrder1; i++)
        sUnitNum1 *= sDimSize1[i];
-    /* a source tensor of size (2, 3) */
+    /* a source tensor of size (3, 2) */
    int sOrder2 = 2;
    int * sDimSize2 = new int[sOrder2];
    sDimSize2[0] = 3;
@@ -157,14 +169,14 @@ bool TestMatrixMul2()
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];
-    DTYPE sData1[3][2] = { {1.0, -4.0},
+    DTYPE sData1[3][2] = { {1.0F, -4.0F},
-                           {2.0, 5.0},
+                           {2.0F, 5.0F},
-                           {3.0, 6.0} };
+                           {3.0F, 6.0F} };
-    DTYPE sData2[3][2] = { {0.0, -1.0},
+    DTYPE sData2[3][2] = { {0.0F, -1.0F},
-                           {1.0, 2.0},
+                           {1.0F, 2.0F},
-                           {2.0, 1.0} };
+                           {2.0F, 1.0F} };
-    DTYPE answer[2][2] = { {8.0, 6.0},
+    DTYPE answer[2][2] = { {8.0F, 6.0F},
-                           {17.0, 20.0} };
+                           {17.0F, 20.0F} };
    /* CPU test */
    bool cpuTest = true;
@@ -206,22 +218,34 @@ bool TestMatrixMul2()
    gpuTest = tGPU->CheckData(answer, tUnitNum);
    /* destroy variables */
-    delete s1, s2, t, sGPU1, sGPU2, tGPU;
+    delete s1;
-    delete[] sDimSize1, sDimSize2, tDimSize;
+    delete s2;
+    delete t;
+    delete sGPU1;
+    delete sGPU2;
+    delete tGPU;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;
    return cpuTest && gpuTest;
 #else
    /* destroy variables */
-    delete s1, s2, t;
+    delete s1;
-    delete[] sDimSize1, sDimSize2, tDimSize;
+    delete s2;
+    delete t;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;
    return cpuTest;
 #endif // USE_CUDA
 }
-/* case 3: matrix multiplication. 
+/* 
-* In this case, a=(3, 2, 3), b=(2, 3, 2) -> c=(3, 2, 2, 2), transposedA=X_NOTRANS,
+case 3: matrix multiplication. 
-  transposedB=X_NOTRANS.
+In this case, a=(3, 2, 3), b=(2, 3, 2) -> c=(3, 2, 2, 2), 
+transposedA=X_NOTRANS, transposedB=X_NOTRANS.
 */
 bool TestMatrixMul3()
 {
@@ -259,20 +283,30 @@ bool TestMatrixMul3()
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];
-    DTYPE sData1[3][2][3] = { { {0.0, -1.0, 2.0},
+    DTYPE sData1[3][2][3] = { { {0.0F, -1.0F, 2.0F},
-                                {2.0, 1.0, 3.0} },
+                                {2.0F, 1.0F, 3.0F} },
-                              { {1.0, 2.0, 4.0}, 
+                              { {1.0F, 2.0F, 4.0F}, 
-                                {3.0, 1.0, 2.0}},
+                                {3.0F, 1.0F, 2.0F}},
-                              { {-1.0, 3.0, 2.0}, 
+                              { {-1.0F, 3.0F, 2.0F}, 
-                                {1.0, -1.0, 0.0} } };
+                                {1.0F, -1.0F, 0.0F} } };
-    DTYPE sData2[2][3][2] = { { {1.0, 2.0},
+    DTYPE sData2[2][3][2] = { { {1.0F, 2.0F},
-                                {-4.0, 3.0},
+                                {-4.0F, 3.0F},
-                                {2.0, 6.0} },
+                                {2.0F, 6.0F} },
-                              { {1.0, 2.0},
+                              { {1.0F, 2.0F},
-                                {-4.0, 3.0},
+                                {3.0F, 4.0F},
-                                {2.0, 6.0} } };
+                                {5.0F, 6.0F} } };
-    DTYPE answer[2][2] = { {8.0, 6.0}, 
+    DTYPE answer[3][2][2][2] = { { { {8.0F, 9.0F}, 
-                           {17.0, 20.0} };
+                                     {4.0F, 25.0F} },
+                                   { {7.0F, 8.0F},
+                                     {20.0F, 26.0F} } },
+                                 { { {1.0F, 32.0F},
+                                     {3.0F, 21.0F} },
+                                   { {27.0F, 34.0F}, 
+                                     {16.0F, 22.0F} } },
+                                 { { {-9.0F, 19.0F},
+                                     {5.0F, -1.0F} },
+                                   { {18.0F, 22.0F}, 
+                                     {-2.0F, -2.0F} } } };
    /* CPU test */
    bool cpuTest = true;
@@ -290,17 +324,124 @@ bool TestMatrixMul3()
    /* call MatrixMul function */
    MatrixMul(s1, X_NOTRANS, s2, X_NOTRANS, t);
-    XPRINT(0, stdout, "\ntarget data\n[");
+    /* check results */
-	DTYPE* check_data = (DTYPE*)t->data;
+    cpuTest = t->CheckData(answer, tUnitNum);
-	for (int i = 0; i < tUnitNum; i++)
-		printf("%f ", *check_data++);
-	printf("]\n");
-    int * size = new int(tOrder);
+#ifdef USE_CUDA
-    size = t->dimSize;
+    /* GPU test */
-    for (int i = 0; i < tOrder; i++) {
+    bool gpuTest = true;
-        printf("size %d: %d\n", i, *size++);
-    }
+    /* create tensor */
+    XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
+    XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
+    XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
+    /* Initialize variables */
+    sGPU1->SetData(sData1, sUnitNum1);
+    sGPU2->SetData(sData2, sUnitNum2);
+    tGPU->SetZeroAll();
+    /* call MatrixMul function */
+    MatrixMul(sGPU1, X_NOTRANS, sGPU2, X_NOTRANS, tGPU);
+    /* check results */
+    gpuTest = tGPU->CheckData(answer, tUnitNum);
+    /* destroy variables */
+    delete s1;
+    delete s2;
+    delete t;
+    delete sGPU1;
+    delete sGPU2;
+    delete tGPU;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;
+    return cpuTest && gpuTest;
+#else
+    /* destroy variables */
+    delete s1;
+    delete s2;
+    delete t;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;
+    return cpuTest;
+#endif // USE_CUDA
+}
+/* 
+case 4: matrix multiplication. 
+In this case, a=(3, 2, 3), b=(3, 2) -> c=(3, 2, 2), 
+transposedA=X_NOTRANS, transposedB=X_NOTRANS.
+*/
+bool TestMatrixMul4()
+{
+    /* a source tensor of size (3, 2, 3) */
+    int sOrder1 = 3;
+    int * sDimSize1 = new int[sOrder1];
+    sDimSize1[0] = 3;
+    sDimSize1[1] = 2;
+    sDimSize1[2] = 3;
+    int sUnitNum1 = 1;
+    for (int i = 0; i < sOrder1; i++)
+        sUnitNum1 *= sDimSize1[i];
+    /* a source tensor of size (3, 2) */
+    int sOrder2 = 2;
+    int * sDimSize2 = new int[sOrder2];
+    sDimSize2[0] = 3;
+    sDimSize2[1] = 2;
+    int sUnitNum2 = 1;
+    for (int i = 0; i < sOrder2; i++)
+        sUnitNum2 *= sDimSize2[i];
+    /* a target tensor of size (3, 2, 2) */
+    int tOrder = 3;
+    int * tDimSize = new int[tOrder];
+    tDimSize[0] = 3;
+    tDimSize[1] = 2;
+    tDimSize[2] = 2;
+    int tUnitNum = 1;
+    for (int i = 0; i < tOrder; i++)
+        tUnitNum *= tDimSize[i];
+    DTYPE sData1[3][2][3] = { { {0.0F, -1.0F, 2.0F},
+                                {2.0F, 1.0F, 3.0F} },
+                              { {1.0F, 2.0F, 4.0F}, 
+                                {3.0F, 1.0F, 2.0F}},
+                              { {-1.0F, 3.0F, 2.0F}, 
+                                {1.0F, -1.0F, 0.0F} } };
+    DTYPE sData2[3][2] = { {1.0F, 2.0F},
+                           {3.0F, 4.0F},
+                           {5.0F, 6.0F} };
+    DTYPE answer[3][2][2] = { { {7.0F, 8.0F},
+                                {20.0F, 26.0F} },
+                              { {27.0F, 34.0F}, 
+                                 {16.0F, 22.0F} },
+                              { {18.0F, 22.0F}, 
+                                {-2.0F, -2.0F} } };
+    /* CPU test */
+    bool cpuTest = true;
+    /* create tensors */
+    XTensor * s1 = NewTensor(sOrder1, sDimSize1);
+    XTensor * s2 = NewTensor(sOrder2, sDimSize2);
+    XTensor * t = NewTensor(tOrder, tDimSize);
+    /* initialize variables */
+    s1->SetData(sData1, sUnitNum1);
+    s2->SetData(sData2, sUnitNum2);
+    t->SetZeroAll();
+    /* call MatrixMul function */
+    MatrixMul(s1, X_NOTRANS, s2, X_NOTRANS, t);
    /* check results */
    cpuTest = t->CheckData(answer, tUnitNum);
@@ -326,14 +467,25 @@ bool TestMatrixMul3()
    gpuTest = tGPU->CheckData(answer, tUnitNum);
    /* destroy variables */
-    delete s1, s2, t, sGPU1, sGPU2, tGPU;
+    delete s1;
-    delete[] sDimSize1, sDimSize2, tDimSize;
+    delete s2;
+    delete t;
+    delete sGPU1;
+    delete sGPU2;
+    delete tGPU;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;
    return cpuTest && gpuTest;
 #else
    /* destroy variables */
-    delete s1, s2, t;
+    delete s1;
-    delete[] sDimSize1, sDimSize2, tDimSize;
+    delete s2;
+    delete t;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;
    return cpuTest;
 #endif // USE_CUDA
@@ -346,10 +498,9 @@ bool TestMatrixMul3()
 */
 /* test for MatrixMul Function */
-extern "C"
 bool TestMatrixMul()
 {
-    XPRINT(0, stdout, "[TEST MATRIXMUL] -------------\n");
+    XPRINT(0, stdout, "[TEST MATRIXMUL] matrix multiplication \n");
    bool returnFlag = true, caseFlag = true;
    /* case 1 test */
@@ -371,14 +522,23 @@ bool TestMatrixMul()
    else
        XPRINT(0, stdout, ">> case 2 passed!\n");
-    ///* case 3 test */
+    /* case 3 test */
-    //caseFlag = TestMatrixMul3();
+    caseFlag = TestMatrixMul3();
-    //if (!caseFlag) {
+    if (!caseFlag) {
-    //    returnFlag = false;
+        returnFlag = false;
-    //    XPRINT(0, stdout, ">> case 3 failed!\n");
+        XPRINT(0, stdout, ">> case 3 failed!\n");
-    //}
+    }
-    //else
+    else
-    //    XPRINT(0, stdout, ">> case 3 passed!\n");
+        XPRINT(0, stdout, ">> case 3 passed!\n");
+    /* case 4 test */
+    caseFlag = TestMatrixMul4();
+    if (!caseFlag) {
+        returnFlag = false;
+        XPRINT(0, stdout, ">> case 4 failed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> case 4 passed!\n");
    /* other cases test */
    /*

--- a/source/test/TMatrixMul.h
+++ b/source/test/TMatrixMul.h
@@ -22,7 +22,7 @@
 #ifndef __TEST_MATRIXMUL_H__
 #define __TEST_MATRIXMUL_H__
-#include "../core/MatrixMul.h"
+#include "../core/arithmetic/MatrixMul.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/test/TMatrixMul2D.cpp
+++ b/source/test/TMatrixMul2D.cpp
@@ -19,14 +19,14 @@
 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-15
 */
-#include "../XTensor.h"
 #include "TMatrixMul2D.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* case 1: matrix multiplication (for 2d tensors). 
-* In this case, a=(2, 3), b=(3, 2) -> c=(2, 2), transposedA=X_NOTRANS,
+/* 
-  transposedB=X_NOTRANS.
+case 1: matrix multiplication (for 2d tensors). 
+In this case, a=(2, 3), b=(3, 2) -> c=(2, 2), 
+transposedA=X_NOTRANS, transposedB=X_NOTRANS.
 */
 bool TestMatrixMul2D1()
 {
@@ -60,13 +60,13 @@ bool TestMatrixMul2D1()
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];
-    DTYPE sData1[2][3] = { {1.0, 2.0, 3.0},
+    DTYPE sData1[2][3] = { {1.0F, 2.0F, 3.0F},
-                           {-4.0, 5.0, 6.0} };
+                           {-4.0F, 5.0F, 6.0F} };
-    DTYPE sData2[3][2] = { {0.0, -1.0},
+    DTYPE sData2[3][2] = { {0.0F, -1.0F},
-                           {1.0, 2.0}, 
+                           {1.0F, 2.0F}, 
-                           {2.0, 1.0} };
+                           {2.0F, 1.0F} };
-    DTYPE answer[2][2] = { {8.0, 6.0}, 
+    DTYPE answer[2][2] = { {8.0F, 6.0F}, 
-                           {17.0, 20.0} };
+                           {17.0F, 20.0F} };
    /* CPU test */
    bool cpuTest = true;
@@ -108,22 +108,34 @@ bool TestMatrixMul2D1()
    gpuTest = tGPU->CheckData(answer, tUnitNum);
    /* destroy variables */
-    delete s1, s2, t, sGPU1, sGPU2, tGPU;
+    delete s1;
-    delete[] sDimSize1, sDimSize2, tDimSize;
+    delete s2;
+    delete t;
+    delete sGPU1;
+    delete sGPU2;
+    delete tGPU;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;
    return cpuTest && gpuTest;
 #else
    /* destroy variables */
-    delete s1, s2, t;
+    delete s1;
-    delete[] sDimSize1, sDimSize2, tDimSize;
+    delete s2;
+    delete t;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;
    return cpuTest;
 #endif // USE_CUDA
 }
-/* case 2: matrix multiplication (for 2d tensors). 
+/* 
-* In this case, a=(3, 2), b=(2, 3) -> c=(2, 2), transposedA=X_TRANS,
+case 2: matrix multiplication (for 2d tensors).
-  transposedB=X_NOTRANS.
+In this case, a=(3, 2), b=(3, 2) -> c=(2, 2), 
+transposedA=X_TRANS, transposedB=X_NOTRANS.
 */
 bool TestMatrixMul2D2()
 {
@@ -157,14 +169,14 @@ bool TestMatrixMul2D2()
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];
-    DTYPE sData1[3][2] = { {1.0, -4.0},
+    DTYPE sData1[3][2] = { {1.0F, -4.0F},
-                           {2.0, 5.0},
+                           {2.0F, 5.0F},
-                           {3.0, 6.0} };
+                           {3.0F, 6.0F} };
-    DTYPE sData2[3][2] = { {0.0, -1.0},
+    DTYPE sData2[3][2] = { {0.0F, -1.0F},
-                           {1.0, 2.0},
+                           {1.0F, 2.0F},
-                           {2.0, 1.0} };
+                           {2.0F, 1.0F} };
-    DTYPE answer[2][2] = { {8.0, 6.0},
+    DTYPE answer[2][2] = { {8.0F, 6.0F},
-                           {17.0, 20.0} };
+                           {17.0F, 20.0F} };
    /* CPU test */
    bool cpuTest = true;
@@ -206,14 +218,25 @@ bool TestMatrixMul2D2()
    gpuTest = tGPU->CheckData(answer, tUnitNum);
    /* destroy variables */
-    delete s1, s2, t, sGPU1, sGPU2, tGPU;
+    delete s1;
-    delete[] sDimSize1, sDimSize2, tDimSize;
+    delete s2;
+    delete t;
+    delete sGPU1;
+    delete sGPU2;
+    delete tGPU;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;
    return cpuTest && gpuTest;
 #else
    /* destroy variables */
-    delete s1, s2, t;
+    delete s1;
-    delete[] sDimSize1, sDimSize2, tDimSize;
+    delete s2;
+    delete t;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;
    return cpuTest;
 #endif // USE_CUDA
@@ -228,7 +251,7 @@ bool TestMatrixMul2D2()
 extern "C"
 bool TestMatrixMul2D()
 {
-    XPRINT(0, stdout, "[TEST MATRIXMUL2D] -------------\n");
+    XPRINT(0, stdout, "[TEST MATRIXMUL2D] matrix multiplication (for 2d tensors) \n");
    bool returnFlag = true, caseFlag = true;
    /* case 1 test */

--- a/source/test/TMatrixMul2D.h
+++ b/source/test/TMatrixMul2D.h
@@ -22,7 +22,7 @@
 #ifndef __TEST_MATRIXMUL2D_H__
 #define __TEST_MATRIXMUL2D_H__
-#include "../core/MatrixMul2D.h"
+#include "../core/arithmetic/MatrixMul2D.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/test/TMatrixMul2DParallel.cpp
+++ b/source/test/TMatrixMul2DParallel.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-06
+*/
+#include "TMatrixMul2DParallel.h"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+/*
+case 1: matrix multiplication (for 2d tensors) with multi-threading. 
+In this case, a=(2, 3), b=(3, 2) -> c=(2, 2), 
+transposedA=X_NOTRANS, transposedB=X_NOTRANS.
+The inputs are filled from sData1/sData2, the target is zeroed, and the
+output of MatrixMul2DParallel is compared with the hard-coded answer
+(answer is the plain product sData1 * sData2).
+Only CPU tensors are created here; this case has no USE_CUDA branch.
+<< return - the result of CheckData on the target tensor
+*/
+bool TestMatrixMul2DParallel1()
+{
+    /* a source tensor of size (2, 3);
+       sUnitNum1 is the product of its dimension sizes and is the
+       element count later handed to SetData */
+    int sOrder1 = 2;
+    int * sDimSize1 = new int[sOrder1];
+    sDimSize1[0] = 2;
+    sDimSize1[1] = 3;
+    int sUnitNum1 = 1;
+    for (int i = 0; i < sOrder1; i++)
+        sUnitNum1 *= sDimSize1[i];
+    /* a source tensor of size (3, 2);
+       sUnitNum2 is the product of its dimension sizes */
+    int sOrder2 = 2;
+    int * sDimSize2 = new int[sOrder2];
+    sDimSize2[0] = 3;
+    sDimSize2[1] = 2;
+    int sUnitNum2 = 1;
+    for (int i = 0; i < sOrder2; i++)
+        sUnitNum2 *= sDimSize2[i];
+    /* a target tensor of size (2, 2);
+       tUnitNum is the element count later handed to CheckData */
+    int tOrder = 2;
+    int * tDimSize = new int[tOrder];
+    tDimSize[0] = 2;
+    tDimSize[1] = 2;
+    int tUnitNum = 1;
+    for (int i = 0; i < tOrder; i++)
+        tUnitNum *= tDimSize[i];
+    DTYPE sData1[2][3] = { {1.0F, 2.0F, 3.0F},
+                           {-4.0F, 5.0F, 6.0F} };
+    DTYPE sData2[3][2] = { {0.0F, -1.0F},
+                           {1.0F, 2.0F}, 
+                           {2.0F, 1.0F} };
+    DTYPE answer[2][2] = { {8.0F, 6.0F}, 
+                           {17.0F, 20.0F} };
+    /* CPU test (the initial value is replaced by the CheckData result below) */
+    bool cpuTest = true;
+    /* create tensors (NewTensor is called with order and dimension sizes only) */
+    XTensor * s1 = NewTensor(sOrder1, sDimSize1);
+    XTensor * s2 = NewTensor(sOrder2, sDimSize2);
+    XTensor * t = NewTensor(tOrder, tDimSize);
+    /* initialize variables: copy the literal data into the sources
+       and clear the target before the multiplication */
+    s1->SetData(sData1, sUnitNum1);
+    s2->SetData(sData2, sUnitNum2);
+    t->SetZeroAll();
+    /* call MatrixMul2DParallel function (neither operand is transposed) */
+    MatrixMul2DParallel(s1, X_NOTRANS, s2, X_NOTRANS, t);
+    /* check results: compare all tUnitNum cells of t with answer */
+    cpuTest = t->CheckData(answer, tUnitNum);
+    /* destroy variables: each object gets its own delete statement,
+       tensors via delete and dimension arrays via delete[] */
+    delete s1;
+    delete s2;
+    delete t;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;
+    return cpuTest;
+}
+/* 
+case 2: matrix multiplication (for 2d tensors) with multi-threading.
+In this case, a=(3, 2), b=(3, 2) -> c=(2, 2), 
+transposedA=X_TRANS, transposedB=X_NOTRANS.
+sData1 is stored as (3, 2) and passed with X_TRANS, so the hard-coded
+answer is the product transpose(sData1) * sData2, where
+transpose(sData1) = {{1, 2, 3}, {-4, 5, 6}}.
+Only CPU tensors are created here; this case has no USE_CUDA branch.
+<< return - the result of CheckData on the target tensor
+*/
+bool TestMatrixMul2DParallel2()
+{
+    /* a source tensor of size (3, 2);
+       sUnitNum1 is the product of its dimension sizes and is the
+       element count later handed to SetData */
+    int sOrder1 = 2;
+    int * sDimSize1 = new int[sOrder1];
+    sDimSize1[0] = 3;
+    sDimSize1[1] = 2;
+    int sUnitNum1 = 1;
+    for (int i = 0; i < sOrder1; i++)
+        sUnitNum1 *= sDimSize1[i];
+    /* a source tensor of size (3, 2);
+       sUnitNum2 is the product of its dimension sizes */
+    int sOrder2 = 2;
+    int * sDimSize2 = new int[sOrder2];
+    sDimSize2[0] = 3;
+    sDimSize2[1] = 2;
+    int sUnitNum2 = 1;
+    for (int i = 0; i < sOrder2; i++)
+        sUnitNum2 *= sDimSize2[i];
+    /* a target tensor of size (2, 2);
+       tUnitNum is the element count later handed to CheckData */
+    int tOrder = 2;
+    int * tDimSize = new int[tOrder];
+    tDimSize[0] = 2;
+    tDimSize[1] = 2;
+    int tUnitNum = 1;
+    for (int i = 0; i < tOrder; i++)
+        tUnitNum *= tDimSize[i];
+    DTYPE sData1[3][2] = { {1.0F, -4.0F},
+                           {2.0F, 5.0F},
+                           {3.0F, 6.0F} };
+    DTYPE sData2[3][2] = { {0.0F, -1.0F},
+                           {1.0F, 2.0F},
+                           {2.0F, 1.0F} };
+    DTYPE answer[2][2] = { {8.0F, 6.0F},
+                           {17.0F, 20.0F} };
+    /* CPU test (the initial value is replaced by the CheckData result below) */
+    bool cpuTest = true;
+    /* create tensors (NewTensor is called with order and dimension sizes only) */
+    XTensor * s1 = NewTensor(sOrder1, sDimSize1);
+    XTensor * s2 = NewTensor(sOrder2, sDimSize2);
+    XTensor * t = NewTensor(tOrder, tDimSize);
+    /* initialize variables: copy the literal data into the sources
+       and clear the target before the multiplication */
+    s1->SetData(sData1, sUnitNum1);
+    s2->SetData(sData2, sUnitNum2);
+    t->SetZeroAll();
+    /* call MatrixMul2DParallel function
+       (the first operand is flagged X_TRANS, the second X_NOTRANS) */
+    MatrixMul2DParallel(s1, X_TRANS, s2, X_NOTRANS, t);
+    /* check results: compare all tUnitNum cells of t with answer */
+    cpuTest = t->CheckData(answer, tUnitNum);
+    /* destroy variables: each object gets its own delete statement,
+       tensors via delete and dimension arrays via delete[] */
+    delete s1;
+    delete s2;
+    delete t;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;
+    return cpuTest;
+}
+/* other cases */
+/*
+    TODO!!
+*/
+/* test for MatrixMul2DParallel Function
+   Runs case 1 and case 2 in order, prints a passed/failed line for each
+   through XPRINT, then prints an overall summary line.
+   << return - true only if every case returned true */
+bool TestMatrixMul2DParallel()
+{
+    XPRINT(0, stdout, "[TEST MatrixMul2DParallel] matrix multiplication (for 2d tensors) with multi-threading \n");
+    bool returnFlag = true, caseFlag = true;
+    /* case 1 test (a failing case clears returnFlag but does not stop the run) */
+    caseFlag = TestMatrixMul2DParallel1();
+    if (!caseFlag) {
+        returnFlag = false;
+        XPRINT(0, stdout, ">> case 1 failed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> case 1 passed!\n");
+    /* case 2 test (runs regardless of the outcome of case 1) */
+    caseFlag = TestMatrixMul2DParallel2();
+    if (!caseFlag) {
+        returnFlag = false;
+        XPRINT(0, stdout, ">> case 2 failed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> case 2 passed!\n");
+    /* other cases test */
+    /*
+    TODO!!
+    */
+    if (returnFlag) {
+        XPRINT(0, stdout, ">> All Passed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> Failed!\n");
+    XPRINT(0, stdout, "\n");
+    return returnFlag;
+}
+} // namespace nts(NiuTrans.Tensor)
--- a/source/test/TMatrixMul2DParallel.h
+++ b/source/test/TMatrixMul2DParallel.h
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-06
+*/
+#ifndef __TEST_MATRIXMUL2DPARALLEL_H__
+#define __TEST_MATRIXMUL2DPARALLEL_H__
+#include "../core/arithmetic/MatrixMul2DParallel.h"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+/* test for MatrixMul2DParallel Function */
+extern "C"
+bool TestMatrixMul2DParallel();
+} // namespace nts(NiuTrans.Tensor)
+#endif // __TEST_MATRIXMUL2DPARALLEL_H__
--- a/source/test/TMatrixMulBatched.cpp
+++ b/source/test/TMatrixMulBatched.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-15
+*/
+#include "TMatrixMULBatched.h"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+/* 
+case 1: matrix multiplication of the two tensors. 
+In this case, a=(2, 3), b=(3, 2) -> c=(2, 2), transposedA=X_NOTRANS, transposedB=X_NOTRANS.
+The inputs are filled from sData1/sData2, the target is zeroed, and the
+output of MatrixMulBatched is compared with the hard-coded answer
+(answer is the plain product sData1 * sData2).
+When USE_CUDA is defined the same check is repeated on a second set of tensors.
+<< return - cpuTest && gpuTest when USE_CUDA is defined, cpuTest otherwise
+*/
+bool TestMatrixMulBatched1()
+{
+    /* a source tensor of size (2, 3);
+       sUnitNum1 is the product of its dimension sizes and is the
+       element count later handed to SetData */
+    int sOrder1 = 2;
+    int * sDimSize1 = new int[sOrder1];
+    sDimSize1[0] = 2;
+    sDimSize1[1] = 3;
+    int sUnitNum1 = 1;
+    for (int i = 0; i < sOrder1; i++)
+        sUnitNum1 *= sDimSize1[i];
+    /* a source tensor of size (3, 2);
+       sUnitNum2 is the product of its dimension sizes */
+    int sOrder2 = 2;
+    int * sDimSize2 = new int[sOrder2];
+    sDimSize2[0] = 3;
+    sDimSize2[1] = 2;
+    int sUnitNum2 = 1;
+    for (int i = 0; i < sOrder2; i++)
+        sUnitNum2 *= sDimSize2[i];
+    /* a target tensor of size (2, 2);
+       tUnitNum is the element count later handed to CheckData */
+    int tOrder = 2;
+    int * tDimSize = new int[tOrder];
+    tDimSize[0] = 2;
+    tDimSize[1] = 2;
+    int tUnitNum = 1;
+    for (int i = 0; i < tOrder; i++)
+        tUnitNum *= tDimSize[i];
+    DTYPE sData1[2][3] = { {1.0F, 2.0F, 3.0F},
+                           {-4.0F, 5.0F, 6.0F} };
+    DTYPE sData2[3][2] = { {0.0F, -1.0F},
+                           {1.0F, 2.0F}, 
+                           {2.0F, 1.0F} };
+    DTYPE answer[2][2] = { {8.0F, 6.0F}, 
+                           {17.0F, 20.0F} };
+    /* CPU test (the initial value is replaced by the CheckData result below) */
+    bool cpuTest = true;
+    /* create tensors (NewTensor is called with order and dimension sizes only) */
+    XTensor * s1 = NewTensor(sOrder1, sDimSize1);
+    XTensor * s2 = NewTensor(sOrder2, sDimSize2);
+    XTensor * t = NewTensor(tOrder, tDimSize);
+    /* initialize variables: copy the literal data into the sources
+       and clear the target before the multiplication */
+    s1->SetData(sData1, sUnitNum1);
+    s2->SetData(sData2, sUnitNum2);
+    t->SetZeroAll();
+    /* call MatrixMulBatched function (neither operand is transposed) */
+    MatrixMulBatched(s1, X_NOTRANS, s2, X_NOTRANS, t);
+    /* check results: compare all tUnitNum cells of t with answer */
+    cpuTest = t->CheckData(answer, tUnitNum);
+#ifdef USE_CUDA
+    /* GPU test (the initial value is replaced by the CheckData result below) */
+    bool gpuTest = true;
+    /* create tensor
+       NOTE(review): the extra arguments X_FLOAT, 1.0F, 0 presumably select
+       the data type, a dense ratio and device 0 - confirm against NewTensor */
+    XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
+    XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
+    XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
+    /* Initialize variables: same literal data as in the CPU test */
+    sGPU1->SetData(sData1, sUnitNum1);
+    sGPU2->SetData(sData2, sUnitNum2);
+    tGPU->SetZeroAll();
+    /* call MatrixMulBatched function (neither operand is transposed) */
+    MatrixMulBatched(sGPU1, X_NOTRANS, sGPU2, X_NOTRANS, tGPU);
+    /* check results: compare all tUnitNum cells of tGPU with answer */
+    gpuTest = tGPU->CheckData(answer, tUnitNum);
+    /* destroy variables: both tensor sets and the dimension arrays,
+       one delete statement per object */
+    delete s1;
+    delete s2;
+    delete t;
+    delete sGPU1;
+    delete sGPU2;
+    delete tGPU;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;
+    return cpuTest && gpuTest;
+#else
+    /* destroy variables: only the CPU tensors exist in this branch */
+    delete s1;
+    delete s2;
+    delete t;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;
+    return cpuTest;
+#endif // USE_CUDA
+}
+/*
+case 2: matrix multiplication of the two tensors. 
+In this case, a=(2, 2, 3), b=(2, 3, 2) -> c=(2, 2, 2), transposedA=X_NOTRANS, transposedB=X_NOTRANS.
+The hard-coded answer pairs the matrices by their leading index:
+answer[i] is the product sData1[i] * sData2[i] for i = 0, 1.
+When USE_CUDA is defined the same check is repeated on a second set of tensors.
+<< return - cpuTest && gpuTest when USE_CUDA is defined, cpuTest otherwise
+*/
+bool TestMatrixMulBatched2()
+{
+    /* a source tensor of size (2, 2, 3);
+       sUnitNum1 is the product of its dimension sizes and is the
+       element count later handed to SetData */
+    int sOrder1 = 3;
+    int * sDimSize1 = new int[sOrder1];
+    sDimSize1[0] = 2;
+    sDimSize1[1] = 2;
+    sDimSize1[2] = 3;
+    int sUnitNum1 = 1;
+    for (int i = 0; i < sOrder1; i++)
+        sUnitNum1 *= sDimSize1[i];
+    /* a source tensor of size (2, 3, 2);
+       sUnitNum2 is the product of its dimension sizes */
+    int sOrder2 = 3;
+    int * sDimSize2 = new int[sOrder2];
+    sDimSize2[0] = 2;
+    sDimSize2[1] = 3;
+    sDimSize2[2] = 2;
+    int sUnitNum2 = 1;
+    for (int i = 0; i < sOrder2; i++)
+        sUnitNum2 *= sDimSize2[i];
+    /* a target tensor of size (2, 2, 2);
+       tUnitNum is the element count later handed to CheckData */
+    int tOrder = 3;
+    int * tDimSize = new int[tOrder];
+    tDimSize[0] = 2;
+    tDimSize[1] = 2;
+    tDimSize[2] = 2;
+    int tUnitNum = 1;
+    for (int i = 0; i < tOrder; i++)
+        tUnitNum *= tDimSize[i];
+    DTYPE sData1[2][2][3] = { { {0.0F, -1.0F, 2.0F},
+                                {2.0F, 1.0F, 3.0F} },
+                              { {1.0F, 2.0F, 4.0F}, 
+                                {3.0F, 1.0F, 2.0F} } };
+    DTYPE sData2[2][3][2] = { { {1.0F, 2.0F},
+                                {-4.0F, 3.0F},
+                                {2.0F, 6.0F} },
+                              { {1.0F, 2.0F},
+                                {3.0F, 4.0F},
+                                {5.0F, 6.0F} } };
+    DTYPE answer[2][2][2] = { { {8.0F, 9.0F}, 
+                                {4.0F, 25.0F} },
+                              { {27.0F, 34.0F}, 
+                                {16.0F, 22.0F} } };
+    /* CPU test (the initial value is replaced by the CheckData result below) */
+    bool cpuTest = true;
+    /* create tensors (NewTensor is called with order and dimension sizes only) */
+    XTensor * s1 = NewTensor(sOrder1, sDimSize1);
+    XTensor * s2 = NewTensor(sOrder2, sDimSize2);
+    XTensor * t = NewTensor(tOrder, tDimSize);
+    /* initialize variables: copy the literal data into the sources
+       and clear the target before the multiplication */
+    s1->SetData(sData1, sUnitNum1);
+    s2->SetData(sData2, sUnitNum2);
+    t->SetZeroAll();
+    /* call MatrixMulBatched function (neither operand is transposed) */
+    MatrixMulBatched(s1, X_NOTRANS, s2, X_NOTRANS, t);
+    /* check results: compare all tUnitNum cells of t with answer */
+    cpuTest = t->CheckData(answer, tUnitNum);
+#ifdef USE_CUDA
+    /* GPU test (the initial value is replaced by the CheckData result below) */
+    bool gpuTest = true;
+    /* create tensor
+       NOTE(review): the extra arguments X_FLOAT, 1.0F, 0 presumably select
+       the data type, a dense ratio and device 0 - confirm against NewTensor */
+    XTensor * sGPU1 = NewTensor(sOrder1, sDimSize1, X_FLOAT, 1.0F, 0);
+    XTensor * sGPU2 = NewTensor(sOrder2, sDimSize2, X_FLOAT, 1.0F, 0);
+    XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
+    /* Initialize variables: same literal data as in the CPU test */
+    sGPU1->SetData(sData1, sUnitNum1);
+    sGPU2->SetData(sData2, sUnitNum2);
+    tGPU->SetZeroAll();
+    /* call MatrixMulBatched function (neither operand is transposed) */
+    MatrixMulBatched(sGPU1, X_NOTRANS, sGPU2, X_NOTRANS, tGPU);
+    /* check results: compare all tUnitNum cells of tGPU with answer */
+    gpuTest = tGPU->CheckData(answer, tUnitNum);
+    /* destroy variables: both tensor sets and the dimension arrays,
+       one delete statement per object */
+    delete s1;
+    delete s2;
+    delete t;
+    delete sGPU1;
+    delete sGPU2;
+    delete tGPU;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;
+    return cpuTest && gpuTest;
+#else
+    /* destroy variables: only the CPU tensors exist in this branch */
+    delete s1;
+    delete s2;
+    delete t;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;
+    return cpuTest;
+#endif // USE_CUDA
+}
+/* other cases */
+/*
+    TODO!!
+*/
+/*
+test for MatrixMulBatched Function
+runs each case in turn, reports its outcome on stdout,
+and returns true only if every case passed
+*/
+bool TestMatrixMulBatched()
+{
+    XPRINT(0, stdout, "[TEST MATRIXMULBATCHED] matrix multiplication of the two tensors \n");
+    bool allPassed = true;
+    /* case 1 test */
+    if (TestMatrixMulBatched1()) {
+        XPRINT(0, stdout, ">> case 1 passed!\n");
+    }
+    else {
+        XPRINT(0, stdout, ">> case 1 failed!\n");
+        allPassed = false;
+    }
+    /* case 2 test */
+    if (TestMatrixMulBatched2()) {
+        XPRINT(0, stdout, ">> case 2 passed!\n");
+    }
+    else {
+        XPRINT(0, stdout, ">> case 2 failed!\n");
+        allPassed = false;
+    }
+    /* other cases test */
+    /*
+    TODO!!
+    */
+    /* overall summary followed by a separating empty line */
+    if (!allPassed) {
+        XPRINT(0, stdout, ">> Failed!\n");
+    }
+    else {
+        XPRINT(0, stdout, ">> All Passed!\n");
+    }
+    XPRINT(0, stdout, "\n");
+    return allPassed;
+}
+} // namespace nts(NiuTrans.Tensor)
--- a/source/test/TMatrixMulBatched.h
+++ b/source/test/TMatrixMulBatched.h
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-15
+*/
+#ifndef __TEST_MATRIXMULBATCHED_H__
+#define __TEST_MATRIXMULBATCHED_H__
+#include "../core/arithmetic/MatrixMulBatched.h"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+/*
+test for MatrixMulBatched Function
+runs all TestMatrixMulBatched cases, prints the result of each case to stdout,
+and returns true only when every case passes
+*/
+extern "C"
+bool TestMatrixMulBatched();
+} // namespace nts(NiuTrans.Tensor)
+#endif // __TEST_MATRIXMULBATCHED_H__
--- a/source/test/TMerge.cpp
+++ b/source/test/TMerge.cpp
@@ -19,14 +19,15 @@
 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-13
 */
 #include "../XTensor.h"
 #include "../XList.h"
 #include "TMerge.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* case 1: transform a tensor by merging it along with a dimension. 
-* In this case, (3, 2) -> (6), whereToMerge=1, leadingDim=0.
+/* 
+case 1: transform a tensor by merging it along with a dimension. 
+In this case, (3, 2) -> (6), whereToMerge=1, leadingDim=0.
 */
 bool TestMerge1()
 {
@@ -49,9 +50,9 @@ bool TestMerge1()
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];
-    DTYPE sData[2][3] = { {0.0, 1.0, 2.0},
+    DTYPE sData[2][3] = { {0.0F, 1.0F, 2.0F},
-                          {3.0, 4.0, 5.0} };
+                          {3.0F, 4.0F, 5.0F} };
-    DTYPE answer[6] = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0};
+    DTYPE answer[6] = {0.0F, 1.0F, 2.0F, 3.0F, 4.0F, 5.0F};
    /* CPU test */
    bool cpuTest = true;
@@ -89,21 +90,30 @@ bool TestMerge1()
    gpuTest = tGPU->CheckData(answer, tUnitNum);
    /* destroy variables */
-    delete s, t, sGPU, tGPU;
+    delete s;
-    delete[] sDimSize, tDimSize;
+    delete t;
+    delete sGPU;
+    delete tGPU;
+    delete[] sDimSize;
+    delete[] tDimSize;
    return cpuTest && gpuTest;
 #else
    /* destroy variables */
-    delete s, t;
+    delete s;
-    delete[] sDimSize, tDimSize;
+    delete t;
+    delete[] sDimSize;
+    delete[] tDimSize;
    return cpuTest;
 #endif // USE_CUDA
 }
-/* case 2: transform a tensor by merging it along with a dimension. 
+/* 
-* In this case, (2, 2, 3) -> (4, 3), whereToMerge=1, leadingDim=0.
+case 2: transform a tensor by merging it along with a dimension. 
+In this case, 
+(2, 2, 3) -> (4, 3), whereToMerge=1, leadingDim=0.
+(2, 2, 3) -> (2, 6), whereToMerge=2, leadingDim=0.
 */
 bool TestMerge2()
 {
@@ -119,125 +129,55 @@ bool TestMerge2()
        sUnitNum *= sDimSize[i];
    /* a target tensor of size (4, 3) */
-    int tOrder = 2;
+    int tOrder1 = 2;
-    int * tDimSize = new int[tOrder];
+    int * tDimSize1 = new int[tOrder1];
-    tDimSize[0] = 4;
+    tDimSize1[0] = 4;
-    tDimSize[1] = 3;
+    tDimSize1[1] = 3;
-    int tUnitNum = 1;
+    int tUnitNum1 = 1;
-    for (int i = 0; i < tOrder; i++)
+    for (int i = 0; i < tOrder1; i++)
-        tUnitNum *= tDimSize[i];
+        tUnitNum1 *= tDimSize1[i];
-    DTYPE sData[2][2][3] = { { {0.0, 1.0, 2.0},
+    /* a target tensor of size (2, 6) */
-                               {4.0, 5.0, 6.0} },
+    int tOrder2 = 2;
-                             { {-1.0, 2.0, 3.0},
+    int * tDimSize2 = new int[tOrder2];
-                               {-4.0, -5.0, -6.0} } };
+    tDimSize2[0] = 2;
-    DTYPE answer[4][3] = { {0.0, 1.0, 2.0},
+    tDimSize2[1] = 6;
-                           {4.0, 5.0, 6.0},
-                           {-1.0, 2.0, 3.0},
+    int tUnitNum2 = 1;
-                           {-4.0, -5.0, -6.0} };
+    for (int i = 0; i < tOrder2; i++)
+        tUnitNum2 *= tDimSize2[i];
-    /* CPU test */
-    bool cpuTest = true;
+    DTYPE sData[2][2][3] = { { {0.0F, 1.0F, 2.0F},
+                               {4.0F, 5.0F, 6.0F} },
-    /* create tensors */
+                             { {-1.0F, 2.0F, 3.0F},
-    XTensor * s = NewTensor(sOrder, sDimSize);
+                               {-4.0F, -5.0F, -6.0F} } };
-    XTensor * t = NewTensor(tOrder, tDimSize);
+    DTYPE answer1[4][3] = { {0.0F, 1.0F, 2.0F},
+                            {4.0F, 5.0F, 6.0F},
-    /* initialize variables */
+                            {-1.0F, 2.0F, 3.0F},
-    s->SetData(sData, sUnitNum);
+                            {-4.0F, -5.0F, -6.0F} };
-    t->SetZeroAll();
+    DTYPE answer2[2][6] = { {0.0F, 1.0F, 2.0F, -1.0F, 2.0F, 3.0F},
+                            {4.0F, 5.0F, 6.0F, -4.0F, -5.0F, -6.0F} };
-    /* call merge function */
-    Merge(s, t, 1, 0);
-    /* check results */
-    cpuTest = t->CheckData(answer, tUnitNum);
-#ifdef USE_CUDA
-    /* GPU test */
-    bool gpuTest = true;
-    /* create tensor */
-    XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
-    XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
-    /* Initialize variables */
-    sGPU->SetData(sData, sUnitNum);
-    tGPU->SetZeroAll();
-    /* call merge function */
-    Merge(sGPU, tGPU, 1, 0);
-    /* check results */
-    gpuTest = tGPU->CheckData(answer, tUnitNum);
-    /* destroy variables */
-    delete s, t, sGPU, tGPU;
-    delete[] sDimSize, tDimSize;
-    return cpuTest && gpuTest;
-#else
-    /* destroy variables */
-    delete s, t;
-    delete[] sDimSize, tDimSize;
-    return cpuTest;
-#endif // USE_CUDA
-}
-/* case 3: transform a tensor by merging it along with a dimension. 
-* In this case, (2, 3, 4) -> (3, 8), whereToMerge=0, leadingDim=2.
-*/
-bool TestMerge3()
-{
-    /* a source tensor of size (2, 3, 4) */
-    int sOrder = 3;
-    int * sDimSize = new int[sOrder];
-    sDimSize[0] = 2;
-    sDimSize[1] = 3;
-    sDimSize[2] = 4;
-    int sUnitNum = 1;
-    for (int i = 0; i < sOrder; i++)
-        sUnitNum *= sDimSize[i];
-    /* a target tensor of size (8, 3) */
-    int tOrder = 2;
-    int * tDimSize = new int[tOrder];
-    tDimSize[0] = 3;
-    tDimSize[1] = 8;
-    int tUnitNum = 1;
-    for (int i = 0; i < tOrder; i++)
-        tUnitNum *= tDimSize[i];
-    DTYPE sData[2][3][4] = { { {0.0, 1.0, 2.0, 3.0},
-                               {4.0, 5.0, 6.0, 7.0},
-                               {8.0, 9.0, 10.0, 11.0} },
-                             { {0.0, -1.0, -2.0, -3.0},
-                               {-4.0, -5.0, -6.0, -7.0},
-                               {-8.0, -9.0, -10.0, -11.0} } };
-    DTYPE answer[3][8] = { {0.0, 1.0, 2.0, 3.0, 0.0, -1.0, -2.0, -3.0},
-                           {4.0, 5.0, 6.0, 7.0, -4.0, -5.0, -6.0, -7.0},
-                           {8.0, 9.0, 10.0, 11.0, -8.0, -9.0, -10.0, -11.0} };
    /* CPU test */
    bool cpuTest = true;
    /* create tensors */
    XTensor * s = NewTensor(sOrder, sDimSize);
-    XTensor * t = NewTensor(tOrder, tDimSize);
+    XTensor * t1 = NewTensor(tOrder1, tDimSize1);
+    XTensor * t2 = NewTensor(tOrder2, tDimSize2);
    /* initialize variables */
    s->SetData(sData, sUnitNum);
-    t->SetZeroAll();
+    t1->SetZeroAll();
+    t2->SetZeroAll();
    /* call merge function */
-    Merge(s, t, 2, 0);
+    Merge(s, t1, 1, 0);
+    Merge(s, t2, 2, 0);
    /* check results */
-    cpuTest = t->CheckData(answer, tUnitNum);
+    cpuTest = t1->CheckData(answer1, tUnitNum1) && t2->CheckData(answer2, tUnitNum2);
 #ifdef USE_CUDA
    /* GPU test */
@@ -245,36 +185,51 @@ bool TestMerge3()
    /* create tensor */
    XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
-    XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * tGPU1 = NewTensor(tOrder1, tDimSize1, X_FLOAT, 1.0F, 0);
+    XTensor * tGPU2 = NewTensor(tOrder2, tDimSize2, X_FLOAT, 1.0F, 0);
    /* Initialize variables */
    sGPU->SetData(sData, sUnitNum);
-    tGPU->SetZeroAll();
+    tGPU1->SetZeroAll();
+    tGPU2->SetZeroAll();
    /* call merge function */
-    Merge(sGPU, tGPU, 2, 0);
+    Merge(sGPU, tGPU1, 1, 0);
+    Merge(sGPU, tGPU2, 2, 0);
    /* check results */
-    gpuTest = tGPU->CheckData(answer, tUnitNum);
+    gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tGPU2->CheckData(answer2, tUnitNum2);
    /* destroy variables */
-    delete s, t, sGPU, tGPU;
+    delete s;
-    delete[] sDimSize, tDimSize;
+    delete t1;
+    delete t2;
+    delete sGPU;
+    delete tGPU1;
+    delete tGPU2;
+    delete[] sDimSize;
+    delete[] tDimSize1;
+    delete[] tDimSize2;
    return cpuTest && gpuTest;
 #else
    /* destroy variables */
-    delete s, t;
+    delete s;
-    delete[] sDimSize, tDimSize;
+    delete t1;
+    delete t2;
+    delete[] sDimSize;
+    delete[] tDimSize1;
+    delete[] tDimSize2;
    return cpuTest;
 #endif // USE_CUDA
 }
-/* case 4: merge small tensors into a big tensor. 
+/* 
+case 3: merge small tensors into a big tensor. 
 In this case, 2 * (2, 4) -> (4, 4), whereToMerge=0.
 */
-bool TestMerge4()
+bool TestMerge3()
 {
    /* create list */
    XList * smallList = new XList();
@@ -289,10 +244,10 @@ bool TestMerge4()
    for (int i = 0; i < sOrder; i++)
        sUnitNum *= sDimSize[i];
-    DTYPE sData1[2][4] = { {0.0, 1.0, 2.0, 3.0},
+    DTYPE sData1[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
-                           {4.0, 5.0, 6.0, 7.0} };
+                           {4.0F, 5.0F, 6.0F, 7.0F} };
-    DTYPE sData2[2][4] = { {0.0, -1.0, -2.0, -3.0},
+    DTYPE sData2[2][4] = { {0.0F, -1.0F, -2.0F, -3.0F},
-                           {-4.0, -5.0, -6.0, -7.0} };
+                           {-4.0F, -5.0F, -6.0F, -7.0F} };
    /* a target tensor of size (4, 4) */
    int tOrder = 2;
@@ -304,10 +259,10 @@ bool TestMerge4()
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];
-    DTYPE answer[4][4] = { {0.0, 1.0, 2.0, 3.0},
+    DTYPE answer[4][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
-                           {4.0, 5.0, 6.0, 7.0},
+                           {4.0F, 5.0F, 6.0F, 7.0F},
-                           {0.0, -1.0, -2.0, -3.0},
+                           {0.0F, -1.0F, -2.0F, -3.0F},
-                           {-4.0, -5.0, -6.0, -7.0} };
+                           {-4.0F, -5.0F, -6.0F, -7.0F} };
    /* CPU test */
    bool cpuTest = true;
@@ -359,24 +314,37 @@ bool TestMerge4()
    /* check results */
-    cpuTest = tGPU->CheckData(answer, tUnitNum);
+    /* keep the device result in gpuTest; writing it to cpuTest discarded the CPU verdict
+       and left gpuTest at its initial value in the final cpuTest && gpuTest */
+    gpuTest = tGPU->CheckData(answer, tUnitNum);
-    delete s1, s2, t, sGPU1, sGPU2, tGPU;
+    /* destroy variables */
-    delete[] sDimSize, tDimSize;
+    delete s1;
+    delete s2;
+    delete t;
+    delete sGPU1;
+    delete sGPU2;
+    delete tGPU;
+    delete[] sDimSize;
+    delete[] tDimSize;
    delete smallList;
    return cpuTest && gpuTest;
 #else
    /* destroy variables */
-    delete s1, s2, t;
+    delete s1;
-    delete[] sDimSize, tDimSize;
+    delete s2;
+    delete t;
+    /* the source shape array in this function is sDimSize (as freed in the CUDA branch);
+       sDimSize1/sDimSize2 are not declared here and break the build without USE_CUDA */
+    delete[] sDimSize;
+    delete[] tDimSize;
+    delete smallList;
    return cpuTest;
 #endif // USE_CUDA
 }
-/* case 5: merge small tensors into a big tensor. 
+/* 
+case 4: merge small tensors into a big tensor. 
 In this case, 2 * (2, 4) -> (2, 8), whereToMerge=1.
 */
-bool TestMerge5()
+bool TestMerge4()
 {
    /* create list */
    XList * smallList = new XList();
@@ -391,10 +359,10 @@ bool TestMerge5()
    for (int i = 0; i < sOrder; i++)
        sUnitNum *= sDimSize[i];
-    DTYPE sData1[2][4] = { {0.0, 1.0, 2.0, 3.0},
+    DTYPE sData1[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
-                           {4.0, 5.0, 6.0, 7.0} };
+                           {4.0F, 5.0F, 6.0F, 7.0F} };
-    DTYPE sData2[2][4] = { {0.0, -1.0, -2.0, -3.0},
+    DTYPE sData2[2][4] = { {0.0F, -1.0F, -2.0F, -3.0F},
-                           {-4.0, -5.0, -6.0, -7.0} };
+                           {-4.0F, -5.0F, -6.0F, -7.0F} };
    /* a target tensor of size (4, 4) */
    int tOrder = 2;
@@ -406,8 +374,8 @@ bool TestMerge5()
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];
-    DTYPE answer[2][8] = { {0.0, 1.0, 2.0, 3.0, 0.0, -1.0, -2.0, -3.0},
+    DTYPE answer[2][8] = { {0.0F, 1.0F, 2.0F, 3.0F, 0.0F, -1.0F, -2.0F, -3.0F},
-                           {4.0, 5.0, 6.0, 7.0, -4.0, -5.0, -6.0, -7.0} };
+                           {4.0F, 5.0F, 6.0F, 7.0F, -4.0F, -5.0F, -6.0F, -7.0F} };
    /* CPU test */
    bool cpuTest = true;
@@ -459,15 +427,27 @@ bool TestMerge5()
    /* check results */
-    cpuTest = tGPU->CheckData(answer, tUnitNum);
+    /* keep the device result in gpuTest; writing it to cpuTest discarded the CPU verdict
+       and left gpuTest at its initial value in the final cpuTest && gpuTest */
+    gpuTest = tGPU->CheckData(answer, tUnitNum);
-    delete s1, s2, t, sGPU1, sGPU2, tGPU;
+    /* destroy variables */
-    delete[] sDimSize, tDimSize;
+    delete s1;
+    delete s2;
+    delete t;
+    delete sGPU1;
+    delete sGPU2;
+    delete tGPU;
+    delete[] sDimSize;
+    delete[] tDimSize;
    delete smallList;
    return cpuTest && gpuTest;
 #else
    /* destroy variables */
-    delete s1, s2, t;
+    delete s1;
-    delete[] sDimSize, tDimSize;
+    delete s2;
+    delete t;
+    /* the source shape array in this function is sDimSize (as freed in the CUDA branch);
+       sDimSize1/sDimSize2 are not declared here and break the build without USE_CUDA */
+    delete[] sDimSize;
+    delete[] tDimSize;
+    delete smallList;
    return cpuTest;
 #endif // USE_CUDA
@@ -479,10 +459,9 @@ bool TestMerge5()
 */
 /* test for Merge Function */
-extern "C"
 bool TestMerge()
 {
-    XPRINT(0, stdout, "[TEST MERGE] -------------\n");
+    XPRINT(0, stdout, "[TEST MERGE] transform a tensor by merging it alone with a dimension or merge small tensors into a big tensor\n");
    bool returnFlag = true, caseFlag = true;
    /* case 1 test */
@@ -522,15 +501,6 @@ bool TestMerge()
    else
        XPRINT(0, stdout, ">> case 4 passed!\n");
-    /* case 5 test */
-    caseFlag = TestMerge5();
-    if (!caseFlag) {
-        returnFlag = false;
-        XPRINT(0, stdout, ">> case 5 failed!\n");
-    }
-    else
-        XPRINT(0, stdout, ">> case 5 passed!\n");
    /* other cases test */
    /*
    TODO!!

--- a/source/test/TMerge.h
+++ b/source/test/TMerge.h
@@ -22,7 +22,7 @@
 #ifndef __TEST_MERGE_H__
 #define __TEST_MERGE_H__
-#include "../core/Merge.h"
+#include "../core/shape/Merge.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/test/TMultiply.cpp
+++ b/source/test/TMultiply.cpp
@@ -19,17 +19,18 @@
 * $Created by: Lin Ye (email: linye2015@outlook.com) 2018-06-15
 */
-#include "../XTensor.h"
+#include "TMultiply.h"
-#include "../XDevice.h"
-#include "../core/Multiply.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* case 1: element-wise product of two tensors, c(i) = a(i)*b(i) + \alpha * c(i) 
-* In this case, (2 * 1)  (2 * 1) -> (2 * 1), leadingDim=0, alpha=0.
+/* 
+case 1: element-wise product of two tensors
+c(i) = a(i)*b(i) + \alpha * c(i) 
+In this case, (2, 1)  (2, 1) -> (2, 1), leadingDim=0, alpha=0.
 */
 bool TestMultiply1()
 {
-	/* a source tensor of size 2 * 1 */
+	/* a source tensor of size (2, 1) */
 	int sOrder1 = 2;
 	int * sDimSize1 = new int[sOrder1];
 	sDimSize1[0] = 2;
@@ -39,7 +40,7 @@ bool TestMultiply1()
 	for (int i = 0; i < sOrder1; i++)
 		sUnitNum1 *= sDimSize1[i];
-	/* a source tensor of size 2 * 1 */
+	/* a source tensor of size (2, 1) */
 	int sOrder2 = 2;
 	int * sDimSize2 = new int[sOrder2];
 	sDimSize2[0] = 2;
@@ -49,7 +50,7 @@ bool TestMultiply1()
 	for (int i = 0; i < sOrder2; i++)
 		sUnitNum2 *= sDimSize2[i];
-	/* a target tensor of size 2 * 1 */
+	/* a target tensor of size (2, 1) */
 	int tOrder = 2;
 	int * tDimSize = new int[tOrder];
 	tDimSize[0] = 2;
@@ -59,9 +60,12 @@ bool TestMultiply1()
 	for (int i = 0; i < tOrder; i++)
 		tUnitNum *= tDimSize[i];
-	DTYPE sData1[2][1] = { {0.0}, {1.0} };
+	DTYPE sData1[2][1] = { {0.0F}, 
-	DTYPE sData2[2][1] = { {2.0}, {3.0} };
+                           {1.0F} };
-	DTYPE answer[2][1] = { {0.0}, {3.0} };
+	DTYPE sData2[2][1] = { {2.0F},
+                           {3.0F} };
+	DTYPE answer[2][1] = { {0.0F},
+                           {3.0F} };
 	/* CPU test */
 	bool cpuTest = true;
@@ -76,7 +80,7 @@ bool TestMultiply1()
 	s2->SetData(sData2, sUnitNum2);
 	t->SetZeroAll();
-	/* call multiplyelementwise function */
+	/* call MultiplyElementWise function */
 	Multiply(s1, s2, t, 0);
 	/* check results */
@@ -96,15 +100,22 @@ bool TestMultiply1()
 	sGPU2->SetData(sData2, sUnitNum2);
 	tGPU->SetZeroAll();
-	/* call multiplyelementwise function */
+	/* call MultiplyElementWise function */
 	Multiply(sGPU1, sGPU2, tGPU, 0);
 	/* check results */
 	gpuTest = tGPU->CheckData(answer, tUnitNum);
 	/* destroy variables */
-	delete s1, s2, t, sGPU1, sGPU2, tGPU;
+    delete s1;
-	delete[] sDimSize1, sDimSize2, tDimSize;
+    delete s2;
+    delete t;
+    delete sGPU1;
+    delete sGPU2;
+    delete tGPU;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;
 	return cpuTest && gpuTest;
 #else
@@ -120,12 +131,14 @@ bool TestMultiply1()
 #endif // USE_CUDA
 }
-/* case 2: element-wise product of two tensors, c(i) = a(i)*b(i) + \alpha * c(i)
+/* 
-* In this case, (2 * 2)  (2 * 2) -> (2 * 2), leadingDim=0, alpha=0.
+case 2: element-wise product of two tensors
+c(i) = a(i)*b(i) + \alpha * c(i)
+In this case, (2, 2)  (2, 2) -> (2, 2), leadingDim=0, alpha=0.
 */
 bool TestMultiply2()
 {
-	/* a source tensor of size (2 * 2) */
+	/* a source tensor of size (2, 2) */
 	int sOrder1 = 2;
 	int * sDimSize1 = new int[sOrder1];
 	sDimSize1[0] = 2;
@@ -135,7 +148,7 @@ bool TestMultiply2()
 	for (int i = 0; i < sOrder1; i++)
 		sUnitNum1 *= sDimSize1[i];
-	/* a source tensor of size (2 * 2) */
+	/* a source tensor of size (2, 2) */
 	int sOrder2 = 2;
 	int * sDimSize2 = new int[sOrder2];
 	sDimSize2[0] = 2;
@@ -145,7 +158,7 @@ bool TestMultiply2()
 	for (int i = 0; i < sOrder2; i++)
 		sUnitNum2 *= sDimSize2[i];
-	/* a target tensor of size (2 * 2) */
+	/* a target tensor of size (2, 2) */
 	int tOrder = 2;
 	int * tDimSize = new int[tOrder];
 	tDimSize[0] = 2;
@@ -155,12 +168,12 @@ bool TestMultiply2()
 	for (int i = 0; i < tOrder; i++)
 		tUnitNum *= tDimSize[i];
-	DTYPE sData1[2][2] = { {0.0, 1.0},
+	DTYPE sData1[2][2] = { {0.0F, 1.0F},
-	                       {2.0, 3.0} };
+	                       {2.0F, 3.0F} };
-	DTYPE sData2[2][2] = { {0.0, 1.0},
+	DTYPE sData2[2][2] = { {0.0F, 1.0F},
-	                       {2.0, 3.0} };
+	                       {2.0F, 3.0F} };
-	DTYPE answer[2][2] = { {0.0, 1.0},
+	DTYPE answer[2][2] = { {0.0F, 1.0F},
-	                       {4.0, 9.0} };
+	                       {4.0F, 9.0F} };
 	/* CPU test */
 	bool cpuTest = true;
@@ -175,7 +188,7 @@ bool TestMultiply2()
 	s2->SetData(sData2, sUnitNum2);
 	t->SetZeroAll();
-	/* call multiplyelementwise function */
+	/* call MultiplyElementWise function */
 	Multiply(s1, s2, t, 0);
 	/* check results */
@@ -195,32 +208,44 @@ bool TestMultiply2()
 	sGPU2->SetData(sData2, sUnitNum2);
 	tGPU->SetZeroAll();
-	/* call multiplyelementwise function */
+	/* call MultiplyElementWise function */
 	Multiply(sGPU1, sGPU2, tGPU, 0);
 	/* check results */
 	gpuTest = tGPU->CheckData(answer, tUnitNum);
 	/* destroy variables */
-	delete s1, s2, t, sGPU1, sGPU2, tGPU;
+    delete s1;
-	delete[] sDimSize1, sDimSize2, tDimSize;
+    delete s2;
+    delete t;
+    delete sGPU1;
+    delete sGPU2;
+    delete tGPU;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;
 	return cpuTest && gpuTest;
 #else
    /* destroy variables */
-	delete s1, s2, t;
+    delete s1;
-	delete[] sDimSize1, sDimSize2, tDimSize;
+    delete s2;
+    delete t;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;
 	return cpuTest;
 #endif // USE_CUDA
 }
-/* case 3: element-wise product of two tensors, c(i) = a(i)*b(i) + \alpha * c(i)
+/* 
-* In this case, (2 * 2)  (2 * 2) -> (2 * 2), leadingDim=1, alpha=0.
+case 3: element-wise product of two tensors, c(i) = a(i)*b(i) + \alpha * c(i)
+In this case, (2, 2)  (2, 2) -> (2, 2), leadingDim=1, alpha=0.
 */
 bool TestMultiply3()
 {
-	/* a source tensor of size (2 * 2) */
+	/* a source tensor of size (2, 2) */
 	int sOrder1 = 2;
 	int * sDimSize1 = new int[sOrder1];
 	sDimSize1[0] = 2;
@@ -230,7 +255,7 @@ bool TestMultiply3()
 	for (int i = 0; i < sOrder1; i++)
 		sUnitNum1 *= sDimSize1[i];
-	/* a source tensor of size (2 * 2) */
+	/* a source tensor of size (2, 2) */
 	int sOrder2 = 2;
 	int * sDimSize2 = new int[sOrder2];
 	sDimSize2[0] = 2;
@@ -240,7 +265,7 @@ bool TestMultiply3()
 	for (int i = 0; i < sOrder2; i++)
 		sUnitNum2 *= sDimSize2[i];
-	/* a target tensor of size (2 * 2) */
+	/* a target tensor of size (2, 2) */
 	int tOrder = 2;
 	int * tDimSize = new int[tOrder];
 	tDimSize[0] = 2;
@@ -250,12 +275,12 @@ bool TestMultiply3()
 	for (int i = 0; i < tOrder; i++)
 		tUnitNum *= tDimSize[i];
-	DTYPE sData1[2][2] = { {0.0, 1.0},
+	DTYPE sData1[2][2] = { {0.0F, 1.0F},
-	                       {2.0, 3.0} };
+	                       {2.0F, 3.0F} };
-	DTYPE sData2[2][2] = { {0.0, 1.0},
+	DTYPE sData2[2][2] = { {0.0F, 1.0F},
-	                       {2.0, 3.0} };
+	                       {2.0F, 3.0F} };
-	DTYPE answer[2][2] = { {0.0, 1.0},
+	DTYPE answer[2][2] = { {0.0F, 1.0F},
-	                       {4.0, 9.0} };
+	                       {4.0F, 9.0F} };
 	/* CPU test */
 	bool cpuTest = true;
@@ -270,7 +295,7 @@ bool TestMultiply3()
 	s2->SetData(sData2, sUnitNum2);
 	t->SetZeroAll();
-	/* call multiplyelementwise function */
+	/* call MultiplyElementWise function */
 	Multiply(s1, s2, t, 1);
 	/* check results */
@@ -290,21 +315,32 @@ bool TestMultiply3()
 	sGPU2->SetData(sData2, sUnitNum2);
 	tGPU->SetZeroAll();
-	/* call multiplyelementwise function */
+	/* call MultiplyElementWise function */
 	Multiply(sGPU1, sGPU2, tGPU, 1);
 	/* check results */
 	gpuTest = tGPU->CheckData(answer, tUnitNum);
 	/* destroy variables */
-	delete s1, s2, t, sGPU1, sGPU2, tGPU;
+    delete s1;
-	delete[] sDimSize1, sDimSize2, tDimSize;
+    delete s2;
+    delete t;
+    delete sGPU1;
+    delete sGPU2;
+    delete tGPU;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;
 	return cpuTest && gpuTest;
 #else
    /* destroy variables */
-	delete s1, s2, t;
+    delete s1;
-	delete[] sDimSize1, sDimSize2, tDimSize;
+    delete s2;
+    delete t;
+    delete[] sDimSize1;
+    delete[] sDimSize2;
+    delete[] tDimSize;
 	return cpuTest;
 #endif // USE_CUDA
@@ -316,10 +352,9 @@ TODO!!
 */
 /* test for MultiplyElementWise Function */
-extern "C"
 bool TestMultiply()
 {
-	XPRINT(0, stdout, "[TEST MULTIPLYELEMENTWISE] -------------\n");
+	XPRINT(0, stdout, "[TEST MULTIPLYELEMENTWISE] element-wise product of two tensors \n");
 	bool returnFlag = true, caseFlag = true;
 	/* case 1 test */

--- a/source/test/TMultiply.h
+++ b/source/test/TMultiply.h
@@ -19,10 +19,10 @@
 * $Created by: Lin Ye (email: linye2015@outlook.com) 2018-06-15
 */
-#ifndef __TEST_MULTIPLY_H__
+#ifndef __TEST_MULTIPLYELEMENTWISE_H__
-#define __TEST_MULTIPLY_H__
+#define __TEST_MULTIPLYELEMENTWISE_H__
-#include "../core/Multiply.h"
+#include "../core/arithmetic/Multiply.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/test/TNegate.cpp
+++ b/source/test/TNegate.cpp
@@ -19,15 +19,14 @@
 * $Created by: Lin Ye (email: linye2015@outlook.com) 2018-06-14
 */
-#include "../XTensor.h"
+#include "TNegate.h"
-#include "../XDevice.h"
-#include "../core/Negate.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /* case 1: set every entry to its minus value */
 bool TestNegate1()
 {
-	/* a tensor of size 3 * 2 */
+	/* a tensor of size (3, 2) */
 	int aOrder = 2;
 	int * aDimSize = new int[aOrder];
 	aDimSize[0] = 3;
@@ -37,12 +36,12 @@ bool TestNegate1()
 	for (int i = 0; i < aOrder; i++)
 		aUnitNum *= aDimSize[i];
-	DTYPE aData[3][2] = { {1.0, -2.0}, 
+	DTYPE aData[3][2] = { {1.0F, -2.0F}, 
-	                      {-3.0, 4.0},
+	                      {-3.0F, 4.0F},
-	                      {5.0, -6.0} };
+	                      {5.0F, -6.0F} };
-	DTYPE answer[3][2] = { {-1.0, 2.0},
+	DTYPE answer[3][2] = { {-1.0F, 2.0F},
-	                       {3.0, -4.0},
+	                       {3.0F, -4.0F},
-	                       {-5.0, 6.0} };
+	                       {-5.0F, 6.0F} };
 	/* CPU test */
 	bool cpuTest = true;
@@ -53,7 +52,7 @@ bool TestNegate1()
 	/* initialize variables */
 	a->SetData(aData, aUnitNum);
-	/* call negate function */
+	/* call Negate function */
 	Negate(a);
 	/* check results */
@@ -69,14 +68,15 @@ bool TestNegate1()
 	/* Initialize variables */
 	aGPU->SetData(aData, aUnitNum);
-	/* call negate function */
+	/* call Negate function */
 	Negate(aGPU);
 	/* check results */
 	gpuTest = aGPU->CheckData(answer, aUnitNum);
 	/* destroy variables */
-	delete a, aGPU;
+	delete a;
+    delete aGPU;
 	delete[] aDimSize;
 	return cpuTest && gpuTest;
@@ -92,7 +92,7 @@ bool TestNegate1()
 /* case 2: set every entry to its minus value */
 bool TestNegate2()
 {
-	/* a tensor of size 3 * 2 */
+	/* a tensor of size (3, 2) */
 	int aOrder = 2;
 	int * aDimSize = new int[aOrder];
 	aDimSize[0] = 3;
@@ -102,12 +102,12 @@ bool TestNegate2()
 	for (int i = 0; i < aOrder; i++)
 		aUnitNum *= aDimSize[i];
-	DTYPE aData[3][2] = { {0.0, 0.0},
+	DTYPE aData[3][2] = { {0.0F, 0.0F},
-	                      {0.0, 0.0},
+	                      {0.0F, 0.0F},
-	                      {0.0, 0.0} };
+	                      {0.0F, 0.0F} };
-	DTYPE answer[3][2] = { {-0.0, -0.0},
+	DTYPE answer[3][2] = { {-0.0F, -0.0F},
-	                       {-0.0, -0.0},
+	                       {-0.0F, -0.0F},
-	                       {-0.0, -0.0} };
+	                       {-0.0F, -0.0F} };
 	/* CPU test */
 	bool cpuTest = true;
@@ -118,7 +118,7 @@ bool TestNegate2()
 	/* initialize variables */
 	a->SetData(aData, aUnitNum);
-	/* call negate function */
+	/* call Negate function */
 	Negate(a);
 	/* check results */
@@ -134,14 +134,15 @@ bool TestNegate2()
 	/* Initialize variables */
 	aGPU->SetData(aData, aUnitNum);
-	/* call negate function */
+	/* call Negate function */
 	Negate(aGPU);
 	/* check results */
 	gpuTest = aGPU->CheckData(answer, aUnitNum);
 	/* destroy variables */
-	delete a, aGPU;
+	delete a;
+    delete aGPU;
 	delete[] aDimSize;
 	return cpuTest && gpuTest;
@@ -160,10 +161,9 @@ TODO!!
 */
 /* test for Negate Function */
-extern "C"
 bool TestNegate()
 {
-	XPRINT(0, stdout, "[TEST NEGATE] -------------\n");
+	XPRINT(0, stdout, "[TEST NEGATE] set every entry to its minus value \n");
 	bool returnFlag = true, caseFlag = true;
 	/* case 1 test */

--- a/source/test/TNegate.h
+++ b/source/test/TNegate.h
@@ -22,7 +22,7 @@
 #ifndef __TEST_NEGATE_H__
 #define __TEST_NEGATE_H__
-#include "../core/Negate.h"
+#include "../core/arithmetic/Negate.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/test/TNormalize.cpp
+++ b/source/test/TNormalize.cpp
@@ -19,17 +19,19 @@
 * $Created by: Lin Ye (email: linye2015@outlook.com) 2018-06-20
 */
-#include "../XTensor.h"
+#include "TNormalize.h"
-#include "../XDevice.h"
-#include "../core/Normalize.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* case 1: normalized the data with normal distribution 
-* In this case, dim=0.
+/*
+case 1: normalized the data with normal distribution 
+For an input x, y = a * (x-mean)/sqrt(variance+\epsilon) + b.
+where a and b are the scalar and bias respectively, 
+and \epsilon is the adjustment parameter.
 */
 bool TestNormalize1()
 {
-	/* a source tensor of size 2 * 3 */
+	/* a source tensor of size (2, 3) */
 	int sOrder = 2;
 	int * sDimSize = new int[sOrder];
 	sDimSize[0] = 2;
@@ -39,7 +41,7 @@ bool TestNormalize1()
 	for (int i = 0; i < sOrder; i++)
 		sUnitNum *= sDimSize[i];
-	/* a target tensor of size 2 * 3 */
+	/* a target tensor of size (2, 3) */
 	int tOrder = 2;
 	int * tDimSize = new int[tOrder];
 	tDimSize[0] = 2;
@@ -49,7 +51,7 @@ bool TestNormalize1()
 	for (int i = 0; i < tOrder; i++)
 		tUnitNum *= tDimSize[i];
-	/* a mean tensor of size 3 */
+	/* a mean tensor of size (3) */
 	int meanOrder = 1;
 	int * meanDimSize = new int[meanOrder];
 	meanDimSize[0] = 3;
@@ -58,7 +60,7 @@ bool TestNormalize1()
 	for (int i = 0; i < meanOrder; i++)
 		meanUnitNum *= meanDimSize[i];
-	/* a var tensor of size 3 */
+	/* a variance tensor of size (3) */
 	int varOrder = 1;
 	int * varDimSize = new int[varOrder];
 	varDimSize[0] = 3;
@@ -67,7 +69,7 @@ bool TestNormalize1()
 	for (int i = 0; i < varOrder; i++)
 		varUnitNum *= varDimSize[i];
-	/* a a tensor of size 2 * 3 */
+	/* a scalar tensor of size (2, 3) */
 	int aOrder = 2;
 	int * aDimSize = new int[aOrder];
 	aDimSize[0] = 2;
@@ -77,7 +79,7 @@ bool TestNormalize1()
 	for (int i = 0; i < aOrder; i++)
 		aUnitNum *= aDimSize[i];
-	/* a b tensor of size 2 * 3 */
+	/* a bias tensor of size (2, 3) */
 	int bOrder = 2;
 	int * bDimSize = new int[bOrder];
 	bDimSize[0] = 2;
@@ -87,41 +89,39 @@ bool TestNormalize1()
 	for (int i = 0; i < bOrder; i++)
 		bUnitNum *= bDimSize[i];
-	DTYPE sData[2][3] = { {0.5, -1.0, 2.0},
+	DTYPE sData[2][3] = { {1.0F, 2.0F, 3.0F},
-	                      {3.5, -4.5, 1.0} };
+	                      {1.5F, 2.5F, 3.5F} };
-	DTYPE meanData[3] = {2.0, -2.75, 1.5};
+	DTYPE meanData[3] = {1.0F, 1.5F, 2.0F};
-	DTYPE varData[3] = {4.5, 6.125, 0.5};
+	DTYPE varData[3] = {1.0F, 1.0F, 4.0F};
-	DTYPE aData[2][3] = { {0.0, 0.0, 0.0},
+    DTYPE aData[2][3] = { {1.0F, 1.0F, 1.0F},
-	                      {0.0, 0.0, 0.0} };
+	                      {1.0F, 1.0F, 1.0F} };
-	DTYPE bData[2][3] = { {0.0, 0.0, 0.0},
+	DTYPE answer[2][3] = { {0.0F, 0.5F, 0.5F},
-	                      {0.0, 0.0, 0.0} };
+	                       {0.5F, 1.0F, 0.75F} };
-	DTYPE answer[2][3] = { {0.0, 0.0, 0.0},
-	                       {0.0, 0.0, 0.0} };
 	/* CPU test */
 	bool cpuTest = true;
 	/* create tensors */
 	XTensor * s = NewTensor(sOrder, sDimSize);
+	XTensor * t = NewTensor(tOrder, tDimSize);
 	XTensor * mean = NewTensor(meanOrder, meanDimSize);
 	XTensor * var = NewTensor(varOrder, varDimSize);
 	XTensor * a = NewTensor(aOrder, aDimSize);
 	XTensor * b = NewTensor(bOrder, bDimSize);
-	XTensor * t = NewTensor(tOrder, tDimSize);
 	/* initialize variables */
 	s->SetData(sData, sUnitNum);
 	mean->SetData(meanData, meanUnitNum);
 	var->SetData(varData, varUnitNum);
 	a->SetData(aData, aUnitNum);
-	b->SetData(bData, bUnitNum);
+	b->SetZeroAll();
 	t->SetZeroAll();
 	/* call normalize function */
-	Normalize(s, t, 0, mean, var, a, b, 0.0);
+	Normalize(s, t, 0, mean, var, a, b, 0.0F);
 	/* check results */
-	cpuTest = t->CheckData(answer, tUnitNum);
+	cpuTest = t->CheckData(answer, tUnitNum, 1e-4, 0);
 #ifdef USE_CUDA
 	/* GPU test */
@@ -140,24 +140,50 @@ bool TestNormalize1()
 	meanGPU->SetData(meanData, meanUnitNum);
 	varGPU->SetData(varData, varUnitNum);
 	aGPU->SetData(aData, aUnitNum);
-	bGPU->SetData(bData, bUnitNum);
+	bGPU->SetZeroAll();
 	tGPU->SetZeroAll();
-	/* call normalize function */
+	/* call Normalize function */
-	Normalize(sGPU, tGPU, 0, meanGPU, varGPU, aGPU, bGPU, 0.0);
+	Normalize(sGPU, tGPU, 0, meanGPU, varGPU, aGPU, bGPU, 0.0F);
 	/* check results */
-	gpuTest = tGPU->CheckData(answer, tUnitNum);
+	gpuTest = tGPU->CheckData(answer, tUnitNum, 1e-4, 0);
 	/* destroy variables */
-	delete s, t, mean, var, a, b, sGPU, tGPU, meanGPU, varGPU, aGPU, bGPU;
+	delete s;
-	delete[] sDimSize, tDimSize, meanDimSize, varDimSize, aDimSize, bDimSize;
+	delete t;
+	delete mean;
+	delete var;
+	delete a;
+	delete b;
+	delete sGPU;
+	delete tGPU;
+	delete meanGPU;
+	delete varGPU;
+	delete aGPU;
+	delete bGPU;
+	delete[] sDimSize;
+	delete[] tDimSize;
+	delete[] meanDimSize;
+	delete[] varDimSize;
+	delete[] aDimSize;
+	delete[] bDimSize;
 	return cpuTest && gpuTest;
 #else
 	/* destroy variables */
-	delete s, t, mean, var, a, b;
+	delete s;
-	delete[] sDimSize, tDimSize, meanDimSize, varDimSize, aDimSize, bDimSize;
+	delete t;
+	delete mean;
+	delete var;
+	delete a;
+	delete b;
+	delete[] sDimSize;
+	delete[] tDimSize;
+	delete[] meanDimSize;
+	delete[] varDimSize;
+	delete[] aDimSize;
+	delete[] bDimSize;
 	return cpuTest;
 #endif // USE_CUDA
@@ -169,10 +195,9 @@ TODO!!
 */
 /* test for Normalize Function */
-extern "C"
 bool TestNormalize()
 {
-	XPRINT(0, stdout, "[TEST NORMALIZE] -------------\n");
+	XPRINT(0, stdout, "[TEST NORMALIZE] normalized the data with normal distribution \n");
 	bool returnFlag = true, caseFlag = true;
 	/* case 1 test */

--- a/source/test/TNormalize.h
+++ b/source/test/TNormalize.h
@@ -22,7 +22,7 @@
 #ifndef __TEST_NORMALIZE_H__
 #define __TEST_NORMALIZE_H__
-#include "../core/Normalize.h"
+#include "../core/math/Normalize.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/test/TPower.cpp
+++ b/source/test/TPower.cpp
@@ -19,17 +19,18 @@
 * $Created by: Lin Ye (email: linye2015@outlook.com) 2018-06-15
 */
-#include "../XTensor.h"
+#include "../XUtility.h"
-#include "../XDevice.h"
+#include "TPower.h"
-#include "../core/Power.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* case 1: get the power(a, p) 
-* In this case, p=2.
+/* 
+case 1: get the power(a, p) 
+In this case, p=2.
 */
 bool TestPower1()
 {
-	/* a tensor of size 3 * 2 */
+	/* a tensor of size (3, 2) */
 	int aOrder = 2;
 	int * aDimSize = new int[aOrder];
 	aDimSize[0] = 3;
@@ -39,12 +40,12 @@ bool TestPower1()
 	for (int i = 0; i < aOrder; i++)
 		aUnitNum *= aDimSize[i];
-	DTYPE aData[3][2] = { {1.0, 2.0},
+	DTYPE aData[3][2] = { {1.0F, 2.0F},
-	                      {3.0, 4.0},
+	                      {3.0F, 4.0F},
-	                      {5.0, 6.0} };
+	                      {5.0F, 6.0F} };
-	DTYPE answer[3][2] = { {1.0, 4.0},
+	DTYPE answer[3][2] = { {1.0F, 4.0F},
-	                       {9.0, 16.0},
+	                       {9.0F, 16.0F},
-	                       {25.0, 36.0} };
+	                       {25.0F, 36.0F} };
 	/* CPU test */
 	bool cpuTest = true;
@@ -55,11 +56,11 @@ bool TestPower1()
 	/* initialize variables */
 	a->SetData(aData, aUnitNum);
-	/* call power function */
+	/* call Power function */
-	Power(a, 2.0);
+	Power(a, 2.0F);
 	/* check results */
-	cpuTest = a->CheckData(answer, aUnitNum);
+	cpuTest = a->CheckData(answer, aUnitNum, 1e-4F);
 #ifdef USE_CUDA
 	/* GPU test */
@@ -72,13 +73,14 @@ bool TestPower1()
 	aGPU->SetData(aData, aUnitNum);
 	/* call power function */
-	Power(aGPU, 2.0);
+	Power(aGPU, 2.0F);
 	/* check results */
-	gpuTest = aGPU->CheckData(answer, aUnitNum);
+	gpuTest = aGPU->CheckData(answer, aUnitNum, 1e-4F);
 	/* destroy variables */
-	delete a, aGPU;
+	delete a;
+    delete aGPU;
 	delete[] aDimSize;
 	return cpuTest && gpuTest;
@@ -91,12 +93,13 @@ bool TestPower1()
 #endif // USE_CUDA
 }
-/* case 2: get the power(a, p)
+/* 
-* In this case, p=1.
+case 2: get the power(a, p)
+In this case, p=1.
 */
 bool TestPower2()
 {
-	/* a tensor of size 3 * 2 */
+	/* a tensor of size (3, 2) */
 	int aOrder = 2;
 	int * aDimSize = new int[aOrder];
 	aDimSize[0] = 3;
@@ -106,12 +109,12 @@ bool TestPower2()
 	for (int i = 0; i < aOrder; i++)
 		aUnitNum *= aDimSize[i];
-	DTYPE aData[3][2] = { {0.0, 1.0},
+	DTYPE aData[3][2] = { {0.0F, 1.0F},
-	                      {2.0, 3.0},
+	                      {2.0F, 3.0F},
-	                      {4.0, 5.0} };
+	                      {4.0F, 5.0F} };
-	DTYPE answer[3][2] = { {0.0, 1.0},
+	DTYPE answer[3][2] = { {0.0F, 1.0F},
-	                       {2.0, 3.0},
+	                       {2.0F, 3.0F},
-	                       {4.0, 5.0} };
+	                       {4.0F, 5.0F} };
 	/* CPU test */
 	bool cpuTest = true;
@@ -122,11 +125,11 @@ bool TestPower2()
 	/* initialize variables */
 	a->SetData(aData, aUnitNum);
-	/* call power function */
+	/* call Power function */
-	Power(a, 1.0);
+	Power(a, 1.0F);
 	/* check results */
-	cpuTest = a->CheckData(answer, aUnitNum);
+	cpuTest = a->CheckData(answer, aUnitNum, 1e-4F);
 #ifdef USE_CUDA
 	/* GPU test */
@@ -138,14 +141,15 @@ bool TestPower2()
 	/* Initialize variables */
 	aGPU->SetData(aData, aUnitNum);
-	/* call power function */
+	/* call Power function */
-	Power(aGPU, 1.0);
+	Power(aGPU, 1.0F);
 	/* check results */
-	gpuTest = aGPU->CheckData(answer, aUnitNum);
+	gpuTest = aGPU->CheckData(answer, aUnitNum, 1e-4F);
 	/* destroy variables */
-	delete a, aGPU;
+	delete a;
+    delete aGPU;
 	delete[] aDimSize;
 	return cpuTest && gpuTest;
@@ -158,12 +162,13 @@ bool TestPower2()
 #endif // USE_CUDA
 }
-/* case 3: get the power(a, p)
+/* 
-* In this case, p=0.
+case 3: get the power(a, p)
+In this case, p=0.
 */
 bool TestPower3()
 {
-	/* a tensor of size 3 * 2 */
+	/* a tensor of size (3, 2) */
 	int aOrder = 2;
 	int * aDimSize = new int[aOrder];
 	aDimSize[0] = 3;
@@ -173,12 +178,12 @@ bool TestPower3()
 	for (int i = 0; i < aOrder; i++)
 		aUnitNum *= aDimSize[i];
-	DTYPE aData[3][2] = { {0.0, 1.0},
+	DTYPE aData[3][2] = { {0.0F, 1.0F},
-	                      {2.0, 3.0},
+	                      {2.0F, 3.0F},
-	                      {4.0, 5.0} };
+	                      {4.0F, 5.0F} };
-	DTYPE answer[3][2] = { {1.0, 1.0},
+	DTYPE answer[3][2] = { {1.0F, 1.0F},
-	                       {1.0, 1.0},
+	                       {1.0F, 1.0F},
-	                       {1.0, 1.0} };
+	                       {1.0F, 1.0F} };
 	/* CPU test */
 	bool cpuTest = true;
@@ -189,11 +194,11 @@ bool TestPower3()
 	/* initialize variables */
 	a->SetData(aData, aUnitNum);
-	/* call power function */
+	/* call Power function */
-	Power(a, 0.0);
+	Power(a, 0.0F);
 	/* check results */
-	cpuTest = a->CheckData(answer, aUnitNum);
+	cpuTest = a->CheckData(answer, aUnitNum, 1e-4F);
 #ifdef USE_CUDA
 	/* GPU test */
@@ -205,14 +210,15 @@ bool TestPower3()
 	/* Initialize variables */
 	aGPU->SetData(aData, aUnitNum);
-	/* call power function */
+	/* call Power function */
-	Power(aGPU, 0.0);
+	Power(aGPU, 0.0F);
 	/* check results */
-	gpuTest = aGPU->CheckData(answer, aUnitNum);
+	gpuTest = aGPU->CheckData(answer, aUnitNum, 1e-4F);
 	/* destroy variables */
-	delete a, aGPU;
+	delete a;
+    delete aGPU;
 	delete[] aDimSize;
 	return cpuTest && gpuTest;
@@ -231,10 +237,9 @@ TODO!!
 */
 /* test for Power Function */
-extern "C"
 bool TestPower()
 {
-	XPRINT(0, stdout, "[TEST POWER] -------------\n");
+	XPRINT(0, stdout, "[TEST POWER] get the power(a, p) \n");
 	bool returnFlag = true, caseFlag = true;
 	/* case 1 test */

--- a/source/test/TPower.h
+++ b/source/test/TPower.h
@@ -22,7 +22,7 @@
 #ifndef __TEST_POWER_H__
 #define __TEST_POWER_H__
-#include "../core/Power.h"
+#include "../core/math/Power.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/test/TRectify.cpp
+++ b/source/test/TRectify.cpp
@@ -19,15 +19,17 @@
 * $Created by: Lin Ye (email: linye2015@outlook.com) 2018-06-14
 */
-#include "../XTensor.h"
+#include "TRectify.h"
-#include "../XDevice.h"
-#include "../function/Rectify.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* case 1: rectify function y = max(0, x) */
+/* 
+case 1: test rectify function
+In this case, y = max(0, x) 
+*/
 bool TestRectify1()
 {
-    /* a x tensor of size 2 * 3 */
+    /* an x tensor of size (2, 3) */
    int xOrder = 2;
    int * xDimSize = new int[xOrder];
    xDimSize[0] = 2;
@@ -37,7 +39,7 @@ bool TestRectify1()
    for (int i = 0; i < xOrder; i++)
        xUnitNum *= xDimSize[i];
-    /* a y tensor of size 2 * 3 */
+    /* a y tensor of size (2, 3) */
    int yOrder = 2;
    int * yDimSize = new int[yOrder];
    yDimSize[0] = 2;
@@ -47,10 +49,10 @@ bool TestRectify1()
    for (int i = 0; i < yOrder; i++)
        yUnitNum *= yDimSize[i];
-    DTYPE xData[2][3] = { {0.0, -1.0, 2.0},
+    DTYPE xData[2][3] = { {0.0F, -1.0F, 2.0F},
-                          {3.0, -4.0, -5.0} };
+                          {3.0F, -4.0F, -5.0F} };
-    DTYPE answer[2][3] = { {0.0, 0.0, 2.0},
+    DTYPE answer[2][3] = { {0.0F, 0.0F, 2.0F},
-                           {3.0, 0.0, 0.0} };
+                           {3.0F, 0.0F, 0.0F} };
    /* CPU test */
    bool cpuTest = true;
@@ -63,7 +65,7 @@ bool TestRectify1()
    x->SetData(xData, xUnitNum);
    y->SetZeroAll();
-    /* call rectify function */
+    /* call Rectify function */
    Rectify(x, y);
    /* check results */
@@ -81,32 +83,41 @@ bool TestRectify1()
 	xGPU->SetData(xData, xUnitNum);
 	yGPU->SetZeroAll();
-	/* call rectify function */
+	/* call Rectify function */
 	Rectify(xGPU, yGPU);
 	/* check results */
 	gpuTest = yGPU->CheckData(answer, yUnitNum);
 	/* destroy variables */
-	delete x, y, xGPU, yGPU;
+	delete x;
-	delete[] xDimSize, yDimSize;
+    delete y;
+    delete xGPU;
+    delete yGPU;
+	delete[] xDimSize;
+    delete[] yDimSize;
 	return cpuTest && gpuTest;
 #else
 	/* destroy variables */
-	delete x, y;
+	delete x;
-	delete[] xDimSize, yDimSize;
+    delete y;
+	delete[] xDimSize;
+    delete[] yDimSize;
 	return cpuTest;
 #endif // USE_CUDA
 }
-/* case 2: backward computation dE/dx = dE/dy * dy/dx rectified: y = max(0, x) 
+/* 
-* In this case, lossName=CROSSENTROPY.
+case 2: backward computation 
+dE/dx = dE/dy * dy/dx 
+rectified: y = max(0, x) 
+In this case, lossName=NOLOSS.
 */
 bool TestRectify2()
 {
-	/* a x tensor of size 2 * 3 */
+	/* an x tensor of size (2, 3) */
 	int xOrder = 2;
 	int * xDimSize = new int[xOrder];
 	xDimSize[0] = 2;
@@ -116,46 +127,6 @@ bool TestRectify2()
 	for (int i = 0; i < xOrder; i++)
 		xUnitNum *= xDimSize[i];
-	/* a y tensor of size 2 * 3 */
-	int yOrder = 2;
-	int * yDimSize = new int[yOrder];
-	yDimSize[0] = 2;
-	yDimSize[1] = 3;
-	int yUnitNum = 1;
-	for (int i = 0; i < yOrder; i++)
-		yUnitNum *= yDimSize[i];
-	/* a gold tensor of size 2 * 3 */
-	int goldOrder = 2;
-	int * goldDimSize = new int[goldOrder];
-	goldDimSize[0] = 2;
-	goldDimSize[1] = 3;
-	int goldUnitNum = 1;
-	for (int i = 0; i < goldOrder; i++)
-		goldUnitNum *= goldDimSize[i];
-	/* a dedy tensor of size 2 * 3 */
-	int dedyOrder = 2;
-	int * dedyDimSize = new int[dedyOrder];
-	dedyDimSize[0] = 2;
-	dedyDimSize[1] = 3;
-	int dedyUnitNum = 1;
-	for (int i = 0; i < dedyOrder; i++)
-		dedyUnitNum *= dedyDimSize[i];
-	/* a dedx tensor of size 2 * 3 */
-	int dedxOrder = 2;
-	int * dedxDimSize = new int[dedxOrder];
-	dedxDimSize[0] = 2;
-	dedxDimSize[1] = 3;
-	int dedxUnitNum = 1;
-	for (int i = 0; i < dedxOrder; i++)
-		dedxUnitNum *= dedxDimSize[i];
 	DTYPE xData[2][3] = { {1.0F, 1.0F, 2.0F},
 	                      {2.0F, 4.0F, 5.0F} };
 	DTYPE yData[2][3] = { {1.0F, 1.0F, 2.0F},
@@ -172,150 +143,23 @@ bool TestRectify2()
 	/* create tensors */
 	XTensor * x = NewTensor(xOrder, xDimSize);
-	XTensor * y = NewTensor(yOrder, yDimSize);
+	XTensor * y = NewTensor(xOrder, xDimSize);
-	XTensor * gold = NewTensor(goldOrder, goldDimSize);
+	XTensor * gold = NewTensor(xOrder, xDimSize);
-	XTensor * dedy = NewTensor(dedyOrder, dedyDimSize);
+	XTensor * dedy = NewTensor(xOrder, xDimSize);
-	XTensor * dedx = NewTensor(dedxOrder, dedxDimSize);
+	XTensor * dedx = NewTensor(xOrder, xDimSize);
-	/* initialize variables */
-	x->SetData(xData, xUnitNum);
-	y->SetData(yData, yUnitNum);
-	gold->SetData(goldData, goldUnitNum);
-	dedy->SetData(dedyData, dedyUnitNum);
-	dedx->SetZeroAll();
-	/* call rectifybackward function */
-	RectifyBackward(gold, y, x, dedy, dedx, CROSSENTROPY);
-	/* check results */
-	cpuTest = dedx->CheckData(answer, dedxUnitNum);
-#ifdef USE_CUDA
-	/* GPU test */
-	bool gpuTest = true;
-	/* create tensors */
-	XTensor * xGPU = NewTensor(xOrder, xDimSize, X_FLOAT, 1.0F, 0);
-	XTensor * yGPU = NewTensor(yOrder, yDimSize, X_FLOAT, 1.0F, 0);
-	XTensor * goldGPU = NewTensor(goldOrder, goldDimSize, X_FLOAT, 1.0F, 0);
-	XTensor * dedyGPU = NewTensor(dedyOrder, dedyDimSize, X_FLOAT, 1.0F, 0);
-	XTensor * dedxGPU = NewTensor(dedxOrder, dedxDimSize, X_FLOAT, 1.0F, 0);
-	/* initialize variables */
-	xGPU->SetData(xData, xUnitNum);
-	yGPU->SetData(yData, yUnitNum);
-	goldGPU->SetData(goldData, goldUnitNum);
-	dedyGPU->SetData(dedyData, dedyUnitNum);
-	dedxGPU->SetZeroAll();
-	/* call rectifybackward function */
-	RectifyBackward(goldGPU, yGPU, xGPU, dedyGPU, dedxGPU, CROSSENTROPY);
-	/* check results */
-	gpuTest = dedxGPU->CheckData(answer, dedxUnitNum);
-	/* destroy variables */
-	delete x, y, dedy, dedx, gold, xGPU, yGPU, dedyGPU, dedxGPU, goldGPU;
-	delete[] xDimSize, yDimSize, dedyDimSize, dedxDimSize, goldDimSize;
-	return cpuTest && gpuTest;
-#else
-	/* destroy variables */
-	delete x, y, dedy, dedx, gold;
-	delete[] xDimSize, yDimSize, dedyDimSize, dedxDimSize, goldDimSize;
-	return cpuTest;
-#endif // USE_CUDA
-}
-/* case 3: backward computation dE/dx = dE/dy * dy/dx rectified: y = max(0, x)
-* In this case, lossName=SQUAREDERROR.
-*/
-bool TestRectify3()
-{
-	/* a x tensor of size 2 * 3 */
-	int xOrder = 2;
-	int * xDimSize = new int[xOrder];
-	xDimSize[0] = 2;
-	xDimSize[1] = 3;
-	int xUnitNum = 1;
-	for (int i = 0; i < xOrder; i++)
-		xUnitNum *= xDimSize[i];
-	/* a y tensor of size 2 * 3 */
-	int yOrder = 2;
-	int * yDimSize = new int[yOrder];
-	yDimSize[0] = 2;
-	yDimSize[1] = 3;
-	int yUnitNum = 1;
-	for (int i = 0; i < yOrder; i++)
-		yUnitNum *= yDimSize[i];
-	/* a gold tensor of size 2 * 3 */
-	int goldOrder = 2;
-	int * goldDimSize = new int[goldOrder];
-	goldDimSize[0] = 2;
-	goldDimSize[1] = 3;
-	int goldUnitNum = 1;
-	for (int i = 0; i < goldOrder; i++)
-		goldUnitNum *= goldDimSize[i];
-	/* a dedy tensor of size 2 * 3 */
-	int dedyOrder = 2;
-	int * dedyDimSize = new int[dedyOrder];
-	dedyDimSize[0] = 2;
-	dedyDimSize[1] = 3;
-	int dedyUnitNum = 1;
-	for (int i = 0; i < dedyOrder; i++)
-		dedyUnitNum *= dedyDimSize[i];
-	/* a dedx tensor of size 2 * 3 */
-	int dedxOrder = 2;
-	int * dedxDimSize = new int[dedxOrder];
-	dedxDimSize[0] = 2;
-	dedxDimSize[1] = 3;
-	int dedxUnitNum = 1;
-	for (int i = 0; i < dedxOrder; i++)
-		dedxUnitNum *= dedxDimSize[i];
-	DTYPE xData[2][3] = { {1.0, 1.0, 2.0},
-	                      {2.0, 4.0, 5.0} };
-	DTYPE yData[2][3] = { {1.0, 1.0, 2.0},
-	                      {2.0, 4.0, 5.0} };
-	DTYPE goldData[2][3] = { {1.0, 1.0, 1.0},
-	                         {1.0, 1.0, 1.0} };
-	DTYPE dedyData[2][3] = { {0.0, 0.0, 1.0},
-	                         {1.0, 3.0, 4.0} };
-	DTYPE answer[2][3] = { {0.0, 0.0, 1.0},
-	                       {1.0, 3.0, 4.0} };
-	/* CPU test */
-	bool cpuTest = true;
-	/* create tensors */
-	XTensor * x = NewTensor(xOrder, xDimSize);
-	XTensor * y = NewTensor(yOrder, yDimSize);
-	XTensor * gold = NewTensor(goldOrder, goldDimSize);
-	XTensor * dedy = NewTensor(dedyOrder, dedyDimSize);
-	XTensor * dedx = NewTensor(dedxOrder, dedxDimSize);
 	/* initialize variables */
 	x->SetData(xData, xUnitNum);
-	y->SetData(yData, yUnitNum);
+	y->SetData(yData, xUnitNum);
-	gold->SetData(goldData, goldUnitNum);
+	gold->SetData(goldData, xUnitNum);
-	dedy->SetData(dedyData, dedyUnitNum);
+	dedy->SetData(dedyData, xUnitNum);
 	dedx->SetZeroAll();
-	/* call rectifybackward function */
+	/* call RectifyBackward function */
-	RectifyBackward(gold, y, x, dedy, dedx, CROSSENTROPY);
+	RectifyBackward(gold, y, x, dedy, dedx, NOLOSS);
 	/* check results */
-	cpuTest = dedx->CheckData(answer, dedxUnitNum);
+	cpuTest = dedx->CheckData(answer, xUnitNum);
 #ifdef USE_CUDA
 	/* GPU test */
@@ -323,160 +167,46 @@ bool TestRectify3()
 	/* create tensors */
 	XTensor * xGPU = NewTensor(xOrder, xDimSize, X_FLOAT, 1.0F, 0);
-	XTensor * yGPU = NewTensor(yOrder, yDimSize, X_FLOAT, 1.0F, 0);
+	XTensor * yGPU = NewTensor(xOrder, xDimSize, X_FLOAT, 1.0F, 0);
-	XTensor * goldGPU = NewTensor(goldOrder, goldDimSize, X_FLOAT, 1.0F, 0);
+	XTensor * goldGPU = NewTensor(xOrder, xDimSize, X_FLOAT, 1.0F, 0);
-	XTensor * dedyGPU = NewTensor(dedyOrder, dedyDimSize, X_FLOAT, 1.0F, 0);
+	XTensor * dedyGPU = NewTensor(xOrder, xDimSize, X_FLOAT, 1.0F, 0);
-	XTensor * dedxGPU = NewTensor(dedxOrder, dedxDimSize, X_FLOAT, 1.0F, 0);
+	XTensor * dedxGPU = NewTensor(xOrder, xDimSize, X_FLOAT, 1.0F, 0);
 	/* initialize variables */
 	xGPU->SetData(xData, xUnitNum);
-	yGPU->SetData(yData, yUnitNum);
+	yGPU->SetData(yData, xUnitNum);
-	goldGPU->SetData(goldData, goldUnitNum);
+	goldGPU->SetData(goldData, xUnitNum);
-	dedyGPU->SetData(dedyData, dedyUnitNum);
+	dedyGPU->SetData(dedyData, xUnitNum);
 	dedxGPU->SetZeroAll();
 	/* call rectifybackward function */
-	RectifyBackward(goldGPU, yGPU, xGPU, dedyGPU, dedxGPU, CROSSENTROPY);
+	RectifyBackward(goldGPU, yGPU, xGPU, dedyGPU, dedxGPU, NOLOSS);
 	/* check results */
-	gpuTest = dedxGPU->CheckData(answer, dedxUnitNum);
+	gpuTest = dedxGPU->CheckData(answer, xUnitNum);
 	/* destroy variables */
-	delete x, y, dedy, dedx, gold, xGPU, yGPU, dedyGPU, dedxGPU, goldGPU;
+    delete x;
-	delete[] xDimSize, yDimSize, dedyDimSize, dedxDimSize, goldDimSize;
+    delete y;
+    delete dedy;
+    delete dedx;
+    delete gold;
+    delete xGPU;
+    delete yGPU;
+    delete dedyGPU;
+    delete dedxGPU;
+    delete goldGPU;
+	delete[] xDimSize;
 	return cpuTest && gpuTest;
 #else
 	/* destroy variables */
-	delete x, y, dedy, dedx, gold;
+    delete x;
-	delete[] xDimSize, yDimSize, dedyDimSize, dedxDimSize, goldDimSize;
+    delete y;
+    delete dedy;
-	return cpuTest;
+    delete dedx;
-#endif // USE_CUDA
+    delete gold;
-}
+	delete[] xDimSize;
-/* case 4: backward computation dE/dx = dE/dy * dy/dx rectified: y = max(0, x)
-* In this case, lossName=ONEHOTERROR.
-*/
-bool TestRectify4()
-{
-	/* a x tensor of size 2 * 3 */
-	int xOrder = 2;
-	int * xDimSize = new int[xOrder];
-	xDimSize[0] = 2;
-	xDimSize[1] = 3;
-	int xUnitNum = 1;
-	for (int i = 0; i < xOrder; i++)
-		xUnitNum *= xDimSize[i];
-	/* a y tensor of size 2 * 3 */
-	int yOrder = 2;
-	int * yDimSize = new int[yOrder];
-	yDimSize[0] = 2;
-	yDimSize[1] = 3;
-	int yUnitNum = 1;
-	for (int i = 0; i < yOrder; i++)
-		yUnitNum *= yDimSize[i];
-	/* a gold tensor of size 2 * 3 */
-	int goldOrder = 2;
-	int * goldDimSize = new int[goldOrder];
-	goldDimSize[0] = 2;
-	goldDimSize[1] = 3;
-	int goldUnitNum = 1;
-	for (int i = 0; i < goldOrder; i++)
-		goldUnitNum *= goldDimSize[i];
-	/* a dedy tensor of size 2 * 3 */
-	int dedyOrder = 2;
-	int * dedyDimSize = new int[dedyOrder];
-	dedyDimSize[0] = 2;
-	dedyDimSize[1] = 3;
-	int dedyUnitNum = 1;
-	for (int i = 0; i < dedyOrder; i++)
-		dedyUnitNum *= dedyDimSize[i];
-	/* a dedx tensor of size 2 * 3 */
-	int dedxOrder = 2;
-	int * dedxDimSize = new int[dedxOrder];
-	dedxDimSize[0] = 2;
-	dedxDimSize[1] = 3;
-	int dedxUnitNum = 1;
-	for (int i = 0; i < dedxOrder; i++)
-		dedxUnitNum *= dedxDimSize[i];
-	DTYPE xData[2][3] = { {1.0, 1.0, -2.0},
-	                      {2.0, 4.0, 5.0} };
-	DTYPE yData[2][3] = { {1.0, 1.0, 0.0},
-	                      {2.0, 4.0, 5.0} };
-	DTYPE goldData[2][3] = { {1.0, 0.0, 1.0},
-	                         {1.0, 1.0, 0.0} };
-	DTYPE dedyData[2][3] = { {0.0, 0.0, -1.0},
-	                         {1.0, 3.0, 0.0} };
-	DTYPE answer[2][3] = { {0.0, 0.0, 0.0},
-	                       {1.0, 3.0, 0.0} };
-	/* CPU test */
-	bool cpuTest = true;
-	/* create tensors */
-	XTensor * x = NewTensor(xOrder, xDimSize);
-	XTensor * y = NewTensor(yOrder, yDimSize);
-	XTensor * gold = NewTensor(goldOrder, goldDimSize);
-	XTensor * dedy = NewTensor(dedyOrder, dedyDimSize);
-	XTensor * dedx = NewTensor(dedxOrder, dedxDimSize);
-	/* initialize variables */
-	x->SetData(xData, xUnitNum);
-	y->SetData(yData, yUnitNum);
-	gold->SetData(goldData, goldUnitNum);
-	dedy->SetData(dedyData, dedyUnitNum);
-	dedx->SetZeroAll();
-	/* call rectifybackward function */
-	RectifyBackward(gold, y, x, dedy, dedx, ONEHOTERROR);
-	/* check results */
-	cpuTest = dedx->CheckData(answer, dedxUnitNum);
-#ifdef USE_CUDA
-	/* GPU test */
-	bool gpuTest = true;
-	/* create tensors */
-	XTensor * xGPU = NewTensor(xOrder, xDimSize, X_FLOAT, 1.0F, 0);
-	XTensor * yGPU = NewTensor(yOrder, yDimSize, X_FLOAT, 1.0F, 0);
-	XTensor * goldGPU = NewTensor(goldOrder, goldDimSize, X_FLOAT, 1.0F, 0);
-	XTensor * dedyGPU = NewTensor(dedyOrder, dedyDimSize, X_FLOAT, 1.0F, 0);
-	XTensor * dedxGPU = NewTensor(dedxOrder, dedxDimSize, X_FLOAT, 1.0F, 0);
-	/* initialize variables */
-	xGPU->SetData(xData, xUnitNum);
-	yGPU->SetData(yData, yUnitNum);
-	goldGPU->SetData(goldData, goldUnitNum);
-	dedyGPU->SetData(dedyData, dedyUnitNum);
-	dedxGPU->SetZeroAll();
-	/* call rectifybackward function */
-	RectifyBackward(goldGPU, yGPU, xGPU, dedyGPU, dedxGPU, CROSSENTROPY);
-	/* check results */
-	gpuTest = dedxGPU->CheckData(answer, dedxUnitNum);
-	/* destroy variables */
-	delete x, y, dedy, dedx, gold, xGPU, yGPU, dedyGPU, dedxGPU, goldGPU;
-	delete[] xDimSize, yDimSize, dedyDimSize, dedxDimSize, goldDimSize;
-	return cpuTest && gpuTest;
-#else
-	/* destroy variables */
-	delete x, y, dedy, dedx, gold;
-	delete[] xDimSize, yDimSize, dedyDimSize, dedxDimSize, goldDimSize;
 	return cpuTest;
 #endif // USE_CUDA
@@ -488,10 +218,9 @@ TODO!!
 */
 /* test for Rectify Function */
-extern "C"
 bool TestRectify()
 {
-    XPRINT(0, stdout, "[TEST RECTIFY] -------------\n");
+    XPRINT(0, stdout, "[TEST RECTIFY] test rectify and its backward computation \n");
    bool returnFlag = true, caseFlag = true;
    /* case 1 test */
@@ -514,26 +243,6 @@ bool TestRectify()
 	else
 		XPRINT(0, stdout, ">> case 2 passed!\n");
-	/* case 3 test */
-	caseFlag = TestRectify3();
-	if (!caseFlag) {
-		returnFlag = false;
-		XPRINT(0, stdout, ">> case 3 failed!\n");
-	}
-	else
-		XPRINT(0, stdout, ">> case 3 passed!\n");
-	/* case 4 test */
-	caseFlag = TestRectify4();
-	if (!caseFlag) {
-		returnFlag = false;
-		XPRINT(0, stdout, ">> case 4 failed!\n");
-	}
-	else
-		XPRINT(0, stdout, ">> case 4 passed!\n");
    /* other cases test */
    /*
    TODO!!

--- a/source/test/TReduceMax.cpp
+++ b/source/test/TReduceMax.cpp
@@ -16,212 +16,129 @@
 */
 /*
-* $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2018-04-30
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-30
 */
-#include "../XTensor.h"
+#include "TReduceMax.h"
-#include "../XDevice.h"
-#include "../core/ReduceMax.h"
-namespace nts { // namespace nt(NiuTrans.Tensor)
+namespace nts { // namespace nts(NiuTrans.Tensor)
-                /* case 1 */
-    bool TestReduceMax1()
-    {
-        /* a tensor of size 2 * 4 */
-        int order = 2;
-        int order_reduce = 1;
-        int * dimSize = new int[order];
-        dimSize[0] = 2;
-        dimSize[1] = 4;
-        int unitNum = 1;
+/* 
-        for (int i = 0; i < order; i++)
+case 1: get the max value of the items along a dimension of the tensor. 
-            unitNum *= dimSize[i];
+In this case,
-        /* a tensor of size 4 */
+(2, 4) -> (4), dim = 0
-        int * dimSize_reduce_a = new int[order_reduce];
+(2, 4) -> (2), dim = 1
-        dimSize_reduce_a[0] = 4;
+*/
+bool TestReduceMax1()
-        int unitNum_a = 1;
+{
-        for (int i = 0; i < order_reduce; i++)
+    /* an input tensor of size (2, 4) */
-            unitNum_a *= dimSize_reduce_a[i];
+    int sOrder = 2;
-        /* a tensor of size 2 */
+    int * sDimSize = new int[sOrder];
-        int * dimSize_reduce_b = new int[order_reduce];
+    sDimSize[0] = 2;
-        dimSize_reduce_b[0] = 2;
+    sDimSize[1] = 4;
-        int unitNum_b = 1;
+    int sUnitNum = 1;
-        for (int i = 0; i < order_reduce; i++)
+    for (int i = 0; i < sOrder; i++)
-            unitNum_b *= dimSize_reduce_b[i];
+        sUnitNum *= sDimSize[i];
+    /* an output tensor of size (4) */
-        DTYPE aData[2][4] = { { 0.0,   1.0,   2.0,   3.0 },
+    int tOrder1 = 1;
-                              { 4.0,   5.0,   6.0,   7.0 } };
+    int * tDimSize1 = new int[tOrder1];
-        DTYPE bData[2][4] = { { 1.0,  -1.0,  -3.0,  -5.0 },
+    tDimSize1[0] = 4;
-                              { -7.0, -9.0, -11.0, -13.0 } };
-        DTYPE answer_a[4] = { 4.0,  5.0,  6.0,  7.0 };
+    int tUnitNum1 = 1;
-        DTYPE answer_b[2] = { 1.0,  -7.0 };
+    for (int i = 0; i < tOrder1; i++)
+        tUnitNum1 *= tDimSize1[i];
+    /* an output tensor of size (2) */
+    int tOrder2 = 1;
+    int * tDimSize2 = new int[tOrder2];
+    tDimSize2[0] = 2;
+    int tUnitNum2 = 1;
+    for (int i = 0; i < tOrder2; i++)
+        tUnitNum2 *= tDimSize2[i];
+    DTYPE sData[2][4] = { {0.0F, 5.0F, 2.0F, 3.0F},
+                          {4.0F, 1.0F, 6.0F, 7.0F} };
+    DTYPE answer1[4] = {4.0F, 5.0F, 6.0F, 7.0F};
+    DTYPE answer2[2] = {5.0F, 7.0F};
    /* CPU test */
    bool cpuTest = true;
    /* create tensors */
-        XTensor * a = NewTensor(order, dimSize);
+    XTensor * s = NewTensor(sOrder, sDimSize);
-        XTensor * reduce_a = NewTensor(order_reduce, dimSize_reduce_a);
+    XTensor * t1 = NewTensor(tOrder1, tDimSize1);
-        XTensor * b = NewTensor(order, dimSize);
+    XTensor * t2 = NewTensor(tOrder2, tDimSize2);
-        XTensor * reduce_b = NewTensor(order_reduce, dimSize_reduce_b);
    /* initialize variables */
-        a->SetData(aData, unitNum);
+    s->SetData(sData, sUnitNum);
-        b->SetData(bData, unitNum);
+    t1->SetZeroAll();
+    t2->SetZeroAll();
-        /* call reduce max function */
+    /* call ReduceMax function */
-        ReduceMax(a, reduce_a, 0);
+    ReduceMax(s, t1, 0);
-        ReduceMax(b, reduce_b, 1);
+    ReduceMax(s, t2, 1);
-        //DTYPE* reduce_a_data = (DTYPE*)reduce_a->data;
-        //for (int i = 0; i < unitNum_a; i++)
-        //    printf("%f ", *reduce_a_data++);
-        //printf("\n");
-        //DTYPE* reduce_b_data = (DTYPE*)reduce_b->data;
-        //for (int i = 0; i < unitNum_b; i++)
-        //    printf("%f ", *reduce_b_data++);
    /* check results */
-        cpuTest = reduce_a->CheckData(answer_a, unitNum_a) && reduce_b->CheckData(answer_b, unitNum_b);
+    cpuTest = t1->CheckData(answer1, tUnitNum1) && t2->CheckData(answer2, tUnitNum2);
 #ifdef USE_CUDA
    /* GPU test */
    bool gpuTest = true;
-        /* create tensor */
-        XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT);
-        XTensor * reduce_aGPU = NewTensor(order_reduce, dimSize_reduce_a, X_FLOAT);
-        XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT);
-        XTensor * reduce_bGPU = NewTensor(order_reduce, dimSize_reduce_b, X_FLOAT);
-        /* Initialize variables */
-        aGPU->SetData(aData, unitNum);
-        bGPU->SetData(bData, unitNum);
-        /* call reduce max function */
-        ReduceMax(aGPU, reduce_aGPU, 0);
-        ReduceMax(bGPU, reduce_bGPU, 1);
-        /* check results */
-        gpuTest = reduce_aGPU->CheckData(answer_a, unitNum_a) && reduce_bGPU->CheckData(answer_b, unitNum_b);
-        /* destroy variables */
-        delete aGPU, bGPU, reduce_aGPU, reduce_bGPU;
-        delete[] dimSize, dimSize_reduce_a, dimSize_reduce_b;
-        return cpuTest && gpuTest;
-#else
-        /* destroy variables */
-        delete a;
-        delete b;
-        return cpuTest;
-#endif // USE_CUDA
-    }
-    bool TestReduceMaxForLargescale()
-    {
-        /* a tensor of size 10000 * 500 */
-        int order = 2;
-        int order_reduce = 1;
-        int * dimSize = new int[order];
-        dimSize[0] = 10000;
-        dimSize[1] = 500;
-        int unitNum = 1;
-        for (int i = 0; i < order; i++)
-            unitNum *= dimSize[i];
-        /* a tensor of size 500 */
-        int * dimSize_reduce_a = new int[order_reduce];
-        dimSize_reduce_a[0] = 500;
-        int unitNum_a = 1;
-        for (int i = 0; i < order_reduce; i++)
-            unitNum_a *= dimSize_reduce_a[i];
-        /* a tensor of size 10000 */
-        int * dimSize_reduce_b = new int[order_reduce];
-        dimSize_reduce_b[0] = 10000;
-        int unitNum_b = 1;
-        for (int i = 0; i < order_reduce; i++)
-            unitNum_b *= dimSize_reduce_b[i];
-        DTYPE * data = new DTYPE[5000000];
-        DTYPE * tmp = data;
-        for (int i = 0; i < unitNum; i++)
-            *tmp++ = 1;
-        DTYPE answer_a[500];
-        for (int i = 0; i < unitNum_a; i++)
-            answer_a[i] = 1;
-        DTYPE answer_b[10000];
-        for (int i = 0; i < unitNum_b; i++)
-            answer_b[i] = 1;
-        /* CPU test */
-        bool cpuTest = true;
    /* create tensors */
-        XTensor * a = NewTensor(order, dimSize);
+    XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
-        XTensor * reduce_a = NewTensor(order_reduce, dimSize_reduce_a);
+    XTensor * tGPU1 = NewTensor(tOrder1, tDimSize1, X_FLOAT, 1.0F, 0);
-        XTensor * b = NewTensor(order, dimSize);
+    XTensor * tGPU2 = NewTensor(tOrder2, tDimSize2, X_FLOAT, 1.0F, 0);
-        XTensor * reduce_b = NewTensor(order_reduce, dimSize_reduce_b);
    /* initialize variables */
-        a->SetData(data, unitNum);
+    sGPU->SetData(sData, sUnitNum);
-        b->SetData(data, unitNum);
+    tGPU1->SetZeroAll();
-        /* call reduce max function */
+    tGPU2->SetZeroAll();
-        ReduceMax(a, reduce_a, 0);
-        ReduceMax(b, reduce_b, 1);
-        /* check results */
-        cpuTest = reduce_a->CheckData(answer_a, unitNum_a) && reduce_b->CheckData(answer_b, unitNum_b);
-#ifdef USE_CUDA
+    /* call ReduceMax function */
-        /* GPU test */
+    ReduceMax(sGPU, tGPU1, 0);
-        bool gpuTest = true;
+    ReduceMax(sGPU, tGPU2, 1);
-        /* create tensor */
-        XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT);
-        XTensor * reduce_aGPU = NewTensor(order_reduce, dimSize_reduce_a, X_FLOAT);
-        XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT);
-        XTensor * reduce_bGPU = NewTensor(order_reduce, dimSize_reduce_b, X_FLOAT);
-        /* Initialize variables */
-        aGPU->SetData(data, unitNum);
-        bGPU->SetData(data, unitNum);
-        /* call reduce max function */
-        ReduceMax(aGPU, reduce_aGPU, 0);
-        ReduceMax(bGPU, reduce_bGPU, 1);
    /* check results */
-        gpuTest = reduce_aGPU->CheckData(answer_a, unitNum_a) && reduce_bGPU->CheckData(answer_b, unitNum_b);
+    gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tGPU2->CheckData(answer2, tUnitNum2);
    /* destroy variables */
-        delete aGPU, bGPU, reduce_aGPU, reduce_bGPU;
+    delete s;
-        delete[] dimSize, dimSize_reduce_a, dimSize_reduce_b;
+    delete t1;
+    delete t2;
+    delete sGPU;
+    delete tGPU1;
+    delete tGPU2;
+    delete[] sDimSize;
+    delete[] tDimSize1;
+    delete[] tDimSize2;
    return cpuTest && gpuTest;
 #else
    /* destroy variables */
-        delete a;
+    delete s;
-        delete b;
+    delete t1;
+    delete t2;
+    delete[] sDimSize;
+    delete[] tDimSize1;
+    delete[] tDimSize2;
    return cpuTest;
 #endif // USE_CUDA
-    }
+}
-    /* other cases */
+/* other cases */
-    /*
+/*
-    TODO!!
+TODO!!
-    */
+*/
-    /* test for Sum Function */
+/* test for ReduceMax Function */
-    extern "C"
+bool TestReduceMax()
-        bool TestReduceMax()
+{
-    {
+    XPRINT(0, stdout, "[TEST ReduceMax] get the max value of the items along a dimension of the tensor\n");
-        XPRINT(0, stdout, "[TEST ReduceMax]\n");
    bool returnFlag = true, caseFlag = true;
    /* case 1 test */
@@ -233,19 +150,10 @@ namespace nts { // namespace nt(NiuTrans.Tensor)
    else
        XPRINT(0, stdout, ">> case 1 passed!\n");
-        /* case 2 test */
+    /* other cases test */
-        caseFlag = TestReduceMaxForLargescale();
+    /*
-        if (!caseFlag) {
+    TODO!!
-            returnFlag = false;
+    */
-            XPRINT(0, stdout, ">> case 2 failed!\n");
-        }
-        else
-            XPRINT(0, stdout, ">> case 2 passed!\n");
-        ///* other cases test */
-        ///*
-        //TODO!!
-        //*/
    if (returnFlag) {
        XPRINT(0, stdout, ">> All Passed!\n");
@@ -258,4 +166,4 @@ namespace nts { // namespace nt(NiuTrans.Tensor)
    return returnFlag;
    }
-} // namespace nt(NiuTrans.Tensor)
+} // namespace nts(NiuTrans.Tensor)
--- a/source/test/TReduceMax.h
+++ b/source/test/TReduceMax.h
@@ -16,20 +16,19 @@
 */
 /*
-* $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2018-04-30
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-30
 */
 #ifndef __TEST_REDUCEMAX_H__
 #define __TEST_REDUCEMAX_H__
-#include "../core/ReduceMax.h"
+#include "../core/reduce/ReduceMax.h"
-namespace nts { // namespace nt(NiuTrans.Tensor)
+namespace nts { // namespace nts(NiuTrans.Tensor)
 /* test for ReduceMax Function */
 extern "C"
 bool TestReduceMax();
-} // namespace nt(NiuTrans.Tensor)
+} // namespace nts(NiuTrans.Tensor)
 #endif // __TEST_REDUCEMAX_H__
--- a/source/test/TReduceMean.cpp
+++ b/source/test/TReduceMean.cpp
@@ -19,211 +19,121 @@
 * $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2018-04-30
 */
-#include "../XTensor.h"
+#include "TReduceMean.h"
-#include "../XDevice.h"
-#include "../core/ReduceMean.h"
-#include "../core/ReduceMax.h"
-#include "../core/ReduceSum.h"
 namespace nts { // namespace nt(NiuTrans.Tensor)
-                /* case 1 */
-    bool TestReduceMean1()
+/* case 1: get the mean value along a dimension of the tensor */
-    {
+bool TestReduceMean1()
-        /* a tensor of size 2 * 4 */
+{
-        int order = 2;
+    /* a tensor of size (2, 4) */
-        int order_reduce = 1;
+    int sOrder = 2;
-        int * dimSize = new int[order];
+    int * sDimSize = new int[sOrder];
-        dimSize[0] = 2;
+    sDimSize[0] = 2;
-        dimSize[1] = 4;
+    sDimSize[1] = 4;
-        int unitNum = 1;
+    int sUnitNum = 1;
-        for (int i = 0; i < order; i++)
+    for (int i = 0; i < sOrder; i++)
-            unitNum *= dimSize[i];
+        sUnitNum *= sDimSize[i];
-        /* a tensor of size 4 */
-        int * dimSize_reduce_a = new int[order_reduce];
+    /* a tensor of size (4) */
-        dimSize_reduce_a[0] = 4;
+    int tOrder1 = 1;
+    int * tDimSize1 = new int[tOrder1];
-        int unitNum_a = 1;
+    tDimSize1[0] = 4;
-        for (int i = 0; i < order_reduce; i++)
-            unitNum_a *= dimSize_reduce_a[i];
+    int tUnitNum1 = 1;
-        /* a tensor of size 2 */
+    for (int i = 0; i < tOrder1; i++)
-        int * dimSize_reduce_b = new int[order_reduce];
+        tUnitNum1 *= tDimSize1[i];
-        dimSize_reduce_b[0] = 2;
+    /* a tensor of size (2) */
-        int unitNum_b = 1;
+    int tOrder2 = 1;
-        for (int i = 0; i < order_reduce; i++)
+    int * tDimSize2 = new int[tOrder2];
-            unitNum_b *= dimSize_reduce_b[i];
+    tDimSize2[0] = 2;
+    int tUnitNum2 = 1;
-        DTYPE aData[2][4] = { { 0.0,   1.0,   2.0,   3.0 },
+    for (int i = 0; i < tOrder2; i++)
-                              { 4.0,   5.0,   6.0,   7.0 } };
+        tUnitNum2 *= tDimSize2[i];
-        DTYPE bData[2][4] = { { 1.0,  -1.0,  -3.0,  -5.0 },
-                              { -7.0, -9.0, -11.0, -13.0 } };
+    DTYPE sData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
-        DTYPE answer_a[4] = { 2.0,  3.0,  4.0,  5.0 };
+                          {4.0F, 5.0F, 6.0F, 7.0F} };
-        DTYPE answer_b[2] = { -2.0,  -10.0 };
+    DTYPE answer1[4] = {2.0F, 3.0F, 4.0F, 5.0F};
+    DTYPE answer2[2] = {1.5F, 5.5F};
    /* CPU test */
    bool cpuTest = true;
    /* create tensors */
-        XTensor * a = NewTensor(order, dimSize);
+    XTensor * s = NewTensor(sOrder, sDimSize);
-        XTensor * reduce_a = NewTensor(order_reduce, dimSize_reduce_a);
+    XTensor * t1 = NewTensor(tOrder1, tDimSize1);
-        XTensor * b = NewTensor(order, dimSize);
+    XTensor * t2 = NewTensor(tOrder2, tDimSize2);
-        XTensor * reduce_b = NewTensor(order_reduce, dimSize_reduce_b);
    /* initialize variables */
-        a->SetData(aData, unitNum);
+    s->SetData(sData, sUnitNum);
-        b->SetData(bData, unitNum);
+    t1->SetZeroAll();
+    t2->SetZeroAll();
-        /* call reduce mean function */
-        ReduceMean(a, reduce_a, 0);
-        ReduceMean(b, reduce_b, 1);
-        //DTYPE* reduce_a_data = (DTYPE*)reduce_a->data;
+    /* call ReduceMean function */
-        //for (int i = 0; i < unitNum_a; i++)
+    ReduceMean(s, t1, 0);
-        //    printf("%f ", *reduce_a_data++);
+    ReduceMean(s, t2, 1);
-        //printf("\n");
-        //DTYPE* reduce_b_data = (DTYPE*)reduce_b->data;
-        //for (int i = 0; i < unitNum_b; i++)
-        //    printf("%f ", *reduce_b_data++);
    /* check results */
-        cpuTest = reduce_a->CheckData(answer_a, unitNum_a) && reduce_b->CheckData(answer_b, unitNum_b);
+    cpuTest = t1->CheckData(answer1, tUnitNum1) && t2->CheckData(answer2, tUnitNum2);
 #ifdef USE_CUDA
    /* GPU test */
    bool gpuTest = true;
    /* create tensor */
-        XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT);
+    XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
-        XTensor * reduce_aGPU = NewTensor(order_reduce, dimSize_reduce_a, X_FLOAT);
+    XTensor * tGPU1 = NewTensor(tOrder1, tDimSize1, X_FLOAT, 1.0F, 0);
-        XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT);
+    XTensor * tGPU2 = NewTensor(tOrder2, tDimSize2, X_FLOAT, 1.0F, 0);
-        XTensor * reduce_bGPU = NewTensor(order_reduce, dimSize_reduce_b, X_FLOAT);
    /* Initialize variables */
-        aGPU->SetData(aData, unitNum);
+    sGPU->SetData(sData, sUnitNum);
-        bGPU->SetData(bData, unitNum);
+    tGPU1->SetZeroAll();
+    tGPU2->SetZeroAll();
-        /* call reduce mean function */
+    /* call ReduceMean function */
-        ReduceMean(aGPU, reduce_aGPU, 0);
+    ReduceMean(sGPU, tGPU1, 0);
-        ReduceMean(bGPU, reduce_bGPU, 1);
+    ReduceMean(sGPU, tGPU2, 1);
    /* check results */
-        gpuTest = reduce_aGPU->CheckData(answer_a, unitNum_a) && reduce_bGPU->CheckData(answer_b, unitNum_b);
+    /* keep the GPU verdict in gpuTest: writing it to cpuTest would discard the CPU result and leave gpuTest at its initial true */
+    gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tGPU2->CheckData(answer2, tUnitNum2);
    /* destroy variables */
-        delete aGPU, bGPU, reduce_aGPU, reduce_bGPU;
+    delete s;
-        delete dimSize, dimSize_reduce_a, dimSize_reduce_b;
+    delete t1;
-        return cpuTest && gpuTest;
+    delete t2;
-#else
+    delete sGPU;
-        /* destroy variables */
+    delete tGPU1;
-        delete a;
+    delete tGPU2;
-        delete b;
+    delete[] sDimSize;
-        return cpuTest;
+    delete[] tDimSize1;
-#endif // USE_CUDA
+    delete[] tDimSize2;
-    }
-    bool TestReduceMeanForLargescale()
-    {
-        /* a tensor of size 10000 * 500 */
-        int order = 2;
-        int order_reduce = 1;
-        int * dimSize = new int[order];
-        dimSize[0] = 10000;
-        dimSize[1] = 500;
-        int unitNum = 1;
-        for (int i = 0; i < order; i++)
-            unitNum *= dimSize[i];
-        /* a tensor of size 500 */
-        int * dimSize_reduce_a = new int[order_reduce];
-        dimSize_reduce_a[0] = 500;
-        int unitNum_a = 1;
-        for (int i = 0; i < order_reduce; i++)
-            unitNum_a *= dimSize_reduce_a[i];
-        /* a tensor of size 10000 */
-        int * dimSize_reduce_b = new int[order_reduce];
-        dimSize_reduce_b[0] = 10000;
-        int unitNum_b = 1;
-        for (int i = 0; i < order_reduce; i++)
-            unitNum_b *= dimSize_reduce_b[i];
-        DTYPE * data = new DTYPE[5000000];
-        DTYPE * tmp = data;
-        for (int i = 0; i < unitNum; i++)
-            *tmp++ = 1;
-        DTYPE answer_a[500];
-        for (int i = 0; i < unitNum_a; i++)
-            answer_a[i] = 1;
-        DTYPE answer_b[10000];
-        for (int i = 0; i < unitNum_b; i++)
-            answer_b[i] = 1;
-        /* CPU test */
-        bool cpuTest = true;
-        /* create tensors */
-        XTensor * a = NewTensor(order, dimSize);
-        XTensor * reduce_a = NewTensor(order_reduce, dimSize_reduce_a);
-        XTensor * b = NewTensor(order, dimSize);
-        XTensor * reduce_b = NewTensor(order_reduce, dimSize_reduce_b);
-        /* initialize variables */
-        a->SetData(data, unitNum);
-        b->SetData(data, unitNum);
-        /* call reduce max function */
-        ReduceMean(a, reduce_a, 0);
-        ReduceMean(b, reduce_b, 1);
-        /* check results */
-        cpuTest = reduce_a->CheckData(answer_a, unitNum_a) && reduce_b->CheckData(answer_b, unitNum_b);
-#ifdef USE_CUDA
-        /* GPU test */
-        bool gpuTest = true;
-        /* create tensor */
-        XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT);
-        XTensor * reduce_aGPU = NewTensor(order_reduce, dimSize_reduce_a, X_FLOAT);
-        XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT);
-        XTensor * reduce_bGPU = NewTensor(order_reduce, dimSize_reduce_b, X_FLOAT);
-        /* Initialize variables */
-        aGPU->SetData(data, unitNum);
-        bGPU->SetData(data, unitNum);
-        /* call reduce max function */
-        ReduceMean(aGPU, reduce_aGPU, 0);
-        ReduceMean(bGPU, reduce_bGPU, 1);
-        /* check results */
-        gpuTest = reduce_aGPU->CheckData(answer_a, unitNum_a) && reduce_bGPU->CheckData(answer_b, unitNum_b);
-        /* destroy variables */
-        delete aGPU, bGPU, reduce_aGPU, reduce_bGPU;
-        delete[] dimSize, dimSize_reduce_a, dimSize_reduce_b;
    return cpuTest && gpuTest;
 #else
    /* destroy variables */
-        delete a;
+    delete s;
-        delete b;
+    delete t1;
+    delete t2;
+    delete[] sDimSize;
+    delete[] tDimSize1;
+    delete[] tDimSize2;
    return cpuTest;
 #endif // USE_CUDA
-    }
+}
-    /* other cases */
-    /*
+/* other cases */
-    TODO!!
+/*
-    */
+TODO!!
+*/
-    /* test for Sum Function */
+/* test for ReduceMean Function */
-    extern "C"
+bool TestReduceMean()
-        bool TestReduceMean()
+{
-    {
+    XPRINT(0, stdout, "[TEST ReduceMean] get the mean value along a dimension of the tensor \n");
-        XPRINT(0, stdout, "[TEST ReduceMean]\n");
    bool returnFlag = true, caseFlag = true;
    /* case 1 test */
@@ -235,14 +145,6 @@ namespace nts { // namespace nt(NiuTrans.Tensor)
    else
        XPRINT(0, stdout, ">> case 1 passed!\n");
-        /* case 2 test */
-        caseFlag = TestReduceMeanForLargescale();
-        if (!caseFlag) {
-            returnFlag = false;
-            XPRINT(0, stdout, ">> case 2 failed!\n");
-        }
-        else
-            XPRINT(0, stdout, ">> case 2 passed!\n");
    ///* other cases test */
    ///*
    //TODO!!
@@ -257,6 +159,6 @@ namespace nts { // namespace nt(NiuTrans.Tensor)
    XPRINT(0, stdout, "\n");
    return returnFlag;
-    }
+}
-} // namespace nt(NiuTrans.Tensor)
+} // namespace nts(NiuTrans.Tensor)
--- a/source/test/TReduceMean.h
+++ b/source/test/TReduceMean.h
@@ -22,15 +22,15 @@
 #ifndef __TEST_REDUCEMEAN_H__
 #define __TEST_REDUCEMEAN_H__
-#include "../core/ReduceMean.h"
+#include "../core/reduce/ReduceMean.h"
-namespace nts { // namespace nt(NiuTrans.Tensor)
+namespace nts { // namespace nts(NiuTrans.Tensor)
 /* test for ReduceMean Function */
 extern "C"
 bool TestReduceMean();
-} // namespace nt(NiuTrans.Tensor)
+} // namespace nts(NiuTrans.Tensor)
 #endif // __TEST_REDUCEMEAN_H__
--- a/source/test/TReduceSum.cpp
+++ b/source/test/TReduceSum.cpp
@@ -19,210 +19,126 @@
 * $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2018-04-30
 */
-#include "../XTensor.h"
+#include "TReduceSum.h"
-#include "../XDevice.h"
-#include "../core/ReduceMean.h"
-#include "../core/ReduceMax.h"
-#include "../core/ReduceSum.h"
-namespace nts { // namespace nt(NiuTrans.Tensor)
+namespace nts { // namespace nts(NiuTrans.Tensor)
-                /* case 1 */
-    bool TestReduceSum1()
-    {
-        /* a tensor of size 2 * 4 */
-        int order = 2;
-        int order_reduce = 1;
-        int * dimSize = new int[order];
-        dimSize[0] = 2;
-        dimSize[1] = 4;
-        int unitNum = 1;
+/* 
-        for (int i = 0; i < order; i++)
+case 1: sum the items along a dimension of the tensor.
-            unitNum *= dimSize[i];
+In this case, 
-        /* a tensor of size 4 */
+(2, 4) -> (4), dim = 0
-        int * dimSize_reduce_a = new int[order_reduce];
+(2, 4) -> (2), dim = 1
-        dimSize_reduce_a[0] = 4;
+*/
+bool TestReduceSum1()
-        int unitNum_a = 1;
+{
-        for (int i = 0; i < order_reduce; i++)
+    /* a tensor of size (2, 4) */
-            unitNum_a *= dimSize_reduce_a[i];
+    int sOrder = 2;
-        /* a tensor of size 2 */
+    int * sDimSize = new int[sOrder];
-        int * dimSize_reduce_b = new int[order_reduce];
+    sDimSize[0] = 2;
-        dimSize_reduce_b[0] = 2;
+    sDimSize[1] = 4;
-        int unitNum_b = 1;
+    int sUnitNum = 1;
-        for (int i = 0; i < order_reduce; i++)
+    for (int i = 0; i < sOrder; i++)
-            unitNum_b *= dimSize_reduce_b[i];
+        sUnitNum *= sDimSize[i];
-        DTYPE aData[2][4] = { { 0.0,   1.0,   2.0,   3.0 },
+    /* a tensor of size (4) */
-                              { 4.0,   5.0,   6.0,   7.0 } };
+    int tOrder1 = 1;
-        DTYPE bData[2][4] = { { 1.0,  -1.0,  -3.0,  -5.0 },
+    int * tDimSize1 = new int[tOrder1];
-                              { -7.0, -9.0, -11.0, -13.0 } };
+    tDimSize1[0] = 4;
-        DTYPE answer_a[4] = { 4.0,  6.0,  8.0,  10.0 };
-        DTYPE answer_b[2] = { -8.0,  -40.0 };
+    int tUnitNum1 = 1;
+    for (int i = 0; i < tOrder1; i++)
+        tUnitNum1 *= tDimSize1[i];
+    /* a tensor of size (2) */
+    int tOrder2 = 1;
+    int * tDimSize2 = new int[tOrder2];
+    tDimSize2[0] = 2;
+    int tUnitNum2 = 1;
+    for (int i = 0; i < tOrder2; i++)
+        tUnitNum2 *= tDimSize2[i];
+    DTYPE sData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
+                           {4.0F, 5.0F, 6.0F, 7.0F} };
+    DTYPE answer1[4] = {4.0F, 6.0F, 8.0F, 10.0F};
+    DTYPE answer2[2] = {6.0F, 22.0F};
    /* CPU test */
    bool cpuTest = true;
    /* create tensors */
-        XTensor * a = NewTensor(order, dimSize);
+    XTensor * s = NewTensor(sOrder, sDimSize);
-        XTensor * reduce_a = NewTensor(order_reduce, dimSize_reduce_a);
+    XTensor * t1 = NewTensor(tOrder1, tDimSize1);
-        XTensor * b = NewTensor(order, dimSize);
+    XTensor * t2 = NewTensor(tOrder2, tDimSize2);
-        XTensor * reduce_b = NewTensor(order_reduce, dimSize_reduce_b);
    /* initialize variables */
-        a->SetData(aData, unitNum);
+    s->SetData(sData, sUnitNum);
-        b->SetData(bData, unitNum);
+    t1->SetZeroAll();
+    t2->SetZeroAll();
-        /* call reduce sum function */
-        ReduceSum(a, reduce_a, 0);
-        ReduceSum(b, reduce_b, 1);
-        //DTYPE* reduce_a_data = (DTYPE*)reduce_a->data;
+    /* call ReduceSum function */
-        //for (int i = 0; i < unitNum_a; i++)
+    ReduceSum(s, t1, 0);
-        //    printf("%f ", *reduce_a_data++);
+    ReduceSum(s, t2, 1);
-        //printf("\n");
-        //DTYPE* reduce_b_data = (DTYPE*)reduce_b->data;
-        //for (int i = 0; i < unitNum_b; i++)
-        //    printf("%f ", *reduce_b_data++);
    /* check results */
-        cpuTest = reduce_a->CheckData(answer_a, unitNum_a) && reduce_b->CheckData(answer_b, unitNum_b);
+    cpuTest = t1->CheckData(answer1, tUnitNum1) && t2->CheckData(answer2, tUnitNum2);
 #ifdef USE_CUDA
    /* GPU test */
    bool gpuTest = true;
-        /* create tensor */
-        XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
-        XTensor * reduce_aGPU = NewTensor(order_reduce, dimSize_reduce_a, X_FLOAT, 1.0F, 0);
-        XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
-        XTensor * reduce_bGPU = NewTensor(order_reduce, dimSize_reduce_b, X_FLOAT, 1.0F, 0);
-        /* Initialize variables */
-        aGPU->SetData(aData, unitNum);
-        bGPU->SetData(bData, unitNum);
-        /* call reduce sum function */
-        ReduceSum(aGPU, reduce_aGPU, 0);
-        ReduceSum(bGPU, reduce_bGPU, 1);
-        /* check results */
-        gpuTest = reduce_aGPU->CheckData(answer_a, unitNum_a) && reduce_bGPU->CheckData(answer_b, unitNum_b);
-        /* destroy variables */
-        delete aGPU, bGPU, reduce_aGPU, reduce_bGPU;
-        delete[] dimSize, dimSize_reduce_a, dimSize_reduce_b;
-        return cpuTest && gpuTest;
-#else
-        /* destroy variables */
-        delete a;
-        delete b;
-        return cpuTest;
-#endif // USE_CUDA
-    }
-    bool TestReduceSumForLargescale()
-    {
-        /* a tensor of size 10000 * 500 */
-        int order = 2;
-        int order_reduce = 1;
-        int * dimSize = new int[order];
-        dimSize[0] = 10000;
-        dimSize[1] = 500;
-        int unitNum = 1;
-        for (int i = 0; i < order; i++)
-            unitNum *= dimSize[i];
-        /* a tensor of size 500 */
-        int * dimSize_reduce_a = new int[order_reduce];
-        dimSize_reduce_a[0] = 500;
-        int unitNum_a = 1;
-        for (int i = 0; i < order_reduce; i++)
-            unitNum_a *= dimSize_reduce_a[i];
-        /* a tensor of size 10000 */
-        int * dimSize_reduce_b = new int[order_reduce];
-        dimSize_reduce_b[0] = 10000;
-        int unitNum_b = 1;
-        for (int i = 0; i < order_reduce; i++)
-            unitNum_b *= dimSize_reduce_b[i];
-        DTYPE * data = new DTYPE[5000000];
-        DTYPE * tmp = data;
-        for (int i = 0; i < unitNum; i++)
-            *tmp++ = 1;
-        DTYPE answer_a[500];
-        for (int i = 0; i < unitNum_a; i++)
-            answer_a[i] = 10000;
-        DTYPE answer_b[10000];
-        for (int i = 0; i < unitNum_b; i++)
-            answer_b[i] = 500;
-        /* CPU test */
-        bool cpuTest = true;
    /* create tensors */
-        XTensor * a = NewTensor(order, dimSize);
+    XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
-        XTensor * reduce_a = NewTensor(order_reduce, dimSize_reduce_a);
+    XTensor * tGPU1 = NewTensor(tOrder1, tDimSize1, X_FLOAT, 1.0F, 0);
-        XTensor * b = NewTensor(order, dimSize);
+    XTensor * tGPU2 = NewTensor(tOrder2, tDimSize2, X_FLOAT, 1.0F, 0);
-        XTensor * reduce_b = NewTensor(order_reduce, dimSize_reduce_b);
    /* initialize variables */
-        a->SetData(data, unitNum);
+    sGPU->SetData(sData, sUnitNum);
-        b->SetData(data, unitNum);
+    tGPU1->SetZeroAll();
-        /* call reduce sum function */
+    tGPU2->SetZeroAll();
-        ReduceSum(a, reduce_a, 0);
-        ReduceSum(b, reduce_b, 1);
-        /* check results */
-        cpuTest = reduce_a->CheckData(answer_a, unitNum_a) && reduce_b->CheckData(answer_b, unitNum_b);
-#ifdef USE_CUDA
+    /* call ReduceSum function */
-        /* GPU test */
+    ReduceSum(sGPU, tGPU1, 0);
-        bool gpuTest = true;
+    ReduceSum(sGPU, tGPU2, 1);
-        /* create tensor */
-        XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT);
-        XTensor * reduce_aGPU = NewTensor(order_reduce, dimSize_reduce_a, X_FLOAT);
-        XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT);
-        XTensor * reduce_bGPU = NewTensor(order_reduce, dimSize_reduce_b, X_FLOAT);
-        /* Initialize variables */
-        aGPU->SetData(data, unitNum);
-        bGPU->SetData(data, unitNum);
-        /* call reduce max function */
-        ReduceSum(aGPU, reduce_aGPU, 0);
-        ReduceSum(bGPU, reduce_bGPU, 1);
    /* check results */
-        gpuTest = reduce_aGPU->CheckData(answer_a, unitNum_a) && reduce_bGPU->CheckData(answer_b, unitNum_b);
+    /* keep the GPU verdict in gpuTest: writing it to cpuTest would discard the CPU result and leave gpuTest at its initial true */
+    gpuTest = tGPU1->CheckData(answer1, tUnitNum1) && tGPU2->CheckData(answer2, tUnitNum2);
    /* destroy variables */
-        delete aGPU, bGPU, reduce_aGPU, reduce_bGPU;
+    delete s;
-        delete[] dimSize, dimSize_reduce_a, dimSize_reduce_b;
+    delete t1;
+    delete t2;
+    delete sGPU;
+    delete tGPU1;
+    delete tGPU2;
+    delete[] sDimSize;
+    delete[] tDimSize1;
+    delete[] tDimSize2;
    return cpuTest && gpuTest;
 #else
    /* destroy variables */
-        delete a;
+    delete s;
-        delete b;
+    delete t1;
+    delete t2;
+    delete[] sDimSize;
+    delete[] tDimSize1;
+    delete[] tDimSize2;
    return cpuTest;
 #endif // USE_CUDA
-    }
+}
-    /* other cases */
+/* other cases */
-    /*
+/*
-    TODO!!
+TODO!!
-    */
+*/
-    /* test for ReduceSum Function */
+/* test for ReduceSum Function */
-    extern "C"
+bool TestReduceSum()
-        bool TestReduceSum()
+{
-    {
+    XPRINT(0, stdout, "[TEST ReduceSum] sum the items along a dimension of the tensor.\n");
-        XPRINT(0, stdout, "[TEST ReduceSum]\n");
    bool returnFlag = true, caseFlag = true;
    /* case 1 test */
@@ -234,19 +150,10 @@ namespace nts { // namespace nt(NiuTrans.Tensor)
    else
        XPRINT(0, stdout, ">> case 1 passed!\n");
-        /* case 2 test */
+    /* other cases test */
-        caseFlag = TestReduceSumForLargescale();
+    /*
-        if (!caseFlag) {
+    TODO!!
-            returnFlag = false;
+    */
-            XPRINT(0, stdout, ">> case 2 failed!\n");
-        }
-        else
-            XPRINT(0, stdout, ">> case 2 passed!\n");
-        ///* other cases test */
-        ///*
-        //TODO!!
-        //*/
    if (returnFlag) {
        XPRINT(0, stdout, ">> All Passed!\n");
@@ -259,4 +166,4 @@ namespace nts { // namespace nt(NiuTrans.Tensor)
    return returnFlag;
    }
-} // namespace nt(NiuTrans.Tensor)
+} // namespace nts(NiuTrans.Tensor)
--- a/source/test/TReduceSum.h
+++ b/source/test/TReduceSum.h
@@ -22,15 +22,15 @@
 #ifndef __TEST_REDUCESUM_H__
 #define __TEST_REDUCESUM_H__
-#include "../core/ReduceSum.h"
+#include "../core/reduce/ReduceSum.h"
-namespace nts { // namespace nt(NiuTrans.Tensor)
+namespace nts { // namespace nts(NiuTrans.Tensor)
-                /* test for ReduceSum Function */
+/* test for ReduceSum Function */
-    extern "C"
+extern "C"
-        bool TestReduceSum();
+bool TestReduceSum();
-} // namespace nt(NiuTrans.Tensor)
+} // namespace nts(NiuTrans.Tensor)
 #endif // __TEST_REDUCESUM_H__

--- a/source/test/TReduceSumSquared.cpp
+++ b/source/test/TReduceSumSquared.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-27
+*/
+#include "TReduceSumSquared.h"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+/*
+case 1: squared sum of the items along a dimension of the tensor.
+For a 1-dimensional data array a, sum = \sum_i (a_i - shift)^2.
+A (2, 4) input is reduced along dim 0 into a (4) output.
+<< return - true if every executed check (CPU, plus GPU when USE_CUDA is defined) matches the expected values
+*/
+bool TestReduceSumSquared1()
+{
+    /* input tensor: shape (2, 4) */
+    int inOrder = 2;
+    int * inDims = new int[inOrder];
+    inDims[0] = 2;
+    inDims[1] = 4;
+    int inCount = 1;
+    for (int d = 0; d < inOrder; ++d)
+        inCount *= inDims[d];
+    /* output tensor: shape (4) */
+    int outOrder = 1;
+    int * outDims = new int[outOrder];
+    outDims[0] = 4;
+    int outCount = 1;
+    for (int d = 0; d < outOrder; ++d)
+        outCount *= outDims[d];
+    /* shift tensor: shape (4), the same shape as the output */
+    int shiftOrder = 1;
+    int * shiftDims = new int[shiftOrder];
+    shiftDims[0] = 4;
+    int shiftCount = 1;
+    for (int d = 0; d < shiftOrder; ++d)
+        shiftCount *= shiftDims[d];
+    DTYPE inData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
+                           {4.0F, 5.0F, 6.0F, 7.0F} };
+    DTYPE shiftValues[4] = {1.0F, -1.0F, -1.0F, 0.0F};
+    /* expected[j] = \sum_i (inData[i][j] - shiftValues[j])^2 */
+    DTYPE expected[4] = {10.0F, 40.0F, 58.0F, 58.0F};
+    /* CPU test: build the tensors, load the data and clear the output */
+    XTensor * input = NewTensor(inOrder, inDims);
+    XTensor * output = NewTensor(outOrder, outDims);
+    XTensor * shift = NewTensor(shiftOrder, shiftDims);
+    input->SetData(inData, inCount);
+    shift->SetData(shiftValues, shiftCount);
+    output->SetZeroAll();
+    /* reduce along dimension 0 and compare with the expected values */
+    ReduceSumSquared(input, output, 0, shift);
+    bool verdict = output->CheckData(expected, outCount);
+#ifdef USE_CUDA
+    /* GPU test: the same data and the same reduction on a second set of tensors */
+    XTensor * inputGPU = NewTensor(inOrder, inDims, X_FLOAT, 1.0F, 0);
+    XTensor * outputGPU = NewTensor(outOrder, outDims, X_FLOAT, 1.0F, 0);
+    XTensor * shiftGPU = NewTensor(shiftOrder, shiftDims, X_FLOAT, 1.0F, 0);
+    inputGPU->SetData(inData, inCount);
+    shiftGPU->SetData(shiftValues, shiftCount);
+    outputGPU->SetZeroAll();
+    ReduceSumSquared(inputGPU, outputGPU, 0, shiftGPU);
+    bool gpuVerdict = outputGPU->CheckData(expected, outCount);
+    /* the case passes only when both the CPU and the GPU results match */
+    verdict = verdict && gpuVerdict;
+#endif // USE_CUDA
+    /* destroy the tensors first, then the shape arrays */
+    delete input;
+    delete output;
+    delete shift;
+#ifdef USE_CUDA
+    delete inputGPU;
+    delete outputGPU;
+    delete shiftGPU;
+#endif // USE_CUDA
+    delete[] inDims;
+    delete[] outDims;
+    delete[] shiftDims;
+    return verdict;
+}
+/*
+case 2: squared sum of the items along a dimension of the tensor.
+For a 1-dimensional data array a, sum = \sum_i (a_i - shift)^2.
+A (2, 4) input is reduced along dim 1 into a (2) output.
+<< return - true if every executed check (CPU, plus GPU when USE_CUDA is defined) matches the expected values
+*/
+bool TestReduceSumSquared2()
+{
+    /* input tensor: shape (2, 4) */
+    int inOrder = 2;
+    int * inDims = new int[inOrder];
+    inDims[0] = 2;
+    inDims[1] = 4;
+    int inCount = 1;
+    for (int d = 0; d < inOrder; ++d)
+        inCount *= inDims[d];
+    /* output tensor: shape (2) */
+    int outOrder = 1;
+    int * outDims = new int[outOrder];
+    outDims[0] = 2;
+    int outCount = 1;
+    for (int d = 0; d < outOrder; ++d)
+        outCount *= outDims[d];
+    /* shift tensor: shape (2), the same shape as the output */
+    int shiftOrder = 1;
+    int * shiftDims = new int[shiftOrder];
+    shiftDims[0] = 2;
+    int shiftCount = 1;
+    for (int d = 0; d < shiftOrder; ++d)
+        shiftCount *= shiftDims[d];
+    DTYPE inData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
+                           {4.0F, 5.0F, 6.0F, 7.0F} };
+    DTYPE shiftValues[2] = {-1.0F, 1.0F};
+    /* expected[i] = \sum_j (inData[i][j] - shiftValues[i])^2 */
+    DTYPE expected[2] = {30.0F, 86.0F};
+    /* CPU test: build the tensors, load the data and clear the output */
+    XTensor * input = NewTensor(inOrder, inDims);
+    XTensor * output = NewTensor(outOrder, outDims);
+    XTensor * shift = NewTensor(shiftOrder, shiftDims);
+    input->SetData(inData, inCount);
+    shift->SetData(shiftValues, shiftCount);
+    output->SetZeroAll();
+    /* reduce along dimension 1 and compare with the expected values */
+    ReduceSumSquared(input, output, 1, shift);
+    bool verdict = output->CheckData(expected, outCount);
+#ifdef USE_CUDA
+    /* GPU test: the same data and the same reduction on a second set of tensors */
+    XTensor * inputGPU = NewTensor(inOrder, inDims, X_FLOAT, 1.0F, 0);
+    XTensor * outputGPU = NewTensor(outOrder, outDims, X_FLOAT, 1.0F, 0);
+    XTensor * shiftGPU = NewTensor(shiftOrder, shiftDims, X_FLOAT, 1.0F, 0);
+    inputGPU->SetData(inData, inCount);
+    shiftGPU->SetData(shiftValues, shiftCount);
+    outputGPU->SetZeroAll();
+    ReduceSumSquared(inputGPU, outputGPU, 1, shiftGPU);
+    bool gpuVerdict = outputGPU->CheckData(expected, outCount);
+    /* the case passes only when both the CPU and the GPU results match */
+    verdict = verdict && gpuVerdict;
+#endif // USE_CUDA
+    /* destroy the tensors first, then the shape arrays */
+    delete input;
+    delete output;
+    delete shift;
+#ifdef USE_CUDA
+    delete inputGPU;
+    delete outputGPU;
+    delete shiftGPU;
+#endif // USE_CUDA
+    delete[] inDims;
+    delete[] outDims;
+    delete[] shiftDims;
+    return verdict;
+}
+/* other cases */
+/*
+TODO!!
+*/
+/*
+test for ReduceSumSquared Function
+runs each test case in turn and reports its outcome through XPRINT on stdout
+<< return - true if every case passed, false if at least one failed
+*/
+bool TestReduceSumSquared()
+{
+    XPRINT(0, stdout, "[TEST ReduceSumSquared] squared sum of the items along a dimension of the tensor\n");
+    bool returnFlag = true, caseFlag = true;
+    /* case 1 test */
+    caseFlag = TestReduceSumSquared1();
+    if (!caseFlag) {
+        returnFlag = false;
+        XPRINT(0, stdout, ">> case 1 failed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> case 1 passed!\n");
+    /* case 2 test (reported as case 2 so that a failure points at the right case) */
+    caseFlag = TestReduceSumSquared2();
+    if (!caseFlag) {
+        returnFlag = false;
+        XPRINT(0, stdout, ">> case 2 failed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> case 2 passed!\n");
+    /* other cases test */
+    /*
+    TODO!!
+    */
+    if (returnFlag) {
+        XPRINT(0, stdout, ">> All Passed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> Failed!\n");
+    XPRINT(0, stdout, "\n");
+    return returnFlag;
+}
+} // namespace nts(NiuTrans.Tensor)
--- a/source/test/TReduceSumSquared.h
+++ b/source/test/TReduceSumSquared.h
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-27
+*/
+#ifndef __TEST_REDUCESUMSQUARED_H__
+#define __TEST_REDUCESUMSQUARED_H__
+#include "../core/reduce/ReduceSumSquared.h"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+/* test for ReduceSumSquared Function */
+extern "C"
+bool TestReduceSumSquared();
+} // namespace nts(NiuTrans.Tensor)
+#endif // __TEST_REDUCESUMSQUARED_H__
--- a/source/test/TReduceVariance.cpp
+++ b/source/test/TReduceVariance.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-27
+*/
+#include "TReduceVariance.h"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+/*
+case 1: variance of the items along a dimension of the tensor.
+For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2.
+In this case, (2, 4) -> (4), dim = 0.
+The case is run on CPU tensors and, when USE_CUDA is defined, repeated on GPU tensors.
+<< return - true if every checked output matches the expected variance
+*/
+bool TestReduceVariance1()
+{
+    /* an input tensor of size (2, 4) */
+    int sOrder = 2;
+    int * sDimSize = new int[sOrder];
+    sDimSize[0] = 2;
+    sDimSize[1] = 4;
+    int sUnitNum = 1;
+    for (int i = 0; i < sOrder; i++)
+        sUnitNum *= sDimSize[i];
+    /* an output tensor of size (4) */
+    int tOrder = 1;
+    int * tDimSize = new int[tOrder];
+    tDimSize[0] = 4;
+    int tUnitNum = 1;
+    for (int i = 0; i < tOrder; i++)
+        tUnitNum *= tDimSize[i];
+    /* a mean tensor of size (4) */
+    int meanOrder = 1;
+    int * meanDimSize = new int[meanOrder];
+    meanDimSize[0] = 4;
+    int meanUnitNum = 1;
+    for (int i = 0; i < meanOrder; i++)
+        meanUnitNum *= meanDimSize[i];
+    DTYPE sData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
+                          {4.0F, 5.0F, 6.0F, 7.0F} };
+    /* mean of each column of sData, i.e., the mean along dimension 0 */
+    DTYPE meanData[4] = {2.0F, 3.0F, 4.0F, 5.0F};
+    /* every column gives ((-2)^2 + 2^2) / 2 = 4 */
+    DTYPE answer[4] = {4.0F, 4.0F, 4.0F, 4.0F};
+    /* CPU test */
+    bool cpuTest = true;
+    /* create tensors */
+    XTensor * s = NewTensor(sOrder, sDimSize);
+    XTensor * t = NewTensor(tOrder, tDimSize);
+    XTensor * mean = NewTensor(meanOrder, meanDimSize);
+    /* initialize variables */
+    s->SetData(sData, sUnitNum);
+    mean->SetData(meanData, meanUnitNum);
+    t->SetZeroAll();
+    /* call ReduceVariance function */
+    ReduceVariance(s, t, 0, mean);
+    /* check results */
+    cpuTest = t->CheckData(answer, tUnitNum);
+#ifdef USE_CUDA
+    /* GPU test */
+    bool gpuTest = true;
+    /* create tensors */
+    XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * meanGPU = NewTensor(meanOrder, meanDimSize, X_FLOAT, 1.0F, 0);
+    /* initialize variables */
+    sGPU->SetData(sData, sUnitNum);
+    meanGPU->SetData(meanData, meanUnitNum);
+    tGPU->SetZeroAll();
+    /* call ReduceVariance function */
+    ReduceVariance(sGPU, tGPU, 0, meanGPU);
+    /* check results: inspect the GPU output tensor (tGPU), not the CPU one,
+       otherwise the GPU computation is never verified */
+    gpuTest = tGPU->CheckData(answer, tUnitNum);
+    /* destroy variables */
+    delete s;
+    delete t;
+    delete mean;
+    delete sGPU;
+    delete tGPU;
+    delete meanGPU;
+    delete[] sDimSize;
+    delete[] tDimSize;
+    delete[] meanDimSize;
+    return cpuTest && gpuTest;
+#else
+    /* destroy variables */
+    delete s;
+    delete t;
+    delete mean;
+    delete[] sDimSize;
+    delete[] tDimSize;
+    delete[] meanDimSize;
+    return cpuTest;
+#endif // USE_CUDA
+}
+/* other cases */
+/*
+TODO!!
+*/
+/*
+test for ReduceVariance Function.
+Runs every ReduceVariance case, prints the outcome of each case and a final summary.
+<< return - true if all cases passed, false otherwise
+*/
+bool TestReduceVariance()
+{
+    XPRINT(0, stdout, "[TEST ReduceVariance] variance of the items along a dimension of the tensor\n");
+    bool allPassed = true;
+    /* case 1 test */
+    const bool case1Passed = TestReduceVariance1();
+    if (case1Passed) {
+        XPRINT(0, stdout, ">> case 1 passed!\n");
+    }
+    else {
+        allPassed = false;
+        XPRINT(0, stdout, ">> case 1 failed!\n");
+    }
+    /* other cases test */
+    /*
+    TODO!!
+    */
+    if (!allPassed) {
+        XPRINT(0, stdout, ">> Failed!\n");
+    }
+    else {
+        XPRINT(0, stdout, ">> All Passed!\n");
+    }
+    XPRINT(0, stdout, "\n");
+    return allPassed;
+}
+} // namespace nts(NiuTrans.Tensor)
--- a/source/test/TReduceVariance.h
+++ b/source/test/TReduceVariance.h
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-06
+*/
+#ifndef __TEST_REDUCEVARIANCE_H__
+#define __TEST_REDUCEVARIANCE_H__
+#include "../core/reduce/ReduceVariance.h"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+/*
+test for ReduceVariance Function.
+Declared with C linkage. Runs all ReduceVariance test cases and prints their outcome.
+<< return - true if every case passed, false otherwise
+*/
+extern "C"
+bool TestReduceVariance();
+} // namespace nts(NiuTrans.Tensor)
+#endif // __TEST_REDUCEVARIANCE_H__
--- a/source/test/TScaleAndShift.cpp
+++ b/source/test/TScaleAndShift.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-27
+*/
+#include "TScaleAndShift.h"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+/*
+case 1: scale and shift all tensor entries.
+p = p * scale + shift
+Here scale = 2.0 and shift = 0.5 are applied in place to a (2, 4) tensor.
+<< return - true if every checked tensor holds the expected values
+*/
+bool TestScaleAndShift1()
+{
+    /* shape of the tensor under test: (2, 4) */
+    int order = 2;
+    int * dims = new int[order];
+    dims[0] = 2;
+    dims[1] = 4;
+    /* total number of items */
+    int unitNum = 1;
+    for (int d = 0; d < order; ++d)
+        unitNum *= dims[d];
+    DTYPE input[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
+                          {4.0F, 5.0F, 6.0F, 7.0F} };
+    /* input * 2.0 + 0.5 */
+    DTYPE expected[2][4] = { {0.5F, 2.5F, 4.5F, 6.5F},
+                             {8.5F, 10.5F, 12.5F, 14.5F} };
+    DTYPE scale = 2.0F;
+    DTYPE shift = 0.5F;
+    /* CPU test: create, fill, transform in place, compare */
+    XTensor * cpuTensor = NewTensor(order, dims);
+    cpuTensor->SetData(input, unitNum);
+    ScaleAndShift(cpuTensor, scale, shift);
+    bool passed = cpuTensor->CheckData(expected, unitNum);
+#ifdef USE_CUDA
+    /* GPU test: same steps on a GPU tensor */
+    XTensor * gpuTensor = NewTensor(order, dims, X_FLOAT, 1.0F, 0);
+    gpuTensor->SetData(input, unitNum);
+    ScaleAndShift(gpuTensor, scale, shift);
+    bool gpuPassed = gpuTensor->CheckData(expected, unitNum);
+    passed = passed && gpuPassed;
+#endif // USE_CUDA
+    /* destroy variables */
+    delete cpuTensor;
+#ifdef USE_CUDA
+    delete gpuTensor;
+#endif // USE_CUDA
+    delete[] dims;
+    return passed;
+}
+/* other cases */
+/*
+TODO!!
+*/
+/*
+test for ScaleAndShift Function.
+Runs every ScaleAndShift case, prints the outcome of each case and a final summary.
+<< return - true if all cases passed, false otherwise
+*/
+bool TestScaleAndShift()
+{
+    XPRINT(0, stdout, "[TEST ScaleAndShift] scale and shift all tensor entires\n");
+    bool allPassed = true;
+    /* case 1 test */
+    const bool case1Passed = TestScaleAndShift1();
+    if (case1Passed) {
+        XPRINT(0, stdout, ">> case 1 passed!\n");
+    }
+    else {
+        allPassed = false;
+        XPRINT(0, stdout, ">> case 1 failed!\n");
+    }
+    /* other cases test */
+    /*
+    TODO!!
+    */
+    if (!allPassed) {
+        XPRINT(0, stdout, ">> Failed!\n");
+    }
+    else {
+        XPRINT(0, stdout, ">> All Passed!\n");
+    }
+    XPRINT(0, stdout, "\n");
+    return allPassed;
+}
+} // namespace nts(NiuTrans.Tensor)
--- a/source/test/TScaleAndShift.h
+++ b/source/test/TScaleAndShift.h
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-27
+*/
+#ifndef __TEST_SCALEANDSHIFT_H__
+#define __TEST_SCALEANDSHIFT_H__
+#include "../core/math/ScaleAndShift.h"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+/*
+test for ScaleAndShift Function.
+Declared with C linkage. Runs all ScaleAndShift test cases and prints their outcome.
+<< return - true if every case passed, false otherwise
+*/
+extern "C"
+bool TestScaleAndShift();
+} // namespace nts(NiuTrans.Tensor)
+#endif // __TEST_SCALEANDSHIFT_H__
--- a/source/test/TSelect.cpp
+++ b/source/test/TSelect.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-04
+*/
+#include "TSelect.h"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+/*
+case 1: test SelectRange function.
+It generates a tensor with the data selected by a (low, high) range along the given dimension.
+In this case, (2, 2, 4) -> (2, 2, 2), dim = 2, low = 1, high = 3.
+The expected data below holds the items at indices 1 and 2 of the last dimension,
+i.e., the item at index high is not part of the expected output.
+<< return - true if every checked output tensor matches the expected data
+*/
+bool TestSelect1()
+{
+    /* shape of the source tensor: (2, 2, 4) */
+    int srcOrder = 3;
+    int * srcDims = new int[srcOrder];
+    srcDims[0] = 2;
+    srcDims[1] = 2;
+    srcDims[2] = 4;
+    int srcUnitNum = 1;
+    for (int d = 0; d < srcOrder; ++d)
+        srcUnitNum *= srcDims[d];
+    /* shape of the target tensor: (2, 2, 2) */
+    int tgtOrder = 3;
+    int * tgtDims = new int[tgtOrder];
+    tgtDims[0] = 2;
+    tgtDims[1] = 2;
+    tgtDims[2] = 2;
+    int tgtUnitNum = 1;
+    for (int d = 0; d < tgtOrder; ++d)
+        tgtUnitNum *= tgtDims[d];
+    DTYPE srcData[2][2][4] = { { {0.0F, 1.0F, 2.0F, 3.0F},
+                                 {4.0F, 5.0F, 6.0F, 7.0F} },
+                               { {1.0F, 2.0F, 3.0F, 4.0F},
+                                 {5.0F, 6.0F, 7.0F, 8.0F} } };
+    DTYPE expected[2][2][2] = { { {1.0F, 2.0F},
+                                  {5.0F, 6.0F} },
+                                { {2.0F, 3.0F},
+                                  {6.0F, 7.0F} } };
+    /* CPU test: create, fill, select, compare */
+    XTensor * src = NewTensor(srcOrder, srcDims);
+    XTensor * tgt = NewTensor(tgtOrder, tgtDims);
+    src->SetData(srcData, srcUnitNum);
+    tgt->SetZeroAll();
+    SelectRange(src, tgt, 2, 1, 3);
+    bool passed = tgt->CheckData(expected, tgtUnitNum);
+#ifdef USE_CUDA
+    /* GPU test: same steps on GPU tensors */
+    XTensor * srcGPU = NewTensor(srcOrder, srcDims, X_FLOAT, 1.0F, 0);
+    XTensor * tgtGPU = NewTensor(tgtOrder, tgtDims, X_FLOAT, 1.0F, 0);
+    srcGPU->SetData(srcData, srcUnitNum);
+    tgtGPU->SetZeroAll();
+    SelectRange(srcGPU, tgtGPU, 2, 1, 3);
+    bool gpuPassed = tgtGPU->CheckData(expected, tgtUnitNum);
+    passed = passed && gpuPassed;
+#endif // USE_CUDA
+    /* destroy variables */
+    delete src;
+    delete tgt;
+#ifdef USE_CUDA
+    delete srcGPU;
+    delete tgtGPU;
+#endif // USE_CUDA
+    delete[] srcDims;
+    delete[] tgtDims;
+    return passed;
+}
+/* other cases */
+/*
+TODO!!
+*/
+/*
+test for Select Function.
+Runs every Select case, prints the outcome of each case and a final summary.
+<< return - true if all cases passed, false otherwise
+*/
+bool TestSelect()
+{
+    XPRINT(0, stdout, "[TEST Select] generate a tensor with seleccted data in range[low,high] along the given dimension \n");
+    bool allPassed = true;
+    /* case 1 test */
+    const bool case1Passed = TestSelect1();
+    if (case1Passed) {
+        XPRINT(0, stdout, ">> case 1 passed!\n");
+    }
+    else {
+        allPassed = false;
+        XPRINT(0, stdout, ">> case 1 failed!\n");
+    }
+    /* other cases test */
+    /*
+    TODO!!
+    */
+    if (!allPassed) {
+        XPRINT(0, stdout, ">> Failed!\n");
+    }
+    else {
+        XPRINT(0, stdout, ">> All Passed!\n");
+    }
+    XPRINT(0, stdout, "\n");
+    return allPassed;
+}
+} // namespace nts(NiuTrans.Tensor)
--- a/source/test/TSelect.h
+++ b/source/test/TSelect.h
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-04
+*/
+#ifndef __TEST_SELECT_H__
+#define __TEST_SELECT_H__
+#include "../core/getandset/Select.h"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+/*
+test for Select Function.
+Declared with C linkage. Runs all Select test cases and prints their outcome.
+<< return - true if every case passed, false otherwise
+*/
+extern "C"
+bool TestSelect();
+} // namespace nts(NiuTrans.Tensor)
+#endif // __TEST_SELECT_H__
--- a/source/test/TSetAscendingOrder.cpp
+++ b/source/test/TSetAscendingOrder.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-06
+*/
+#include "TSetAscendingOrder.h"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+/*
+case 1: set the cell to the ascending order along a given dimension.
+An X_INT tensor of size (2, 4) is zeroed and then SetAscendingOrder(1) is called on it;
+each row is expected to read 0, 1, 2, 3.
+<< return - true if every checked tensor holds the expected values
+*/
+bool TestSetAscendingOrder1()
+{
+    /* shape of the tensor under test: (2, 4) */
+    int order = 2;
+    int * dims = new int[order];
+    dims[0] = 2;
+    dims[1] = 4;
+    /* total number of items */
+    int unitNum = 1;
+    for (int d = 0; d < order; ++d)
+        unitNum *= dims[d];
+    int expected[2][4] = { {0, 1, 2, 3},
+                           {0, 1, 2, 3} };
+    /* CPU test: create, zero, set ascending order along dimension 1, compare */
+    XTensor * cpuTensor = NewTensor(order, dims, X_INT);
+    cpuTensor->SetZeroAll();
+    cpuTensor->SetAscendingOrder(1);
+    bool passed = cpuTensor->CheckData(expected, unitNum);
+#ifdef USE_CUDA
+    /* GPU test: same steps on a GPU tensor */
+    XTensor * gpuTensor = NewTensor(order, dims, X_INT, 1.0F, 0);
+    gpuTensor->SetZeroAll();
+    gpuTensor->SetAscendingOrder(1);
+    bool gpuPassed = gpuTensor->CheckData(expected, unitNum);
+    passed = passed && gpuPassed;
+#endif // USE_CUDA
+    /* destroy variables */
+    delete cpuTensor;
+#ifdef USE_CUDA
+    delete gpuTensor;
+#endif // USE_CUDA
+    delete[] dims;
+    return passed;
+}
+/* other cases */
+/*
+TODO!!
+*/
+/*
+test for SetAscendingOrder Function.
+Runs every SetAscendingOrder case, prints the outcome of each case and a final summary.
+<< return - true if all cases passed, false otherwise
+*/
+bool TestSetAscendingOrder()
+{
+    XPRINT(0, stdout, "[TEST SetAscendingOrder] set the cell to the ascending order along a given dimension \n");
+    bool allPassed = true;
+    /* case 1 test */
+    const bool case1Passed = TestSetAscendingOrder1();
+    if (case1Passed) {
+        XPRINT(0, stdout, ">> case 1 passed!\n");
+    }
+    else {
+        allPassed = false;
+        XPRINT(0, stdout, ">> case 1 failed!\n");
+    }
+    /* other cases test */
+    /*
+    TODO!!
+    */
+    if (!allPassed) {
+        XPRINT(0, stdout, ">> Failed!\n");
+    }
+    else {
+        XPRINT(0, stdout, ">> All Passed!\n");
+    }
+    XPRINT(0, stdout, "\n");
+    return allPassed;
+}
+} // namespace nts(NiuTrans.Tensor)
--- a/source/test/TSetAscendingOrder.h
+++ b/source/test/TSetAscendingOrder.h
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-06
+*/
+#ifndef __TEST_SETASCENDINGORDER_H__
+#define __TEST_SETASCENDINGORDER_H__
+#include "../XTensor.h"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+/*
+test for SetAscendingOrder Function.
+Declared with C linkage. Runs all SetAscendingOrder test cases and prints their outcome.
+<< return - true if every case passed, false otherwise
+*/
+extern "C"
+bool TestSetAscendingOrder();
+} // namespace nts(NiuTrans.Tensor)
+#endif // __TEST_SETASCENDINGORDER_H__
--- a/source/test/TSetData.cpp
+++ b/source/test/TSetData.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-06
+*/
+#include "TSetData.h"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+/*
+case 1: fill a tensor of size (2, 4) by calling SetDataRand(0.0, 1.0).
+The result is compared against an all-zero reference with a tolerance of 1.0,
+so the case passes when no generated item differs from zero by more than 1.0.
+<< return - true if every checked tensor satisfies the tolerance check
+*/
+bool TestSetData1()
+{
+    /* shape of the tensor under test: (2, 4) */
+    int order = 2;
+    int * dims = new int[order];
+    dims[0] = 2;
+    dims[1] = 4;
+    /* total number of items */
+    int unitNum = 1;
+    for (int d = 0; d < order; ++d)
+        unitNum *= dims[d];
+    /* all-zero reference used by the tolerance check */
+    DTYPE zeros[2][4] = {0};
+    /* CPU test: create, fill with random data, compare within tolerance */
+    XTensor * cpuTensor = NewTensor(order, dims);
+    cpuTensor->SetDataRand(0.0, 1.0);
+    bool passed = cpuTensor->CheckData(zeros, unitNum, 1.0F);
+#ifdef USE_CUDA
+    /* GPU test: same steps on a GPU tensor */
+    XTensor * gpuTensor = NewTensor(order, dims, X_FLOAT, 1.0F, 0);
+    gpuTensor->SetDataRand(0.0, 1.0);
+    bool gpuPassed = gpuTensor->CheckData(zeros, unitNum, 1.0F);
+    passed = passed && gpuPassed;
+#endif // USE_CUDA
+    /* destroy variables */
+    delete cpuTensor;
+#ifdef USE_CUDA
+    delete gpuTensor;
+#endif // USE_CUDA
+    delete[] dims;
+    return passed;
+}
+/* other cases */
+/*
+TODO!!
+*/
+/*
+test for SetData Function.
+Runs every SetData case, prints the outcome of each case and a final summary.
+<< return - true if all cases passed, false otherwise
+*/
+bool TestSetData()
+{
+    XPRINT(0, stdout, "[TEST SetData] set the data of tensor \n");
+    bool allPassed = true;
+    /* case 1 test */
+    const bool case1Passed = TestSetData1();
+    if (case1Passed) {
+        XPRINT(0, stdout, ">> case 1 passed!\n");
+    }
+    else {
+        allPassed = false;
+        XPRINT(0, stdout, ">> case 1 failed!\n");
+    }
+    /* other cases test */
+    /*
+    TODO!!
+    */
+    if (!allPassed) {
+        XPRINT(0, stdout, ">> Failed!\n");
+    }
+    else {
+        XPRINT(0, stdout, ">> All Passed!\n");
+    }
+    XPRINT(0, stdout, "\n");
+    return allPassed;
+}
+} // namespace nts(NiuTrans.Tensor)
--- a/source/test/TSetData.h
+++ b/source/test/TSetData.h
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-06
+*/
+#ifndef __TEST_SETDATA_H__
+#define __TEST_SETDATA_H__
+#include "../XTensor.h"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+/*
+test for SetData Function.
+Declared with C linkage. Runs all SetData test cases and prints their outcome.
+<< return - true if every case passed, false otherwise
+*/
+extern "C"
+bool TestSetData();
+} // namespace nts(NiuTrans.Tensor)
+#endif // __TEST_SETDATA_H__
--- a/source/test/TSigmoid.cpp
+++ b/source/test/TSigmoid.cpp
@@ -19,15 +19,15 @@
 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-19
 */
-#include "../XTensor.h"
 #include "../XUtility.h"
 #include "TSigmoid.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* case 1: test Sigmoid function and SigmoidBackward function.
-* sigmoid function: y = 1/(1+exp(-x))
+/* 
-* backward computation: dE/ds = dE/dy * dy/dx
+case 1: test Sigmoid function and SigmoidBackward function.
+sigmoid function: y = 1/(1+exp(-x))
+backward computation: dE/ds = dE/dy * dy/dx
 */
 bool TestSigmoid1()
 {
@@ -42,7 +42,9 @@ bool TestSigmoid1()
    DTYPE xData[3] = {0.0F, 1.0F, 2.0F};
    DTYPE gData[3] = {0.4F, 0.8F, 1.0F};
-    DTYPE answer[3];
+    DTYPE dedyData[3] = {-0.8F, -1.094F, -1.135F};
+    DTYPE yAnswer[3] = {0.5F, 0.731F, 0.881F};
+    DTYPE dedxAnswer[3] = {-0.2F, -0.215F, -0.119F};
    /* CPU test */
    bool cpuTest = true;
@@ -57,41 +59,18 @@ bool TestSigmoid1()
    /* initialize variables */
    x->SetData(xData, sUnitNum);
    g->SetData(gData, sUnitNum);
+    dedy->SetData(dedyData, sUnitNum);
    y->SetZeroAll();
    dedx->SetZeroAll();
    /* call Sigmoid function */
    Sigmoid(x, y);
-    /* cross_entropy: de/dy_i = -t_i / y_i */
-    DTYPE dedyData[3];
-    DTYPE * yProcessedData = (DTYPE*)y->data;
-	for (int i = 0; i < sUnitNum; i++)
-		dedyData[i] = - gData[i] / yProcessedData[i];
-    /* initialize variables */
-    dedy->SetData(dedyData, sUnitNum);
-	for (int i = 0; i < sUnitNum; i++)
-		answer[i] = dedyData[i] * yProcessedData[i] * (1 - yProcessedData[i]);
    /* call SigmoidBackward function */
    SigmoidBackward(g, y, x, dedy, dedx, NOLOSS);
    /* check result */
-    printf("CPU Test:\n");
+    cpuTest = y->CheckData(yAnswer, sUnitNum) && dedx->CheckData(dedxAnswer, sUnitNum);
-    printf("Computer de/dx:");
-    DTYPE * checkData = (DTYPE*)dedx->data;
-    for (int i = 0; i < sUnitNum; i++) {
-        printf("\t%f", checkData[i]);
-    }
-    printf("\n");
-    printf("Real de/dx:");
-    for (int i = 0; i < sUnitNum; i++) {
-        printf("\t%f", answer[i]);
-    }
-    printf("\n");
 #ifdef USE_CUDA
    /* GPU test */
@@ -107,66 +86,50 @@ bool TestSigmoid1()
    /* initialize variables */
    xGPU->SetData(xData, sUnitNum);
    gGPU->SetData(gData, sUnitNum);
+    dedyGPU->SetData(dedyData, sUnitNum);
    yGPU->SetZeroAll();
    dedxGPU->SetZeroAll();
    /* call Sigmoid function */
    Sigmoid(xGPU, yGPU);
-    /* cross_entropy: de/dy_i = -t_i / y_i */
-    void * yProcessedDataGPU = (DTYPE*)yGPU->data;
-    int size = sUnitNum * yGPU->unitSize;
-    DTYPE * copy = new DTYPE[size];
-    XMemCopy(copy, -1, yProcessedDataGPU, yGPU->devID, size);
-	for (int i = 0; i < sUnitNum; i++) {
-		dedyData[i] = - gData[i] / *copy++;
-    }
-    /* initialize variables */
-    dedyGPU->SetData(dedyData, sUnitNum);
-	for (int i = 0; i < sUnitNum; i++)
-		answer[i] = dedyData[i] * yProcessedData[i] * (1 - yProcessedData[i]);
    /* call SigmoidBackward function */
    SigmoidBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, NOLOSS);
    /* check result */
-    printf("\nGPU Test:\n");
+    gpuTest = yGPU->CheckData(yAnswer, sUnitNum) && dedxGPU->CheckData(dedxAnswer, sUnitNum);
-    printf("Computer de/dx:");
-    checkData = (DTYPE*)dedxGPU->data;
-    size = sUnitNum * dedxGPU->unitSize;
-    DTYPE * copy1 = new DTYPE[size];
-    XMemCopy(copy1, -1, checkData, dedxGPU->devID, size);
-    for (int i = 0; i < sUnitNum; i++) {
-        printf("\t%f", copy1[i]);
-    }
-    printf("\n");
-    printf("Real de/dx:");
-    for (int i = 0; i < sUnitNum; i++) {
-        printf("\t%f", answer[i]);
-    }
-    printf("\n\n");
    /* destroy variables */
-    delete x, y, g, dedx, dedy;
+    delete x;
-    delete xGPU, yGPU, gGPU, dedxGPU, dedyGPU;
+    delete y;
+    delete g;
+    delete dedx;
+    delete dedy;
+    delete xGPU;
+    delete yGPU;
+    delete gGPU;
+    delete dedxGPU;
+    delete dedyGPU;
    delete[] sDimSize;
    return cpuTest && gpuTest;
 #else
    /* destroy variables */
-    delete x, y, g, dedx, dedy;
+    delete x;
+    delete y;
+    delete g;
+    delete dedx;
+    delete dedy;
    delete[] sDimSize;
    return cpuTest;
 #endif // USE_CUDA
 }
-/* case 2: test Sigmoid function and SigmoidBackward function.
+/* 
-* sigmoid function: y = 1/(1+exp(-x))
+case 2: test Sigmoid function and SigmoidBackward function.
-* backward computation: dE/ds = dE/dy * dy/dx
+sigmoid function: y = 1/(1+exp(-x))
+backward computation: dE/ds = dE/dy * dy/dx
 */
 bool TestSigmoid2()
 {
@@ -181,7 +144,9 @@ bool TestSigmoid2()
    DTYPE xData[3] = {0.0F, 1.0F, 2.0F};
    DTYPE gData[3] = {0.4F, 0.8F, 1.0F};
-    DTYPE answer[3] = {0.0F, 0.0F, 0.0F};
+    DTYPE dedyData[3] = {-0.8F, -1.094F, -1.135F};
+    DTYPE yAnswer[3] = {0.5F, 0.731F, 0.881F};
+    DTYPE dedxAnswer[3] = {-0.2F, -0.215F, -0.119F};
    /* CPU test */
    bool cpuTest = true;
@@ -196,29 +161,21 @@ bool TestSigmoid2()
    /* initialize variables */
    x->SetData(xData, sUnitNum);
    g->SetData(gData, sUnitNum);
+    dedy->SetZeroAll();
    y->SetZeroAll();
    dedx->SetZeroAll();
    /* call Sigmoid function */
    Sigmoid(x, y);
+    /* initialize variables */
+    dedy->SetData(dedyData, sUnitNum);
    /* call SigmoidBackward function */
    SigmoidBackward(g, y, x, dedy, dedx, CROSSENTROPY);
    /* check result */
-    printf("CPU Test:\n");
+    cpuTest = y->CheckData(yAnswer, sUnitNum) && dedx->CheckData(dedxAnswer, sUnitNum);
-    printf("Computer de/dx:");
-    DTYPE * checkData = (DTYPE*)dedx->data;
-    for (int i = 0; i < sUnitNum; i++) {
-        printf("\t%f", checkData[i]);
-    }
-    printf("\n");
-    printf("Real de/dx:");
-    for (int i = 0; i < sUnitNum; i++) {
-        printf("\t%f", answer[i]);
-    }
-    printf("\n");
 #ifdef USE_CUDA
    /* GPU test */
@@ -234,6 +191,7 @@ bool TestSigmoid2()
    /* initialize variables */
    xGPU->SetData(xData, sUnitNum);
    gGPU->SetData(gData, sUnitNum);
+    dedyGPU->SetZeroAll();
    yGPU->SetZeroAll();
    dedxGPU->SetZeroAll();
@@ -244,32 +202,29 @@ bool TestSigmoid2()
    SigmoidBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, CROSSENTROPY);
    /* check result */
-    printf("\nGPU Test:\n");
+    gpuTest = yGPU->CheckData(yAnswer, sUnitNum) && dedxGPU->CheckData(dedxAnswer, sUnitNum);
-    printf("Computer de/dx:");
-    checkData = (DTYPE*)dedxGPU->data;
-    int size = sUnitNum * dedxGPU->unitSize;
-    DTYPE * copy1 = new DTYPE[size];
-    XMemCopy(copy1, -1, checkData, dedxGPU->devID, size);
-    for (int i = 0; i < sUnitNum; i++) {
-        printf("\t%f", copy1[i]);
-    }
-    printf("\n");
-    printf("Real de/dx:");
-    for (int i = 0; i < sUnitNum; i++) {
-        printf("\t%f", answer[i]);
-    }
-    printf("\n\n");
    /* destroy variables */
-    delete x, y, g, dedx, dedy;
+    delete x;
-    delete xGPU, yGPU, gGPU, dedxGPU, dedyGPU;
+    delete y;
+    delete g;
+    delete dedx;
+    delete dedy;
+    delete xGPU;
+    delete yGPU;
+    delete gGPU;
+    delete dedxGPU;
+    delete dedyGPU;
    delete[] sDimSize;
    return cpuTest && gpuTest;
 #else
    /* destroy variables */
-    delete x, y, g, dedx, dedy;
+    delete x;
+    delete y;
+    delete g;
+    delete dedx;
+    delete dedy;
    delete[] sDimSize;
    return cpuTest;
@@ -282,10 +237,9 @@ bool TestSigmoid2()
 */
 /* test for Sigmoid Function */
-extern "C"
 bool TestSigmoid()
 {
-    XPRINT(0, stdout, "[TEST SIGMOID] -------------\n");
+    XPRINT(0, stdout, "[TEST SIGMOID] sigmoid function and its backward computation \n");
    bool returnFlag = true, caseFlag = true;
    /* case 1 test */

--- a/source/test/TSoftmax.cpp
+++ b/source/test/TSoftmax.cpp
@@ -24,8 +24,10 @@
 #include "TSoftmax.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* case 1: test Softmax function.
-* softmax function: y = e^x / \sum_{i} e^{x_i}
+/* 
+case 1: test Softmax function.
+softmax function: y = e^x / \sum_{i} e^{x_i}
 */
 bool TestSoftmax1()
 {
@@ -59,21 +61,7 @@ bool TestSoftmax1()
    Softmax(x, y, 1);
    /* check result */
-    printf("CPU Test:\n");
+    cpuTest = y->CheckData(answer, sUnitNum);
-    printf("Softmax Result:");
-    DTYPE * checkData = (DTYPE*)y->data;
-    for (int i = 0; i < sUnitNum; i++) {
-        printf("\t%f", checkData[i]);
-    }
-    printf("\n");
-    printf("Real Result:");
-    for (int i = 0; i < sDimSize[0]; i++) {
-        for (int j = 0; j < sDimSize[1]; j++) {
-            printf("\t%f", answer[i][j]);
-        }
-    }
-    printf("\n");
 #ifdef USE_CUDA
    /* GPU test */
@@ -91,28 +79,13 @@ bool TestSoftmax1()
    Softmax(xGPU, yGPU, 1);
    /* check result */
-    printf("\nGPU Test:\n");
+    gpuTest = yGPU->CheckData(answer, sUnitNum);
-    printf("Computer de/dx:");
-    checkData = (DTYPE*)yGPU->data;
-    int size = sUnitNum * yGPU->unitSize;
-    DTYPE * copy = new DTYPE[size];
-    XMemCopy(copy, -1, checkData, yGPU->devID, size);
-    for (int i = 0; i < sUnitNum; i++) {
-        printf("\t%f", copy[i]);
-    }
-    printf("\n");
-    printf("Real Result:");
-    for (int i = 0; i < sDimSize[0]; i++) {
-        for (int j = 0; j < sDimSize[1]; j++) {
-            printf("\t%f", answer[i][j]);
-        }
-    }
-    printf("\n");
    /* destroy variables */
-    delete x, y;
+    delete x;
-    delete xGPU, yGPU;
+    delete y;
+    delete xGPU;
+    delete yGPU;
    delete[] sDimSize;
    return cpuTest && gpuTest;
@@ -125,11 +98,13 @@ bool TestSoftmax1()
 #endif // USE_CUDA
 }
-/* case 2: test SoftmaxBackward function.
+/* 
-* SoftmaxBackward function: dE/dx_j = -gold_j + y_j
+case 2: test SoftmaxBackward function.
+SoftmaxBackward function: dE/dx_j = -gold_j + y_j
 */
 bool TestSoftmax2()
 {
+    /* an input tensor of size (1, 3) */
    int sOrder = 2;
    int * sDimSize = new int[sOrder];
    sDimSize[0] = 1;
@@ -141,7 +116,7 @@ bool TestSoftmax2()
    DTYPE xData[1][3] = { {0.0F, 1.0F, 2.0F} };
    DTYPE gData[1][3] = { {0.0F, 0.0F, 1.0F} };
-    DTYPE answer[3] = {0.090031F, 0.244728F, -0.334759F};
+    DTYPE dedxAnswer[3] = {0.090031F, 0.244728F, -0.334759F};
    /* CPU test */
    bool cpuTest = true;
@@ -163,31 +138,10 @@ bool TestSoftmax2()
    /* call Softmax function */
    Softmax(x, y, 1);
-    /* check result */
-    printf("CPU Test:\n");
-    printf("Softmax Result:");
-    DTYPE * checkData = (DTYPE*)y->data;
-    for (int i = 0; i < sUnitNum; i++) {
-        printf("\t%f", checkData[i]);
-    }
-    printf("\n");
-    /* call SoftmaxBackward function */
    SoftmaxBackward(g, y, x, dedy, dedx, 1, CROSSENTROPY);
    /* check result */
-    printf("Computer de/dx:");
+    cpuTest = dedx->CheckData(dedxAnswer, sUnitNum);
-    checkData = (DTYPE*)dedx->data;
-    for (int i = 0; i < sUnitNum; i++) {
-        printf("\t%f", checkData[i]);
-    }
-    printf("\n");
-    printf("Real de/dx:");
-    for (int i = 0; i < sUnitNum; i++) {
-        printf("\t%f", answer[i]);
-    }
-    printf("\n");
 #ifdef USE_CUDA
    /* GPU test */
@@ -210,44 +164,33 @@ bool TestSoftmax2()
    /* call Softmax function */
    Softmax(xGPU, yGPU, 1);
-    /* check result */
-    printf("\nGPU Test:\n");
-    printf("Softmax Result:");
-    checkData = (DTYPE*)y->data;
-    for (int i = 0; i < sUnitNum; i++) {
-        printf("\t%f", checkData[i]);
-    }
-    printf("\n");
    /* call SoftmaxBackward function */
    SoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 1, CROSSENTROPY);
    /* check result */
-    printf("Computer de/dx:");
+    gpuTest = dedxGPU->CheckData(dedxAnswer, sUnitNum);
-    checkData = (DTYPE*)dedxGPU->data;
-    int size = sUnitNum * dedxGPU->unitSize;
-    DTYPE * copy = new DTYPE[size];
-    XMemCopy(copy, -1, checkData, dedxGPU->devID, size);
-    for (int i = 0; i < sUnitNum; i++) {
-        printf("\t%f", copy[i]);
-    }
-    printf("\n");
-    printf("Real de/dx:");
-    for (int i = 0; i < sUnitNum; i++) {
-        printf("\t%f", answer[i]);
-    }
-    printf("\n");
    /* destroy variables */
-    delete x, y, g, dedx, dedy;
+    delete x;
-    delete xGPU, yGPU, gGPU, dedxGPU, dedyGPU;
+    delete y;
+    delete g;
+    delete dedx;
+    delete dedy;
+    delete xGPU;
+    delete yGPU;
+    delete gGPU;
+    delete dedxGPU;
+    delete dedyGPU;
    delete[] sDimSize;
    return cpuTest && gpuTest;
 #else
    /* destroy variables */
-    delete x, y, g, dedx, dedy;
+    delete x;
+    delete y;
+    delete g;
+    delete dedx;
+    delete dedy;
    delete[] sDimSize;
    return cpuTest;
@@ -260,10 +203,9 @@ bool TestSoftmax2()
 */
 /* test for Softmax Function */
-extern "C"
 bool TestSoftmax()
 {
-    XPRINT(0, stdout, "[TEST SOFTMAX] -------------\n");
+    XPRINT(0, stdout, "[TEST SOFTMAX] softmax function and its backward computation \n");
    bool returnFlag = true, caseFlag = true;
    /* case 1 test */

--- a/source/test/TSort.cpp
+++ b/source/test/TSort.cpp
@@ -19,15 +19,14 @@
 * $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-04-30
 */
-#include "../XTensor.h"
+#include "TSort.h"
-#include "../XDevice.h"
-#include "../core/Sort.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* case 1 */
+/* case 1: sort the tensor along a given dimension */
 bool TestSort1()
 {
-    /* a tensor of size 2 * 4 */
+    /* a tensor of size (2, 4) */
    int order = 2;
    int * dimSize = new int[order];
    dimSize[0] = 2;
@@ -37,33 +36,25 @@ bool TestSort1()
    for (int i = 0; i < order; i++)
        unitNum *= dimSize[i];
-    DTYPE aData[2][4] = { { 0.0,   1.0,   2.0,   3.0 },
+    DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
-                          { 4.0,   5.0,   6.0,   7.0 } };
+                          {4.0F, 5.0F, 6.0F, 7.0F} };
-    DTYPE answer[2][4] = { { 4.0,   5.0,  6.0,  7.0 },
+    DTYPE answer[2][4] = { {4.0F, 5.0F, 6.0F, 7.0F},
-                           { 0.0,   1.0,  2.0,  3.0 } };
+                           {0.0F, 1.0F, 2.0F, 3.0F} };
    /* CPU test */
    bool cpuTest = true;
    /* create tensors */
    XTensor * a = NewTensor(order, dimSize);
-    XTensor * b = NewTensor(order, dimSize);
+    XTensor * b = NewTensor(order, dimSize, X_INT);
-    b->dataType = X_INT;
    /* initialize variables */
    a->SetData(aData, unitNum);
+    b->SetZeroAll();
-    /* call sort function */
+    /* call Sort function */
    Sort(a, b, 0);
-    DTYPE* check1 = (DTYPE*)a->data;
-    for (int i = 0; i < 8; i++)
-        printf("%f ", *check1++);
-    printf("\n");
-    int* check2 = (int*)b->data;
-    for (int i = 0; i < 8; i++)
-        printf("%d ", *check2++);
-    printf("\n");
-    /* check results */
    cpuTest = a->CheckData(answer, unitNum);
 #ifdef USE_CUDA
@@ -71,11 +62,12 @@ bool TestSort1()
    bool gpuTest = true;
    /* create tensor */
-    XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT);
+    XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
-    XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT);
+    XTensor * bGPU = NewTensor(order, dimSize, X_INT, 1.0F, 0);
-    bGPU->dataType = X_INT;
    /* Initialize variables */
    aGPU->SetData(aData, unitNum);
+    bGPU->SetZeroAll();
    /* call Sort function */
    Sort(aGPU, bGPU, 0);
@@ -84,21 +76,26 @@ bool TestSort1()
    gpuTest = aGPU->CheckData(answer, unitNum);
    /* destroy variables */
-    delete a, b, aGPU, bGPU;
+    delete a;
+    delete b;
+    delete aGPU;
+    delete bGPU;
    delete[] dimSize;
    return cpuTest && gpuTest;
 #else
    /* destroy variables */
    delete a;
    delete b;
    delete[] dimSize;
    return cpuTest;
 #endif // USE_CUDA
 }
 bool TestSort2()
 {
-    /* a tensor of size 2 * 4 */
+    /* a tensor of size (2, 4) */
    int order = 2;
    int * dimSize = new int[order];
    dimSize[0] = 2;
@@ -108,32 +105,24 @@ bool TestSort2()
    for (int i = 0; i < order; i++)
        unitNum *= dimSize[i];
-    DTYPE aData[2][4] = { { 0.0,   1.0,   2.0,   3.0 },
+    DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
-                          { 4.0,   5.0,   6.0,   7.0 } };
+                          {4.0F, 5.0F, 6.0F, 7.0F} };
-    DTYPE answer[2][4] = { { 3.0,   2.0,   1.0,   0.0 },
+    DTYPE answer[2][4] = { {3.0F, 2.0F, 1.0F, 0.0F},
-                           { 7.0,   6.0,   5.0,   4.0 } };
+                           {7.0F, 6.0F, 5.0F, 4.0F} };
    /* CPU test */
    bool cpuTest = true;
    /* create tensors */
    XTensor * a = NewTensor(order, dimSize);
-    XTensor * b = NewTensor(order, dimSize);
+    XTensor * b = NewTensor(order, dimSize, X_INT);
-    b->dataType = X_INT;
    /* initialize variables */
    a->SetData(aData, unitNum);
-    /* call sort function */
+    /* call Sort function */
    Sort(a, b, 1);
-    DTYPE* check1 = (DTYPE*)a->data;
-    for (int i = 0; i < 8; i++)
-        printf("%f ", *check1++);
-    printf("\n");
-    int* check2 = (int*)b->data;
-    for (int i = 0; i < 8; i++)
-        printf("%d ", *check2++);
-    printf("\n");
    /* check results */
    cpuTest = a->CheckData(answer, unitNum);
@@ -142,9 +131,9 @@ bool TestSort2()
    bool gpuTest = true;
    /* create tensor */
-    XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT);
+    XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT, 1.0F, 0);
-    XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT);
+    XTensor * bGPU = NewTensor(order, dimSize, X_INT, 1.0F, 0);
-    bGPU->dataType = X_INT;
    /* Initialize variables */
    aGPU->SetData(aData, unitNum);
@@ -155,27 +144,32 @@ bool TestSort2()
    gpuTest = aGPU->CheckData(answer, unitNum);
    /* destroy variables */
-    delete a, b, aGPU, bGPU;
+    delete a;
+    delete b;
+    delete aGPU;
+    delete bGPU;
    delete[] dimSize;
    return cpuTest && gpuTest;
 #else
    /* destroy variables */
    delete a;
    delete b;
    delete[] dimSize;
    return cpuTest;
 #endif // USE_CUDA
 }
 /* other cases */
 /*
 TODO!!
 */
-/* test for Sum Function */
+/* test for Sort Function */
-extern "C"
 bool TestSort()
 {
-    XPRINT(0, stdout, "[TEST SORT]\n");
+    XPRINT(0, stdout, "[TEST SORT] sort the tensor along a given dimension \n");
    bool returnFlag = true, caseFlag = true;
    /* case 1 test */
@@ -195,6 +189,7 @@ bool TestSort()
    }
    else
        XPRINT(0, stdout, ">> case 2 passed!\n");
    /* other cases test */
    /*
    TODO!!

--- a/source/test/TSort.h
+++ b/source/test/TSort.h
@@ -22,13 +22,13 @@
 #ifndef __TEST_SORT_H__
 #define __TEST_SORT_H__
-#include "../core/Sort.h"
+#include "../core/sort/Sort.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* test for Sum Function */
+/* test for Sort Function */
 extern "C"
 bool TestSort();
 } // namespace nts(NiuTrans.Tensor)
-#endif // __TEST_SUM_H__
+#endif // __TEST_SORT_H__
--- a/source/test/TSplit.cpp
+++ b/source/test/TSplit.cpp
@@ -19,18 +19,17 @@
 * $Created by: Lin Ye (email: linye2015@outlook.com) 2018-06-13
 */
-#include "../XTensor.h"
+#include "TSplit.h"
-#include "../XDevice.h"
-#include "../core/Split.h"
-#include "../XList.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* case 1: transform a tensor by splitting it, e.g., (N, M) -> (N/3, M, 3)
-* In this case, 4 * 3 -> 2 * 2 * 3, whereToSplit=0, splitNum=2.
+/* 
+case 1: transform a tensor by splitting it, e.g., (N, M) -> (N/3, M, 3)
+In this case, (4, 3) -> (2, 2, 3), whereToSplit=0, splitNum=2.
 */
 bool TestSplit1()
 {
-    /* a source tensor of size 4 * 3 */
+    /* a source tensor of size (4, 3) */
    int sOrder = 2;
    int * sDimSize = new int[sOrder];
    sDimSize[0] = 4;
@@ -40,7 +39,7 @@ bool TestSplit1()
    for (int i = 0; i < sOrder; i++)
        sUnitNum *= sDimSize[i];
-    /* a target tensor of size 2 * 2 * 3 */
+    /* a target tensor of size (2, 2, 3) */
    int tOrder = 3;
    int * tDimSize = new int[tOrder];
    tDimSize[0] = 2;
@@ -109,12 +108,13 @@ bool TestSplit1()
 #endif // USE_CUDA
 }
-/* case 2: transform a tensor by splitting it, e.g., (N, M) -> (N/3, M, 3)
+/* 
-* In this case, 3 * 4 -> 2 * 3 * 2, whereToSplit=1, splitNum=2.
+case 2: transform a tensor by splitting it, e.g., (N, M) -> (N/3, M, 3)
+In this case, (3, 4) -> (2, 3, 2), whereToSplit=1, splitNum=2.
 */
 bool TestSplit2()
 {
-    /* a source tensor of size 3 * 4 */
+    /* a source tensor of size (3, 4) */
    int sOrder = 2;
    int * sDimSize = new int[sOrder];
    sDimSize[0] = 3;
@@ -124,7 +124,7 @@ bool TestSplit2()
    for (int i = 0; i < sOrder; i++)
        sUnitNum *= sDimSize[i];
-    /* a target tensor of size 2 * 3 * 2 */
+    /* a target tensor of size (2, 3, 2) */
    int tOrder = 3;
    int * tDimSize = new int[tOrder];
    tDimSize[0] = 2;
@@ -194,8 +194,9 @@ bool TestSplit2()
 #endif // USE_CUDA
 }
-/* case 3: split a big tensor into small tensors
+/* 
-* In this case, 3 * 4 -> 2 * (3 * 2) , whereToSplit=1, splitNum=2.
+case 3: split a big tensor into small tensors
+In this case, (3, 4) -> 2 * (3, 2) , whereToSplit=1, splitNum=2.
 */
 bool TestSplit3()
 {
@@ -203,7 +204,7 @@ bool TestSplit3()
    XList tList;
    tList = XList();
-    /* a source tensor of size (3 * 4) */
+    /* a source tensor of size (3, 4) */
    int sOrder = 2;
    int * sDimSize = new int[sOrder];
    sDimSize[0] = 3;
@@ -213,7 +214,7 @@ bool TestSplit3()
    for (int i = 0; i < sOrder; i++)
        sUnitNum *= sDimSize[i];
-    /* a target tensor of size (3 * 2) */
+    /* a target tensor of size (3, 2) */
    int tOrder1 = 2;
    int * tDimSize1 = new int[tOrder1];
    tDimSize1[0] = 3;
@@ -313,10 +314,9 @@ TODO!!
 */
 /* test for Split Function */
-extern "C"
+bool TestSplit()
-    bool TestSplit()
 {
-    XPRINT(0, stdout, "[TEST SPLIT] -------------\n");
+    XPRINT(0, stdout, "[TEST SPLIT] split a big tensor into small tensors \n");
    bool returnFlag = true, caseFlag = true;
    /* case 1 test */

--- a/source/test/TSplit.h
+++ b/source/test/TSplit.h
@@ -22,7 +22,7 @@
 #ifndef __TEST_SPLIT_H__
 #define __TEST_SPLIT_H__
-#include "../core/Split.h"
+#include "../core/shape/Split.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/test/TSum.cpp
+++ b/source/test/TSum.cpp
@@ -19,15 +19,14 @@
 * $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-04-30
 */
-#include "../XTensor.h"
+#include "TSum.h"
-#include "../XDevice.h"
-#include "../core/Sum.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* case 1 */
+/* case 1: tensor summation c = a + b * \beta */
 bool TestSum1()
 {
-    /* a tensor of size 2 * 4 */
+    /* a tensor of size (2, 4) */
    int order = 2;
    int * dimSize = new int[order];
    dimSize[0] = 2;
@@ -37,12 +36,12 @@ bool TestSum1()
    for (int i = 0; i < order; i++)
        unitNum *= dimSize[i];
-    DTYPE aData[2][4] = { {0.0,   1.0,   2.0,   3.0},
+    DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
-                          {4.0,   5.0,   6.0,   7.0} };
+                          {4.0F, 5.0F, 6.0F, 7.0F} };
-    DTYPE bData[2][4] = { {1.0,  -1.0,  -3.0,  -5.0}, 
+    DTYPE bData[2][4] = { {1.0F, -1.0F, -3.0F, -5.0F}, 
-                          {-7.0, -9.0, -11.0, -13.0} };
+                          {-7.0F, -9.0F, -11.0F, -13.0F} };
-    DTYPE answer[2][4] = { {1.0,   0.0,  -1.0,  -2.0},
+    DTYPE answer[2][4] = { {1.0F, 0.0F, -1.0F, -2.0F},
-                           {-3.0, -4.0,  -5.0,  -6.0} };
+                           {-3.0F, -4.0F, -5.0F, -6.0F} };
    /* CPU test */
    bool cpuTest = true;
@@ -80,22 +79,27 @@ bool TestSum1()
    gpuTest = aGPU->CheckData(answer, unitNum);
    /* destroy variables */
-    delete a, b, aGPU, bGPU;
+    delete a;
+    delete b;
+    delete aGPU;
+    delete bGPU;
    delete[] dimSize;
    return cpuTest && gpuTest;
 #else
    /* destroy variables */
    delete a;
 	delete b;
    delete[] dimSize;
    return cpuTest;
 #endif // USE_CUDA
 }
-/* case 2 */
+/* case 2: tensor summation c = a + b * \beta */
 bool TestSum2()
 {
-    /* a tensor of size 2 * 4 */
+    /* a tensor of size (2, 4) */
    int order = 2;
    int * dimSize = new int[order];
    dimSize[0] = 2;
@@ -105,12 +109,12 @@ bool TestSum2()
    for (int i = 0; i < order; i++) {
        unitNum *= dimSize[i];
    }
-    DTYPE aData[2][4] = { {0.0,   1.0,   2.0,   3.0},
+    DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
-                          {4.0,   5.0,   6.0,   7.0} };
+                          {4.0F, 5.0F, 6.0F, 7.0F} };
-    DTYPE bData[2][4] = { {1.0,  -1.0,  -3.0,  -5.0}, 
+    DTYPE bData[2][4] = { {1.0F, -1.0F, -3.0F, -5.0F}, 
-                          {-7.0, -9.0, -11.0, -13.0} };
+                          {-7.0F, -9.0F, -11.0F, -13.0F} };
-    DTYPE answer[2][4] = { {0.5,  0.5,   0.5,   0.5},
+    DTYPE answer[2][4] = { {0.5F, 0.5F, 0.5F, 0.5F},
-                           {0.5,  0.5,   0.5,   0.5} };
+                           {0.5F, 0.5F, 0.5F, 0.5F} };
    float beta = 0.5F;
    /* CPU test */
@@ -126,7 +130,7 @@ bool TestSum2()
    b->SetData(bData, unitNum);
    c->SetZeroAll();
-    /* call sum function */
+    /* call Sum function */
    Sum(a, b, c, beta);
    /* check results */
@@ -146,15 +150,21 @@ bool TestSum2()
    bGPU->SetData(bData, unitNum);
    cGPU->SetZeroAll();
-    /* call sum function */
+    /* call Sum function */
    Sum(aGPU, bGPU, cGPU, beta);
    /* check results */
    gpuTest = cGPU->CheckData(answer, unitNum);
    /* destroy variables */
-    delete a, b, c, aGPU, bGPU, cGPU;
+    delete a;
+    delete b;
+    delete c;
+    delete aGPU;
+    delete bGPU;
+    delete cGPU;
    delete[] dimSize;
    return cpuTest && gpuTest;
 #else
    /* destroy variables */
@@ -162,6 +172,7 @@ bool TestSum2()
    delete b;
    delete c;
    delete[] dimSize;
    return cpuTest;
 #endif // USE_CUDA
 }
@@ -172,7 +183,6 @@ bool TestSum2()
 */
 /* test for Sum Function */
-extern "C"
 bool TestSum()
 {
    XPRINT(0, stdout, "[TEST SUM] tensor summation c = a + b * beta\n");

--- a/source/test/TSum.h
+++ b/source/test/TSum.h
@@ -22,7 +22,7 @@
 #ifndef __TEST_SUM_H__
 #define __TEST_SUM_H__
-#include "../core/Sum.h"
+#include "../core/arithmetic/Sum.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)

--- a/source/test/TSumByColumnTV.cpp
+++ b/source/test/TSumByColumnTV.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-06
+*/
+#include "TSumByColumnTV.h"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+/* 
+case 1: test SumByColumnTV function
+sum of a tensor and a vector (column vector) in a column by column manner
+In this case, a (2, 4) + b (2, 1) -> c (2, 4), the result is written to a separate tensor c.
+<< return - true if c equals the expected answer on the CPU 
+            (and also on the GPU when USE_CUDA is defined)
+*/
+bool TestSumByColumnTV1()
+{
+    /* a tensor of size (2, 4) */
+    int aOrder = 2;
+    int * aDimSize = new int[aOrder];
+    aDimSize[0] = 2;
+    aDimSize[1] = 4;
+    int aUnitNum = 1;
+    for (int i = 0; i < aOrder; i++)
+        aUnitNum *= aDimSize[i];
+    /* a tensor of size (2, 1) */
+    int bOrder = 2;
+    int * bDimSize = new int[bOrder];
+    bDimSize[0] = 2;
+    bDimSize[1] = 1;
+    int bUnitNum = 1;
+    for (int i = 0; i < bOrder; i++)
+        bUnitNum *= bDimSize[i];
+    /* a tensor of size (2, 4) */
+    int cOrder = 2;
+    int * cDimSize = new int[cOrder];
+    cDimSize[0] = 2;
+    cDimSize[1] = 4;
+    int cUnitNum = 1;
+    for (int i = 0; i < cOrder; i++)
+        cUnitNum *= cDimSize[i];
+    /* input data and the expected result: row i of the answer is row i of a plus b[i] */
+    DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
+                          {4.0F, 5.0F, 6.0F, 7.0F} };
+    DTYPE bData[2][1] = { {1.0F},
+                          {0.0F} };
+    DTYPE answer[2][4] = { {1.0F, 2.0F, 3.0F, 4.0F},
+                           {4.0F, 5.0F, 6.0F, 7.0F} };
+    /* CPU test */
+    bool cpuTest = true;
+    /* create tensors */
+    XTensor * a = NewTensor(aOrder, aDimSize);
+    XTensor * b = NewTensor(bOrder, bDimSize);
+    XTensor * c = NewTensor(cOrder, cDimSize);
+    /* initialize variables */
+    a->SetData(aData, aUnitNum);
+    b->SetData(bData, bUnitNum);
+    /* clear the output tensor before the call, exactly as the GPU path below does,
+       so that the check never depends on the initial content of c */
+    c->SetZeroAll();
+    /* call SumByColumnTV function */
+    SumByColumnTV(a, b, c);
+    /* check results */
+    cpuTest = c->CheckData(answer, cUnitNum);
+#ifdef USE_CUDA
+    /* GPU test */
+    bool gpuTest = true;
+    /* create tensor */
+    XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * cGPU = NewTensor(cOrder, cDimSize, X_FLOAT, 1.0F, 0);
+    /* Initialize variables */
+    aGPU->SetData(aData, aUnitNum);
+    bGPU->SetData(bData, bUnitNum);
+    cGPU->SetZeroAll();
+    /* call SumByColumnTV function */
+    SumByColumnTV(aGPU, bGPU, cGPU);
+    /* check results */
+    gpuTest = cGPU->CheckData(answer, cUnitNum);
+    /* destroy variables */
+    delete a;
+    delete b;
+    delete c;
+    delete aGPU;
+    delete bGPU;
+    delete cGPU;
+    delete[] aDimSize;
+    delete[] bDimSize;
+    delete[] cDimSize;
+    return cpuTest && gpuTest;
+#else
+    /* destroy variables */
+    delete a;
+    delete b;
+    delete c;
+    delete[] aDimSize;
+    delete[] bDimSize;
+    delete[] cDimSize;
+    return cpuTest;
+#endif // USE_CUDA
+}
+/* 
+case 2: test SumByColumnTV function
+sum of a tensor and a vector (column vector) in a column by column manner
+In this case the two-operand form is called and the result is checked in a.
+<< return - true if a equals the expected answer on the CPU 
+            (and also on the GPU when USE_CUDA is defined)
+*/
+bool TestSumByColumnTV2()
+{
+    /* shape of tensor a: (2, 4) */
+    int aOrder = 2;
+    int * aDimSize = new int[aOrder];
+    aDimSize[0] = 2;
+    aDimSize[1] = 4;
+    int aUnitNum = 1;
+    for (int d = 0; d < aOrder; d++)
+        aUnitNum *= aDimSize[d];
+    /* shape of tensor b: (2, 1) */
+    int bOrder = 2;
+    int * bDimSize = new int[bOrder];
+    bDimSize[0] = 2;
+    bDimSize[1] = 1;
+    int bUnitNum = 1;
+    for (int d = 0; d < bOrder; d++)
+        bUnitNum *= bDimSize[d];
+    /* inputs and the expected content of a after the call */
+    DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
+                          {4.0F, 5.0F, 6.0F, 7.0F} };
+    DTYPE bData[2][1] = { {1.0F},
+                          {0.0F} };
+    DTYPE answer[2][4] = { {1.0F, 2.0F, 3.0F, 4.0F},
+                           {4.0F, 5.0F, 6.0F, 7.0F} };
+    /* CPU test: build the tensors, load the data, run and compare */
+    XTensor * a = NewTensor(aOrder, aDimSize);
+    XTensor * b = NewTensor(bOrder, bDimSize);
+    a->SetData(aData, aUnitNum);
+    b->SetData(bData, bUnitNum);
+    SumByColumnTV(a, b);
+    bool cpuPassed = a->CheckData(answer, aUnitNum);
+    /* overall verdict; refined below when the GPU test is compiled in */
+    bool passed = cpuPassed;
+#ifdef USE_CUDA
+    /* GPU test: the same steps on device 0 */
+    XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
+    aGPU->SetData(aData, aUnitNum);
+    bGPU->SetData(bData, bUnitNum);
+    SumByColumnTV(aGPU, bGPU);
+    bool gpuPassed = aGPU->CheckData(answer, aUnitNum);
+    passed = cpuPassed && gpuPassed;
+#endif // USE_CUDA
+    /* release the tensors, then the shape arrays */
+    delete a;
+    delete b;
+#ifdef USE_CUDA
+    delete aGPU;
+    delete bGPU;
+#endif // USE_CUDA
+    delete[] aDimSize;
+    delete[] bDimSize;
+    return passed;
+}
+/* other cases */
+/*
+    TODO!!
+*/
+/* 
+test for SumByColumnTV Function
+runs every test case, prints one pass/fail line per case and a final summary
+<< return - true if all cases passed
+*/
+bool TestSumByColumnTV() 
+{
+    XPRINT(0, stdout, "[TEST SumByColumnTV] sum of a tensor and a vector (column vector) in a column by column manner \n");
+    bool allPassed = true;
+    /* case 1 test */
+    if (TestSumByColumnTV1()) {
+        XPRINT(0, stdout, ">> case 1 passed!\n");
+    }
+    else {
+        allPassed = false;
+        XPRINT(0, stdout, ">> case 1 failed!\n");
+    }
+    /* case 2 test */
+    if (TestSumByColumnTV2()) {
+        XPRINT(0, stdout, ">> case 2 passed!\n");
+    }
+    else {
+        allPassed = false;
+        XPRINT(0, stdout, ">> case 2 failed!\n");
+    }
+    /* other cases test */
+    /*
+        TODO!!
+    */
+    /* summary line */
+    if (!allPassed) {
+        XPRINT(0, stdout, ">> Failed!\n");
+    }
+    else {
+        XPRINT(0, stdout, ">> All Passed!\n");
+    }
+    XPRINT(0, stdout, "\n");
+    return allPassed;
+}
+} // namespace nts(NiuTrans.Tensor)
--- a/source/test/TSumByColumnTV.h
+++ b/source/test/TSumByColumnTV.h
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-06
+*/
+#ifndef __TEST_SUMBYCOLUMNTV_H__
+#define __TEST_SUMBYCOLUMNTV_H__
+#include "../core/arithmetic/SumByColumnTV.h"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+/* 
+test for SumByColumnTV Function
+<< return - true if all test cases passed
+*/
+extern "C" bool TestSumByColumnTV();
+} // namespace nts(NiuTrans.Tensor)
+#endif // __TEST_SUMBYCOLUMNTV_H__
--- a/source/test/TSumByColumnVT.cpp
+++ b/source/test/TSumByColumnVT.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-06
+*/
+#include "TSumByColumnVT.h"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+/* 
+case 1: test SumByColumnVT function
+sum of a vector (column vector) and a tensor in a column by column manner
+In this case, a (2, 1) + b (2, 4) -> c (2, 1), the result is written to a separate tensor c.
+<< return - true if c equals the expected answer on the CPU 
+            (and also on the GPU when USE_CUDA is defined)
+*/
+bool TestSumByColumnVT1()
+{
+    /* shape of tensor a: (2, 1) */
+    int aOrder = 2;
+    int * aDimSize = new int[aOrder];
+    aDimSize[0] = 2;
+    aDimSize[1] = 1;
+    int aUnitNum = 1;
+    for (int d = 0; d < aOrder; d++)
+        aUnitNum *= aDimSize[d];
+    /* shape of tensor b: (2, 4) */
+    int bOrder = 2;
+    int * bDimSize = new int[bOrder];
+    bDimSize[0] = 2;
+    bDimSize[1] = 4;
+    int bUnitNum = 1;
+    for (int d = 0; d < bOrder; d++)
+        bUnitNum *= bDimSize[d];
+    /* shape of tensor c: (2, 1) */
+    int cOrder = 2;
+    int * cDimSize = new int[cOrder];
+    cDimSize[0] = 2;
+    cDimSize[1] = 1;
+    int cUnitNum = 1;
+    for (int d = 0; d < cOrder; d++)
+        cUnitNum *= cDimSize[d];
+    /* inputs and the expected result: answer[i] is a[i] plus the sum of row i of b */
+    DTYPE aData[2][1] = { {1.0F},
+                          {0.0F} };
+    DTYPE bData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
+                          {4.0F, 5.0F, 6.0F, 7.0F} };
+    DTYPE answer[2][1] = { {7.0F},
+                           {22.0F} };
+    /* CPU test: build the tensors, load the data, run and compare */
+    XTensor * a = NewTensor(aOrder, aDimSize);
+    XTensor * b = NewTensor(bOrder, bDimSize);
+    XTensor * c = NewTensor(cOrder, cDimSize);
+    a->SetData(aData, aUnitNum);
+    b->SetData(bData, bUnitNum);
+    c->SetZeroAll();
+    SumByColumnVT(a, b, c);
+    bool cpuPassed = c->CheckData(answer, cUnitNum);
+    /* overall verdict; refined below when the GPU test is compiled in */
+    bool passed = cpuPassed;
+#ifdef USE_CUDA
+    /* GPU test: the same steps on device 0 */
+    XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * cGPU = NewTensor(cOrder, cDimSize, X_FLOAT, 1.0F, 0);
+    aGPU->SetData(aData, aUnitNum);
+    bGPU->SetData(bData, bUnitNum);
+    cGPU->SetZeroAll();
+    SumByColumnVT(aGPU, bGPU, cGPU);
+    bool gpuPassed = cGPU->CheckData(answer, cUnitNum);
+    passed = cpuPassed && gpuPassed;
+#endif // USE_CUDA
+    /* release the tensors, then the shape arrays */
+    delete a;
+    delete b;
+    delete c;
+#ifdef USE_CUDA
+    delete aGPU;
+    delete bGPU;
+    delete cGPU;
+#endif // USE_CUDA
+    delete[] aDimSize;
+    delete[] bDimSize;
+    delete[] cDimSize;
+    return passed;
+}
+/* 
+case 2: test SumByColumnVT function
+sum of a vector (column vector) and a tensor in a column by column manner
+In this case the two-operand form is called and the result is checked in a.
+<< return - true if a equals the expected answer on the CPU 
+            (and also on the GPU when USE_CUDA is defined)
+*/
+bool TestSumByColumnVT2()
+{
+    /* shape of tensor a: (2, 1) */
+    int aOrder = 2;
+    int * aDimSize = new int[aOrder];
+    aDimSize[0] = 2;
+    aDimSize[1] = 1;
+    int aUnitNum = 1;
+    for (int d = 0; d < aOrder; d++)
+        aUnitNum *= aDimSize[d];
+    /* shape of tensor b: (2, 4) */
+    int bOrder = 2;
+    int * bDimSize = new int[bOrder];
+    bDimSize[0] = 2;
+    bDimSize[1] = 4;
+    int bUnitNum = 1;
+    for (int d = 0; d < bOrder; d++)
+        bUnitNum *= bDimSize[d];
+    /* inputs and the expected content of a after the call */
+    DTYPE aData[2][1] = { {1.0F},
+                          {0.0F} };
+    DTYPE bData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
+                          {4.0F, 5.0F, 6.0F, 7.0F} };
+    DTYPE answer[2][1] = { {7.0F},
+                           {22.0F} };
+    /* CPU test: build the tensors, load the data, run and compare */
+    XTensor * a = NewTensor(aOrder, aDimSize);
+    XTensor * b = NewTensor(bOrder, bDimSize);
+    a->SetData(aData, aUnitNum);
+    b->SetData(bData, bUnitNum);
+    SumByColumnVT(a, b);
+    bool cpuPassed = a->CheckData(answer, aUnitNum);
+    /* overall verdict; refined below when the GPU test is compiled in */
+    bool passed = cpuPassed;
+#ifdef USE_CUDA
+    /* GPU test: the same steps on device 0 */
+    XTensor * aGPU = NewTensor(aOrder, aDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * bGPU = NewTensor(bOrder, bDimSize, X_FLOAT, 1.0F, 0);
+    aGPU->SetData(aData, aUnitNum);
+    bGPU->SetData(bData, bUnitNum);
+    SumByColumnVT(aGPU, bGPU);
+    bool gpuPassed = aGPU->CheckData(answer, aUnitNum);
+    passed = cpuPassed && gpuPassed;
+#endif // USE_CUDA
+    /* release the tensors, then the shape arrays */
+    delete a;
+    delete b;
+#ifdef USE_CUDA
+    delete aGPU;
+    delete bGPU;
+#endif // USE_CUDA
+    delete[] aDimSize;
+    delete[] bDimSize;
+    return passed;
+}
+/* other cases */
+/*
+    TODO!!
+*/
+/* 
+test for SumByColumnVT Function
+runs every test case, prints one pass/fail line per case and a final summary
+<< return - true if all cases passed
+*/
+bool TestSumByColumnVT() 
+{
+    XPRINT(0, stdout, "[TEST SumByColumnVT] sum of a vector (column vector) and a tensor in a column by column manner \n");
+    bool allPassed = true;
+    /* case 1 test */
+    if (TestSumByColumnVT1()) {
+        XPRINT(0, stdout, ">> case 1 passed!\n");
+    }
+    else {
+        allPassed = false;
+        XPRINT(0, stdout, ">> case 1 failed!\n");
+    }
+    /* case 2 test */
+    if (TestSumByColumnVT2()) {
+        XPRINT(0, stdout, ">> case 2 passed!\n");
+    }
+    else {
+        allPassed = false;
+        XPRINT(0, stdout, ">> case 2 failed!\n");
+    }
+    /* other cases test */
+    /*
+        TODO!!
+    */
+    /* summary line */
+    if (!allPassed) {
+        XPRINT(0, stdout, ">> Failed!\n");
+    }
+    else {
+        XPRINT(0, stdout, ">> All Passed!\n");
+    }
+    XPRINT(0, stdout, "\n");
+    return allPassed;
+}
+} // namespace nts(NiuTrans.Tensor)
--- a/source/test/TSumByColumnVT.h
+++ b/source/test/TSumByColumnVT.h
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-07-06
+*/
+#ifndef __TEST_SUMBYCOLUMNVT_H__
+#define __TEST_SUMBYCOLUMNVT_H__
+#include "../core/arithmetic/SumByColumnVT.h"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+/*
+test for SumByColumnVT Function
+runs the SumByColumnVT test cases and prints the result of each one
+<< return - true if every case passed, false otherwise
+*/
+extern "C"
+bool TestSumByColumnVT();
+} // namespace nts(NiuTrans.Tensor)
+#endif // __TEST_SUMBYCOLUMNVT_H__
--- a/source/test/TTopK.cpp
+++ b/source/test/TTopK.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-27
+*/
+#include "TTopK.h"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+/* 
+case 1: get the top-k items along a given dimension.
+The same (2, 4) input is processed twice:
+(2, 4) -> (2, 4), dim = 0, k = 2
+(2, 4) -> (2, 4), dim = 1, k = 4
+<< return - true if all values and indices match the expected data
+*/
+bool TestTopK1()
+{
+    /* an input tensor of size (2, 4) */
+    int sOrder = 2;
+    int sDimSize[2] = {2, 4};
+    int sUnitNum = sDimSize[0] * sDimSize[1];
+    /* output tensors of size (2, 4) */
+    int tOrder = 2;
+    int tDimSize[2] = {2, 4};
+    int tUnitNum = tDimSize[0] * tDimSize[1];
+    /* input data */
+    DTYPE sData[2][4] = { {5.0F, 1.0F, 2.0F, 8.0F},
+                          {4.0F, 3.0F, 7.0F, 6.0F} };
+    /* expected values and indices for dim = 0 */
+    DTYPE answerDim0[2][4] = { {5.0F, 3.0F, 7.0F, 8.0F},
+                               {4.0F, 1.0F, 2.0F, 6.0F} };
+    int indexAnswerDim0[2][4] = { {0, 1, 1, 0},
+                                  {1, 0, 0, 1} };
+    /* expected values and indices for dim = 1 */
+    DTYPE answerDim1[2][4] = { {8.0F, 5.0F, 2.0F, 1.0F},
+                               {7.0F, 6.0F, 4.0F, 3.0F} };
+    int indexAnswerDim1[2][4] = { {3, 0, 2, 1},
+                                  {2, 3, 0, 1} };
+    /* CPU test: create tensors */
+    XTensor * s = NewTensor(sOrder, sDimSize);
+    XTensor * t1 = NewTensor(tOrder, tDimSize);
+    XTensor * t2 = NewTensor(tOrder, tDimSize);
+    XTensor * index1 = NewTensor(tOrder, tDimSize, X_INT);
+    XTensor * index2 = NewTensor(tOrder, tDimSize, X_INT);
+    /* initialize variables */
+    s->SetData(sData, sUnitNum);
+    t1->SetZeroAll();
+    t2->SetZeroAll();
+    index1->SetZeroAll();
+    index2->SetZeroAll();
+    /* call TopK function along each dimension, keeping the whole dimension */
+    TopK(s, t1, index1, 0, sDimSize[0]);
+    TopK(s, t2, index2, 1, sDimSize[1]);
+    /* check results */
+    bool cpuTest = t1->CheckData(answerDim0, tUnitNum) && 
+                   t2->CheckData(answerDim1, tUnitNum) &&
+                   index1->CheckData(indexAnswerDim0, tUnitNum) &&
+                   index2->CheckData(indexAnswerDim1, tUnitNum);
+    /* stays true when the GPU part is not compiled in */
+    bool gpuTest = true;
+#ifdef USE_CUDA
+    /* GPU test: create tensors */
+    XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * tGPU1 = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * tGPU2 = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * indexGPU1 = NewTensor(tOrder, tDimSize, X_INT, 1.0F, 0);
+    XTensor * indexGPU2 = NewTensor(tOrder, tDimSize, X_INT, 1.0F, 0);
+    /* initialize variables */
+    sGPU->SetData(sData, sUnitNum);
+    tGPU1->SetZeroAll();
+    tGPU2->SetZeroAll();
+    indexGPU1->SetZeroAll();
+    indexGPU2->SetZeroAll();
+    /* call TopK function */
+    TopK(sGPU, tGPU1, indexGPU1, 0, sDimSize[0]);
+    TopK(sGPU, tGPU2, indexGPU2, 1, sDimSize[1]);
+    /* check results */
+    gpuTest = tGPU1->CheckData(answerDim0, tUnitNum) && 
+              tGPU2->CheckData(answerDim1, tUnitNum) &&
+              indexGPU1->CheckData(indexAnswerDim0, tUnitNum) &&
+              indexGPU2->CheckData(indexAnswerDim1, tUnitNum);
+    /* destroy GPU variables */
+    delete sGPU;
+    delete tGPU1;
+    delete tGPU2;
+    delete indexGPU1;
+    delete indexGPU2;
+#endif // USE_CUDA
+    /* destroy CPU variables */
+    delete s;
+    delete t1;
+    delete t2;
+    delete index1;
+    delete index2;
+    return cpuTest && gpuTest;
+}
+/*
+case 2: get the top-k items along a given dimension.
+In this case, (2, 4) -> (2, 2), dim = 1, k = 2.
+<< return - true if all values and indices match the expected data
+*/
+bool TestTopK2()
+{
+    /* an input tensor of size (2, 4) */
+    int sOrder = 2;
+    int sDimSize[2] = {2, 4};
+    int sUnitNum = sDimSize[0] * sDimSize[1];
+    /* an output tensor of size (2, 2) */
+    int tOrder = 2;
+    int tDimSize[2] = {2, 2};
+    int tUnitNum = tDimSize[0] * tDimSize[1];
+    /* input data followed by the expected values and indices */
+    DTYPE sData[2][4] = { {5.0F, 1.0F, 2.0F, 8.0F},
+                          {4.0F, 3.0F, 7.0F, 6.0F} };
+    DTYPE expected[2][2] = { {8.0F, 5.0F},
+                             {7.0F, 6.0F} };
+    int expectedIndex[2][2] = { {3, 0},
+                                {2, 3} };
+    /* k is the size of the output along the selected dimension */
+    const int dim = 1;
+    const int k = tDimSize[dim];
+    /* CPU test: create tensors */
+    XTensor * s = NewTensor(sOrder, sDimSize);
+    XTensor * t = NewTensor(tOrder, tDimSize);
+    XTensor * index = NewTensor(tOrder, tDimSize, X_INT);
+    /* initialize variables */
+    s->SetData(sData, sUnitNum);
+    t->SetZeroAll();
+    index->SetZeroAll();
+    /* call TopK function */
+    TopK(s, t, index, dim, k);
+    /* check results */
+    bool cpuTest = t->CheckData(expected, tUnitNum) && index->CheckData(expectedIndex, tUnitNum);
+    /* stays true when the GPU part is not compiled in */
+    bool gpuTest = true;
+#ifdef USE_CUDA
+    /* GPU test: create tensors */
+    XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * indexGPU = NewTensor(tOrder, tDimSize, X_INT, 1.0F, 0);
+    /* initialize variables */
+    sGPU->SetData(sData, sUnitNum);
+    tGPU->SetZeroAll();
+    indexGPU->SetZeroAll();
+    /* call TopK function */
+    TopK(sGPU, tGPU, indexGPU, dim, k);
+    /* check results */
+    gpuTest = tGPU->CheckData(expected, tUnitNum) && indexGPU->CheckData(expectedIndex, tUnitNum);
+    /* destroy GPU variables */
+    delete sGPU;
+    delete tGPU;
+    delete indexGPU;
+#endif // USE_CUDA
+    /* destroy CPU variables */
+    delete s;
+    delete t;
+    delete index;
+    return cpuTest && gpuTest;
+}
+/* other cases */
+/*
+TODO!!
+*/
+/*
+test for TopK Function: run each case and print its outcome
+<< return - true if all cases passed, false otherwise
+*/
+bool TestTopK()
+{
+    XPRINT(0, stdout, "[TEST TopK] get the top-k items along a given dimension\n");
+    bool allPassed = true;
+    /* case 1 test */
+    if (TestTopK1()) {
+        XPRINT(0, stdout, ">> case 1 passed!\n");
+    }
+    else {
+        allPassed = false;
+        XPRINT(0, stdout, ">> case 1 failed!\n");
+    }
+    /* case 2 test */
+    if (TestTopK2()) {
+        XPRINT(0, stdout, ">> case 2 passed!\n");
+    }
+    else {
+        allPassed = false;
+        XPRINT(0, stdout, ">> case 2 failed!\n");
+    }
+    /* other cases test */
+    /*
+    TODO!!
+    */
+    /* overall summary */
+    if (!allPassed) {
+        XPRINT(0, stdout, ">> Failed!\n");
+    }
+    else {
+        XPRINT(0, stdout, ">> All Passed!\n");
+    }
+    XPRINT(0, stdout, "\n");
+    return allPassed;
+}
+} // namespace nts(NiuTrans.Tensor)
--- a/source/test/TTopK.h
+++ b/source/test/TTopK.h
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-27
+*/
+#ifndef __TEST_TOPK_H__
+#define __TEST_TOPK_H__
+#include "../core/sort/TopK.h"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+/*
+test for TopK Function
+runs the TopK test cases and prints the result of each one
+<< return - true if every case passed, false otherwise
+*/
+extern "C"
+bool TestTopK();
+} // namespace nts(NiuTrans.Tensor)
+#endif // __TEST_TOPK_H__
--- a/source/test/TUnsqueeze.cpp
+++ b/source/test/TUnsqueeze.cpp
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-13
+*/
+#include "../XList.h"
+#include "TUnsqueeze.h"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+/* 
+case 1: insert a dimension by copying the blocks for x times (where x is the size of the inserted dimension)
+The same (2, 3) source is expanded twice:
+(2, 3) -> (2, 2, 3), dim=1, dSize=2
+(2, 3) -> (2, 3, 2), dim=2, dSize=2
+<< return - true if both targets match the expected data
+*/
+bool TestUnsqueeze1()
+{
+    /* a source tensor of size (2, 3) */
+    int sOrder = 2;
+    int sDimSize[2] = {2, 3};
+    int sUnitNum = sDimSize[0] * sDimSize[1];
+    /* a target tensor of size (2, 2, 3) */
+    int tOrder1 = 3;
+    int tDimSize1[3] = {2, 2, 3};
+    int tUnitNum1 = tDimSize1[0] * tDimSize1[1] * tDimSize1[2];
+    /* a target tensor of size (2, 3, 2) */
+    int tOrder2 = 3;
+    int tDimSize2[3] = {2, 3, 2};
+    int tUnitNum2 = tDimSize2[0] * tDimSize2[1] * tDimSize2[2];
+    /* source data */
+    DTYPE sData[2][3] = { {0.0F, 1.0F, 2.0F},
+                          {3.0F, 4.0F, 5.0F} };
+    /* expected result when the new dimension is inserted at position 1 */
+    DTYPE expected1[2][2][3] = { { {0.0F, 1.0F, 2.0F},
+                                   {0.0F, 1.0F, 2.0F} },
+                                 { {3.0F, 4.0F, 5.0F},
+                                   {3.0F, 4.0F, 5.0F} } };
+    /* expected result when the new dimension is inserted at position 2 */
+    DTYPE expected2[2][3][2] = { { {0.0F, 0.0F}, 
+                                   {1.0F, 1.0F}, 
+                                   {2.0F, 2.0F} },
+                                 { {3.0F, 3.0F}, 
+                                   {4.0F, 4.0F}, 
+                                   {5.0F, 5.0F} } };
+    /* CPU test: create tensors */
+    XTensor * s = NewTensor(sOrder, sDimSize);
+    XTensor * t1 = NewTensor(tOrder1, tDimSize1);
+    XTensor * t2 = NewTensor(tOrder2, tDimSize2);
+    /* initialize variables */
+    s->SetData(sData, sUnitNum);
+    t1->SetZeroAll();
+    t2->SetZeroAll();
+    /* call Unsqueeze function */
+    Unsqueeze(s, t1, 1, 2);
+    Unsqueeze(s, t2, 2, 2);
+    /* check results */
+    bool cpuTest = t1->CheckData(expected1, tUnitNum1) && t2->CheckData(expected2, tUnitNum2);
+    /* stays true when the GPU part is not compiled in */
+    bool gpuTest = true;
+#ifdef USE_CUDA
+    /* GPU test: create tensors */
+    XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * tGPU1 = NewTensor(tOrder1, tDimSize1, X_FLOAT, 1.0F, 0);
+    XTensor * tGPU2 = NewTensor(tOrder2, tDimSize2, X_FLOAT, 1.0F, 0);
+    /* initialize variables */
+    sGPU->SetData(sData, sUnitNum);
+    tGPU1->SetZeroAll();
+    tGPU2->SetZeroAll();
+    /* call Unsqueeze function */
+    Unsqueeze(sGPU, tGPU1, 1, 2);
+    Unsqueeze(sGPU, tGPU2, 2, 2);
+    /* check results */
+    gpuTest = tGPU1->CheckData(expected1, tUnitNum1) && tGPU2->CheckData(expected2, tUnitNum2);
+    /* destroy GPU variables */
+    delete sGPU;
+    delete tGPU1;
+    delete tGPU2;
+#endif // USE_CUDA
+    /* destroy CPU variables */
+    delete s;
+    delete t1;
+    delete t2;
+    return cpuTest && gpuTest;
+}
+/* other cases */
+/*
+    TODO!!
+*/
+/*
+test for Unsqueeze Function: run each case and print its outcome
+<< return - true if all cases passed, false otherwise
+*/
+bool TestUnsqueeze()
+{
+    XPRINT(0, stdout, "[TEST Unsqueeze] insert a dimension by copying the blocks for x times\n");
+    bool allPassed = true;
+    /* case 1 test */
+    if (TestUnsqueeze1()) {
+        XPRINT(0, stdout, ">> case 1 passed!\n");
+    }
+    else {
+        allPassed = false;
+        XPRINT(0, stdout, ">> case 1 failed!\n");
+    }
+    /* other cases test */
+    /*
+    TODO!!
+    */
+    /* overall summary */
+    if (!allPassed) {
+        XPRINT(0, stdout, ">> Failed!\n");
+    }
+    else {
+        XPRINT(0, stdout, ">> All Passed!\n");
+    }
+    XPRINT(0, stdout, "\n");
+    return allPassed;
+}
+} // namespace nts(NiuTrans.Tensor)
--- a/source/test/TUnsqueeze.h
+++ b/source/test/TUnsqueeze.h
+/* NiuTrans.Tensor - an open-source tensor library
+* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
+* All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+/*
+* $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-27
+*/
+#ifndef __TEST_UNSQUEEZE_H__
+#define __TEST_UNSQUEEZE_H__
+#include "../core/shape/Unsqueeze.h"
+namespace nts { // namespace nts(NiuTrans.Tensor)
+/*
+test for Unsqueeze Function
+runs the Unsqueeze test cases and prints the result of each one
+<< return - true if every case passed, false otherwise
+*/
+extern "C"
+bool TestUnsqueeze();
+} // namespace nts(NiuTrans.Tensor)
+#endif // __TEST_UNSQUEEZE_H__
--- a/source/test/TXMem.cpp
+++ b/source/test/TXMem.cpp
@@ -19,14 +19,13 @@
 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-6-24
 */
-#include "TXMem.h"
 #include "../XGlobal.h"
 #include "../XUtility.h"
-#include "../XMem.h"
+#include "TXMem.h"
-/* the nts (NiuTrans.Tensor) namespace */
+namespace nts{ // namespace nts(NiuTrans.Tensor)
-namespace nts{
+/* case 1: test memory pool class */
 bool TestXMemCase1()
 {
    bool ok = true;
@@ -83,6 +82,7 @@ bool TestXMemCase1()
    return ok;
 }
+/* test for memory pool class */
 bool TestXMem()
 {
    XPRINT(0, stdout, "[Test] Memory pool ... Began\n");
@@ -93,11 +93,18 @@ bool TestXMem()
    /* case 1 test */
    caseFlag = TestXMemCase1();
-    if (!caseFlag) { returnFlag = false; XPRINT(0, stdout, ">> case 1 failed!\n"); }
+    if (!caseFlag) {
-    else {XPRINT(0, stdout, ">> case 1 passed!\n");}
+        returnFlag = false;
+        XPRINT(0, stdout, ">> case 1 failed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> case 1 passed!\n");
-    if (returnFlag) { XPRINT(0, stdout, ">> All Passed!\n"); }
+    if (returnFlag) {
-    else { XPRINT(0, stdout, ">> Failed!\n"); }
+        XPRINT(0, stdout, ">> All Passed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> Failed!\n");
    double endT = GetClock();
@@ -106,4 +113,4 @@ bool TestXMem()
    return returnFlag;
 }
-} /* end of the nts (NiuTrans.Tensor) namespace */
+} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/test/TXMem.h
+++ b/source/test/TXMem.h
@@ -22,13 +22,13 @@
 #ifndef __TXMEM_H__
 #define __TXMEM_H__
-/* the nts (NiuTrans.Tensor) namespace */
+#include "../XMem.h"
-namespace nts{
+namespace nts{ // namespace nts(NiuTrans.Tensor)
 /* test for memory pool class */
 extern "C"
 bool TestXMem();
-} /* end of the nts (NiuTrans.Tensor) namespace */
+} // namespace nts(NiuTrans.Tensor)
+#endif // __TXMEM_H__
-#endif
--- a/source/test/Test.cpp
+++ b/source/test/Test.cpp
@@ -31,26 +31,42 @@ bool Test()
    wrong = !TestConcatenate() || wrong;
    wrong = !TestConcatenateSolely() || wrong;
+    //wrong = !TestCopyIndexed() || wrong;
+    wrong = !TestCopyValues() || wrong;
    wrong = !TestMatrixMul() || wrong;
    wrong = !TestMatrixMul2D() || wrong;
+    wrong = !TestMatrixMul2DParallel() || wrong;
+    //wrong = !TestMatrixMulBatched() || wrong;
    wrong = !TestMatrixMulBatchedCPU() || wrong;
    wrong = !TestMerge() || wrong;
    wrong = !TestMultiply() || wrong;
    wrong = !TestNegate() || wrong;
    wrong = !TestNormalize() || wrong;
-    //wrong = !TestPower() || wrong;
+    wrong = !TestPower() || wrong;
    wrong = !TestReduceMax() || wrong;
    wrong = !TestReduceMean() || wrong;
    wrong = !TestReduceSum() || wrong;
+    wrong = !TestReduceSumSquared() || wrong;
+    wrong = !TestReduceVariance() || wrong;
+    wrong = !TestScaleAndShift() || wrong;
+    wrong = !TestSelect() || wrong;
+    wrong = !TestSetAscendingOrder() || wrong;
+    wrong = !TestSetData() || wrong;
    wrong = !TestSort() || wrong;
    wrong = !TestSplit() || wrong;
    wrong = !TestSum() || wrong;
+    wrong = !TestSumByColumnTV() || wrong; /* must be called: a bare function name is never null, so the test would be skipped */
+    //wrong = !TestSumByColumnVT() || wrong;
+    wrong = !TestTopK() || wrong;
+    wrong = !TestUnsqueeze() || wrong;
    wrong = !TestXMem() || wrong;
    //wrong = !TestHardTanH() || wrong;
+    //wrong = !TestIdentity() || wrong;
+    //wrong = !TestLogSoftmax() || wrong;
    //wrong = !TestLoss() || wrong;
    //wrong = !TestRectify() || wrong;
-    wrong = !TestSigmoid() || wrong;
+    //wrong = !TestSigmoid() || wrong;
    //wrong = !TestSoftmax() || wrong;
    /* other test */

--- a/source/test/Test.h
+++ b/source/test/Test.h
@@ -24,8 +24,12 @@
 #include "TConcatenate.h"
 #include "TConcatenateSolely.h"
+#include "TCopyIndexed.h"
+#include "TCopyValues.h"
 #include "TMatrixMul.h"
 #include "TMatrixMul2D.h"
+#include "TMatrixMul2DParallel.h"
+#include "TMatrixMulBatched.h"
 #include "TMatrixMULBatchedCPU.h"
 #include "TMerge.h"
 #include "TMultiply.h"
@@ -35,12 +39,24 @@
 #include "TReduceMax.h"
 #include "TReduceMean.h"
 #include "TReduceSum.h"
+#include "TReduceSumSquared.h"
+#include "TReduceVariance.h"
+#include "TScaleAndShift.h"
+#include "TSelect.h"
+#include "TSetAscendingOrder.h"
+#include "TSetData.h"
 #include "TSort.h"
 #include "TSplit.h"
 #include "TSum.h"
+#include "TSumByColumnTV.h"
+#include "TSumByColumnVT.h"
+#include "TTopK.h"
+#include "TUnsqueeze.h"
 #include "TXMem.h"
 #include "THardTanH.h"
+#include "TIdentity.h"
+#include "TLogSoftmax.h"
 #include "TLoss.h"
 #include "TRectify.h"
 #include "TSigmoid.h"