Format correction.

0887fae1 · liyinqiao · 42f995ae · 0887fae1 · 0887fae1 · 0887fae1
Commit 0887fae1 authored Jul 07, 2018 by liyinqiao
--- a/source/core/CHeader.h
+++ b/source/core/CHeader.h
@@ -28,6 +28,10 @@

 #include "Concatenate.h"
 #include "ConcatenateSolely.h"
+#include "CopyBlocks.h"
+#include "CopyBlocksInGrid.h"
+#include "CopyBlocksOnSite.h"
+#include "CopyData2D.h"
 #include "CopyIndexed.h"
 #include "CopyInGrid.h"
 #include "CopyValues.h"
@@ -53,6 +57,7 @@
 #include "ReduceSumSquared.h"
 #include "ReduceVariance.h"
 #include "ScaleAndShift.h"
+#include "Select.h"
 #include "SetData.h"
 #include "Sort.h"
 #include "Split.h"

--- a/source/core/Concatenate.cpp
+++ b/source/core/Concatenate.cpp
@@ -53,6 +53,10 @@ void Concatenate(XList * smalls, XTensor * big, int dim)

 /*
 concatenate two tensors along a given dimension
+>> smallA - one tensor for concatenation
+>> smallB - the other tensor for concatenation
+>> big - the resulting tensor
+>> dim - the dimension along which we perform the concatenation
 */
 void Concatenate(XTensor * smallA, XTensor * smallB, XTensor * big, int dim)
 {

--- a/source/core/Concatenate.h
+++ b/source/core/Concatenate.h
@@ -29,7 +29,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 /*
 concatenate a list of tensors along a given dimension
 Note that this is actually a wrapper that selects "ConcatenateSolely"
-or "Merge" by means of the tensor shapes */
+or "Merge" by means of the tensor shapes 
+*/
 void Concatenate(XList * smalls, XTensor * big, int dim);

 /* concatenate two tensors along a given dimension */

--- a/source/core/ConcatenateSolely.cpp
+++ b/source/core/ConcatenateSolely.cpp
@@ -64,9 +64,11 @@ void ConcatenateSolely(XList * smalls, XTensor * big, int dim)

    int offset = 0;

-    /* two strategies are used - we can either resort to memcpy2d for the case of
+    /* 
+    two strategies are used - we can either resort to memcpy2d for the case of
    concatenation of a few items, or use MergeBlockLists to merge a large number
-    of data blocks */
+    of data blocks 
+    */
    if (smalls->count <= MIN_TENSOR_CAT_NUM) {
        for (int i = 0; i < smalls->count; i++) {
            XTensor * tensor = (XTensor*)smalls->GetItem(i);

--- a/source/core/ConcatenateSolely.h
+++ b/source/core/ConcatenateSolely.h
@@ -26,7 +26,6 @@

 namespace nts { // namespace nts(NiuTrans.Tensor)

-
 /* concatenate a list of tensors along a given dimension */
 extern "C"
 void ConcatenateSolely(XList * smalls, XTensor * big, int dim);

--- a/source/core/CopyBlocks.cpp
+++ b/source/core/CopyBlocks.cpp
@@ -78,9 +78,11 @@ void CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum, 
    else {
        int devID = myMem != NULL ? myMem->devID : -1;

-        /* The following code should be fine with GPUs, but too many
+        /* 
+        The following code should be fine with GPUs, but too many
        kernel calls would slow down the system. We prefer to use
-        one kernel to do block copy in batch (kernel fusion). */
+        one kernel to do block copy in batch (kernel fusion). 
+        */
        for (int i = 0; i < blockNum; i++) {
            XMemCopy((char*)target + targetBlocks[i] * blockSize, devID,
                (char*)source + sourceBlocks[i] * blockSize, devID, blockSize);

--- a/source/core/CopyBlocksOnSite.cpp
+++ b/source/core/CopyBlocksOnSite.cpp
@@ -25,6 +25,7 @@
 #include "CopyBlocksOnSite.cuh"

 namespace nts { // namespace nts(NiuTrans.Tensor)
+
 /*
 copy a number of blocks to target positions. Here we assume that
 all the data has been on the device (CPU/GPU) already.
@@ -47,9 +48,11 @@ void CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target,
    else {
        int devID = myMem != NULL ? myMem->devID : -1;

-        /* The following code should be fine with GPUs, but too many
+        /* 
+        The following code should be fine with GPUs, but too many
        kernel calls would slow down the system. We prefer to use
-        one kernel to do block copy in batch (kernel fusion). */
+        one kernel to do block copy in batch (kernel fusion). 
+        */
        for (int i = 0, b = 0; i < blockNum; i++, b += blockSize) {
            XMemCopy((char*)target + targetBlocks[i] * blockSize, devID,
                (char*)source + b, devID, blockSize);

--- a/source/core/CopyInGrid.cpp
+++ b/source/core/CopyInGrid.cpp
@@ -34,7 +34,7 @@ i.e., reorder the data blocks in the same memory piece
 in the k-th grid
 >> blockDim - leading dimension of blocks
 >> blockNumInGrid - number of blocks in each grid
->> isOnDev - indicates whether the index is on the device already
+>> isIndexOnDev - indicates whether the index is on the device already
 */
 void CopyInGrid(XTensor * s, XTensor * t, int * index, int blockDim, int blockNumInGrid, bool isIndexOnDev)
 {

--- a/source/core/CopyIndexed.cpp
+++ b/source/core/CopyIndexed.cpp
@@ -36,6 +36,7 @@ copy indexed sub-tensors
 >> tgtIndex - index of the target sub-tensors
 >> copyNum - number of the sub-tensors we copy for each source index, e.g.,
 for srcIndex = [1,4] and copyNum = 2, we actually copy the source sub-tensors 1, 2, 4, 5
+<< return - whether the indexed copy operation was successful
 */
 bool CopyIndexed(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum)
 {

--- a/source/core/CopyValues.cuh
+++ b/source/core/CopyValues.cuh
@@ -28,7 +28,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)

 #ifdef USE_CUDA

-/**************************************/
 /* copy all elements from a source matrix to a target matrix */
 extern "C"
 bool CudaCopyValues(XTensor * s, XTensor * t, XStream * stream = NULL);

--- a/source/core/FlushToMem.cu
+++ b/source/core/FlushToMem.cu
@@ -52,7 +52,6 @@ void CudaCPUToGPUFlush(XList * mList, int devID, XMem * GPUMem)
        else
            reqiredSize = m->unitSize * m->unitNum;

-        //reqiredSize = (int)GPUMem->GetPitch(GPUMem->devID, (MTYPE)GPUMem->GetAddress() + size, reqiredSize);
        size += reqiredSize;
    }

@@ -70,7 +69,6 @@ void CudaCPUToGPUFlush(XList * mList, int devID, XMem * GPUMem)
        else
            pSize = m->unitSize * m->unitNum;

-        //reqiredSize = (int)GPUMem->GetPitch(GPUMem->devID, (MTYPE)GPUMem->GetAddress() + p, pSize);
        reqiredSize = pSize;

        memcpy(data + p, m->data, pSize);

--- a/source/core/MakeSplitBlockIndex.cpp
+++ b/source/core/MakeSplitBlockIndex.cpp
@@ -24,6 +24,7 @@
 #include "MakeSplitBlockIndex.cuh"

 namespace nts { // namespace nts(NiuTrans.Tensor)
+
 /*
 set target data block index for the data movement in split
 >> blockIndex - block index

--- a/source/core/MakeSplitBlockIndex.cu
+++ b/source/core/MakeSplitBlockIndex.cu
@@ -51,6 +51,7 @@ void KernelMakeSplitBlockIndex(int * blockIndex, int splitNum, int blockSplitSiz

 /*
 set target data block index for the data movement in split
+>> devID - device id
 >> blockIndex - block index
 >> splitNum - number of splits
 >> blockSplitSize - size of the splitted block

--- a/source/core/MatrixMULBatchedCPU.cpp
+++ b/source/core/MatrixMULBatchedCPU.cpp
@@ -33,9 +33,9 @@ c_i = trans(a_i) * trans(b_i) * \alpha + c_i * \beta for each i in [0,count-1]
 >> transposedA - indicate whether the matrix a is transposed
 >> b - another list of input matrices (2d tensors)
 >> transposedB - indicate whether the matrix b is transposed
+>> c - output matrix (2d tensor)
 >> alpha - scalar
 >> beta - scalar
->> c - output matrix (2d tensor)
 */
 void MatrixMULBatchedCPU(XList * a, MATRIX_TRANS_TYPE transposedA,
    XList * b, MATRIX_TRANS_TYPE transposedB,
@@ -64,10 +64,6 @@ void MatrixMULBatchedCPU(XList * a, MATRIX_TRANS_TYPE transposedA,
        }
    }

-    //if(isUniform){
-    //}
-    //else{
-
    for (int i = 0; i < a->count; i++) {
        XTensor * ai = (XTensor*)a->GetItem(i);
        XTensor * bi = (XTensor*)b->GetItem(i);

--- a/source/core/MatrixMul.h
+++ b/source/core/MatrixMul.h
@@ -39,7 +39,7 @@ normal matrix multiplication if A = y * z and B = x * y.
 */
 extern "C"
 void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
-    DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
+               DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);

 } // namespace nts(NiuTrans.Tensor)


--- a/source/core/MatrixMul2D.cpp
+++ b/source/core/MatrixMul2D.cpp
@@ -104,7 +104,7 @@ void MatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA,
            int num = *((int*)b->data);
            char * p = (char*)b->data + sizeof(int); // pointer to the first tuple

-                                                            /* a * b */
+            /* a * b */
            if (transposedA == X_NOTRANS && transposedB == X_NOTRANS) {
                for (int i = 0; i < num; i++) {
                    int key = *((int*)p);

--- a/source/core/MatrixMul2D.cu
+++ b/source/core/MatrixMul2D.cu
@@ -37,11 +37,13 @@ c = a * b * \alpha
 >> aColSize - column size of matrix a
 >> aRowSize - row size of matrix a
 >> b - a sparse matrix
->> transposedA - indicates whether b is transposed
+>> transposedB - indicates whether b is transposed
 >> bNonZeroNum - number of non-zero items in b
 >> bColSize - column size of matrix b
 >> bRowSize - row size of matrix b
 >> c - the resulting (dense) matrix
+>> cColSize - column size of matrix c
+>> cRowSize - row size of matrix c
 >> alpha - the scaling factor
 */
 extern "C" __global__
@@ -147,7 +149,6 @@ void CudaMatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA,
    if (!a->isSparse && !b->isSparse) {
        CheckNTErrors((!c->isSparse), "Illegal use of sparse matrix in multiplication!");

-        //cublasHandle_t * handle = GDevs->GetCudaHandle(a->devID);
        cublasHandle_t * handle = a->mem == NULL ? GDevs.GetCudaHandle(a->devID) : a->mem->GetCublasHandle();

        /* !!!! might have problems */
@@ -183,7 +184,6 @@ void CudaMatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA,
            if (beta == 0)
                c->SetZeroAll();
            else if (beta != 1.0F) {
-                //XTensor::ScaleAndShift(c, beta, 0);
                ShowNTErrors("TODO!");
            }


--- a/source/core/MatrixMulBatched.cpp
+++ b/source/core/MatrixMulBatched.cpp
@@ -40,6 +40,7 @@ where trans() returns the transposed matrix if the flag is fired
 >> c - where we keep a*b
 >> alpha - a coefficient
 >> beta - another coefficient
+>> parallelRunner - parallel processing module
 */
 void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
    XTensor * b, MATRIX_TRANS_TYPE transposedB,

--- a/source/core/Merge.cpp
+++ b/source/core/Merge.cpp
@@ -27,7 +27,6 @@

 namespace nts { // namespace nts(NiuTrans.Tensor)

-
 /*
 transform a tensor by merging it alone with a dimension, e.g., (N/3, M, 3) -> (N, M)
 >> s - the source tensor

--- a/source/core/MergeBlockLists.cpp
+++ b/source/core/MergeBlockLists.cpp
@@ -27,12 +27,12 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)

 /*
-    merge data by blocks
-    >> sourceList - list of source data array
-    >> blockSizes - list of the block size for each source data array
-    >> blockNum - number of blocks kept in each data array
-    >> target - target data array
-    >> myMem - memory pool
+merge data by blocks
+>> sourceList - list of source data array
+>> blockSizes - list of the block size for each source data array
+>> blockNum - number of blocks kept in each data array
+>> target - target data array
+>> myMem - memory pool
 */
 void MergeBlockLists(XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem)
 {

--- a/source/core/MergeBlockLists.cu
+++ b/source/core/MergeBlockLists.cu
@@ -34,10 +34,9 @@ copy a number of blocks (of different sizes) to target positions
 >> sourceBlockSizes - the size of the block_i
 >> sourceBlockNum - number of blocks to merge
 >> targetList - list of data arrays to copy to
->> target - target data array
 */
 __global__
-    void KernelCopyBlockLists(DTYPE * sourceList[], int * sourceBlockSizes, int sourceBlockNum, DTYPE * targetList[])
+void KernelCopyBlockLists(DTYPE * sourceList[], int * sourceBlockSizes, int sourceBlockNum, DTYPE * targetList[])
 {
    __shared__ int iBlockSizes[MAX_CUDA_THREAD_NUM_PER_BLOCK];
    __shared__ DTYPE * iSourceList[MAX_CUDA_THREAD_NUM_PER_BLOCK];
@@ -82,7 +81,6 @@ void CudaMergeBlockLists(XList * sourceList, int * blockSizes, int blockNum, voi

    int minBlockSize = MAX_INT;
    int maxBlockSize = -MAX_INT;
-    //int realMinBlockSize = 1;
    int realMaxBlockSize = 1;
    DTYPE ** sourceArrays = new DTYPE*[newBlockListSize];
    DTYPE ** targetArrays = new DTYPE*[newBlockListSize];
@@ -110,7 +108,6 @@ void CudaMergeBlockLists(XList * sourceList, int * blockSizes, int blockNum, voi

    CheckNTErrors((minBlockSize % sizeof(DTYPE) == 0), "Unsupported block size!");
    CheckNTErrors((maxBlockSize % sizeof(DTYPE) == 0), "Unsupported block size!");
-    //realMinBlockSize = minBlockSize/sizeof(DTYPE);
    realMaxBlockSize = maxBlockSize / sizeof(DTYPE);

    int cudaGridSizes[3];
@@ -120,31 +117,16 @@ void CudaMergeBlockLists(XList * sourceList, int * blockSizes, int blockNum, voi
                          cudaGridSizes, cudaBlockSizes);

    myMem->SetPinBuf();
-    //MTYPE offset0 = myMem->bufUsed;
    int * sizesGPU = (int*)myMem->AllocBuf(myMem->devID, sizeof(int) * newBlockListSize, 256);

-    //MTYPE offset1 = myMem->bufUsed;
    DTYPE ** sourceArraysGPU = (DTYPE**)myMem->AllocBuf(myMem->devID, sizeof(DTYPE*) * newBlockListSize, 256);

-    //MTYPE offset2 = myMem->bufUsed;
    DTYPE ** targetArraysGPU = (DTYPE**)myMem->AllocBuf(myMem->devID, sizeof(DTYPE*) * newBlockListSize, 256);

-    //MTYPE bufSize = myMem->bufUsed - offset0;
-
-    //char * CPUBuf = new char[bufSize];
-    //memset(CPUBuf, 0 , bufSize);
-
-    //memcpy(CPUBuf, sizes, sizeof(int) * newBlockListSize);
-    //memcpy(CPUBuf + (offset1 - offset0), sourceArrays, sizeof(DTYPE*) * newBlockListSize);
-    //memcpy(CPUBuf + (offset2 - offset0), targetArrays, sizeof(DTYPE*) * newBlockListSize);
-
    XMemCopy(sizesGPU, myMem->devID, sizes, -1, sizeof(int) * newBlockListSize);
    XMemCopy(sourceArraysGPU, myMem->devID, sourceArrays, -1, sizeof(DTYPE*) * newBlockListSize);
    XMemCopy(targetArraysGPU, myMem->devID, targetArrays, -1, sizeof(DTYPE*) * newBlockListSize);

-    /* it is VERY tricky here because we squeeze three data copies into one */
-    //XMemCopy(sizesGPU, myMem->devID, CPUBuf, -1, bufSize);
-
    KernelCopyBlockLists << <dim3(cudaGridSizes[0], cudaGridSizes[1]), dim3(cudaBlockSizes[0], cudaBlockSizes[1]) >> >
                            (sourceArraysGPU, sizesGPU, newBlockListSize, targetArraysGPU);

@@ -154,7 +136,6 @@ void CudaMergeBlockLists(XList * sourceList, int * blockSizes, int blockNum, voi
    delete[] targetArrays;
    delete[] sizes;
    delete[] offsets;
-    //delete[] CPUBuf;
 }
 #endif // USE_CUDA


--- a/source/core/MultiplyElementWise.cpp
+++ b/source/core/MultiplyElementWise.cpp
@@ -24,6 +24,7 @@
 #include "MultiplyElementWise.cuh"

 namespace nts { // namespace nts(NiuTrans.Tensor)
+
 /*
 element-wise product of two tensors
 c(i) = a(i)*b(i) + \alpha * c(i)

--- a/source/core/MultiplyElementWise.cu
+++ b/source/core/MultiplyElementWise.cu
@@ -68,6 +68,7 @@ where |a_lead| means the size of the leading dimension of a
 >> a - tensor a
 >> b - tensor b
 >> c - result tensor
+>> alpha - the coefficient
 >> stride - the number of items we go over when move next along the leading dimension in a block
 >> ldSizeA - size of the leading dimension of a
 >> ldSizeB - size of the leading dimension of b

--- a/source/core/Negate.cpp
+++ b/source/core/Negate.cpp
@@ -26,8 +26,8 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)

 /*
-    set every entry to its minus value
-    >> a - the tensor we are processing
+set every entry to its minus value
+>> a - the tensor we are processing
 */
 void Negate(XTensor * a)
 {

--- a/source/core/Negate.cu
+++ b/source/core/Negate.cu
@@ -42,10 +42,10 @@ void KernelNegate(DTYPE * d, int size)
 }

 /*
-    set each entry to its negtive value (CUDA Kernel)
-    This is for float16 computation
-    >> d - pointer to the data array
-    >> size - size of the data array
+set each entry to its negative value (CUDA Kernel)
+This is for float16 computation
+>> d - pointer to the data array
+>> size - size of the data array
 */
 __global__
 void KernelNegate(__half * d, int size)

--- a/source/core/Normalize.cpp
+++ b/source/core/Normalize.cpp
@@ -25,6 +25,7 @@
 #include "Normalize.cuh"

 namespace nts { // namespace nts(NiuTrans.Tensor)
+
 /*
 normalized the data with normal distribution. For an input x,
 y = a * (x-mean)/sqrt(variance+\epsilon) + b

--- a/source/core/Normalize.cu
+++ b/source/core/Normalize.cu
@@ -25,6 +25,7 @@
 #include "Normalize.cuh"

 namespace nts { // namespace nts(NiuTrans.Tensor)
+
 #ifdef USE_CUDA
 /*
 normalized the data with normal distribution (kernel code). For an input x,

--- a/source/core/Normalize.cuh
+++ b/source/core/Normalize.cuh
@@ -28,7 +28,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)

 #ifdef USE_CUDA

-/* normalized the data with normal distribution (Kernel code). For an input x,
+/* 
+normalized the data with normal distribution (Kernel code). For an input x,
 y = a * (x-mean)/sqrt(variance+\epsilon) + b
 where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter
 */
@@ -37,7 +38,8 @@ void KernelNormalize(DTYPE * input, DTYPE * output, DTYPE * mean, DTYPE * var,
    DTYPE * a, DTYPE * b, DTYPE epsilon,
    int stride, int strideNum, int blockNum);

-/* normalized the data with normal distribution. For an input x,
+/* 
+normalized the data with normal distribution. For an input x,
 y = a * (x-mean)/sqrt(variance+\epsilon) + b
 where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter
 */

--- a/source/core/Power.cpp
+++ b/source/core/Power.cpp
@@ -25,10 +25,11 @@
 #include "Power.cuh"

 namespace nts { // namespace nts(NiuTrans.Tensor)
+
 /*
 get the power(a, p)
 >> a - the tensor
->> power - as it is
+>> p - as it is
 */
 void Power(XTensor * a, DTYPE p)
 {

--- a/source/core/Power.cu
+++ b/source/core/Power.cu
@@ -87,9 +87,6 @@ __global__
 void KernelPower(__half * d, __half p, int size)
 {
 #if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
-    //int i = blockDim.x * blockIdx.x + threadIdx.x;
-    //if (i < size)
-    //    d[i] = hpow(d[i], p);
 #else
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < size)
@@ -126,9 +123,6 @@ void CudaPower(XTensor * a, DTYPE p)
        }
        else if (p != (DTYPE)1.0) {
            ShowNTErrors("TODO!");
-            //unsigned short p2 = FloatToFloat16(p);
-            //__half * pp = (__half*)&p2;
-            //KernelPower<<<blocks, threads>>>((__half*)a->data, *pp, a->unitNum);
        }
    }
    else {

--- a/source/core/ReduceMax.cu
+++ b/source/core/ReduceMax.cu
@@ -31,14 +31,10 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)

 /* 
 reduce a tensor to another that keeps the max value along a dimension  - slow version
-
 Given a block of data, we go over each dimension i in the stride and we have
-
 sum_i = max_{0<=j<strideNum} input_{i,j}
-
 where we can view the block as a matrix and input_{i,j} represent the item at the
 crossing of the i-th columne and the j-th row.
-
 >> input - the input array (representing a tensor)
 >> output - the sum over each block. NOTE: output is also an array
 >> stride - stride that we need to move to the next item
@@ -89,82 +85,77 @@ void KernelReduceMax(DTYPE * input, DTYPE * output,

 }

- /*
- reduce a tensor to another that keeps the max value along a dimension  - slow version
-
- Given a block of data, we go over each dimension i in the stride and we have
-
- sum_i = max_{0<=j<strideNum} input_{i,j}
-
- where we can view the block as a matrix and input_{i,j} represent the item at the
- crossing of the i-th columne and the j-th row.
-
- >> input - the input array (representing a tensor)
- >> output - the sum over each block. NOTE: output is also an array
- >> stride - stride that we need to move to the next item
- >> strideNum - how many strides we need to finish the reduce
- >> reducedStrideNum - the number of strides after reducation
- >> blockSize - size of the block (i.e., stride * strideNum)
- >> blockNum - how many blocks
- */
- __global__
- void KernelReduceMax(__half * input, __half * output,
-         int stride, int strideNum, int reducedStrideNum,
-         int blockSize, int blockNum)
- {
-     int idx = threadIdx.x * blockDim.y + threadIdx.y;
-     unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
-     unsigned int j = blockIdx.y*blockDim.y + threadIdx.y;
+/*
+reduce a tensor to another that keeps the max value along a dimension  - slow version
+Given a block of data, we go over each dimension i in the stride and we have
+sum_i = max_{0<=j<strideNum} input_{i,j}
+where we can view the block as a matrix and input_{i,j} represent the item at the
+crossing of the i-th column and the j-th row.
+>> input - the input array (representing a tensor)
+>> output - the max over each block. NOTE: output is also an array
+>> stride - stride that we need to move to the next item
+>> strideNum - how many strides we need to finish the reduce
+>> reducedStrideNum - the number of strides after reduction
+>> blockSize - size of the block (i.e., stride * strideNum)
+>> blockNum - how many blocks
+*/
+__global__
+void KernelReduceMax(__half * input, __half * output,
+        int stride, int strideNum, int reducedStrideNum,
+        int blockSize, int blockNum)
+{
+    int idx = threadIdx.x * blockDim.y + threadIdx.y;
+    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
+    unsigned int j = blockIdx.y*blockDim.y + threadIdx.y;

-     if (i >= stride * blockNum)
-         return;
+    if (i >= stride * blockNum)
+        return;

 #if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
-     __shared__ __half iData[MAX_CUDA_THREAD_NUM_PER_BLOCK * MIN_CUDA_SHARED_MEM_COL_SIZE / 2];
+    __shared__ __half iData[MAX_CUDA_THREAD_NUM_PER_BLOCK * MIN_CUDA_SHARED_MEM_COL_SIZE / 2];
 #else
-     __shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK * MIN_CUDA_SHARED_MEM_COL_SIZE / 2];
+    __shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK * MIN_CUDA_SHARED_MEM_COL_SIZE / 2];
 #endif

-     __syncthreads();
+    __syncthreads();

-     int k = i / stride;
-     int iOffset = i % stride;
+    int k = i / stride;
+    int iOffset = i % stride;

 #if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
-     __half value = (i < stride * blockNum && j < strideNum) ?
+    __half value = (i < stride * blockNum && j < strideNum) ?
         input[blockSize * k + stride * j + iOffset] : __half(FLOAT16_MIN);
 #else
-     DTYPE value = (i < stride * blockNum && j < strideNum) ?
-         __half2float(input[blockSize * k + stride * j + iOffset]) : FLOAT_MIN;
+    DTYPE value = (i < stride * blockNum && j < strideNum) ?
+        __half2float(input[blockSize * k + stride * j + iOffset]) : FLOAT_MIN;
 #endif

-     /* load data into the shared mem */
-     iData[threadIdx.x * blockDim.y + threadIdx.y] = value;
+    /* load data into the shared mem */
+    iData[threadIdx.x * blockDim.y + threadIdx.y] = value;

-     __syncthreads();
+    __syncthreads();

-     /* do reduction in shared mem */
-     for (unsigned int s = blockDim.y / 2; s > 0; s >>= 1) {
-         if (threadIdx.y < s && iData[idx] < iData[idx + s]) {
-             iData[idx] = iData[idx + s];
-         }
+    /* do reduction in shared mem */
+    for (unsigned int s = blockDim.y / 2; s > 0; s >>= 1) {
+        if (threadIdx.y < s && iData[idx] < iData[idx + s]) {
+            iData[idx] = iData[idx + s];
+        }

-         __syncthreads();
-     }
+        __syncthreads();
+    }

 #if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
-     /* write result for this block to the output array */
-     if (threadIdx.y == 0 && blockIdx.y < reducedStrideNum)
-         output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = iData[threadIdx.x * blockDim.y];
+    /* write result for this block to the output array */
+    if (threadIdx.y == 0 && blockIdx.y < reducedStrideNum)
+        output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = iData[threadIdx.x * blockDim.y];
 #else
-     /* write result for this block to the output array */
-     if (threadIdx.y == 0 && blockIdx.y < reducedStrideNum)
-         output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = __half(iData[threadIdx.x * blockDim.y]);
+    /* write result for this block to the output array */
+    if (threadIdx.y == 0 && blockIdx.y < reducedStrideNum)
+        output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = __half(iData[threadIdx.x * blockDim.y]);
 #endif

 }

-
 /* 
 reduce a tensor to another that keeps the max value along a dimension  - fast version
 >> input - the input array (representing a tensor)
@@ -338,9 +329,7 @@ void KernelReduceMaxSimpleFast(DTYPE * input, DTYPE * output,
 /* 
 get the max-valued items along a dimension of the tensor (cuda version). 
 For a 1-dimensional data array a,
-
 sum_i = max_{0<=j<strideNum} input_{i,j}
-
 >> input - the input tensor
 >> output - the output tensor
 >> dim - which dimension to reduce

--- a/source/core/ReduceMean.cpp
+++ b/source/core/ReduceMean.cpp
@@ -28,7 +28,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
 /* 
 get the mean value along a dimension of the tensor. For a 1-dimensional data array a,
 mean = (1/n) * sum_i input_i
-
 >> input - the input tensor
 >> output - the output tensor
 >> dim - the dimension where the reduction is performed on
@@ -44,5 +43,4 @@ void ReduceMean(XTensor * input, XTensor * output, int dim)
    ScaleAndShift(output, (DTYPE)1/num, 0);
 }

-
 } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/core/ReduceSum.cu
+++ b/source/core/ReduceSum.cu
@@ -29,13 +29,11 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)

 /* 
 reduce a tensor to another that keeps the sum along a dimension  - slow version
-
 Given a block of data, we go over each dimension i in the stride and we have
 sum_i = sum_{0<=j<strideNum} exp(input_{i,j} - shift) if isExp == true;
      = sum_{0<=j<strideNum} input_{i,j} - shift if isExp == false;
 where we can view the block as a matrix and input_{i,j} represent the item at the
 crossing of the i-th columne and the j-th row.
-
 >> input - the input array (representing a tensor)
 >> output - the sum over each block. NOTE: output is also an array
 >> stride - stride that we need to move to the next item
@@ -107,13 +105,11 @@ void KernelReduceSum(DTYPE * input, DTYPE * output,
 /* 
 reduce a tensor to another that keeps the sum along a dimension  - slow version
 This is for float16 reduction.
-
 Given a block of data, we go over each dimension i in the stride and we have
 sum_i = sum_{0<=j<strideNum} exp(input_{i,j} - shift) if isExp == true;
      = sum_{0<=j<strideNum} input_{i,j} - shift if isExp == false;
 where we can view the block as a matrix and input_{i,j} represent the item at the
 crossing of the i-th columne and the j-th row.
-
 >> input - the input array (representing a tensor)
 >> output - the sum over each block. NOTE: output is also an array
 >> stride - stride that we need to move to the next item
@@ -304,7 +300,6 @@ void KernelReduceSumFast(DTYPE * input, DTYPE * output,
 /* 
 reduce a tensor to another that keeps the sum along a dimension  - fast version
 This is for float16 reduction
-
 >> input - the input array (representing a tensor)
 >> output - the sum over each block. NOTE: output is also an array
 >> stride - stride that we need to move to the next item

--- a/source/core/ReduceSumSquared.cpp
+++ b/source/core/ReduceSumSquared.cpp
@@ -28,7 +28,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
 squared sum of the items along a dimension of the tensor. 
 For a 1-dimensional data array a,
 sum = \sum_i (a_i - shift)^2
-
 >> input - the input tensor
 >> output - the output tensor
 >> dim - the dimension where the reduction is performed on

--- a/source/core/ReduceVariance.cpp
+++ b/source/core/ReduceVariance.cpp
@@ -29,7 +29,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
 variance of the items along a dimension of the tensor. 
 For a 1-dimensional data array a,
 variance = 1/n * \sum_i (a_i - mean)^2
-
 >> input - the input tensor
 >> output - the output tensor
 >> dim - the dimension where the reduction is performed on

--- a/source/core/ScaleAndShift.cpp
+++ b/source/core/ScaleAndShift.cpp
@@ -26,9 +26,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)

 /* 
 scale and shift all tensor entires
-
 p = p * scale + shift
-
 >> a - the tensor
 >> scale - the scaler factor
 >> shift - the shift factor

--- a/source/core/ScaleAndShift.cu
+++ b/source/core/ScaleAndShift.cu
@@ -80,9 +80,7 @@ void KernelScaleAndShift(__half * d, int size, __half scale, __half shift)

 /* 
 scale and shift all matrix entires
-
 p = p * scale + shift
-
 >> a - the tensor
 >> scale - the scaler factor
 >> shift - the shift factor

--- a/source/core/Select.cpp
+++ b/source/core/Select.cpp
@@ -31,7 +31,7 @@ c = select(a)
 >> dim - the dimension along with which we do the job
 >> low - lower bound
 >> high - higher bound.
-          Note that range [1,3] means that we select 1 and 2.
+Note that range [1,3] means that we select 1 and 2.
 >> c - result tensor
 */
 void SelectRange(XTensor * a, int dim, int low, int high, XTensor * c)
@@ -75,5 +75,4 @@ void SelectRange(XTensor * a, int dim, int low, int high, XTensor * c)
    }
 }

-
 } // namespace nts(NiuTrans.Tensor)
--- a/source/core/SetData.cpp
+++ b/source/core/SetData.cpp
@@ -68,10 +68,11 @@ void SetDataRand(XTensor * tensor, DTYPE low, DTYPE high)
            ShowNTErrors("TODO");
        }
    }
-    /* GPU code
-       The trick here is that initialize the data on a temperary tensor on CPU.
-       The CPU data is then copied to GPU.
-       TODO: generate data points on GPUs straightforwardly.
+    /* 
+    GPU code
+    The trick here is to initialize the data on a temporary tensor on the CPU.
+    The CPU data is then copied to GPU.
+    TODO: generate data points on GPUs straightforwardly.
    */
    else{
        XTensor * t2 = NewTensor(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, -1);

--- a/source/core/Sort.cpp
+++ b/source/core/Sort.cpp
@@ -39,6 +39,7 @@ void Sort(XTensor * a, XTensor * index, int dim)
    CheckNTErrors((index->dataType == X_INT), "Wrong data type!");

 	int dimRDI = a->order - dim - 1;
+
    /* make the index tensor */
    index->SetAscendingOrder(dim);


--- a/source/core/Sort.cuh
+++ b/source/core/Sort.cuh
@@ -29,6 +29,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA

 /* sort the tensor along a given dimension */
+extern "C"
 void CudaSortBig(XTensor * a, XTensor * b, XTensor * indexA, XTensor * indexB, int dim, int k = -1);

 #endif // USE_CUDA

--- a/source/core/Split.h
+++ b/source/core/Split.h
@@ -27,6 +27,7 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)

 /* transform a tensor by splitting it, e.g., (M, N) -> (M, N/3, 3) */
+extern "C"
 void Split(XTensor * s, XTensor * t, int whereToSplit, int splitNum);

 /* split a big tensor into small tensors */

--- a/source/core/Sum.cu
+++ b/source/core/Sum.cu
@@ -25,6 +25,7 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)

 #ifdef USE_CUDA
+
 /*
 summation of data arrays (CUDA Kernel)
 c = a  + b * \beta

--- a/source/core/Sum.cuh
+++ b/source/core/Sum.cuh
@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)

 #ifdef USE_CUDA

-		/* summation of data arrays (CUDA Kernel) */
+/* summation of data arrays (CUDA Kernel) */
 extern "C" __global__
 void KernelADD(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta = (DTYPE)1.0);


--- a/source/core/SumByColumnVT.cu
+++ b/source/core/SumByColumnVT.cu
@@ -27,6 +27,7 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)

 #ifdef USE_CUDA
+
 /*
 summation of a vector (column vector) and a tensor
 c = a + \sum{col} b_col * \beta

--- a/source/core/SumByColumnVT.h
+++ b/source/core/SumByColumnVT.h
@@ -26,7 +26,6 @@

 namespace nts { // namespace nts(NiuTrans.Tensor)

-
 /* sum of a (column) vector and a tensor */
 extern "C"
 void SumByColumnVT(XTensor * a, XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);

--- a/source/core/TopK.cpp
+++ b/source/core/TopK.cpp
@@ -24,6 +24,7 @@
 #include "TopK.cuh"

 namespace nts { // namespace nts(NiuTrans.Tensor)
+
 /*
 get the top-k items along a given dimension
 >> a - input tensor

--- a/source/core/TopK.cu
+++ b/source/core/TopK.cu
@@ -95,9 +95,11 @@ public:
    /* swap */
    __device__ void Swap(int i, int j)
    {
-        /*CudaHeapNode<T> tmp = items[i];
+        /*
+		CudaHeapNode<T> tmp = items[i];
        items[i] = items[j];
-        items[j] = tmp;*/
+        items[j] = tmp;
+		*/
        int tmpIndex = items[i].index;
        T tmpValue = items[i].value;
        items[i] = items[j];
@@ -239,8 +241,10 @@ void KernelTopK(T * input, int stride, int strideNum, int blockNum, int k, T min
    if (threadIdx.x == 0) {
        CudaXHeap<MIN_HEAP, T> heapFinal(k, k, heapData + k * threadIdx.y * blockDim.x);

-        /* merge the result over the workers.
-        This can be improved by parallel merging */
+        /* 
+		merge the result over the workers.
+        This can be improved by parallel merging 
+		*/
        if (blockDim.x > 1) {
            for (int p = 1; p < blockDim.x && p < strideNum; p++) {
                CudaHeapNode<T> * hd = heapData + k * (threadIdx.y * blockDim.x + p);
@@ -429,6 +433,7 @@ void CudaTopK(XTensor * a, XTensor * b, XTensor * index, int dim, int k)
        }

    }
+
    /* we resort to sorting if the data cannot fit inside the shared memory */
    else {
        int dimSize[MAX_TENSOR_DIM_NUM];

--- a/source/core/XMatrixSegment.cpp
+++ b/source/core/XMatrixSegment.cpp
@@ -227,7 +227,7 @@ int SegmentTensor2D(int rowNum, int colNum, int blockNum, int * blockIndex)
    x2 = colSize - 1;
    y2 = rowSize - 1;  // bottom-right corner

-                        /* the main body of the matrix (after removing the margin block) */
+    /* the main body of the matrix (after removing the margin block) */
    while (x1 <= xMax) {
        y1 = 0;
        x2 = x1 + colSize - 1;

--- a/source/core/XMatrixSegment.h
+++ b/source/core/XMatrixSegment.h
@@ -26,9 +26,7 @@

 namespace nts { // namespace nts(NiuTrans.Tensor)

-/*******************************************************************
-segmentation and parallel processing for 2d tensors (i.e., matrices)
-*/
+/* segmentation and parallel processing for 2d tensors (i.e., matrices) */
 /* segment a 2d tensor (i.e., matrix) into blocks and run jobs in parallel */
 extern "C"
 void RunParallel2D(XPRunner * parallelRunner, void * job, int opNum, int rowNum, int colNum, int argNum, ...);

--- a/source/core/XTensorBLAS.cu
+++ b/source/core/XTensorBLAS.cu
@@ -28,9 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)

 #ifdef USE_CUDA

-/*
-matrix multiplication via cuda version BLAS
-*/
+/* matrix multiplication via cuda version BLAS */
 void CudaBLASMatrixMUL(cublasHandle_t * handle,
                       void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
                       void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
@@ -85,9 +83,7 @@ void CudaBLASMatrixMUL(cublasHandle_t * handle,
    }
 }

-/*
-matrix multiplication via cuda version BLAS
-*/
+/* matrix multiplication via cuda version BLAS */
 void CudaBLASMatrixMULBatched(cublasHandle_t * handle,
                              const void ** a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
                              const void ** b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
@@ -143,7 +139,6 @@ void CudaBLASMatrixMULBatched(cublasHandle_t * handle,
 }

 /* matrix multiplication in batch and strided mode via cuda version BLAS */
-extern "C"
 void CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
                                     const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, long long int strideA,
                                     const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, long long int strideB,
@@ -198,9 +193,7 @@ void CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
    }
 }

-/*
-matrix multiplication via cuda version BLAS
-*/
+/* matrix multiplication via cuda version BLAS */
 void CudaBLASMatrixMULList(cublasHandle_t * handle,
                           XList * a, MATRIX_TRANS_TYPE transposedA,
                           XList * b, MATRIX_TRANS_TYPE transposedB,

--- a/source/core/XTensorCore.h
+++ b/source/core/XTensorCore.h
--- a/source/function/HardTanH.cpp
+++ b/source/function/HardTanH.cpp
@@ -25,7 +25,6 @@

 namespace nts{ // namespace nts(NiuTrans.Tensor)

-
 /*
 hard tanh function 
 y =  1    if x > 1

--- a/source/function/HardTanH.cu
+++ b/source/function/HardTanH.cu
@@ -95,7 +95,6 @@ dy/dx = 1     if -1 <= x <= 1
 >> y - y of the function
 >> x - x of the function
 >> size - size of y/x
-
 */
 __global__ 
 void KernelHardtanhBackward(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y, DTYPE * x, int size)

--- a/source/function/LogSoftmax.cpp
+++ b/source/function/LogSoftmax.cpp
@@ -49,7 +49,6 @@ void LogSoftmax(XTensor * x, XTensor * y, int leadDim)
                dimSize[i - 1] = -x->dimSize[i];
        }

-
        XMem * mem = x->mem;
        XTensor * max = NULL;
        XTensor * sum = NULL;
@@ -168,7 +167,6 @@ dE/dx = dE/dy * dy/dx

 log softmax: y_i = log(e^{x_i} / \sum_{k} e^{x_k})

-
  dy_i/dx_j 
 = d{log(e^{x_i} / \sum_{k} e^{x_k})}/dx_j
 = d{log(e^{x_i})}/dx_j - d{log(\sum_{k} e^{x_k})}/dx_j

--- a/source/function/LogSoftmax.cu
+++ b/source/function/LogSoftmax.cu
@@ -41,7 +41,8 @@ void CudaLogSoftmax(XTensor * x, XTensor * y, int leadDim)
    ShowNTErrors("You should call LogSoftmax instead!");
 }

-/* log softmax forward computation (Cuda kernel)
+/* 
+log softmax forward computation (Cuda kernel)

 for each column j, let y_{i,j} and x_{i,j} are the output
 and state value for the i-th element of column j. We have
@@ -85,7 +86,8 @@ void KernelLogSoftmaxComputeByRow(DTYPE * x, DTYPE * max, DTYPE * sum, DTYPE * y
    }
 }

-/* log softmax forward computation (Cuda kernel)
+/* 
+log softmax forward computation (Cuda kernel)

 for each row i, let y_{i,j} and x_{i,j} are the output
 and state value for the j-th element of row i. We have
@@ -182,7 +184,7 @@ void CudaLogSoftmaxSumMax(XTensor * x, XTensor * y, int leadDim, XTensor * sum, 
 /*
 set dE/dx = exp(y)

->> dedu - dE/dy
+>> dedy - dE/dy
 >> dedx - dE/dx
 >> y - output of the function
 >> size - size of output
@@ -256,7 +258,9 @@ dE/dx_j += -gold_j
 >> gold - gold standard to measure error (or loss)
 >> y - output of the function
 >> x - input of the function
->> size - size of input/output
+>> rowNum - row number of the matrix
+>> colNum - column number of the matrix
+>> gNonZeroNum - presumably the number of non-zero entries in gold (TODO confirm)
 >> lossName - name of the loss function
 */
 __global__
@@ -293,7 +297,6 @@ dE/dx = dE/dy * dy/dx

 log softmax: y_i = log(e^{x_i} / \sum_{k} e^{x_k})

-
 dy_i/dx_j
 = d{log(e^{x_i} / \sum_{k} e^{x_k})}/dx_j
 = d{log(e^{x_i})}/dx_j - d{log(\sum_{k} e^{x_k})}/dx_j

--- a/source/function/Loss.cu
+++ b/source/function/Loss.cu
@@ -31,7 +31,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
 loss function to measure the "number" of errors
 */

-
 /* 
 compute the loss 
 >> gold - gold standard

--- a/source/function/Rectify.cu
+++ b/source/function/Rectify.cu
@@ -88,7 +88,6 @@ dy/dx =  1    if x >= 0
 >> y - output of the function
 >> x - input of the function
 >> size - size of output/input
-
 */
 __global__ 
 void KernelRectifyBackward(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y, DTYPE * x, int size)

--- a/source/function/Sigmoid.cpp
+++ b/source/function/Sigmoid.cpp
@@ -25,7 +25,6 @@

 namespace nts{ // namespace nts(NiuTrans.Tensor)

-
 /*
 sigmoid function y = 1/(1+exp(-x))
 >> x - input tensor

--- a/source/function/Sigmoid.cu
+++ b/source/function/Sigmoid.cu
@@ -95,7 +95,6 @@ sigmoid: y = 1/(1+exp(-x))
 >> y - output of the function
 >> x - input of the function
 >> size - size of output/input
-
 */
 __global__ 
 void KernelSigmoidBackward(DTYPE * dedy, DTYPE * dedx, DTYPE * gold, DTYPE * y, DTYPE * x, int size)
@@ -122,7 +121,6 @@ sigmoid: y = 1/(1+exp(-x))
 >> dedy - dE/dy
 >> dedx - dE/dx
 >> lossName - type of loss function, e.g., cross entropy
-
 */
 void CudaSigmoidBackward(XTensor * gold, XTensor * y, XTensor * x, 
                         XTensor * dedy, XTensor * dedx,

--- a/source/function/Softmax.cuh
+++ b/source/function/Softmax.cuh
@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)

 #ifdef USE_CUDA

-
 /* softmax y = e^x / \sum_{i} e^{x_i} (Cuda version) */
 extern "C"
 void CudaSotmax(XTensor * input, XTensor * output, int leadDim);

--- a/source/test/TConcatenate.cpp
+++ b/source/test/TConcatenate.cpp
@@ -22,8 +22,10 @@
 #include "TConcatenate.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* case 1: concatenate a list of tensors along a given dimension.
-* In this case, 2 * (2, 1) -> (2, 2), dim=1.
+
+/* 
+case 1: concatenate a list of tensors along a given dimension.
+In this case, 2 * (2, 1) -> (2, 2), dim=1.
 */
 bool TestConcatenate1()
 {
@@ -60,12 +62,12 @@ bool TestConcatenate1()
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];

-    DTYPE sData1[2][1] = { {0.0},
-                           {1.0} };
-    DTYPE sData2[2][1] = { {2.0},
-                           {3.0} };
-    DTYPE answer[2][2] = { {0.0, 2.0},
-                           {1.0, 3.0} };
+    DTYPE sData1[2][1] = { {0.0F},
+                           {1.0F} };
+    DTYPE sData2[2][1] = { {2.0F},
+                           {3.0F} };
+    DTYPE answer[2][2] = { {0.0F, 2.0F},
+                           {1.0F, 3.0F} };

    /* CPU test */
    bool cpuTest = true;
@@ -144,8 +146,9 @@ bool TestConcatenate1()
 #endif // USE_CUDA
 }

-/* case 2: concatenate a list of tensors along a given dimension.
-* In this case, 2 * (2, 1) -> (4, 1), dim=0.
+/* 
+case 2: concatenate a list of tensors along a given dimension.
+In this case, 2 * (2, 1) -> (4, 1), dim=0.
 */
 bool TestConcatenate2()
 {
@@ -182,14 +185,14 @@ bool TestConcatenate2()
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];

-    DTYPE sData1[2][1] = { {0.0},
-                           {1.0} };
-    DTYPE sData2[2][1] = { {2.0},
-                           {3.0} };
-    DTYPE answer[4][1] = { {0.0},
-                           {1.0},
-                           {2.0},
-                           {3.0} };
+    DTYPE sData1[2][1] = { {0.0F},
+                           {1.0F} };
+    DTYPE sData2[2][1] = { {2.0F},
+                           {3.0F} };
+    DTYPE answer[4][1] = { {0.0F},
+                           {1.0F},
+                           {2.0F},
+                           {3.0F} };

    /* CPU test */
    bool cpuTest = true;
@@ -268,8 +271,9 @@ bool TestConcatenate2()
 #endif // USE_CUDA
 }

-/* case 3: concatenate a list of tensors along a given dimension.
-* In this case, (2, 1) + (2, 2) -> (2, 3), dim=1.
+/* 
+case 3: concatenate a list of tensors along a given dimension.
+In this case, (2, 1) + (2, 2) -> (2, 3), dim=1.
 */
 bool TestConcatenate3()
 {
@@ -306,12 +310,12 @@ bool TestConcatenate3()
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];

-    DTYPE sData1[2][1] = { {0.0},
-                           {1.0} };
-    DTYPE sData2[2][2] = { {2.0, 3.0},
-                           {4.0, 5.0} };
-    DTYPE answer[2][3] = { {0.0, 2.0, 3.0},
-                           {1.0, 4.0, 5.0} };
+    DTYPE sData1[2][1] = { {0.0F},
+                           {1.0F} };
+    DTYPE sData2[2][2] = { {2.0F, 3.0F},
+                           {4.0F, 5.0F} };
+    DTYPE answer[2][3] = { {0.0F, 2.0F, 3.0F},
+                           {1.0F, 4.0F, 5.0F} };

    /* CPU test */
    bool cpuTest = true;
@@ -390,8 +394,9 @@ bool TestConcatenate3()
 #endif // USE_CUDA
 }

-/* case 4: concatenate two tensors along a given dimension.
-* In this case, (2, 1), (2, 2) -> (2, 3), dim=1.
+/* 
+case 4: concatenate two tensors along a given dimension.
+In this case, (2, 1), (2, 2) -> (2, 3), dim=1.
 */
 bool TestConcatenate4()
 {
@@ -425,12 +430,12 @@ bool TestConcatenate4()
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];

-    DTYPE sData1[2][1] = { {0.0},
-                           {1.0} };
-    DTYPE sData2[2][2] = { {2.0, 3.0},
-                           {4.0, 5.0} };
-    DTYPE answer[2][3] = { {0.0, 2.0, 3.0},
-                           {1.0, 4.0, 5.0} };
+    DTYPE sData1[2][1] = { {0.0F},
+                           {1.0F} };
+    DTYPE sData2[2][2] = { {2.0F, 3.0F},
+                           {4.0F, 5.0F} };
+    DTYPE answer[2][3] = { {0.0F, 2.0F, 3.0F},
+                           {1.0F, 4.0F, 5.0F} };

    /* CPU test */
    bool cpuTest = true;
@@ -502,7 +507,6 @@ TODO!!
 */

 /* test for Concatenate Function */
-extern "C"
 bool TestConcatenate()
 {
    XPRINT(0, stdout, "[TEST CONCATENATE] concatenate a list of tensors or two tensors along a given dimension \n");

--- a/source/test/TConcatenateSolely.cpp
+++ b/source/test/TConcatenateSolely.cpp
@@ -19,12 +19,14 @@
 * $Created by: Lin Ye (email: linye2015@outlook.com) 2018-06-14
 */

-#include "TConcatenateSolely.h"
 #include "../XList.h"
+#include "TConcatenateSolely.h"

 namespace nts { // namespace nt(NiuTrans.Tensor)
-/* case 1: concatenate a list of tensors along a given dimension
-* In this case, 2 * (2, 1) -> (2, 2), dim=1.
+
+/* 
+case 1: concatenate a list of tensors along a given dimension
+In this case, 2 * (2, 1) -> (2, 2), dim=1.
 */
 bool TestConcatenateSolely1()
 {
@@ -61,12 +63,12 @@ bool TestConcatenateSolely1()
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];

-    DTYPE sData1[2][1] = { {0.0},
-                           {1.0} };
-    DTYPE sData2[2][1] = { {2.0},
-                           {3.0} };
-    DTYPE answer[2][2] = { {0.0, 2.0},
-                           {1.0, 3.0} };
+    DTYPE sData1[2][1] = { {0.0F},
+                           {1.0F} };
+    DTYPE sData2[2][1] = { {2.0F},
+                           {3.0F} };
+    DTYPE answer[2][2] = { {0.0F, 2.0F},
+                           {1.0F, 3.0F} };

    /* CPU test */
    bool cpuTest = true;
@@ -145,8 +147,9 @@ bool TestConcatenateSolely1()
 #endif // USE_CUDA
    }

-/* case 2: concatenate a list of tensors along a given dimension
-* In this case, 2 * (2, 1) -> (4, 1), dim=0.
+/* 
+case 2: concatenate a list of tensors along a given dimension
+In this case, 2 * (2, 1) -> (4, 1), dim=0.
 */
 bool TestConcatenateSolely2()
 {
@@ -183,14 +186,14 @@ bool TestConcatenateSolely2()
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];

-    DTYPE sData1[2][1] = { {0.0},
-                           {1.0} };
-    DTYPE sData2[2][1] = { {2.0},
-                           {3.0} };
-    DTYPE answer[4][1] = { {0.0},
-                           {1.0},
-                           {2.0},
-                           {3.0} };
+    DTYPE sData1[2][1] = { {0.0F},
+                           {1.0F} };
+    DTYPE sData2[2][1] = { {2.0F},
+                           {3.0F} };
+    DTYPE answer[4][1] = { {0.0F},
+                           {1.0F},
+                           {2.0F},
+                           {3.0F} };

    /* CPU test */
    bool cpuTest = true;
@@ -269,8 +272,9 @@ bool TestConcatenateSolely2()
 #endif // USE_CUDA
 }

-/* case 3: concatenate a list of tensors along a given dimension
-* In this case, (2, 1) + (2, 2) -> (2, 3), dim=1.
+/* 
+case 3: concatenate a list of tensors along a given dimension
+In this case, (2, 1) + (2, 2) -> (2, 3), dim=1.
 */
 bool TestConcatenateSolely3()
 {
@@ -307,12 +311,12 @@ bool TestConcatenateSolely3()
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];

-    DTYPE sData1[2][1] = { {0.0},
-                           {1.0} };
-    DTYPE sData2[2][2] = { {2.0, 3.0},
-                           {4.0, 5.0} };
-    DTYPE answer[2][3] = { {0.0, 2.0, 3.0},
-                           {1.0, 4.0, 5.0} };
+    DTYPE sData1[2][1] = { {0.0F},
+                           {1.0F} };
+    DTYPE sData2[2][2] = { {2.0F, 3.0F},
+                           {4.0F, 5.0F} };
+    DTYPE answer[2][3] = { {0.0F, 2.0F, 3.0F},
+                           {1.0F, 4.0F, 5.0F} };

    /* CPU test */
    bool cpuTest = true;
@@ -397,7 +401,6 @@ TODO!!
 */

 /* test for ConcatenateSolely Function */
-extern "C"
 bool TestConcatenateSolely()
 {
    XPRINT(0, stdout, "[TEST CONCATENATESOLELY] concatenate a list of tensors along a given dimension \n");

--- a/source/test/TCopyIndexed.cpp
+++ b/source/test/TCopyIndexed.cpp
@@ -22,9 +22,11 @@
 #include "TCopyIndexed.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* case 1 copy indexed sub-tensors 
-* In this case, (3, 2, 3) -> (3, 2, 2), dim = 2, indexSize = 2, 
-* srcIndex = [0, 2], tgtIndex = [0, 1], copyNum = 1.
+
+/* 
+case 1: copy indexed sub-tensors
+In this case, (3, 2, 3) -> (3, 2, 2), dim = 2, indexSize = 2, 
+srcIndex = [0, 2], tgtIndex = [0, 1], copyNum = 1.
 */
 bool TestCopyIndexed1()
 {
@@ -50,19 +52,19 @@ bool TestCopyIndexed1()
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];

-    DTYPE sData[3][2][3] = { { {0.0, -1.0, 2.0},
-                               {2.0, 1.0, 3.0} },
-                             { {1.0, 2.0, 4.0}, 
-                               {3.0, 1.0, 2.0}},
-                             { {-1.0, 3.0, 2.0}, 
-                               {1.0, -1.0, 0.0} } };
-
-    DTYPE answer[3][2][2] = { { {0.0, 2.0},
-                                {2.0, 3.0} },
-                              { {1.0, 4.0}, 
-                                {3.0, 2.0}},
-                              { {-1.0, 2.0}, 
-                                {1.0, 0.0} } };
+    DTYPE sData[3][2][3] = { { {0.0F, -1.0F, 2.0F},
+                               {2.0F, 1.0F, 3.0F} },
+                             { {1.0F, 2.0F, 4.0F}, 
+                               {3.0F, 1.0F, 2.0F}},
+                             { {-1.0F, 3.0F, 2.0F}, 
+                               {1.0F, -1.0F, 0.0F} } };
+
+    DTYPE answer[3][2][2] = { { {0.0F, 2.0F},
+                                {2.0F, 3.0F} },
+                              { {1.0F, 4.0F}, 
+                                {3.0F, 2.0F}},
+                              { {-1.0F, 2.0F}, 
+                                {1.0F, 0.0F} } };
    int dim = 2;
    int indexSize = 2;
    int srcIndex[2] = {0, 2};
@@ -131,7 +133,6 @@ TODO!!
 */

 /* test for CopyIndexed Function */
-extern "C"
 bool TestCopyIndexed()
 {
    XPRINT(0, stdout, "[TEST CopyIndexed] copy indexed sub-tensors \n");

--- a/source/test/TCopyValues.cpp
+++ b/source/test/TCopyValues.cpp
@@ -23,6 +23,7 @@
 #include "TCopyValues.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)
+
 /* case 1: copy tensor s to tensor t */
 bool TestCopyValues1()
 {
@@ -36,11 +37,11 @@ bool TestCopyValues1()
    for (int i = 0; i < sOrder; i++)
        sUnitNum *= sDimSize[i];

-    DTYPE sData[2][4] = { {0.0, 1.0, 2.0, 3.0},
-                          {4.0, 5.0, 6.0, 7.0} };
+    DTYPE sData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
+                          {4.0F, 5.0F, 6.0F, 7.0F} };

-    DTYPE scaleFactor = 2.0;
-    DTYPE shiftFactor = 0.5;
+    DTYPE scaleFactor = 2.0F;
+    DTYPE shiftFactor = 0.5F;

    /* CPU test */
    bool cpuTest = true;
@@ -105,7 +106,6 @@ TODO!!
 */

 /* test for CopyValues Function */
-extern "C"
 bool TestCopyValues()
 {
    XPRINT(0, stdout, "[TEST CopyValues] copy tensor s to tensor t \n");

--- a/source/test/THardTanH.cpp
+++ b/source/test/THardTanH.cpp
@@ -22,10 +22,11 @@
 #include "THardTanH.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)
+
 /* case 1: hard tanh function */
 bool TestHardTanH1()
 {
-	/* a x tensor of size 2 * 3 */
+	/* a x tensor of size (2, 3) */
 	int xOrder = 2;
 	int * xDimSize = new int[xOrder];
 	xDimSize[0] = 2;
@@ -35,7 +36,7 @@ bool TestHardTanH1()
 	for (int i = 0; i < xOrder; i++)
 		xUnitNum *= xDimSize[i];

-	/* a y tensor of size 2 * 3 */
+	/* a y tensor of size (2, 3) */
 	int yOrder = 2;
 	int * yDimSize = new int[yOrder];
 	yDimSize[0] = 2;
@@ -45,10 +46,10 @@ bool TestHardTanH1()
 	for (int i = 0; i < yOrder; i++)
 		yUnitNum *= yDimSize[i];

-	DTYPE xData[2][3] = { {0.5, -1.0, 2.0},
-	                      {3.5, -4.5, 1.0} };
-	DTYPE answer[2][3] = { {0.5, -1.0, 1.0},
-	                       {1.0, -1.0, 1.0} };
+	DTYPE xData[2][3] = { {0.5F, -1.0F, 2.0F},
+	                      {3.5F, -4.5F, 1.0F} };
+	DTYPE answer[2][3] = { {0.5F, -1.0F, 1.0F},
+	                       {1.0F, -1.0F, 1.0F} };

 	/* CPU test */
 	bool cpuTest = true;
@@ -86,25 +87,32 @@ bool TestHardTanH1()
 	gpuTest = yGPU->CheckData(answer, yUnitNum, 1e-4F);

 	/* destroy variables */
-	delete x, y, xGPU, yGPU;
-	delete[] xDimSize, yDimSize;
+	delete x;
+    delete y;
+    delete xGPU;
+    delete yGPU;
+	delete[] xDimSize;
+    delete[] yDimSize;

 	return cpuTest && gpuTest;
 #else
 	/* destroy variables */
-	delete x, y;
-	delete[] xDimSize, yDimSize;
+	delete x;
+    delete y;
+	delete[] xDimSize;
+    delete[] yDimSize;

 	return cpuTest;
 #endif // USE_CUDA
 }

-/* case 2: backward computation 
-* In this case, lossName=CROSSENTROPY.
+/*
+case 2: backward computation 
+In this case, lossName=CROSSENTROPY.
 */
 bool TestHardTanH2()
 {
-	/* a x tensor of size 2 * 3 */
+	/* a x tensor of size (2, 3) */
 	int xOrder = 2;
 	int * xDimSize = new int[xOrder];
 	xDimSize[0] = 2;
@@ -114,7 +122,7 @@ bool TestHardTanH2()
 	for (int i = 0; i < xOrder; i++)
 		xUnitNum *= xDimSize[i];

-	/* a y tensor of size 2 * 3 */
+	/* a y tensor of size (2, 3) */
 	int yOrder = 2;
 	int * yDimSize = new int[yOrder];
 	yDimSize[0] = 2;
@@ -124,7 +132,7 @@ bool TestHardTanH2()
 	for (int i = 0; i < yOrder; i++)
 		yUnitNum *= yDimSize[i];

-	/* a gold tensor of size 2 * 3 */
+	/* a gold tensor of size (2, 3) */
 	int goldOrder = 2;
 	int * goldDimSize = new int[goldOrder];
 	goldDimSize[0] = 2;
@@ -134,7 +142,7 @@ bool TestHardTanH2()
 	for (int i = 0; i < goldOrder; i++)
 		goldUnitNum *= goldDimSize[i];

-	/* a dedy tensor of size 2 * 3 */
+	/* a dedy tensor of size (2, 3) */
 	int dedyOrder = 2;
 	int * dedyDimSize = new int[dedyOrder];
 	dedyDimSize[0] = 2;
@@ -144,7 +152,7 @@ bool TestHardTanH2()
 	for (int i = 0; i < dedyOrder; i++)
 		dedyUnitNum *= dedyDimSize[i];

-	/* a dedx tensor of size 2 * 3 */
+	/* a dedx tensor of size (2, 3) */
 	int dedxOrder = 2;
 	int * dedxDimSize = new int[dedxOrder];
 	dedxDimSize[0] = 2;
@@ -154,16 +162,16 @@ bool TestHardTanH2()
 	for (int i = 0; i < dedxOrder; i++)
 		dedxUnitNum *= dedxDimSize[i];

-	DTYPE xData[2][3] = { {0.5, -1.0, 2.0},
-	                      {3.5, -4.5, 1.0} };
-	DTYPE yData[2][3] = { {0.5, -1.0, 1.0},
-	                       {1.0, -1.0, 1.0} };
-	DTYPE goldData[2][3] = { {1.0, 1.0, 1.0},
-	                         {1.0, 1.0, 1.0} };
-	DTYPE dedyData[2][3] = { {-2.0, 1.0, -1.0},
-	                         {-1.0, 1.0, -1.0} };
-	DTYPE answer[2][3] = { {-2.0, 1.0, 0.0},
-	                       {0.0, 0.0, -1.0} };
+	DTYPE xData[2][3] = { {0.5F, -1.0F, 2.0F},
+	                      {3.5F, -4.5F, 1.0F} };
+	DTYPE yData[2][3] = { {0.5F, -1.0F, 1.0F},
+	                       {1.0F, -1.0F, 1.0F} };
+	DTYPE goldData[2][3] = { {1.0F, 1.0F, 1.0F},
+	                         {1.0F, 1.0F, 1.0F} };
+	DTYPE dedyData[2][3] = { {-2.0F, 1.0F, -1.0F},
+	                         {-1.0F, 1.0F, -1.0F} };
+	DTYPE answer[2][3] = { {-2.0F, 1.0F, 0.0F},
+	                       {0.0F, 0.0F, -1.0F} };

 	/* CPU test */
 	bool cpuTest = true;
@@ -226,12 +234,13 @@ bool TestHardTanH2()
 #endif // USE_CUDA
 }

-/* case 3: backward computation
-* In this case, lossName=SQUAREDERROR.
+/* 
+case 3: backward computation
+In this case, lossName=SQUAREDERROR.
 */
 bool TestHardTanH3()
 {
-	/* a x tensor of size 2 * 3 */
+	/* a x tensor of size (2, 3) */
 	int xOrder = 2;
 	int * xDimSize = new int[xOrder];
 	xDimSize[0] = 2;
@@ -241,7 +250,7 @@ bool TestHardTanH3()
 	for (int i = 0; i < xOrder; i++)
 		xUnitNum *= xDimSize[i];

-	/* a y tensor of size 2 * 3 */
+	/* a y tensor of size (2, 3) */
 	int yOrder = 2;
 	int * yDimSize = new int[yOrder];
 	yDimSize[0] = 2;
@@ -251,7 +260,7 @@ bool TestHardTanH3()
 	for (int i = 0; i < yOrder; i++)
 		yUnitNum *= yDimSize[i];

-	/* a gold tensor of size 2 * 3 */
+	/* a gold tensor of size (2, 3) */
 	int goldOrder = 2;
 	int * goldDimSize = new int[goldOrder];
 	goldDimSize[0] = 2;
@@ -261,7 +270,7 @@ bool TestHardTanH3()
 	for (int i = 0; i < goldOrder; i++)
 		goldUnitNum *= goldDimSize[i];

-	/* a dedy tensor of size 2 * 3 */
+	/* a dedy tensor of size (2, 3) */
 	int dedyOrder = 2;
 	int * dedyDimSize = new int[dedyOrder];
 	dedyDimSize[0] = 2;
@@ -271,7 +280,7 @@ bool TestHardTanH3()
 	for (int i = 0; i < dedyOrder; i++)
 		dedyUnitNum *= dedyDimSize[i];

-	/* a dedx tensor of size 2 * 3 */
+	/* a dedx tensor of size (2, 3) */
 	int dedxOrder = 2;
 	int * dedxDimSize = new int[dedxOrder];
 	dedxDimSize[0] = 2;
@@ -281,16 +290,16 @@ bool TestHardTanH3()
 	for (int i = 0; i < dedxOrder; i++)
 		dedxUnitNum *= dedxDimSize[i];

-	DTYPE xData[2][3] = { {0.5, -1.0, 2.0},
-	                      {3.5, -4.5, 1.0} };
-	DTYPE yData[2][3] = { {0.5, -1.0, 1.0},
-	                      {1.0, -1.0, 1.0} };
-	DTYPE goldData[2][3] = { {1.0, 1.0, 1.0},
-	                         {1.0, 1.0, 1.0} };
-	DTYPE dedyData[2][3] = { {-0.5, -2.0, 0.0 },
-	                         {0.0, -2.0, 0.0 } };
-	DTYPE answer[2][3] = { {-0.5, -2.0, 0.0},
-	                       {0.0, 0.0, 0.0} };
+	DTYPE xData[2][3] = { {0.5F, -1.0F, 2.0F},
+	                      {3.5F, -4.5F, 1.0F} };
+	DTYPE yData[2][3] = { {0.5F, -1.0F, 1.0F},
+	                      {1.0F, -1.0F, 1.0F} };
+	DTYPE goldData[2][3] = { {1.0F, 1.0F, 1.0F},
+	                         {1.0F, 1.0F, 1.0F} };
+	DTYPE dedyData[2][3] = { {-0.5F, -2.0F, 0.0F },
+	                         {0.0F, -2.0F, 0.0F } };
+	DTYPE answer[2][3] = { {-0.5F, -2.0F, 0.0F},
+	                       {0.0F, 0.0F, 0.0F} };

 	/* CPU test */
 	bool cpuTest = true;
@@ -353,12 +362,13 @@ bool TestHardTanH3()
 #endif // USE_CUDA
 }

-/* case 4: backward computation
-* In this case, lossName=ONEHOTERROR.
+/* 
+case 4: backward computation
+In this case, lossName=ONEHOTERROR.
 */
 bool TestHardTanH4()
 {
-	/* a x tensor of size 2 * 3 */
+	/* a x tensor of size (2, 3) */
 	int xOrder = 2;
 	int * xDimSize = new int[xOrder];
 	xDimSize[0] = 2;
@@ -368,7 +378,7 @@ bool TestHardTanH4()
 	for (int i = 0; i < xOrder; i++)
 		xUnitNum *= xDimSize[i];

-	/* a y tensor of size 2 * 3 */
+	/* a y tensor of size (2, 3) */
 	int yOrder = 2;
 	int * yDimSize = new int[yOrder];
 	yDimSize[0] = 2;
@@ -378,7 +388,7 @@ bool TestHardTanH4()
 	for (int i = 0; i < yOrder; i++)
 		yUnitNum *= yDimSize[i];

-	/* a gold tensor of size 2 * 3 */
+	/* a gold tensor of size (2, 3) */
 	int goldOrder = 2;
 	int * goldDimSize = new int[goldOrder];
 	goldDimSize[0] = 2;
@@ -388,7 +398,7 @@ bool TestHardTanH4()
 	for (int i = 0; i < goldOrder; i++)
 		goldUnitNum *= goldDimSize[i];

-	/* a dedy tensor of size 2 * 3 */
+	/* a dedy tensor of size (2, 3) */
 	int dedyOrder = 2;
 	int * dedyDimSize = new int[dedyOrder];
 	dedyDimSize[0] = 2;
@@ -398,7 +408,7 @@ bool TestHardTanH4()
 	for (int i = 0; i < dedyOrder; i++)
 		dedyUnitNum *= dedyDimSize[i];

-	/* a dedx tensor of size 2 * 3 */
+	/* a dedx tensor of size (2, 3) */
 	int dedxOrder = 2;
 	int * dedxDimSize = new int[dedxOrder];
 	dedxDimSize[0] = 2;
@@ -408,16 +418,16 @@ bool TestHardTanH4()
 	for (int i = 0; i < dedxOrder; i++)
 		dedxUnitNum *= dedxDimSize[i];

-	DTYPE xData[2][3] = { {0.5, -1.0, 2.0},
-	                      {3.5, -4.5, 1.0} };
-	DTYPE yData[2][3] = { {0.5, -1.0, 1.0},
-	                      {1.0, -1.0, 1.0} };
-	DTYPE goldData[2][3] = { {1.0, 0.0, 1.0},
-	                         {0.0, 1.0, 1.0} };
-	DTYPE dedyData[2][3] = { {-0.5, 0.0, 0.0},
-	                         {0.0, -2.0, 0.0} };
-	DTYPE answer[2][3] = { {-0.5, 0.0, 0.0},
-	                       {0.0, 0.0, 0.0} };
+	DTYPE xData[2][3] = { {0.5F, -1.0F, 2.0F},
+	                      {3.5F, -4.5F, 1.0F} };
+	DTYPE yData[2][3] = { {0.5F, -1.0F, 1.0F},
+	                      {1.0F, -1.0F, 1.0F} };
+	DTYPE goldData[2][3] = { {1.0F, 0.0F, 1.0F},
+	                         {0.0F, 1.0F, 1.0F} };
+	DTYPE dedyData[2][3] = { {-0.5F, 0.0F, 0.0F},
+	                         {0.0F, -2.0F, 0.0F} };
+	DTYPE answer[2][3] = { {-0.5F, 0.0F, 0.0F},
+	                       {0.0F, 0.0F, 0.0F} };

 	/* CPU test */
 	bool cpuTest = true;
@@ -486,10 +496,9 @@ TODO!!
 */

 /* test for HardTanH Function */
-extern "C"
 bool TestHardTanH()
 {
-	XPRINT(0, stdout, "[TEST HARDTANH] -------------\n");
+	XPRINT(0, stdout, "[TEST HARDTANH] test hardtanh and its backward computation \n");
 	bool returnFlag = true, caseFlag = true;

 	/* case 1 test */

--- a/source/test/TIdentity.cpp
+++ b/source/test/TIdentity.cpp
@@ -23,8 +23,10 @@
 #include "TIdentity.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* case 1: test Identity function.
-* Identity function: y = x 
+
+/*
+case 1: test Identity function.
+Identity function: y = x 
 */
 bool TestIdentity1()
 {
@@ -38,10 +40,10 @@ bool TestIdentity1()
    for (int i = 0; i < sOrder; i++)
        sUnitNum *= sDimSize[i];

-    DTYPE xData[2][3] = { {0.0, 1.0, 2.0}, 
-                          {0.5, 0.7, 1.4} };
-    DTYPE answer[2][3] = { {0.0, 1.0, 2.0}, 
-                          {0.5, 0.7, 1.4} };
+    DTYPE xData[2][3] = { {0.0F, 1.0F, 2.0F}, 
+                          {0.5F, 0.7F, 1.4F} };
+    DTYPE answer[2][3] = { {0.0F, 1.0F, 2.0F}, 
+                           {0.5F, 0.7F, 1.4F} };

    /* CPU test */
    bool cpuTest = true;
@@ -93,8 +95,9 @@ bool TestIdentity1()
 #endif // USE_CUDA
 }

-/* case 2: test IdentityBackward function.
-* IdentityBackward function: dE/dx = dE/dy * dy/dx = dE/dy
+/* 
+case 2: test IdentityBackward function.
+IdentityBackward function: dE/dx = dE/dy * dy/dx = dE/dy
 */
 bool TestIdentity2()
 {
@@ -107,9 +110,9 @@ bool TestIdentity2()
    for (int i = 0; i < sOrder; i++)
        sUnitNum *= sDimSize[i];

-    DTYPE xData[1][3] = { {0.0, 1.0, 2.0} };
-    DTYPE gData[1][3] = { {0.0, 0.0, 1.0} };
-    DTYPE dedxAnswer[3] = {0.090031, 0.244728, -0.334759};
+    DTYPE xData[1][3] = { {0.0F, 1.0F, 2.0F} };
+    DTYPE gData[1][3] = { {0.0F, 0.0F, 1.0F} };
+    DTYPE dedxAnswer[3] = {0.090031F, 0.244728F, -0.334759F};

    /* CPU test */
    bool cpuTest = true;
@@ -135,7 +138,7 @@ bool TestIdentity2()
    IdentityBackward(g, y, x, dedy, dedx, CROSSENTROPY);
    
    /* check result */
-    cpuTest = dedx->CheckData(dedxAnswer, sUnitNum);
+    cpuTest = dedx->CheckData(dedxAnswer, sUnitNum, 1e-4F);

 #ifdef USE_CUDA
    /* GPU test */
@@ -162,7 +165,7 @@ bool TestIdentity2()
    IdentityBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, CROSSENTROPY);
    
    /* check result */
-    gpuTest = dedxGPU->CheckData(dedxAnswer, sUnitNum);
+    gpuTest = dedxGPU->CheckData(dedxAnswer, sUnitNum, 1e-4F);

    /* destroy variables */
    delete x;
@@ -197,7 +200,6 @@ bool TestIdentity2()
 */

 /* test for Identity Function */
-extern "C"
 bool TestIdentity()
 {
    XPRINT(0, stdout, "[TEST Identity] identity function and its backward computation \n");
@@ -213,15 +215,15 @@ bool TestIdentity()
    else
        XPRINT(0, stdout, ">> case 1 passed!\n");

-    ///* case 2 test */
-    //caseFlag = TestIdentity2();
+    /* case 2 test */
+    caseFlag = TestIdentity2();

-    //if (!caseFlag) {
-    //    returnFlag = false;
-    //    XPRINT(0, stdout, ">> case 2 failed!\n");
-    //}
-    //else
-    //    XPRINT(0, stdout, ">> case 2 passed!\n");
+    if (!caseFlag) {
+        returnFlag = false;
+        XPRINT(0, stdout, ">> case 2 failed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> case 2 passed!\n");

    /* other cases test */
    /*

--- a/source/test/TLogSoftmax.cpp
+++ b/source/test/TLogSoftmax.cpp
@@ -23,8 +23,10 @@
 #include "TLogSoftmax.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* case 1: test LogSoftmax function.
-* LogSoftmax function: y = log(e^x / \sum_{i} e^{x_i})
+
+/* 
+case 1: test LogSoftmax function.
+LogSoftmax function: y = log(e^x / \sum_{i} e^{x_i})
 */
 bool TestLogSoftmax1()
 {
@@ -38,10 +40,10 @@ bool TestLogSoftmax1()
    for (int i = 0; i < sOrder; i++)
        sUnitNum *= sDimSize[i];

-    DTYPE xData[2][3] = { {0.0, 1.0, 2.0}, 
-                          {0.5, 0.7, 1.4} };
-    DTYPE answer[2][3] = { {-2.4076, -1.4076, -0.4076}, 
-                           {-1.5435, -1.3435, -0.6435} };
+    DTYPE xData[2][3] = { {0.0F, 1.0F, 2.0F}, 
+                          {0.5F, 0.7F, 1.4F} };
+    DTYPE answer[2][3] = { {-2.4076F, -1.4076F, -0.4076F}, 
+                           {-1.5435F, -1.3435F, -0.6435F} };

    /* CPU test */
    bool cpuTest = true;
@@ -58,7 +60,7 @@ bool TestLogSoftmax1()
    LogSoftmax(x, y, 1);
    
    /* check result */
-    cpuTest = y->CheckData(answer, sUnitNum);
+    cpuTest = y->CheckData(answer, sUnitNum, 1e-4F);

 #ifdef USE_CUDA
    /* GPU test */
@@ -76,7 +78,7 @@ bool TestLogSoftmax1()
    LogSoftmax(xGPU, yGPU, 1);
    
    /* check result */
-    gpuTest = yGPU->CheckData(answer, sUnitNum);
+    gpuTest = yGPU->CheckData(answer, sUnitNum, 1e-4F);

    /* destroy variables */
    delete x;
@@ -97,9 +99,10 @@ bool TestLogSoftmax1()
 #endif // USE_CUDA
 }

-/* case 2: test LogSoftmaxBackward function.
-* dE/dx = dE/dy * dy/dx
-* log softmax: y_i = log(e^{x_i} / \sum_{k} e^{x_k})
+/* 
+case 2: test LogSoftmaxBackward function.
+dE/dx = dE/dy * dy/dx
+log softmax: y_i = log(e^{x_i} / \sum_{k} e^{x_k})
 */
 bool TestLogSoftmax2()
 {
@@ -112,10 +115,10 @@ bool TestLogSoftmax2()
    for (int i = 0; i < sOrder; i++)
        sUnitNum *= sDimSize[i];

-    DTYPE xData[3] = {0.0, 1.0, 2.0};
-    DTYPE gData[3] = {0.5, 0.8, 1.5};
-    DTYPE yAnswer[3] = {-2.4076, -1.4076, -0.4076};
-    DTYPE dedxAnswer[3] = {-0.409969, -0.555272, -0.834759};
+    DTYPE xData[3] = {0.0F, 1.0F, 2.0F};
+    DTYPE gData[3] = {0.5F, 0.8F, 1.5F};
+    DTYPE yAnswer[3] = {-2.4076F, -1.4076F, -0.4076F};
+    DTYPE dedxAnswer[3] = {-0.409969F, -0.555272F, -0.834759F};

    /* CPU test */
    bool cpuTest = true;
@@ -141,7 +144,7 @@ bool TestLogSoftmax2()
    LogSoftmaxBackward(g, y, x, dedy, dedx, 0, CROSSENTROPY);
    
    /* check result */
-    cpuTest = y->CheckData(yAnswer, sUnitNum) && dedx->CheckData(dedxAnswer, sUnitNum);
+    cpuTest = y->CheckData(yAnswer, sUnitNum, 1e-4F) && dedx->CheckData(dedxAnswer, sUnitNum, 1e-4F);

 #ifdef USE_CUDA
    /* GPU test */
@@ -168,7 +171,7 @@ bool TestLogSoftmax2()
    LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 0, CROSSENTROPY);
    
    /* check result */
-    gpuTest = yGPU->CheckData(yAnswer, sUnitNum) && dedxGPU->CheckData(dedxAnswer, sUnitNum);
+    gpuTest = yGPU->CheckData(yAnswer, sUnitNum, 1e-4F) && dedxGPU->CheckData(dedxAnswer, sUnitNum, 1e-4F);

    /* destroy variables */
    delete x;
@@ -197,9 +200,10 @@ bool TestLogSoftmax2()
 #endif // USE_CUDA
 }

-/* case 3: test LogSoftmaxBackward function.
-* dE/dx = dE/dy * dy/dx
-* log softmax: y_i = log(e^{x_i} / \sum_{k} e^{x_k})
+/* 
+case 3: test LogSoftmaxBackward function.
+dE/dx = dE/dy * dy/dx
+log softmax: y_i = log(e^{x_i} / \sum_{k} e^{x_k})
 */
 bool TestLogSoftmax3()
 {
@@ -213,10 +217,10 @@ bool TestLogSoftmax3()
    for (int i = 0; i < sOrder; i++)
        sUnitNum *= sDimSize[i];

-    DTYPE xData[1][3] = { {0.0, 1.0, 2.0} };
-    DTYPE gData[1][3] = { {0.5, 0.8, 1.5} };
-    DTYPE yAnswer[1][3] = {-2.4076, -1.4076, -0.4076};
-    DTYPE dedxAnswer[1][3] = {-0.409969, -0.555272, -0.834759};
+    DTYPE xData[1][3] = { {0.0F, 1.0F, 2.0F} };
+    DTYPE gData[1][3] = { {0.5F, 0.8F, 1.5F} };
+    DTYPE yAnswer[1][3] = {-2.4076F, -1.4076F, -0.4076F};
+    DTYPE dedxAnswer[1][3] = {-0.409969F, -0.555272F, -0.834759F};

    /* CPU test */
    bool cpuTest = true;
@@ -242,7 +246,7 @@ bool TestLogSoftmax3()
    LogSoftmaxBackward(g, y, x, dedy, dedx, 1, CROSSENTROPY);
    
    /* check result */
-    cpuTest = y->CheckData(yAnswer, sUnitNum) && dedx->CheckData(dedxAnswer, sUnitNum);
+    cpuTest = y->CheckData(yAnswer, sUnitNum, 1e-4F) && dedx->CheckData(dedxAnswer, sUnitNum, 1e-4F);

 #ifdef USE_CUDA
    /* GPU test */
@@ -269,7 +273,7 @@ bool TestLogSoftmax3()
    LogSoftmaxBackward(gGPU, yGPU, xGPU, dedyGPU, dedxGPU, 1, CROSSENTROPY);
    
    /* check result */
-    gpuTest = yGPU->CheckData(yAnswer, sUnitNum) && dedxGPU->CheckData(dedxAnswer, sUnitNum);
+    gpuTest = yGPU->CheckData(yAnswer, sUnitNum, 1e-4F) && dedxGPU->CheckData(dedxAnswer, sUnitNum, 1e-4F);

    /* destroy variables */
    delete x;
@@ -305,7 +309,6 @@ bool TestLogSoftmax3()
 */

 /* test for LogSoftmax Function */
-extern "C"
 bool TestLogSoftmax()
 {
    XPRINT(0, stdout, "[TEST LogSoftmax] test log softmax function and its backward computation \n");
@@ -321,15 +324,15 @@ bool TestLogSoftmax()
    else
        XPRINT(0, stdout, ">> case 1 passed!\n");

-    ///* case 2 test */
-    //caseFlag = TestLogSoftmax2();
+    /* case 2 test */
+    caseFlag = TestLogSoftmax2();

-    //if (!caseFlag) {
-    //    returnFlag = false;
-    //    XPRINT(0, stdout, ">> case 2 failed!\n");
-    //}
-    //else
-    //    XPRINT(0, stdout, ">> case 2 passed!\n");
+    if (!caseFlag) {
+        returnFlag = false;
+        XPRINT(0, stdout, ">> case 2 failed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> case 2 passed!\n");

    /* case 3 test */
    caseFlag = TestLogSoftmax3();

--- a/source/test/TLoss.cpp
+++ b/source/test/TLoss.cpp
@@ -23,10 +23,12 @@
 #include "../function/Loss.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* case 1: test LossCompute function 
-* In this case, Loss function name = SQUAREDERROR.
-* loss = sum_{i} 0.5*(t_i - y_i)^2, 
-* where t_i is the gold standard and y_i is the model output
+
+/* 
+case 1: test LossCompute function 
+In this case, Loss function name = SQUAREDERROR.
+loss = sum_{i} 0.5*(t_i - y_i)^2, 
+where t_i is the gold standard and y_i is the model output
 */
 bool TestLoss1()
 {
@@ -99,10 +101,11 @@ bool TestLoss1()
 #endif // USE_CUDA
 }

-/* case 2: test LossCompute function 
-* In this case, Loss function name = CROSSENTROPY.
-* loss = sum_{i} (-t_i * log(y_i))
-* where t_i is the gold standard and y_i is the model output
+/* 
+case 2: test LossCompute function 
+In this case, Loss function name = CROSSENTROPY.
+loss = sum_{i} (-t_i * log(y_i))
+where t_i is the gold standard and y_i is the model output
 */
 bool TestLoss2()
 {
@@ -175,10 +178,11 @@ bool TestLoss2()
 #endif // USE_CUDA
 }

-/* case 3: test LossCompute function 
-* In this case, Loss function name = ONEHOTERROR.
-* loss = sum_{i} e_i
-* where e_i = 0.5*(t_i - y_i)^2 if t_i = 1, e_i = 0 otherwise
+/* 
+case 3: test LossCompute function 
+In this case, Loss function name = ONEHOTERROR.
+loss = sum_{i} e_i
+where e_i = 0.5*(t_i - y_i)^2 if t_i = 1, e_i = 0 otherwise
 */
 bool TestLoss3()
 {
@@ -191,16 +195,16 @@ bool TestLoss3()
    int unitNum = 1;
    for (int i = 0; i < order; i++)
        unitNum *= dimSize[i];
-    DTYPE outputData[5][1] = { {0.5},
-                               {0.5},
-                               {0.5},
-                               {0.5},
-                               {0.5} };
-    DTYPE goldData[5][1] = { {1.0},
-                             {1.0},
-                             {0.0},
-                             {0.0},
-                             {0.0} };
+    DTYPE outputData[5][1] = { {0.5F},
+                               {0.5F},
+                               {0.5F},
+                               {0.5F},
+                               {0.5F} };
+    DTYPE goldData[5][1] = { {1.0F},
+                             {1.0F},
+                             {0.0F},
+                             {0.0F},
+                             {0.0F} };

    /* CPU test */
    bool cpuTest = true;
@@ -263,7 +267,6 @@ TODO!!
 */

 /* test for Loss Function */
-extern "C"
 bool TestLoss()
 {
    XPRINT(0, stdout, "[TEST Loss] compute the loss \n");

--- a/source/test/TMatrixMULBatchedCPU.cpp
+++ b/source/test/TMatrixMULBatchedCPU.cpp
@@ -22,9 +22,10 @@
 #include "TMatrixMULBatchedCPU.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* case 1: matrix multiplication in batch mode (CPU code). 
-* In this case, aList=2*(2, 3), bList=2*(3, 2) -> c=2*(2, 2), 
-* transposedA=X_NOTRANS, transposedB=X_NOTRANS.
+
+/* 
+case 1: matrix multiplication in batch mode (CPU code). 
+In this case, aList=2*(2, 3), bList=2*(3, 2) -> c=2*(2, 2), transposedA=X_NOTRANS, transposedB=X_NOTRANS.
 */
 bool TestMatrixMulBatchedCPU1()
 {
@@ -63,20 +64,20 @@ bool TestMatrixMulBatchedCPU1()
    for (int i = 0; i < cOrder; i++)
        cUnitNum *= cDimSize[i];

-    DTYPE aData1[2][3] = { {1.0, 2.0, 3.0},
-                           {-4.0, 5.0, 6.0} };
-    DTYPE aData2[2][3] = { {1.0, -2.0, -3.0},
-                           {-4.0, 3.0, 2.0} };
-    DTYPE bData1[3][2] = { {0.0, -1.0},
-                           {1.0, 2.0}, 
-                           {2.0, 1.0} };
-    DTYPE bData2[3][2] = { {0.0, 1.0},
-                           {3.0, 2.0}, 
-                           {2.0, 1.0} };
-    DTYPE answer1[2][2] = { {8.0, 6.0}, 
-                            {17.0, 20.0} };
-    DTYPE answer2[2][2] = { {-12.0, -6.0}, 
-                            {13.0, 4.0} };
+    DTYPE aData1[2][3] = { {1.0F, 2.0F, 3.0F},
+                           {-4.0F, 5.0F, 6.0F} };
+    DTYPE aData2[2][3] = { {1.0F, -2.0F, -3.0F},
+                           {-4.0F, 3.0F, 2.0F} };
+    DTYPE bData1[3][2] = { {0.0F, -1.0F},
+                           {1.0F, 2.0F}, 
+                           {2.0F, 1.0F} };
+    DTYPE bData2[3][2] = { {0.0F, 1.0F},
+                           {3.0F, 2.0F}, 
+                           {2.0F, 1.0F} };
+    DTYPE answer1[2][2] = { {8.0F, 6.0F}, 
+                            {17.0F, 20.0F} };
+    DTYPE answer2[2][2] = { {-12.0F, -6.0F}, 
+                            {13.0F, 4.0F} };

    /* CPU test */
    bool cpuTest = true;

--- a/source/test/TMatrixMul.cpp
+++ b/source/test/TMatrixMul.cpp
@@ -22,9 +22,11 @@
 #include "TMatrixMul.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* case 1: matrix multiplication. 
-* In this case, a=(2, 3), b=(3, 2) -> c=(2, 2), 
-* transposedA=X_NOTRANS, transposedB=X_NOTRANS.
+
+/* 
+case 1: matrix multiplication. 
+In this case, a=(2, 3), b=(3, 2) -> c=(2, 2), 
+transposedA=X_NOTRANS, transposedB=X_NOTRANS.
 */
 bool TestMatrixMul1()
 {
@@ -58,13 +60,13 @@ bool TestMatrixMul1()
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];

-    DTYPE sData1[2][3] = { {1.0, 2.0, 3.0},
-                           {-4.0, 5.0, 6.0} };
-    DTYPE sData2[3][2] = { {0.0, -1.0},
-                           {1.0, 2.0}, 
-                           {2.0, 1.0} };
-    DTYPE answer[2][2] = { {8.0, 6.0}, 
-                           {17.0, 20.0} };
+    DTYPE sData1[2][3] = { {1.0F, 2.0F, 3.0F},
+                           {-4.0F, 5.0F, 6.0F} };
+    DTYPE sData2[3][2] = { {0.0F, -1.0F},
+                           {1.0F, 2.0F}, 
+                           {2.0F, 1.0F} };
+    DTYPE answer[2][2] = { {8.0F, 6.0F}, 
+                           {17.0F, 20.0F} };

    /* CPU test */
    bool cpuTest = true;
@@ -130,9 +132,10 @@ bool TestMatrixMul1()
 #endif // USE_CUDA
 }

-/* case 2: matrix multiplication. 
-* In this case, a=(3, 2), b=(3, 2) -> c=(2, 2), 
-* transposedA=X_TRANS, transposedB=X_NOTRANS.
+/* 
+case 2: matrix multiplication. 
+In this case, a=(3, 2), b=(3, 2) -> c=(2, 2), 
+transposedA=X_TRANS, transposedB=X_NOTRANS.
 */
 bool TestMatrixMul2()
 {
@@ -166,14 +169,14 @@ bool TestMatrixMul2()
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];

-    DTYPE sData1[3][2] = { {1.0, -4.0},
-                           {2.0, 5.0},
-                           {3.0, 6.0} };
-    DTYPE sData2[3][2] = { {0.0, -1.0},
-                           {1.0, 2.0},
-                           {2.0, 1.0} };
-    DTYPE answer[2][2] = { {8.0, 6.0},
-                           {17.0, 20.0} };
+    DTYPE sData1[3][2] = { {1.0F, -4.0F},
+                           {2.0F, 5.0F},
+                           {3.0F, 6.0F} };
+    DTYPE sData2[3][2] = { {0.0F, -1.0F},
+                           {1.0F, 2.0F},
+                           {2.0F, 1.0F} };
+    DTYPE answer[2][2] = { {8.0F, 6.0F},
+                           {17.0F, 20.0F} };

    /* CPU test */
    bool cpuTest = true;
@@ -239,9 +242,10 @@ bool TestMatrixMul2()
 #endif // USE_CUDA
 }

-/* case 3: matrix multiplication. 
-* In this case, a=(3, 2, 3), b=(2, 3, 2) -> c=(3, 2, 2, 2), 
-* transposedA=X_NOTRANS, transposedB=X_NOTRANS.
+/* 
+case 3: matrix multiplication. 
+In this case, a=(3, 2, 3), b=(2, 3, 2) -> c=(3, 2, 2, 2), 
+transposedA=X_NOTRANS, transposedB=X_NOTRANS.
 */
 bool TestMatrixMul3()
 {
@@ -279,30 +283,30 @@ bool TestMatrixMul3()
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];

-    DTYPE sData1[3][2][3] = { { {0.0, -1.0, 2.0},
-                                {2.0, 1.0, 3.0} },
-                              { {1.0, 2.0, 4.0}, 
-                                {3.0, 1.0, 2.0}},
-                              { {-1.0, 3.0, 2.0}, 
-                                {1.0, -1.0, 0.0} } };
-    DTYPE sData2[2][3][2] = { { {1.0, 2.0},
-                                {-4.0, 3.0},
-                                {2.0, 6.0} },
-                              { {1.0, 2.0},
-                                {3.0, 4.0},
-                                {5.0, 6.0} } };
-    DTYPE answer[3][2][2][2] = { { { {8.0, 9.0}, 
-                                     {4.0, 25.0} },
-                                   { {7.0, 8.0},
-                                     {20.0, 26.0} } },
-                                 { { {1.0, 32.0},
-                                     {3.0, 21.0} },
-                                   { {27.0, 34.0}, 
-                                     {16.0, 22.0} } },
-                                 { { {-9.0, 19.0},
-                                     {5.0, -1.0} },
-                                   { {18.0, 22.0}, 
-                                     {-2.0, -2.0} } } };
+    DTYPE sData1[3][2][3] = { { {0.0F, -1.0F, 2.0F},
+                                {2.0F, 1.0F, 3.0F} },
+                              { {1.0F, 2.0F, 4.0F}, 
+                                {3.0F, 1.0F, 2.0F}},
+                              { {-1.0F, 3.0F, 2.0F}, 
+                                {1.0F, -1.0F, 0.0F} } };
+    DTYPE sData2[2][3][2] = { { {1.0F, 2.0F},
+                                {-4.0F, 3.0F},
+                                {2.0F, 6.0F} },
+                              { {1.0F, 2.0F},
+                                {3.0F, 4.0F},
+                                {5.0F, 6.0F} } };
+    DTYPE answer[3][2][2][2] = { { { {8.0F, 9.0F}, 
+                                     {4.0F, 25.0F} },
+                                   { {7.0F, 8.0F},
+                                     {20.0F, 26.0F} } },
+                                 { { {1.0F, 32.0F},
+                                     {3.0F, 21.0F} },
+                                   { {27.0F, 34.0F}, 
+                                     {16.0F, 22.0F} } },
+                                 { { {-9.0F, 19.0F},
+                                     {5.0F, -1.0F} },
+                                   { {18.0F, 22.0F}, 
+                                     {-2.0F, -2.0F} } } };

    /* CPU test */
    bool cpuTest = true;
@@ -368,9 +372,10 @@ bool TestMatrixMul3()
 #endif // USE_CUDA
 }

-/* case 4: matrix multiplication. 
-* In this case, a=(3, 2, 3), b=(3, 2) -> c=(3, 2, 2), 
-* transposedA=X_NOTRANS, transposedB=X_NOTRANS.
+/* 
+case 4: matrix multiplication. 
+In this case, a=(3, 2, 3), b=(3, 2) -> c=(3, 2, 2), 
+transposedA=X_NOTRANS, transposedB=X_NOTRANS.
 */
 bool TestMatrixMul4()
 {
@@ -406,21 +411,21 @@ bool TestMatrixMul4()
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];

-    DTYPE sData1[3][2][3] = { { {0.0, -1.0, 2.0},
-                                {2.0, 1.0, 3.0} },
-                              { {1.0, 2.0, 4.0}, 
-                                {3.0, 1.0, 2.0}},
-                              { {-1.0, 3.0, 2.0}, 
-                                {1.0, -1.0, 0.0} } };
-    DTYPE sData2[3][2] = { {1.0, 2.0},
-                           {3.0, 4.0},
-                           {5.0, 6.0} };
-    DTYPE answer[3][2][2] = { { {7.0, 8.0},
-                                {20.0, 26.0} },
-                              { {27.0, 34.0}, 
-                                 {16.0, 22.0} },
-                              { {18.0, 22.0}, 
-                                {-2.0, -2.0} } };
+    DTYPE sData1[3][2][3] = { { {0.0F, -1.0F, 2.0F},
+                                {2.0F, 1.0F, 3.0F} },
+                              { {1.0F, 2.0F, 4.0F}, 
+                                {3.0F, 1.0F, 2.0F}},
+                              { {-1.0F, 3.0F, 2.0F}, 
+                                {1.0F, -1.0F, 0.0F} } };
+    DTYPE sData2[3][2] = { {1.0F, 2.0F},
+                           {3.0F, 4.0F},
+                           {5.0F, 6.0F} };
+    DTYPE answer[3][2][2] = { { {7.0F, 8.0F},
+                                {20.0F, 26.0F} },
+                              { {27.0F, 34.0F}, 
+                                 {16.0F, 22.0F} },
+                              { {18.0F, 22.0F}, 
+                                {-2.0F, -2.0F} } };

    /* CPU test */
    bool cpuTest = true;
@@ -493,7 +498,6 @@ bool TestMatrixMul4()
 */

 /* test for MatrixMul Function */
-extern "C"
 bool TestMatrixMul()
 {
    XPRINT(0, stdout, "[TEST MATRIXMUL] matrix multiplication \n");

--- a/source/test/TMatrixMul2D.cpp
+++ b/source/test/TMatrixMul2D.cpp
@@ -22,9 +22,11 @@
 #include "TMatrixMul2D.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* case 1: matrix multiplication (for 2d tensors). 
-* In this case, a=(2, 3), b=(3, 2) -> c=(2, 2), 
-* transposedA=X_NOTRANS, transposedB=X_NOTRANS.
+
+/* 
+case 1: matrix multiplication (for 2d tensors). 
+In this case, a=(2, 3), b=(3, 2) -> c=(2, 2), 
+transposedA=X_NOTRANS, transposedB=X_NOTRANS.
 */
 bool TestMatrixMul2D1()
 {
@@ -58,13 +60,13 @@ bool TestMatrixMul2D1()
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];

-    DTYPE sData1[2][3] = { {1.0, 2.0, 3.0},
-                           {-4.0, 5.0, 6.0} };
-    DTYPE sData2[3][2] = { {0.0, -1.0},
-                           {1.0, 2.0}, 
-                           {2.0, 1.0} };
-    DTYPE answer[2][2] = { {8.0, 6.0}, 
-                           {17.0, 20.0} };
+    DTYPE sData1[2][3] = { {1.0F, 2.0F, 3.0F},
+                           {-4.0F, 5.0F, 6.0F} };
+    DTYPE sData2[3][2] = { {0.0F, -1.0F},
+                           {1.0F, 2.0F}, 
+                           {2.0F, 1.0F} };
+    DTYPE answer[2][2] = { {8.0F, 6.0F}, 
+                           {17.0F, 20.0F} };

    /* CPU test */
    bool cpuTest = true;
@@ -130,9 +132,10 @@ bool TestMatrixMul2D1()
 #endif // USE_CUDA
 }

-/* case 2: matrix multiplication (for 2d tensors).
-* In this case, a=(3, 2), b=(3, 2) -> c=(2, 2), 
-* transposedA=X_TRANS, transposedB=X_NOTRANS.
+/* 
+case 2: matrix multiplication (for 2d tensors).
+In this case, a=(3, 2), b=(3, 2) -> c=(2, 2), 
+transposedA=X_TRANS, transposedB=X_NOTRANS.
 */
 bool TestMatrixMul2D2()
 {
@@ -166,14 +169,14 @@ bool TestMatrixMul2D2()
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];

-    DTYPE sData1[3][2] = { {1.0, -4.0},
-                           {2.0, 5.0},
-                           {3.0, 6.0} };
-    DTYPE sData2[3][2] = { {0.0, -1.0},
-                           {1.0, 2.0},
-                           {2.0, 1.0} };
-    DTYPE answer[2][2] = { {8.0, 6.0},
-                           {17.0, 20.0} };
+    DTYPE sData1[3][2] = { {1.0F, -4.0F},
+                           {2.0F, 5.0F},
+                           {3.0F, 6.0F} };
+    DTYPE sData2[3][2] = { {0.0F, -1.0F},
+                           {1.0F, 2.0F},
+                           {2.0F, 1.0F} };
+    DTYPE answer[2][2] = { {8.0F, 6.0F},
+                           {17.0F, 20.0F} };

    /* CPU test */
    bool cpuTest = true;

--- a/source/test/TMatrixMul2DParallel.cpp
+++ b/source/test/TMatrixMul2DParallel.cpp
@@ -22,9 +22,11 @@
 #include "TMatrixMul2DParallel.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* case 1: matrix multiplication (for 2d tensors) with multi-threading. 
-* In this case, a=(2, 3), b=(3, 2) -> c=(2, 2), 
-* transposedA=X_NOTRANS, transposedB=X_NOTRANS.
+
+/*
+case 1: matrix multiplication (for 2d tensors) with multi-threading. 
+In this case, a=(2, 3), b=(3, 2) -> c=(2, 2), 
+transposedA=X_NOTRANS, transposedB=X_NOTRANS.
 */
 bool TestMatrixMul2DParallel1()
 {
@@ -58,13 +60,13 @@ bool TestMatrixMul2DParallel1()
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];

-    DTYPE sData1[2][3] = { {1.0, 2.0, 3.0},
-                           {-4.0, 5.0, 6.0} };
-    DTYPE sData2[3][2] = { {0.0, -1.0},
-                           {1.0, 2.0}, 
-                           {2.0, 1.0} };
-    DTYPE answer[2][2] = { {8.0, 6.0}, 
-                           {17.0, 20.0} };
+    DTYPE sData1[2][3] = { {1.0F, 2.0F, 3.0F},
+                           {-4.0F, 5.0F, 6.0F} };
+    DTYPE sData2[3][2] = { {0.0F, -1.0F},
+                           {1.0F, 2.0F}, 
+                           {2.0F, 1.0F} };
+    DTYPE answer[2][2] = { {8.0F, 6.0F}, 
+                           {17.0F, 20.0F} };

    /* CPU test */
    bool cpuTest = true;
@@ -96,9 +98,10 @@ bool TestMatrixMul2DParallel1()
    return cpuTest;
 }

-/* case 2: matrix multiplication (for 2d tensors) with multi-threading.
-* In this case, a=(3, 2), b=(3, 2) -> c=(2, 2), 
-* transposedA=X_TRANS, transposedB=X_NOTRANS.
+/* 
+case 2: matrix multiplication (for 2d tensors) with multi-threading.
+In this case, a=(3, 2), b=(3, 2) -> c=(2, 2), 
+transposedA=X_TRANS, transposedB=X_NOTRANS.
 */
 bool TestMatrixMul2DParallel2()
 {
@@ -132,14 +135,14 @@ bool TestMatrixMul2DParallel2()
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];

-    DTYPE sData1[3][2] = { {1.0, -4.0},
-                           {2.0, 5.0},
-                           {3.0, 6.0} };
-    DTYPE sData2[3][2] = { {0.0, -1.0},
-                           {1.0, 2.0},
-                           {2.0, 1.0} };
-    DTYPE answer[2][2] = { {8.0, 6.0},
-                           {17.0, 20.0} };
+    DTYPE sData1[3][2] = { {1.0F, -4.0F},
+                           {2.0F, 5.0F},
+                           {3.0F, 6.0F} };
+    DTYPE sData2[3][2] = { {0.0F, -1.0F},
+                           {1.0F, 2.0F},
+                           {2.0F, 1.0F} };
+    DTYPE answer[2][2] = { {8.0F, 6.0F},
+                           {17.0F, 20.0F} };

    /* CPU test */
    bool cpuTest = true;
@@ -177,7 +180,6 @@ bool TestMatrixMul2DParallel2()
 */

 /* test for MatrixMul2DParallel Function */
-extern "C"
 bool TestMatrixMul2DParallel()
 {
    XPRINT(0, stdout, "[TEST MatrixMul2DParallel] matrix multiplication (for 2d tensors) with multi-threading \n");

--- a/source/test/TMatrixMulBatched.cpp
+++ b/source/test/TMatrixMulBatched.cpp
@@ -22,9 +22,10 @@
 #include "TMatrixMULBatched.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* case 1: matrix multiplication of the two tensors. 
-* In this case, a=(2, 3), b=(2, 3) -> c=(2, 2), transposedA=X_NOTRANS,
-  transposedB=X_NOTRANS.
+
+/* 
+case 1: matrix multiplication of the two tensors. 
+In this case, a=(2, 3), b=(3, 2) -> c=(2, 2), transposedA=X_NOTRANS, transposedB=X_NOTRANS.
 */
 bool TestMatrixMulBatched1()
 {
@@ -58,13 +59,13 @@ bool TestMatrixMulBatched1()
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];

-    DTYPE sData1[2][3] = { {1.0, 2.0, 3.0},
-                           {-4.0, 5.0, 6.0} };
-    DTYPE sData2[3][2] = { {0.0, -1.0},
-                           {1.0, 2.0}, 
-                           {2.0, 1.0} };
-    DTYPE answer[2][2] = { {8.0, 6.0}, 
-                           {17.0, 20.0} };
+    DTYPE sData1[2][3] = { {1.0F, 2.0F, 3.0F},
+                           {-4.0F, 5.0F, 6.0F} };
+    DTYPE sData2[3][2] = { {0.0F, -1.0F},
+                           {1.0F, 2.0F}, 
+                           {2.0F, 1.0F} };
+    DTYPE answer[2][2] = { {8.0F, 6.0F}, 
+                           {17.0F, 20.0F} };

    /* CPU test */
    bool cpuTest = true;
@@ -130,9 +131,9 @@ bool TestMatrixMulBatched1()
 #endif // USE_CUDA
 }

-/* case 2: matrix multiplication of the two tensors. 
-* In this case, a=(2, 2, 3), b=(2, 3, 2) -> c=(2, 2, 2), 
-* transposedA=X_NOTRANS, transposedB=X_NOTRANS.
+/*
+case 2: matrix multiplication of the two tensors. 
+In this case, a=(2, 2, 3), b=(2, 3, 2) -> c=(2, 2, 2), transposedA=X_NOTRANS, transposedB=X_NOTRANS.
 */
 bool TestMatrixMulBatched2()
 {
@@ -169,20 +170,20 @@ bool TestMatrixMulBatched2()
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];

-    DTYPE sData1[2][2][3] = { { {0.0, -1.0, 2.0},
-                                {2.0, 1.0, 3.0} },
-                              { {1.0, 2.0, 4.0}, 
-                                {3.0, 1.0, 2.0} } };
-    DTYPE sData2[2][3][2] = { { {1.0, 2.0},
-                                {-4.0, 3.0},
-                                {2.0, 6.0} },
-                              { {1.0, 2.0},
-                                {3.0, 4.0},
-                                {5.0, 6.0} } };
-    DTYPE answer[2][2][2] = { { {8.0, 9.0}, 
-                                {4.0, 25.0} },
-                              { {27.0, 34.0}, 
-                                {16.0, 22.0} } };
+    DTYPE sData1[2][2][3] = { { {0.0F, -1.0F, 2.0F},
+                                {2.0F, 1.0F, 3.0F} },
+                              { {1.0F, 2.0F, 4.0F}, 
+                                {3.0F, 1.0F, 2.0F} } };
+    DTYPE sData2[2][3][2] = { { {1.0F, 2.0F},
+                                {-4.0F, 3.0F},
+                                {2.0F, 6.0F} },
+                              { {1.0F, 2.0F},
+                                {3.0F, 4.0F},
+                                {5.0F, 6.0F} } };
+    DTYPE answer[2][2][2] = { { {8.0F, 9.0F}, 
+                                {4.0F, 25.0F} },
+                              { {27.0F, 34.0F}, 
+                                {16.0F, 22.0F} } };

    /* CPU test */
    bool cpuTest = true;
@@ -254,7 +255,6 @@ bool TestMatrixMulBatched2()
 */

 /* test for TestMatrixMulBatched Function */
-extern "C"
 bool TestMatrixMulBatched()
 {
    XPRINT(0, stdout, "[TEST MATRIXMULBATCHED] matrix multiplication of the two tensors \n");

--- a/source/test/TMerge.cpp
+++ b/source/test/TMerge.cpp
@@ -24,8 +24,10 @@
 #include "TMerge.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* case 1: transform a tensor by merging it along with a dimension. 
-* In this case, (3, 2) -> (6), whereToMerge=1, leadingDim=0.
+
+/* 
+case 1: transform a tensor by merging it along with a dimension. 
+In this case, (2, 3) -> (6), whereToMerge=1, leadingDim=0.
 */
 bool TestMerge1()
 {
@@ -48,9 +50,9 @@ bool TestMerge1()
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];

-    DTYPE sData[2][3] = { {0.0, 1.0, 2.0},
-                          {3.0, 4.0, 5.0} };
-    DTYPE answer[6] = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0};
+    DTYPE sData[2][3] = { {0.0F, 1.0F, 2.0F},
+                          {3.0F, 4.0F, 5.0F} };
+    DTYPE answer[6] = {0.0F, 1.0F, 2.0F, 3.0F, 4.0F, 5.0F};
    
    /* CPU test */
    bool cpuTest = true;
@@ -107,8 +109,9 @@ bool TestMerge1()
 #endif // USE_CUDA
 }

-/* case 2: transform a tensor by merging it along with a dimension. 
-* In this case, 
+/* 
+case 2: transform a tensor by merging it along with a dimension. 
+In this case, 
 (2, 2, 3) -> (4, 3), whereToMerge=1, leadingDim=0.
 (2, 2, 3) -> (2, 6), whereToMerge=2, leadingDim=0.
 */
@@ -145,16 +148,16 @@ bool TestMerge2()
    for (int i = 0; i < tOrder2; i++)
        tUnitNum2 *= tDimSize2[i];

-    DTYPE sData[2][2][3] = { { {0.0, 1.0, 2.0},
-                               {4.0, 5.0, 6.0} },
-                             { {-1.0, 2.0, 3.0},
-                               {-4.0, -5.0, -6.0} } };
-    DTYPE answer1[4][3] = { {0.0, 1.0, 2.0},
-                            {4.0, 5.0, 6.0},
-                            {-1.0, 2.0, 3.0},
-                            {-4.0, -5.0, -6.0} };
-    DTYPE answer2[2][6] = { {0.0, 1.0, 2.0, -1.0, 2.0, 3.0},
-                            {4.0, 5.0, 6.0, -4.0, -5.0, -6.0} };
+    DTYPE sData[2][2][3] = { { {0.0F, 1.0F, 2.0F},
+                               {4.0F, 5.0F, 6.0F} },
+                             { {-1.0F, 2.0F, 3.0F},
+                               {-4.0F, -5.0F, -6.0F} } };
+    DTYPE answer1[4][3] = { {0.0F, 1.0F, 2.0F},
+                            {4.0F, 5.0F, 6.0F},
+                            {-1.0F, 2.0F, 3.0F},
+                            {-4.0F, -5.0F, -6.0F} };
+    DTYPE answer2[2][6] = { {0.0F, 1.0F, 2.0F, -1.0F, 2.0F, 3.0F},
+                            {4.0F, 5.0F, 6.0F, -4.0F, -5.0F, -6.0F} };

    /* CPU test */
    bool cpuTest = true;
@@ -222,7 +225,8 @@ bool TestMerge2()
 #endif // USE_CUDA
 }

-/* case 3: merge small tensors into a big tensor. 
+/* 
+case 3: merge small tensors into a big tensor. 
 In this case, 2 * (2, 4) -> (4, 4), whereToMerge=0.
 */
 bool TestMerge3()
@@ -240,10 +244,10 @@ bool TestMerge3()
    for (int i = 0; i < sOrder; i++)
        sUnitNum *= sDimSize[i];

-    DTYPE sData1[2][4] = { {0.0, 1.0, 2.0, 3.0},
-                           {4.0, 5.0, 6.0, 7.0} };
-    DTYPE sData2[2][4] = { {0.0, -1.0, -2.0, -3.0},
-                           {-4.0, -5.0, -6.0, -7.0} };
+    DTYPE sData1[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
+                           {4.0F, 5.0F, 6.0F, 7.0F} };
+    DTYPE sData2[2][4] = { {0.0F, -1.0F, -2.0F, -3.0F},
+                           {-4.0F, -5.0F, -6.0F, -7.0F} };

    /* a target tensor of size (4, 4) */
    int tOrder = 2;
@@ -255,10 +259,10 @@ bool TestMerge3()
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];

-    DTYPE answer[4][4] = { {0.0, 1.0, 2.0, 3.0},
-                           {4.0, 5.0, 6.0, 7.0},
-                           {0.0, -1.0, -2.0, -3.0},
-                           {-4.0, -5.0, -6.0, -7.0} };
+    DTYPE answer[4][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
+                           {4.0F, 5.0F, 6.0F, 7.0F},
+                           {0.0F, -1.0F, -2.0F, -3.0F},
+                           {-4.0F, -5.0F, -6.0F, -7.0F} };

    /* CPU test */
    bool cpuTest = true;
@@ -336,7 +340,8 @@ bool TestMerge3()
 #endif // USE_CUDA
 }

-/* case 4: merge small tensors into a big tensor. 
+/* 
+case 4: merge small tensors into a big tensor. 
 In this case, 2 * (2, 4) -> (2, 8), whereToMerge=1.
 */
 bool TestMerge4()
@@ -354,10 +359,10 @@ bool TestMerge4()
    for (int i = 0; i < sOrder; i++)
        sUnitNum *= sDimSize[i];

-    DTYPE sData1[2][4] = { {0.0, 1.0, 2.0, 3.0},
-                           {4.0, 5.0, 6.0, 7.0} };
-    DTYPE sData2[2][4] = { {0.0, -1.0, -2.0, -3.0},
-                           {-4.0, -5.0, -6.0, -7.0} };
+    DTYPE sData1[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
+                           {4.0F, 5.0F, 6.0F, 7.0F} };
+    DTYPE sData2[2][4] = { {0.0F, -1.0F, -2.0F, -3.0F},
+                           {-4.0F, -5.0F, -6.0F, -7.0F} };

    /* a target tensor of size (2, 8) */
    int tOrder = 2;
@@ -369,8 +374,8 @@ bool TestMerge4()
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];

-    DTYPE answer[2][8] = { {0.0, 1.0, 2.0, 3.0, 0.0, -1.0, -2.0, -3.0},
-                           {4.0, 5.0, 6.0, 7.0, -4.0, -5.0, -6.0, -7.0} };
+    DTYPE answer[2][8] = { {0.0F, 1.0F, 2.0F, 3.0F, 0.0F, -1.0F, -2.0F, -3.0F},
+                           {4.0F, 5.0F, 6.0F, 7.0F, -4.0F, -5.0F, -6.0F, -7.0F} };

    /* CPU test */
    bool cpuTest = true;
@@ -454,7 +459,6 @@ bool TestMerge4()
 */

 /* test for Merge Function */
-extern "C"
 bool TestMerge()
 {
    XPRINT(0, stdout, "[TEST MERGE] transform a tensor by merging it alone with a dimension or merge small tensors into a big tensor\n");

--- a/source/test/TMultiplyElementWise.cpp
+++ b/source/test/TMultiplyElementWise.cpp
@@ -22,9 +22,11 @@
 #include "TMultiplyElementWise.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* case 1: element-wise product of two tensors
-* c(i) = a(i)*b(i) + \alpha * c(i) 
-* In this case, (2, 1)  (2, 1) -> (2, 1), leadingDim=0, alpha=0.
+
+/* 
+case 1: element-wise product of two tensors
+c(i) = a(i)*b(i) + \alpha * c(i) 
+In this case, (2, 1)  (2, 1) -> (2, 1), leadingDim=0, alpha=0.
 */
 bool TestMultiplyElementWise1()
 {
@@ -58,12 +60,12 @@ bool TestMultiplyElementWise1()
 	for (int i = 0; i < tOrder; i++)
 		tUnitNum *= tDimSize[i];

-	DTYPE sData1[2][1] = { {0.0}, 
-                           {1.0} };
-	DTYPE sData2[2][1] = { {2.0},
-                           {3.0} };
-	DTYPE answer[2][1] = { {0.0},
-                           {3.0} };
+	DTYPE sData1[2][1] = { {0.0F}, 
+                           {1.0F} };
+	DTYPE sData2[2][1] = { {2.0F},
+                           {3.0F} };
+	DTYPE answer[2][1] = { {0.0F},
+                           {3.0F} };

 	/* CPU test */
 	bool cpuTest = true;
@@ -129,9 +131,10 @@ bool TestMultiplyElementWise1()
 #endif // USE_CUDA
 }

-/* case 2: element-wise product of two tensors
-* c(i) = a(i)*b(i) + \alpha * c(i)
-* In this case, (2, 2)  (2, 2) -> (2, 2), leadingDim=0, alpha=0.
+/* 
+case 2: element-wise product of two tensors
+c(i) = a(i)*b(i) + \alpha * c(i)
+In this case, (2, 2)  (2, 2) -> (2, 2), leadingDim=0, alpha=0.
 */
 bool TestMultiplyElementWise2()
 {
@@ -165,12 +168,12 @@ bool TestMultiplyElementWise2()
 	for (int i = 0; i < tOrder; i++)
 		tUnitNum *= tDimSize[i];

-	DTYPE sData1[2][2] = { {0.0, 1.0},
-	                       {2.0, 3.0} };
-	DTYPE sData2[2][2] = { {0.0, 1.0},
-	                       {2.0, 3.0} };
-	DTYPE answer[2][2] = { {0.0, 1.0},
-	                       {4.0, 9.0} };
+	DTYPE sData1[2][2] = { {0.0F, 1.0F},
+	                       {2.0F, 3.0F} };
+	DTYPE sData2[2][2] = { {0.0F, 1.0F},
+	                       {2.0F, 3.0F} };
+	DTYPE answer[2][2] = { {0.0F, 1.0F},
+	                       {4.0F, 9.0F} };

 	/* CPU test */
 	bool cpuTest = true;
@@ -236,8 +239,9 @@ bool TestMultiplyElementWise2()
 #endif // USE_CUDA
 }

-/* case 3: element-wise product of two tensors, c(i) = a(i)*b(i) + \alpha * c(i)
-* In this case, (2, 2)  (2, 2) -> (2, 2), leadingDim=1, alpha=0.
+/* 
+case 3: element-wise product of two tensors, c(i) = a(i)*b(i) + \alpha * c(i)
+In this case, (2, 2)  (2, 2) -> (2, 2), leadingDim=1, alpha=0.
 */
 bool TestMultiplyElementWise3()
 {
@@ -271,12 +275,12 @@ bool TestMultiplyElementWise3()
 	for (int i = 0; i < tOrder; i++)
 		tUnitNum *= tDimSize[i];

-	DTYPE sData1[2][2] = { {0.0, 1.0},
-	                       {2.0, 3.0} };
-	DTYPE sData2[2][2] = { {0.0, 1.0},
-	                       {2.0, 3.0} };
-	DTYPE answer[2][2] = { {0.0, 1.0},
-	                       {4.0, 9.0} };
+	DTYPE sData1[2][2] = { {0.0F, 1.0F},
+	                       {2.0F, 3.0F} };
+	DTYPE sData2[2][2] = { {0.0F, 1.0F},
+	                       {2.0F, 3.0F} };
+	DTYPE answer[2][2] = { {0.0F, 1.0F},
+	                       {4.0F, 9.0F} };

 	/* CPU test */
 	bool cpuTest = true;
@@ -348,7 +352,6 @@ TODO!!
 */

 /* test for MultiplyElementWise Function */
-extern "C"
 bool TestMultiplyElementWise()
 {
 	XPRINT(0, stdout, "[TEST MULTIPLYELEMENTWISE] element-wise product of two tensors \n");

--- a/source/test/TNegate.cpp
+++ b/source/test/TNegate.cpp
@@ -22,6 +22,7 @@
 #include "TNegate.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)
+
 /* case 1: set every entry to its minus value */
 bool TestNegate1()
 {
@@ -35,12 +36,12 @@ bool TestNegate1()
 	for (int i = 0; i < aOrder; i++)
 		aUnitNum *= aDimSize[i];

-	DTYPE aData[3][2] = { {1.0, -2.0}, 
-	                      {-3.0, 4.0},
-	                      {5.0, -6.0} };
-	DTYPE answer[3][2] = { {-1.0, 2.0},
-	                       {3.0, -4.0},
-	                       {-5.0, 6.0} };
+	DTYPE aData[3][2] = { {1.0F, -2.0F}, 
+	                      {-3.0F, 4.0F},
+	                      {5.0F, -6.0F} };
+	DTYPE answer[3][2] = { {-1.0F, 2.0F},
+	                       {3.0F, -4.0F},
+	                       {-5.0F, 6.0F} };

 	/* CPU test */
 	bool cpuTest = true;
@@ -101,12 +102,12 @@ bool TestNegate2()
 	for (int i = 0; i < aOrder; i++)
 		aUnitNum *= aDimSize[i];

-	DTYPE aData[3][2] = { {0.0, 0.0},
-	                      {0.0, 0.0},
-	                      {0.0, 0.0} };
-	DTYPE answer[3][2] = { {-0.0, -0.0},
-	                       {-0.0, -0.0},
-	                       {-0.0, -0.0} };
+	DTYPE aData[3][2] = { {0.0F, 0.0F},
+	                      {0.0F, 0.0F},
+	                      {0.0F, 0.0F} };
+	DTYPE answer[3][2] = { {-0.0F, -0.0F},
+	                       {-0.0F, -0.0F},
+	                       {-0.0F, -0.0F} };

 	/* CPU test */
 	bool cpuTest = true;
@@ -160,7 +161,6 @@ TODO!!
 */

 /* test for Negate Function */
-extern "C"
 bool TestNegate()
 {
 	XPRINT(0, stdout, "[TEST NEGATE] set every entry to its minus value \n");

--- a/source/test/TNormalize.cpp
+++ b/source/test/TNormalize.cpp
@@ -22,10 +22,12 @@
 #include "TNormalize.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* case 1: normalized the data with normal distribution 
-* For an input x, y = a * (x-mean)/sqrt(variance+\epsilon) + b.
-* where a and b are the scalar and bias respectively, 
-* and \epsilon is the adjustment parameter.
+
+/*
+case 1: normalize the data with normal distribution
+For an input x, y = a * (x-mean)/sqrt(variance+\epsilon) + b.
+where a and b are the scalar and bias respectively, 
+and \epsilon is the adjustment parameter.
 */
 bool TestNormalize1()
 {
@@ -87,14 +89,14 @@ bool TestNormalize1()
 	for (int i = 0; i < bOrder; i++)
 		bUnitNum *= bDimSize[i];

-	DTYPE sData[2][3] = { {1.0, 2.0, 3.0},
-	                      {1.5, 2.5, 3.5} };
-	DTYPE meanData[3] = {1.0, 1.5, 2.0};
-	DTYPE varData[3] = {1.0, 1.0, 4.0};
-    DTYPE aData[2][3] = { {1.0, 1.0, 1.0},
-	                      {1.0, 1.0, 1.0} };
-	DTYPE answer[2][3] = { {0.0, 0.5, 0.5},
-	                       {0.5, 1.0, 0.75} };
+	DTYPE sData[2][3] = { {1.0F, 2.0F, 3.0F},
+	                      {1.5F, 2.5F, 3.5F} };
+	DTYPE meanData[3] = {1.0F, 1.5F, 2.0F};
+	DTYPE varData[3] = {1.0F, 1.0F, 4.0F};
+    DTYPE aData[2][3] = { {1.0F, 1.0F, 1.0F},
+	                      {1.0F, 1.0F, 1.0F} };
+	DTYPE answer[2][3] = { {0.0F, 0.5F, 0.5F},
+	                       {0.5F, 1.0F, 0.75F} };

 	/* CPU test */
 	bool cpuTest = true;
@@ -116,7 +118,7 @@ bool TestNormalize1()
 	t->SetZeroAll();

 	/* call normalize function */
-	Normalize(s, t, 0, mean, var, a, b, 0.0);
+	Normalize(s, t, 0, mean, var, a, b, 0.0F);
    
 	/* check results */
 	cpuTest = t->CheckData(answer, tUnitNum, 1e-4, 0);
@@ -142,7 +144,7 @@ bool TestNormalize1()
 	tGPU->SetZeroAll();

 	/* call Normalize function */
-	Normalize(sGPU, tGPU, 0, meanGPU, varGPU, aGPU, bGPU, 0.0);
+	Normalize(sGPU, tGPU, 0, meanGPU, varGPU, aGPU, bGPU, 0.0F);

 	/* check results */
 	gpuTest = tGPU->CheckData(answer, tUnitNum, 1e-4, 0);
@@ -193,7 +195,6 @@ TODO!!
 */

 /* test for Normalize Function */
-extern "C"
 bool TestNormalize()
 {
 	XPRINT(0, stdout, "[TEST NORMALIZE] normalized the data with normal distribution \n");

--- a/source/test/TPower.cpp
+++ b/source/test/TPower.cpp
@@ -23,8 +23,10 @@
 #include "TPower.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* case 1: get the power(a, p) 
-* In this case, p=2.
+
+/* 
+case 1: get the power(a, p) 
+In this case, p=2.
 */
 bool TestPower1()
 {
@@ -38,12 +40,12 @@ bool TestPower1()
 	for (int i = 0; i < aOrder; i++)
 		aUnitNum *= aDimSize[i];

-	DTYPE aData[3][2] = { {1.0, 2.0},
-	                      {3.0, 4.0},
-	                      {5.0, 6.0} };
-	DTYPE answer[3][2] = { {1.0, 4.0},
-	                       {9.0, 16.0},
-	                       {25.0, 36.0} };
+	DTYPE aData[3][2] = { {1.0F, 2.0F},
+	                      {3.0F, 4.0F},
+	                      {5.0F, 6.0F} };
+	DTYPE answer[3][2] = { {1.0F, 4.0F},
+	                       {9.0F, 16.0F},
+	                       {25.0F, 36.0F} };

 	/* CPU test */
 	bool cpuTest = true;
@@ -55,7 +57,7 @@ bool TestPower1()
 	a->SetData(aData, aUnitNum);

 	/* call Power function */
-	Power(a, 2.0);
+	Power(a, 2.0F);

 	/* check results */
 	cpuTest = a->CheckData(answer, aUnitNum, 1e-4F);
@@ -71,7 +73,7 @@ bool TestPower1()
 	aGPU->SetData(aData, aUnitNum);

 	/* call power function */
-	Power(aGPU, 2.0);
+	Power(aGPU, 2.0F);

 	/* check results */
 	gpuTest = aGPU->CheckData(answer, aUnitNum, 1e-4F);
@@ -91,8 +93,9 @@ bool TestPower1()
 #endif // USE_CUDA
 }

-/* case 2: get the power(a, p)
-* In this case, p=1.
+/* 
+case 2: get the power(a, p)
+In this case, p=1.
 */
 bool TestPower2()
 {
@@ -106,12 +109,12 @@ bool TestPower2()
 	for (int i = 0; i < aOrder; i++)
 		aUnitNum *= aDimSize[i];

-	DTYPE aData[3][2] = { {0.0, 1.0},
-	                      {2.0, 3.0},
-	                      {4.0, 5.0} };
-	DTYPE answer[3][2] = { {0.0, 1.0},
-	                       {2.0, 3.0},
-	                       {4.0, 5.0} };
+	DTYPE aData[3][2] = { {0.0F, 1.0F},
+	                      {2.0F, 3.0F},
+	                      {4.0F, 5.0F} };
+	DTYPE answer[3][2] = { {0.0F, 1.0F},
+	                       {2.0F, 3.0F},
+	                       {4.0F, 5.0F} };

 	/* CPU test */
 	bool cpuTest = true;
@@ -123,7 +126,7 @@ bool TestPower2()
 	a->SetData(aData, aUnitNum);

 	/* call Power function */
-	Power(a, 1.0);
+	Power(a, 1.0F);

 	/* check results */
 	cpuTest = a->CheckData(answer, aUnitNum, 1e-4F);
@@ -139,7 +142,7 @@ bool TestPower2()
 	aGPU->SetData(aData, aUnitNum);

 	/* call Power function */
-	Power(aGPU, 1.0);
+	Power(aGPU, 1.0F);

 	/* check results */
 	gpuTest = aGPU->CheckData(answer, aUnitNum, 1e-4F);
@@ -159,8 +162,9 @@ bool TestPower2()
 #endif // USE_CUDA
 }

-/* case 3: get the power(a, p)
-* In this case, p=0.
+/* 
+case 3: get the power(a, p)
+In this case, p=0.
 */
 bool TestPower3()
 {
@@ -174,12 +178,12 @@ bool TestPower3()
 	for (int i = 0; i < aOrder; i++)
 		aUnitNum *= aDimSize[i];

-	DTYPE aData[3][2] = { {0.0, 1.0},
-	                      {2.0, 3.0},
-	                      {4.0, 5.0} };
-	DTYPE answer[3][2] = { {1.0, 1.0},
-	                       {1.0, 1.0},
-	                       {1.0, 1.0} };
+	DTYPE aData[3][2] = { {0.0F, 1.0F},
+	                      {2.0F, 3.0F},
+	                      {4.0F, 5.0F} };
+	DTYPE answer[3][2] = { {1.0F, 1.0F},
+	                       {1.0F, 1.0F},
+	                       {1.0F, 1.0F} };

 	/* CPU test */
 	bool cpuTest = true;
@@ -191,7 +195,7 @@ bool TestPower3()
 	a->SetData(aData, aUnitNum);

 	/* call Power function */
-	Power(a, 0.0);
+	Power(a, 0.0F);

 	/* check results */
 	cpuTest = a->CheckData(answer, aUnitNum, 1e-4F);
@@ -207,7 +211,7 @@ bool TestPower3()
 	aGPU->SetData(aData, aUnitNum);

 	/* call Power function */
-	Power(aGPU, 0.0);
+	Power(aGPU, 0.0F);

 	/* check results */
 	gpuTest = aGPU->CheckData(answer, aUnitNum, 1e-4F);
@@ -233,7 +237,6 @@ TODO!!
 */

 /* test for Power Function */
-extern "C"
 bool TestPower()
 {
 	XPRINT(0, stdout, "[TEST POWER] get the power(a, p) \n");

--- a/source/test/TRectify.cpp
+++ b/source/test/TRectify.cpp
@@ -22,8 +22,10 @@
 #include "TRectify.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* case 1: test rectify function
-* y = max(0, x) 
+
+/* 
+case 1: test rectify function
+In this case, y = max(0, x) 
 */
 bool TestRectify1()
 {
@@ -47,10 +49,10 @@ bool TestRectify1()
    for (int i = 0; i < yOrder; i++)
        yUnitNum *= yDimSize[i];

-    DTYPE xData[2][3] = { {0.0, -1.0, 2.0},
-                          {3.0, -4.0, -5.0} };
-    DTYPE answer[2][3] = { {0.0, 0.0, 2.0},
-                           {3.0, 0.0, 0.0} };
+    DTYPE xData[2][3] = { {0.0F, -1.0F, 2.0F},
+                          {3.0F, -4.0F, -5.0F} };
+    DTYPE answer[2][3] = { {0.0F, 0.0F, 2.0F},
+                           {3.0F, 0.0F, 0.0F} };

    /* CPU test */
    bool cpuTest = true;
@@ -107,10 +109,11 @@ bool TestRectify1()
 #endif // USE_CUDA
 }

-/* case 2: backward computation 
-* dE/dx = dE/dy * dy/dx 
-* rectified: y = max(0, x) 
-* In this case, lossName=CROSSENTROPY.
+/* 
+case 2: backward computation 
+dE/dx = dE/dy * dy/dx 
+rectified: y = max(0, x) 
+In this case, lossName=CROSSENTROPY.
 */
 bool TestRectify2()
 {
@@ -124,16 +127,16 @@ bool TestRectify2()
 	for (int i = 0; i < xOrder; i++)
 		xUnitNum *= xDimSize[i];

-	DTYPE xData[2][3] = { {1.0, 1.0, 2.0},
-	                      {2.0, 4.0, 5.0} };
-	DTYPE yData[2][3] = { {1.0, 1.0, 2.0},
-	                      {2.0, 4.0, 5.0} };
-	DTYPE goldData[2][3] = { {1.0, 1.0, 1.0},
-	                         {1.0, 1.0, 1.0} };
-	DTYPE dedyData[2][3] = { {-1.0, -1.0, -0.5},
-	                         {-0.5, -0.25, -0.2} };
-	DTYPE answer[2][3] = { {-1.0, -1.0, -0.5},
-	                       {-0.5, -0.25, -0.2} };
+	DTYPE xData[2][3] = { {1.0F, 1.0F, 2.0F},
+	                      {2.0F, 4.0F, 5.0F} };
+	DTYPE yData[2][3] = { {1.0F, 1.0F, 2.0F},
+	                      {2.0F, 4.0F, 5.0F} };
+	DTYPE goldData[2][3] = { {1.0F, 1.0F, 1.0F},
+	                         {1.0F, 1.0F, 1.0F} };
+	DTYPE dedyData[2][3] = { {-1.0F, -1.0F, -0.5F},
+	                         {-0.5F, -0.25F, -0.2F} };
+	DTYPE answer[2][3] = { {-1.0F, -1.0F, -0.5F},
+	                       {-0.5F, -0.25F, -0.2F} };

 	/* CPU test */
 	bool cpuTest = true;
@@ -215,7 +218,6 @@ TODO!!
 */

 /* test for Rectify Function */
-extern "C"
 bool TestRectify()
 {
    XPRINT(0, stdout, "[TEST RECTIFY] test rectify and its backward computation \n");

--- a/source/test/TReduceMax.cpp
+++ b/source/test/TReduceMax.cpp
@@ -22,8 +22,10 @@
 #include "TReduceMax.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* case 1: get the max value of the items along a dimension of the tensor. 
-* In this case,
+
+/* 
+case 1: get the max value of the items along a dimension of the tensor. 
+In this case,
 (2, 4) -> (4), dim = 0
 (2, 4) -> (2), dim = 1
 */
@@ -57,10 +59,10 @@ bool TestReduceMax1()
    for (int i = 0; i < tOrder2; i++)
        tUnitNum2 *= tDimSize2[i];

-    DTYPE sData[2][4] = { {0.0, 5.0, 2.0, 3.0},
-                          {4.0, 1.0, 6.0, 7.0} };
-    DTYPE answer1[4] = {4.0, 5.0, 6.0, 7.0};
-    DTYPE answer2[2] = {5.0, 7.0};
+    DTYPE sData[2][4] = { {0.0F, 5.0F, 2.0F, 3.0F},
+                          {4.0F, 1.0F, 6.0F, 7.0F} };
+    DTYPE answer1[4] = {4.0F, 5.0F, 6.0F, 7.0F};
+    DTYPE answer2[2] = {5.0F, 7.0F};

    /* CPU test */
    bool cpuTest = true;
@@ -134,7 +136,6 @@ TODO!!
 */

 /* test for ReduceMax Function */
-extern "C"
 bool TestReduceMax()
 {
    XPRINT(0, stdout, "[TEST ReduceMax] get the max value of the items along a dimension of the tensor\n");

--- a/source/test/TReduceMean.cpp
+++ b/source/test/TReduceMean.cpp
@@ -22,6 +22,7 @@
 #include "TReduceMean.h"

 namespace nts { // namespace nt(NiuTrans.Tensor)
+
 /* case 1: get the mean value along a dimension of the tensor */
 bool TestReduceMean1()
 {
@@ -53,10 +54,10 @@ bool TestReduceMean1()
    for (int i = 0; i < tOrder2; i++)
        tUnitNum2 *= tDimSize2[i];

-    DTYPE sData[2][4] = { { 0.0, 1.0, 2.0, 3.0 },
-                          { 4.0, 5.0, 6.0, 7.0 } };
-    DTYPE answer1[4] = {2.0, 3.0,  4.0,  5.0};
-    DTYPE answer2[2] = {1.5, 5.5};
+    DTYPE sData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
+                          {4.0F, 5.0F, 6.0F, 7.0F} };
+    DTYPE answer1[4] = {2.0F, 3.0F, 4.0F, 5.0F};
+    DTYPE answer2[2] = {1.5F, 5.5F};

    /* CPU test */
    bool cpuTest = true;
@@ -124,104 +125,12 @@ bool TestReduceMean1()
 #endif // USE_CUDA
 }

-bool TestReduceMeanForLargescale()
-{
-    /* a tensor of size 10000 * 500 */
-    int order = 2;
-    int order_reduce = 1;
-    int * dimSize = new int[order];
-    dimSize[0] = 10000;
-    dimSize[1] = 500;
-
-    int unitNum = 1;
-    for (int i = 0; i < order; i++)
-        unitNum *= dimSize[i];
-    /* a tensor of size 500 */
-    int * dimSize_reduce_a = new int[order_reduce];
-    dimSize_reduce_a[0] = 500;
-
-    int unitNum_a = 1;
-    for (int i = 0; i < order_reduce; i++)
-        unitNum_a *= dimSize_reduce_a[i];
-    /* a tensor of size 10000 */
-    int * dimSize_reduce_b = new int[order_reduce];
-    dimSize_reduce_b[0] = 10000;
-
-    int unitNum_b = 1;
-    for (int i = 0; i < order_reduce; i++)
-        unitNum_b *= dimSize_reduce_b[i];
-
-    DTYPE * data = new DTYPE[5000000];
-    DTYPE * tmp = data;
-    for (int i = 0; i < unitNum; i++)
-        *tmp++ = 1;
-    DTYPE answer_a[500];
-    for (int i = 0; i < unitNum_a; i++)
-        answer_a[i] = 1;
-    DTYPE answer_b[10000];
-    for (int i = 0; i < unitNum_b; i++)
-        answer_b[i] = 1;
-
-    /* CPU test */
-    bool cpuTest = true;
-
-    /* create tensors */
-    XTensor * a = NewTensor(order, dimSize);
-    XTensor * reduce_a = NewTensor(order_reduce, dimSize_reduce_a);
-    XTensor * b = NewTensor(order, dimSize);
-    XTensor * reduce_b = NewTensor(order_reduce, dimSize_reduce_b);
-
-    /* initialize variables */
-    a->SetData(data, unitNum);
-    b->SetData(data, unitNum);
-    /* call reduce max function */
-    ReduceMean(a, reduce_a, 0);
-    ReduceMean(b, reduce_b, 1);
-
-    /* check results */
-    cpuTest = reduce_a->CheckData(answer_a, unitNum_a) && reduce_b->CheckData(answer_b, unitNum_b);
-
-#ifdef USE_CUDA
-    /* GPU test */
-    bool gpuTest = true;
-
-    /* create tensor */
-    XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT);
-    XTensor * reduce_aGPU = NewTensor(order_reduce, dimSize_reduce_a, X_FLOAT);
-    XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT);
-    XTensor * reduce_bGPU = NewTensor(order_reduce, dimSize_reduce_b, X_FLOAT);
-
-    /* Initialize variables */
-    aGPU->SetData(data, unitNum);
-    bGPU->SetData(data, unitNum);
-
-    /* call reduce max function */
-    ReduceMean(aGPU, reduce_aGPU, 0);
-    ReduceMean(bGPU, reduce_bGPU, 1);
-
-    /* check results */
-    gpuTest = reduce_aGPU->CheckData(answer_a, unitNum_a) && reduce_bGPU->CheckData(answer_b, unitNum_b);
-
-    /* destroy variables */
-    delete aGPU, bGPU, reduce_aGPU, reduce_bGPU;
-    delete[] dimSize, dimSize_reduce_a, dimSize_reduce_b;
-    return cpuTest && gpuTest;
-#else
-    /* destroy variables */
-    delete a;
-    delete b;
-
-    return cpuTest;
-#endif // USE_CUDA
-}
-
 /* other cases */
 /*
 TODO!!
 */

 /* test for ReduceMean Function */
-extern "C"
 bool TestReduceMean()
 {
    XPRINT(0, stdout, "[TEST ReduceMean] get the mean value along a dimension of the tensor \n");
@@ -236,15 +145,6 @@ bool TestReduceMean()
    else
        XPRINT(0, stdout, ">> case 1 passed!\n");

-    /* case 2 test */
-    caseFlag = TestReduceMeanForLargescale();
-    if (!caseFlag) {
-        returnFlag = false;
-        XPRINT(0, stdout, ">> case 2 failed!\n");
-    }
-    else
-        XPRINT(0, stdout, ">> case 2 passed!\n");
-
    ///* other cases test */
    ///*
    //TODO!!

--- a/source/test/TReduceMean.h
+++ b/source/test/TReduceMean.h
@@ -24,13 +24,13 @@

 #include "../core/ReduceMean.h"

-namespace nts { // namespace nt(NiuTrans.Tensor)
+namespace nts { // namespace nts(NiuTrans.Tensor)

 /* test for ReduceMean Function */
 extern "C"
 bool TestReduceMean();

-} // namespace nt(NiuTrans.Tensor)
+} // namespace nts(NiuTrans.Tensor)
 #endif // __TEST_REDUCEMEAN_H__


--- a/source/test/TReduceSum.cpp
+++ b/source/test/TReduceSum.cpp
@@ -22,8 +22,10 @@
 #include "TReduceSum.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* case 1: sum the items along a dimension of the tensor.
-* In this case, 
+
+/* 
+case 1: sum the items along a dimension of the tensor.
+In this case, 
 (2, 4) -> (4), dim = 0
 (2, 4) -> (2), dim = 1
 */
@@ -57,10 +59,10 @@ bool TestReduceSum1()
    for (int i = 0; i < tOrder2; i++)
        tUnitNum2 *= tDimSize2[i];

-    DTYPE sData[2][4] = { {0.0, 1.0, 2.0, 3.0},
-                           {4.0, 5.0, 6.0, 7.0} };
-    DTYPE answer1[4] = {4.0, 6.0, 8.0, 10.0};
-    DTYPE answer2[2] = {6.0, 22.0};
+    DTYPE sData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
+                           {4.0F, 5.0F, 6.0F, 7.0F} };
+    DTYPE answer1[4] = {4.0F, 6.0F, 8.0F, 10.0F};
+    DTYPE answer2[2] = {6.0F, 22.0F};

    /* CPU test */
    bool cpuTest = true;
@@ -128,103 +130,12 @@ bool TestReduceSum1()
 #endif // USE_CUDA
 }

-bool TestReduceSumForLargescale()
-{
-    /* a tensor of size 10000 * 500 */
-    int order = 2;
-    int orderReduce = 1;
-    int * dimSize = new int[order];
-    dimSize[0] = 10000;
-    dimSize[1] = 500;
-
-    int unitNum = 1;
-    for (int i = 0; i < order; i++)
-        unitNum *= dimSize[i];
-    /* a tensor of size 500 */
-    int * dimSize_reduce_a = new int[orderReduce];
-    dimSize_reduce_a[0] = 500;
-
-    int unitNum_a = 1;
-    for (int i = 0; i < orderReduce; i++)
-        unitNum_a *= dimSize_reduce_a[i];
-    /* a tensor of size 10000 */
-    int * dimSize_reduce_b = new int[orderReduce];
-    dimSize_reduce_b[0] = 10000;
-
-    int unitNum_b = 1;
-    for (int i = 0; i < orderReduce; i++)
-        unitNum_b *= dimSize_reduce_b[i];
-
-    DTYPE * data = new DTYPE[5000000];
-    DTYPE * tmp = data;
-    for (int i = 0; i < unitNum; i++)
-        *tmp++ = 1;
-    DTYPE answer_a[500];
-    for (int i = 0; i < unitNum_a; i++)
-        answer_a[i] = 10000;
-    DTYPE answer_b[10000];
-    for (int i = 0; i < unitNum_b; i++)
-        answer_b[i] = 500;
-
-    /* CPU test */
-    bool cpuTest = true;
-
-    /* create tensors */
-    XTensor * a = NewTensor(order, dimSize);
-    XTensor * reduce_a = NewTensor(orderReduce, dimSize_reduce_a);
-    XTensor * b = NewTensor(order, dimSize);
-    XTensor * reduce_b = NewTensor(orderReduce, dimSize_reduce_b);
-
-    /* initialize variables */
-    a->SetData(data, unitNum);
-    b->SetData(data, unitNum);
-    /* call reduce sum function */
-    ReduceSum(a, reduce_a, 0);
-    ReduceSum(b, reduce_b, 1);
-
-    /* check results */
-    cpuTest = reduce_a->CheckData(answer_a, unitNum_a) && reduce_b->CheckData(answer_b, unitNum_b);
-
-#ifdef USE_CUDA
-    /* GPU test */
-    bool gpuTest = true;
-
-    /* create tensor */
-    XTensor * aGPU = NewTensor(order, dimSize, X_FLOAT);
-    XTensor * reduce_aGPU = NewTensor(orderReduce, dimSize_reduce_a, X_FLOAT);
-    XTensor * bGPU = NewTensor(order, dimSize, X_FLOAT);
-    XTensor * reduce_bGPU = NewTensor(orderReduce, dimSize_reduce_b, X_FLOAT);
-
-    /* Initialize variables */
-    aGPU->SetData(data, unitNum);
-    bGPU->SetData(data, unitNum);
-
-    /* call reduce max function */
-    ReduceSum(aGPU, reduce_aGPU, 0);
-    ReduceSum(bGPU, reduce_bGPU, 1);
-
-    /* check results */
-    gpuTest = reduce_aGPU->CheckData(answer_a, unitNum_a) && reduce_bGPU->CheckData(answer_b, unitNum_b);
-
-    /* destroy variables */
-    delete aGPU, bGPU, reduce_aGPU, reduce_bGPU;
-    delete[] dimSize, dimSize_reduce_a, dimSize_reduce_b;
-    return cpuTest && gpuTest;
-#else
-    /* destroy variables */
-    delete a;
-    delete b;
-    return cpuTest;
-#endif // USE_CUDA
-}
-
 /* other cases */
 /*
 TODO!!
 */

 /* test for ReduceSum Function */
-extern "C"
 bool TestReduceSum()
 {
    XPRINT(0, stdout, "[TEST ReduceSum] sum the items along a dimension of the tensor.\n");
@@ -239,15 +150,6 @@ bool TestReduceSum()
    else
        XPRINT(0, stdout, ">> case 1 passed!\n");

-    /* case 2 test */
-    caseFlag = TestReduceSumForLargescale();
-    if (!caseFlag) {
-        returnFlag = false;
-        XPRINT(0, stdout, ">> case 2 failed!\n");
-    }
-    else
-        XPRINT(0, stdout, ">> case 2 passed!\n");
-
    /* other cases test */
    /*
    TODO!!

--- a/source/test/TReduceSumSquared.cpp
+++ b/source/test/TReduceSumSquared.cpp
@@ -22,9 +22,11 @@
 #include "TReduceSumSquared.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* case 1: squared sum of the items along a dimension of the tensor. 
-* For a 1-dimensional data array a, sum = \sum_i (a_i - shift)^2.
-* In this case, (2, 4) -> (4), dim = 0.
+
+/* 
+case 1: squared sum of the items along a dimension of the tensor. 
+For a 1-dimensional data array a, sum = \sum_i (a_i - shift)^2.
+In this case, (2, 4) -> (4), dim = 0.
 */
 bool TestReduceSumSquared1()
 {
@@ -56,10 +58,10 @@ bool TestReduceSumSquared1()
    for (int i = 0; i < shiftOrder; i++)
        shiftUnitNum *= shiftDimSize[i];

-    DTYPE sData[2][4] = { {0.0, 1.0, 2.0, 3.0},
-                          {4.0, 5.0, 6.0, 7.0} };
-    DTYPE shiftData[4] = {1.0, -1.0, -1.0, 0.0};
-    DTYPE answer[4] = {10.0, 40.0, 58.0, 58.0};
+    DTYPE sData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
+                          {4.0F, 5.0F, 6.0F, 7.0F} };
+    DTYPE shiftData[4] = {1.0F, -1.0F, -1.0F, 0.0F};
+    DTYPE answer[4] = {10.0F, 40.0F, 58.0F, 58.0F};

    /* CPU test */
    bool cpuTest = true;
@@ -125,9 +127,10 @@ bool TestReduceSumSquared1()
 #endif // USE_CUDA
 }

-/* case 1: squared sum of the items along a dimension of the tensor. 
-* For a 1-dimensional data array a, sum = \sum_i (a_i - shift)^2.
-* In this case, (2, 4) -> (2), dim = 1.
+/* 
+case 2: squared sum of the items along a dimension of the tensor. 
+For a 1-dimensional data array a, sum = \sum_i (a_i - shift)^2.
+In this case, (2, 4) -> (2), dim = 1.
 */
 bool TestReduceSumSquared2()
 {
@@ -141,7 +144,7 @@ bool TestReduceSumSquared2()
    for (int i = 0; i < sOrder; i++)
        sUnitNum *= sDimSize[i];

-    /* a output tensor of size (4) */
+    /* an output tensor of size (2) */
    int tOrder = 1;
    int * tDimSize = new int[tOrder];
    tDimSize[0] = 2;
@@ -150,7 +153,7 @@ bool TestReduceSumSquared2()
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];

-    /* a shift tensor of size (4) */
+    /* a shift tensor of size (2) */
    int shiftOrder = 1;
    int * shiftDimSize = new int[shiftOrder];
    shiftDimSize[0] = 2;
@@ -159,10 +162,10 @@ bool TestReduceSumSquared2()
    for (int i = 0; i < shiftOrder; i++)
        shiftUnitNum *= shiftDimSize[i];

-    DTYPE sData[2][4] = { {0.0, 1.0, 2.0, 3.0},
-                          {4.0, 5.0, 6.0, 7.0} };
-    DTYPE shiftData[2] = {-1.0, 1.0};
-    DTYPE answer[2] = {30.0, 86.0};
+    DTYPE sData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
+                          {4.0F, 5.0F, 6.0F, 7.0F} };
+    DTYPE shiftData[2] = {-1.0F, 1.0F};
+    DTYPE answer[2] = {30.0F, 86.0F};

    /* CPU test */
    bool cpuTest = true;
@@ -234,7 +237,6 @@ TODO!!
 */

 /* test for ReduceSumSquared Function */
-extern "C"
 bool TestReduceSumSquared()
 {
    XPRINT(0, stdout, "[TEST ReduceSumSquared] squared sum of the items along a dimension of the tensor\n");

--- a/source/test/TReduceVariance.cpp
+++ b/source/test/TReduceVariance.cpp
@@ -22,9 +22,11 @@
 #include "TReduceVariance.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* case 1: variance of the items along a dimension of the tensor. 
-* For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2.
-* In this case, (2, 4) -> (4), dim = 0.
+
+/*
+case 1: variance of the items along a dimension of the tensor. 
+For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2.
+In this case, (2, 4) -> (4), dim = 0.
 */
 bool TestReduceVariance1()
 {
@@ -131,7 +133,6 @@ TODO!!
 */

 /* test for ReduceVariance Function */
-extern "C"
 bool TestReduceVariance()
 {
    XPRINT(0, stdout, "[TEST ReduceVariance] variance of the items along a dimension of the tensor\n");

--- a/source/test/TScaleAndShift.cpp
+++ b/source/test/TScaleAndShift.cpp
@@ -22,8 +22,10 @@
 #include "TScaleAndShift.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* case 1: scale and shift all tensor entires.
-* p = p * scale + shift
+
+/* 
+case 1: scale and shift all tensor entries.
+p = p * scale + shift
 */
 bool TestScaleAndShift1()
 {
@@ -42,8 +44,8 @@ bool TestScaleAndShift1()
    DTYPE answer[2][4] = { {0.5F, 2.5F, 4.5F, 6.5F},
                           {8.5F, 10.5F, 12.5F, 14.5F} };

-    DTYPE scaleFactor = 2.0;
-    DTYPE shiftFactor = 0.5;
+    DTYPE scaleFactor = 2.0F;
+    DTYPE shiftFactor = 0.5F;

    /* CPU test */
    bool cpuTest = true;
@@ -97,7 +99,6 @@ TODO!!
 */

 /* test for ScaleAndShift Function */
-extern "C"
 bool TestScaleAndShift()
 {
    XPRINT(0, stdout, "[TEST ScaleAndShift] scale and shift all tensor entires\n");

--- a/source/test/TSelect.cpp
+++ b/source/test/TSelect.cpp
@@ -20,12 +20,14 @@
 */

 #include "TSelect.h"
+#include "../xc/Mycode.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* case 1: test SelectRange function.
-* It can generate a tensor with seleccted data 
-* in range[low,high] along the given dimension.
-* In this case, (2, 2, 4) -> (2, 2, 2), dim = 2, low = 1, high = 3.
+
+/* 
+case 1: test SelectRange function.
+It can generate a tensor with selected data in range[low,high] along the given dimension.
+In this case, (2, 2, 4) -> (2, 2, 2), dim = 2, low = 1, high = 3.
 */
 bool TestSelect1()
 {
@@ -76,25 +78,25 @@ bool TestSelect1()

    /* check results */
    cpuTest = t->CheckData(answer, tUnitNum);
-    return cpuTest;
+    
 #ifdef USE_CUDA
    /* GPU test */
    bool gpuTest = true;

    /* create tensors */
    XTensor * sGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
-    XTensor * tGPU = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * tGPU = NewTensor(tOrder, tDimSize, X_FLOAT, 1.0F, 0);

    /* initialize variables */
    sGPU->SetData(sData, sUnitNum);
    tGPU->SetZeroAll();

    /* call Select function */
-    SelectRange(sGPU, 1, 1, 3, tGPU);
+    SelectRange(sGPU, 2, 1, 3, tGPU);

    /* check results */
-    gpuTest = tGPU->CheckData(answer, sUnitNum);
-
+    gpuTest = tGPU->CheckData(answer, tUnitNum);
+    
    /* destroy variables */
    delete s;
    delete t;
@@ -121,7 +123,6 @@ TODO!!
 */

 /* test for Select Function */
-extern "C"
 bool TestSelect()
 {
    XPRINT(0, stdout, "[TEST Select] generate a tensor with seleccted data in range[low,high] along the given dimension \n");

--- a/source/test/TSetAscendingOrder.cpp
+++ b/source/test/TSetAscendingOrder.cpp
@@ -22,6 +22,7 @@
 #include "TSetAscendingOrder.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)
+
 /* case 1: set the cell to the ascending order along a given dimension.
 */
 bool TestSetAscendingOrder1()
@@ -92,7 +93,6 @@ TODO!!
 */

 /* test for SetAscendingOrder Function */
-extern "C"
 bool TestSetAscendingOrder()
 {
    XPRINT(0, stdout, "[TEST SetAscendingOrder] set the cell to the ascending order along a given dimension \n");

--- a/source/test/TSetData.cpp
+++ b/source/test/TSetData.cpp
@@ -22,8 +22,8 @@
 #include "TSetData.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* case 1: set the cell to the ascending order along a given dimension.
-*/
+
+/* case 1: set the cell to the ascending order along a given dimension. */
 bool TestSetData1()
 {
    /* a input tensor of size (2, 4) */
@@ -83,7 +83,6 @@ TODO!!
 */

 /* test for SetData Function */
-extern "C"
 bool TestSetData()
 {
    XPRINT(0, stdout, "[TEST SetData] set the data of tensor \n");

--- a/source/test/TSigmoid.cpp
+++ b/source/test/TSigmoid.cpp
@@ -23,9 +23,11 @@
 #include "TSigmoid.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* case 1: test Sigmoid function and SigmoidBackward function.
-* sigmoid function: y = 1/(1+exp(-x))
-* backward computation: dE/ds = dE/dy * dy/dx
+
+/* 
+case 1: test Sigmoid function and SigmoidBackward function.
+sigmoid function: y = 1/(1+exp(-x))
+backward computation: dE/ds = dE/dy * dy/dx
 */
 bool TestSigmoid1()
 {
@@ -124,9 +126,10 @@ bool TestSigmoid1()
 #endif // USE_CUDA
 }

-/* case 2: test Sigmoid function and SigmoidBackward function.
-* sigmoid function: y = 1/(1+exp(-x))
-* backward computation: dE/ds = dE/dy * dy/dx
+/* 
+case 2: test Sigmoid function and SigmoidBackward function.
+sigmoid function: y = 1/(1+exp(-x))
+backward computation: dE/ds = dE/dy * dy/dx
 */
 bool TestSigmoid2()
 {
@@ -234,7 +237,6 @@ bool TestSigmoid2()
 */

 /* test for Sigmoid Function */
-extern "C"
 bool TestSigmoid()
 {
    XPRINT(0, stdout, "[TEST SIGMOID] sigmoid function and its backward computation \n");

--- a/source/test/TSoftmax.cpp
+++ b/source/test/TSoftmax.cpp
@@ -24,8 +24,10 @@
 #include "TSoftmax.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* case 1: test Softmax function.
-* softmax function: y = e^x / \sum_{i} e^{x_i}
+
+/* 
+case 1: test Softmax function.
+softmax function: y = e^x / \sum_{i} e^{x_i}
 */
 bool TestSoftmax1()
 {
@@ -96,8 +98,9 @@ bool TestSoftmax1()
 #endif // USE_CUDA
 }

-/* case 2: test SoftmaxBackward function.
-* SoftmaxBackward function: dE/dx_j = -gold_j + y_j
+/* 
+case 2: test SoftmaxBackward function.
+SoftmaxBackward function: dE/dx_j = -gold_j + y_j
 */
 bool TestSoftmax2()
 {
@@ -200,7 +203,6 @@ bool TestSoftmax2()
 */

 /* test for Softmax Function */
-extern "C"
 bool TestSoftmax()
 {
    XPRINT(0, stdout, "[TEST SOFTMAX] softmax function and its backward computation \n");

--- a/source/test/TSort.cpp
+++ b/source/test/TSort.cpp
@@ -22,7 +22,8 @@
 #include "TSort.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* case 1: sort the tensor along a given dimension*/
+
+/* case 1: sort the tensor along a given dimension */
 bool TestSort1()
 {
    /* a tensor of size (2, 4) */
@@ -35,10 +36,10 @@ bool TestSort1()
    for (int i = 0; i < order; i++)
        unitNum *= dimSize[i];

-    DTYPE aData[2][4] = { { 0.0F,   1.0F,   2.0F,   3.0F },
-                          { 4.0F,   5.0F,   6.0F,   7.0F } };
-    DTYPE answer[2][4] = { { 4.0F,   5.0F,  6.0F,  7.0F },
-                           { 0.0F,   1.0F,  2.0F,  3.0F } };
+    DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
+                          {4.0F, 5.0F, 6.0F, 7.0F} };
+    DTYPE answer[2][4] = { {4.0F, 5.0F, 6.0F, 7.0F},
+                           {0.0F, 1.0F, 2.0F, 3.0F} };

    /* CPU test */
    bool cpuTest = true;
@@ -104,10 +105,10 @@ bool TestSort2()
    for (int i = 0; i < order; i++)
        unitNum *= dimSize[i];

-    DTYPE aData[2][4] = { { 0.0,   1.0,   2.0,   3.0 },
-                          { 4.0,   5.0,   6.0,   7.0 } };
-    DTYPE answer[2][4] = { { 3.0,   2.0,   1.0,   0.0 },
-                           { 7.0,   6.0,   5.0,   4.0 } };
+    DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
+                          {4.0F, 5.0F, 6.0F, 7.0F} };
+    DTYPE answer[2][4] = { {3.0F, 2.0F, 1.0F, 0.0F},
+                           {7.0F, 6.0F, 5.0F, 4.0F} };

    /* CPU test */
    bool cpuTest = true;
@@ -166,7 +167,6 @@ TODO!!
 */

 /* test for Sort Function */
-extern "C"
 bool TestSort()
 {
    XPRINT(0, stdout, "[TEST SORT] sort the tensor along a given dimension \n");

--- a/source/test/TSplit.cpp
+++ b/source/test/TSplit.cpp
@@ -19,18 +19,17 @@
 * $Created by: Lin Ye (email: linye2015@outlook.com) 2018-06-13
 */

-#include "../XTensor.h"
-#include "../XDevice.h"
-#include "../core/Split.h"
-#include "../XList.h"
+#include "TSplit.h"

 namespace nts { // namespace nt(NiuTrans.Tensor)
-/* case 1: transform a tensor by splitting it, e.g., (N, M) -> (N/3, M, 3)
-* In this case, 4 * 3 -> 2 * 2 * 3, whereToSplit=0, splitNum=2.
+
+/* 
+case 1: transform a tensor by splitting it, e.g., (N, M) -> (N/3, M, 3)
+In this case, (4, 3) -> (2, 2, 3), whereToSplit=0, splitNum=2.
 */
 bool TestSplit1()
 {
-    /* a source tensor of size 4 * 3 */
+    /* a source tensor of size (4, 3) */
    int sOrder = 2;
    int * sDimSize = new int[sOrder];
    sDimSize[0] = 4;
@@ -40,7 +39,7 @@ bool TestSplit1()
    for (int i = 0; i < sOrder; i++)
        sUnitNum *= sDimSize[i];

-    /* a target tensor of size 2 * 2 * 3 */
+    /* a target tensor of size (2, 2, 3) */
    int tOrder = 3;
    int * tDimSize = new int[tOrder];
    tDimSize[0] = 2;
@@ -109,12 +108,13 @@ bool TestSplit1()
 #endif // USE_CUDA
 }

-/* case 2: transform a tensor by splitting it, e.g., (N, M) -> (N/3, M, 3)
-* In this case, 3 * 4 -> 2 * 3 * 2, whereToSplit=1, splitNum=2.
+/* 
+case 2: transform a tensor by splitting it, e.g., (N, M) -> (N/3, M, 3)
+In this case, (3, 4) -> (2, 3, 2), whereToSplit=1, splitNum=2.
 */
 bool TestSplit2()
 {
-    /* a source tensor of size 3 * 4 */
+    /* a source tensor of size (3, 4) */
    int sOrder = 2;
    int * sDimSize = new int[sOrder];
    sDimSize[0] = 3;
@@ -124,7 +124,7 @@ bool TestSplit2()
    for (int i = 0; i < sOrder; i++)
        sUnitNum *= sDimSize[i];

-    /* a target tensor of size 2 * 3 * 2 */
+    /* a target tensor of size (2, 3, 2) */
    int tOrder = 3;
    int * tDimSize = new int[tOrder];
    tDimSize[0] = 2;
@@ -194,8 +194,9 @@ bool TestSplit2()
 #endif // USE_CUDA
 }

-/* case 3: split a big tensor into small tensors
-* In this case, 3 * 4 -> 2 * (3 * 2) , whereToSplit=1, splitNum=2.
+/* 
+case 3: split a big tensor into small tensors
+In this case, (3, 4) -> 2 * (3, 2), whereToSplit=1, splitNum=2.
 */
 bool TestSplit3()
 {
@@ -203,7 +204,7 @@ bool TestSplit3()
    XList tList;
    tList = XList();

-    /* a source tensor of size (3 * 4) */
+    /* a source tensor of size (3, 4) */
    int sOrder = 2;
    int * sDimSize = new int[sOrder];
    sDimSize[0] = 3;
@@ -213,7 +214,7 @@ bool TestSplit3()
    for (int i = 0; i < sOrder; i++)
        sUnitNum *= sDimSize[i];

-    /* a target tensor of size (3 * 2) */
+    /* a target tensor of size (3, 2) */
    int tOrder1 = 2;
    int * tDimSize1 = new int[tOrder1];
    tDimSize1[0] = 3;
@@ -313,10 +314,9 @@ TODO!!
 */

 /* test for Split Function */
-extern "C"
-    bool TestSplit()
+bool TestSplit()
 {
-    XPRINT(0, stdout, "[TEST SPLIT] -------------\n");
+    XPRINT(0, stdout, "[TEST SPLIT] split a big tensor into small tensors \n");
    bool returnFlag = true, caseFlag = true;

    /* case 1 test */

--- a/source/test/TSum.cpp
+++ b/source/test/TSum.cpp
@@ -22,7 +22,8 @@
 #include "TSum.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* case 1 */
+
+/* case 1: tensor summation c = a + b * \beta */
 bool TestSum1()
 {
    /* a tensor of size (2, 4) */
@@ -35,12 +36,12 @@ bool TestSum1()
    for (int i = 0; i < order; i++)
        unitNum *= dimSize[i];

-    DTYPE aData[2][4] = { {0.0,   1.0,   2.0,   3.0},
-                          {4.0,   5.0,   6.0,   7.0} };
-    DTYPE bData[2][4] = { {1.0,  -1.0,  -3.0,  -5.0}, 
-                          {-7.0, -9.0, -11.0, -13.0} };
-    DTYPE answer[2][4] = { {1.0,   0.0,  -1.0,  -2.0},
-                           {-3.0, -4.0,  -5.0,  -6.0} };
+    DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
+                          {4.0F, 5.0F, 6.0F, 7.0F} };
+    DTYPE bData[2][4] = { {1.0F, -1.0F, -3.0F, -5.0F}, 
+                          {-7.0F, -9.0F, -11.0F, -13.0F} };
+    DTYPE answer[2][4] = { {1.0F, 0.0F, -1.0F, -2.0F},
+                           {-3.0F, -4.0F, -5.0F, -6.0F} };

    /* CPU test */
    bool cpuTest = true;
@@ -95,7 +96,7 @@ bool TestSum1()
 #endif // USE_CUDA
 }

-/* case 2 */
+/* case 2: tensor summation c = a + b * \beta */
 bool TestSum2()
 {
    /* a tensor of size (2, 4) */
@@ -108,12 +109,12 @@ bool TestSum2()
    for (int i = 0; i < order; i++) {
        unitNum *= dimSize[i];
    }
-    DTYPE aData[2][4] = { {0.0,   1.0,   2.0,   3.0},
-                          {4.0,   5.0,   6.0,   7.0} };
-    DTYPE bData[2][4] = { {1.0,  -1.0,  -3.0,  -5.0}, 
-                          {-7.0, -9.0, -11.0, -13.0} };
-    DTYPE answer[2][4] = { {0.5,  0.5,   0.5,   0.5},
-                           {0.5,  0.5,   0.5,   0.5} };
+    DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
+                          {4.0F, 5.0F, 6.0F, 7.0F} };
+    DTYPE bData[2][4] = { {1.0F, -1.0F, -3.0F, -5.0F}, 
+                          {-7.0F, -9.0F, -11.0F, -13.0F} };
+    DTYPE answer[2][4] = { {0.5F, 0.5F, 0.5F, 0.5F},
+                           {0.5F, 0.5F, 0.5F, 0.5F} };
    float beta = 0.5F;

    /* CPU test */
@@ -129,7 +130,7 @@ bool TestSum2()
    b->SetData(bData, unitNum);
    c->SetZeroAll();

-    /* call sum function */
+    /* call Sum function */
    Sum(a, b, c, beta);

    /* check results */
@@ -149,7 +150,7 @@ bool TestSum2()
    bGPU->SetData(bData, unitNum);
    cGPU->SetZeroAll();

-    /* call sum function */
+    /* call Sum function */
    Sum(aGPU, bGPU, cGPU, beta);

    /* check results */
@@ -182,8 +183,7 @@ bool TestSum2()
 */

 /* test for Sum Function */
-extern "C"
-bool TestSum() 
+bool TestSum()
 {
    XPRINT(0, stdout, "[TEST SUM] tensor summation c = a + b * beta\n");
    bool returnFlag = true, caseFlag = true;

--- a/source/test/TSumByColumnTV.cpp
+++ b/source/test/TSumByColumnTV.cpp
@@ -22,9 +22,10 @@
 #include "TSumByColumnTV.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* case 1: test SumByColumnTV function
-* sum of a tensor and a vector (column vector) 
-* in a column by column manner
+
+/* 
+case 1: test SumByColumnTV function
+sum of a tensor and a vector (column vector) in a column by column manner
 */
 bool TestSumByColumnTV1()
 {
@@ -58,12 +59,12 @@ bool TestSumByColumnTV1()
    for (int i = 0; i < cOrder; i++)
        cUnitNum *= cDimSize[i];

-    DTYPE aData[2][4] = { {0.0, 1.0, 2.0, 3.0},
-                          {4.0, 5.0, 6.0, 7.0} };
-    DTYPE bData[2][1] = { {1.0},
-                          {0.0} };
-    DTYPE answer[2][4] = { {1.0, 2.0, 3.0, 4.0},
-                           {4.0, 5.0, 6.0, 7.0} };
+    DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
+                          {4.0F, 5.0F, 6.0F, 7.0F} };
+    DTYPE bData[2][1] = { {1.0F},
+                          {0.0F} };
+    DTYPE answer[2][4] = { {1.0F, 2.0F, 3.0F, 4.0F},
+                           {4.0F, 5.0F, 6.0F, 7.0F} };

    /* CPU test */
    bool cpuTest = true;
@@ -128,9 +129,9 @@ bool TestSumByColumnTV1()
 #endif // USE_CUDA
 }

-/* case 2: test SumByColumnTV function
-* sum of a tensor and a vector (column vector) 
-* in a column by column manner
+/* 
+case 2: test SumByColumnTV function
+sum of a tensor and a vector (column vector) in a column by column manner
 */
 bool TestSumByColumnTV2()
 {
@@ -154,12 +155,12 @@ bool TestSumByColumnTV2()
    for (int i = 0; i < bOrder; i++)
        bUnitNum *= bDimSize[i];

-    DTYPE aData[2][4] = { {0.0, 1.0, 2.0, 3.0},
-                          {4.0, 5.0, 6.0, 7.0} };
-    DTYPE bData[2][1] = { {1.0},
-                          {0.0} };
-    DTYPE answer[2][4] = { {1.0, 2.0, 3.0, 4.0},
-                           {4.0, 5.0, 6.0, 7.0} };
+    DTYPE aData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
+                          {4.0F, 5.0F, 6.0F, 7.0F} };
+    DTYPE bData[2][1] = { {1.0F},
+                          {0.0F} };
+    DTYPE answer[2][4] = { {1.0F, 2.0F, 3.0F, 4.0F},
+                           {4.0F, 5.0F, 6.0F, 7.0F} };

    /* CPU test */
    bool cpuTest = true;
@@ -222,7 +223,6 @@ bool TestSumByColumnTV2()
 */

 /* test for SumByColumnTV Function */
-extern "C"
 bool TestSumByColumnTV() 
 {
    XPRINT(0, stdout, "[TEST SumByColumnTV] sum of a tensor and a vector (column vector) in a column by column manner \n");

--- a/source/test/TSumByColumnVT.cpp
+++ b/source/test/TSumByColumnVT.cpp
@@ -22,9 +22,10 @@
 #include "TSumByColumnVT.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* case 1: test SumByColumnVT function
-* sum of a vector (column vector) and a tensor 
-* in a column by column manner
+
+/* 
+case 1: test SumByColumnVT function
+sum of a vector (column vector) and a tensor in a column by column manner
 */
 bool TestSumByColumnVT1()
 {
@@ -58,12 +59,12 @@ bool TestSumByColumnVT1()
    for (int i = 0; i < cOrder; i++)
        cUnitNum *= cDimSize[i];

-    DTYPE aData[2][1] = { {1.0},
-                          {0.0} };
-    DTYPE bData[2][4] = { {0.0, 1.0, 2.0, 3.0},
-                          {4.0, 5.0, 6.0, 7.0} };
-    DTYPE answer[2][1] = { {7.0},
-                           {22.0} };
+    DTYPE aData[2][1] = { {1.0F},
+                          {0.0F} };
+    DTYPE bData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
+                          {4.0F, 5.0F, 6.0F, 7.0F} };
+    DTYPE answer[2][1] = { {7.0F},
+                           {22.0F} };

    /* CPU test */
    bool cpuTest = true;
@@ -129,9 +130,9 @@ bool TestSumByColumnVT1()
 #endif // USE_CUDA
 }

-/* case 2: test SumByColumnVT function
-* sum of a vector (column vector) and a tensor 
-* in a column by column manner
+/* 
+case 2: test SumByColumnVT function
+sum of a vector (column vector) and a tensor in a column by column manner
 */
 bool TestSumByColumnVT2()
 {
@@ -155,12 +156,12 @@ bool TestSumByColumnVT2()
    for (int i = 0; i < bOrder; i++)
        bUnitNum *= bDimSize[i];

-    DTYPE aData[2][1] = { {1.0},
-                          {0.0} };
-    DTYPE bData[2][4] = { {0.0, 1.0, 2.0, 3.0},
-                          {4.0, 5.0, 6.0, 7.0} };
-    DTYPE answer[2][1] = { {7.0},
-                           {22.0} };
+    DTYPE aData[2][1] = { {1.0F},
+                          {0.0F} };
+    DTYPE bData[2][4] = { {0.0F, 1.0F, 2.0F, 3.0F},
+                          {4.0F, 5.0F, 6.0F, 7.0F} };
+    DTYPE answer[2][1] = { {7.0F},
+                           {22.0F} };

    /* CPU test */
    bool cpuTest = true;
@@ -223,7 +224,6 @@ bool TestSumByColumnVT2()
 */

 /* test for SumByColumnVT Function */
-extern "C"
 bool TestSumByColumnVT() 
 {
    XPRINT(0, stdout, "[TEST SumByColumnVT] sum of a vector (column vector) and a tensor in a column by column manner \n");

--- a/source/test/TTopK.cpp
+++ b/source/test/TTopK.cpp
@@ -22,10 +22,12 @@
 #include "TTopK.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* case 1: get the top-k items along a given dimension.
-* In this case, 
-* (2, 4) -> (2, 4), dim = 0, k = 2
-* (2, 4) -> (2, 4), dim = 1, k = 4
+
+/* 
+case 1: get the top-k items along a given dimension.
+In this case, 
+(2, 4) -> (2, 4), dim = 0, k = 2
+(2, 4) -> (2, 4), dim = 1, k = 4
 */
 bool TestTopK1()
 {
@@ -49,16 +51,16 @@ bool TestTopK1()
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];

-    DTYPE sData[2][4] = { {5.0, 1.0, 2.0, 8.0},
-                          {4.0, 3.0, 7.0, 6.0} };
+    DTYPE sData[2][4] = { {5.0F, 1.0F, 2.0F, 8.0F},
+                          {4.0F, 3.0F, 7.0F, 6.0F} };

-    DTYPE tAnswer1[2][4] = { {5.0, 3.0, 7.0, 8.0},
-                             {4.0, 1.0, 2.0, 6.0} };
+    DTYPE tAnswer1[2][4] = { {5.0F, 3.0F, 7.0F, 8.0F},
+                             {4.0F, 1.0F, 2.0F, 6.0F} };
    int indexAnswer1[2][4] = { {0, 1, 1, 0},
                               {1, 0, 0, 1} };

-    DTYPE tAnswer2[2][4] = { {8.0, 5.0, 2.0, 1.0},
-                             {7.0, 6.0, 4.0, 3.0} };
+    DTYPE tAnswer2[2][4] = { {8.0F, 5.0F, 2.0F, 1.0F},
+                             {7.0F, 6.0F, 4.0F, 3.0F} };
    int indexAnswer2[2][4] = { {3, 0, 2, 1},
                               {2, 3, 0, 1} };

@@ -156,9 +158,9 @@ bool TestTopK1()
 #endif // USE_CUDA
 }

-/* case 2: get the top-k items along a given dimension.
-* In this case, 
-* (2, 4) -> (2, 2), dim = 1, k = 2
+/*
+case 2: get the top-k items along a given dimension.
+In this case, (2, 4) -> (2, 2), dim = 1, k = 2.
 */
 bool TestTopK2()
 {
@@ -182,10 +184,10 @@ bool TestTopK2()
    for (int i = 0; i < tOrder; i++)
        tUnitNum *= tDimSize[i];

-    DTYPE sData[2][4] = { {5.0, 1.0, 2.0, 8.0},
-                          {4.0, 3.0, 7.0, 6.0} };
-    DTYPE tAnswer[2][2] = { {8.0, 5.0},
-                            {7.0, 6.0} };
+    DTYPE sData[2][4] = { {5.0F, 1.0F, 2.0F, 8.0F},
+                          {4.0F, 3.0F, 7.0F, 6.0F} };
+    DTYPE tAnswer[2][2] = { {8.0F, 5.0F},
+                            {7.0F, 6.0F} };
    int indexAnswer[2][2] = { {3, 0},
                              {2, 3} };

@@ -255,14 +257,12 @@ bool TestTopK2()
 #endif // USE_CUDA
 }

-
 /* other cases */
 /*
 TODO!!
 */

 /* test for TopK Function */
-extern "C"
 bool TestTopK()
 {
    XPRINT(0, stdout, "[TEST TopK] get the top-k items along a given dimension\n");

--- a/source/test/TUnsqueeze.cpp
+++ b/source/test/TUnsqueeze.cpp
@@ -19,15 +19,16 @@
 * $Created by: Xu Chen (email: hello_master1954@163.com) 2018-06-13
 */

-#include "../XTensor.h"
-#include "../core/Unsqueeze.h"
 #include "../XList.h"
+#include "TUnsqueeze.h"

 namespace nts { // namespace nts(NiuTrans.Tensor)
-/* case 1: insert a dimension by copying the blocks for x times (where x is the size of the inerted dimension)
-* In this case, 
-* (2, 3) -> (2, 2, 3), dim=1, dSize=2
-* (2, 3) -> (2, 3, 2), dim=2, dSize=2
+
+/* 
+case 1: insert a dimension by copying the blocks for x times (where x is the size of the inserted dimension)
+In this case, 
+(2, 3) -> (2, 2, 3), dim=1, dSize=2
+(2, 3) -> (2, 3, 2), dim=2, dSize=2
 */
 bool TestUnsqueeze1()
 {
@@ -63,18 +64,18 @@ bool TestUnsqueeze1()
    for (int i = 0; i < tOrder2; i++)
        tUnitNum2 *= tDimSize2[i];

-    DTYPE sData[2][3] = { {0.0, 1.0, 2.0},
-                          {3.0, 4.0, 5.0} };
-    DTYPE answer1[2][2][3] = { { {0.0, 1.0, 2.0},
-                                 {0.0, 1.0, 2.0} },
-                               { {3.0, 4.0, 5.0},
-                                 {3.0, 4.0, 5.0} } };
-    DTYPE answer2[2][3][2] = { { {0.0, 0.0}, 
-                                 {1.0, 1.0}, 
-                                 {2.0, 2.0} },
-                               { {3.0, 3.0}, 
-                                 {4.0, 4.0}, 
-                                 {5.0, 5.0} } };
+    DTYPE sData[2][3] = { {0.0F, 1.0F, 2.0F},
+                          {3.0F, 4.0F, 5.0F} };
+    DTYPE answer1[2][2][3] = { { {0.0F, 1.0F, 2.0F},
+                                 {0.0F, 1.0F, 2.0F} },
+                               { {3.0F, 4.0F, 5.0F},
+                                 {3.0F, 4.0F, 5.0F} } };
+    DTYPE answer2[2][3][2] = { { {0.0F, 0.0F}, 
+                                 {1.0F, 1.0F}, 
+                                 {2.0F, 2.0F} },
+                               { {3.0F, 3.0F}, 
+                                 {4.0F, 4.0F}, 
+                                 {5.0F, 5.0F} } };
    
    /* CPU test */
    bool cpuTest = true;
@@ -148,7 +149,6 @@ bool TestUnsqueeze1()
 */

 /* test for Unsqueeze Function */
-extern "C"
 bool TestUnsqueeze()
 {
    XPRINT(0, stdout, "[TEST Unsqueeze] insert a dimension by copying the blocks for x times\n");

--- a/source/test/TXMem.cpp
+++ b/source/test/TXMem.cpp
@@ -19,14 +19,13 @@
 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-6-24
 */

-#include "TXMem.h"
 #include "../XGlobal.h"
 #include "../XUtility.h"
-#include "../XMem.h"
+#include "TXMem.h"

-/* the nts (NiuTrans.Tensor) namespace */
-namespace nts{
+namespace nts{ // namespace nts(NiuTrans.Tensor)

+/* case 1: test memory pool class */
 bool TestXMemCase1()
 {
    bool ok = true;
@@ -83,6 +82,7 @@ bool TestXMemCase1()
    return ok;
 }

+/* test for memory pool class */
 bool TestXMem()
 {
    XPRINT(0, stdout, "[Test] Memory pool ... Began\n");
@@ -93,11 +93,18 @@ bool TestXMem()

    /* case 1 test */
    caseFlag = TestXMemCase1();
-    if (!caseFlag) { returnFlag = false; XPRINT(0, stdout, ">> case 1 failed!\n"); }
-    else {XPRINT(0, stdout, ">> case 1 passed!\n");}
+    if (!caseFlag) {
+        returnFlag = false;
+        XPRINT(0, stdout, ">> case 1 failed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> case 1 passed!\n");

-    if (returnFlag) { XPRINT(0, stdout, ">> All Passed!\n"); }
-    else { XPRINT(0, stdout, ">> Failed!\n"); }
+    if (returnFlag) {
+        XPRINT(0, stdout, ">> All Passed!\n");
+    }
+    else
+        XPRINT(0, stdout, ">> Failed!\n");

    double endT = GetClock();

@@ -106,4 +113,4 @@ bool TestXMem()
    return returnFlag;
 }

-} /* end of the nts (NiuTrans.Tensor) namespace */
+} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
--- a/source/test/TXMem.h
+++ b/source/test/TXMem.h
--- a/source/test/Test.cpp
+++ b/source/test/Test.cpp