Commit 7e9d7015 by xuchen

This is a fundamental integration!

parent b3a76184
@@ -96,7 +96,7 @@ void XMathGrad::GradMultiply(XTensor * node)
     XNoder::MakeGrad(a);
     XNoder::MakeGrad(b);
-    CheckNTErrors(XTensor::IsIdentical(a, b), "Wrong sized input tensors!");
+    CheckNTErrors(XTensor::IsSameShaped(a, b), "Wrong sized input tensors!");
     _Multiply(node->grad, b, a->grad, 1.0F);
     _Multiply(node->grad, a, b->grad, 1.0F);
 }
......
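
For reference, the two _Multiply calls in the hunk above implement the usual backward rule for the element-wise product c = a * b, written here in the same dE/dx notation used by the comments in this file (the product is taken element-wise):

    dE/da += dE/dc * b
    dE/db += dE/dc * a

Each call therefore multiplies node->grad by the other operand and accumulates into a->grad or b->grad; the trailing 1.0F is the \alpha coefficient of _Multiply (c(i) = a(i)*b(i) + \alpha*c(i), see the Multiply kernels further down), which turns the write into an accumulation.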
@@ -71,9 +71,11 @@ dE/da = split(dE/dc)
 void XShapeGrad::GradMerge(XTensor * node)
 {
     XLink &income = node->income;
-    CheckNTErrors(income.tailNum == 0, "Wrong input tensor number for MERGE!");
     XTensor * input = income.tails[0];
+    CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for MERGE!");
+    CheckNTErrors(node->order == input->order - 1, "wrong tensor orders!");
     int whereToMerge = income.GetParamInt(0);
     int leadDim = income.GetParamInt(1);
@@ -95,13 +97,13 @@ void XShapeGrad::GradMerge(XTensor * node)
     }
     dims[0] = -dims[0];
     XTensor gradInputSmall(input->order - leadDim, dims,
                            input->dataType, input->denseRatio,
                            input->devID, input->mem);
     dims[whereToMerge - leadDim] *= dims[0];
-    XTensor gradNodeSmall(node->order - leadDim, dims,
+    XTensor gradNodeSmall(node->order - leadDim, dims + leadDim + 1,
                           node->dataType, node->denseRatio,
                           node->devID, node->mem);
     /* we can simply split the gradient tensor
        if the input is used in merging only */
@@ -109,7 +111,7 @@ void XShapeGrad::GradMerge(XTensor * node)
     for(int i = 0; i < blockNum; i++){
         gradNodeSmall.data = (char*)node->grad->data + i * blockSize;
         gradInputSmall.data = (char*)input->grad->data + i * blockSize;
-        _Split(&gradNodeSmall, &gradInputSmall, whereToMerge - leadDim, input->dimSize[leadDim]);
+        _Split(&gradNodeSmall, &gradInputSmall, whereToMerge - leadDim - 1, input->dimSize[leadDim]);
     }
 }
@@ -123,7 +125,7 @@ void XShapeGrad::GradMerge(XTensor * node)
     for(int i = 0; i < blockNum; i++){
         gradNodeSmall.data = (char*)node->grad->data + i * blockSize;
         gradInputSmall.data = (char*)input->grad->data + i * blockSize;
-        _Split(&gradNodeSmall, &gradInputSmallBuf, whereToMerge - leadDim, input->dimSize[leadDim]);
+        _Split(&gradNodeSmall, &gradInputSmallBuf, whereToMerge - leadDim - 1, input->dimSize[leadDim]);
         _Sum(&gradInputSmall, &gradInputSmallBuf, &gradInputSmall);
     }
 }
@@ -162,7 +164,7 @@ void XShapeGrad::GradMergeList(XTensor * node)
     smallsGrad.Add(tail->grad);
     if(i > 1){
-        CheckNTErrors(XTensor::IsIdentical(last, tail),
+        CheckNTErrors(XTensor::IsSameShaped(last, tail),
                       "Input tensors must be of the same size!");
     }
......
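
As the hunk comment dE/da = split(dE/dc) says, the backward pass of c = merge(a) splits the merged gradient back onto the input. The two loops above differ only in how the result is written back (sketched in the same notation):

    dE/da  = split(dE/dc)     if the input is used in the merge only
    dE/da += split(dE/dc)     otherwise: _Split into gradInputSmallBuf, then accumulate with _Sum

The extra "- 1" in the split dimension is consistent with the merged node having one dimension fewer than its input, which the new CheckNTErrors(node->order == input->order - 1, ...) now states explicitly, and with gradNodeSmall being built from dims + leadDim + 1.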
@@ -29,7 +29,7 @@ void XNoder::MakeGrad(XTensor * node)
     if(node == NULL)
         return;
-    if(!XTensor::IsIdentical(node, node->grad)){
+    if(!XTensor::IsSameShaped(node, node->grad)){
         delete node->grad;
         node->grad = NewTensor(node);
         node->grad->SetZeroAll();
......
@@ -73,8 +73,7 @@ void MakeWordBatch(XTensor &batch, NGram * ngrams, int ngramNum, int n, int vSiz
 void Forward(XTensor inputs[], XTensor &output, FNNModel &model, FNNNet &net);
 void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NAME loss,
               FNNModel &model, FNNModel &grad, FNNNet &net);
-void FBInOne(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NAME loss,
-             FNNModel &model, XNet &net);
+void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model);
 /*
 entry of the program
@@ -415,7 +414,10 @@ void Train(const char * train, bool isShuffled, FNNModel &model)
         }
         else{
             /* forward + backward process */
-            FBInOne(inputs, output, gold, CROSSENTROPY, model, autoDiffer);
+            ForwardAutoDiff(inputs, output, model);
+            /* automatic differentiation */
+            autoDiffer.Backward(output, gold, CROSSENTROPY);
             /* update model parameters */
             Update(model, grad, learningRate, true);
@@ -902,17 +904,14 @@ void Backward(XTensor inputs[], XTensor &output, XTensor &gold, LOSS_FUNCTION_NA
 }
 /*
-forward + backward in one procedure
+forward process (with tensor connections)
 >> inputs - input word representations
 >> output - output probability
->> gold - gold standard
->> loss - loss function name
 >> model - the fnn model
 */
-void FBInOne(XTensor inputs[], XTensor &output, XTensor &gold,
-             LOSS_FUNCTION_NAME loss, FNNModel &model, XNet &net)
+void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model)
 {
-    int batchSize = gold.GetDim(0);
+    int batchSize = inputs[0].GetDim(0);
     int n = model.n;
     int depth = model.hDepth;
@@ -945,9 +944,6 @@ void FBInOne(XTensor inputs[], XTensor &output, XTensor &gold,
     /* output layer */
     output = LogSoftmax(MMul(hidden, model.outputW) + b, 1);
-
-    /* automatic differentiation */
-    net.Backward(output);
 }
 /*
......
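
Taken together, the FNNLM changes replace the old FBInOne (forward and backward in a single call) with a forward pass that only builds the tensor connections, followed by an explicit backward call on the XNet. A minimal sketch of one training step under the new flow, using only the calls visible in this diff (inputs, output, gold, grad, learningRate and autoDiffer are the variables of the surrounding Train() function):

    XNet autoDiffer;                                  /* graph-based differentiator      */

    ForwardAutoDiff(inputs, output, model);           /* forward only: records the graph */
    autoDiffer.Backward(output, gold, CROSSENTROPY);  /* automatic differentiation       */
    Update(model, grad, learningRate, true);          /* update model parameters         */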
@@ -127,7 +127,6 @@ struct FNNNet
 };
 /* entry of the program */
-extern "C"
 int FNNLMMain(int argc, const char ** argv);
 };
......
@@ -47,9 +47,9 @@ extern const char * GetDataTypeName(TENSOR_DATA_TYPE type);
 extern TENSOR_DATA_TYPE GetDataType(const char * typeName);
 /* data conversion (for lower precision computation) */
-extern "C" unsigned short FloatToFloat16(float f);
-extern "C" float Float16ToFloat(unsigned short h);
-extern "C" void ConvertDataType(int devID,
+unsigned short FloatToFloat16(float f);
+float Float16ToFloat(unsigned short h);
+void ConvertDataType(int devID,
                      void * s, TENSOR_DATA_TYPE typeS,
                      void * t, TENSOR_DATA_TYPE typeT, int size);
......
@@ -321,7 +321,7 @@ void XLink::MakeLink(const XList * list, XTensor * h, int id)
             continue;
         XLink &outgo = t->outgo;
         CheckNTErrors(outgo.head == NULL || outgo.head == t,
                       "Wrong head of the hyperedge!");
         outgo.SetHead(t);
         outgo.AddTail(h);
     }
@@ -349,6 +349,7 @@ void XLink::MakeLink(XTensor * t, XList * list, int id)
     /* backward */
     XLink &outgo = t->outgo;
+    outgo.SetHead(t);
     CheckNTErrors(outgo.head == NULL || outgo.head == t, "Wrong head of the hyperedge!");
     for(int i = 0; i < list->count; i++){
         XTensor * t = (XTensor*)list->GetItem(i);
......
@@ -193,7 +193,7 @@ XTensor::~XTensor()
        the connectivity of the graph. To kill memory
        leak, we release the data of the new tensor
        when its parent is deleted (see ClearIncoming). */
-    if(isTmp && outgo.tailNum > 0){
+    if(outgo.tailNum > 0){
         int dims[MAX_TENSOR_DIM_NUM];
         memcpy(dims, dimSize, order * sizeof(int));
         dims[0] = -dims[0];
@@ -285,6 +285,27 @@ void XTensor::ShallowCopy(const XTensor &tensor)
 /* overloading of the equal-sign */
 XTensor& XTensor::operator= (const XTensor& tensor)
 {
+    /* we must make a hard copy of the tensor if it is the input
+       of another node. */
+    if(outgo.tailNum > 0){
+        int dims[MAX_TENSOR_DIM_NUM];
+        memcpy(dims, dimSize, order * sizeof(int));
+        dims[0] = -dims[0];
+        XTensor * newTensor = new XTensor(order, dims, dataType, denseRatio, devID, mem);
+        newTensor->SetTMP();
+        newTensor->data = data;
+        newTensor->dataHost = dataHost;
+        XLink::Replace(this, newTensor);
+        XLink::ClearOutgoing(this);
+        XLink::ClearIncoming(this);
+        newTensor->ShallowCopy(this);
+        data = NULL;
+        dataHost = NULL;
+    }
     /* hard copy of the data array */
     int size = unitNum * unitSize;
     if( isInit && !isSparse && !tensor.isSparse &&
@@ -349,7 +370,7 @@ judge whether the two matrices are in the same type and size
 >> b - anther tensor to compare with
 << return - whether the two input tensors are identical
 */
-bool XTensor::IsIdentical(const XTensor * a, const XTensor * b)
+bool XTensor::IsSameShaped(const XTensor * a, const XTensor * b)
 {
     if(a == NULL || b == NULL)
         return false;
@@ -381,9 +402,9 @@ judge whether the three matrices are in the same type and size
 >> c - a tensor again
 << return - whether the two input tensors are identical
 */
-bool XTensor::IsIdentical(XTensor * a, XTensor * b, XTensor * c)
+bool XTensor::IsSameShaped(XTensor * a, XTensor * b, XTensor * c)
 {
-    return IsIdentical(a, b) && IsIdentical(a, c);
+    return IsSameShaped(a, b) && IsSameShaped(a, c);
 }
 /*
......
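
The block added to operator= above is what keeps the computation graph valid when a tensor that already feeds another node is overwritten. A hedged illustration of the situation it handles, reusing the MMul/LogSoftmax calls from the FNNLM changes earlier in this commit (the variable names are hypothetical):

    XTensor y;
    y = MMul(hidden, model.outputW) + b;   /* y becomes the head of this sub-graph            */
    y = LogSoftmax(y, 1);                  /* the old y is already a tail of the new node, so */
                                           /* operator= moves its data into a tmp tensor,     */
                                           /* re-links the edges (XLink::Replace, then        */
                                           /* ClearOutgoing/ClearIncoming), and only then     */
                                           /* hard-copies the right-hand side into this one   */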
@@ -207,11 +207,11 @@ public:
     /* judge whether the two matrices are in the same type and size */
     static
-    bool IsIdentical(const XTensor * a, const XTensor * b);
+    bool IsSameShaped(const XTensor * a, const XTensor * b);
     /* judge whether the three matrices are in the same type and size */
     static
-    bool IsIdentical(XTensor * a, XTensor * b, XTensor * c);
+    bool IsSameShaped(XTensor * a, XTensor * b, XTensor * c);
     /* set the size of each dimension */
     void SetDim(int * myDimSize);
......
@@ -486,9 +486,8 @@ quick sorting
 NOTE: this means that the items may not placed in a continuous memory space
 >> comp - the comparison function
 */
-void XQSort(void * dataA, void * dataB, void * index, int num, int width, int stride, int (*comp)(const void *, const void *))
+void XQSort(void * data, void * index, int num, int width, int stride, int (*comp)(const void *, const void *))
 {
-    XMemCopy(dataB, -1, dataA, -1, num * width);
     char *lo, *hi; // ends of sub-array currently sorting
     int *indexlo, *indexhi;
     char *mid; // points to middle of subarray
@@ -507,8 +506,8 @@ void XQSort(void * dataA, void * dataB, void * index, int num, int width, int st
     stackptr = 0;
-    lo = (char*)dataB;
-    hi = (char*)dataB + realStride * (num - 1);
+    lo = (char*)data;
+    hi = (char*)data + realStride * (num - 1);
     indexlo = (int*)index;
     indexhi = index != NULL ? (int*)index + stride * (num - 1) : NULL;
......
@@ -53,7 +53,7 @@ extern void XSleep(int sleepTime);
 extern double GetClock();
 extern double GetClockSec();
-extern void XQSort(void * dataA, void * dataB, void * index, int num, int width, int stride, int (*comp)(const void *, const void *));
+extern void XQSort(void * data, void * index, int num, int width, int stride, int (*comp)(const void *, const void *));
 extern int CompXFloat(const void * a, const void * b);
 #ifdef USE_CUDA
......
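
Note that the XQSort change is behavioural as well as a signature cleanup: the removed XMemCopy used to copy dataA into dataB and sort the copy, whereas the new version sorts data in place, so callers now pass the array they actually want reordered. A hedged usage sketch (the stride value of 1 and the use of CompXFloat, declared just above, are assumptions for a flat array with one index entry per item):

    DTYPE vals[4] = { 3.0F, 1.0F, 2.0F, 0.5F };
    int   idx[4]  = { 0, 1, 2, 3 };

    /* reorders vals in place and permutes idx along with it (assumed stride = 1) */
    XQSort(vals, idx, 4, sizeof(DTYPE), 1, CompXFloat);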
@@ -42,7 +42,7 @@ void _Absolute(const XTensor * a, XTensor * b)
     }
 #endif
-    CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
+    CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
     CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
     DTYPE * d = (DTYPE*)a->data;
     DTYPE * db = (DTYPE*)b->data;
......
@@ -60,10 +60,9 @@ set each entry to its absolute value
 >> a - input tensor
 >> b - output tensor
 */
-extern "C"
 void _CudaAbsolute(const XTensor * a, XTensor * b)
 {
-    CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
+    CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
     CheckNTErrors((a->isSparse == false), "TODO!");
     int gridSize[3];
......
@@ -34,7 +34,6 @@ __global__
 void KernelAbsolute(__half * a, __half * b, int size);
 /* set each entry to its absolute value */
-extern "C"
 void _CudaAbsolute(const XTensor * a, XTensor * b);
 #endif // USE_CUDA
......
@@ -55,9 +55,9 @@ void _MatrixMULBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA,
     XTensor * ai = (XTensor*)a->GetItem(i);
     XTensor * bi = (XTensor*)b->GetItem(i);
     XTensor * ci = (XTensor*)c->GetItem(i);
-    if (!XTensor::IsIdentical(aim, ai) ||
-        !XTensor::IsIdentical(bim, bi) ||
-        !XTensor::IsIdentical(cim, ci))
+    if (!XTensor::IsSameShaped(aim, ai) ||
+        !XTensor::IsSameShaped(bim, bi) ||
+        !XTensor::IsSameShaped(cim, ci))
     {
         isUniform = false;
         break;
......
@@ -27,7 +27,6 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /* matrix multiplication in batch mode (CPU code) */
-extern "C"
 void _MatrixMULBatchedCPU(const XList * a, MATRIX_TRANS_TYPE transposedA, const XList * b, MATRIX_TRANS_TYPE transposedB,
                           XList * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
......
@@ -46,10 +46,10 @@ c = a * b * \alpha
 >> cRowSize - row size of matrix c
 >> alpha - the scaling factor
 */
-extern "C" __global__
+__global__
 void KernelMatrixMulDenseMSparseMV2(DTYPE * a, MATRIX_TRANS_TYPE transposedA, int aColSize, int aRowSize,
                                     void * b, MATRIX_TRANS_TYPE transposedB, int bNonZeroNum, int bColSize, int bRowSize,
                                     DTYPE * c, int cColSize, int cRowSize, DTYPE alpha)
 {
     int i = blockDim.x * blockIdx.x + threadIdx.x;
......
@@ -32,17 +32,16 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 mutilication of a dense matrix with a sparse vector
 c = a * b * \alpha
 */
-extern "C" __global__
+__global__
 void KernelMatrixMulDenseMSparseMV2(DTYPE * a, MATRIX_TRANS_TYPE transposedA, int aColSize, int aRowSize,
                                     void * b, MATRIX_TRANS_TYPE transposedB, int bNonZeroNum, int bColSize, int bRowSize,
                                     DTYPE * c, int cColSize, int cRowSize, DTYPE alpha);
 /*
 matrix multiplication (for 2d tensors) (cuda version)
 c = trans(a) * trans(b) * alpha + c * beta
 where trans() return the transposed matrix if the flag is fired
 */
-extern "C"
 void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
                       DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XStream * stream = NULL);
......
@@ -30,7 +30,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 matrix multiplication for a block (x1,y1) - (x2,y2)
 where (x1,y1) is the upper-left corner and (x2,y2) is the bottom-right corner
 */
-extern "C"
 void _MatrixMul2DMultiTheading(XList * args);
 } // namespace nts(NiuTrans.Tensor)
......
@@ -31,7 +31,6 @@ matrix multiplication (for 2d tensors) with multi-threading.
 c = trans(a) * trans(b) * alpha + c * beta
 where trans() return the transposed matrix if the flag is fired.
 */
-extern "C"
 void _MatrixMul2DParallel(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
                           XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
......
@@ -113,10 +113,10 @@ void _MatrixMulBatched(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
     cublasHandle_t * handle = a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID);
     _CudaBLASMatrixMULList(handle,
                            aList, transposedA,
                            bList, transposedB,
                            cList, aList->count,
                            alpha, beta);
     BacktoCudaDev(a->devID, devIDBackup);
 #else
......
@@ -34,7 +34,7 @@ multiplication of data arrays in a element-wise manner c(i) = a(i)*b(i)
 >> c - result data array
 >> size - size of c
 */
-extern "C" __global__
+__global__
 void KernelMulElementWise(DTYPE * a, DTYPE * b, DTYPE * c, int size)
 {
     int i = blockDim.x * blockIdx.x + threadIdx.x;
@@ -51,7 +51,7 @@ multiplication of data arrays in a element-wise manner c(i) = a(i)*b(i) + \alpha
 >> size - size of c
 >> alpha - the coefficient
 */
-extern "C" __global__
+__global__
 void KernelMulElementWiseV2(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE alpha)
 {
     int i = blockDim.x * blockIdx.x + threadIdx.x;
@@ -120,7 +120,6 @@ where i is the item index
 >> alpha - the coefficient
 >> leadingDim - dimension along which we perform broadcasting
 */
-extern "C"
 void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha, int leadingDim)
 {
     int leadingDimRDI = a->order - leadingDim - 1;
......
@@ -29,11 +29,11 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA
 /* multiplication of two tensors in a element-wise manner c(i) = a(i)*b(i) */
-extern "C" __global__
+__global__
 void KernelMulElementWise(DTYPE * a, DTYPE * b, DTYPE * c, int size);
 /* multiplication of two tensors in a element-wise manner c(i) = a(i)*b(i) + \alpha*c(i) */
-extern "C" __global__
+__global__
 void KernelMulElementWiseV2(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE alpha);
 /* multiplication of two tensors in a element-wise manner c(i) = a(i)*b(i)+ \alpha*c(i) */
@@ -41,7 +41,6 @@ template<int nonZeroAlpha>__global__
 void KernelMulElementWiseTensorDynamic(DTYPE * a, DTYPE * b, DTYPE * c, DTYPE alpha, int stride, int ldSizeA, int ldSizeB, int ldSizeC, int blockNum);
 /* element-wise product of two tensors */
-extern "C"
 void _CudaMultiply(const XTensor * a, const XTensor * b, XTensor * c, DTYPE alpha = 0, int leadingDim = 0);
 #endif // USE_CUDA
......
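
The kernel bodies are truncated in this view; for orientation, here is a minimal sketch of a kernel with the contract documented above, c(i) = a(i)*b(i) + \alpha*c(i). It illustrates the documented behaviour only and is not the file's actual implementation (hence the Sketch suffix):

    __global__
    void KernelMulElementWiseV2Sketch(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE alpha)
    {
        int i = blockDim.x * blockIdx.x + threadIdx.x;   /* one thread per element */
        if (i < size)
            c[i] = a[i] * b[i] + alpha * c[i];
    }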
@@ -41,7 +41,7 @@ void _Negate(const XTensor * a, XTensor * b)
     }
 #endif
-    CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
+    CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
     CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
     DTYPE * d = (DTYPE*)a->data;
     DTYPE * db = (DTYPE*)b->data;
......
@@ -68,10 +68,9 @@ set each entry to its negtive value
 >> a - input tensor
 >> b - output tensor
 */
-extern "C"
 void _CudaNegate(const XTensor * a, XTensor * b)
 {
-    CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
+    CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
     CheckNTErrors((a->isSparse == false), "TODO!");
     int gridSize[3];
......
@@ -37,7 +37,6 @@ __global__
 void KernelNegate(__half * a, __half * b, int size);
 /* set each entry to its negtive value */
-extern "C"
 void _CudaNegate(const XTensor * a, XTensor * b);
 #endif // USE_CUDA
......
@@ -41,7 +41,7 @@ void _Sign(const XTensor * a, XTensor * b)
     }
 #endif
-    CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
+    CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
     CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
     DTYPE * d = (DTYPE*)a->data;
     DTYPE * db = (DTYPE*)b->data;
......
@@ -66,10 +66,9 @@ set each entry to its sign value
 >> a - input tensor we are processing
 >> b - output tensor we are processing
 */
-extern "C"
 void _CudaSign(const XTensor * a, XTensor * b)
 {
-    CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
+    CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
     CheckNTErrors((a->isSparse == false), "TODO!");
     int gridSize[3];
......
@@ -37,7 +37,6 @@ __global__
 void KernelSign(__half * a, __half * b, int size);
 /* set each entry to its sign value */
-extern "C"
 void _CudaSign(const XTensor * a, XTensor * b);
 #endif // USE_CUDA
......
@@ -35,7 +35,7 @@ c = a + b * \beta
 >> size - the size of a/b/c
 >> beta - the coefficient
 */
-extern "C" __global__
+__global__
 void KernelADD(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta)
 {
     int i = blockDim.x * blockIdx.x + threadIdx.x;
......
@@ -29,15 +29,13 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA
 /* summation of data arrays (CUDA Kernel) */
-extern "C" __global__
+__global__
 void KernelADD(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta = (DTYPE)1.0);
 /* tensor summation c = a + b * \beta (cuda version) */
-extern "C"
 void _CudaSum(const XTensor * a, const XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
 /* tensor summation c = a + b * \beta (cuda version) with an input handle */
-extern "C"
 void _CudaSumWithHandle(int devID, cublasHandle_t * handle, DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta = (DTYPE)1.0);
 #endif // USE_CUDA
......
@@ -40,9 +40,9 @@ where b is a vector.
 void _SumByColumnTV(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
 {
     CheckNTErrors((a && b && c), "Empty input tensors!");
-    CheckNTErrors((XTensor::IsIdentical(a, c)), "Unmatched tensors in addition!");
+    CheckNTErrors((XTensor::IsSameShaped(a, c)), "Unmatched tensors in addition!");
     CheckNTErrors((b->order == 2 && b->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]),
                   "Illegal input vector size!");
     int rowNum = a->dimSize[0];
     int colNum = a->dimSize[1];
......
@@ -39,7 +39,7 @@ c_col = a_col + b * \beta
 >> size - size of the entire data array
 >> beta - the scaling factor
 */
-extern "C" __global__
+__global__
 void KernelADDByColumnTV(DTYPE * a, DTYPE * b, DTYPE * c, int colNum, int blockSize, int size, DTYPE beta)
 {
     int i = blockDim.x * blockIdx.x + threadIdx.x;
@@ -67,11 +67,11 @@ where b is a vector.
 void _CudaSumByColumnTV(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
 {
     CheckNTErrors((a && b && c), "Empty input tensors!");
-    CheckNTErrors((XTensor::IsIdentical(a, c)), "Unmatched tensors in addition!");
+    CheckNTErrors((XTensor::IsSameShaped(a, c)), "Unmatched tensors in addition!");
     CheckNTErrors((b->order == 2 && b->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]),
                   "Illegal input vector size!");
     CheckNTErrors((a->dataType == DEFAULT_DTYPE && b->dataType == DEFAULT_DTYPE &&
                    c->dataType == DEFAULT_DTYPE), "TODO");
     int rowNum = a->dimSize[0];
     int colNum = a->dimSize[1];
......
@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA
 /* summation of a tensor and a vector (column vector) */
-extern "C"
 void _CudaSumByColumnTV(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
 #endif // USE_CUDA
......
@@ -27,9 +27,8 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /* sum of a tensor and a (column) vector */
-extern "C"
 void _SumByColumnTV(const XTensor * a, const XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
 } // namespace nts(NiuTrans.Tensor)
 #endif // __SUMBYCOLUMNTV_H__
\ No newline at end of file
@@ -40,9 +40,9 @@ where c and a are vectors, and b_col is a column in b.
 void _SumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
 {
     CheckNTErrors((a && b && c), "Empty input tensors!");
-    CheckNTErrors((XTensor::IsIdentical(a, c)), "Unmatched tensors in addition!");
+    CheckNTErrors((XTensor::IsSameShaped(a, c)), "Unmatched tensors in addition!");
     CheckNTErrors((a->order == 2 && a->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]),
                   "Illegal input vector size!");
     if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
 #ifdef USE_CUDA
......
@@ -39,7 +39,7 @@ c = a + \sum{col} b_col * \beta
 >> size - size of the entire data array
 >> beta - the scaling factor
 */
-extern "C" __global__
+__global__
 void KernelADDByColumnVT(DTYPE * a, DTYPE * b, DTYPE * c, int colNum, int rowNum, int blockNum, DTYPE beta)
 {
     int row = blockDim.x * blockIdx.x + threadIdx.x;
@@ -83,11 +83,11 @@ where c and a are vectors, and b_col is a column in b.
 void _CudaSumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
 {
     CheckNTErrors((a && b && c), "Empty input tensors!");
-    CheckNTErrors((XTensor::IsIdentical(a, c)), "Unmatched tensors in addition!");
+    CheckNTErrors((XTensor::IsSameShaped(a, c)), "Unmatched tensors in addition!");
     CheckNTErrors((a->order == 2 && a->dimSizeRDI[0] == 1 && b->dimSizeRDI[1] == a->dimSizeRDI[1]),
                   "Illegal input vector size!");
     CheckNTErrors((a->dataType == DEFAULT_DTYPE && b->dataType == DEFAULT_DTYPE &&
                    c->dataType == DEFAULT_DTYPE), "TODO");
     int rowNum = b->dimSize[0];
     int colNum = b->dimSize[1];
......
@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 #ifdef USE_CUDA
 /* summation of a vector (column vector) and a tensor */
-extern "C"
 void _CudaSumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
 #endif // USE_CUDA
......
@@ -27,9 +27,8 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /* sum of a (column) vector and a tensor */
-extern "C"
 void _SumByColumnVT(const XTensor * a, const XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
 } // namespace nts(NiuTrans.Tensor)
 #endif // __SUMBYCOLUMNVT_H__
\ No newline at end of file
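
The two column-sum primitives above are easy to confuse. Reading the comments together with the shape checks shown in the hunks (for TV: a and c are rowNum x colNum and b is a column vector of length rowNum; for VT: a and c are column vectors of length rowNum and b is rowNum x colNum), they appear to compute, entry by entry:

    _SumByColumnTV:  c_{i,j} = a_{i,j} + \beta * b_i              (b is added to every column of a)
    _SumByColumnVT:  c_i     = a_i     + \beta * \sum_j b_{i,j}   (the columns of b are reduced into the vector)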
@@ -42,7 +42,7 @@ void _MatrixMULCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
 {
     CheckNTErrors((a && b && c), "Empty input tensors!");
     CheckNTErrors((a->order == 2 && b->order == 2 && c->order == 2),
                   "Input tensors must have a order = 2!");
     CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
     CheckNTErrors((b->dataType == DEFAULT_DTYPE), "TODO!");
     CheckNTErrors((c->dataType == DEFAULT_DTYPE), "TODO!");
......
@@ -143,7 +143,6 @@ void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
 }
 /* matrix multiplication in batch and strided mode via cuda version BLAS */
-extern "C"
 void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
                                       const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, long long int strideA,
                                       const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, long long int strideB,
@@ -225,9 +224,9 @@ void _CudaBLASMatrixMULList(cublasHandle_t * handle,
     XTensor * ai = (XTensor*)a->GetItem(i);
     XTensor * bi = (XTensor*)b->GetItem(i);
     XTensor * ci = (XTensor*)c->GetItem(i);
-    if (!XTensor::IsIdentical(aim, ai) ||
-        !XTensor::IsIdentical(bim, bi) ||
-        !XTensor::IsIdentical(cim, ci))
+    if (!XTensor::IsSameShaped(aim, ai) ||
+        !XTensor::IsSameShaped(bim, bi) ||
+        !XTensor::IsSameShaped(cim, ci))
     {
         isUniform = false;
         break;
......
@@ -27,14 +27,12 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
 /* matrix multiplication (BLAS) */
-extern "C"
 void _MatrixMULCPU(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB,
                    XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
 #ifdef USE_CUDA
 /* matrix multiplication via cuda version BLAS */
-extern "C"
 void _CudaBLASMatrixMUL(cublasHandle_t * handle,
                         const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
                         const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
@@ -42,7 +40,6 @@ void _CudaBLASMatrixMUL(cublasHandle_t * handle,
                         int na, int ma, int nb, int mb, int nc, int mc, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
 /* matrix multiplication in batch mode via cuda version BLAS */
-extern "C"
 void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
                                const void ** a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
                                const void ** b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
@@ -51,7 +48,6 @@ void _CudaBLASMatrixMULBatched(cublasHandle_t * handle,
                                DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
 /* matrix multiplication in batch and strided mode via cuda version BLAS */
-extern "C"
 void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
                                       const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, long long int strideA,
                                       const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, long long int strideB,
@@ -60,7 +56,6 @@ void _CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
                                       DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
 /* matrix multiplication in batch mode via cuda version BLAS */
-extern "C"
 void _CudaBLASMatrixMULList(cublasHandle_t * handle, const XList * a, MATRIX_TRANS_TYPE transposedA,
                             const XList * b, MATRIX_TRANS_TYPE transposedB, XList * c,
                             int count, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
......
@@ -27,14 +27,12 @@
 namespace nts{ // namespace nts(NiuTrans.Tensor)
 /* generate a tensor with selected data c = select(a) */
-extern "C"
 void _CudaSelect(const XTensor * a, XTensor * c, XTensor * indexCPU);
 /*
 generate a tensor with selected data in range[low,high] along the given dimension
 c = select(a)
 */
-extern "C"
 void _CudaSelectRange(const XTensor * a, XTensor * c, int dim, int low, int high);
 } // namespace nts(NiuTrans.Tensor)
......
@@ -27,7 +27,6 @@
 namespace nts{ // namespace nts(NiuTrans.Tensor)
 /* generate a tensor with selected data c = select(a) */
-extern "C"
 void _Select(const XTensor * a, XTensor * c, XTensor * indexCPU);
 /*
@@ -40,7 +39,6 @@ XTensor Select(const XTensor &a, XTensor &indexCPU);
 generate a tensor with selected data in range[low,high] along the given dimension
 c = select(a)
 */
-extern "C"
 void _SelectRange(const XTensor * a, XTensor * c, int dim, int low, int high);
 /*
@@ -52,4 +50,4 @@ XTensor SelectRange(const XTensor &a, int dim, int low, int high);
 } // namespace nts(NiuTrans.Tensor)
 #endif // __SELECT_H__
\ No newline at end of file
@@ -42,7 +42,7 @@ void _Log(const XTensor * a, XTensor * b)
     }
 #endif
-    CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
+    CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
     CheckNTErrors((a->dataType == DEFAULT_DTYPE), "TODO!");
     DTYPE * d = (DTYPE*)a->data;
     DTYPE * db = (DTYPE*)b->data;
......
@@ -60,10 +60,9 @@ set each entry to its log value
 >> a - input tensor
 >> b - output tensor
 */
-extern "C"
 void _CudaLog(const XTensor * a, XTensor * b)
 {
-    CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
+    CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
     CheckNTErrors((a->isSparse == false), "TODO!");
     int gridSize[3];
......
@@ -37,7 +37,6 @@ __global__
 void KernelLog(__half * a, __half * b, int size);
 /* set each entry to its log value */
-extern "C"
 void _CudaLog(const XTensor * a, XTensor * b);
 #endif // USE_CUDA
......
@@ -45,9 +45,9 @@ where a and b are the scalar and bias respectively, and \epsilon is the adjustme
 void _Normalize(const XTensor * input, XTensor * output, int dim, const XTensor * mean, const XTensor * var, const XTensor * a, const XTensor * b, DTYPE epsilon)
 {
     int dimRDI = input->order - dim - 1;
-    CheckNTErrors((XTensor::IsIdentical(input, output)), "Unmatched input tensors!");
-    CheckNTErrors((XTensor::IsIdentical(a, b)), "Unmatched input tensors");
-    CheckNTErrors((XTensor::IsIdentical(mean, var)), "Unmatched input tensors");
+    CheckNTErrors((XTensor::IsSameShaped(input, output)), "Unmatched input tensors!");
+    CheckNTErrors((XTensor::IsSameShaped(a, b)), "Unmatched input tensors");
+    CheckNTErrors((XTensor::IsSameShaped(mean, var)), "Unmatched input tensors");
     CheckNTErrors((input && output && mean && var && a && b), "Empty input tensors!");
     CheckNTErrors((dimRDI >= 0 && dimRDI < input->order), "Incorrect reduction dimension!");
     CheckNTErrors((dimRDI == a->order - 1), "Incorrect reduction dimension!");
......
@@ -44,8 +44,8 @@ where a and b are the scalar and bias respectively, and \epsilon is the adjustme
 */
 __global__
 void KernelNormalize(DTYPE * input, DTYPE * output, DTYPE * mean, DTYPE * var,
                      DTYPE * a, DTYPE * b, DTYPE epsilon,
                      int stride, int strideNum, int blockNum)
 {
     __shared__ DTYPE iMean[MAX_CUDA_THREAD_NUM_PER_BLOCK];
     __shared__ DTYPE iVar[MAX_CUDA_THREAD_NUM_PER_BLOCK];
@@ -88,11 +88,10 @@ where a and b are the scalar and bias respectively, and \epsilon is the adjustme
 >> b - the bias
 >> epsilon - a parameter
 */
-extern "C"
 void _CudaNormalize(const XTensor * input, XTensor * output, int dim,
                     const XTensor * mean, const XTensor * var,
                     const XTensor * a, const XTensor * b,
                     DTYPE epsilon)
 {
     CheckNTErrors((input->dataType == DEFAULT_DTYPE), "TODO!");
......
@@ -35,18 +35,17 @@ where a and b are the scalar and bias respectively, and \epsilon is the adjustme
 */
 __global__
 void KernelNormalize(DTYPE * input, DTYPE * output, DTYPE * mean, DTYPE * var,
                      DTYPE * a, DTYPE * b, DTYPE epsilon,
                      int stride, int strideNum, int blockNum);
 /*
 normalized the data with normal distribution. For an input x,
 y = a * (x-mean)/sqrt(variance+\epsilon) + b
 where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter
 */
-extern "C"
 void _CudaNormalize(const XTensor * input, XTensor * output, int dim,
                     const XTensor * mean, const XTensor * var,
                     const XTensor * a, const XTensor * b, DTYPE epsilon);
 #endif // USE_CUDA
......
@@ -31,7 +31,6 @@ normalized the data with normal distribution.
 For an input x, y = a * (x-mean)/sqrt(variance+\epsilon) + b
 where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
 */
-extern "C"
 void _Normalize(const XTensor * input, XTensor * output, int dim, const XTensor * mean, const XTensor * var, const XTensor * a, const XTensor * b, DTYPE epsilon);
 /*
@@ -40,7 +39,6 @@ keep the result in the input tenosr and return nothing
 For an input x, x = a * (x-mean)/sqrt(variance+\epsilon) + b
 where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
 */
-extern "C"
 void _NormalizeMe(XTensor * input, int dim, const XTensor * mean, const XTensor * var, const XTensor * a, const XTensor * b, DTYPE epsilon);
 /*
......
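
For readability, the normalization that _Normalize, _NormalizeMe and _CudaNormalize document in their comments, written as a single formula (mean \mu and variance \sigma^2 are taken along the dimension dim; a and b are the scale and bias tensors):

    y = a \cdot \frac{x - \mu}{\sqrt{\sigma^2 + \epsilon}} + b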
@@ -100,10 +100,9 @@ void KernelPower(__half * a, __half * b, __half p, int size)
 }
 /* get the power of the entries */
-extern "C"
 void _CudaPower(const XTensor * a, XTensor * b, DTYPE p)
 {
-    CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!");
+    CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
     int gridSize[3];
     int blockSize[3];
......
@@ -37,7 +37,6 @@ __global__
 void KernelSqrtV2(__half * a, __half * b, int size);
 /* get the power of the entries */
-extern "C"
 void _CudaPower(const XTensor * a, XTensor * b, DTYPE p);
 #endif // USE_CUDA
......
@@ -47,8 +47,7 @@ void _ScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift)
     }
 #endif
-    CheckNTErrors((a->dataType == DEFAULT_DTYPE),
-                  "The tensor is not in the default data type!");
+    CheckNTErrors((a->dataType == DEFAULT_DTYPE), "The tensor is not in the default data type!");
     /* sparse tensor */
     if(a->isSparse){
......
@@ -37,7 +37,6 @@ __global__
 void KernelScaleAndShift(__half * a, __half * b, int size, __half scale, __half shift);
 /* scale and shift all tensor entires b = a * scale + shift (cuda version) */
-extern "C"
 void _CudaScaleAndShift(const XTensor * a, XTensor * b, DTYPE scale, DTYPE shift);
 #endif // USE_CUDA
......
...@@ -86,7 +86,7 @@ void _CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum, ...@@ -86,7 +86,7 @@ void _CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum,
*/ */
for (int i = 0; i < blockNum; i++) { for (int i = 0; i < blockNum; i++) {
XMemCopy((char*)target + targetBlocks[i] * blockSize, devID, XMemCopy((char*)target + targetBlocks[i] * blockSize, devID,
(char*)source + sourceBlocks[i] * blockSize, devID, blockSize); (char*)source + sourceBlocks[i] * blockSize, devID, blockSize);
} }
} }
} }
......
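Note: the loop above moves blockNum fixed-size blocks from offsets sourceBlocks[i] to targetBlocks[i] via XMemCopy. On host memory the same pattern reduces to memcpy; the sketch below assumes both buffers live on the CPU and uses illustrative names.

#include <cstring>

// Copy blockNum blocks of blockSize bytes; block i is read from offset
// sourceBlocks[i] * blockSize and written to offset targetBlocks[i] * blockSize.
void copy_blocks_cpu(const void * source, int blockSize, const int * sourceBlocks,
                     int blockNum, void * target, const int * targetBlocks)
{
    for (int i = 0; i < blockNum; i++)
        std::memcpy((char*)target + targetBlocks[i] * blockSize,
                    (const char*)source + sourceBlocks[i] * blockSize,
                    blockSize);
}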
...@@ -39,7 +39,7 @@ Note that a grid may have a number of blocks ...@@ -39,7 +39,7 @@ Note that a grid may have a number of blocks
>> isIndexOnDev - indicates whether the index is on the device already >> isIndexOnDev - indicates whether the index is on the device already
*/ */
void _CopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, void * target, void _CopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, void * target,
int * index, int unitSize, bool isIndexOnDev, XMem * myMem) int * index, int unitSize, bool isIndexOnDev, XMem * myMem)
{ {
CheckNTErrors((unitSize == sizeof(int)), "TODO!"); CheckNTErrors((unitSize == sizeof(int)), "TODO!");
......
...@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/* copy data by index */ /* copy data by index */
extern "C"
void _CudaCopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, void * target, int * index, int unitSize, XMem * myMem); void _CudaCopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, void * target, int * index, int unitSize, XMem * myMem);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -27,7 +27,6 @@ ...@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy a number of blocks in grid */ /* copy a number of blocks in grid */
extern "C"
void _CopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, void * target, int * index, int unitSize, bool isIndexOnDev, XMem * myMem); void _CopyBlocksInGrid(void * source, int blockSize, int blockNum, int gridNum, void * target, int * index, int unitSize, bool isIndexOnDev, XMem * myMem);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -33,7 +33,6 @@ __global__ ...@@ -33,7 +33,6 @@ __global__
void KernelCopyBlocks(DTYPE * source, int blockSize, int blockNum, DTYPE * target, int * targetBlocks); void KernelCopyBlocks(DTYPE * source, int blockSize, int blockNum, DTYPE * target, int * targetBlocks);
/* copy a number of blocks to target positions (cuda version) */ /* copy a number of blocks to target positions (cuda version) */
extern "C"
void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem); void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -27,7 +27,6 @@ ...@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy a number of blocks to target positions (on site) */ /* copy a number of blocks to target positions (on site) */
extern "C"
void _CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem); void _CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target, int * targetBlocks, XMem * myMem);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -72,7 +72,7 @@ copy a number of blocks from source positions to target positions (cuda version) ...@@ -72,7 +72,7 @@ copy a number of blocks from source positions to target positions (cuda version)
*/ */
void _CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID) void _CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID)
{ {
CheckNTErrors((devID >= 0), "Wrong device to run!"); CheckNTErrors(devID >= 0, "Wrong device to run!");
CheckNTErrors((blockSize % sizeof(DTYPE) == 0), "Unsupported block size!"); CheckNTErrors((blockSize % sizeof(DTYPE) == 0), "Unsupported block size!");
/* copy the index to the GPU memory */ /* copy the index to the GPU memory */
......
...@@ -33,7 +33,6 @@ __global__ ...@@ -33,7 +33,6 @@ __global__
void KernelCopyBlocksSelected(DTYPE * source, int blockSize, int * sourceBlocks, int blockNum, DTYPE * target, int * targetBlocks); void KernelCopyBlocksSelected(DTYPE * source, int blockSize, int * sourceBlocks, int blockNum, DTYPE * target, int * targetBlocks);
/* copy a number of blocks from source positions to target positions (cuda version) */ /* copy a number of blocks from source positions to target positions (cuda version) */
extern "C"
void _CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID); void _CudaCopyBlocksSelected(void * source, int blockSize, int * sourceBlocks, int blockNum, void * target, int * targetBlocks, XMem * myMem, int devID);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -27,7 +27,6 @@ ...@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy data blocks by 2d layout */ /* copy data blocks by 2d layout */
extern "C"
void _CopyData2D(void ** s, int sPitch, void ** t, int tPitch, int count, int mSize, int n, XMem * myMem); void _CopyData2D(void ** s, int sPitch, void ** t, int tPitch, int count, int mSize, int n, XMem * myMem);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -38,7 +38,7 @@ in the k-th grid ...@@ -38,7 +38,7 @@ in the k-th grid
*/ */
void _CopyInGrid(const XTensor * s, XTensor * t, int * index, int blockDim, int blockNumInGrid, bool isIndexOnDev) void _CopyInGrid(const XTensor * s, XTensor * t, int * index, int blockDim, int blockNumInGrid, bool isIndexOnDev)
{ {
CheckNTErrors((XTensor::IsIdentical(s, t)), "Unmatched tensors!"); CheckNTErrors((XTensor::IsSameShaped(s, t)), "Unmatched tensors!");
int blockDimRDI = s->order - blockDim - 1; int blockDimRDI = s->order - blockDim - 1;
int blockSize = 1; int blockSize = 1;
......
...@@ -27,9 +27,8 @@ ...@@ -27,9 +27,8 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy a number of blocks in grid. i.e., reorder the data blocks in the same memory piece*/ /* copy a number of blocks in grid. i.e., reorder the data blocks in the same memory piece*/
extern "C"
void _CopyInGrid(const XTensor * s, XTensor * t, int * index, int blockDim, int blockNumInGrid, bool isIndexOnDev = false); void _CopyInGrid(const XTensor * s, XTensor * t, int * index, int blockDim, int blockNumInGrid, bool isIndexOnDev = false);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __COPYINGRID_H__ #endif // __COPYINGRID_H__
\ No newline at end of file
...@@ -44,7 +44,7 @@ void _CopyIndexed(const XTensor * s, XTensor * t, int dim, int * srcIndex, int i ...@@ -44,7 +44,7 @@ void _CopyIndexed(const XTensor * s, XTensor * t, int dim, int * srcIndex, int i
{ {
CheckNTErrors((s && t), "Invalid tensors!"); CheckNTErrors((s && t), "Invalid tensors!");
CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)), CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)),
"the data must be kept on the same device!"); "the data must be kept on the same device!");
CheckNTErrors((dim < s->order && dim < t->order), "A too large dimension specified!"); CheckNTErrors((dim < s->order && dim < t->order), "A too large dimension specified!");
CheckNTErrors((s->unitSize == t->unitSize), "Unmatched tensors!"); CheckNTErrors((s->unitSize == t->unitSize), "Unmatched tensors!");
......
...@@ -27,7 +27,6 @@ ...@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy selected sub-tensors */ /* copy selected sub-tensors */
extern "C"
void _CopyIndexed(const XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum); void _CopyIndexed(const XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum);
/* /*
......
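Note: _CopyIndexed above copies selected sub-tensors along a dimension, pairing srcIndex[i] with tgtIndex[i]. A hedged 2-D sketch of that idea, copying chosen columns of a row-major matrix; all names are illustrative and copyNum is simplified to 1:

// Copy column srcIndex[i] of s into column tgtIndex[i] of t, for i in [0, indexSize).
// s and t are row-major with 'rows' rows; s has sCols columns, t has tCols.
void copy_indexed_columns(const float * s, int rows, int sCols,
                          float * t, int tCols,
                          const int * srcIndex, const int * tgtIndex, int indexSize)
{
    for (int i = 0; i < indexSize; i++)
        for (int r = 0; r < rows; r++)
            t[r * tCols + tgtIndex[i]] = s[r * sCols + srcIndex[i]];
}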
...@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/* copy all elements from a source matrix to a target matrix */ /* copy all elements from a source matrix to a target matrix */
extern "C"
void _CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream = NULL); void _CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream = NULL);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -27,7 +27,6 @@ ...@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy s to t */ /* copy s to t */
extern "C"
void _CopyValues(const XTensor * s, XTensor * t, XStream * stream = NULL); void _CopyValues(const XTensor * s, XTensor * t, XStream * stream = NULL);
/* /*
...@@ -38,4 +37,4 @@ XTensor CopyValues(const XTensor &s, XStream * stream = NULL); ...@@ -38,4 +37,4 @@ XTensor CopyValues(const XTensor &s, XStream * stream = NULL);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __COPYVALUES_H__ #endif // __COPYVALUES_H__
\ No newline at end of file
...@@ -101,8 +101,8 @@ crossing of the i-th column and the j-th row. ...@@ -101,8 +101,8 @@ crossing of the i-th column and the j-th row.
*/ */
__global__ __global__
void KernelReduceMax(__half * input, __half * output, void KernelReduceMax(__half * input, __half * output,
int stride, int strideNum, int reducedStrideNum, int stride, int strideNum, int reducedStrideNum,
int blockSize, int blockNum) int blockSize, int blockNum)
{ {
int idx = threadIdx.x * blockDim.y + threadIdx.y; int idx = threadIdx.x * blockDim.y + threadIdx.y;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x; unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
...@@ -224,8 +224,8 @@ reduce a tensor to another that keeps the max value along a dimension - fast ve ...@@ -224,8 +224,8 @@ reduce a tensor to another that keeps the max value along a dimension - fast ve
*/ */
template <unsigned int goodSize> __global__ template <unsigned int goodSize> __global__
void KernelReduceMaxFast(__half * input, __half * output, void KernelReduceMaxFast(__half * input, __half * output,
int stride, int strideNum, int reducedStrideNum, int stride, int strideNum, int reducedStrideNum,
int blockSize, int blockNum) int blockSize, int blockNum)
{ {
unsigned int tid = threadIdx.y; unsigned int tid = threadIdx.y;
unsigned int j = blockIdx.y * (blockDim.y * 2) + threadIdx.y; unsigned int j = blockIdx.y * (blockDim.y * 2) + threadIdx.y;
......
...@@ -29,7 +29,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor) ...@@ -29,7 +29,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/* get the max-valued items along a dimension of the tensor (cuda version) */ /* get the max-valued items along a dimension of the tensor (cuda version) */
extern "C"
void _CudaReduceMax(const XTensor * input, XTensor * output, int dim); void _CudaReduceMax(const XTensor * input, XTensor * output, int dim);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -27,7 +27,6 @@ ...@@ -27,7 +27,6 @@
namespace nts{ // namespace nts(NiuTrans.Tensor) namespace nts{ // namespace nts(NiuTrans.Tensor)
/* get the max value of the items along a dimension of the tensor. */ /* get the max value of the items along a dimension of the tensor. */
extern "C"
void _ReduceMax(const XTensor * input, XTensor * output, int dim); void _ReduceMax(const XTensor * input, XTensor * output, int dim);
/* /*
...@@ -38,4 +37,4 @@ XTensor ReduceMax(const XTensor &input, int dim); ...@@ -38,4 +37,4 @@ XTensor ReduceMax(const XTensor &input, int dim);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __REDUCEMAX_H__ #endif // __REDUCEMAX_H__
\ No newline at end of file
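Note: KernelReduceMax above reduces one dimension of a tensor viewed as blocks of stride * strideNum elements. The CPU sketch below shows only the basic idea, assuming element (block k, position j along the reduced axis, offset i within the stride) is stored at k * stride * strideNum + j * stride + i; it is not the kernel's actual parallel reduction strategy.

#include <cfloat>

// Hedged CPU sketch of a max-reduction over the middle axis of the assumed layout.
void reduce_max_cpu(const float * input, float * output,
                    int stride, int strideNum, int blockNum)
{
    for (int k = 0; k < blockNum; k++) {
        for (int i = 0; i < stride; i++) {
            float m = -FLT_MAX;
            for (int j = 0; j < strideNum; j++) {
                float v = input[(k * strideNum + j) * stride + i];
                if (v > m)
                    m = v;
            }
            output[k * stride + i] = m;   // one max per (block, stride offset) pair
        }
    }
}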
...@@ -30,7 +30,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor) ...@@ -30,7 +30,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
get the mean value along a dimension of the tensor get the mean value along a dimension of the tensor
For a 1-dimensional data array a, mean = (1/n) * sum_i input_i For a 1-dimensional data array a, mean = (1/n) * sum_i input_i
*/ */
extern "C"
void _ReduceMean(const XTensor * input, XTensor * output, int dim); void _ReduceMean(const XTensor * input, XTensor * output, int dim);
/* /*
...@@ -42,4 +41,4 @@ XTensor ReduceMean(const XTensor &input, int dim); ...@@ -42,4 +41,4 @@ XTensor ReduceMean(const XTensor &input, int dim);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __REDUCEMEAN_H__ #endif // __REDUCEMEAN_H__
\ No newline at end of file
...@@ -31,7 +31,6 @@ standard variance of the items along a dimension of the tensor ...@@ -31,7 +31,6 @@ standard variance of the items along a dimension of the tensor
For a 1-dimensional data array a, For a 1-dimensional data array a,
variance = (1/n * \sum_i (a_i - mean)^2)^0.5 variance = (1/n * \sum_i (a_i - mean)^2)^0.5
*/ */
extern "C"
void _ReduceStandardVariance(XTensor * input, XTensor * output, int dim, XTensor * mean); void _ReduceStandardVariance(XTensor * input, XTensor * output, int dim, XTensor * mean);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
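Note: the comments above give the 1-D formulas: mean = (1/n) * sum_i a_i, and the "standard variance" (the square root of the variance) = (1/n * \sum_i (a_i - mean)^2)^0.5. A small self-contained sketch of both; function names are illustrative:

#include <cmath>

// mean = (1/n) * sum_i a_i
float mean_1d(const float * a, int n)
{
    float sum = 0.0f;
    for (int i = 0; i < n; i++)
        sum += a[i];
    return sum / n;
}

// (1/n * sum_i (a_i - mean)^2)^0.5, matching the comment above _ReduceStandardVariance
float standard_variance_1d(const float * a, int n, float mean)
{
    float sum = 0.0f;
    for (int i = 0; i < n; i++)
        sum += (a[i] - mean) * (a[i] - mean);
    return std::sqrt(sum / n);
}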
...@@ -43,22 +43,20 @@ sum = \sum_i exp((a_i - shift)^power) if isExp == true ...@@ -43,22 +43,20 @@ sum = \sum_i exp((a_i - shift)^power) if isExp == true
void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor * shift, DTYPE power, bool isExp) void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor * shift, DTYPE power, bool isExp)
{ {
CheckNTErrors((input->devID == output->devID || (input->devID < 0 && output->devID < 0)), CheckNTErrors((input->devID == output->devID || (input->devID < 0 && output->devID < 0)),
"This code must be run on the same device!"); "This code must be run on the same device!");
CheckNTErrors((input && output), "Empty input or output tensors!"); CheckNTErrors((input && output), "Empty input or output tensors!");
CheckNTErrors((input->order == output->order + 1), "Incorrect tensor sizes!"); CheckNTErrors((input->order == output->order + 1), "Incorrect tensor sizes!");
CheckNTErrors((input->order > dim && dim >=0), "Illegal dimension to reduce!"); CheckNTErrors((input->order > dim && dim >=0), "Illegal dimension to reduce!");
CheckNTErrors((input->dataType == output->dataType), "Unmatched data types!"); CheckNTErrors((input->dataType == output->dataType), "Unmatched data types!");
CheckNTErrors((shift == NULL || XTensor::IsIdentical(output, shift)), "Incorrect shift tensor size!"); CheckNTErrors((shift == NULL || XTensor::IsSameShaped(output, shift)), "Incorrect shift tensor size!");
int dimRDI = input->order - dim - 1; int dimRDI = input->order - dim - 1;
for(int i = 0; i < input->order; i++){ for(int i = 0; i < input->order; i++){
if(i < dimRDI){ if(i < dimRDI){
CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i]), CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i]), "Unmatched tensors!");
"Unmatched tensors!");
} }
else if(i > dimRDI){ else if(i > dimRDI){
CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i - 1]), CheckNTErrors((input->dimSizeRDI[i] == output->dimSizeRDI[i - 1]), "Unmatched tensors!");
"Unmatched tensors!");
} }
} }
......
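Note: per the hunk header above, each reduced slice of _ReduceSum is sum = \sum_i (a_i - shift)^power, with exp() applied to each term when isExp is true (the header comments write the shift with different signs; this sketch follows the subtraction form). A 1-D illustration with an assumed name:

#include <cmath>

// One reduced slice in 1-D form: sum_i f((a_i - shift)^power), f = exp when isExp.
float reduce_sum_1d(const float * a, int n, float shift, float power, bool isExp)
{
    float sum = 0.0f;
    for (int i = 0; i < n; i++) {
        float v = std::pow(a[i] - shift, power);
        sum += isExp ? std::exp(v) : v;
    }
    return sum;
}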
...@@ -34,7 +34,6 @@ For a 1-dimensional data array a, ...@@ -34,7 +34,6 @@ For a 1-dimensional data array a,
sum = \sum_i ((a_i + shift)^power) if isExp == false sum = \sum_i ((a_i + shift)^power) if isExp == false
sum = \sum_i exp((a_i + shift)^power) if isExp == true sum = \sum_i exp((a_i + shift)^power) if isExp == true
*/ */
extern "C"
void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor * shift, DTYPE power, bool isExp); void _CudaReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor * shift, DTYPE power, bool isExp);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -32,7 +32,6 @@ For a 1-dimensional data array a, ...@@ -32,7 +32,6 @@ For a 1-dimensional data array a,
sum = \sum_i (a_i - shift) if isExp == false sum = \sum_i (a_i - shift) if isExp == false
sum = \sum_i exp(a_i - shift) if isExp == true sum = \sum_i exp(a_i - shift) if isExp == true
*/ */
extern "C"
void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor * shift = NULL, void _ReduceSum(const XTensor * input, XTensor * output, int dim, const XTensor * shift = NULL,
DTYPE power = (DTYPE)1.0F, bool isExp = false); DTYPE power = (DTYPE)1.0F, bool isExp = false);
...@@ -56,4 +55,4 @@ XTensor ReduceSum(const XTensor &input, int dim, DTYPE power = (DTYPE)1.0F, bool ...@@ -56,4 +55,4 @@ XTensor ReduceSum(const XTensor &input, int dim, DTYPE power = (DTYPE)1.0F, bool
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __REDUCESUM_H__ #endif // __REDUCESUM_H__
\ No newline at end of file
...@@ -31,7 +31,6 @@ squared sum of the items along a dimension of the tensor ...@@ -31,7 +31,6 @@ squared sum of the items along a dimension of the tensor
For a 1-dimensional data array a, For a 1-dimensional data array a,
sum = \sum_i (a_i - shift)^2 sum = \sum_i (a_i - shift)^2
*/ */
extern "C"
void _ReduceSumSquared(const XTensor * input, XTensor * output, int dim, const XTensor * shift); void _ReduceSumSquared(const XTensor * input, XTensor * output, int dim, const XTensor * shift);
/* /*
......
...@@ -30,7 +30,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor) ...@@ -30,7 +30,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
variance of the items along a dimension of the tensor variance of the items along a dimension of the tensor
For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2 For a 1-dimensional data array a, variance = 1/n * \sum_i (a_i - mean)^2
*/ */
extern "C"
void _ReduceVariance(const XTensor * input, XTensor * output, int dim, const XTensor * mean); void _ReduceVariance(const XTensor * input, XTensor * output, int dim, const XTensor * mean);
/* /*
......
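Note: the two reductions above are closely related: the squared sum is \sum_i (a_i - shift)^2, and the variance divides the same quantity (with shift = mean) by n. A 1-D sketch with illustrative names:

// sum = \sum_i (a_i - shift)^2
float reduce_sum_squared_1d(const float * a, int n, float shift)
{
    float sum = 0.0f;
    for (int i = 0; i < n; i++)
        sum += (a[i] - shift) * (a[i] - shift);
    return sum;
}

// variance = (1/n) * \sum_i (a_i - mean)^2
float reduce_variance_1d(const float * a, int n, float mean)
{
    return reduce_sum_squared_1d(a, n, mean) / n;
}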
...@@ -44,7 +44,7 @@ void _Concatenate(const XList * smalls, XTensor * big, int dim) ...@@ -44,7 +44,7 @@ void _Concatenate(const XList * smalls, XTensor * big, int dim)
XTensor * a = (XTensor*)smalls->GetItem(i - 1); XTensor * a = (XTensor*)smalls->GetItem(i - 1);
XTensor * b = (XTensor*)smalls->GetItem(i); XTensor * b = (XTensor*)smalls->GetItem(i);
CheckNTErrors((a && b), "Empty input tensors!"); CheckNTErrors((a && b), "Empty input tensors!");
if (!XTensor::IsIdentical(a, b)) if (!XTensor::IsSameShaped(a, b))
uniform = false; uniform = false;
} }
...@@ -76,7 +76,7 @@ XTensor Concatenate(const XList &smalls, int dim) ...@@ -76,7 +76,7 @@ XTensor Concatenate(const XList &smalls, int dim)
XTensor * a = (XTensor*)smalls.GetItem(i - 1); XTensor * a = (XTensor*)smalls.GetItem(i - 1);
XTensor * b = (XTensor*)smalls.GetItem(i); XTensor * b = (XTensor*)smalls.GetItem(i);
CheckNTErrors((a && b), "Empty input tensors!"); CheckNTErrors((a && b), "Empty input tensors!");
if (!XTensor::IsIdentical(a, b)) if (!XTensor::IsSameShaped(a, b))
uniform = false; uniform = false;
} }
XTensor * tensor = (XTensor*)smalls.GetItem(0); XTensor * tensor = (XTensor*)smalls.GetItem(0);
...@@ -177,7 +177,7 @@ XTensor Concatenate(const XTensor &smallA, const XTensor &smallB, int dim) ...@@ -177,7 +177,7 @@ XTensor Concatenate(const XTensor &smallA, const XTensor &smallB, int dim)
XTensor * a = (XTensor*)smalls.Get(i - 1); XTensor * a = (XTensor*)smalls.Get(i - 1);
XTensor * b = (XTensor*)smalls.Get(i); XTensor * b = (XTensor*)smalls.Get(i);
CheckNTErrors((a && b), "Empty input tensors!"); CheckNTErrors((a && b), "Empty input tensors!");
if (!XTensor::IsIdentical(a, b)) if (!XTensor::IsSameShaped(a, b))
uniform = false; uniform = false;
} }
XTensor * tensor = (XTensor*)smalls.Get(0); XTensor * tensor = (XTensor*)smalls.Get(0);
......
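Note: the uniformity loops above only need to know whether every pair of inputs has the same shape, which is why IsIdentical is renamed to the more precise IsSameShaped throughout this commit. A hedged sketch of such a shape check on a simplified struct (the struct below is illustrative, not the XTensor definition):

// Minimal stand-in for a tensor's shape description.
struct ShapeInfo {
    int order;          // number of dimensions
    int dimSize[8];     // size of each dimension (first 'order' entries are valid)
};

// True when both tensors have the same order and the same size in every dimension.
bool IsSameShapedSketch(const ShapeInfo & a, const ShapeInfo & b)
{
    if (a.order != b.order)
        return false;
    for (int i = 0; i < a.order; i++)
        if (a.dimSize[i] != b.dimSize[i])
            return false;
    return true;
}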
...@@ -27,7 +27,6 @@ ...@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* concatenate a list of tensors along a given dimension */ /* concatenate a list of tensors along a given dimension */
extern "C"
void _ConcatenateSolely(const XList * smalls, XTensor * big, int dim); void _ConcatenateSolely(const XList * smalls, XTensor * big, int dim);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -69,7 +69,6 @@ set target data block index for the data movement in split ...@@ -69,7 +69,6 @@ set target data block index for the data movement in split
>> gridNum - number of grids >> gridNum - number of grids
>> mem - the memory pool >> mem - the memory pool
*/ */
extern "C"
void _CudaMakeMergeBlockIndex(int devID, void _CudaMakeMergeBlockIndex(int devID,
int * blockIndex, int blockNum, int blockNumInMerge, int * blockIndex, int blockNum, int blockNumInMerge,
int splitSizeInGrid, int gridSize, int gridNum) int splitSizeInGrid, int gridSize, int gridNum)
......
...@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/* set target data block index for the data movement in split */ /* set target data block index for the data movement in split */
extern "C"
void _CudaMakeMergeBlockIndex(int devID, int * blockIndex, int blockNum, int blockNumInMerge, void _CudaMakeMergeBlockIndex(int devID, int * blockIndex, int blockNum, int blockNumInMerge,
int splitSizeInGrid, int gridSize, int gridNum); int splitSizeInGrid, int gridSize, int gridNum);
......
...@@ -27,7 +27,6 @@ ...@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* set target data block index for the data movement in merge */ /* set target data block index for the data movement in merge */
extern "C"
void _MakeMergeBlockIndex(int * blockIndex, int blockNum, int blockNumInMerge, void _MakeMergeBlockIndex(int * blockIndex, int blockNum, int blockNumInMerge,
int splitSizeInGrid, int gridSize, int gridNum, XMem * mem); int splitSizeInGrid, int gridSize, int gridNum, XMem * mem);
......
...@@ -57,7 +57,6 @@ set target data block index for the data movement in split ...@@ -57,7 +57,6 @@ set target data block index for the data movement in split
>> blockSplitSize - size of the splitted block >> blockSplitSize - size of the splitted block
>> blockNum - number of data blocks >> blockNum - number of data blocks
*/ */
extern "C"
void _CudaMakeSplitBlockIndex(int devID, int * blockIndex, int splitNum, int blockSplitSize, int blockNum) void _CudaMakeSplitBlockIndex(int devID, int * blockIndex, int splitNum, int blockSplitSize, int blockNum)
{ {
int cudaGrids[3]; int cudaGrids[3];
......
...@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/* set target data block index for the data movement in split */ /* set target data block index for the data movement in split */
extern "C"
void _CudaMakeSplitBlockIndex(int devID, int * blockIndex, int splitNum, int blockSplitSize, int blockNum); void _CudaMakeSplitBlockIndex(int devID, int * blockIndex, int splitNum, int blockSplitSize, int blockNum);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -27,7 +27,6 @@ ...@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* set target data block index for the data movement in split */ /* set target data block index for the data movement in split */
extern "C"
void _MakeSplitBlockIndex(int * blockIndex, int splitNum, int blockSplitSize, int blockNum, XMem * mem); void _MakeSplitBlockIndex(int * blockIndex, int splitNum, int blockSplitSize, int blockNum, XMem * mem);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -49,7 +49,7 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim) ...@@ -49,7 +49,7 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
CheckNTErrors((s != NULL && t != NULL), "Invalid tensors!"); CheckNTErrors((s != NULL && t != NULL), "Invalid tensors!");
CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)), CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)),
"the data must be kept on the same device!"); "the data must be kept on the same device!");
CheckNTErrors((s->unitNum == t->unitNum && s->unitSize == t->unitSize), "Unmatched tensors!"); CheckNTErrors((s->unitNum == t->unitNum && s->unitSize == t->unitSize), "Unmatched tensors!");
CheckNTErrors((s->order == t->order + 1), "Unmatched tensors!"); CheckNTErrors((s->order == t->order + 1), "Unmatched tensors!");
...@@ -58,11 +58,11 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim) ...@@ -58,11 +58,11 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
for (int i = 0; i < s->order; i++) { for (int i = 0; i < s->order; i++) {
if (i == whereToMergeRDI) { if (i == whereToMergeRDI) {
CheckNTErrors((t->dimSizeRDI[i] == s->dimSizeRDI[i] * s->dimSizeRDI[leadingDimRDI]), CheckNTErrors((t->dimSizeRDI[i] == s->dimSizeRDI[i] * s->dimSizeRDI[leadingDimRDI]),
"Unmatched tensor sizes!"); "Unmatched tensor sizes!");
} }
else if (i > leadingDimRDI) { else if (i > leadingDimRDI) {
CheckNTErrors((s->dimSizeRDI[i - 1] == t->dimSizeRDI[i]), CheckNTErrors((s->dimSizeRDI[i - 1] == t->dimSizeRDI[i]),
"Unmatched tensor sizes!"); "Unmatched tensor sizes!");
} }
} }
...@@ -99,8 +99,7 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim) ...@@ -99,8 +99,7 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
char * sData = (char*)s->data + g * blockSize * blockNum * s->unitSize; char * sData = (char*)s->data + g * blockSize * blockNum * s->unitSize;
for (int k = 0; k < mergedNum; k++) { for (int k = 0; k < mergedNum; k++) {
XMemCopy2D(tData + k * tStep, tPtich, t->devID, XMemCopy2D(tData + k * tStep, tPtich, t->devID,
sData + k * sStep, sPitch, s->devID, sData + k * sStep, sPitch, s->devID, mSize, n);
mSize, n);
} }
} }
} }
...@@ -356,7 +355,7 @@ merge two tensors into a big tensor (return a XTensor structure) ...@@ -356,7 +355,7 @@ merge two tensors into a big tensor (return a XTensor structure)
*/ */
XTensor Merge(const XTensor &smallA, const XTensor &smallB, int whereToMerge) XTensor Merge(const XTensor &smallA, const XTensor &smallB, int whereToMerge)
{ {
CheckNTErrors(XTensor::IsIdentical(&smallA, &smallB), CheckNTErrors(XTensor::IsSameShaped(&smallA, &smallB),
"The two tensors must be of the same size!"); "The two tensors must be of the same size!");
int order = smallA.order; int order = smallA.order;
......
...@@ -71,7 +71,6 @@ merge data by blocks (cuda version) ...@@ -71,7 +71,6 @@ merge data by blocks (cuda version)
>> target - target data array >> target - target data array
>> myMem - the memory pool >> myMem - the memory pool
*/ */
extern "C"
void _CudaMergeBlockLists(const XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem) void _CudaMergeBlockLists(const XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem)
{ {
CheckNTErrors((myMem != NULL), "No memory pool!"); CheckNTErrors((myMem != NULL), "No memory pool!");
......
...@@ -33,7 +33,6 @@ __global__ ...@@ -33,7 +33,6 @@ __global__
void KernelCopyBlockLists(DTYPE ** sourceList, int * sourceBlockSizes, int sourceBlockNum, DTYPE ** targetList); void KernelCopyBlockLists(DTYPE ** sourceList, int * sourceBlockSizes, int sourceBlockNum, DTYPE ** targetList);
/* merge data by blocks (cuda version) */ /* merge data by blocks (cuda version) */
extern "C"
void _CudaMergeBlockLists(const XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem); void _CudaMergeBlockLists(const XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -27,7 +27,6 @@ ...@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* merge data by blocks */ /* merge data by blocks */
extern "C"
void _MergeBlockLists(const XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem); void _MergeBlockLists(const XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -41,7 +41,7 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum) ...@@ -41,7 +41,7 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
{ {
CheckNTErrors((s && t), "Invalid tensors!"); CheckNTErrors((s && t), "Invalid tensors!");
CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)), CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)),
"the data must be kept on the same device!"); "the data must be kept on the same device!");
CheckNTErrors((s->unitNum == t->unitNum && s->unitSize == t->unitSize), "Unmatched tensors!"); CheckNTErrors((s->unitNum == t->unitNum && s->unitSize == t->unitSize), "Unmatched tensors!");
CheckNTErrors((s->order == t->order - 1), "Unmatched tensors!"); CheckNTErrors((s->order == t->order - 1), "Unmatched tensors!");
...@@ -51,11 +51,11 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum) ...@@ -51,11 +51,11 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
for (int i = 0; i < s->order; i++) { for (int i = 0; i < s->order; i++) {
if (i == whereToSplitRDI) { if (i == whereToSplitRDI) {
CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i] * splitNum), CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i] * splitNum),
"Unmatched tensor sizes!"); "Unmatched tensor sizes!");
} }
else { else {
CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i]), CheckNTErrors((s->dimSizeRDI[i] == t->dimSizeRDI[i]),
"Unmatched tensor sizes!"); "Unmatched tensor sizes!");
} }
} }
...@@ -301,7 +301,7 @@ void Split(const XTensor &big, XList &smalls, int whereToSplit, int splitNum) ...@@ -301,7 +301,7 @@ void Split(const XTensor &big, XList &smalls, int whereToSplit, int splitNum)
XLink::AddParamToHeadInt(s, whereToSplit); XLink::AddParamToHeadInt(s, whereToSplit);
/* it is tricky here that we keep the id of each /* it is tricky here that we keep the id of each
block, rather than the total number of splits */ block, rather than the total number of the splits */
XLink::AddParamToHeadInt(s, i); XLink::AddParamToHeadInt(s, i);
} }
} }
......
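Note: _Merge glues the blocks of one dimension together along whereToMerge, turning an order-n tensor into an order-(n-1) tensor, and _Split is its inverse. The sketch below shows only the simplest row-major 3-D/2-D case, merging a [splitNum, m, n] array into [m, splitNum * n]; it may not reproduce the library's exact block ordering, and all names are illustrative.

// Merge src of shape [splitNum, m, n] (row-major) into dst of shape [m, splitNum * n],
// so that block s ends up in columns [s * n, (s + 1) * n) of dst.
void merge_3d_to_2d(const float * src, float * dst, int splitNum, int m, int n)
{
    for (int s = 0; s < splitNum; s++)
        for (int i = 0; i < m; i++)
            for (int j = 0; j < n; j++)
                dst[i * (splitNum * n) + s * n + j] = src[(s * m + i) * n + j];
}

// The matching split runs the same index mapping in the other direction.
void split_2d_to_3d(const float * src, float * dst, int splitNum, int m, int n)
{
    for (int s = 0; s < splitNum; s++)
        for (int i = 0; i < m; i++)
            for (int j = 0; j < n; j++)
                dst[(s * m + i) * n + j] = src[i * (splitNum * n) + s * n + j];
}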
...@@ -66,7 +66,6 @@ insert a dimension by copying the blocks for x times (where x is the size of the ...@@ -66,7 +66,6 @@ insert a dimension by copying the blocks for x times (where x is the size of the
>> dim - where to insert the dimension >> dim - where to insert the dimension
>> dSize - size of the newly-inserted dimension >> dSize - size of the newly-inserted dimension
*/ */
extern "C"
void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize) void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
{ {
int blockSize = 1; int blockSize = 1;
......
...@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/* duplicate the data along a given dimension */ /* duplicate the data along a given dimension */
extern "C"
void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize); void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -27,7 +27,6 @@ ...@@ -27,7 +27,6 @@
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* insert a dimension by copying the blocks for x times (where x is the size of the inserted dimension) */ /* insert a dimension by copying the blocks for x times (where x is the size of the inserted dimension) */
extern "C"
void _Unsqueeze(const XTensor * a, XTensor * b, int dim, int dSize); void _Unsqueeze(const XTensor * a, XTensor * b, int dim, int dSize);
/* /*
...@@ -39,4 +38,4 @@ XTensor Unsqueeze(const XTensor &a, int dim, int dSize); ...@@ -39,4 +38,4 @@ XTensor Unsqueeze(const XTensor &a, int dim, int dSize);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __UNSQUEEZE_H__ #endif // __UNSQUEEZE_H__
\ No newline at end of file
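Note: _Unsqueeze inserts a new dimension of size dSize by repeating data blocks. Assuming a row-major layout, each contiguous block of elements that sits after the insertion point is written dSize times in a row; the sketch below expresses exactly that, with illustrative names:

// blockNum  = product of the dimension sizes before the insertion point
// blockSize = product of the dimension sizes after it (1 when inserting last)
void unsqueeze_cpu(const float * input, float * output,
                   int blockNum, int blockSize, int dSize)
{
    for (int b = 0; b < blockNum; b++)
        for (int d = 0; d < dSize; d++)
            for (int i = 0; i < blockSize; i++)
                output[(b * dSize + d) * blockSize + i] = input[b * blockSize + i];
}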
...@@ -20,6 +20,7 @@ ...@@ -20,6 +20,7 @@
*/ */
#include "../../XTensor.h" #include "../../XTensor.h"
#include "../movement/CopyValues.h"
#include "../../XUtility.h" #include "../../XUtility.h"
#include "../../XName.h" #include "../../XName.h"
#include "Sort.h" #include "Sort.h"
...@@ -36,7 +37,7 @@ sort the tensor along a given dimension ...@@ -36,7 +37,7 @@ sort the tensor along a given dimension
*/ */
void _Sort(const XTensor * a, XTensor * b, XTensor * index, int dim) void _Sort(const XTensor * a, XTensor * b, XTensor * index, int dim)
{ {
CheckNTErrors((XTensor::IsIdentical(a, b)), "Input tensors should have the same type!"); CheckNTErrors((XTensor::IsSameShaped(a, b)), "Input tensors should have the same type!");
CheckNTErrors((dim >= 0 && dim < a->order), "Incorrect dimension specified!"); CheckNTErrors((dim >= 0 && dim < a->order), "Incorrect dimension specified!");
CheckNTErrors((a->order == index->order), "Unmatched input tensors!"); CheckNTErrors((a->order == index->order), "Unmatched input tensors!");
CheckNTErrors((index->dataType == X_INT), "Wrong data type!"); CheckNTErrors((index->dataType == X_INT), "Wrong data type!");
...@@ -63,15 +64,15 @@ void _Sort(const XTensor * a, XTensor * b, XTensor * index, int dim) ...@@ -63,15 +64,15 @@ void _Sort(const XTensor * a, XTensor * b, XTensor * index, int dim)
blockNum *= a->dimSizeRDI[i]; blockNum *= a->dimSizeRDI[i];
int blockSize = stride * strideNum; int blockSize = stride * strideNum;
_CopyValues(a, b);
for (int k = 0; k < blockNum; k++) { for (int k = 0; k < blockNum; k++) {
for (int i = 0; i < stride; i++) { for (int i = 0; i < stride; i++) {
void * dataA = (char*)a->data + (k * blockSize + i) * a->unitSize;
void * dataB = (char*)b->data + (k * blockSize + i) * b->unitSize; void * dataB = (char*)b->data + (k * blockSize + i) * b->unitSize;
void * indexData = (char*)index->data + (k * blockSize + i) * sizeof(int); void * indexData = (char*)index->data + (k * blockSize + i) * sizeof(int);
/* we sort the data array along "dim" */ /* we sort the data array along "dim" */
if (a->dataType == X_FLOAT) if (a->dataType == X_FLOAT)
XQSort(dataA, dataB, indexData, strideNum, a->unitSize, stride, CompXFloat); XQSort(dataB, indexData, strideNum, a->unitSize, stride, CompXFloat);
else { else {
ShowNTErrors("TODO!"); ShowNTErrors("TODO!");
} }
......
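Note: the change above first copies a into b with _CopyValues and then lets XQSort reorder b (and the index tensor) in place, so the source tensor is never modified. A hedged standalone sketch of sorting one slice together with its indices, using std::sort instead of XQSort (descending order here for illustration; the actual order is whatever CompXFloat defines):

#include <algorithm>
#include <numeric>
#include <vector>

// Sort 'values' in descending order and keep 'indices' aligned with it,
// mirroring the copy-then-sort-in-place flow of the new _Sort.
void sort_slice_with_index(std::vector<float> & values, std::vector<int> & indices)
{
    indices.resize(values.size());
    std::iota(indices.begin(), indices.end(), 0);            // 0, 1, 2, ...
    std::sort(indices.begin(), indices.end(),
              [&](int x, int y) { return values[x] > values[y]; });

    std::vector<float> sorted(values.size());
    for (size_t i = 0; i < indices.size(); i++)
        sorted[i] = values[indices[i]];
    values.swap(sorted);                                      // values now descending
}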
...@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/* sort the tensor along a given dimension */ /* sort the tensor along a given dimension */
extern "C"
void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * indexB, int dim, int k = -1); void _CudaSortBig(const XTensor * a, XTensor * b, XTensor * indexA, XTensor * indexB, int dim, int k = -1);
#endif // USE_CUDA #endif // USE_CUDA
......
...@@ -39,7 +39,6 @@ void _SortMe(XTensor * a, XTensor * index, int dim); ...@@ -39,7 +39,6 @@ void _SortMe(XTensor * a, XTensor * index, int dim);
sort the data along a given dimension (return a XTensor structure) sort the data along a given dimension (return a XTensor structure)
make a new tensor to keep the result and return it make a new tensor to keep the result and return it
*/ */
extern "C"
void Sort(XTensor & a, XTensor & b, XTensor & index, int dim); void Sort(XTensor & a, XTensor & b, XTensor & index, int dim);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor) ...@@ -29,7 +29,6 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/* get the top-k items along a given dimension */ /* get the top-k items along a given dimension */
extern "C"
void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k); void _CudaTopK(const XTensor * a, XTensor * b, XTensor * index, int dim, int k);
#endif // USE_CUDA #endif // USE_CUDA
......