Commit 414ff54f by liyinqiao

1. update with master; 2. class core src

parent 0887fae1
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-04
*/
#include <stdio.h>
#include "XLink.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* storage size (in bytes) reserved for each parameter slot; AddParam()
   writes every parameter into a fixed-size slot of this width */
int XLink::paramSize = 64;
/* constructor: start from an empty hyperedge with no head, tails or parameters */
XLink::XLink()
{
    tailNum = 0;
    paramNum = 0;
    head = NULL;
    tails = NULL;
    params = NULL;
    type[0] = '\0';
}
/* destructor: release the tail array and the raw parameter buffer */
XLink::~XLink()
{
    delete[] tails;
    char * buf = (char*)params;
    delete[] buf;
}
/* reset the link to the empty state, releasing any memory it owns */
void XLink::Reset()
{
    delete[] tails;
    tails = NULL;
    char * buf = (char*)params;
    delete[] buf;
    params = NULL;
    head = NULL;
    tailNum = 0;
    paramNum = 0;
    type[0] = '\0';
}
/*
set edge type name
>> typeName - type name in string
*/
void XLink::SetType(const char * typeName)
{
type[0] = 0;
if(typeName == NULL)
return;
strcpy(type, typeName);
}
/*
set head
>> h - pointer to the head tensor (the output of the operation);
       the link does not take ownership of the tensor
*/
void XLink::SetHead(XTensor * h)
{
head = h;
}
/*
add a tail
grows the tail array by one slot and appends t at the end
>> t - pointer to the tail tensor
*/
void XLink::AddTail(XTensor * t)
{
    XTensor ** enlarged = new XTensor*[tailNum + 1];
    for(int i = 0; i < tailNum; i++)
        enlarged[i] = tails[i];
    enlarged[tailNum] = t;
    delete[] tails;
    tails = enlarged;
    tailNum += 1;
}
/*
add two tails in one time
grows the tail array by two slots and appends t1 then t2
>> t1 - pointer to the tail tensor
>> t2 - pointer to another tail tensor
*/
void XLink::AddTwoTails(XTensor * t1, XTensor * t2)
{
    XTensor ** enlarged = new XTensor*[tailNum + 2];
    for(int i = 0; i < tailNum; i++)
        enlarged[i] = tails[i];
    enlarged[tailNum] = t1;
    enlarged[tailNum + 1] = t2;
    delete[] tails;
    tails = enlarged;
    tailNum += 2;
}
/*
add a parameter
>> param - parameter in default type
   NOTE(review): assumes sizeof(DTYPE) <= paramSize — confirm where DTYPE is defined
*/
void XLink::AddParam(DTYPE param)
{
    void * ps = params;
    /* bug fix: the old code allocated only (paramNum + 1) bytes, but each
       parameter occupies a paramSize-byte slot, so the memcpy/store below
       overflowed the buffer; allocate (paramNum + 1) * paramSize bytes */
    params = new char[(paramNum + 1) * paramSize];
    memcpy(params, ps, paramNum * paramSize);
    DTYPE * p = (DTYPE*)((char*)params + paramNum * paramSize);
    *p = param;
    paramNum++;
    delete[] (char*)ps;
}
/*
add a parameter
>> param - pointer to the parameter
>> size - size of the parameter in bytes
   NOTE(review): assumes size <= paramSize — confirm at the call sites
*/
void XLink::AddParam(void * param, int size)
{
    void * ps = params;
    /* bug fix: the old code allocated only (paramNum + 1) bytes, but each
       parameter occupies a paramSize-byte slot, so the copies below
       overflowed the buffer; allocate (paramNum + 1) * paramSize bytes */
    params = new char[(paramNum + 1) * paramSize];
    memcpy(params, ps, paramNum * paramSize);
    char * p = (char*)params + paramNum * paramSize;
    memcpy(p, param, size);
    paramNum++;
    delete[] (char*)ps;
}
/*
create a hyperedge with two input tensors and an output tensor
>> t1 - a tail tensor
>> t2 - another tail tensor (may be NULL for unary operations)
>> h - head tensor
>> typeName - name of edge type
*/
void XLink::MakeLink(XTensor * t1, XTensor * t2, XTensor * h, const char * typeName)
{
    /* bug fix: the guard was inverted (h != NULL), which returned for every
       valid head and dereferenced a NULL head otherwise */
    if(h == NULL)
        return;

    /* forward: record which tensors were used to produce h */
    XLink &income = h->income;
    income.Reset();
    income.SetHead(h);

    if(t1 != NULL && t2 != NULL)
        income.AddTwoTails(t1, t2);
    else if(t1 != NULL)
        income.AddTail(t1);
    else{
        ShowNTErrors("TODO!");
    }

    income.SetType(typeName);

    /* backward for t1: h is reachable from t1 */
    if(t1 != NULL){
        XLink &outgo = t1->outgo;
        /* NOTE(review): this asserts outgo.head != t1; verify the intended
           invariant — it looks like the head of the outgoing link may be
           expected to be set elsewhere */
        CheckNTErrors(outgo.head != t1, "Wrong head of the hyperedge!");
        outgo.AddTail(h);
    }

    /* backward for t2 */
    if(t2 != NULL){
        XLink &outgo = t2->outgo;
        CheckNTErrors(outgo.head != t2, "Wrong head of the hyperedge!");
        outgo.AddTail(h);
    }
}
/*
create a hyper edge with a list of tensors and an output tensor
>> list - a list of input tensors
>> h - head tensor
>> typeName - name of edge type
*/
void XLink::MakeLink(XList * list, XTensor * h, const char * typeName)
{
    /* consistency with the two-tensor MakeLink: silently ignore calls
       without a head or without an input list instead of dereferencing NULL */
    if(h == NULL || list == NULL)
        return;

    /* forward: record which tensors were used to produce h */
    XLink &income = h->income;
    income.Reset();
    income.SetHead(h);
    income.SetType(typeName);

    for(int i = 0; i < list->count; i++){
        XTensor * t = (XTensor*)list->GetItem(i);
        income.AddTail(t);
    }

    /* backward: h is reachable from each input tensor */
    for(int i = 0; i < list->count; i++){
        XTensor * t = (XTensor*)list->GetItem(i);
        XLink &outgo = t->outgo;
        CheckNTErrors(outgo.head != t, "Wrong head of the hyperedge!");
        outgo.AddTail(h);
    }
}
/*
add a parameter (in the default data type) to the incoming link of a head tensor
>> h - head
>> param - parameter we want to introduce
*/
void XLink::AddParamToHead(XTensor * h, DTYPE param)
{
    /* bug fix: the guard was inverted (h != NULL), so valid heads returned
       early and a NULL head would be dereferenced */
    if(h == NULL)
        return;
    h->income.AddParam(param);
}
/*
add an integer parameter to the incoming link of a head tensor
>> h - head
>> param - parameter we want to introduce
*/
void XLink::AddParamToHeadInt(XTensor * h, int param)
{
    /* bug fix: the guard was inverted (h != NULL), so valid heads returned
       early and a NULL head would be dereferenced */
    if(h == NULL)
        return;
    h->income.AddParam(&param, sizeof(int));
}
} // namespace nts(NiuTrans.Tensor)
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-04
*/
#include <stdio.h>
#include "XGlobal.h"
#include "XTensor.h"
#ifndef __XLINK_H__
#define __XLINK_H__
#include "XGlobal.h"
namespace nts{ // namespace nts(NiuTrans.Tensor)
/* cross reference */
struct XTensor;
#define MAX_OP_NAME_LENGTH 16
/*
This defines the link among tensors in networks. XLink can be
cast as a hyperedge in a graph. when we compute on tensors, we actually create a
network where nodes are tensors and edges the connections among them. Each connection is
a hyperedge whose head is the output tensor and tails are input tensors. E.g,
c = a + b
represents a network with three nodes (a, b and c) and a hyperedge that links a and b (tails) to c (head).
+ (=c)
/ \
a b
for c, we have a incoming edge (a, b) -> c
for a, we also have a edge c -> a in the reverse order (in a view of acyclic directed graphs)
*/
struct XLink
{
/* head of the hyperedge, i.e., the output tensor of the operation */
XTensor * head;
/* tails of the hyperedge, i.e., the input tensors of the operation */
XTensor ** tails;
/* number of tails */
int tailNum;
/* parameters used. e.g., c = a * b * \alpha
scalar \alpha is the parameter; stored as a raw buffer of
paramNum fixed-size slots of paramSize bytes each */
void * params;
/* number of parameters */
int paramNum;
/* size (in bytes) of each parameter slot; shared by all links */
static int paramSize;
/* name of the hyperedge type. e.g., sum, mul ... */
char type[MAX_OP_NAME_LENGTH];
/* constructor */
XLink();
/* destructor */
~XLink();
/* reset to the empty state, releasing any owned memory */
void Reset();
/* set edge type name */
void SetType(const char * typeName);
/* set head */
void SetHead(XTensor * h);
/* add a tail */
void AddTail(XTensor * t);
/* add two tails in one time */
void AddTwoTails(XTensor * t1, XTensor * t2);
/* add a parameter in the default data type (DTYPE) */
void AddParam(DTYPE param);
/* add a parameter of a given size in bytes */
void AddParam(void * param, int size);
/* create a hyper edge with two input tensors and a output tensor */
static
void MakeLink(XTensor * t1, XTensor * t2, XTensor * h, const char * typeName);
/* create a hyper edge with a list of tensors and a output tensor */
static
void MakeLink(XList * list, XTensor * h, const char * typeName);
/* add a parameter to the incoming link of a head tensor */
static
void AddParamToHead(XTensor * h, DTYPE param);
/* add an integer parameter to the incoming link of a head tensor */
static
void AddParamToHeadInt(XTensor * h, int param);
};
} // namespace nts(NiuTrans.Tensor)
#endif // __XLINK_H__
\ No newline at end of file
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
* We define various names here
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-07-05
* It was really HOT these days. I can't imagine what a hot day here in Shenyang!
*/
#ifndef __XNAME_H__
#define __XNAME_H__
namespace nts { // namespace nts(NiuTrans.Tensor)
/* names of the math operations; each tensor operation tags the hyperedge
   it creates with one of these strings (see XLink::SetType) */
#define MATH_MATMUL "M_MATMUL"
#define MATH_CONCATENATESOLY "M_CONCATENATESOLY"
#define MATH_COPYVALUES "M_COPYVALUES"
#define MATH_MATRIXMUL "M_MATRIXMUL"
#define MATH_MATRIXMUL2D "M_MATRIXMUL2D"
#define MATH_MATRIXMULBATCHED "M_MATRIXMULBATCHED"
#define MATH_MERGE "M_MERGE"
#define MATH_MULTIPLY "M_MULTIPLY"
#define MATH_REDUCEMAX "M_REDUCEMAX"
#define MATH_REDUCESUM "M_REDUCESUM"
#define MATH_SELECTRANGE "M_SELECTRANGE"
#define MATH_SORT "M_SORT"
#define MATH_SUM "M_SUM"
#define MATH_TOPK "M_TOPK"
#define MATH_UNSQUEEZE "M_UNSQUEEZE"
} // namespace nts(NiuTrans.Tensor)
#endif // __XNAME_H__
\ No newline at end of file
...@@ -38,7 +38,7 @@ ...@@ -38,7 +38,7 @@
#include "XMem.h" #include "XMem.h"
#include "XHeap.h" #include "XHeap.h"
#include "XBLAS.h" #include "XBLAS.h"
#include "core/MergeBlockLists.h" #include "core/shape/MergeBlockLists.h"
#ifdef USE_CUDA #ifdef USE_CUDA
...@@ -47,8 +47,8 @@ ...@@ -47,8 +47,8 @@
#include <cublas_v2.h> #include <cublas_v2.h>
#include <cuda.h> #include <cuda.h>
#include <curand.h> #include <curand.h>
#include "core/FlushToMem.cuh" #include "core/utilities/FlushToMem.cuh"
#include "core/SetAscendingOrder.cuh" #include "core/utilities/SetAscendingOrder.cuh"
#endif #endif
......
...@@ -21,7 +21,7 @@ ...@@ -21,7 +21,7 @@
* *
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2017-07-31 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2017-07-31
* I'm working while most of the students are enjoying their holidays :( * I'm working while most of the students are enjoying their holidays :(
* $Update by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2017-11-18 bug fixes * $Updated by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2017-11-18 bug fixes
* *
*/ */
...@@ -36,10 +36,14 @@ ...@@ -36,10 +36,14 @@
#include "XList.h" #include "XList.h"
#include "XDataType.h" #include "XDataType.h"
#include "XMem.h" #include "XMem.h"
#include "XLink.h"
/* the nts (NiuTrans.Tensor) namespace */ /* the nts (NiuTrans.Tensor) namespace */
namespace nts{ namespace nts{
/* cross reference */
struct XLink;
/* define the maximum number of dimensions in a tensor */ /* define the maximum number of dimensions in a tensor */
#define MAX_TENSOR_DIM_NUM 6 #define MAX_TENSOR_DIM_NUM 6
#define USE_BATCHED_STRIDED_MAT_MUL #define USE_BATCHED_STRIDED_MAT_MUL
...@@ -47,9 +51,7 @@ namespace nts{ ...@@ -47,9 +51,7 @@ namespace nts{
#define MIN_TENSOR_SPLIT_LIST_NUM 1024 #define MIN_TENSOR_SPLIT_LIST_NUM 1024
#define MIN_TENSOR_CAT_NUM 8 #define MIN_TENSOR_CAT_NUM 8
/* /* computation flags */
computation flags
*/
#define UNSAFE_BUT_FAST_MEM #define UNSAFE_BUT_FAST_MEM
#define FAST_MATRIX #define FAST_MATRIX
...@@ -59,7 +61,6 @@ is the parent class of XMatrix. ...@@ -59,7 +61,6 @@ is the parent class of XMatrix.
*/ */
struct XTensor struct XTensor
{ {
public:
/* memory pool */ /* memory pool */
XMem * mem; XMem * mem;
...@@ -130,10 +131,23 @@ public: ...@@ -130,10 +131,23 @@ public:
/* indicates whether the tensor is initialized or not */ /* indicates whether the tensor is initialized or not */
bool isInit; bool isInit;
/******************************************************************* /*
XTensor untilities the link used to form networks. Note that when we compute on tensors, we actually create a
*/ network where nodes are tensors and edges the connections among them. Each connection is
public: a hyperedge whose head is the output tensor and tails are input tensors. E.g,
c = a + b
represents a network with three nodes (a, b and c) and a hyperedge that links a and b (tails) to c (head).
Here "income" keeps which nodes (tensors) are used to form the current node (tensor).
*/
XLink income;
/* It keeps which nodes (tensors) we go to from the current node (tensor). */
XLink outgo;
/********************
XTensor untilities
********************/
/* constructor */ /* constructor */
XTensor(); XTensor();
......
...@@ -403,7 +403,7 @@ int ToCPUInt(int devID, void * value) ...@@ -403,7 +403,7 @@ int ToCPUInt(int devID, void * value)
} }
} }
/* set the value that is kept on a device */ /* assign a number to a variable that is kept on a specified device */
bool SetToDevice(int devID, void * p, DTYPE value) bool SetToDevice(int devID, void * p, DTYPE value)
{ {
if(p == NULL) if(p == NULL)
...@@ -412,7 +412,7 @@ bool SetToDevice(int devID, void * p, DTYPE value) ...@@ -412,7 +412,7 @@ bool SetToDevice(int devID, void * p, DTYPE value)
if(devID < 0) if(devID < 0)
*(DTYPE*)p = value; *(DTYPE*)p = value;
else{ else{
XMemCopy(p, devID, &value, -1, sizeof(DTYPE*)); XMemCopy(p, devID, &value, -1, sizeof(DTYPE));
} }
return true; return true;
......
...@@ -26,47 +26,49 @@ ...@@ -26,47 +26,49 @@
#include "../XTensor.h" #include "../XTensor.h"
#include "Concatenate.h" #include "shape/Concatenate.h"
#include "ConcatenateSolely.h" #include "shape/ConcatenateSolely.h"
#include "CopyBlocks.h" #include "movement/CopyBlocks.h"
#include "CopyBlocksInGrid.h" #include "movement/CopyBlocksInGrid.h"
#include "CopyBlocksOnSite.h" #include "movement/CopyBlocksOnSite.h"
#include "CopyData2D.h" #include "movement/CopyData2D.h"
#include "CopyIndexed.h" #include "movement/CopyIndexed.h"
#include "CopyInGrid.h" #include "movement/CopyInGrid.h"
#include "CopyValues.h" #include "movement/CopyValues.h"
#include "FlushToMem.h" #include "utilities/FlushToMem.h"
#include "MakeMergeBlockIndex.h" #include "shape/MakeMergeBlockIndex.h"
#include "MakeSplitBlockIndex.h" #include "shape/MakeSplitBlockIndex.h"
#include "MatrixMul.h" #include "arithmetic/MatrixMul.h"
#include "MatrixMul2D.h" #include "arithmetic/MatrixMul2D.h"
#include "MatrixMul2DMultiTheading.h" #include "arithmetic/MatrixMul2DMultiTheading.h"
#include "MatrixMul2DParallel.h" #include "arithmetic/MatrixMul2DParallel.h"
#include "MatrixMulBatched.h" #include "arithmetic/MatrixMulBatched.h"
#include "MatrixMULBatchedCPU.h" #include "arithmetic/MatrixMULBatchedCPU.h"
#include "Merge.h" #include "shape/Merge.h"
#include "MergeBlockLists.h" #include "shape/MergeBlockLists.h"
#include "MultiplyElementWise.h" #include "arithmetic/Multiply.h"
#include "Negate.h" #include "arithmetic/Negate.h"
#include "Normalize.h" #include "math/Normalize.h"
#include "Power.h" #include "shape/Permute.h"
#include "ReduceMax.h" #include "math/Power.h"
#include "ReduceMean.h" #include "reduce/ReduceMax.h"
#include "ReduceStandardVariance.h" #include "reduce/ReduceMean.h"
#include "ReduceSum.h" #include "reduce/ReduceStandardVariance.h"
#include "ReduceSumSquared.h" #include "reduce/ReduceSum.h"
#include "ReduceVariance.h" #include "reduce/ReduceSumSquared.h"
#include "ScaleAndShift.h" #include "reduce/ReduceVariance.h"
#include "Select.h" #include "math/ScaleAndShift.h"
#include "SetData.h" #include "getandset/Select.h"
#include "Sort.h" #include "getandset/SetData.h"
#include "Split.h" #include "sort/Sort.h"
#include "Sum.h" #include "shape/Split.h"
#include "SumByColumnTV.h" #include "arithmetic/Sum.h"
#include "SumByColumnVT.h" #include "arithmetic/SumByColumnTV.h"
#include "TopK.h" #include "arithmetic/SumByColumnVT.h"
#include "Unsqueeze.h" #include "sort/TopK.h"
#include "XMatrixSegment.h" #include "shape/Transpose.h"
#include "XTensorBLAS.h" #include "shape/Unsqueeze.h"
#include "utilities/XMatrixSegment.h"
#include "arithmetic/XTensorBLAS.h"
#endif // __CHEADER_H__ #endif // __CHEADER_H__
\ No newline at end of file
...@@ -37,41 +37,33 @@ public: ...@@ -37,41 +37,33 @@ public:
concatenate a list of tensors along a given dimension concatenate a list of tensors along a given dimension
Note that this is actually a wrapper that selects "ConcatenateSolely" Note that this is actually a wrapper that selects "ConcatenateSolely"
or "Merge" by means of the tensor shapes */ or "Merge" by means of the tensor shapes */
extern "C"
void Concatenate(XList * smalls, XTensor * big, int dim); void Concatenate(XList * smalls, XTensor * big, int dim);
/* concatenate two tensors along a given dimension */ /* concatenate two tensors along a given dimension */
extern "C"
void Concatenate(XTensor * smallA, XTensor * smallB, XTensor * big, int dim); void Concatenate(XTensor * smallA, XTensor * smallB, XTensor * big, int dim);
/* concatenate a list of tensors along a given dimension */ /* concatenate a list of tensors along a given dimension */
extern "C"
static static
void ConcatenateSolely(XList * smalls, XTensor * big, int dim); void ConcatenateSolely(XList * smalls, XTensor * big, int dim);
/* copy selected sub-tensors */ /* copy selected sub-tensors */
extern "C"
static static
bool CopyIndexed(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum); bool CopyIndexed(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum);
/* copy a number of blocks in grid */ /* copy a number of blocks in grid */
extern "C"
static static
void CopyInGrid(XTensor * s, XTensor * t, int * index, int blockDim, int blockNumInGrid, bool isIndexOnDev = false); void CopyInGrid(XTensor * s, XTensor * t, int * index, int blockDim, int blockNumInGrid, bool isIndexOnDev = false);
/* copy s to t */ /* copy s to t */
extern "C"
static static
bool CopyValues(XTensor * s, XTensor * t, XStream * stream = NULL); bool CopyValues(XTensor * s, XTensor * t, XStream * stream = NULL);
/* set target data block index for the data movement in merge */ /* set target data block index for the data movement in merge */
extern "C"
static static
void MakeMergeBlockIndex(int * blockIndex, int blockNum, int blockNumInMerge, void MakeMergeBlockIndex(int * blockIndex, int blockNum, int blockNumInMerge,
int splitSizeInGrid, int gridSize, int gridNum, XMem * mem); int splitSizeInGrid, int gridSize, int gridNum, XMem * mem);
/* set target data block index for the data movement in split */ /* set target data block index for the data movement in split */
extern "C"
static static
void MakeSplitBlockIndex(int * blockIndex, int splitNum, int blockSplitSize, int blockNum, XMem * mem); void MakeSplitBlockIndex(int * blockIndex, int splitNum, int blockSplitSize, int blockNum, XMem * mem);
...@@ -86,7 +78,6 @@ public: ...@@ -86,7 +78,6 @@ public:
tensor of the result C. C should be a tensor of z * x * n * m. Obviously C = A * B performs tensor of the result C. C should be a tensor of z * x * n * m. Obviously C = A * B performs
normal matrix multiplication if A = y * z and B = x * y. normal matrix multiplication if A = y * z and B = x * y.
*/ */
extern "C"
static static
void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c, void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL); DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
...@@ -96,7 +87,6 @@ public: ...@@ -96,7 +87,6 @@ public:
c = trans(a) * trans(b) * alpha + c * beta c = trans(a) * trans(b) * alpha + c * beta
where trans() return the transposed matrix if the flag is fired where trans() return the transposed matrix if the flag is fired
*/ */
extern "C"
static static
void MatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c, void MatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL, XStream * stream = NULL); DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL, XStream * stream = NULL);
...@@ -105,7 +95,6 @@ public: ...@@ -105,7 +95,6 @@ public:
matrix multiplication for a block (x1,y1) - (x2,y2) matrix multiplication for a block (x1,y1) - (x2,y2)
where (x1,y1) is the upper-left corner and (x2,y2) is the bottom-right corner where (x1,y1) is the upper-left corner and (x2,y2) is the bottom-right corner
*/ */
extern "C"
static static
void MatrixMul2DMultiTheading(XList * args); void MatrixMul2DMultiTheading(XList * args);
...@@ -114,7 +103,6 @@ public: ...@@ -114,7 +103,6 @@ public:
c = trans(a) * trans(b) * alpha + c * beta c = trans(a) * trans(b) * alpha + c * beta
where trans() return the transposed matrix if the flag is fired where trans() return the transposed matrix if the flag is fired
*/ */
extern "C"
static static
void MatrixMul2DParallel(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c, void MatrixMul2DParallel(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL); DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
...@@ -126,36 +114,29 @@ public: ...@@ -126,36 +114,29 @@ public:
ci = trans(ai) * trans(bi) * alpha + cm * beta ci = trans(ai) * trans(bi) * alpha + cm * beta
where trans() returns the transposed matrix if the flag is fired where trans() returns the transposed matrix if the flag is fired
*/ */
extern "C"
static static
void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c, void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL); DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
/* matrix multiplication in batch mode (CPU code) */ /* matrix multiplication in batch mode (CPU code) */
extern "C"
static static
void MatrixMULBatchedCPU(XList * a, MATRIX_TRANS_TYPE transposedA, XList * b, MATRIX_TRANS_TYPE transposedB, XList * c, void MatrixMULBatchedCPU(XList * a, MATRIX_TRANS_TYPE transposedA, XList * b, MATRIX_TRANS_TYPE transposedB, XList * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0); DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
/* transform a tensor by merging it alone with a dimension, e.g., (M, N/3, 3) -> (M, N) */ /* transform a tensor by merging it alone with a dimension, e.g., (M, N/3, 3) -> (M, N) */
extern "C"
void Merge(XTensor * s, XTensor * t, int whereToMerge, int leadingDim = -1); void Merge(XTensor * s, XTensor * t, int whereToMerge, int leadingDim = -1);
/* merge small tensors into a big tensor */ /* merge small tensors into a big tensor */
extern "C"
void Merge(XList * smalls, XTensor * big, int whereToMerge); void Merge(XList * smalls, XTensor * big, int whereToMerge);
/* merge data by blocks */ /* merge data by blocks */
extern "C"
void MergeBlockLists(XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem); void MergeBlockLists(XList * sourceList, int * blockSizes, int blockNum, void * target, XMem * myMem);
/* element-wise product of two tensors */ /* element-wise product of two tensors */
extern "C"
static static
void MultiplyElementWise(XTensor * a, XTensor * b, XTensor * c, int leadingDim, DTYPE alpha = 0); void MultiplyElementWise(XTensor * a, XTensor * b, XTensor * c, int leadingDim, DTYPE alpha = 0);
/* set every entry to its minus value */ /* set every entry to its minus value */
extern "C"
void Negate(XTensor * a); void Negate(XTensor * a);
/* /*
...@@ -163,16 +144,13 @@ public: ...@@ -163,16 +144,13 @@ public:
y = a * (x-mean)/sqrt(variance+\epsilon) + b y = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter. where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter.
*/ */
extern "C"
static static
void Normalize(XTensor * input, XTensor * output, int dim, XTensor * mean, XTensor * var, XTensor * a, XTensor * b, DTYPE epsilon); void Normalize(XTensor * input, XTensor * output, int dim, XTensor * mean, XTensor * var, XTensor * a, XTensor * b, DTYPE epsilon);
/* get the power(x, y) */ /* get the power(x, y) */
extern "C"
void Power(XTensor * a, DTYPE p); void Power(XTensor * a, DTYPE p);
/* get the max value of the items along a dimension of the tensor. */ /* get the max value of the items along a dimension of the tensor. */
extern "C"
static static
void ReduceMax(XTensor * input, XTensor * output, int dim); void ReduceMax(XTensor * input, XTensor * output, int dim);
...@@ -180,7 +158,6 @@ public: ...@@ -180,7 +158,6 @@ public:
get the mean value along a dimension of the tensor. For a 1-dimensional data array a, get the mean value along a dimension of the tensor. For a 1-dimensional data array a,
mean = (1/n) * sum_i input_i mean = (1/n) * sum_i input_i
*/ */
extern "C"
static static
void ReduceMean(XTensor * input, XTensor * output, int dim); void ReduceMean(XTensor * input, XTensor * output, int dim);
...@@ -188,7 +165,6 @@ public: ...@@ -188,7 +165,6 @@ public:
standard variance of the items along a dimension of the tensor. For a 1-dimensional data array a, standard variance of the items along a dimension of the tensor. For a 1-dimensional data array a,
variance = (1/n * \sum_i (a_i - mean)^2)^0.5 variance = (1/n * \sum_i (a_i - mean)^2)^0.5
*/ */
extern "C"
static static
void ReduceStandardVariance(XTensor * input, XTensor * output, int dim, XTensor * mean); void ReduceStandardVariance(XTensor * input, XTensor * output, int dim, XTensor * mean);
...@@ -197,7 +173,6 @@ public: ...@@ -197,7 +173,6 @@ public:
sum = \sum_i (a_i - shift) if isExp == false sum = \sum_i (a_i - shift) if isExp == false
sum = \sum_i exp(a_i - shift) if isExp == true sum = \sum_i exp(a_i - shift) if isExp == true
*/ */
extern "C"
static static
void ReduceSum(XTensor * input, XTensor * output, int dim, XTensor * shift = NULL, DTYPE power = (DTYPE)1.0F, bool isExp = false); void ReduceSum(XTensor * input, XTensor * output, int dim, XTensor * shift = NULL, DTYPE power = (DTYPE)1.0F, bool isExp = false);
...@@ -205,7 +180,6 @@ public: ...@@ -205,7 +180,6 @@ public:
squared sum of the items along a dimension of the tensor. For a 1-dimensional data array a, squared sum of the items along a dimension of the tensor. For a 1-dimensional data array a,
sum = \sum_i (a_i - shift)^2 sum = \sum_i (a_i - shift)^2
*/ */
extern "C"
static static
void ReduceSumSquared(XTensor * input, XTensor * output, int dim, XTensor * shift); void ReduceSumSquared(XTensor * input, XTensor * output, int dim, XTensor * shift);
...@@ -213,73 +187,59 @@ public: ...@@ -213,73 +187,59 @@ public:
variance of the items along a dimension of the tensor. For a 1-dimensional data array a, variance of the items along a dimension of the tensor. For a 1-dimensional data array a,
variance = 1/n * \sum_i (a_i - mean)^2 variance = 1/n * \sum_i (a_i - mean)^2
*/ */
extern "C"
static static
void ReduceVariance(XTensor * input, XTensor * output, int dim, XTensor * mean); void ReduceVariance(XTensor * input, XTensor * output, int dim, XTensor * mean);
/* scale and shift all tensor entires */ /* scale and shift all tensor entires */
extern "C"
static static
void ScaleAndShift(XTensor * a, DTYPE scale, DTYPE shift); void ScaleAndShift(XTensor * a, DTYPE scale, DTYPE shift);
/* transform a tensor by splitting it, e.g., (M, N) -> (M, N/3, 3) */ /* transform a tensor by splitting it, e.g., (M, N) -> (M, N/3, 3) */
extern "C"
void Split(XTensor * s, XTensor * t, int whereToSplit, int splitNum); void Split(XTensor * s, XTensor * t, int whereToSplit, int splitNum);
/* split a big tensor into small tensors */ /* split a big tensor into small tensors */
extern "C"
void Split(XTensor * big, XList * smalls, int whereToSplit, int splitNum); void Split(XTensor * big, XList * smalls, int whereToSplit, int splitNum);
/* tensor summation c = a + b * \beta */ /* tensor summation c = a + b * \beta */
extern "C"
static static
void Sum(XTensor * a, XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0); void Sum(XTensor * a, XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
/* sum of a tensor and a (column) vector */ /* sum of a tensor and a (column) vector */
extern "C"
static static
void SumByColumnTV(XTensor * a, XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0); void SumByColumnTV(XTensor * a, XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
/* sum of a (column) vector and a tensor */ /* sum of a (column) vector and a tensor */
extern "C"
static static
void SumByColumnVT(XTensor * a, XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0); void SumByColumnVT(XTensor * a, XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
/* get the top-k items along a given dimension */ /* get the top-k items along a given dimension */
extern "C"
static static
void TopK(XTensor * a, XTensor * b, XTensor * index, int dim, int k); void TopK(XTensor * a, XTensor * b, XTensor * index, int dim, int k);
/* insert a dimension by copying the blocks for x times (where x is the size of the inerted dimension) */ /* insert a dimension by copying the blocks for x times (where x is the size of the inerted dimension) */
extern "C"
void Unsqueeze(XTensor * a, XTensor * b, int dim, int dSize); void Unsqueeze(XTensor * a, XTensor * b, int dim, int dSize);
/* segmentation and parallel processing for 2d tensors (i.e., matrices) */ /* segmentation and parallel processing for 2d tensors (i.e., matrices) */
/* segment a 2d tensor (i.e., matrix) into blocks and run jobs in parallel */ /* segment a 2d tensor (i.e., matrix) into blocks and run jobs in parallel */
extern "C"
static static
void RunParallel2D(XPRunner * parallelRunner, void * job, int opNum, int rowNum, int colNum, int argNum, ...); void RunParallel2D(XPRunner * parallelRunner, void * job, int opNum, int rowNum, int colNum, int argNum, ...);
/* segment a block into sub-blocks */ /* segment a block into sub-blocks */
extern "C"
static static
int SegmentTensor2D(int rowNum, int colNum, int blockNum, int * blockIndex); int SegmentTensor2D(int rowNum, int colNum, int blockNum, int * blockIndex);
/* segment a block into sub-blocks */ /* segment a block into sub-blocks */
extern "C"
static static
int SegmentTensor2DInRows(int rowNum, int colNum, int blockNum, int * blockIndex); int SegmentTensor2DInRows(int rowNum, int colNum, int blockNum, int * blockIndex);
/* matrix multiplication (BLAS) */ /* matrix multiplication (BLAS) */
extern "C"
static static
void MatrixMULCPU(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0); void MatrixMULCPU(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
#ifdef USE_CUDA #ifdef USE_CUDA
/* matrix multiplication via cuda version BLAS */ /* matrix multiplication via cuda version BLAS */
extern "C"
static static
void CudaBLASMatrixMUL(cublasHandle_t * handle, void CudaBLASMatrixMUL(cublasHandle_t * handle,
void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
...@@ -288,7 +248,6 @@ public: ...@@ -288,7 +248,6 @@ public:
int na, int ma, int nb, int mb, int nc, int mc, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0); int na, int ma, int nb, int mb, int nc, int mc, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
/* matrix multiplication in batch mode via cuda version BLAS */ /* matrix multiplication in batch mode via cuda version BLAS */
extern "C"
static static
void CudaBLASMatrixMULBatched(cublasHandle_t * handle, void CudaBLASMatrixMULBatched(cublasHandle_t * handle,
const void ** a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, const void ** a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
...@@ -297,7 +256,6 @@ public: ...@@ -297,7 +256,6 @@ public:
int count, int na, int ma, int nb, int mb, int nc, int mc, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0); int count, int na, int ma, int nb, int mb, int nc, int mc, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
/* matrix multiplication in batch and strided mode via cuda version BLAS */ /* matrix multiplication in batch and strided mode via cuda version BLAS */
extern "C"
static static
void CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle, void CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, long long int strideA, const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, long long int strideA,
...@@ -306,7 +264,6 @@ public: ...@@ -306,7 +264,6 @@ public:
int count, int na, int ma, int nb, int mb, int nc, int mc, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0); int count, int na, int ma, int nb, int mb, int nc, int mc, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
/* matrix multiplication in batch mode via cuda version BLAS */ /* matrix multiplication in batch mode via cuda version BLAS */
extern "C"
static static
void CudaBLASMatrixMULList(cublasHandle_t * handle, XList * a, MATRIX_TRANS_TYPE transposedA, XList * b, MATRIX_TRANS_TYPE transposedB, XList * c, void CudaBLASMatrixMULList(cublasHandle_t * handle, XList * a, MATRIX_TRANS_TYPE transposedA, XList * b, MATRIX_TRANS_TYPE transposedB, XList * c,
int count, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0); int count, DTYPE alpha = (DTYPE)1.0, DTYPE beta = 1.0);
......
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XTensor.h" #include "../../XTensor.h"
#include "MatrixMULBatchedCPU.h" #include "MatrixMULBatchedCPU.h"
#include "MatrixMul2D.h" #include "MatrixMul2D.h"
#include "XTensorBLAS.h" #include "XTensorBLAS.h"
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __MATRIXMULBATCHEDCPU_H__ #ifndef __MATRIXMULBATCHEDCPU_H__
#define __MATRIXMULBATCHEDCPU_H__ #define __MATRIXMULBATCHEDCPU_H__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -19,8 +19,9 @@ ...@@ -19,8 +19,9 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XTensor.h" #include "../../XTensor.h"
#include "../XDevice.h" #include "../../XDevice.h"
#include "../../XName.h"
#include "MatrixMul.h" #include "MatrixMul.h"
#include "MatrixMul2D.h" #include "MatrixMul2D.h"
#include "MatrixMULBatchedCPU.h" #include "MatrixMULBatchedCPU.h"
...@@ -58,6 +59,12 @@ void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -58,6 +59,12 @@ void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA,
CheckNTErrors((a->order >= 2 && b->order >= 2 && c->order >= 2), CheckNTErrors((a->order >= 2 && b->order >= 2 && c->order >= 2),
"Input tensors must have a order > 2!"); "Input tensors must have a order > 2!");
/* make tensor connections */
XLink::MakeLink(a, b, c, MATH_MATRIXMUL);
XLink::AddParamToHeadInt(c, transposedA);
XLink::AddParamToHeadInt(c, transposedB);
XLink::AddParamToHead(c, alpha);
XLink::AddParamToHead(c, beta);
int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1]; int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0]; int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1]; int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __MATRIXMUL_H__ #ifndef __MATRIXMUL_H__
#define __MATRIXMUL_H__ #define __MATRIXMUL_H__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -19,7 +19,8 @@ ...@@ -19,7 +19,8 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XTensor.h" #include "../../XTensor.h"
#include "../../XName.h"
#include "MatrixMul2D.h" #include "MatrixMul2D.h"
#include "MatrixMul2D.cuh" #include "MatrixMul2D.cuh"
#include "MatrixMul2DParallel.h" #include "MatrixMul2DParallel.h"
...@@ -51,6 +52,13 @@ void MatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -51,6 +52,13 @@ void MatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA,
CheckNTErrors((a->order == 2 && b->order == 2 && c->order == 2), CheckNTErrors((a->order == 2 && b->order == 2 && c->order == 2),
"Input tensors must have a order = 2!"); "Input tensors must have a order = 2!");
/* make tensor connections */
XLink::MakeLink(a, b, c, MATH_MATRIXMUL2D);
XLink::AddParamToHeadInt(c, transposedA);
XLink::AddParamToHeadInt(c, transposedB);
XLink::AddParamToHead(c, alpha);
XLink::AddParamToHead(c, beta);
int an = a->dimSize[0], am = a->dimSize[1]; int an = a->dimSize[0], am = a->dimSize[1];
int bn = b->dimSize[0], bm = b->dimSize[1]; int bn = b->dimSize[0], bm = b->dimSize[1];
int cn = c->dimSize[0], cm = c->dimSize[1]; int cn = c->dimSize[0], cm = c->dimSize[1];
......
...@@ -19,8 +19,8 @@ ...@@ -19,8 +19,8 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XDevice.h" #include "../../XDevice.h"
#include "../XTensor.h" #include "../../XTensor.h"
#include "MatrixMul2D.h" #include "MatrixMul2D.h"
#include "MatrixMul2D.cuh" #include "MatrixMul2D.cuh"
#include "XTensorBLAS.h" #include "XTensorBLAS.h"
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __MATRIXMUL2D_H__ #ifndef __MATRIXMUL2D_H__
#define __MATRIXMUL2D_H__ #define __MATRIXMUL2D_H__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XTensor.h" #include "../../XTensor.h"
#include "MatrixMul2DMultiTheading.h" #include "MatrixMul2DMultiTheading.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __MATRIXMUL2DMULTITHEADING_H__ #ifndef __MATRIXMUL2DMULTITHEADING_H__
#define __MATRIXMUL2DMULTITHEADING_H__ #define __MATRIXMUL2DMULTITHEADING_H__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -19,10 +19,10 @@ ...@@ -19,10 +19,10 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XTensor.h" #include "../../XTensor.h"
#include "MatrixMul2DParallel.h" #include "MatrixMul2DParallel.h"
#include "MatrixMul2DMultiTheading.h" #include "MatrixMul2DMultiTheading.h"
#include "XMatrixSegment.h" #include "../utilities/XMatrixSegment.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __MATRIXMUL2DPARALLEL_H__ #ifndef __MATRIXMUL2DPARALLEL_H__
#define __MATRIXMUL2DPARALLEL_H__ #define __MATRIXMUL2DPARALLEL_H__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -19,8 +19,9 @@ ...@@ -19,8 +19,9 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XTensor.h" #include "../../XTensor.h"
#include "../XDevice.h" #include "../../XDevice.h"
#include "../../XName.h"
#include "MatrixMulBatched.h" #include "MatrixMulBatched.h"
#include "MatrixMULBatchedCPU.h" #include "MatrixMULBatchedCPU.h"
#include "XTensorBLAS.h" #include "XTensorBLAS.h"
...@@ -53,6 +54,12 @@ void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA, ...@@ -53,6 +54,12 @@ void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
CheckNTErrors((a->order >= 2 && b->order >= 2 && c->order >= 2), CheckNTErrors((a->order >= 2 && b->order >= 2 && c->order >= 2),
"Input tensors must have a order > 2!"); "Input tensors must have a order > 2!");
/* make tensor connections */
XLink::MakeLink(a, b, c, MATH_MATRIXMULBATCHED);
XLink::AddParamToHeadInt(c, transposedA);
XLink::AddParamToHeadInt(c, transposedB);
XLink::AddParamToHead(c, alpha);
XLink::AddParamToHead(c, beta);
int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1]; int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0]; int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1]; int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __MATRIXMULBATCHED_H__ #ifndef __MATRIXMULBATCHED_H__
#define __MATRIXMULBATCHED_H__ #define __MATRIXMULBATCHED_H__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -19,9 +19,10 @@ ...@@ -19,9 +19,10 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XTensor.h" #include "../../XTensor.h"
#include "MultiplyElementWise.h" #include "../../XName.h"
#include "MultiplyElementWise.cuh" #include "Multiply.h"
#include "Multiply.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -35,16 +36,21 @@ where i is the index of the item ...@@ -35,16 +36,21 @@ where i is the index of the item
>> alpha - the coefficient >> alpha - the coefficient
>> >>
*/ */
void MultiplyElementWise(XTensor * a, XTensor * b, XTensor * c, int leadingDim, DTYPE alpha) void Multiply(XTensor * a, XTensor * b, XTensor * c, int leadingDim, DTYPE alpha)
{ {
int leadingDimRDI = a->order - leadingDim - 1; int leadingDimRDI = a->order - leadingDim - 1;
CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum), CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
"Unmatched tensors in multiplication!"); "Unmatched tensors in multiplication!");
CheckNTErrors((a->order == b->order && a->order == c->order), "Unmatched tensors!"); CheckNTErrors((a->order == b->order && a->order == c->order), "Unmatched tensors!");
/* make tensor connections */
XLink::MakeLink(a, b, c, MATH_MULTIPLY);
XLink::AddParamToHeadInt(c, leadingDim);
XLink::AddParamToHead(c, alpha);
#ifdef USE_CUDA #ifdef USE_CUDA
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) { if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
CudaMultiplyElementWise(a, b, c, leadingDim, alpha); CudaMultiply(a, b, c, leadingDim, alpha);
return; return;
} }
#endif #endif
......
...@@ -19,10 +19,10 @@ ...@@ -19,10 +19,10 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XDevice.h" #include "../../XDevice.h"
#include "../XTensor.h" #include "../../XTensor.h"
#include "MultiplyElementWise.h" #include "Multiply.h"
#include "MultiplyElementWise.cuh" #include "Multiply.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -121,7 +121,7 @@ where i is the item index ...@@ -121,7 +121,7 @@ where i is the item index
>> alpha - the coefficient >> alpha - the coefficient
*/ */
extern "C" extern "C"
void CudaMultiplyElementWise(XTensor * a, XTensor * b, XTensor * c, int leadingDim, DTYPE alpha) void CudaMultiply(XTensor * a, XTensor * b, XTensor * c, int leadingDim, DTYPE alpha)
{ {
int leadingDimRDI = a->order - leadingDim - 1; int leadingDimRDI = a->order - leadingDim - 1;
CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum), CheckNTErrors((a->unitNum <= c->unitNum && b->unitNum <= c->unitNum),
......
...@@ -19,10 +19,10 @@ ...@@ -19,10 +19,10 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#ifndef __MULTIPLYELEMENTWISE_CUH__ #ifndef __MULTIPLY_CUH__
#define __MULTIPLYELEMENTWISE_CUH__ #define __MULTIPLY_CUH__
#include "MultiplyElementWise.h" #include "Multiply.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -42,11 +42,11 @@ void KernelMulElementWiseTensorDynamic(DTYPE * a, DTYPE * b, DTYPE * c, DTYPE al ...@@ -42,11 +42,11 @@ void KernelMulElementWiseTensorDynamic(DTYPE * a, DTYPE * b, DTYPE * c, DTYPE al
/* element-wise product of two tensors */ /* element-wise product of two tensors */
extern "C" extern "C"
void CudaMultiplyElementWise(XTensor * a, XTensor * b, XTensor * c, int leadingDim, DTYPE alpha); void CudaMultiply(XTensor * a, XTensor * b, XTensor * c, int leadingDim = 0, DTYPE alpha = 0);
#endif // USE_CUDA #endif // USE_CUDA
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __MULTIPLYELEMENTWISE_CUH__ #endif // __MULTIPLY_CUH__
...@@ -19,17 +19,17 @@ ...@@ -19,17 +19,17 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#ifndef __MULTIPLYELEMENTWISE_H__ #ifndef __MULTIPLY_H__
#define __MULTIPLYELEMENTWISE_H__ #define __MULTIPLY_H__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* element-wise product of two tensors */ /* element-wise product of two tensors */
extern "C" extern "C"
void MultiplyElementWise(XTensor * a, XTensor * b, XTensor * c, int leadingDim, DTYPE alpha = 0); void Multiply(XTensor * a, XTensor * b, XTensor * c, int leadingDim = 0, DTYPE alpha = 0);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __MULTIPLYELEMENTWISE_H__ #endif // __MULTIPLY_H__
\ No newline at end of file \ No newline at end of file
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XTensor.h" #include "../../XTensor.h"
#include "Negate.h" #include "Negate.h"
#include "Negate.cuh" #include "Negate.cuh"
......
...@@ -19,8 +19,8 @@ ...@@ -19,8 +19,8 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XDevice.h" #include "../../XDevice.h"
#include "../XTensor.h" #include "../../XTensor.h"
#include "Negate.h" #include "Negate.h"
#include "Negate.cuh" #include "Negate.cuh"
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __NEGATE_H__ #ifndef __NEGATE_H__
#define __NEGATE_H__ #define __NEGATE_H__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -19,7 +19,8 @@ ...@@ -19,7 +19,8 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XTensor.h" #include "../../XTensor.h"
#include "../../XName.h"
#include "Sum.h" #include "Sum.h"
#include "Sum.cuh" #include "Sum.cuh"
...@@ -37,15 +38,16 @@ void Sum(XTensor * a, XTensor * b, XTensor * c, DTYPE beta) ...@@ -37,15 +38,16 @@ void Sum(XTensor * a, XTensor * b, XTensor * c, DTYPE beta)
if (c == NULL) if (c == NULL)
c = a; c = a;
CheckNTErrors((a && b && c), CheckNTErrors(a && b && c, "Empty tensors in addition!");
"Empty tensors in addition!"); CheckNTErrors(a->unitNum == b->unitNum && a->unitNum == c->unitNum,
CheckNTErrors((a->unitNum == b->unitNum && a->unitNum == c->unitNum),
"Unmatched tensors in addition!"); "Unmatched tensors in addition!");
CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
CheckNTErrors((a->dataType == b->dataType && a->dataType == c->dataType),
"Unmatched tensors in addition!"); "Unmatched tensors in addition!");
/* make tensor connections */
XLink::MakeLink(a, b, c, MATH_SUM);
XLink::AddParamToHead(c, beta);
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) { if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
#ifdef USE_CUDA #ifdef USE_CUDA
......
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XDevice.h" #include "../../XDevice.h"
#include "Sum.cuh" #include "Sum.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __SUM_H__ #ifndef __SUM_H__
#define __SUM_H__ #define __SUM_H__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XTensor.h" #include "../../XTensor.h"
#include "SumByColumnTV.h" #include "SumByColumnTV.h"
#include "SumByColumnTV.cuh" #include "SumByColumnTV.cuh"
......
...@@ -19,8 +19,8 @@ ...@@ -19,8 +19,8 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XDevice.h" #include "../../XDevice.h"
#include "../XTensor.h" #include "../../XTensor.h"
#include "SumByColumnTV.h" #include "SumByColumnTV.h"
#include "SumByColumnTV.cuh" #include "SumByColumnTV.cuh"
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __REDUCEMAX_CUH__ #ifndef __REDUCEMAX_CUH__
#define __REDUCEMAX_CUH__ #define __REDUCEMAX_CUH__
#include "ReduceMax.h" #include "../reduce/ReduceMax.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __SUMBYCOLUMNTV_H__ #ifndef __SUMBYCOLUMNTV_H__
#define __SUMBYCOLUMNTV_H__ #define __SUMBYCOLUMNTV_H__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XTensor.h" #include "../../XTensor.h"
#include "SumByColumnVT.h" #include "SumByColumnVT.h"
#include "SumByColumnVT.cuh" #include "SumByColumnVT.cuh"
......
...@@ -19,8 +19,8 @@ ...@@ -19,8 +19,8 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XDevice.h" #include "../../XDevice.h"
#include "../XTensor.h" #include "../../XTensor.h"
#include "SumByColumnVT.h" #include "SumByColumnVT.h"
#include "SumByColumnVT.cuh" #include "SumByColumnVT.cuh"
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __SUMBYCOLUMNVT_H__ #ifndef __SUMBYCOLUMNVT_H__
#define __SUMBYCOLUMNVT_H__ #define __SUMBYCOLUMNVT_H__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -20,8 +20,8 @@ ...@@ -20,8 +20,8 @@
*/ */
#include "XTensorBLAS.h" #include "XTensorBLAS.h"
#include "../XTensor.h" #include "../../XTensor.h"
#include "../XBLAS.h" #include "../../XBLAS.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -19,16 +19,18 @@ ...@@ -19,16 +19,18 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XUtility.h" #include "../../XUtility.h"
#include "../XDevice.h" #include "../../XDevice.h"
#include "../XTensor.h" #include "../../XTensor.h"
#include "XTensorBLAS.h" #include "XTensorBLAS.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/* matrix multiplication via cuda version BLAS */ /*
matrix multiplication via cuda version BLAS
*/
void CudaBLASMatrixMUL(cublasHandle_t * handle, void CudaBLASMatrixMUL(cublasHandle_t * handle,
void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
...@@ -83,7 +85,9 @@ void CudaBLASMatrixMUL(cublasHandle_t * handle, ...@@ -83,7 +85,9 @@ void CudaBLASMatrixMUL(cublasHandle_t * handle,
} }
} }
/* matrix multiplication via cuda version BLAS */ /*
matrix multiplication via cuda version BLAS
*/
void CudaBLASMatrixMULBatched(cublasHandle_t * handle, void CudaBLASMatrixMULBatched(cublasHandle_t * handle,
const void ** a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, const void ** a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA,
const void ** b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, const void ** b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB,
...@@ -139,6 +143,7 @@ void CudaBLASMatrixMULBatched(cublasHandle_t * handle, ...@@ -139,6 +143,7 @@ void CudaBLASMatrixMULBatched(cublasHandle_t * handle,
} }
/* matrix multiplication in batch and strided mode via cuda version BLAS */ /* matrix multiplication in batch and strided mode via cuda version BLAS */
extern "C"
void CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle, void CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, long long int strideA, const void * a, MATRIX_TRANS_TYPE transposedA, TENSOR_DATA_TYPE dataTypeA, long long int strideA,
const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, long long int strideB, const void * b, MATRIX_TRANS_TYPE transposedB, TENSOR_DATA_TYPE dataTypeB, long long int strideB,
...@@ -193,7 +198,9 @@ void CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle, ...@@ -193,7 +198,9 @@ void CudaBLASMatrixMULBatchedStrided(cublasHandle_t * handle,
} }
} }
/* matrix multiplication via cuda version BLAS */ /*
matrix multiplication via cuda version BLAS
*/
void CudaBLASMatrixMULList(cublasHandle_t * handle, void CudaBLASMatrixMULList(cublasHandle_t * handle,
XList * a, MATRIX_TRANS_TYPE transposedA, XList * a, MATRIX_TRANS_TYPE transposedA,
XList * b, MATRIX_TRANS_TYPE transposedB, XList * b, MATRIX_TRANS_TYPE transposedB,
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __XTENSORBLAS_H__ #ifndef __XTENSORBLAS_H__
#define __XTENSORBLAS_H__ #define __XTENSORBLAS_H__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -19,8 +19,8 @@ ...@@ -19,8 +19,8 @@
* $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-06-14 * $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-06-14
*/ */
#include "../XTensor.h" #include "../../XTensor.h"
#include "../XDevice.h" #include "../../XDevice.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -19,8 +19,9 @@ ...@@ -19,8 +19,9 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-04 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-04
*/ */
#include "../../XUtility.h"
#include "../../XName.h"
#include "Select.h" #include "Select.h"
#include "../XUtility.h"
namespace nts{ // namespace nts(NiuTrans.Tensor) namespace nts{ // namespace nts(NiuTrans.Tensor)
...@@ -28,13 +29,13 @@ namespace nts{ // namespace nts(NiuTrans.Tensor) ...@@ -28,13 +29,13 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
generate a tensor with seleccted data in range[low,high] along the given dimension generate a tensor with seleccted data in range[low,high] along the given dimension
c = select(a) c = select(a)
>> a - input tensor >> a - input tensor
>> c - result tensor
>> dim - the dimension along with which we do the job >> dim - the dimension along with which we do the job
>> low - lower bound >> low - lower bound
>> high - higher bound. >> high - higher bound.
Note that range [1,3] means that we select 1 and 2. Note that range [1,3] means that we select 1 and 2.
>> c - result tensor
*/ */
void SelectRange(XTensor * a, int dim, int low, int high, XTensor * c) void SelectRange(XTensor * a, XTensor * c, int dim, int low, int high)
{ {
CheckNTErrors(a != NULL && c != NULL, "empty tensors!"); CheckNTErrors(a != NULL && c != NULL, "empty tensors!");
CheckNTErrors(a->order == c->order, "The input and output tensors must in the same order!"); CheckNTErrors(a->order == c->order, "The input and output tensors must in the same order!");
...@@ -54,8 +55,14 @@ void SelectRange(XTensor * a, int dim, int low, int high, XTensor * c) ...@@ -54,8 +55,14 @@ void SelectRange(XTensor * a, int dim, int low, int high, XTensor * c)
} }
} }
int dimRDI = a->order - dim - 1; /* make tensor connections */
XLink::MakeLink(a, NULL, c, MATH_SELECTRANGE);
XLink::AddParamToHeadInt(c, dim);
XLink::AddParamToHeadInt(c, low);
XLink::AddParamToHeadInt(c, high);
int stride = 1; int stride = 1;
int dimRDI = a->order - dim - 1;
for(int i = 0; i < dimRDI; i++) for(int i = 0; i < dimRDI; i++)
stride *= a->dimSizeRDI[i]; stride *= a->dimSizeRDI[i];
......
...@@ -22,18 +22,18 @@ ...@@ -22,18 +22,18 @@
#ifndef __SELECT_H__ #ifndef __SELECT_H__
#define __SELECT_H__ #define __SELECT_H__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts{ // namespace nts(NiuTrans.Tensor) namespace nts{ // namespace nts(NiuTrans.Tensor)
/* generate a tensor with seleccted data c = select(a) */ /* generate a tensor with seleccted data c = select(a) */
extern "C" extern "C"
void Select(XTensor * a, XTensor * indexCPU, XTensor * c); void Select(XTensor * a, XTensor * c, XTensor * indexCPU);
/* generate a tensor with seleccted data in range[low,high] along the given dimension /* generate a tensor with seleccted data in range[low,high] along the given dimension
c = select(a) */ c = select(a) */
extern "C" extern "C"
void SelectRange(XTensor * a, int dim, int low, int high, XTensor * c); void SelectRange(XTensor * a, XTensor * c, int dim, int low, int high);
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
......
...@@ -21,7 +21,7 @@ ...@@ -21,7 +21,7 @@
*/ */
#include "SetData.h" #include "SetData.h"
#include "CopyValues.h" #include "../movement/CopyValues.h"
#if !defined( WIN32 ) && !defined( _WIN32 ) #if !defined( WIN32 ) && !defined( _WIN32 )
#include "sys/time.h" #include "sys/time.h"
......
...@@ -23,7 +23,7 @@ ...@@ -23,7 +23,7 @@
#ifndef __SETDATA_H__ #ifndef __SETDATA_H__
#define __SETDATA_H__ #define __SETDATA_H__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -20,7 +20,7 @@ ...@@ -20,7 +20,7 @@
*/ */
#include <math.h> #include <math.h>
#include "../XTensor.h" #include "../../XTensor.h"
#include "Normalize.h" #include "Normalize.h"
#include "Normalize.cuh" #include "Normalize.cuh"
......
...@@ -19,8 +19,8 @@ ...@@ -19,8 +19,8 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XDevice.h" #include "../../XDevice.h"
#include "../XTensor.h" #include "../../XTensor.h"
#include "Normalize.h" #include "Normalize.h"
#include "Normalize.cuh" #include "Normalize.cuh"
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __NORMALIZE_H__ #ifndef __NORMALIZE_H__
#define __NORMALIZE_H__ #define __NORMALIZE_H__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -20,7 +20,7 @@ ...@@ -20,7 +20,7 @@
*/ */
#include <math.h> #include <math.h>
#include "../XTensor.h" #include "../../XTensor.h"
#include "Power.h" #include "Power.h"
#include "Power.cuh" #include "Power.cuh"
......
...@@ -19,8 +19,8 @@ ...@@ -19,8 +19,8 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XDevice.h" #include "../../XDevice.h"
#include "../XTensor.h" #include "../../XTensor.h"
#include "Power.h" #include "Power.h"
#include "Power.cuh" #include "Power.cuh"
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __POWER_H__ #ifndef __POWER_H__
#define __POWER_H__ #define __POWER_H__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -21,7 +21,7 @@ ...@@ -21,7 +21,7 @@
#include "ScaleAndShift.h" #include "ScaleAndShift.h"
#include "ScaleAndShift.cuh" #include "ScaleAndShift.cuh"
#include "../XDevice.h" #include "../../XDevice.h"
namespace nts{ // namespace nts(NiuTrans.Tensor) namespace nts{ // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __SCALEANDSHIFT_CUH__ #ifndef __SCALEANDSHIFT_CUH__
#define __SCALEANDSHIFT_CUH__ #define __SCALEANDSHIFT_CUH__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts{ // namespace nts(NiuTrans.Tensor) namespace nts{ // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __SCALEANDSHIFT_H__ #ifndef __SCALEANDSHIFT_H__
#define __SCALEANDSHIFT_H__ #define __SCALEANDSHIFT_H__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts{ // namespace nts(NiuTrans.Tensor) namespace nts{ // namespace nts(NiuTrans.Tensor)
......
...@@ -19,8 +19,8 @@ ...@@ -19,8 +19,8 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XTensor.h" #include "../../XTensor.h"
#include "../XUtility.h" #include "../../XUtility.h"
#include "CopyBlocks.h" #include "CopyBlocks.h"
#include "CopyBlocksOnSite.h" #include "CopyBlocksOnSite.h"
#include "CopyBlocksSelected.cuh" #include "CopyBlocksSelected.cuh"
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __COPYBLOCKS_H__ #ifndef __COPYBLOCKS_H__
#define __COPYBLOCKS_H__ #define __COPYBLOCKS_H__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -19,9 +19,9 @@ ...@@ -19,9 +19,9 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XTensor.h" #include "../../XTensor.h"
#include "CopyBlocksInGrid.h" #include "CopyBlocksInGrid.h"
#include "../XUtility.h" #include "../../XUtility.h"
#include "CopyBlocksInGrid.cuh" #include "CopyBlocksInGrid.cuh"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -21,7 +21,7 @@ ...@@ -21,7 +21,7 @@
#include "CopyBlocksInGrid.h" #include "CopyBlocksInGrid.h"
#include "CopyBlocksInGrid.cuh" #include "CopyBlocksInGrid.cuh"
#include "../XDevice.h" #include "../../XDevice.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __COPYBLOCKSINGRID_CUH__ #ifndef __COPYBLOCKSINGRID_CUH__
#define __COPYBLOCKSINGRID_CUH__ #define __COPYBLOCKSINGRID_CUH__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __COPYBLOCKSINGRID_H__ #ifndef __COPYBLOCKSINGRID_H__
#define __COPYBLOCKSINGRID_H__ #define __COPYBLOCKSINGRID_H__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -19,8 +19,8 @@ ...@@ -19,8 +19,8 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XTensor.h" #include "../../XTensor.h"
#include "../XUtility.h" #include "../../XUtility.h"
#include "CopyBlocksOnSite.h" #include "CopyBlocksOnSite.h"
#include "CopyBlocksOnSite.cuh" #include "CopyBlocksOnSite.cuh"
......
...@@ -21,7 +21,7 @@ ...@@ -21,7 +21,7 @@
#include "CopyBlocksOnSite.h" #include "CopyBlocksOnSite.h"
#include "CopyBlocksOnSite.cuh" #include "CopyBlocksOnSite.cuh"
#include "../XDevice.h" #include "../../XDevice.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __COPYBLOCKS_CUH__ #ifndef __COPYBLOCKS_CUH__
#define __COPYBLOCKS_CUH__ #define __COPYBLOCKS_CUH__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __COPYBLOCKSONSITE_H__ #ifndef __COPYBLOCKSONSITE_H__
#define __COPYBLOCKSONSITE_H__ #define __COPYBLOCKSONSITE_H__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -21,8 +21,8 @@ ...@@ -21,8 +21,8 @@
#include "CopyBlocks.h" #include "CopyBlocks.h"
#include "CopyBlocksSelected.cuh" #include "CopyBlocksSelected.cuh"
#include "../XUtility.h" #include "../../XUtility.h"
#include "../XDevice.h" #include "../../XDevice.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __COPYBLOCKSSELECTED_CUH__ #ifndef __COPYBLOCKSSELECTED_CUH__
#define __COPYBLOCKSSELECTED_CUH__ #define __COPYBLOCKSSELECTED_CUH__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -19,9 +19,9 @@ ...@@ -19,9 +19,9 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XTensor.h" #include "../../XTensor.h"
#include "CopyData2D.h" #include "CopyData2D.h"
#include "../XUtility.h" #include "../../XUtility.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __COPYDATA2D_H__ #ifndef __COPYDATA2D_H__
#define __COPYDATA2D_H__ #define __COPYDATA2D_H__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XTensor.h" #include "../../XTensor.h"
#include "CopyInGrid.h" #include "CopyInGrid.h"
#include "CopyBlocksInGrid.h" #include "CopyBlocksInGrid.h"
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __COPYINGRID_H__ #ifndef __COPYINGRID_H__
#define __COPYINGRID_H__ #define __COPYINGRID_H__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __COPYINDEXED_H__ #ifndef __COPYINDEXED_H__
#define __COPYINDEXED_H__ #define __COPYINDEXED_H__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../../XName.h"
#include "CopyValues.h" #include "CopyValues.h"
#include "CopyValues.cuh" #include "CopyValues.cuh"
...@@ -41,6 +42,9 @@ bool CopyValues(XTensor * s, XTensor * t, XStream * stream) ...@@ -41,6 +42,9 @@ bool CopyValues(XTensor * s, XTensor * t, XStream * stream)
CheckNTErrors((t->data != NULL), "Cannot copy to an empty data array!"); CheckNTErrors((t->data != NULL), "Cannot copy to an empty data array!");
CheckNTErrors((s->unitNum == t->unitNum), "Unmatched data item number!"); CheckNTErrors((s->unitNum == t->unitNum), "Unmatched data item number!");
/* make tensor connections */
XLink::MakeLink(s, NULL, t, MATH_COPYVALUES);
if ((s->dataType == X_FLOAT16 && t->dataType == X_FLOAT) || if ((s->dataType == X_FLOAT16 && t->dataType == X_FLOAT) ||
(s->dataType == X_FLOAT && t->dataType == X_FLOAT16)) { (s->dataType == X_FLOAT && t->dataType == X_FLOAT16)) {
CheckNTErrors(((s->devID < 0 && t->devID < 0) || s->devID == t->devID), CheckNTErrors(((s->devID < 0 && t->devID < 0) || s->devID == t->devID),
......
...@@ -21,8 +21,8 @@ ...@@ -21,8 +21,8 @@
#include "CopyValues.h" #include "CopyValues.h"
#include "CopyValues.cuh" #include "CopyValues.cuh"
#include "../XUtility.h" #include "../../XUtility.h"
#include "../XDevice.h" #include "../../XDevice.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __COPYVALUES_CUH__ #ifndef __COPYVALUES_CUH__
#define __COPYVALUES_CUH__ #define __COPYVALUES_CUH__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __COPYVALUES_H__ #ifndef __COPYVALUES_H__
#define __COPYVALUES_H__ #define __COPYVALUES_H__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -19,7 +19,8 @@ ...@@ -19,7 +19,8 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XTensor.h" #include "../../XTensor.h"
#include "../../XName.h"
#include "ReduceMax.h" #include "ReduceMax.h"
#include "ReduceMax.cuh" #include "ReduceMax.cuh"
...@@ -52,6 +53,10 @@ void ReduceMax(XTensor * input, XTensor * output, int dim) ...@@ -52,6 +53,10 @@ void ReduceMax(XTensor * input, XTensor * output, int dim)
} }
} }
/* make tensor connections */
XLink::MakeLink(input, NULL, output, MATH_REDUCEMAX);
XLink::AddParamToHeadInt(output, dim);
if(input->devID >= 0){ if(input->devID >= 0){
#ifdef USE_CUDA #ifdef USE_CUDA
CudaReduceMax(input, output, dim); CudaReduceMax(input, output, dim);
......
...@@ -19,9 +19,9 @@ ...@@ -19,9 +19,9 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XDevice.h" #include "../../XDevice.h"
#include "../XTensor.h" #include "../../XTensor.h"
#include "../XUtility.h" #include "../../XUtility.h"
#include "ReduceMax.h" #include "ReduceMax.h"
#include "ReduceMax.cuh" #include "ReduceMax.cuh"
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __REDUCEMAX_H__ #ifndef __REDUCEMAX_H__
#define __REDUCEMAX_H__ #define __REDUCEMAX_H__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts{ // namespace nts(NiuTrans.Tensor) namespace nts{ // namespace nts(NiuTrans.Tensor)
......
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "ScaleAndShift.h" #include "../math/ScaleAndShift.h"
#include "ReduceSum.h" #include "ReduceSum.h"
#include "ReduceMean.h" #include "ReduceMean.h"
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __REDUCEMEAN_H__ #ifndef __REDUCEMEAN_H__
#define __REDUCEMEAN_H__ #define __REDUCEMEAN_H__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts{ // namespace nts(NiuTrans.Tensor) namespace nts{ // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __REDUCESTANDARDVARIANCE_H__ #ifndef __REDUCESTANDARDVARIANCE_H__
#define __REDUCESTANDARDVARIANCE_H__ #define __REDUCESTANDARDVARIANCE_H__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include <math.h> #include <math.h>
#include "ReduceSum.h" #include "ReduceSum.h"
#include "ReduceSum.cuh" #include "ReduceSum.cuh"
#include "../../XName.h"
namespace nts{ // namespace nts(NiuTrans.Tensor) namespace nts{ // namespace nts(NiuTrans.Tensor)
...@@ -58,6 +59,12 @@ void ReduceSum(XTensor * input, XTensor * output, int dim, XTensor * shift, DTYP ...@@ -58,6 +59,12 @@ void ReduceSum(XTensor * input, XTensor * output, int dim, XTensor * shift, DTYP
} }
} }
/* make tensor connections */
XLink::MakeLink(input, shift, output, MATH_REDUCESUM);
XLink::AddParamToHeadInt(output, dim);
XLink::AddParamToHead(output, power);
XLink::AddParamToHeadInt(output, isExp);
if(input->devID >= 0){ if(input->devID >= 0){
#ifdef USE_CUDA #ifdef USE_CUDA
CudaReduceSum(input, output, dim, shift, power, isExp); CudaReduceSum(input, output, dim, shift, power, isExp);
......
...@@ -19,8 +19,8 @@ ...@@ -19,8 +19,8 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XDevice.h" #include "../../XDevice.h"
#include "../XUtility.h" #include "../../XUtility.h"
#include "ReduceSum.cuh" #include "ReduceSum.cuh"
namespace nts{ // namespace nts(NiuTrans.Tensor) namespace nts{ // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __REDUCESUM_H__ #ifndef __REDUCESUM_H__
#define __REDUCESUM_H__ #define __REDUCESUM_H__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts{ // namespace nts(NiuTrans.Tensor) namespace nts{ // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __REDUCESUMSQUARED_H__ #ifndef __REDUCESUMSQUARED_H__
#define __REDUCESUMSQUARED_H__ #define __REDUCESUMSQUARED_H__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts{ // namespace nts(NiuTrans.Tensor) namespace nts{ // namespace nts(NiuTrans.Tensor)
......
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "ScaleAndShift.h" #include "../math/ScaleAndShift.h"
#include "ReduceSum.h" #include "ReduceSum.h"
#include "ReduceVariance.h" #include "ReduceVariance.h"
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __REDUCEVARIANCE_H__ #ifndef __REDUCEVARIANCE_H__
#define __REDUCEVARIANCE_H__ #define __REDUCEVARIANCE_H__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts{ // namespace nts(NiuTrans.Tensor) namespace nts{ // namespace nts(NiuTrans.Tensor)
......
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XTensor.h" #include "../../XTensor.h"
#include "Concatenate.h" #include "Concatenate.h"
#include "Merge.h" #include "Merge.h"
#include "ConcatenateSolely.h" #include "ConcatenateSolely.h"
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __CONCATENATE_H__ #ifndef __CONCATENATE_H__
#define __CONCATENATE_H__ #define __CONCATENATE_H__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -19,8 +19,9 @@ ...@@ -19,8 +19,9 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XTensor.h" #include "../../XTensor.h"
#include "../XUtility.h" #include "../../XUtility.h"
#include "../../XName.h"
#include "ConcatenateSolely.h" #include "ConcatenateSolely.h"
#include "MergeBlockLists.h" #include "MergeBlockLists.h"
...@@ -36,6 +37,10 @@ void ConcatenateSolely(XList * smalls, XTensor * big, int dim) ...@@ -36,6 +37,10 @@ void ConcatenateSolely(XList * smalls, XTensor * big, int dim)
{ {
CheckNTErrors((big->order > dim && dim >= 0), "Illegal dimension to concatenate!"); CheckNTErrors((big->order > dim && dim >= 0), "Illegal dimension to concatenate!");
/* make tensor connections */
XLink::MakeLink(smalls, big, MATH_CONCATENATESOLY);
XLink::AddParamToHeadInt(big, dim);
int catDimSize = 0; int catDimSize = 0;
int dimRDI = big->order - dim - 1; int dimRDI = big->order - dim - 1;
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __CONCATENATESOLELY_H__ #ifndef __CONCATENATESOLELY_H__
#define __CONCATENATESOLELY_H__ #define __CONCATENATESOLELY_H__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XTensor.h" #include "../../XTensor.h"
#include "MakeMergeBlockIndex.h" #include "MakeMergeBlockIndex.h"
#include "MakeMergeBlockIndex.cuh" #include "MakeMergeBlockIndex.cuh"
......
...@@ -19,8 +19,8 @@ ...@@ -19,8 +19,8 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XDevice.h" #include "../../XDevice.h"
#include "../XTensor.h" #include "../../XTensor.h"
#include "MakeMergeBlockIndex.h" #include "MakeMergeBlockIndex.h"
#include "MakeMergeBlockIndex.cuh" #include "MakeMergeBlockIndex.cuh"
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __CUDAMAKEMERGEBLOCKINDEX_CUH__ #ifndef __CUDAMAKEMERGEBLOCKINDEX_CUH__
#define __CUDAMAKEMERGEBLOCKINDEX_CUH__ #define __CUDAMAKEMERGEBLOCKINDEX_CUH__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __MAKEMERGEBLOCKINDEX_H__ #ifndef __MAKEMERGEBLOCKINDEX_H__
#define __MAKEMERGEBLOCKINDEX_H__ #define __MAKEMERGEBLOCKINDEX_H__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XTensor.h" #include "../../XTensor.h"
#include "MakeSplitBlockIndex.h" #include "MakeSplitBlockIndex.h"
#include "MakeSplitBlockIndex.cuh" #include "MakeSplitBlockIndex.cuh"
......
...@@ -19,8 +19,8 @@ ...@@ -19,8 +19,8 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XDevice.h" #include "../../XDevice.h"
#include "../XTensor.h" #include "../../XTensor.h"
#include "MakeSplitBlockIndex.h" #include "MakeSplitBlockIndex.h"
#include "MakeSplitBlockIndex.cuh" #include "MakeSplitBlockIndex.cuh"
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __MAKESPLITBLOCKINDEX_H__ #ifndef __MAKESPLITBLOCKINDEX_H__
#define __MAKESPLITBLOCKINDEX_H__ #define __MAKESPLITBLOCKINDEX_H__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -19,11 +19,12 @@ ...@@ -19,11 +19,12 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XTensor.h" #include "../../XTensor.h"
#include "../XUtility.h" #include "../../XUtility.h"
#include "../../XName.h"
#include "Merge.h" #include "Merge.h"
#include "MakeMergeBlockIndex.h" #include "MakeMergeBlockIndex.h"
#include "CopyBlocksOnSite.h" #include "../movement/CopyBlocksOnSite.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -62,6 +63,11 @@ void Merge(XTensor * s, XTensor * t, int whereToMerge, int leadingDim) ...@@ -62,6 +63,11 @@ void Merge(XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
} }
} }
/* make tensor connections */
XLink::MakeLink(s, NULL, t, MATH_MERGE);
XLink::AddParamToHeadInt(t, whereToMerge);
XLink::AddParamToHeadInt(t, leadingDim);
int blockSize = 1; int blockSize = 1;
int blockNum = 1; int blockNum = 1;
int gridSize = 1; int gridSize = 1;
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __MERGE_H__ #ifndef __MERGE_H__
#define __MERGE_H__ #define __MERGE_H__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -19,8 +19,8 @@ ...@@ -19,8 +19,8 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XTensor.h" #include "../../XTensor.h"
#include "../XUtility.h" #include "../../XUtility.h"
#include "MergeBlockLists.h" #include "MergeBlockLists.h"
#include "MergeBlockLists.cuh" #include "MergeBlockLists.cuh"
......
...@@ -19,9 +19,9 @@ ...@@ -19,9 +19,9 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XDevice.h" #include "../../XDevice.h"
#include "../XUtility.h" #include "../../XUtility.h"
#include "../XTensor.h" #include "../../XTensor.h"
#include "MergeBlockLists.h" #include "MergeBlockLists.h"
#include "MergeBlockLists.cuh" #include "MergeBlockLists.cuh"
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __MERGEBLOCKLISTS_H__ #ifndef __MERGEBLOCKLISTS_H__
#define __MERGEBLOCKLISTS_H__ #define __MERGEBLOCKLISTS_H__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-05
*/
#ifndef __PERMUTE_H__
#define __PERMUTE_H__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#define permute _Permute_
/* generate the tensor with permuted dimensions: b = permuted(a) */
extern "C"
void Permute(XTensor * a, XTensor * b, int * dimPermute);
/* permute the tensor dimensions on site: a = permuted(a) */
extern "C"
void Permute_(XTensor * a, int * dimPermute);
/* make a tensor with permuted dimensions: b = permuted(a) and return its pointer */
extern "C"
XTensor * _Permute(XTensor *a, int * dimPermute);
/* make a tensor with permuted dimensions: b = permuted(a) and return its body */
extern "C"
XTensor& _Permute_(XTensor &a, int * dimPermute);
} // namespace nts(NiuTrans.Tensor)
#endif // __PERMUTE_H__
...@@ -19,11 +19,11 @@ ...@@ -19,11 +19,11 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XTensor.h" #include "../../XTensor.h"
#include "../XUtility.h" #include "../../XUtility.h"
#include "Split.h" #include "Split.h"
#include "MakeSplitBlockIndex.h" #include "MakeSplitBlockIndex.h"
#include "CopyBlocksOnSite.h" #include "../movement/CopyBlocksOnSite.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __SPLIT_H__ #ifndef __SPLIT_H__
#define __SPLIT_H__ #define __SPLIT_H__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-05
* It will rain tomorrow - end of the hot days :)
*/
#ifndef __TRANSPOSE_H__
#define __TRANSPOSE_H__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
#define transpose _Transpose_
/* generate a transposed 1D/2D tensor: b = transposed(a) */
void Transpose(XTensor * a, XTensor * b);
/* transpose a 1D/2D tensor on site: a = transposed(a) */
void Transpose_(XTensor * a);
/* make a transposed 1D/2D tensor: b = transposed(a) and return its pointer */
XTensor * _Transpose(XTensor * a);
/* make a transposed 1D/2D tensor: b = transposed(a) and return its body */
XTensor & _Transpose_(XTensor & a);
} // namespace nts(NiuTrans.Tensor)
#endif // __TRANSPOSE_H__
...@@ -19,7 +19,8 @@ ...@@ -19,7 +19,8 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XTensor.h" #include "../../XTensor.h"
#include "../../XName.h"
#include "Unsqueeze.h" #include "Unsqueeze.h"
#include "MergeBlockLists.h" #include "MergeBlockLists.h"
#include "Unsqueeze.cuh" #include "Unsqueeze.cuh"
...@@ -39,6 +40,11 @@ void Unsqueeze(XTensor * a, XTensor * b, int dim, int dSize) ...@@ -39,6 +40,11 @@ void Unsqueeze(XTensor * a, XTensor * b, int dim, int dSize)
CheckNTErrors((a->order == b->order - 1), "Unmatched tensors!"); CheckNTErrors((a->order == b->order - 1), "Unmatched tensors!");
CheckNTErrors((a->unitSize == b->unitSize), "Unmatched tensors!"); CheckNTErrors((a->unitSize == b->unitSize), "Unmatched tensors!");
/* make tensor connections */
XLink::MakeLink(a, NULL, b, MATH_UNSQUEEZE);
XLink::AddParamToHeadInt(b, dim);
XLink::AddParamToHeadInt(b, dSize);
int dimRDI = b->order - dim - 1; int dimRDI = b->order - dim - 1;
for (int i = 0; i < b->order; i++) { for (int i = 0; i < b->order; i++) {
if (i < dimRDI) { if (i < dimRDI) {
......
...@@ -19,8 +19,8 @@ ...@@ -19,8 +19,8 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XDevice.h" #include "../../XDevice.h"
#include "../XTensor.h" #include "../../XTensor.h"
#include "Unsqueeze.h" #include "Unsqueeze.h"
#include "Unsqueeze.cuh" #include "Unsqueeze.cuh"
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __UNSQUEEZE_H__ #ifndef __UNSQUEEZE_H__
#define __UNSQUEEZE_H__ #define __UNSQUEEZE_H__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -19,8 +19,9 @@ ...@@ -19,8 +19,9 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XTensor.h" #include "../../XTensor.h"
#include "../XUtility.h" #include "../../XUtility.h"
#include "../../XName.h"
#include "Sort.h" #include "Sort.h"
#include "Sort.cuh" #include "Sort.cuh"
...@@ -38,8 +39,11 @@ void Sort(XTensor * a, XTensor * index, int dim) ...@@ -38,8 +39,11 @@ void Sort(XTensor * a, XTensor * index, int dim)
CheckNTErrors((a->order == index->order), "Unmatched input tensors!"); CheckNTErrors((a->order == index->order), "Unmatched input tensors!");
CheckNTErrors((index->dataType == X_INT), "Wrong data type!"); CheckNTErrors((index->dataType == X_INT), "Wrong data type!");
int dimRDI = a->order - dim - 1; /* make tensor connections */
XLink::MakeLink(a, NULL, index, MATH_SORT);
XLink::AddParamToHeadInt(index, dim);
int dimRDI = a->order - dim - 1;
/* make the index tensor */ /* make the index tensor */
index->SetAscendingOrder(dim); index->SetAscendingOrder(dim);
......
...@@ -19,9 +19,9 @@ ...@@ -19,9 +19,9 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XDevice.h" #include "../../XDevice.h"
#include "../XUtility.h" #include "../../XUtility.h"
#include "../XTensor.h" #include "../../XTensor.h"
#include "Sort.h" #include "Sort.h"
#include "Sort.cuh" #include "Sort.cuh"
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __SORT_H__ #ifndef __SORT_H__
#define __SORT_H__ #define __SORT_H__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -19,7 +19,8 @@ ...@@ -19,7 +19,8 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XTensor.h" #include "../../XTensor.h"
#include "../../XName.h"
#include "TopK.h" #include "TopK.h"
#include "TopK.cuh" #include "TopK.cuh"
...@@ -40,6 +41,11 @@ void TopK(XTensor * a, XTensor * b, XTensor * index, int dim, int k) ...@@ -40,6 +41,11 @@ void TopK(XTensor * a, XTensor * b, XTensor * index, int dim, int k)
CheckNTErrors((index == NULL || a->order == index->order), "Unmatched input tensors!"); CheckNTErrors((index == NULL || a->order == index->order), "Unmatched input tensors!");
CheckNTErrors((index->dataType == X_INT), "Wrong data type!"); CheckNTErrors((index->dataType == X_INT), "Wrong data type!");
/* make tensor connections */
XLink::MakeLink(a, b, index, MATH_TOPK);
XLink::AddParamToHeadInt(index, dim);
XLink::AddParamToHeadInt(index, k);
int dimRDI = a->order - dim - 1; int dimRDI = a->order - dim - 1;
for (int i = 0; i < a->order; i++) { for (int i = 0; i < a->order; i++) {
if (i == dimRDI) { if (i == dimRDI) {
......
...@@ -19,9 +19,9 @@ ...@@ -19,9 +19,9 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XDevice.h" #include "../../XDevice.h"
#include "../XUtility.h" #include "../../XUtility.h"
#include "../XTensor.h" #include "../../XTensor.h"
#include "TopK.h" #include "TopK.h"
#include "TopK.cuh" #include "TopK.cuh"
#include "Sort.cuh" #include "Sort.cuh"
...@@ -95,11 +95,6 @@ public: ...@@ -95,11 +95,6 @@ public:
/* swap */ /* swap */
__device__ void Swap(int i, int j) __device__ void Swap(int i, int j)
{ {
/*
CudaHeapNode<T> tmp = items[i];
items[i] = items[j];
items[j] = tmp;
*/
int tmpIndex = items[i].index; int tmpIndex = items[i].index;
T tmpValue = items[i].value; T tmpValue = items[i].value;
items[i] = items[j]; items[i] = items[j];
...@@ -433,7 +428,6 @@ void CudaTopK(XTensor * a, XTensor * b, XTensor * index, int dim, int k) ...@@ -433,7 +428,6 @@ void CudaTopK(XTensor * a, XTensor * b, XTensor * index, int dim, int k)
} }
} }
/* we resort to sorting if the data cannot fit inside the shared memory */ /* we resort to sorting if the data cannot fit inside the shared memory */
else { else {
int dimSize[MAX_TENSOR_DIM_NUM]; int dimSize[MAX_TENSOR_DIM_NUM];
...@@ -449,8 +443,6 @@ void CudaTopK(XTensor * a, XTensor * b, XTensor * index, int dim, int k) ...@@ -449,8 +443,6 @@ void CudaTopK(XTensor * a, XTensor * b, XTensor * index, int dim, int k)
if (a->mem != NULL) if (a->mem != NULL)
a->mem->ReleaseBuf(a->devID, a->unitNum * sizeof(int)); a->mem->ReleaseBuf(a->devID, a->unitNum * sizeof(int));
else
XMemFree(a->devID, indexA->data);
delete indexA; delete indexA;
} }
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __TOPK_H__ #ifndef __TOPK_H__
#define __TOPK_H__ #define __TOPK_H__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-06-22 * $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2018-06-22
*/ */
#include "../XUtility.h" #include "../../XUtility.h"
#include "FlushToMem.h" #include "FlushToMem.h"
#include "FlushToMem.cuh" #include "FlushToMem.cuh"
......
...@@ -20,7 +20,7 @@ ...@@ -20,7 +20,7 @@
*/ */
#include "FlushToMem.cuh" #include "FlushToMem.cuh"
#include "../XUtility.h" #include "../../XUtility.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __FLUSHTOMEM_CUH__ #ifndef __FLUSHTOMEM_CUH__
#define __FLUSHTOMEM_CUH__ #define __FLUSHTOMEM_CUH__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __FLUSHTOMEM_H__ #ifndef __FLUSHTOMEM_H__
#define __FLUSHTOMEM_H__ #define __FLUSHTOMEM_H__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -20,7 +20,7 @@ ...@@ -20,7 +20,7 @@
*/ */
#include "SetAscendingOrder.cuh" #include "SetAscendingOrder.cuh"
#include "../XDevice.h" #include "../../XDevice.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __SETASCENDINGORDER_CUH__ #ifndef __SETASCENDINGORDER_CUH__
#define __SETASCENDINGORDER_CUH__ #define __SETASCENDINGORDER_CUH__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __XMATRIXSEGMENT_H__ #ifndef __XMATRIXSEGMENT_H__
#define __XMATRIXSEGMENT_H__ #define __XMATRIXSEGMENT_H__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -21,7 +21,7 @@ ...@@ -21,7 +21,7 @@
#include "Identity.h" #include "Identity.h"
#include "../XUtility.h" #include "../XUtility.h"
#include "../core/CopyValues.h" #include "../core/movement/CopyValues.h"
namespace nts{ // namespace nts(NiuTrans.Tensor) namespace nts{ // namespace nts(NiuTrans.Tensor)
......
...@@ -23,9 +23,9 @@ ...@@ -23,9 +23,9 @@
#include "../XUtility.h" #include "../XUtility.h"
#include "LogSoftmax.h" #include "LogSoftmax.h"
#include "LogSoftmax.cuh" #include "LogSoftmax.cuh"
#include "../core/ReduceSum.h" #include "../core/reduce/ReduceSum.h"
#include "../core/ReduceMax.h" #include "../core/reduce/ReduceMax.h"
#include "../core/CopyValues.h" #include "../core/movement/CopyValues.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,8 +22,8 @@ ...@@ -22,8 +22,8 @@
#include "LogSoftmax.h" #include "LogSoftmax.h"
#include "LogSoftmax.cuh" #include "LogSoftmax.cuh"
#include "Loss.cuh" #include "Loss.cuh"
#include "../core/ReduceSum.cuh" #include "../core/reduce/ReduceSum.cuh"
#include "../core/ReduceMax.cuh" #include "../core/reduce/ReduceMax.cuh"
#include "../XDevice.h" #include "../XDevice.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -23,8 +23,8 @@ ...@@ -23,8 +23,8 @@
#include "Softmax.h" #include "Softmax.h"
#include "Softmax.cuh" #include "Softmax.cuh"
#include "../XUtility.h" #include "../XUtility.h"
#include "../core/ReduceSum.h" #include "../core/reduce/ReduceSum.h"
#include "../core/ReduceMax.h" #include "../core/reduce/ReduceMax.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,10 +22,10 @@ ...@@ -22,10 +22,10 @@
#include "Softmax.h" #include "Softmax.h"
#include "Softmax.cuh" #include "Softmax.cuh"
#include "Loss.cuh" #include "Loss.cuh"
#include "../core/ReduceSum.h" #include "../core/reduce/ReduceSum.h"
#include "../core/MultiplyElementWise.h" #include "../core/arithmetic/Multiply.h"
#include "../core/Unsqueeze.h" #include "../core/shape/Unsqueeze.h"
#include "../core/Sum.h" #include "../core/arithmetic/Sum.h"
#include "../XDevice.h" #include "../XDevice.h"
#include "../XUtility.h" #include "../XUtility.h"
...@@ -288,7 +288,7 @@ void CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, ...@@ -288,7 +288,7 @@ void CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
beta->data = mem->AllocBuf(mem->devID, beta->unitNum * beta->unitSize); beta->data = mem->AllocBuf(mem->devID, beta->unitNum * beta->unitSize);
/* \beta = \sum_i (dE/dy_i * y_i) */ /* \beta = \sum_i (dE/dy_i * y_i) */
MultiplyElementWise(dedy, y, ytmp, 0); Multiply(dedy, y, ytmp, 0);
ReduceSum(ytmp, beta, leadDim); ReduceSum(ytmp, beta, leadDim);
/* ytmp = dE/dy_j - \beta */ /* ytmp = dE/dy_j - \beta */
...@@ -296,7 +296,7 @@ void CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x, ...@@ -296,7 +296,7 @@ void CudaSoftmaxBackward(XTensor * gold, XTensor * y, XTensor * x,
Sum(dedy, ytmp, ytmp, -1.0F); Sum(dedy, ytmp, ytmp, -1.0F);
/* dE/ds_j = y_j * ytmp = y_j * (dE/dy_j - \beta) */ /* dE/ds_j = y_j * ytmp = y_j * (dE/dy_j - \beta) */
MultiplyElementWise(y, ytmp, dedx, 0); Multiply(y, ytmp, dedx, 0);
mem->ReleaseBuf(mem->devID, y->unitNum * y->unitSize); mem->ReleaseBuf(mem->devID, y->unitNum * y->unitSize);
mem->ReleaseBuf(mem->devID, beta->unitNum * beta->unitSize); mem->ReleaseBuf(mem->devID, beta->unitNum * beta->unitSize);
......
...@@ -485,7 +485,7 @@ float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs) ...@@ -485,7 +485,7 @@ float GetProb(XTensor &output, XTensor &gold, XTensor * wordProbs)
InitTensor(&probs, &output); InitTensor(&probs, &output);
/* probs[i,j] = output[i,j] * gold[i,j] */ /* probs[i,j] = output[i,j] * gold[i,j] */
MultiplyElementWise(&output, &gold, &probs, 0); Multiply(&output, &gold, &probs, 0);
/* probability of each word */ /* probability of each word */
XTensor wprobs; XTensor wprobs;
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __TEST_CONCATENATE_H__ #ifndef __TEST_CONCATENATE_H__
#define __TEST_CONCATENATE_H__ #define __TEST_CONCATENATE_H__
#include "../core/Concatenate.h" #include "../core/shape/Concatenate.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __TEST_CONCATENATESOLELY_H__ #ifndef __TEST_CONCATENATESOLELY_H__
#define __TEST_CONCATENATESOLELY_H__ #define __TEST_CONCATENATESOLELY_H__
#include "../core/ConcatenateSolely.h" #include "../core/shape/ConcatenateSolely.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __TEST_COPYINDEXED_H__ #ifndef __TEST_COPYINDEXED_H__
#define __TEST_COPYINDEXED_H__ #define __TEST_COPYINDEXED_H__
#include "../core/CopyIndexed.h" #include "../core/movement/CopyIndexed.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __TEST_COPYVALUES_H__ #ifndef __TEST_COPYVALUES_H__
#define __TEST_COPYVALUES_H__ #define __TEST_COPYVALUES_H__
#include "../core/CopyValues.h" #include "../core/movement/CopyValues.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
* $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2018-04-30 * $Created by: LI Yinqiao (email: li.yin.qiao.2012@hotmail.com) 2018-04-30
*/ */
#include "../core/ScaleAndShift.h" #include "../core/math/ScaleAndShift.h"
#include "../function/Loss.h" #include "../function/Loss.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __TEST_MATRIXMULBATCHEDCPU_H__ #ifndef __TEST_MATRIXMULBATCHEDCPU_H__
#define __TEST_MATRIXMULBATCHEDCPU_H__ #define __TEST_MATRIXMULBATCHEDCPU_H__
#include "../core/MatrixMULBatchedCPU.h" #include "../core/arithmetic/MatrixMULBatchedCPU.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __TEST_MATRIXMUL_H__ #ifndef __TEST_MATRIXMUL_H__
#define __TEST_MATRIXMUL_H__ #define __TEST_MATRIXMUL_H__
#include "../core/MatrixMul.h" #include "../core/arithmetic/MatrixMul.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __TEST_MATRIXMUL2D_H__ #ifndef __TEST_MATRIXMUL2D_H__
#define __TEST_MATRIXMUL2D_H__ #define __TEST_MATRIXMUL2D_H__
#include "../core/MatrixMul2D.h" #include "../core/arithmetic/MatrixMul2D.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __TEST_MATRIXMUL2DPARALLEL_H__ #ifndef __TEST_MATRIXMUL2DPARALLEL_H__
#define __TEST_MATRIXMUL2DPARALLEL_H__ #define __TEST_MATRIXMUL2DPARALLEL_H__
#include "../core/MatrixMul2DParallel.h" #include "../core/arithmetic/MatrixMul2DParallel.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __TEST_MATRIXMULBATCHED_H__ #ifndef __TEST_MATRIXMULBATCHED_H__
#define __TEST_MATRIXMULBATCHED_H__ #define __TEST_MATRIXMULBATCHED_H__
#include "../core/MatrixMulBatched.h" #include "../core/arithmetic/MatrixMulBatched.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __TEST_MERGE_H__ #ifndef __TEST_MERGE_H__
#define __TEST_MERGE_H__ #define __TEST_MERGE_H__
#include "../core/Merge.h" #include "../core/shape/Merge.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
* $Created by: Lin Ye (email: linye2015@outlook.com) 2018-06-15 * $Created by: Lin Ye (email: linye2015@outlook.com) 2018-06-15
*/ */
#include "TMultiplyElementWise.h" #include "TMultiply.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -28,7 +28,7 @@ case 1: element-wise product of two tensors ...@@ -28,7 +28,7 @@ case 1: element-wise product of two tensors
c(i) = a(i)*b(i) + \alpha * c(i) c(i) = a(i)*b(i) + \alpha * c(i)
In this case, (2, 1) (2, 1) -> (2, 1), leadingDim=0, alpha=0. In this case, (2, 1) (2, 1) -> (2, 1), leadingDim=0, alpha=0.
*/ */
bool TestMultiplyElementWise1() bool TestMultiply1()
{ {
/* a source tensor of size (2, 1) */ /* a source tensor of size (2, 1) */
int sOrder1 = 2; int sOrder1 = 2;
...@@ -81,7 +81,7 @@ bool TestMultiplyElementWise1() ...@@ -81,7 +81,7 @@ bool TestMultiplyElementWise1()
t->SetZeroAll(); t->SetZeroAll();
/* call MultiplyElementWise function */ /* call MultiplyElementWise function */
MultiplyElementWise(s1, s2, t, 0); Multiply(s1, s2, t, 0);
/* check results */ /* check results */
cpuTest = t->CheckData(answer, tUnitNum); cpuTest = t->CheckData(answer, tUnitNum);
...@@ -101,7 +101,7 @@ bool TestMultiplyElementWise1() ...@@ -101,7 +101,7 @@ bool TestMultiplyElementWise1()
tGPU->SetZeroAll(); tGPU->SetZeroAll();
/* call MultiplyElementWise function */ /* call MultiplyElementWise function */
MultiplyElementWise(sGPU1, sGPU2, tGPU, 0); Multiply(sGPU1, sGPU2, tGPU, 0);
/* check results */ /* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum); gpuTest = tGPU->CheckData(answer, tUnitNum);
...@@ -136,7 +136,7 @@ case 2: element-wise product of two tensors ...@@ -136,7 +136,7 @@ case 2: element-wise product of two tensors
c(i) = a(i)*b(i) + \alpha * c(i) c(i) = a(i)*b(i) + \alpha * c(i)
In this case, (2, 2) (2, 2) -> (2, 2), leadingDim=0, alpha=0. In this case, (2, 2) (2, 2) -> (2, 2), leadingDim=0, alpha=0.
*/ */
bool TestMultiplyElementWise2() bool TestMultiply2()
{ {
/* a source tensor of size (2, 2) */ /* a source tensor of size (2, 2) */
int sOrder1 = 2; int sOrder1 = 2;
...@@ -189,7 +189,7 @@ bool TestMultiplyElementWise2() ...@@ -189,7 +189,7 @@ bool TestMultiplyElementWise2()
t->SetZeroAll(); t->SetZeroAll();
/* call MultiplyElementWise function */ /* call MultiplyElementWise function */
MultiplyElementWise(s1, s2, t, 0); Multiply(s1, s2, t, 0);
/* check results */ /* check results */
cpuTest = t->CheckData(answer, tUnitNum); cpuTest = t->CheckData(answer, tUnitNum);
...@@ -209,7 +209,7 @@ bool TestMultiplyElementWise2() ...@@ -209,7 +209,7 @@ bool TestMultiplyElementWise2()
tGPU->SetZeroAll(); tGPU->SetZeroAll();
/* call MultiplyElementWise function */ /* call MultiplyElementWise function */
MultiplyElementWise(sGPU1, sGPU2, tGPU, 0); Multiply(sGPU1, sGPU2, tGPU, 0);
/* check results */ /* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum); gpuTest = tGPU->CheckData(answer, tUnitNum);
...@@ -243,7 +243,7 @@ bool TestMultiplyElementWise2() ...@@ -243,7 +243,7 @@ bool TestMultiplyElementWise2()
case 3: element-wise product of two tensors, c(i) = a(i)*b(i) + \alpha * c(i) case 3: element-wise product of two tensors, c(i) = a(i)*b(i) + \alpha * c(i)
In this case, (2, 2) (2, 2) -> (2, 2), leadingDim=1, alpha=0. In this case, (2, 2) (2, 2) -> (2, 2), leadingDim=1, alpha=0.
*/ */
bool TestMultiplyElementWise3() bool TestMultiply3()
{ {
/* a source tensor of size (2, 2) */ /* a source tensor of size (2, 2) */
int sOrder1 = 2; int sOrder1 = 2;
...@@ -296,7 +296,7 @@ bool TestMultiplyElementWise3() ...@@ -296,7 +296,7 @@ bool TestMultiplyElementWise3()
t->SetZeroAll(); t->SetZeroAll();
/* call MultiplyElementWise function */ /* call MultiplyElementWise function */
MultiplyElementWise(s1, s2, t, 1); Multiply(s1, s2, t, 1);
/* check results */ /* check results */
cpuTest = t->CheckData(answer, tUnitNum); cpuTest = t->CheckData(answer, tUnitNum);
...@@ -316,7 +316,7 @@ bool TestMultiplyElementWise3() ...@@ -316,7 +316,7 @@ bool TestMultiplyElementWise3()
tGPU->SetZeroAll(); tGPU->SetZeroAll();
/* call MultiplyElementWise function */ /* call MultiplyElementWise function */
MultiplyElementWise(sGPU1, sGPU2, tGPU, 1); Multiply(sGPU1, sGPU2, tGPU, 1);
/* check results */ /* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum); gpuTest = tGPU->CheckData(answer, tUnitNum);
...@@ -352,13 +352,13 @@ TODO!! ...@@ -352,13 +352,13 @@ TODO!!
*/ */
/* test for MultiplyElementWise Function */ /* test for MultiplyElementWise Function */
bool TestMultiplyElementWise() bool TestMultiply()
{ {
XPRINT(0, stdout, "[TEST MULTIPLYELEMENTWISE] element-wise product of two tensors \n"); XPRINT(0, stdout, "[TEST MULTIPLYELEMENTWISE] element-wise product of two tensors \n");
bool returnFlag = true, caseFlag = true; bool returnFlag = true, caseFlag = true;
/* case 1 test */ /* case 1 test */
caseFlag = TestMultiplyElementWise1(); caseFlag = TestMultiply1();
if (!caseFlag) { if (!caseFlag) {
returnFlag = false; returnFlag = false;
...@@ -368,7 +368,7 @@ bool TestMultiplyElementWise() ...@@ -368,7 +368,7 @@ bool TestMultiplyElementWise()
XPRINT(0, stdout, ">> case 1 passed!\n"); XPRINT(0, stdout, ">> case 1 passed!\n");
/* case 2 test */ /* case 2 test */
caseFlag = TestMultiplyElementWise2(); caseFlag = TestMultiply2();
if (!caseFlag) { if (!caseFlag) {
returnFlag = false; returnFlag = false;
...@@ -378,7 +378,7 @@ bool TestMultiplyElementWise() ...@@ -378,7 +378,7 @@ bool TestMultiplyElementWise()
XPRINT(0, stdout, ">> case 2 passed!\n"); XPRINT(0, stdout, ">> case 2 passed!\n");
/* case 3 test */ /* case 3 test */
caseFlag = TestMultiplyElementWise3(); caseFlag = TestMultiply3();
if (!caseFlag) { if (!caseFlag) {
returnFlag = false; returnFlag = false;
......
...@@ -22,13 +22,13 @@ ...@@ -22,13 +22,13 @@
#ifndef __TEST_MULTIPLYELEMENTWISE_H__ #ifndef __TEST_MULTIPLYELEMENTWISE_H__
#define __TEST_MULTIPLYELEMENTWISE_H__ #define __TEST_MULTIPLYELEMENTWISE_H__
#include "../core/MultiplyElementWise.h" #include "../core/arithmetic/Multiply.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
/* test for MultiplyElementWise Function */ /* test for MultiplyElementWise Function */
extern "C" extern "C"
bool TestMultiplyElementWise(); bool TestMultiply();
} // namespace nts(NiuTrans.Tensor) } // namespace nts(NiuTrans.Tensor)
#endif // __TEST_MULTIPLYELEMENTWISE_H__ #endif // __TEST_MULTIPLYELEMENTWISE_H__
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __TEST_NEGATE_H__ #ifndef __TEST_NEGATE_H__
#define __TEST_NEGATE_H__ #define __TEST_NEGATE_H__
#include "../core/Negate.h" #include "../core/arithmetic/Negate.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __TEST_NORMALIZE_H__ #ifndef __TEST_NORMALIZE_H__
#define __TEST_NORMALIZE_H__ #define __TEST_NORMALIZE_H__
#include "../core/Normalize.h" #include "../core/math/Normalize.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __TEST_POWER_H__ #ifndef __TEST_POWER_H__
#define __TEST_POWER_H__ #define __TEST_POWER_H__
#include "../core/Power.h" #include "../core/math/Power.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __TEST_REDUCEMAX_H__ #ifndef __TEST_REDUCEMAX_H__
#define __TEST_REDUCEMAX_H__ #define __TEST_REDUCEMAX_H__
#include "../core/ReduceMax.h" #include "../core/reduce/ReduceMax.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __TEST_REDUCEMEAN_H__ #ifndef __TEST_REDUCEMEAN_H__
#define __TEST_REDUCEMEAN_H__ #define __TEST_REDUCEMEAN_H__
#include "../core/ReduceMean.h" #include "../core/reduce/ReduceMean.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __TEST_REDUCESUM_H__ #ifndef __TEST_REDUCESUM_H__
#define __TEST_REDUCESUM_H__ #define __TEST_REDUCESUM_H__
#include "../core/ReduceSum.h" #include "../core/reduce/ReduceSum.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __TEST_REDUCESUMSQUARED_H__ #ifndef __TEST_REDUCESUMSQUARED_H__
#define __TEST_REDUCESUMSQUARED_H__ #define __TEST_REDUCESUMSQUARED_H__
#include "../core/ReduceSumSquared.h" #include "../core/reduce/ReduceSumSquared.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __TEST_REDUCEVARIANCE_H__ #ifndef __TEST_REDUCEVARIANCE_H__
#define __TEST_REDUCEVARIANCE_H__ #define __TEST_REDUCEVARIANCE_H__
#include "../core/ReduceVariance.h" #include "../core/reduce/ReduceVariance.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __TEST_SCALEANDSHIFT_H__ #ifndef __TEST_SCALEANDSHIFT_H__
#define __TEST_SCALEANDSHIFT_H__ #define __TEST_SCALEANDSHIFT_H__
#include "../core/ScaleAndShift.h" #include "../core/math/ScaleAndShift.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -20,7 +20,6 @@ ...@@ -20,7 +20,6 @@
*/ */
#include "TSelect.h" #include "TSelect.h"
#include "../xc/Mycode.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
...@@ -74,7 +73,7 @@ bool TestSelect1() ...@@ -74,7 +73,7 @@ bool TestSelect1()
t->SetZeroAll(); t->SetZeroAll();
/* call SelectRange function */ /* call SelectRange function */
SelectRange(s, 2, 1, 3, t); SelectRange(s, t, 2, 1, 3);
/* check results */ /* check results */
cpuTest = t->CheckData(answer, tUnitNum); cpuTest = t->CheckData(answer, tUnitNum);
...@@ -92,7 +91,7 @@ bool TestSelect1() ...@@ -92,7 +91,7 @@ bool TestSelect1()
tGPU->SetZeroAll(); tGPU->SetZeroAll();
/* call Select function */ /* call Select function */
SelectRange(sGPU, 2, 1, 3, tGPU); SelectRange(sGPU, tGPU, 2, 1, 3);
/* check results */ /* check results */
gpuTest = tGPU->CheckData(answer, tUnitNum); gpuTest = tGPU->CheckData(answer, tUnitNum);
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __TEST_SELECT_H__ #ifndef __TEST_SELECT_H__
#define __TEST_SELECT_H__ #define __TEST_SELECT_H__
#include "../core/Select.h" #include "../core/getandset/Select.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __TEST_SORT_H__ #ifndef __TEST_SORT_H__
#define __TEST_SORT_H__ #define __TEST_SORT_H__
#include "../core/Sort.h" #include "../core/sort/Sort.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __TEST_SPLIT_H__ #ifndef __TEST_SPLIT_H__
#define __TEST_SPLIT_H__ #define __TEST_SPLIT_H__
#include "../core/Split.h" #include "../core/shape/Split.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __TEST_SUM_H__ #ifndef __TEST_SUM_H__
#define __TEST_SUM_H__ #define __TEST_SUM_H__
#include "../core/Sum.h" #include "../core/arithmetic/Sum.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __TEST_SUMBYCOLUMNTV_H__ #ifndef __TEST_SUMBYCOLUMNTV_H__
#define __TEST_SUMBYCOLUMNTV_H__ #define __TEST_SUMBYCOLUMNTV_H__
#include "../core/SumByColumnTV.h" #include "../core/arithmetic/SumByColumnTV.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __TEST_SUMBYCOLUMNVT_H__ #ifndef __TEST_SUMBYCOLUMNVT_H__
#define __TEST_SUMBYCOLUMNVT_H__ #define __TEST_SUMBYCOLUMNVT_H__
#include "../core/SumByColumnVT.h" #include "../core/arithmetic/SumByColumnVT.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __TEST_TOPK_H__ #ifndef __TEST_TOPK_H__
#define __TEST_TOPK_H__ #define __TEST_TOPK_H__
#include "../core/TopK.h" #include "../core/sort/TopK.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __TEST_UNSQUEEZE_H__ #ifndef __TEST_UNSQUEEZE_H__
#define __TEST_UNSQUEEZE_H__ #define __TEST_UNSQUEEZE_H__
#include "../core/Unsqueeze.h" #include "../core/shape/Unsqueeze.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -39,7 +39,7 @@ bool Test() ...@@ -39,7 +39,7 @@ bool Test()
//wrong = !TestMatrixMulBatched() || wrong; //wrong = !TestMatrixMulBatched() || wrong;
wrong = !TestMatrixMulBatchedCPU() || wrong; wrong = !TestMatrixMulBatchedCPU() || wrong;
wrong = !TestMerge() || wrong; wrong = !TestMerge() || wrong;
wrong = !TestMultiplyElementWise() || wrong; wrong = !TestMultiply() || wrong;
wrong = !TestNegate() || wrong; wrong = !TestNegate() || wrong;
wrong = !TestNormalize() || wrong; wrong = !TestNormalize() || wrong;
wrong = !TestPower() || wrong; wrong = !TestPower() || wrong;
......
...@@ -32,7 +32,7 @@ ...@@ -32,7 +32,7 @@
#include "TMatrixMulBatched.h" #include "TMatrixMulBatched.h"
#include "TMatrixMULBatchedCPU.h" #include "TMatrixMULBatchedCPU.h"
#include "TMerge.h" #include "TMerge.h"
#include "TMultiplyElementWise.h" #include "TMultiply.h"
#include "TNegate.h" #include "TNegate.h"
#include "TNormalize.h" #include "TNormalize.h"
#include "TPower.h" #include "TPower.h"
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论