Commit abeb3e64 by liyinqiao

merged

parents dcabc2b0 414ff54f
@@ -38,7 +38,7 @@
#include "XMem.h"
#include "XHeap.h"
#include "XBLAS.h"
-#include "core/MergeBlockLists.h"
+#include "core/shape/MergeBlockLists.h"

#ifdef USE_CUDA

@@ -47,8 +47,8 @@
#include <cublas_v2.h>
#include <cuda.h>
#include <curand.h>
-#include "core/FlushToMem.cuh"
-#include "core/SetAscendingOrder.cuh"
+#include "core/utilities/FlushToMem.cuh"
+#include "core/utilities/SetAscendingOrder.cuh"
#endif

@@ -555,6 +555,27 @@ bool XTensor::CheckData(const void * d, int num, int beg)
    return true;
}
+bool XTensor::CheckData(const void * d, int num, float tolerance, int beg)
+{
+    if (data == NULL || d == NULL)
+        return false;
+    CheckNTErrors(!isSparse, "TODO");
+    CheckNTErrors(num == unitNum - beg, "Illegal size!");
+    DTYPE * valuePrt = (DTYPE*)data;
+    DTYPE value = 0;
+    DTYPE * answerPrt = (DTYPE*)d;
+    for (int i = beg; i < num; i++) {
+        value = ToCPU(devID, valuePrt);
+        if (fabs(value - *answerPrt) > tolerance)
+            return false;
+        valuePrt++;
+        answerPrt++;
+    }
+    return true;
+}
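For reference, a minimal call sketch of the new overload (the tensor t and the expected values are illustrative, not from the commit; the signature matches the declaration added to XTensor.h below):

    float answer[4] = {1.0F, 2.0F, 3.0F, 4.0F};
    bool ok = t->CheckData(answer, 4, 1e-4F);   /* beg defaults to 0 */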
/*
set the cell to the ascending order along a given dimension
>> dim - the dimension specified

@@ -696,6 +717,63 @@ DTYPE XTensor::Get3D(int d0, int d1, int d2)
    return ToCPU(devID, value);
}
+/*
+get the value of a cell in a 1d tensor in int type
+>> i - index
+<< return - value of cell(i) in int
+*/
+int XTensor::Get1DInt(int i)
+{
+    CheckNTErrors((order == 1), "Cannot get a 1d cell for a tensor whose order is not 1!");
+    CheckNTErrors((i >= 0 && i < dimSize[0]), "dimension 0 is out of range!");
+    CheckNTErrors((dataType == X_INT), "The tensor is not in int type.");
+    int dimSize[1] = {i};
+    void * value = GetCell(dimSize, 1);
+    return ToCPUInt(devID, value);
+}
+
+/*
+get the value of a cell in a 2d tensor in int type
+>> ni - row index
+>> mi - column index
+<< return - value of cell(ni, mi) in int
+*/
+int XTensor::Get2DInt(int ni, int mi)
+{
+    CheckNTErrors((order == 2), "Cannot get a 2d cell for a tensor whose order is not 2!");
+    CheckNTErrors((ni >= 0 && ni < dimSize[0]), "dimension 0 is out of range!");
+    CheckNTErrors((mi >= 0 && mi < dimSize[1]), "dimension 1 is out of range!");
+    CheckNTErrors((dataType == X_INT), "The tensor is not in int type.");
+    int dims[2] = {ni, mi};
+    void * value = GetCell(dims, 2);
+    return ToCPUInt(devID, value);
+}
+
+/*
+get the value of a cell in a 3d tensor in int type
+>> d0 - index of dimension 0
+>> d1 - index of dimension 1
+>> d2 - index of dimension 2
+<< return - value of cell(d0, d1, d2) in int
+*/
+int XTensor::Get3DInt(int d0, int d1, int d2)
+{
+    CheckNTErrors((order == 3), "Cannot get a 3d cell for a tensor whose order is not 3!");
+    CheckNTErrors((d0 >= 0 && d0 < dimSize[0]), "dimension 0 is out of range!");
+    CheckNTErrors((d1 >= 0 && d1 < dimSize[1]), "dimension 1 is out of range!");
+    CheckNTErrors((d2 >= 0 && d2 < dimSize[2]), "dimension 2 is out of range!");
+    CheckNTErrors((dataType == X_INT), "The tensor is not in int type.");
+    int dims[3] = {d0, d1, d2};
+    void * value = GetCell(dims, 3);
+    return ToCPUInt(devID, value);
+}
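A usage sketch of the three accessors (the tensors are illustrative; each call assumes an X_INT tensor of the matching order):

    int a = vec->Get1DInt(2);        /* cell (2) of a 1d int tensor */
    int b = mat->Get2DInt(0, 4);     /* cell (0, 4) of a 2d int tensor */
    int c = ten->Get3DInt(1, 0, 3);  /* cell (1, 0, 3) of a 3d int tensor */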
/*
get the value of a cell in the sparse tensor
>> i - i-th tuple in the tuple list of the sparse tensor
......
@@ -211,6 +211,9 @@ struct XTensor
/* check whether the data array is the same as the answer */
bool CheckData(const void * answer, int num, int beg = 0);

+/* check whether the data array is the same as the answer, within a tolerance */
+bool CheckData(const void * answer, int num, float tolerance, int beg = 0);
+
/* set the cell to the ascending order along a given dimension */
void SetAscendingOrder(int dim);

@@ -220,15 +223,24 @@ struct XTensor
/* get the pointer to a cell */
void * GetCell(int index[], int size = -1);

-/* get the value of a cell in a 1d tensor */
+/* get the default type value of a cell in a 1d tensor */
DTYPE Get1D(int i);

-/* get the value of a cell in a 2d tensor */
+/* get the default type value of a cell in a 2d tensor */
DTYPE Get2D(int ni, int mi);

-/* get the value of a cell in a 3d tensor */
+/* get the default type value of a cell in a 3d tensor */
DTYPE Get3D(int d0, int d1, int d2);

+/* get the int value of a cell in a 1d tensor */
+int Get1DInt(int i);
+
+/* get the int value of a cell in a 2d tensor */
+int Get2DInt(int ni, int mi);
+
+/* get the int value of a cell in a 3d tensor */
+int Get3DInt(int d0, int d1, int d2);
+
/* get the value of a cell in a sparse tensor */
DTYPE GetInSparse(int i);
......
@@ -26,43 +26,49 @@
#include "../XTensor.h"
-#include "Concatenate.h"
-#include "ConcatenateSolely.h"
-#include "CopyIndexed.h"
-#include "CopyInGrid.h"
-#include "CopyValues.h"
-#include "FlushToMem.h"
-#include "MakeMergeBlockIndex.h"
-#include "MakeSplitBlockIndex.h"
-#include "MatrixMul.h"
-#include "MatrixMul2D.h"
-#include "MatrixMul2DMultiTheading.h"
-#include "MatrixMul2DParallel.h"
-#include "MatrixMulBatched.h"
-#include "MatrixMULBatchedCPU.h"
-#include "Merge.h"
-#include "MergeBlockLists.h"
-#include "Multiply.h"
-#include "Negate.h"
-#include "Normalize.h"
-#include "Permute.h"
-#include "Power.h"
-#include "ReduceMax.h"
-#include "ReduceMean.h"
-#include "ReduceStandardVariance.h"
-#include "ReduceSum.h"
-#include "ReduceSumSquared.h"
-#include "ReduceVariance.h"
-#include "ScaleAndShift.h"
-#include "SetData.h"
-#include "Sort.h"
-#include "Split.h"
-#include "Sum.h"
-#include "SumByColumnTV.h"
-#include "SumByColumnVT.h"
-#include "TopK.h"
-#include "Unsqueeze.h"
-#include "XMatrixSegment.h"
-#include "XTensorBLAS.h"
+#include "shape/Concatenate.h"
+#include "shape/ConcatenateSolely.h"
+#include "movement/CopyBlocks.h"
+#include "movement/CopyBlocksInGrid.h"
+#include "movement/CopyBlocksOnSite.h"
+#include "movement/CopyData2D.h"
+#include "movement/CopyIndexed.h"
+#include "movement/CopyInGrid.h"
+#include "movement/CopyValues.h"
+#include "utilities/FlushToMem.h"
+#include "shape/MakeMergeBlockIndex.h"
+#include "shape/MakeSplitBlockIndex.h"
+#include "arithmetic/MatrixMul.h"
+#include "arithmetic/MatrixMul2D.h"
+#include "arithmetic/MatrixMul2DMultiTheading.h"
+#include "arithmetic/MatrixMul2DParallel.h"
+#include "arithmetic/MatrixMulBatched.h"
+#include "arithmetic/MatrixMULBatchedCPU.h"
+#include "shape/Merge.h"
+#include "shape/MergeBlockLists.h"
+#include "arithmetic/Multiply.h"
+#include "arithmetic/Negate.h"
+#include "math/Normalize.h"
+#include "shape/Permute.h"
+#include "math/Power.h"
+#include "reduce/ReduceMax.h"
+#include "reduce/ReduceMean.h"
+#include "reduce/ReduceStandardVariance.h"
+#include "reduce/ReduceSum.h"
+#include "reduce/ReduceSumSquared.h"
+#include "reduce/ReduceVariance.h"
+#include "math/ScaleAndShift.h"
+#include "getandset/Select.h"
+#include "getandset/SetData.h"
+#include "sort/Sort.h"
+#include "shape/Split.h"
+#include "arithmetic/Sum.h"
+#include "arithmetic/SumByColumnTV.h"
+#include "arithmetic/SumByColumnVT.h"
+#include "sort/TopK.h"
+#include "shape/Transpose.h"
+#include "shape/Unsqueeze.h"
+#include "utilities/XMatrixSegment.h"
+#include "arithmetic/XTensorBLAS.h"

#endif // __CHEADER_H__
\ No newline at end of file
@@ -219,9 +219,8 @@ public:
/* insert a dimension by copying the blocks for x times (where x is the size of the inserted dimension) */
void Unsqueeze(XTensor * a, XTensor * b, int dim, int dSize);

-/*******************************************************************
-segmentation and parallel processing for 2d tensors (i.e., matrices)
-*/
+/* segmentation and parallel processing for 2d tensors (i.e., matrices) */

/* segment a 2d tensor (i.e., matrix) into blocks and run jobs in parallel */
static
void RunParallel2D(XPRunner * parallelRunner, void * job, int opNum, int rowNum, int colNum, int argNum, ...);
......
@@ -19,7 +19,7 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XTensor.h"
+#include "../../XTensor.h"
#include "MatrixMULBatchedCPU.h"
#include "MatrixMul2D.h"
#include "XTensorBLAS.h"

@@ -33,9 +33,9 @@ c_i = trans(a_i) * trans(b_i) * \alpha + c_i * \beta for each i in [0,count-1]
>> transposedA - indicate whether the matrix a is transposed
>> b - another list of input matrices (2d tensors)
>> transposedB - indicate whether the matrix b is transposed
->> c - output matrix (2d tensor)
>> alpha - scalar
>> beta - scalar
+>> c - output matrix (2d tensor)
*/
void MatrixMULBatchedCPU(XList * a, MATRIX_TRANS_TYPE transposedA,
                         XList * b, MATRIX_TRANS_TYPE transposedB,

@@ -64,10 +64,6 @@ void MatrixMULBatchedCPU(XList * a, MATRIX_TRANS_TYPE transposedA,
    }
}

-//if(isUniform){
-//}
-//else{
for (int i = 0; i < a->count; i++) {
    XTensor * ai = (XTensor*)a->GetItem(i);
    XTensor * bi = (XTensor*)b->GetItem(i);
......
@@ -22,7 +22,7 @@
#ifndef __MATRIXMULBATCHEDCPU_H__
#define __MATRIXMULBATCHEDCPU_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -19,9 +19,9 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XTensor.h"
-#include "../XDevice.h"
-#include "../XName.h"
+#include "../../XTensor.h"
+#include "../../XDevice.h"
+#include "../../XName.h"
#include "MatrixMul.h"
#include "MatrixMul2D.h"
#include "MatrixMULBatchedCPU.h"

@@ -65,13 +65,12 @@ void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA,
XLink::AddParamToHeadInt(c, transposedB);
XLink::AddParamToHead(c, alpha);
XLink::AddParamToHead(c, beta);

-int an = transposedA == X_TRANS ? a->dimSize[1] : a->dimSize[0];
-int am = transposedA == X_TRANS ? a->dimSize[0] : a->dimSize[1];
-int bn = transposedB == X_TRANS ? b->dimSize[1] : b->dimSize[0];
-int bm = transposedB == X_TRANS ? b->dimSize[0] : b->dimSize[1];
-int cn = c->dimSize[0];
-int cm = c->dimSize[1];
+int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
+int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
+int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
+int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
+int cn = c->dimSizeRDI[1];
+int cm = c->dimSizeRDI[0];

CheckNTErrors((am == bn && an == cn && bm == cm),
              "Unmatched tensors in multiplication!");

@@ -87,13 +86,13 @@ void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA,
int cBlockNum = 1;

for (int i = 2; i < a->order; i++) {
-    CheckNTErrors((a->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
+    CheckNTErrors((a->dimSizeRDI[i] == c->dimSizeRDI[i - 2 + b->order]), "Incorrect tensor sizes!");
    aBlockNum *= a->dimSizeRDI[i];
    cBlockNum *= a->dimSizeRDI[i];
}

for (int i = 2; i < b->order; i++) {
-    CheckNTErrors((b->dimSizeRDI[i] == c->dimSizeRDI[i - 2 + a->order]), "Incorrect tensor sizes!");
+    CheckNTErrors((b->dimSizeRDI[i] == c->dimSizeRDI[i]), "Incorrect tensor sizes!");
    bBlockNum *= b->dimSizeRDI[i];
    cBlockNum *= b->dimSizeRDI[i];
}

@@ -101,9 +100,9 @@ void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA,
XList * aList = new XList(10);
XList * bList = new XList(10);
XList * cList = new XList(10);
-int aDimSize[2] = { -a->dimSize[0], a->dimSize[1] };
-int bDimSize[2] = { -b->dimSize[0], b->dimSize[1] };
-int cDimSize[2] = { -c->dimSize[0], c->dimSize[1] };
+int aDimSize[2] = { a->dimSizeRDI[1], a->dimSizeRDI[0] };
+int bDimSize[2] = { b->dimSizeRDI[1], b->dimSizeRDI[0] };
+int cDimSize[2] = { c->dimSizeRDI[1], c->dimSizeRDI[0] };

bool isSparseMul = false;
......
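The repeated dimSize-to-dimSizeRDI edits in this file rely on one convention: dimSizeRDI stores the shape in reverse dimension index (RDI) order, so dimSizeRDI[0] is the last (column) dimension and dimSizeRDI[1] the row dimension of the trailing matrix. A standalone sketch of the relation (the construction is an illustration, not code from the commit):

    #include <cassert>

    int main()
    {
        const int order = 3;
        int dimSize[3] = {4, 2, 3};          /* {block, row, column} */
        int dimSizeRDI[3];
        for (int i = 0; i < order; i++)      /* reverse the dimension order */
            dimSizeRDI[i] = dimSize[order - i - 1];
        assert(dimSizeRDI[0] == 3);          /* columns: the last dimension */
        assert(dimSizeRDI[1] == 2);          /* rows: the second-to-last */
        assert(dimSizeRDI[2] == 4);          /* leading (block) dimension */
        return 0;
    }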
@@ -22,7 +22,7 @@
#ifndef __MATRIXMUL_H__
#define __MATRIXMUL_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)

@@ -39,7 +39,7 @@ normal matrix multiplication if A = y * z and B = x * y.
*/
extern "C"
void MatrixMul(XTensor * a, MATRIX_TRANS_TYPE transposedA, XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
               DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);

} // namespace nts(NiuTrans.Tensor)
......
@@ -19,8 +19,8 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XTensor.h"
-#include "../XName.h"
+#include "../../XTensor.h"
+#include "../../XName.h"
#include "MatrixMul2D.h"
#include "MatrixMul2D.cuh"
#include "MatrixMul2DParallel.h"

@@ -112,7 +112,7 @@ void MatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA,
int num = *((int*)b->data);
char * p = (char*)b->data + sizeof(int); // pointer to the first tuple

/* a * b */
if (transposedA == X_NOTRANS && transposedB == X_NOTRANS) {
    for (int i = 0; i < num; i++) {
        int key = *((int*)p);
......
@@ -19,8 +19,8 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XDevice.h"
-#include "../XTensor.h"
+#include "../../XDevice.h"
+#include "../../XTensor.h"
#include "MatrixMul2D.h"
#include "MatrixMul2D.cuh"
#include "XTensorBLAS.h"

@@ -37,11 +37,13 @@ c = a * b * \alpha
>> aColSize - column size of matrix a
>> aRowSize - row size of matrix a
>> b - a sparse matrix
->> transposedA - indicates whether b is transposed
+>> transposedB - indicates whether b is transposed
>> bNonZeroNum - number of non-zero items in b
>> bColSize - column size of matrix b
>> bRowSize - row size of matrix b
>> c - the resulting (dense) matrix
+>> cColSize - column size of matrix c
+>> cRowSize - row size of matrix c
>> alpha - the scaling factor
*/
extern "C" __global__

@@ -147,7 +149,6 @@ void CudaMatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA,
if (!a->isSparse && !b->isSparse) {
    CheckNTErrors((!c->isSparse), "Illegal use of sparse matrix in multiplication!");

-    //cublasHandle_t * handle = GDevs->GetCudaHandle(a->devID);
    cublasHandle_t * handle = a->mem == NULL ? GDevs.GetCudaHandle(a->devID) : a->mem->GetCublasHandle();

    /* !!!! might have problems */

@@ -183,7 +184,6 @@ void CudaMatrixMul2D(XTensor * a, MATRIX_TRANS_TYPE transposedA,
if (beta == 0)
    c->SetZeroAll();
else if (beta != 1.0F) {
-    //XTensor::ScaleAndShift(c, beta, 0);
    ShowNTErrors("TODO!");
}
......
@@ -22,7 +22,7 @@
#ifndef __MATRIXMUL2D_H__
#define __MATRIXMUL2D_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -19,7 +19,7 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XTensor.h"
+#include "../../XTensor.h"
#include "MatrixMul2DMultiTheading.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -22,7 +22,7 @@
#ifndef __MATRIXMUL2DMULTITHEADING_H__
#define __MATRIXMUL2DMULTITHEADING_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -19,10 +19,10 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XTensor.h"
+#include "../../XTensor.h"
#include "MatrixMul2DParallel.h"
#include "MatrixMul2DMultiTheading.h"
-#include "XMatrixSegment.h"
+#include "../utilities/XMatrixSegment.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -22,7 +22,7 @@
#ifndef __MATRIXMUL2DPARALLEL_H__
#define __MATRIXMUL2DPARALLEL_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -19,9 +19,9 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XTensor.h"
-#include "../XDevice.h"
-#include "../XName.h"
+#include "../../XTensor.h"
+#include "../../XDevice.h"
+#include "../../XName.h"
#include "MatrixMulBatched.h"
#include "MatrixMULBatchedCPU.h"
#include "XTensorBLAS.h"

@@ -41,6 +41,7 @@ where trans() returns the transposed matrix if the flag is fired
>> c - where we keep a*b
>> alpha - a coefficient
>> beta - another coefficient
+>> parallelRunner - parallel processing module
*/
void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
                      XTensor * b, MATRIX_TRANS_TYPE transposedB,

@@ -59,13 +60,12 @@ void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
XLink::AddParamToHeadInt(c, transposedB);
XLink::AddParamToHead(c, alpha);
XLink::AddParamToHead(c, beta);

-int an = transposedA == X_TRANS ? a->dimSize[1] : a->dimSize[0];
-int am = transposedA == X_TRANS ? a->dimSize[0] : a->dimSize[1];
-int bn = transposedB == X_TRANS ? b->dimSize[1] : b->dimSize[0];
-int bm = transposedB == X_TRANS ? b->dimSize[0] : b->dimSize[1];
-int cn = c->dimSize[0];
-int cm = c->dimSize[1];
+int an = transposedA == X_TRANS ? a->dimSizeRDI[0] : a->dimSizeRDI[1];
+int am = transposedA == X_TRANS ? a->dimSizeRDI[1] : a->dimSizeRDI[0];
+int bn = transposedB == X_TRANS ? b->dimSizeRDI[0] : b->dimSizeRDI[1];
+int bm = transposedB == X_TRANS ? b->dimSizeRDI[1] : b->dimSizeRDI[0];
+int cn = c->dimSizeRDI[1];
+int cm = c->dimSizeRDI[0];

CheckNTErrors((am == bn && an == cn && bm == cm),
              "Unmatched tensors in multiplication!");

@@ -87,9 +87,9 @@ void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
XList * aList = new XList(10);
XList * bList = new XList(10);
XList * cList = new XList(10);
-int aDimSize[2] = { -a->dimSizeRDI[0], a->dimSizeRDI[1] };
-int bDimSize[2] = { -b->dimSizeRDI[0], b->dimSizeRDI[1] };
-int cDimSize[2] = { -c->dimSizeRDI[0], c->dimSizeRDI[1] };
+int aDimSize[2] = { -a->dimSizeRDI[1], a->dimSizeRDI[0] };
+int bDimSize[2] = { -b->dimSizeRDI[1], b->dimSizeRDI[0] };
+int cDimSize[2] = { -c->dimSizeRDI[1], c->dimSizeRDI[0] };

for (int p = 0; p < blockNum; p++) {
    void * ap = (char*)a->data + aRealBlockSize * p;

@@ -114,8 +114,9 @@ void MatrixMulBatched(XTensor * a, MATRIX_TRANS_TYPE transposedA,
int devIDBackup;
ProtectCudaDev(a->devID, devIDBackup);

-CudaBLASMatrixMULList(a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID),
-                      aList, transposedA,
+cublasHandle_t * handle = a->mem != NULL ? a->mem->GetCublasHandle() : GDevs.GetCudaHandle(a->devID);
+CudaBLASMatrixMULList(handle,
+                      aList, transposedA,
                       bList, transposedB,
                       cList, aList->count,
                       alpha, beta);
......
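A hedged usage sketch of the batched multiplication, with shapes that satisfy the dimension checks above. NewTensor follows the call shape seen later in this commit; leaving its trailing arguments and MatrixMulBatched's alpha, beta and parallelRunner at their defaults is an assumption, and all variable names are illustrative:

    int aDims[3] = {8, 2, 3};   /* 8 blocks of 2 x 3 */
    int bDims[3] = {8, 3, 4};   /* 8 blocks of 3 x 4 */
    int cDims[3] = {8, 2, 4};   /* 8 blocks of 2 x 4 */
    XTensor * a = NewTensor(3, aDims);
    XTensor * b = NewTensor(3, bDims);
    XTensor * c = NewTensor(3, cDims);
    MatrixMulBatched(a, X_NOTRANS, b, X_NOTRANS, c);   /* c_i = a_i * b_i */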
@@ -22,7 +22,7 @@
#ifndef __MATRIXMULBATCHED_H__
#define __MATRIXMULBATCHED_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -19,12 +19,13 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XTensor.h"
-#include "../XName.h"
+#include "../../XTensor.h"
+#include "../../XName.h"
#include "Multiply.h"
#include "Multiply.cuh"

namespace nts { // namespace nts(NiuTrans.Tensor)

/*
element-wise product of two tensors
c(i) = a(i)*b(i) + \alpha * c(i)
......
@@ -19,8 +19,8 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XDevice.h"
-#include "../XTensor.h"
+#include "../../XDevice.h"
+#include "../../XTensor.h"
#include "Multiply.h"
#include "Multiply.cuh"

@@ -68,6 +68,7 @@ where |a_lead| means the size of the leading dimension of a
>> a - tensor a
>> b - tensor b
>> c - result tensor
+>> alpha - the coefficient
>> stride - the number of items we go over when move next along the leading dimension in a block
>> ldSizeA - size of the leading dimension of a
>> ldSizeB - size of the leading dimension of b
......
@@ -22,7 +22,7 @@
#ifndef __MULTIPLY_H__
#define __MULTIPLY_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -19,15 +19,15 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XTensor.h"
+#include "../../XTensor.h"
#include "Negate.h"
#include "Negate.cuh"

namespace nts { // namespace nts(NiuTrans.Tensor)

/*
set every entry to its minus value
>> a - the tensor we are processing
*/
void Negate(XTensor * a)
{
......
@@ -19,8 +19,8 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XDevice.h"
-#include "../XTensor.h"
+#include "../../XDevice.h"
+#include "../../XTensor.h"
#include "Negate.h"
#include "Negate.cuh"

@@ -42,10 +42,10 @@ void KernelNegate(DTYPE * d, int size)
}

/*
set each entry to its negative value (CUDA Kernel)
This is for float16 computation
>> d - pointer to the data array
>> size - size of the data array
*/
__global__
void KernelNegate(__half * d, int size)
......
@@ -22,7 +22,7 @@
#ifndef __NEGATE_H__
#define __NEGATE_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -19,8 +19,8 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XTensor.h"
-#include "../XName.h"
+#include "../../XTensor.h"
+#include "../../XName.h"
#include "Sum.h"
#include "Sum.cuh"
......
@@ -19,12 +19,13 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XDevice.h"
+#include "../../XDevice.h"
#include "Sum.cuh"

namespace nts { // namespace nts(NiuTrans.Tensor)

#ifdef USE_CUDA

/*
summation of data arrays (CUDA Kernel)
c = a + b * \beta
......
@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA

/* summation of data arrays (CUDA Kernel) */
extern "C" __global__
void KernelADD(DTYPE * a, DTYPE * b, DTYPE * c, int size, DTYPE beta = (DTYPE)1.0);
......
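For context, a sketch of how a kernel with the signature above is typically launched; the grid sizing and the device pointers a, b and c are illustrative, not part of the commit:

    int threads = 128;
    int blocks = (size + threads - 1) / threads;         /* one thread per element */
    KernelADD<<<blocks, threads>>>(a, b, c, size, beta); /* computes c = a + b * beta */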
@@ -22,7 +22,7 @@
#ifndef __SUM_H__
#define __SUM_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -19,7 +19,7 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XTensor.h"
+#include "../../XTensor.h"
#include "SumByColumnTV.h"
#include "SumByColumnTV.cuh"
......
@@ -19,8 +19,8 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XDevice.h"
-#include "../XTensor.h"
+#include "../../XDevice.h"
+#include "../../XTensor.h"
#include "SumByColumnTV.h"
#include "SumByColumnTV.cuh"
......
@@ -22,7 +22,7 @@
#ifndef __REDUCEMAX_CUH__
#define __REDUCEMAX_CUH__

-#include "ReduceMax.h"
+#include "../reduce/ReduceMax.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -22,7 +22,7 @@
#ifndef __SUMBYCOLUMNTV_H__
#define __SUMBYCOLUMNTV_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -19,7 +19,7 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XTensor.h"
+#include "../../XTensor.h"
#include "SumByColumnVT.h"
#include "SumByColumnVT.cuh"
......
@@ -19,14 +19,15 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XDevice.h"
-#include "../XTensor.h"
+#include "../../XDevice.h"
+#include "../../XTensor.h"
#include "SumByColumnVT.h"
#include "SumByColumnVT.cuh"

namespace nts { // namespace nts(NiuTrans.Tensor)

#ifdef USE_CUDA

/*
summation of a vector (column vector) and a tensor
c = a + \sum{col} b_col * \beta
......
@@ -22,11 +22,10 @@
#ifndef __SUMBYCOLUMNVT_H__
#define __SUMBYCOLUMNVT_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)

/* sum of a (column) vector and a tensor */
extern "C"
void SumByColumnVT(XTensor * a, XTensor * b, XTensor * c = NULL, DTYPE beta = (DTYPE)1.0);
......
@@ -20,8 +20,8 @@
 */

#include "XTensorBLAS.h"
-#include "../XTensor.h"
-#include "../XBLAS.h"
+#include "../../XTensor.h"
+#include "../../XBLAS.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -19,9 +19,9 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XUtility.h"
-#include "../XDevice.h"
-#include "../XTensor.h"
+#include "../../XUtility.h"
+#include "../../XDevice.h"
+#include "../../XTensor.h"
#include "XTensorBLAS.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -22,7 +22,7 @@
#ifndef __XTENSORBLAS_H__
#define __XTENSORBLAS_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -19,8 +19,8 @@
 * $Created by: LI Yinqiao (li.yin.qiao.2012@hotmail.com) 2018-06-14
 */

-#include "../XTensor.h"
-#include "../XDevice.h"
+#include "../../XTensor.h"
+#include "../../XDevice.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -19,8 +19,8 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-04
 */

-#include "../XUtility.h"
-#include "../XName.h"
+#include "../../XUtility.h"
+#include "../../XName.h"
#include "Select.h"

namespace nts{ // namespace nts(NiuTrans.Tensor)

@@ -33,7 +33,7 @@ c = select(a)
>> dim - the dimension along which we do the job
>> low - lower bound
>> high - higher bound.
   Note that range [1,3] means that we select 1 and 2.
*/
void SelectRange(XTensor * a, XTensor * c, int dim, int low, int high)
{

@@ -48,7 +48,7 @@ void SelectRange(XTensor * a, XTensor * c, int dim, int low, int high)
for(int i = 0; i < a->order; i++){
    if(i == dim){
        CheckNTErrors(low > 0 && low < a->dimSize[dim], "Illegal range specified!");
-        CheckNTErrors(high > 0 && high < a->dimSize[dim], "Illegal range specified!");
+        CheckNTErrors(high > 0 && high <= a->dimSize[dim], "Illegal range specified!");
    }
    else{
        CheckNTErrors(a->dimSize[i] == c->dimSize[i], "The size of the dimensions should be same!");

@@ -62,20 +62,24 @@ void SelectRange(XTensor * a, XTensor * c, int dim, int low, int high)
XLink::AddParamToHeadInt(c, high);

int stride = 1;
-for(int i = 0; i < dim; i++)
+int dimRDI = a->order - dim - 1;
+for(int i = 0; i < dimRDI; i++)
    stride *= a->dimSizeRDI[i];

+int copyTimes = 1;
+for (int i = dimRDI + 1; i < a->order; i++)
+    copyTimes *= a->dimSizeRDI[i];
+
int blockSize = stride * (high - low) * a->unitSize;
int stepSizeS = stride * a->dimSize[dim] * a->unitSize;
int stepSizeT = stride * c->dimSize[dim] * a->unitSize;
char * s = (char*)a->data + stride * low * a->unitSize;
char * t = (char*)c->data;
-for(int i = 0; i < high - low; i++){
+for(int i = 0; i < copyTimes; i++){
    XMemCopy(t, c->devID, s, a->devID, blockSize);
    s += stepSizeS;
    t += stepSizeT;
}
}

} // namespace nts(NiuTrans.Tensor)
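A worked example of the fixed index arithmetic, under hypothetical shapes that mirror the loops above:

    #include <cassert>

    int main()
    {
        /* a->dimSize = {2, 3, 4}, so a->dimSizeRDI = {4, 3, 2}; select dim 1 */
        int order = 3;
        int dimSizeRDI[3] = {4, 3, 2};
        int dim = 1;

        int dimRDI = order - dim - 1;        /* = 1: dim's position in RDI order */
        int stride = 1;
        for (int i = 0; i < dimRDI; i++)
            stride *= dimSizeRDI[i];         /* items below the selected dim */
        int copyTimes = 1;
        for (int i = dimRDI + 1; i < order; i++)
            copyTimes *= dimSizeRDI[i];      /* blocks above the selected dim */

        assert(stride == 4);
        assert(copyTimes == 2);
        return 0;
    }

The old loop ran high - low times regardless of the outer dimensions; with copyTimes the copy is repeated once per outer block, which is what the shape requires.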
@@ -22,7 +22,7 @@
#ifndef __SELECT_H__
#define __SELECT_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts{ // namespace nts(NiuTrans.Tensor)
......
@@ -21,7 +21,7 @@
 */

#include "SetData.h"
-#include "CopyValues.h"
+#include "../movement/CopyValues.h"

#if !defined( WIN32 ) && !defined( _WIN32 )
#include "sys/time.h"

@@ -68,10 +68,11 @@ void SetDataRand(XTensor * tensor, DTYPE low, DTYPE high)
        ShowNTErrors("TODO");
    }
}

-/* GPU code
-The trick here is to initialize the data on a temporary tensor on the CPU.
-The CPU data is then copied to the GPU.
-TODO: generate data points on GPUs straightforwardly.
+/*
+GPU code
+The trick here is to initialize the data on a temporary tensor on the CPU.
+The CPU data is then copied to the GPU.
+TODO: generate data points on GPUs straightforwardly.
*/
else{
    XTensor * t2 = NewTensor(tensor->order, tensor->dimSize, tensor->dataType, tensor->denseRatio, -1);
......
@@ -23,7 +23,7 @@
#ifndef __SETDATA_H__
#define __SETDATA_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -20,11 +20,12 @@
 */

#include <math.h>
-#include "../XTensor.h"
+#include "../../XTensor.h"
#include "Normalize.h"
#include "Normalize.cuh"

namespace nts { // namespace nts(NiuTrans.Tensor)

/*
normalize the data with normal distribution. For an input x,
y = a * (x-mean)/sqrt(variance+\epsilon) + b
......
@@ -19,12 +19,13 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XDevice.h"
-#include "../XTensor.h"
+#include "../../XDevice.h"
+#include "../../XTensor.h"
#include "Normalize.h"
#include "Normalize.cuh"

namespace nts { // namespace nts(NiuTrans.Tensor)

#ifdef USE_CUDA

/*
normalize the data with normal distribution (kernel code). For an input x,
......
@@ -28,7 +28,8 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA

-/* normalize the data with normal distribution (Kernel code). For an input x,
+/*
+normalize the data with normal distribution (Kernel code). For an input x,
y = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter
*/

@@ -37,7 +38,8 @@ void KernelNormalize(DTYPE * input, DTYPE * output, DTYPE * mean, DTYPE * var,
                     DTYPE * a, DTYPE * b, DTYPE epsilon,
                     int stride, int strideNum, int blockNum);

-/* normalize the data with normal distribution. For an input x,
+/*
+normalize the data with normal distribution. For an input x,
y = a * (x-mean)/sqrt(variance+\epsilon) + b
where a and b are the scalar and bias respectively, and \epsilon is the adjustment parameter
*/
......
@@ -22,7 +22,7 @@
#ifndef __NORMALIZE_H__
#define __NORMALIZE_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -20,15 +20,16 @@
 */

#include <math.h>
-#include "../XTensor.h"
+#include "../../XTensor.h"
#include "Power.h"
#include "Power.cuh"

namespace nts { // namespace nts(NiuTrans.Tensor)

/*
get the power(a, p)
>> a - the tensor
->> power - as it is
+>> p - as it is
*/
void Power(XTensor * a, DTYPE p)
{
......
@@ -19,8 +19,8 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XDevice.h"
-#include "../XTensor.h"
+#include "../../XDevice.h"
+#include "../../XTensor.h"
#include "Power.h"
#include "Power.cuh"

@@ -87,9 +87,6 @@ __global__
void KernelPower(__half * d, __half p, int size)
{
#if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
-    //int i = blockDim.x * blockIdx.x + threadIdx.x;
-    //if (i < size)
-    //    d[i] = hpow(d[i], p);
#else
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < size)

@@ -126,9 +123,6 @@ void CudaPower(XTensor * a, DTYPE p)
    }
    else if (p != (DTYPE)1.0) {
        ShowNTErrors("TODO!");
-        //unsigned short p2 = FloatToFloat16(p);
-        //__half * pp = (__half*)&p2;
-        //KernelPower<<<blocks, threads>>>((__half*)a->data, *pp, a->unitNum);
    }
}
else {
......
@@ -22,7 +22,7 @@
#ifndef __POWER_H__
#define __POWER_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -26,9 +26,7 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
/*
scale and shift all tensor entries
p = p * scale + shift
>> a - the tensor
>> scale - the scaling factor
>> shift - the shift factor
......
@@ -21,7 +21,7 @@
#include "ScaleAndShift.h"
#include "ScaleAndShift.cuh"
-#include "../XDevice.h"
+#include "../../XDevice.h"

namespace nts{ // namespace nts(NiuTrans.Tensor)

@@ -80,9 +80,7 @@ void KernelScaleAndShift(__half * d, int size, __half scale, __half shift)
/*
scale and shift all matrix entries
p = p * scale + shift
>> a - the tensor
>> scale - the scaling factor
>> shift - the shift factor
......
@@ -22,7 +22,7 @@
#ifndef __SCALEANDSHIFT_CUH__
#define __SCALEANDSHIFT_CUH__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts{ // namespace nts(NiuTrans.Tensor)
......
@@ -22,7 +22,7 @@
#ifndef __SCALEANDSHIFT_H__
#define __SCALEANDSHIFT_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts{ // namespace nts(NiuTrans.Tensor)
......
@@ -19,8 +19,8 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XTensor.h"
-#include "../XUtility.h"
+#include "../../XTensor.h"
+#include "../../XUtility.h"
#include "CopyBlocks.h"
#include "CopyBlocksOnSite.h"
#include "CopyBlocksSelected.cuh"

@@ -78,9 +78,11 @@ void CopyBlocks(void * source, int blockSize, int * sourceBlocks, int blockNum,
else {
    int devID = myMem != NULL ? myMem->devID : -1;

-    /* The following code should be fine with GPUs, but too many
-    kernel calls would slow down the system. We prefer to use
-    one kernel to do block copy in batch (kernel fusion). */
+    /*
+    The following code should be fine with GPUs, but too many
+    kernel calls would slow down the system. We prefer to use
+    one kernel to do block copy in batch (kernel fusion).
+    */
    for (int i = 0; i < blockNum; i++) {
        XMemCopy((char*)target + targetBlocks[i] * blockSize, devID,
                 (char*)source + sourceBlocks[i] * blockSize, devID, blockSize);
......
@@ -22,7 +22,7 @@
#ifndef __COPYBLOCKS_H__
#define __COPYBLOCKS_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -19,9 +19,9 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XTensor.h"
+#include "../../XTensor.h"
#include "CopyBlocksInGrid.h"
-#include "../XUtility.h"
+#include "../../XUtility.h"
#include "CopyBlocksInGrid.cuh"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -21,7 +21,7 @@
#include "CopyBlocksInGrid.h"
#include "CopyBlocksInGrid.cuh"
-#include "../XDevice.h"
+#include "../../XDevice.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -22,7 +22,7 @@
#ifndef __COPYBLOCKSINGRID_CUH__
#define __COPYBLOCKSINGRID_CUH__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -22,7 +22,7 @@
#ifndef __COPYBLOCKSINGRID_H__
#define __COPYBLOCKSINGRID_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -19,12 +19,13 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XTensor.h"
-#include "../XUtility.h"
+#include "../../XTensor.h"
+#include "../../XUtility.h"
#include "CopyBlocksOnSite.h"
#include "CopyBlocksOnSite.cuh"

namespace nts { // namespace nts(NiuTrans.Tensor)

/*
copy a number of blocks to target positions. Here we assume that
all the data has been on the device (CPU/GPU) already.

@@ -47,9 +48,11 @@ void CopyBlocksOnSite(void * source, int blockSize, int blockNum, void * target,
else {
    int devID = myMem != NULL ? myMem->devID : -1;

-    /* The following code should be fine with GPUs, but too many
-    kernel calls would slow down the system. We prefer to use
-    one kernel to do block copy in batch (kernel fusion). */
+    /*
+    The following code should be fine with GPUs, but too many
+    kernel calls would slow down the system. We prefer to use
+    one kernel to do block copy in batch (kernel fusion).
+    */
    for (int i = 0, b = 0; i < blockNum; i++, b += blockSize) {
        XMemCopy((char*)target + targetBlocks[i] * blockSize, devID,
                 (char*)source + b, devID, blockSize);
......
@@ -21,7 +21,7 @@
#include "CopyBlocksOnSite.h"
#include "CopyBlocksOnSite.cuh"
-#include "../XDevice.h"
+#include "../../XDevice.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -22,7 +22,7 @@
#ifndef __COPYBLOCKS_CUH__
#define __COPYBLOCKS_CUH__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -22,7 +22,7 @@
#ifndef __COPYBLOCKSONSITE_H__
#define __COPYBLOCKSONSITE_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -21,8 +21,8 @@
#include "CopyBlocks.h"
#include "CopyBlocksSelected.cuh"
-#include "../XUtility.h"
-#include "../XDevice.h"
+#include "../../XUtility.h"
+#include "../../XDevice.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -22,7 +22,7 @@
#ifndef __COPYBLOCKSSELECTED_CUH__
#define __COPYBLOCKSSELECTED_CUH__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -19,9 +19,9 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XTensor.h"
+#include "../../XTensor.h"
#include "CopyData2D.h"
-#include "../XUtility.h"
+#include "../../XUtility.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -22,7 +22,7 @@
#ifndef __COPYDATA2D_H__
#define __COPYDATA2D_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -19,7 +19,7 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XTensor.h"
+#include "../../XTensor.h"
#include "CopyInGrid.h"
#include "CopyBlocksInGrid.h"

@@ -34,7 +34,7 @@ i.e., reorder the data blocks in the same memory piece
   in the k-th grid
>> blockDim - leading dimension of blocks
>> blockNumInGrid - number of blocks in each grid
->> isOnDev - indicates whether the index is on the device already
+>> isIndexOnDev - indicates whether the index is on the device already
*/
void CopyInGrid(XTensor * s, XTensor * t, int * index, int blockDim, int blockNumInGrid, bool isIndexOnDev)
{
......
@@ -22,7 +22,7 @@
#ifndef __COPYINGRID_H__
#define __COPYINGRID_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -36,6 +36,7 @@ copy indexed sub-tensors
>> tgtIndex - index of the target sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index, e.g.,
   for srcIndex = [1,4] and copyNum = 2, we actually copy the source sub-tensors 1, 2, 4, 5
+<< return - whether the copy operation succeeded
*/
bool CopyIndexed(XTensor * s, XTensor * t, int dim, int * srcIndex, int indexSize, int * tgtIndex, int copyNum)
{
......
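A hedged call sketch matching the example in the comment above (s, t and the index arrays are illustrative): with srcIndex = {1, 4} and copyNum = 2, sub-tensors 1, 2, 4 and 5 of s are copied into positions 0, 1, 2 and 3 of t along dimension 0:

    int srcIndex[2] = {1, 4};
    int tgtIndex[2] = {0, 2};
    bool ok = CopyIndexed(s, t, 0, srcIndex, 2, tgtIndex, 2);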
@@ -22,7 +22,7 @@
#ifndef __COPYINDEXED_H__
#define __COPYINDEXED_H__

-#include "../XTensor.h"
+#include "../../XTensor.h"

namespace nts { // namespace nts(NiuTrans.Tensor)
......
@@ -19,7 +19,7 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */

-#include "../XName.h"
+#include "../../XName.h"
#include "CopyValues.h"
#include "CopyValues.cuh"
......
...@@ -21,8 +21,8 @@ ...@@ -21,8 +21,8 @@
#include "CopyValues.h" #include "CopyValues.h"
#include "CopyValues.cuh" #include "CopyValues.cuh"
#include "../XUtility.h" #include "../../XUtility.h"
#include "../XDevice.h" #include "../../XDevice.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -22,13 +22,12 @@ ...@@ -22,13 +22,12 @@
#ifndef __COPYVALUES_CUH__ #ifndef __COPYVALUES_CUH__
#define __COPYVALUES_CUH__ #define __COPYVALUES_CUH__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA #ifdef USE_CUDA
/**************************************/
/* copy all elements from a source matrix to a target matrix */ /* copy all elements from a source matrix to a target matrix */
extern "C" extern "C"
bool CudaCopyValues(XTensor * s, XTensor * t, XStream * stream = NULL); bool CudaCopyValues(XTensor * s, XTensor * t, XStream * stream = NULL);
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#ifndef __COPYVALUES_H__ #ifndef __COPYVALUES_H__
#define __COPYVALUES_H__ #define __COPYVALUES_H__
#include "../XTensor.h" #include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor) namespace nts { // namespace nts(NiuTrans.Tensor)
......
...@@ -19,8 +19,8 @@ ...@@ -19,8 +19,8 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XTensor.h" #include "../../XTensor.h"
#include "../XName.h" #include "../../XName.h"
#include "ReduceMax.h" #include "ReduceMax.h"
#include "ReduceMax.cuh" #include "ReduceMax.cuh"
......
...@@ -19,9 +19,9 @@ ...@@ -19,9 +19,9 @@
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
*/ */
#include "../XDevice.h" #include "../../XDevice.h"
#include "../XTensor.h" #include "../../XTensor.h"
#include "../XUtility.h" #include "../../XUtility.h"
#include "ReduceMax.h" #include "ReduceMax.h"
#include "ReduceMax.cuh" #include "ReduceMax.cuh"
@@ -31,14 +31,10 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
 /*
 reduce a tensor to another that keeps the max value along a dimension - slow version
-
 Given a block of data, we go over each dimension i in the stride and we have
-
 output_i = max_{0<=j<strideNum} input_{i,j}
-
 where we can view the block as a matrix and input_{i,j} represents the item at the
 crossing of the i-th column and the j-th row.
-
 >> input - the input array (representing a tensor)
 >> output - the max over each block. NOTE: output is also an array
 >> stride - stride that we need to move to the next item
@@ -89,82 +85,77 @@ void KernelReduceMax(DTYPE * input, DTYPE * output,
 }
 /*
 reduce a tensor to another that keeps the max value along a dimension - slow version
-
 Given a block of data, we go over each dimension i in the stride and we have
-
 output_i = max_{0<=j<strideNum} input_{i,j}
-
 where we can view the block as a matrix and input_{i,j} represents the item at the
 crossing of the i-th column and the j-th row.
-
 >> input - the input array (representing a tensor)
 >> output - the max over each block. NOTE: output is also an array
 >> stride - stride that we need to move to the next item
 >> strideNum - how many strides we need to finish the reduction
 >> reducedStrideNum - the number of strides after reduction
 >> blockSize - size of the block (i.e., stride * strideNum)
 >> blockNum - how many blocks
 */
 __global__
 void KernelReduceMax(__half * input, __half * output,
                      int stride, int strideNum, int reducedStrideNum,
                      int blockSize, int blockNum)
 {
     int idx = threadIdx.x * blockDim.y + threadIdx.y;
     unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
     unsigned int j = blockIdx.y * blockDim.y + threadIdx.y;
     if (i >= stride * blockNum)
         return;
 #if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
     __shared__ __half iData[MAX_CUDA_THREAD_NUM_PER_BLOCK * MIN_CUDA_SHARED_MEM_COL_SIZE / 2];
 #else
     __shared__ DTYPE iData[MAX_CUDA_THREAD_NUM_PER_BLOCK * MIN_CUDA_SHARED_MEM_COL_SIZE / 2];
 #endif
     __syncthreads();
     int k = i / stride;
     int iOffset = i % stride;
 #if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
     __half value = (i < stride * blockNum && j < strideNum) ?
                    input[blockSize * k + stride * j + iOffset] : __half(FLOAT16_MIN);
 #else
     DTYPE value = (i < stride * blockNum && j < strideNum) ?
                   __half2float(input[blockSize * k + stride * j + iOffset]) : FLOAT_MIN;
 #endif
     /* load data into the shared mem */
     iData[threadIdx.x * blockDim.y + threadIdx.y] = value;
     __syncthreads();
     /* do reduction in shared mem */
     for (unsigned int s = blockDim.y / 2; s > 0; s >>= 1) {
         if (threadIdx.y < s && iData[idx] < iData[idx + s]) {
             iData[idx] = iData[idx + s];
         }
         __syncthreads();
     }
 #if __CUDA_ARCH__ >= 530 || !defined(__CUDA_ARCH__)
     /* write result for this block to the output array */
     if (threadIdx.y == 0 && blockIdx.y < reducedStrideNum)
         output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = iData[threadIdx.x * blockDim.y];
 #else
     /* write result for this block to the output array */
     if (threadIdx.y == 0 && blockIdx.y < reducedStrideNum)
         output[(k * reducedStrideNum + blockIdx.y) * stride + iOffset] = __half(iData[threadIdx.x * blockDim.y]);
 #endif
 }
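The two kernels above implement the same contract at different precisions. As a reading aid, here is a host-side reference loop for that contract, a sketch only (plain C++ with float standing in for DTYPE, not the library's code path): for each block k and each in-stride offset i, take the max over the strideNum items spaced stride apart.

#include <algorithm>
#include <cfloat>
#include <vector>

/* reference semantics: output_{k,i} = max_{0<=j<strideNum} input[k * blockSize + j * stride + i] */
void ReduceMaxReference(const std::vector<float> & input, std::vector<float> & output,
                        int stride, int strideNum, int blockNum)
{
    int blockSize = stride * strideNum;
    output.assign((size_t)stride * blockNum, -FLT_MAX);
    for (int k = 0; k < blockNum; k++) {
        for (int i = 0; i < stride; i++) {
            float m = -FLT_MAX;
            for (int j = 0; j < strideNum; j++)
                m = std::max(m, input[(size_t)k * blockSize + (size_t)j * stride + i]);
            output[(size_t)k * stride + i] = m;
        }
    }
}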
 /*
 reduce a tensor to another that keeps the max value along a dimension - fast version
 >> input - the input array (representing a tensor)
...
@@ -338,9 +329,7 @@ void KernelReduceMaxSimpleFast(DTYPE * input, DTYPE * output,
 /*
 get the max-valued items along a dimension of the tensor (cuda version).
 For a 1-dimensional data array a,
-
 output_i = max_{0<=j<strideNum} input_{i,j}
-
 >> input - the input tensor
 >> output - the output tensor
 >> dim - which dimension to reduce
...
@@ -22,7 +22,7 @@
 #ifndef __REDUCEMAX_H__
 #define __REDUCEMAX_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts{ // namespace nts(NiuTrans.Tensor)
...
@@ -19,7 +19,7 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "ScaleAndShift.h"
+#include "../math/ScaleAndShift.h"
 #include "ReduceSum.h"
 #include "ReduceMean.h"
@@ -28,7 +28,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
 /*
 get the mean value along a dimension of the tensor. For a 1-dimensional data array a,
 mean = (1/n) * sum_i input_i
-
 >> input - the input tensor
 >> output - the output tensor
 >> dim - the dimension along which the reduction is performed
...
@@ -44,5 +43,4 @@ void ReduceMean(XTensor * input, XTensor * output, int dim)
 ScaleAndShift(output, (DTYPE)1/num, 0);
 }
-
 } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
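The body above shows the whole trick: ReduceMean is a sum reduction followed by ScaleAndShift with scale = 1/n and shift = 0. A one-dimensional sketch of the same two-step composition (plain C++, not the tensor version):

#include <vector>

/* mean = (1/n) * sum_i a_i, written as the two steps ReduceMean chains */
float Mean1D(const std::vector<float> & a)
{
    if (a.empty())
        return 0.0f;
    float sum = 0.0f;
    for (float v : a)
        sum += v;                                     /* the ReduceSum step */
    return sum * (1.0f / (float)a.size()) + 0.0f;     /* ScaleAndShift(output, 1/n, 0) */
}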
@@ -22,7 +22,7 @@
 #ifndef __REDUCEMEAN_H__
 #define __REDUCEMEAN_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts{ // namespace nts(NiuTrans.Tensor)
...
@@ -22,7 +22,7 @@
 #ifndef __REDUCESTANDARDVARIANCE_H__
 #define __REDUCESTANDARDVARIANCE_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts { // namespace nts(NiuTrans.Tensor)
...
@@ -22,7 +22,7 @@
 #include <math.h>
 #include "ReduceSum.h"
 #include "ReduceSum.cuh"
-#include "../XName.h"
+#include "../../XName.h"
 namespace nts{ // namespace nts(NiuTrans.Tensor)
...
@@ -19,8 +19,8 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XDevice.h"
+#include "../../XDevice.h"
-#include "../XUtility.h"
+#include "../../XUtility.h"
 #include "ReduceSum.cuh"
 namespace nts{ // namespace nts(NiuTrans.Tensor)
@@ -29,13 +29,11 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
 /*
 reduce a tensor to another that keeps the sum along a dimension - slow version
 Given a block of data, we go over each dimension i in the stride and we have
-
 sum_i = sum_{0<=j<strideNum} exp(input_{i,j} - shift) if isExp == true;
       = sum_{0<=j<strideNum} (input_{i,j} - shift)    if isExp == false;
-
 where we can view the block as a matrix and input_{i,j} represents the item at the
 crossing of the i-th column and the j-th row.
 >> input - the input array (representing a tensor)
 >> output - the sum over each block. NOTE: output is also an array
 >> stride - stride that we need to move to the next item
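The shift/isExp pair is what lets this one kernel serve numerically stable softmax-style sums: with shift typically set to the row maximum and isExp == true, it produces sum_j exp(x_j - max). A per-row reference of the two modes, a sketch only (plain C++, shift applied per element in both branches, mirroring the formulas above):

#include <cmath>
#include <vector>

/* one row of the reduction: sum_j exp(x_j - shift) if isExp,
   otherwise sum_j (x_j - shift) */
float ShiftedRowSum(const std::vector<float> & x, float shift, bool isExp)
{
    float sum = 0.0f;
    for (float v : x)
        sum += isExp ? std::exp(v - shift) : (v - shift);
    return sum;
}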
@@ -107,13 +105,11 @@ void KernelReduceSum(DTYPE * input, DTYPE * output,
 /*
 reduce a tensor to another that keeps the sum along a dimension - slow version
 This is for float16 reduction.
 Given a block of data, we go over each dimension i in the stride and we have
-
 sum_i = sum_{0<=j<strideNum} exp(input_{i,j} - shift) if isExp == true;
       = sum_{0<=j<strideNum} (input_{i,j} - shift)    if isExp == false;
-
 where we can view the block as a matrix and input_{i,j} represents the item at the
 crossing of the i-th column and the j-th row.
 >> input - the input array (representing a tensor)
 >> output - the sum over each block. NOTE: output is also an array
 >> stride - stride that we need to move to the next item
...
@@ -304,7 +300,6 @@ void KernelReduceSumFast(DTYPE * input, DTYPE * output,
 /*
 reduce a tensor to another that keeps the sum along a dimension - fast version
 This is for float16 reduction
-
 >> input - the input array (representing a tensor)
 >> output - the sum over each block. NOTE: output is also an array
 >> stride - stride that we need to move to the next item
...
@@ -22,7 +22,7 @@
 #ifndef __REDUCESUM_H__
 #define __REDUCESUM_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts{ // namespace nts(NiuTrans.Tensor)
...
@@ -28,7 +28,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
 squared sum of the items along a dimension of the tensor.
 For a 1-dimensional data array a,
 sum = \sum_i (a_i - shift)^2
-
 >> input - the input tensor
 >> output - the output tensor
 >> dim - the dimension along which the reduction is performed
...
@@ -22,7 +22,7 @@
 #ifndef __REDUCESUMSQUARED_H__
 #define __REDUCESUMSQUARED_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts{ // namespace nts(NiuTrans.Tensor)
...
@@ -19,7 +19,7 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "ScaleAndShift.h"
+#include "../math/ScaleAndShift.h"
 #include "ReduceSum.h"
 #include "ReduceVariance.h"
@@ -29,7 +29,6 @@ namespace nts{ // namespace nts(NiuTrans.Tensor)
 variance of the items along a dimension of the tensor.
 For a 1-dimensional data array a,
 variance = 1/n * \sum_i (a_i - mean)^2
-
 >> input - the input tensor
 >> output - the output tensor
 >> dim - the dimension along which the reduction is performed
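Given the includes above (ScaleAndShift plus the sum reductions), ReduceVariance presumably chains a mean, a mean-shifted squared sum, and a 1/n scale; here is a one-dimensional sketch under that assumption (plain C++, not the library's exact call sequence):

#include <vector>

/* variance = (1/n) * sum_i (a_i - mean)^2 */
float Variance1D(const std::vector<float> & a)
{
    if (a.empty())
        return 0.0f;
    float mean = 0.0f;
    for (float v : a)
        mean += v;
    mean /= (float)a.size();                 /* the mean step */
    float sq = 0.0f;
    for (float v : a)
        sq += (v - mean) * (v - mean);       /* squared sum with shift = mean */
    return sq / (float)a.size();             /* the final 1/n scale */
}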
...
@@ -22,7 +22,7 @@
 #ifndef __REDUCEVARIANCE_H__
 #define __REDUCEVARIANCE_H__
-#include "../XTensor.h"
+#include "../../XTensor.h"
 namespace nts{ // namespace nts(NiuTrans.Tensor)
...
@@ -19,7 +19,7 @@
 * $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-04-24
 */
-#include "../XTensor.h"
+#include "../../XTensor.h"
 #include "Concatenate.h"
 #include "Merge.h"
 #include "ConcatenateSolely.h"
@@ -53,6 +53,10 @@ void Concatenate(XList * smalls, XTensor * big, int dim)
 /*
 concatenate two tensors along a given dimension
+>> smallA - one tensor for concatenation
+>> smallB - the other tensor for concatenation
+>> big - the resulting tensor
+>> dim - the dimension along which we perform the concatenation
 */
 void Concatenate(XTensor * smallA, XTensor * smallB, XTensor * big, int dim)
 {
...
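The two-tensor overload documented here most plausibly just forwards to the XList-based Concatenate named in the hunk header; a sketch under that assumption (the XList usage is illustrative, not confirmed by this diff):

/* hypothetical forwarding body: collect the pair into a list and
   reuse the list-based overload */
void Concatenate(XTensor * smallA, XTensor * smallB, XTensor * big, int dim)
{
    XList smalls(2);
    smalls.Add(smallA);
    smalls.Add(smallB);
    Concatenate(&smalls, big, dim);
}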