clean the code of Split

f7ed3448 · xiaotong · 15f75d3a · f7ed3448 · f7ed3448 · f7ed3448
Commit f7ed3448 authored Jul 28, 2018 by xiaotong
--- a/source/network/Main.cpp
+++ b/source/network/Main.cpp
@@ -21,6 +21,7 @@

 #include <stdio.h>
 #include "XNet.h"
+#include "../tensor/XUtility.h"
 #include "../tensor/function/FHeader.h"
 #include "../tensor/core/CHeader.h"
 #include "../sample/fnnlm/FNNLM.h"

--- a/source/tensor/XTensor.h
+++ b/source/tensor/XTensor.h
@@ -47,10 +47,11 @@ struct XLink;
 /* define the maximum number of dimensions in a tensor */
 #define MAX_TENSOR_DIM_NUM 6
 #define USE_BATCHED_STRIDED_MAT_MUL
-#define MIN_TENSOR_SPLIT_NUM 10
+#define MIN_TENSOR_SPLIT_NUM 0
 #define MIN_TENSOR_SPLIT_LIST_NUM 1024
 #define MIN_TENSOR_CAT_NUM 8

+
 /* computation flags */
 #define UNSAFE_BUT_FAST_MEM
 #define FAST_MATRIX

--- a/source/tensor/core/movement/CopyBlocksOnSite.cu
+++ b/source/tensor/core/movement/CopyBlocksOnSite.cu
@@ -90,12 +90,12 @@ void _CudaCopyBlocks(void * source, int blockSize, int blockNum, void * target, 
    int bSize = blockSize / sizeof(DTYPE);

    if (bSize % 4 == 0) {
-        GDevs.GetCudaThread2D(myMem->devID, bSize / 4, blockNum, MAX_INT, cudaGrids, cudaBlocks);
+        GDevs.GetCudaThread2D(devID, bSize / 4, blockNum, MAX_INT, cudaGrids, cudaBlocks);
        KernelCopyBlocks<4> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>>
                             ((DTYPE*)source, bSize, blockNum, (DTYPE*)target, targetBlocks);
    }
    else {
-        GDevs.GetCudaThread2D(myMem->devID, bSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
+        GDevs.GetCudaThread2D(devID, bSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);
        KernelCopyBlocks<1> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >>>
                             ((DTYPE*)source, bSize, blockNum, (DTYPE*)target, targetBlocks);
    }

--- a/source/tensor/core/shape/MakeMergeBlockIndex.cpp
+++ b/source/tensor/core/shape/MakeMergeBlockIndex.cpp
@@ -33,14 +33,14 @@ set target data block index for the data movement in merge
 >> splitSizeInGrid - size of each data array to merge
 >> gridSize - number of blocks in a grid (here grid is a higher level orgnization upon blocks)
 >> gridNum - number of grids
->> mem - the memory pool
+>> devID - device id
 */
 void _MakeMergeBlockIndex(int * blockIndex, int blockNum, int blockNumInMerge,
-                          int splitSizeInGrid, int gridSize, int gridNum, XMem * mem)
+                          int splitSizeInGrid, int gridSize, int gridNum, int devID)
 {
-    if (mem != NULL && mem->devID >= 0) {
+    if (devID >= 0) {
 #ifdef USE_CUDA
-        _CudaMakeMergeBlockIndex(mem->devID, blockIndex, blockNum, blockNumInMerge, splitSizeInGrid, gridSize, gridNum);
+        _CudaMakeMergeBlockIndex(devID, blockIndex, blockNum, blockNumInMerge, splitSizeInGrid, gridSize, gridNum);
 #else
        ShowNTErrors("Please specify USE_CUDA and recompile the code!");
 #endif

--- a/source/tensor/core/shape/MakeMergeBlockIndex.h
+++ b/source/tensor/core/shape/MakeMergeBlockIndex.h
@@ -28,7 +28,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)

 /* set target data block index for the data movement in merge */
 void _MakeMergeBlockIndex(int * blockIndex, int blockNum, int blockNumInMerge,
-                          int splitSizeInGrid, int gridSize, int gridNum, XMem * mem);
+                          int splitSizeInGrid, int gridSize, int gridNum, int devID);

 } // namespace nts(NiuTrans.Tensor)


--- a/source/tensor/core/shape/Merge.cpp
+++ b/source/tensor/core/shape/Merge.cpp
@@ -44,10 +44,10 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
 {
    if(leadingDim < 0)
        leadingDim = 0;
-	int whereToMergeRDI = s->order - whereToMerge - 1;
-	int leadingDimRDI = s->order - leadingDim - 1;
+    int whereToMergeRDI = s->order - whereToMerge - 1;
+    int leadingDimRDI = s->order - leadingDim - 1;
    if (leadingDimRDI < 0)
-		leadingDimRDI = s->order - 1;
+        leadingDimRDI = s->order - 1;

    CheckNTErrors((s != NULL && t != NULL), "Invalid tensors!");
    CheckNTErrors((s->devID == t->devID || (s->devID < 0 && t->devID < 0)),
@@ -121,28 +121,24 @@ void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
        int realBlockSize = blockSize * t->unitSize;

        int * blockIndex = (int*)(mem != NULL ?
-            mem->AllocBuf(mem->devID, blockNum * gridNum * sizeof(int)) :
-            XMemAlloc(mem->devID, blockNum * gridNum * sizeof(int)));
+                                  mem->AllocBuf(mem->devID, blockNum * gridNum * sizeof(int)) :
+                                  XMemAlloc(s->devID, blockNum * gridNum * sizeof(int)));

-        _MakeMergeBlockIndex(blockIndex, blockNum, blockNumInMerge, splitSizeInGrid, gridSize, gridNum, mem);
+        _MakeMergeBlockIndex(blockIndex, blockNum, blockNumInMerge, splitSizeInGrid, gridSize, gridNum, s->devID);

        _CopyBlocksOnSite(s->data, realBlockSize, blockNum, dataTMP, blockIndex, s->devID);

        if (mem != NULL)
            mem->ReleaseBuf(mem->devID, blockNum * gridNum * sizeof(int));
        else
-            XMemFree(mem->devID, blockIndex);
-
-        /* copy from tmp to target */
-        XMemCopy(t->data, t->devID, dataTMP, s->devID, size);
+            XMemFree(s->devID, blockIndex);

        if (!isOnSameDevice) {
            XMemCopy(t->data, t->devID, dataTMP, s->devID, size);
-
            if (mem != NULL)
                mem->ReleaseBuf(mem->devID, size);
            else
-                XMemFree(mem->devID, dataTMP);
+                XMemFree(s->devID, dataTMP);
        }
    }
 }
@@ -165,7 +161,7 @@ XTensor Merge(const XTensor &s, int whereToMerge, int leadingDim)
    CheckNTErrors(leadingDim < whereToMerge, "Invalid leading dimension!");
    
    if (leadingDim < 0)
-		leadingDim = 0;
+        leadingDim = 0;
    int order = s.order - 1;
    int * dimSize = new int[order];

@@ -207,7 +203,7 @@ merge small tensors into a big tensor
 */
 void _Merge(const XList * smalls, XTensor * big, int whereToMerge)
 {
-	CheckNTErrors((smalls != NULL), "Invalid list!");
+    CheckNTErrors((smalls != NULL), "Invalid list!");
    CheckNTErrors((smalls->count > 0), "Empty list!");

    bool uniform = true;
@@ -235,7 +231,7 @@ void _Merge(const XList * smalls, XTensor * big, int whereToMerge)
    int mergedNum = smalls->count;

    XTensor * s0 = (XTensor*)smalls->GetItem(0);
-	int whereToMergeRDI = s0->order - whereToMerge - 1;
+    int whereToMergeRDI = s0->order - whereToMerge - 1;
    for (int i = 0; i < s0->order; i++) {
        if (i <= whereToMergeRDI)
            blockSize *= s0->dimSizeRDI[i];
@@ -300,9 +296,8 @@ void _Merge(const XList * smalls, XTensor * big, int whereToMerge)
        _Merge(tensorTMP, big, whereToMerge + 1);

        delete[] dimSizeTMP;
-        tensorTMP->data = NULL;
-        dataTMP = NULL;

+        tensorTMP->data = NULL;
        delete tensorTMP;

        if ((!uniform) && (mem != NULL))

--- a/source/tensor/core/shape/Split.cpp
+++ b/source/tensor/core/shape/Split.cpp
@@ -83,7 +83,6 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
    CheckNTErrors((blockNum % splitNum == 0), "Incorrect split number!");

    if (splitNum <= MIN_TENSOR_SPLIT_NUM) {
-    //if (splitNum <= 0) {
        int sPitch = blockSize * splitNum * s->unitSize;
        int tPitch = blockSize * t->unitSize;
        int mSize = blockSize * t->unitSize;
@@ -143,7 +142,7 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
        if (mem != NULL)
            mem->ReleaseBuf(mem->devID, blockNum * sizeof(int));
        else
-            XMemFree(mem->devID, blockIndex);
+            XMemFree(s->devID, blockIndex);

        /* copy from tmp to target */
        if (!isOnSameDevice) {
@@ -152,7 +151,7 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
            if (mem != NULL)
                mem->ReleaseBuf(mem->devID, size);
            else
-                XMemFree(mem->devID, dataTMP);
+                XMemFree(s->devID, dataTMP);
        }
    }
 }
@@ -321,7 +320,6 @@ void _Split(const XTensor * big, XList * smalls, int whereToSplit, int splitNum)
        delete[] dimSizeTMP;

        tensorTMP->data = NULL;
-        dataTMP = NULL;
        delete tensorTMP;

        if ((!uniform) && (mem != NULL))