split with stream

3f23f074 · xiaotong · 70e478c4 · 3f23f074 · 3f23f074 · 3f23f074
Commit 3f23f074 authored Jul 27, 2018 by xiaotong
--- a/source/network/Main.cpp
+++ b/source/network/Main.cpp
@@ -32,9 +32,8 @@
 using namespace nts;
 using namespace samplefnnlm;

-
 int main( int argc, const char ** argv )
-{   
+{
    if(argc > 1 && !strcmp(argv[1], "-test"))
        1;//Test();
    else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))

--- a/source/tensor/XLink.cpp
+++ b/source/tensor/XLink.cpp
@@ -167,7 +167,9 @@ void XLink::SetType(int id)
    type[0] = 0;
    strcpy(type, GetOPName(id));
    typeID = id;
-    CheckNTErrors(strcmp(type, "NULL"), "illegal edge type name!");
+    if(id != 0){
+        CheckNTErrors(strcmp(type, "NULL"), "illegal edge type name!");
+    }
 }

 /* 
@@ -515,7 +517,7 @@ void XLink::CopyIncoming(const XTensor * reference, XTensor * target)
        tails.Add(tail);
    }

-    MakeLink(&tails, target, reference->id);
+    MakeLink(&tails, target, reference->income.typeID);

    int paraNum = reference->income.paramNum;
    target->income.paramNum = paraNum;

--- a/source/tensor/XUtility.cpp
+++ b/source/tensor/XUtility.cpp
@@ -284,6 +284,44 @@ void XMemCopy2D(void * t, size_t tPitch, int devIDT, const void * s, size_t sPit
 #endif
 }

+/* asynchronous 2D memory copy on a stream (a host-to-host copy is done synchronously by memcpy)
+>> t, tPitch, devIDT - target address, its row pitch in bytes and its device id (< 0 means host memory)
+>> s, sPitch, devIDS - source address, its row pitch in bytes and its device id (< 0 means host memory)
+>> mSize, n - number of bytes per row and number of rows; stream - stream of the device copy (not NULL) */
+void XMemCopy2DAsync(void * t, size_t tPitch, int devIDT, const void * s, size_t sPitch, int devIDS, size_t mSize, int n, XStream * stream)
+{
+    if (t == s)
+        return;
+
+    if (devIDT < 0 && devIDS < 0) {
+        for(int i = 0; i < n; i++)
+            memcpy((char*)t + tPitch * i, (const char*)s + sPitch * i, mSize);
+        return;
+    }
+#ifdef USE_CUDA
+    else{
+        CheckNTErrors(stream != NULL, "No stream found!");
+        if (devIDT >= 0 && devIDS < 0) {
+            if(cudaMemcpy2DAsync(t, tPitch, s, sPitch, mSize, n, cudaMemcpyHostToDevice, stream->stream) != cudaSuccess){
+                ShowNTErrors("cudaMemcpy2DAsync error (cudaMemcpyHostToDevice)");
+            }
+        }
+        else if (devIDT < 0 && devIDS >= 0) {
+            if(cudaMemcpy2DAsync(t, tPitch, s, sPitch, mSize, n, cudaMemcpyDeviceToHost, stream->stream) != cudaSuccess){
+                ShowNTErrors("cudaMemcpy2DAsync error (cudaMemcpyDeviceToHost)");
+            }
+        }
+        else {
+            if(cudaMemcpy2DAsync(t, tPitch, s, sPitch, mSize, n, cudaMemcpyDeviceToDevice, stream->stream) != cudaSuccess){
+                ShowNTErrors("cudaMemcpy2DAsync error (cudaMemcpyDeviceToDevice)");
+            }
+        }
+    }
+#else
+    ShowNTErrors("Please specify USE_CUDA and recompile the code!");
+#endif
+}
+
 void * XMemAlloc(int devID, size_t size)
 {
    void * p = NULL;

--- a/source/tensor/XUtility.h
+++ b/source/tensor/XUtility.h
@@ -23,6 +23,7 @@

 #include <stdio.h>
 #include "XGlobal.h"
+#include "XDevice.h"

 #ifndef __XUTILITY_H__
 #define __XUTILITY_H__
@@ -41,6 +42,7 @@ extern void XMemSet(void * p, int value, size_t size);
 extern void XMemSet(int devID, void * p, int value, size_t size);
 extern void XMemCopy(void * t, int devIDT, const void * s, int devIDS, size_t size);
 extern void XMemCopy2D(void * t, size_t tPitch, int devIDT, const void * s, size_t sPitch, int devIDS, size_t mSize, int n);
+extern void XMemCopy2DAsync(void * t, size_t tPitch, int devIDT, const void * s, size_t sPitch, int devIDS, size_t mSize, int n, XStream * stream);
 extern void * XMemAlloc(int devID, size_t size);
 extern void * XMemAllocOnDev(int devID, size_t size);
 extern void XMemFree(int devID, void * p);

--- a/source/tensor/core/shape/MakeSplitBlockIndex.cpp
+++ b/source/tensor/core/shape/MakeSplitBlockIndex.cpp
@@ -31,13 +31,13 @@ set target data block index for the data movement in split
 >> splitNum - number of splits
 >> blockSplitSize - size of the splitted block
 >> blockNum - number of data blocks
->> mem - the memory pool
+>> devID - device id
 */
-void _MakeSplitBlockIndex(int * blockIndex, int splitNum, int blockSplitSize, int blockNum, XMem * mem)
+void _MakeSplitBlockIndex(int * blockIndex, int splitNum, int blockSplitSize, int blockNum, int devID)
 {
-    if (mem != NULL && mem->devID >= 0) {
+    if (devID >= 0) {
 #ifdef USE_CUDA
-        _CudaMakeSplitBlockIndex(mem->devID, blockIndex, splitNum, blockSplitSize, blockNum);
+        _CudaMakeSplitBlockIndex(devID, blockIndex, splitNum, blockSplitSize, blockNum);
 #else
        ShowNTErrors("Please specify USE_CUDA and recompile the code!");
 #endif

--- a/source/tensor/core/shape/MakeSplitBlockIndex.h
+++ b/source/tensor/core/shape/MakeSplitBlockIndex.h
@@ -27,7 +27,7 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)

 /* set target data block index for the data movement in split */
-void _MakeSplitBlockIndex(int * blockIndex, int splitNum, int blockSplitSize, int blockNum, XMem * mem);
+void _MakeSplitBlockIndex(int * blockIndex, int splitNum, int blockSplitSize, int blockNum, int devID);

 } // namespace nts(NiuTrans.Tensor)


--- a/source/tensor/core/shape/Merge.cpp
+++ b/source/tensor/core/shape/Merge.cpp
@@ -42,6 +42,8 @@ e.g., (N/3, M, 3) -> (N, M)
 */
 void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
 {
+    if(leadingDim < 0)
+        leadingDim = 0;
 	int whereToMergeRDI = s->order - whereToMerge - 1;
 	int leadingDimRDI = s->order - leadingDim - 1;
    if (leadingDimRDI < 0)
@@ -268,10 +270,10 @@ void _Merge(const XList * smalls, XTensor * big, int whereToMerge)
    }
    /* merging with fewer kernel/api calls??? (i'm not sure about it!! may remove this later) */
    else {
-        int* dimSizeTMP = new int[MAX_TENSOR_DIM_NUM];
-        for (int i = 0; i < MAX_TENSOR_DIM_NUM; i++)
-            dimSizeTMP[i] = -smallsItem0->dimSizeRDI[i];
-        dimSizeTMP[smallsItem0->order] = -mergeNum;
+        int* dimSizeTMP = new int[smallsItem0->order + 1];
+        for (int i = 0; i < smallsItem0->order; i++)
+            dimSizeTMP[i + 1] = -smallsItem0->dimSize[i];
+        dimSizeTMP[0] = -mergeNum;

        XMem * mem = smallsItem0->mem;
        XTensor * tensorTMP = new XTensor(smallsItem0->order + 1, dimSizeTMP,
@@ -283,7 +285,7 @@ void _Merge(const XList * smalls, XTensor * big, int whereToMerge)
        if (uniform)
            dataTMP = smallsItem0->data;
        else
-            dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(mem->devID, size);
+            dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(big->devID, size);

        tensorTMP->data = dataTMP;

@@ -295,7 +297,7 @@ void _Merge(const XList * smalls, XTensor * big, int whereToMerge)
            }
        }

-        _Merge(tensorTMP, big, whereToMerge);
+        _Merge(tensorTMP, big, whereToMerge + 1);

        delete[] dimSizeTMP;
        tensorTMP->data = NULL;
@@ -306,7 +308,7 @@ void _Merge(const XList * smalls, XTensor * big, int whereToMerge)
        if ((!uniform) && (mem != NULL))
            mem->ReleaseBuf(mem->devID, size);
        else
-            XMemFree(mem->devID, dataTMP);
+            XMemFree(big->devID, dataTMP);
    }
 }


--- a/source/tensor/core/shape/Split.cpp
+++ b/source/tensor/core/shape/Split.cpp
@@ -24,6 +24,7 @@
 #include "MakeSplitBlockIndex.h"
 #include "../../XName.h"
 #include "../../XTensor.h"
+#include "../../XDevice.h"
 #include "../../XUtility.h"
 #include "../movement/CopyBlocksOnSite.h"

@@ -82,16 +83,40 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
    CheckNTErrors((blockNum % splitNum == 0), "Incorrect split number!");

    if (splitNum <= MIN_TENSOR_SPLIT_NUM) {
+    //if (splitNum <= 0) {
        int sPitch = blockSize * splitNum * s->unitSize;
        int tPitch = blockSize * t->unitSize;
        int mSize = blockSize * t->unitSize;
        int n = blockNum / splitNum;
        int sStep = blockSize * s->unitSize;
        int tStep = n * tPitch;
-        for (int k = 0; k < splitNum; k++) {
-            XMemCopy2D((char*)t->data + k * tStep, tPitch, t->devID,
-                       (char*)s->data + k * sStep, sPitch, s->devID,
-                        mSize, n);
+        if(t->devID < 0){
+            for (int k = 0; k < splitNum; k++) {
+                XMemCopy2D((char*)t->data + k * tStep, tPitch, t->devID,
+                           (char*)s->data + k * sStep, sPitch, s->devID,
+                            mSize, n);
+            }
+        }
+        else{
+#ifdef USE_CUDA
+#ifdef STREAMED_MEMCPOPY
+            XStream * stream = GDevs.GPUs[t->devID].stream;
+            for (int k = 0; k < splitNum; k++) {
+                XMemCopy2DAsync((char*)t->data + k * tStep, tPitch, t->devID,
+                                (char*)s->data + k * sStep, sPitch, s->devID,
+                                 mSize, n, stream);
+            }
+            stream->StreamSynchronize();
+#else
+            for (int k = 0; k < splitNum; k++) {
+                XMemCopy2D((char*)t->data + k * tStep, tPitch, t->devID,
+                           (char*)s->data + k * sStep, sPitch, s->devID,
+                            mSize, n);
+            }
+#endif
+#else
+            ShowNTErrors("Please specify USE_CUDA and recompile the code!");
+#endif
        }
    }
    else {
@@ -108,10 +133,10 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
        int blockSplitSize = blockNum / splitNum;

        int * blockIndex = (int*)(mem != NULL ?
-            mem->AllocBuf(mem->devID, blockNum * sizeof(int)) :
-            XMemAlloc(mem->devID, blockNum * sizeof(int)));
+                                  mem->AllocBuf(mem->devID, blockNum * sizeof(int)) :
+                                  XMemAlloc(s->devID, blockNum * sizeof(int)));

-        _MakeSplitBlockIndex(blockIndex, splitNum, blockSplitSize, blockNum, mem);
+        _MakeSplitBlockIndex(blockIndex, splitNum, blockSplitSize, blockNum, s->devID);

        _CopyBlocksOnSite(s->data, realBlockSize, blockNum, dataTMP, blockIndex, mem);

@@ -226,20 +251,46 @@ void _Split(const XTensor * big, XList * smalls, int whereToSplit, int splitNum)
        int n = blockNum / splitNum;
        int sStep = blockSize * big->unitSize;
        int tStep = 0;
-        for (int k = 0; k < splitNum; k++) {
-            XTensor * t = (XTensor*)smalls->GetItem(k);
-            XMemCopy2D((char*)t->data + k * tStep, tPitch, t->devID,
-                       (char*)big->data + k * sStep, sPitch, big->devID,
-                        mSize, n);
+
+        if(big->devID < 0){
+            for (int k = 0; k < splitNum; k++) {
+                XTensor * t = (XTensor*)smalls->GetItem(k);
+                XMemCopy2D((char*)t->data + k * tStep, tPitch, t->devID,
+                           (char*)big->data + k * sStep, sPitch, big->devID,
+                            mSize, n);
+            }
+        }
+        else{
+#ifdef USE_CUDA
+#ifdef STREAMED_MEMCPOPY
+            XStream * stream = GDevs.GPUs[big->devID].stream;
+            for (int k = 0; k < splitNum; k++) {
+                XTensor * t = (XTensor*)smalls->GetItem(k);
+                XMemCopy2DAsync((char*)t->data + k * tStep, tPitch, t->devID,
+                                (char*)big->data + k * sStep, sPitch, big->devID,
+                                 mSize, n, stream);
+            }
+            stream->StreamSynchronize();
+#else
+            for (int k = 0; k < splitNum; k++) {
+                XTensor * t = (XTensor*)smalls->GetItem(k);
+                XMemCopy2D((char*)t->data + k * tStep, tPitch, t->devID,
+                           (char*)big->data + k * sStep, sPitch, big->devID,
+                            mSize, n);
+            }
+#endif
+#else
+            ShowNTErrors("Please specify USE_CUDA and recompile the code!");
+#endif
        }
    }
    /* splitting with fewer kernel/api calls??? (i'm not sure about it!! may remove this later) */
    else {
-        int* dimSizeTMP = new int[MAX_TENSOR_DIM_NUM];
-        for (int i = 0; i < MAX_TENSOR_DIM_NUM; i++)
-            dimSizeTMP[i] = -big->dimSize[i];
-        dimSizeTMP[whereToSplit] /= splitNum;
-        dimSizeTMP[big->order] = -splitNum;
+        int* dimSizeTMP = new int[big->order + 1];
+        for (int i = 0; i < big->order; i++)
+            dimSizeTMP[i + 1] = -big->dimSize[i];
+        dimSizeTMP[whereToSplit + 1] /= splitNum;
+        dimSizeTMP[0] = -splitNum;

        XMem * mem = big->mem;
        XTensor* tensorTMP = new XTensor(big->order + 1, dimSizeTMP, big->dataType, big->denseRatio, big->devID, mem);
@@ -251,7 +302,7 @@ void _Split(const XTensor * big, XList * smalls, int whereToSplit, int splitNum)
            dataTMP = first->data;
        }
        else {
-            dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(mem->devID, size);
+            dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(big->devID, size);
        }

        tensorTMP->data = dataTMP;
@@ -276,7 +327,7 @@ void _Split(const XTensor * big, XList * smalls, int whereToSplit, int splitNum)
        if ((!uniform) && (mem != NULL))
            mem->ReleaseBuf(mem->devID, size);
        else
-            XMemFree(mem->devID, dataTMP);
+            XMemFree(big->devID, dataTMP);
    }
 }


--- a/source/tensor/core/shape/Split.h
+++ b/source/tensor/core/shape/Split.h
@@ -26,6 +26,8 @@

 namespace nts { // namespace nts(NiuTrans.Tensor)

+#define STREAMED_MEMCPOPY
+
 /* 
 transform a tensor by splitting it 
 e.g., (M, N) -> (M, N/3, 3)