Commit 3f23f074 by xiaotong

split with stream

parent 70e478c4
......@@ -32,9 +32,8 @@
using namespace nts;
using namespace samplefnnlm;
int main( int argc, const char ** argv )
{
{
if(argc > 1 && !strcmp(argv[1], "-test"))
1;//Test();
else if(argc > 1 && !strcmp(argv[1], "-fnnlm"))
......
......@@ -167,7 +167,9 @@ void XLink::SetType(int id)
type[0] = 0;
strcpy(type, GetOPName(id));
typeID = id;
CheckNTErrors(strcmp(type, "NULL"), "illegal edge type name!");
if(id != 0){
CheckNTErrors(strcmp(type, "NULL"), "illegal edge type name!");
}
}
/*
......@@ -515,7 +517,7 @@ void XLink::CopyIncoming(const XTensor * reference, XTensor * target)
tails.Add(tail);
}
MakeLink(&tails, target, reference->id);
MakeLink(&tails, target, reference->income.typeID);
int paraNum = reference->income.paramNum;
target->income.paramNum = paraNum;
......
......@@ -284,6 +284,44 @@ void XMemCopy2D(void * t, size_t tPitch, int devIDT, const void * s, size_t sPit
#endif
}
/* 
copy a 2d data array asynchronously on a given stream
>> t - pointer to the target data block
>> tPitch - pitch (in bytes) between consecutive rows of the target
>> devIDT - device id of the target (negative means CPU)
>> s - pointer to the source data block
>> sPitch - pitch (in bytes) between consecutive rows of the source
>> devIDS - device id of the source (negative means CPU)
>> mSize - number of bytes to copy per row
>> n - number of rows
>> stream - the stream the asynchronous copy is issued on
             (required whenever a GPU is involved)
*/
void XMemCopy2DAsync(void * t, size_t tPitch, int devIDT, const void * s, size_t sPitch, int devIDS, size_t mSize, int n, XStream * stream)
{
    /* copying a buffer onto itself is a no-op */
    if (t == s)
        return;

    /* CPU -> CPU: a synchronous row-by-row memcpy; no stream is needed */
    if (devIDT < 0 && devIDS < 0) {
        for(int i = 0; i < n; i++)
            memcpy((char*)t + tPitch * i, (char*)s + sPitch * i, mSize);
        return;
    }
#ifdef USE_CUDA
    else{
        CheckNTErrors(stream != NULL, "No stream found!");
        cudaStream_t &cstream = stream->stream;
        if (devIDT >= 0 && devIDS < 0) {
            /* CPU -> GPU */
            cudaError_t error = cudaMemcpy2DAsync(t, tPitch, s, sPitch, mSize, n, cudaMemcpyHostToDevice, cstream);
            if(error != cudaSuccess){
                /* name the API that actually failed (was "cudaMemcpy2D") */
                ShowNTErrors("cudaMemcpy2DAsync error (cudaMemcpyHostToDevice)");
            }
        }
        else if (devIDT < 0 && devIDS >= 0) {
            /* GPU -> CPU */
            cudaError_t error = cudaMemcpy2DAsync(t, tPitch, s, sPitch, mSize, n, cudaMemcpyDeviceToHost, cstream);
            if(error != cudaSuccess){
                ShowNTErrors("cudaMemcpy2DAsync error (cudaMemcpyDeviceToHost)");
            }
        }
        else {
            /* GPU -> GPU */
            cudaError_t error = cudaMemcpy2DAsync(t, tPitch, s, sPitch, mSize, n, cudaMemcpyDeviceToDevice, cstream);
            if (error != cudaSuccess) {
                ShowNTErrors("cudaMemcpy2DAsync error (cudaMemcpyDeviceToDevice)");
            }
        }
    }
#else
    ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
void * XMemAlloc(int devID, size_t size)
{
void * p = NULL;
......
......@@ -23,6 +23,7 @@
#include <stdio.h>
#include "XGlobal.h"
#include "XDevice.h"
#ifndef __XUTILITY_H__
#define __XUTILITY_H__
......@@ -41,6 +42,7 @@ extern void XMemSet(void * p, int value, size_t size);
extern void XMemSet(int devID, void * p, int value, size_t size);
extern void XMemCopy(void * t, int devIDT, const void * s, int devIDS, size_t size);
extern void XMemCopy2D(void * t, size_t tPitch, int devIDT, const void * s, size_t sPitch, int devIDS, size_t mSize, int n);
extern void XMemCopy2DAsync(void * t, size_t tPitch, int devIDT, const void * s, size_t sPitch, int devIDS, size_t mSize, int n, XStream * stream);
extern void * XMemAlloc(int devID, size_t size);
extern void * XMemAllocOnDev(int devID, size_t size);
extern void XMemFree(int devID, void * p);
......
......@@ -31,13 +31,13 @@ set target data block index for the data movement in split
>> splitNum - number of splits
>> blockSplitSize - size of the splitted block
>> blockNum - number of data blocks
>> mem - the memory pool
>> devID - device id
*/
void _MakeSplitBlockIndex(int * blockIndex, int splitNum, int blockSplitSize, int blockNum, XMem * mem)
void _MakeSplitBlockIndex(int * blockIndex, int splitNum, int blockSplitSize, int blockNum, int devID)
{
if (mem != NULL && mem->devID >= 0) {
if (devID >= 0) {
#ifdef USE_CUDA
_CudaMakeSplitBlockIndex(mem->devID, blockIndex, splitNum, blockSplitSize, blockNum);
_CudaMakeSplitBlockIndex(devID, blockIndex, splitNum, blockSplitSize, blockNum);
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
......
......@@ -27,7 +27,7 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* set target data block index for the data movement in split */
void _MakeSplitBlockIndex(int * blockIndex, int splitNum, int blockSplitSize, int blockNum, XMem * mem);
void _MakeSplitBlockIndex(int * blockIndex, int splitNum, int blockSplitSize, int blockNum, int devID);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -42,6 +42,8 @@ e.g., (N/3, M, 3) -> (N, M)
*/
void _Merge(const XTensor * s, XTensor * t, int whereToMerge, int leadingDim)
{
if(leadingDim < 0)
leadingDim = 0;
int whereToMergeRDI = s->order - whereToMerge - 1;
int leadingDimRDI = s->order - leadingDim - 1;
if (leadingDimRDI < 0)
......@@ -268,10 +270,10 @@ void _Merge(const XList * smalls, XTensor * big, int whereToMerge)
}
/* merging with fewer kernel/api calls??? (i'm not sure about it!! may remove this later) */
else {
int* dimSizeTMP = new int[MAX_TENSOR_DIM_NUM];
for (int i = 0; i < MAX_TENSOR_DIM_NUM; i++)
dimSizeTMP[i] = -smallsItem0->dimSizeRDI[i];
dimSizeTMP[smallsItem0->order] = -mergeNum;
int* dimSizeTMP = new int[smallsItem0->order + 1];
for (int i = 0; i < smallsItem0->order; i++)
dimSizeTMP[i + 1] = -smallsItem0->dimSize[i];
dimSizeTMP[0] = -mergeNum;
XMem * mem = smallsItem0->mem;
XTensor * tensorTMP = new XTensor(smallsItem0->order + 1, dimSizeTMP,
......@@ -283,7 +285,7 @@ void _Merge(const XList * smalls, XTensor * big, int whereToMerge)
if (uniform)
dataTMP = smallsItem0->data;
else
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(mem->devID, size);
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(big->devID, size);
tensorTMP->data = dataTMP;
......@@ -295,7 +297,7 @@ void _Merge(const XList * smalls, XTensor * big, int whereToMerge)
}
}
_Merge(tensorTMP, big, whereToMerge);
_Merge(tensorTMP, big, whereToMerge + 1);
delete[] dimSizeTMP;
tensorTMP->data = NULL;
......@@ -306,7 +308,7 @@ void _Merge(const XList * smalls, XTensor * big, int whereToMerge)
if ((!uniform) && (mem != NULL))
mem->ReleaseBuf(mem->devID, size);
else
XMemFree(mem->devID, dataTMP);
XMemFree(big->devID, dataTMP);
}
}
......
......@@ -24,6 +24,7 @@
#include "MakeSplitBlockIndex.h"
#include "../../XName.h"
#include "../../XTensor.h"
#include "../../XDevice.h"
#include "../../XUtility.h"
#include "../movement/CopyBlocksOnSite.h"
......@@ -82,16 +83,40 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
CheckNTErrors((blockNum % splitNum == 0), "Incorrect split number!");
if (splitNum <= MIN_TENSOR_SPLIT_NUM) {
//if (splitNum <= 0) {
int sPitch = blockSize * splitNum * s->unitSize;
int tPitch = blockSize * t->unitSize;
int mSize = blockSize * t->unitSize;
int n = blockNum / splitNum;
int sStep = blockSize * s->unitSize;
int tStep = n * tPitch;
for (int k = 0; k < splitNum; k++) {
XMemCopy2D((char*)t->data + k * tStep, tPitch, t->devID,
(char*)s->data + k * sStep, sPitch, s->devID,
mSize, n);
if(t->devID < 0){
for (int k = 0; k < splitNum; k++) {
XMemCopy2D((char*)t->data + k * tStep, tPitch, t->devID,
(char*)s->data + k * sStep, sPitch, s->devID,
mSize, n);
}
}
else{
#ifdef USE_CUDA
#ifdef STREAMED_MEMCPOPY
XStream * stream = GDevs.GPUs[t->devID].stream;
for (int k = 0; k < splitNum; k++) {
XMemCopy2DAsync((char*)t->data + k * tStep, tPitch, t->devID,
(char*)s->data + k * sStep, sPitch, s->devID,
mSize, n, stream);
}
stream->StreamSynchronize();
#else
for (int k = 0; k < splitNum; k++) {
XMemCopy2D((char*)t->data + k * tStep, tPitch, t->devID,
(char*)s->data + k * sStep, sPitch, s->devID,
mSize, n);
}
#endif
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
}
else {
......@@ -108,10 +133,10 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
int blockSplitSize = blockNum / splitNum;
int * blockIndex = (int*)(mem != NULL ?
mem->AllocBuf(mem->devID, blockNum * sizeof(int)) :
XMemAlloc(mem->devID, blockNum * sizeof(int)));
mem->AllocBuf(mem->devID, blockNum * sizeof(int)) :
XMemAlloc(s->devID, blockNum * sizeof(int)));
_MakeSplitBlockIndex(blockIndex, splitNum, blockSplitSize, blockNum, mem);
_MakeSplitBlockIndex(blockIndex, splitNum, blockSplitSize, blockNum, s->devID);
_CopyBlocksOnSite(s->data, realBlockSize, blockNum, dataTMP, blockIndex, mem);
......@@ -226,20 +251,46 @@ void _Split(const XTensor * big, XList * smalls, int whereToSplit, int splitNum)
int n = blockNum / splitNum;
int sStep = blockSize * big->unitSize;
int tStep = 0;
for (int k = 0; k < splitNum; k++) {
XTensor * t = (XTensor*)smalls->GetItem(k);
XMemCopy2D((char*)t->data + k * tStep, tPitch, t->devID,
(char*)big->data + k * sStep, sPitch, big->devID,
mSize, n);
if(big->devID < 0){
for (int k = 0; k < splitNum; k++) {
XTensor * t = (XTensor*)smalls->GetItem(k);
XMemCopy2D((char*)t->data + k * tStep, tPitch, t->devID,
(char*)big->data + k * sStep, sPitch, big->devID,
mSize, n);
}
}
else{
#ifdef USE_CUDA
#ifdef STREAMED_MEMCPOPY
XStream * stream = GDevs.GPUs[big->devID].stream;
for (int k = 0; k < splitNum; k++) {
XTensor * t = (XTensor*)smalls->GetItem(k);
XMemCopy2DAsync((char*)t->data + k * tStep, tPitch, t->devID,
(char*)big->data + k * sStep, sPitch, big->devID,
mSize, n, stream);
}
stream->StreamSynchronize();
#else
for (int k = 0; k < splitNum; k++) {
XTensor * t = (XTensor*)smalls->GetItem(k);
XMemCopy2D((char*)t->data + k * tStep, tPitch, t->devID,
(char*)big->data + k * sStep, sPitch, big->devID,
mSize, n);
}
#endif
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
}
/* splitting with fewer kernel/api calls??? (i'm not sure about it!! may remove this later) */
else {
int* dimSizeTMP = new int[MAX_TENSOR_DIM_NUM];
for (int i = 0; i < MAX_TENSOR_DIM_NUM; i++)
dimSizeTMP[i] = -big->dimSize[i];
dimSizeTMP[whereToSplit] /= splitNum;
dimSizeTMP[big->order] = -splitNum;
int* dimSizeTMP = new int[big->order + 1];
for (int i = 0; i < big->order; i++)
dimSizeTMP[i + 1] = -big->dimSize[i];
dimSizeTMP[whereToSplit + 1] /= splitNum;
dimSizeTMP[0] = -splitNum;
XMem * mem = big->mem;
XTensor* tensorTMP = new XTensor(big->order + 1, dimSizeTMP, big->dataType, big->denseRatio, big->devID, mem);
......@@ -251,7 +302,7 @@ void _Split(const XTensor * big, XList * smalls, int whereToSplit, int splitNum)
dataTMP = first->data;
}
else {
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(mem->devID, size);
dataTMP = mem != NULL ? mem->AllocBuf(mem->devID, size) : XMemAlloc(big->devID, size);
}
tensorTMP->data = dataTMP;
......@@ -276,7 +327,7 @@ void _Split(const XTensor * big, XList * smalls, int whereToSplit, int splitNum)
if ((!uniform) && (mem != NULL))
mem->ReleaseBuf(mem->devID, size);
else
XMemFree(mem->devID, dataTMP);
XMemFree(big->devID, dataTMP);
}
}
......
......@@ -26,6 +26,8 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
#define STREAMED_MEMCPOPY
/*
transform a tensor by splitting it
e.g., (M, N) -> (M, N/3, 3)
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论