Commit dd7a67bc by xiaotong

remove XStream

parent 158ccc4a
......@@ -42,7 +42,6 @@ XDevManager GDevs;
/* constructor */
XDevice::XDevice()
{
stream = NULL;
isInitialized = false;
Clear();
......@@ -141,8 +140,6 @@ void XDevice::Init(int myDevID)
}
else
sprintf(name2, "GPU-%d %s", devID, name);
stream = new XStream(0, devID);
#endif
}
......@@ -176,10 +173,6 @@ void XDevice::Clear()
curandDestroyGenerator(gen);
isGenReady = false;
}
if (stream != NULL) {
delete stream;
stream = NULL;
}
#endif
isInitialized = false;
}
......@@ -227,17 +220,6 @@ cublasHandle_t * XDevice::GetCublasHandle()
return &cublasHandle;
}
/* get the cuda stream */
cudaStream_t * XDevice::GetCudaStream()
{
if (!isInitialized)
Init(devID);
CheckNTErrors(stream != NULL, "the stream is not initialized!");
return &stream->stream;
}
#endif // USE_CUDA
/* switch to a device */
......@@ -312,13 +294,6 @@ void XDevice::SetFastFlagsAllDevices()
#endif
}
/* delete the default stream for the device */
void XDevice::DelDeviceStream()
{
if(stream != NULL)
delete stream;
}
/* constructor */
XDevManager::XDevManager()
{
......@@ -391,14 +366,6 @@ cublasHandle_t * XDevManager::GetCudaHandle(const int devID)
return GPUs[devID].GetCublasHandle();
}
/* get the stream of a given GPU */
cudaStream_t * XDevManager::GetCudaStream(const int devID)
{
CheckNTErrors(devID < nGPU, "index of GPU is out of range.");
return GPUs[devID].GetCudaStream();
}
#endif
/*
......@@ -620,16 +587,5 @@ char * XDevManager::GetDevString(int devID)
}
}
/* delete the streams for all devices */
void XDevManager::DelDeviceStream()
{
for(int i = 0; i < GDevs.nCPU; i++) {
GDevs.CPUs[i].DelDeviceStream();
}
for(int i = 0; i < GDevs.nGPU; i++) {
GDevs.GPUs[i].DelDeviceStream();
}
}
} /* end of the nts (NiuTrans.Tensor) namespace */
......@@ -25,7 +25,6 @@
#define __XDEVICE_H__
#include "XThread.h"
#include "XStream.h"
#ifdef USE_CUDA
......@@ -97,9 +96,6 @@ public:
/* specify whether Unified Virtual Address Space (UVA) is supported */
bool isUVASupported;
/* default stream for the device */
XStream * stream;
/* seed for random number generation */
int seed;
......@@ -140,9 +136,6 @@ public:
#ifdef USE_CUDA
/* get cublas handle */
cublasHandle_t * GetCublasHandle();
/* get the cuda stream */
cudaStream_t * GetCudaStream();
#endif
/* switch to a device */
......@@ -164,9 +157,6 @@ public:
/* reset cuda flag for more efficient cuda execution (all devices) */
static
void SetFastFlagsAllDevices();
/* delete the default stream for the device (call it before deleting the XDevice) */
void DelDeviceStream();
};
/*
......@@ -206,9 +196,6 @@ public:
#ifdef USE_CUDA
/* get the handle of GPU */
cublasHandle_t * GetCudaHandle(const int devID);
/* get the cuda stream */
cudaStream_t * GetCudaStream(const int devID);
#endif
/* get grid and block sizes that maximize potential */
......@@ -228,10 +215,6 @@ public:
/* get the device information in string */
char * GetDevString(int devID);
/* delete the streams for all devices */
static
void DelDeviceStream();
};
/* managing the devices */
......
......@@ -70,9 +70,6 @@ XQueue::XQueue(int mySize)
jobDequeuerArgs = new TensorList(1);
jobDequeuerBreak = false;
runningJobCount = 0;
jobStream = NULL;
jobStream1 = NULL;
jobStream2 = NULL;
MUTEX_INIT(enqueueMutex);
MUTEX_INIT(dequeueMutex);
......@@ -85,9 +82,6 @@ XQueue::~XQueue()
{
delete[] queue;
delete jobDequeuerArgs;
delete jobStream;
delete jobStream1;
delete jobStream2;
//if(isJobQueue)
// StopJobConsumer();
......@@ -160,19 +154,6 @@ void XQueue::WaitForEmptyJobQueue()
while(runningJobCount > 0){
XSleep(10);
}
if(jobStream != NULL){
CheckNTErrors((jobStream->IsFinished()), "Unfinished jobs remain!");
jobStream->Clear();
}
if(jobStream1 != NULL){
CheckNTErrors((jobStream1->IsFinished()), "Unfinished jobs remain!");
jobStream1->Clear();
}
if(jobStream2 != NULL){
CheckNTErrors((jobStream2->IsFinished()), "Unfinished jobs remain!");
jobStream2->Clear();
}
}
int devids[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
......@@ -268,31 +249,4 @@ bool XQueue::GetJobBreak()
return jobDequeuerBreak;
}
/* get job stream */
XStream * XQueue::GetJobStream(int n)
{
if(n == 0)
return jobStream;
else if(n == 1)
return jobStream1;
else if(n == 2)
return jobStream2;
else{
ShowNTErrors("invalid stream id!");
}
return NULL;
}
/* make job streams */
void XQueue::MakeJobStreams(int devID, int devID1, int devID2)
{
if(devID != INVALID_DEVICE_ID)
jobStream = new XStream(0, devID);
if(devID1 != INVALID_DEVICE_ID)
jobStream1 = new XStream(0, devID1);
if(devID2 != INVALID_DEVICE_ID)
jobStream2 = new XStream(0, devID2);
}
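For reference, the deleted queue API was driven roughly like this; a hypothetical usage sketch (device ids assumed, CUDA build assumed):

/* hypothetical usage of the removed job-stream API, for reference only */
XQueue q;
q.MakeJobStreams(0, 1);           /* streams on devices 0 and 1 */
XStream * js = q.GetJobStream(1); /* the stream bound to device 1 */
js->StreamSynchronize();          /* wait for its queued jobs */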
} /* end of the nts (NiuTrans.Tensor) namespace */
......@@ -33,7 +33,6 @@
#include "XGlobal.h"
#include "XThread.h"
#include "XStream.h"
#include "XDevice.h"
#include "XList.h"
......@@ -110,11 +109,6 @@ private:
/* running job count */
int runningJobCount;
/* job streams (we think that three streams are enough :)) */
XStream * jobStream;
XStream * jobStream1;
XStream * jobStream2;
public:
/* constructor */
XQueue(int mySize = MAX_QUEUE_SIZE);
......@@ -149,12 +143,6 @@ public:
/* get the break flag */
bool GetJobBreak();
/* get job stream */
XStream * GetJobStream(int n = 0);
/* make job streams */
void MakeJobStreams(int devID = INVALID_DEVICE_ID, int devID1 = INVALID_DEVICE_ID, int devID2 = INVALID_DEVICE_ID);
};
} /* end of the nts (NiuTrans.Tensor) namespace */
......
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
* This is for streaming (on GPU), i.e., running jobs in different streams
* for asynchronous GPU execution.
*
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2016-03-09
*
*/
#include "stdio.h"
#include "stdlib.h"
#include "XGlobal.h"
#include "XStream.h"
#include "XDevice.h"
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
/*
This class defines the stream used in pipelining jobs. E.g., one can put
a sequence of jobs in a stream and asynchronously do something else. Basically
we can use multiple streams to hide the data transfer cost on GPUs by
overlapping jobs.
*/
/* constructor */
XStream::XStream(int priority, int myDevID, int myMaxEventNum)
{
devID = myDevID;
#ifdef USE_CUDA
if(myDevID >= 0){
int backupDevID = XDevice::GetGPUDevice();
XDevice::SetGPUDevice(myDevID);
events = new cudaEvent_t[myMaxEventNum];
XDevice::SetGPUDevice(backupDevID);
maxEventNum = myMaxEventNum;
usedEventNum = 0;
}
else{
maxEventNum = 0;
usedEventNum = 0;
}
#endif
Create(priority, devID);
}
/* destructor */
XStream::~XStream()
{
Destroy();
#ifdef USE_CUDA
delete[] events;
#endif
}
/* create the stream */
void XStream::Create(int priority, int myDevID)
{
if(myDevID < 0)
return;
#ifdef USE_CUDA
int backupDevID = XDevice::GetGPUDevice();
XDevice::SetGPUDevice(myDevID);
//cudaStreamCreateWithPriority(&stream, cudaStreamDefault, priority);
CheckNTErrors((cudaStreamCreate(&stream) == cudaSuccess),
"cannot create the cuda stream!");
XDevice::SetGPUDevice(backupDevID);
#endif
devID = myDevID;
}
/* destroy the stream */
void XStream::Destroy()
{
if(devID < 0)
return;
#ifdef USE_CUDA
int backupDevID = XDevice::GetGPUDevice();
XDevice::SetGPUDevice(devID);
cudaStreamDestroy(stream);
XDevice::SetGPUDevice(backupDevID);
Clear();
#endif
}
/* clear it */
void XStream::Clear()
{
#ifdef USE_CUDA
int backupDevID = XDevice::GetGPUDevice();
XDevice::SetGPUDevice(devID);
for(int i = 0; i < usedEventNum; i++){
cudaEventDestroy(events[i]);
}
usedEventNum = 0;
XDevice::SetGPUDevice(backupDevID);
#endif
}
/* judge if all the jobs in the stream have finished */
bool XStream::IsFinished()
{
#ifdef USE_CUDA
/* cudaStreamQuery returns cudaSuccess once all queued work has completed */
return cudaStreamQuery(stream) == cudaSuccess;
#else
return true;
#endif
}
/* wait until all jobs in the stream have finished */
void XStream::StreamSynchronize()
{
#ifdef USE_CUDA
int devIDBackup = XDevice::GetGPUDevice();
if(devID != devIDBackup)
XDevice::SetGPUDevice(devID);
cudaStreamSynchronize(stream);
if(devID != devIDBackup)
XDevice::SetGPUDevice(devIDBackup);
#endif
}
/* synchronize the host thread with the device (legacy cudaThreadSynchronize) */
void XStream::ThreadSynchronize()
{
#ifdef USE_CUDA
#if CUDART_VERSION < 10000
cudaThreadSynchronize();
#else
ShowNTErrors("TODO!");
#endif
#endif
}
/* wait until all jobs on the given device have finished */
void XStream::DeviceSynchronize(int devID)
{
#ifdef USE_CUDA
int devIDBackup = XDevice::GetGPUDevice();
if(devID != devIDBackup)
XDevice::SetGPUDevice(devID);
cudaDeviceSynchronize();
if(devID != devIDBackup)
XDevice::SetGPUDevice(devIDBackup);
#endif
}
/* make a dependency between two streams, i.e., the current stream must wait until the last job in another stream has finished */
void XStream::MakeDependency(XStream * precedingStream)
{
#ifdef USE_CUDA
cudaEvent_t * e = precedingStream->MakeEvent();
cudaEventRecord(*e, precedingStream->stream);
cudaStreamWaitEvent(stream, *e, 0);
#endif
}
/* get the stream */
#ifdef USE_CUDA
inline cudaStream_t * XStream::Get()
{
return &stream;
}
/* make an event */
inline cudaEvent_t * XStream::MakeEvent()
{
int backupDevID = XDevice::GetGPUDevice();
XDevice::SetGPUDevice(devID);
CheckNTErrors((usedEventNum < maxEventNum), "Too many events are required!");
cudaEvent_t * e = events + usedEventNum++;
cudaEventCreate(e);
XDevice::SetGPUDevice(backupDevID);
return e;
}
#endif
} /* end of the nts (NiuTrans.Tensor) namespace */
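For readers tracking this removal: the dependency mechanism that XStream::MakeDependency wrapped maps onto four CUDA runtime calls. A minimal sketch, assuming two valid streams s1 and s2 (hypothetical names, error checking omitted):

/* make s2 wait for all work queued in s1 so far; this mirrors what
   XStream::MakeDependency did with its event pool */
cudaEvent_t e;
cudaEventCreateWithFlags(&e, cudaEventDisableTiming); /* no timing needed */
cudaEventRecord(e, s1);        /* mark the current tail of s1 */
cudaStreamWaitEvent(s2, e, 0); /* s2 blocks until the event fires */
cudaEventDestroy(e);           /* cleanup is deferred until the event completes */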
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
* This is for streaming (on GPU), i.e., running jobs in different streams
* for asynchronous GPU execution.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2016-03-09
*
*/
#ifndef __XSTREAM_H__
#define __XSTREAM_H__
/* the CUDA stuff */
#ifdef USE_CUDA
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>
#endif
/* the nts (NiuTrans.Tensor) namespace */
namespace nts{
#define MAX_CUDA_EVENT_NUM_IN_A_STREAM 128
/*
This class defines the stream used in pipelining jobs. E.g., one can put
a sequence of jobs in a stream and asynchronously do something else. Basically
we can use multiple streams to hide the data transfer cost on GPUs by
overlapping jobs.
*/
class XStream
{
public:
#ifdef USE_CUDA
/* the cuda stream */
cudaStream_t stream;
/* list of cuda events for synchronizing different streams */
cudaEvent_t * events;
/* max number of the events */
int maxEventNum;
/* number of used events */
int usedEventNum;
#else
/* virtual pointer */
void * stream;
#endif
/* device that holds the stream */
int devID;
public:
/* constructor */
XStream(int priority = 0, int devID = 0, int maxEventNum = MAX_CUDA_EVENT_NUM_IN_A_STREAM);
/* destructor */
~XStream();
/* create the stream */
void Create(int priority = 0, int devID = 0);
/* destroy the stream */
void Destroy();
/* clear it */
void Clear();
/* judge if all the jobs in the stream have finished */
bool IsFinished();
/* stream synchronize */
void StreamSynchronize();
/* thread synchronize */
static
void ThreadSynchronize();
/* device synchronize */
static
void DeviceSynchronize(int devID);
/* make a dependency between two streams, i.e., the current stream must wait until the last job in another stream has finished */
void MakeDependency(XStream * precedingStream);
#ifdef USE_CUDA
/* get the stream */
cudaStream_t * Get();
/* make an event */
cudaEvent_t * MakeEvent();
#endif
};
} /* end of the nts (NiuTrans.Tensor) namespace */
#endif
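For the record, the class deleted above was driven like this; a minimal usage sketch, assuming a CUDA build and a valid device 0:

/* hypothetical usage of the removed XStream class, for reference only */
XStream s(0 /* priority */, 0 /* devID */);
/* enqueue asynchronous work on *s.Get() via CUDA or cuBLAS calls, then: */
if(!s.IsFinished())
    s.StreamSynchronize(); /* block until the queued jobs complete */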
......@@ -772,10 +772,9 @@ MTYPE XTensor::GetOffset3D(int d0, int d1, int d2) const
}
/*
a vector with all entries of 0
>> stream - stream for the job pipeline
a tensor with all entries of 0
*/
void XTensor::SetZeroAll(XStream* stream)
void XTensor::SetZeroAll()
{
if(data == NULL)
return;
......@@ -788,12 +787,7 @@ void XTensor::SetZeroAll(XStream* stream)
int devIDBackup = 0;
cudaGetDevice(&devIDBackup);
cudaSetDevice(devID);
if(stream == NULL)
cudaMemset(data, 0, size);
else
cudaMemsetAsync(data, 0, size, stream->stream);
cudaMemset(data, 0, size);
cudaSetDevice(devIDBackup);
#endif
}
......@@ -807,13 +801,8 @@ void XTensor::SetZeroAll(XStream* stream)
#ifdef USE_CUDA
int devIDBackup = 0;
cudaGetDevice(&devIDBackup);
cudaSetDevice(devID);
if(stream == NULL)
cudaMemset(data, 0, unitNum * unitSize);
else
cudaMemsetAsync(data, 0, unitNum * unitSize, stream->stream);
cudaSetDevice(devID);
cudaMemset(data, 0, unitNum * unitSize);
cudaSetDevice(devIDBackup);
#endif
}
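With the stream argument gone, SetZeroAll always issues a plain cudaMemset. A caller that still wants the zeroing to overlap with other work can manage its own stream; a hedged sketch, where x is a hypothetical XTensor*:

/* hypothetical caller-side replacement for SetZeroAll(stream) */
cudaSetDevice(x->devID); /* select the tensor's GPU */
cudaStream_t zs;
cudaStreamCreate(&zs);
cudaMemsetAsync(x->data, 0, x->unitNum * x->unitSize, zs);
/* ... enqueue work that depends on the zeroed buffer on zs ... */
cudaStreamSynchronize(zs);
cudaStreamDestroy(zs);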
......
......@@ -31,7 +31,6 @@
#include <math.h>
#include "XGlobal.h"
#include "XPRunner.h"
#include "XStream.h"
#include "XHeap.h"
#include "XList.h"
#include "XDataType.h"
......@@ -303,7 +302,7 @@ public:
MTYPE GetOffset3D(int d0, int d1, int d2) const;
/* a tensor with all entries of 0 */
void SetZeroAll(XStream * stream = NULL);
void SetZeroAll();
/* set the tensor with a data array */
void SetData(const void * d, int num, int beg = 0);
......
......@@ -311,44 +311,6 @@ void XMemCopy2D(void * t, size_t tPitch, int devIDT, const void * s, size_t sPit
#endif
}
void XMemCopy2DAsync(void * t, size_t tPitch, int devIDT, const void * s, size_t sPitch, int devIDS, size_t mSize, int n, XStream * stream)
{
if (t == s)
return;
if (devIDT < 0 && devIDS < 0) {
for(int i = 0; i < n; i++)
memcpy((char*)t + tPitch * i, (char*)s + sPitch * i, mSize);
return;
}
#ifdef USE_CUDA
else{
CheckNTErrors(stream != NULL, "No stream found!");
cudaStream_t &cstream = stream->stream;
if (devIDT >= 0 && devIDS < 0) {
cudaError_t error = cudaMemcpy2DAsync(t, tPitch, s, sPitch, mSize, n, cudaMemcpyHostToDevice, cstream);
if(error != cudaSuccess){
ShowNTErrors("cudaMemcpy2D error (cudaMemcpyHostToDevice)");
}
}
else if (devIDT < 0 && devIDS >= 0) {
cudaError_t error = cudaMemcpy2DAsync(t, tPitch, s, sPitch, mSize, n, cudaMemcpyDeviceToHost, cstream);
if(error != cudaSuccess){
ShowNTErrors("cudaMemcpy error (cudaMemcpyDeviceToHost)");
}
}
else {
cudaError_t error = cudaMemcpy2DAsync(t, tPitch, s, sPitch, mSize, n, cudaMemcpyDeviceToDevice, cstream);
if (error != cudaSuccess) {
ShowNTErrors("cudaMemcpy error (cudaMemcpyDeviceToDevice)");
}
}
}
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
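Code that relied on the removed wrapper can call cudaMemcpy2DAsync directly; a minimal sketch for the device-to-device case, reusing the wrapper's parameter names and assuming a valid stream cs (the host/device cases differ only in the cudaMemcpyKind flag):

/* hypothetical direct replacement for one case of XMemCopy2DAsync */
cudaError_t err = cudaMemcpy2DAsync(t, tPitch, s, sPitch, mSize, n,
                                    cudaMemcpyDeviceToDevice, cs);
CheckNTErrors(err == cudaSuccess, "cudaMemcpy2DAsync error (cudaMemcpyDeviceToDevice)");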
void * XMemAlloc(int devID, size_t size)
{
void * p = NULL;
......
......@@ -42,7 +42,6 @@ extern void XMemSet(void * p, int value, size_t size);
extern void XMemSet(int devID, void * p, int value, size_t size);
extern void XMemCopy(void * t, int devIDT, const void * s, int devIDS, size_t size);
extern void XMemCopy2D(void * t, size_t tPitch, int devIDT, const void * s, size_t sPitch, int devIDS, size_t mSize, int n);
extern void XMemCopy2DAsync(void * t, size_t tPitch, int devIDT, const void * s, size_t sPitch, int devIDS, size_t mSize, int n, XStream * stream);
extern void * XMemAlloc(int devID, size_t size);
extern void * XMemAllocOnDev(int devID, size_t size);
extern void XMemFree(int devID, void * p);
......
......@@ -42,12 +42,11 @@ where trans() return the transposed matrix if the flag is fired
>> alpha - a coefficient
>> beta - another coefficient
>> parallelRunner - parallel processing module
>> stream - the stream for creating the job pipeline
*/
void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta,
XPRunner * parallelRunner, XStream * stream)
XPRunner * parallelRunner)
{
CheckNTErrors((a && b && c), "Empty input tensors!");
CheckNTErrors((a->dataType == b->dataType), "Input tensors should have the same data type!");
......@@ -69,7 +68,7 @@ void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
#ifdef USE_CUDA
if (a->devID >= 0 || b->devID >= 0 || c->devID >= 0) {
_CudaMatrixMul2D(a, transposedA, b, transposedB, c, alpha, beta, stream);
_CudaMatrixMul2D(a, transposedA, b, transposedB, c, alpha, beta);
return;
}
#endif
......
......@@ -119,11 +119,10 @@ where trans() return the transposed matrix if the flag is fired
>> c - where we put a*b
>> alpha - a coefficient
>> beta - another coefficient
>> stream - the stream for creating the job pipeline
*/
void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta, XStream * stream)
XTensor * c, DTYPE alpha, DTYPE beta)
{
int an = transposedA == X_TRANS ? a->dimSize[1] : a->dimSize[0];
int am = transposedA == X_TRANS ? a->dimSize[0] : a->dimSize[1];
......@@ -152,10 +151,6 @@ void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
cublasHandle_t * handle = a->mem == NULL ? GDevs.GetCudaHandle(a->devID) : a->mem->GetCublasHandle();
/* !!!! might have problems */
if (stream != NULL)
cublasSetStream(*handle, stream->stream);
if (beta == 0)
c->SetZeroAll();
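After this change the GEMM runs on whatever stream the handle already carries, i.e., the default stream unless it was set elsewhere. If per-call streaming is ever needed again, it can be restored locally; a hedged sketch around the GEMM call:

/* hypothetical: bind the handle to a user stream for the GEMM only */
cudaStream_t gs;
cudaStreamCreate(&gs);
cublasSetStream(*handle, gs);
/* ... the cublas<t>gemm call goes here ... */
cublasSetStream(*handle, 0); /* restore the default stream */
cudaStreamSynchronize(gs);   /* wait before releasing the stream */
cudaStreamDestroy(gs);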
......
......@@ -43,7 +43,7 @@ c = trans(a) * trans(b) * alpha + c * beta
where trans() returns the transposed matrix if the flag is fired
*/
void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XStream * stream = NULL);
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
#endif // USE_CUDA
......
......@@ -32,7 +32,7 @@ c = trans(a) * trans(b) * alpha + c * beta
where trans() returns the transposed matrix if the flag is fired
*/
void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL, XStream * stream = NULL);
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0, XPRunner * parallelRunner = NULL);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -32,9 +32,8 @@ copy s to t
>> s - source
>> t - target
>> stream - the stream for creating the job pipeline
*/
void _CopyValues(const XTensor * s, XTensor * t, XStream * stream)
void _CopyValues(const XTensor * s, XTensor * t)
{
if(s->data == NULL && t->data == NULL)
return;
......@@ -55,7 +54,7 @@ void _CopyValues(const XTensor * s, XTensor * t, XStream * stream)
#ifdef USE_CUDA
if (s->devID >= 0 || t->devID >= 0) {
_CudaCopyValues(s, t, stream);
_CudaCopyValues(s, t);
return;
}
#endif
......@@ -82,9 +81,8 @@ copy s to t
>> sLen - length of the segment
>> t - target
>> tBeg - beginning of the segment on the target side
>> stream - the stream for creating the job pipeline
*/
void _CopyValues(const XTensor * s, const int sBeg, const int sLen, XTensor * t, const int tBeg, XStream * stream)
void _CopyValues(const XTensor * s, const int sBeg, const int sLen, XTensor * t, const int tBeg)
{
if(s->data == NULL && t->data == NULL)
return;
......@@ -108,13 +106,12 @@ void _CopyValues(const XTensor * s, const int sBeg, const int sLen, XTensor * t,
/*
copy s to t (rename _CopyValues)
>> s - source
>> t - target
>> stream - the stream for creating the job pipeline
>> s - source
>> t - target
*/
void CopyValues(const XTensor &s, XTensor &t, XStream * stream)
void CopyValues(const XTensor &s, XTensor &t)
{
_CopyValues(&s, &t, stream);
_CopyValues(&s, &t);
}
/*
......@@ -122,16 +119,15 @@ copy s to t (return an XTensor structure)
make a new tensor to keep the result and return it
>> s - source
>> stream - the stream for creating the job pipeline
<< return - the copied tensor t
*/
XTensor CopyValues(const XTensor &s, XStream * stream)
XTensor CopyValues(const XTensor &s)
{
XTensor t(&s);
t.SetTMPFlag();
/* call _CopyValues function */
_CopyValues(&s, &t, stream);
_CopyValues(&s, &t);
/* tensor connection */
if (s.enableGrad) {
......
......@@ -32,10 +32,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
copy all elements from a source tensor to a target tensor
>> s - source tensor
>> t - target tensor
>> stream - the stream for creating the job pipeline
*/
void _CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream)
void _CudaCopyValues(const XTensor * s, XTensor * t)
{
CheckNTErrors(s != NULL && t != NULL, "The input tensor and output tensor must be nonempty!");
CheckNTErrors(s->dataType == t->dataType, "Unmatched data type!");
......@@ -45,10 +44,7 @@ void _CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream)
/* dense -> dense */
if (!s->isSparse && !t->isSparse) {
if (stream == NULL)
XMemCopy(t->data, t->devID, s->data, s->devID, s->unitSize * s->unitNum);
else
XMemCopyAsync(t->data, t->devID, s->data, s->devID, s->unitSize * s->unitNum, stream->stream, stream->devID);
XMemCopy(t->data, t->devID, s->data, s->devID, s->unitSize * s->unitNum);
}
/* dense -> sparse */
else if (!s->isSparse && t->isSparse &&
......@@ -72,11 +68,8 @@ void _CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream)
int num = s->unitNumNonZero;
int size = sizeof(int) + num * (s->unitSize + sizeof(int));
if (stream == NULL)
XMemCopy(t->data, t->devID, s->data, s->devID, size);
else
XMemCopyAsync(t->data, t->devID, s->data, s->devID, size, stream->stream, stream->devID);
XMemCopy(t->data, t->devID, s->data, s->devID, size);
t->unitNumNonZero = num;
}
else {
......
......@@ -29,7 +29,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* copy all elements from a source matrix to a target matrix */
void _CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream = NULL);
void _CudaCopyValues(const XTensor * s, XTensor * t);
#endif // USE_CUDA
......
......@@ -27,19 +27,19 @@
namespace nts { // namespace nts(NiuTrans.Tensor)
/* copy s to t */
void _CopyValues(const XTensor * s, XTensor * t, XStream * stream = NULL);
void _CopyValues(const XTensor * s, XTensor * t);
/* copy a segment of s to t */
void _CopyValues(const XTensor * s, const int sBeg, const int sLen, XTensor * t, const int tBeg, XStream * stream = NULL);
void _CopyValues(const XTensor * s, const int sBeg, const int sLen, XTensor * t, const int tBeg);
/* copy s to t (rename _CopyValues) */
void CopyValues(const XTensor &s, XTensor &t, XStream * stream = NULL);
void CopyValues(const XTensor &s, XTensor &t);
/*
copy s to t (return an XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor CopyValues(const XTensor &s, XStream * stream = NULL);
XTensor CopyValues(const XTensor &s);
} // namespace nts(NiuTrans.Tensor)
......
......@@ -96,25 +96,11 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
}
}
else{
#ifdef USE_CUDA
#ifdef STREAMED_MEMCPOPY
XStream * stream = GDevs.GPUs[t->devID].stream;
for (int k = 0; k < splitNum; k++) {
XMemCopy2DAsync((char*)t->data + k * tStep, tPitch, t->devID,
(char*)s->data + k * sStep, sPitch, s->devID,
mSize, n, stream);
}
stream->StreamSynchronize();
#else
for (int k = 0; k < splitNum; k++) {
XMemCopy2D((char*)t->data + k * tStep, tPitch, t->devID,
(char*)s->data + k * sStep, sPitch, s->devID,
mSize, n);
}
#endif
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
}
else {
......@@ -321,27 +307,12 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli
}
}
else{
#ifdef USE_CUDA
#ifdef STREAMED_MEMCPOPY
XStream * stream = GDevs.GPUs[big->devID].stream;
for (int k = 0; k < splitNum; k++) {
XTensor * t = (XTensor*)smalls->GetItem(k);
XMemCopy2DAsync((char*)t->data + k * tStep, tPitch, t->devID,
(char*)big->data + k * sStep, sPitch, big->devID,
mSize, n, stream);
}
stream->StreamSynchronize();
#else
for (int k = 0; k < splitNum; k++) {
XTensor * t = (XTensor*)smalls->GetItem(k);
XMemCopy2D((char*)t->data + k * tStep, tPitch, t->devID,
(char*)big->data + k * sStep, sPitch, big->devID,
mSize, n);
}
#endif
#else
ShowNTErrors("Please specify USE_CUDA and recompile the code!");
#endif
}
}
/* splitting with fewer kernel/api calls??? (i'm not sure about it!! may remove this later) */
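The synchronous loops above issue one blocking copy per split part. If the old overlap ever matters again for large splitNum, the removed STREAMED_MEMCPOPY path can be re-created with a bare cudaStream_t; a sketch using the first branch's variable names and assuming both tensors live on the same GPU:

/* hypothetical: queue all slice copies on one stream, synchronize once */
cudaStream_t cs;
cudaStreamCreate(&cs);
for (int k = 0; k < splitNum; k++)
    cudaMemcpy2DAsync((char*)t->data + k * tStep, tPitch,
                      (char*)s->data + k * sStep, sPitch,
                      mSize, n, cudaMemcpyDeviceToDevice, cs);
cudaStreamSynchronize(cs);
cudaStreamDestroy(cs);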
......