Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
N
NiuTrans.Tensor
概览
Overview
Details
Activity
Cycle Analytics
版本库
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
问题
8
Issues
8
列表
Board
标记
里程碑
合并请求
0
Merge Requests
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
Snippets
成员
Collapse sidebar
Close sidebar
活动
图像
聊天
创建新问题
作业
提交
Issue Boards
Open sidebar
NiuTrans
NiuTrans.Tensor
Commits
dd7a67bc
Commit
dd7a67bc
authored
Feb 23, 2021
by
xiaotong
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
remove XStream
parent
158ccc4a
隐藏空白字符变更
内嵌
并排
正在显示
19 个修改的文件
包含
29 行增加
和
564 行删除
+29
-564
source/tensor/XDevice.cpp
+0
-44
source/tensor/XDevice.h
+0
-17
source/tensor/XQueue.cpp
+0
-46
source/tensor/XQueue.h
+0
-12
source/tensor/XStream.cpp
+0
-203
source/tensor/XStream.h
+0
-116
source/tensor/XTensor.cpp
+5
-16
source/tensor/XTensor.h
+1
-2
source/tensor/XUtility.cpp
+0
-38
source/tensor/XUtility.h
+0
-1
source/tensor/core/arithmetic/MatrixMul2D.cpp
+2
-3
source/tensor/core/arithmetic/MatrixMul2D.cu
+1
-6
source/tensor/core/arithmetic/MatrixMul2D.cuh
+1
-1
source/tensor/core/arithmetic/MatrixMul2D.h
+1
-1
source/tensor/core/movement/CopyValues.cpp
+9
-13
source/tensor/core/movement/CopyValues.cu
+4
-11
source/tensor/core/movement/CopyValues.cuh
+1
-1
source/tensor/core/movement/CopyValues.h
+4
-4
source/tensor/core/shape/Split.cpp
+0
-29
没有找到文件。
source/tensor/XDevice.cpp
查看文件 @
dd7a67bc
...
...
@@ -42,7 +42,6 @@ XDevManager GDevs;
/* constructor */
XDevice
::
XDevice
()
{
stream
=
NULL
;
isInitialized
=
false
;
Clear
();
...
...
@@ -141,8 +140,6 @@ void XDevice::Init(int myDevID)
}
else
sprintf
(
name2
,
"GPU-%d %s"
,
devID
,
name
);
stream
=
new
XStream
(
0
,
devID
);
#endif
}
...
...
@@ -176,10 +173,6 @@ void XDevice::Clear()
curandDestroyGenerator
(
gen
);
isGenReady
=
false
;
}
if
(
stream
!=
NULL
)
{
delete
stream
;
stream
=
NULL
;
}
#endif
isInitialized
=
false
;
}
...
...
@@ -227,17 +220,6 @@ cublasHandle_t * XDevice::GetCublasHandle()
return
&
cublasHandle
;
}
/* get the stream of cuda */
cudaStream_t
*
XDevice
::
GetCudaStream
()
{
if
(
!
isInitialized
)
Init
(
devID
);
CheckNTErrors
(
stream
!=
NULL
,
"the stream is not initialized!"
);
return
&
stream
->
stream
;
}
#endif // USE_CUDA
/* switch to a device */
...
...
@@ -312,13 +294,6 @@ void XDevice::SetFastFlagsAllDevices()
#endif
}
/* delete the default stream for the device */
void
XDevice
::
DelDeviceStream
()
{
if
(
stream
!=
NULL
)
delete
stream
;
}
/* constructor */
XDevManager
::
XDevManager
()
{
...
...
@@ -391,14 +366,6 @@ cublasHandle_t * XDevManager::GetCudaHandle(const int devID)
return
GPUs
[
devID
].
GetCublasHandle
();
}
/* get the stream of a given GPU */
cudaStream_t
*
XDevManager
::
GetCudaStream
(
const
int
devID
)
{
CheckNTErrors
(
devID
<
nGPU
,
"index of GPU is out of range."
);
return
GPUs
[
devID
].
GetCudaStream
();
}
#endif
/*
...
...
@@ -620,16 +587,5 @@ char * XDevManager::GetDevString(int devID)
}
}
/* delete the streams for all devices */
void
XDevManager
::
DelDeviceStream
()
{
for
(
int
i
=
0
;
i
<
GDevs
.
nCPU
;
i
++
)
{
GDevs
.
CPUs
[
i
].
DelDeviceStream
();
}
for
(
int
i
=
0
;
i
<
GDevs
.
nGPU
;
i
++
)
{
GDevs
.
GPUs
[
i
].
DelDeviceStream
();
}
}
}
/* end of the nts (NiuTrans.Tensor) namespace */
source/tensor/XDevice.h
查看文件 @
dd7a67bc
...
...
@@ -25,7 +25,6 @@
#define __XDEVICE_H__
#include "XThread.h"
#include "XStream.h"
#ifdef USE_CUDA
...
...
@@ -97,9 +96,6 @@ public:
/* specify whether Unified Virtual Address Space (UVA) is supported */
bool
isUVASupported
;
/* default stream for the device */
XStream
*
stream
;
/* seed for random number generation */
int
seed
;
...
...
@@ -140,9 +136,6 @@ public:
#ifdef USE_CUDA
/* get cublas handle */
cublasHandle_t
*
GetCublasHandle
();
/* get the stream of cuda */
cudaStream_t
*
GetCudaStream
();
#endif
/* switch to a device */
...
...
@@ -164,9 +157,6 @@ public:
/* reset cuda flag for more efficient cuda execution (all devices) */
static
void
SetFastFlagsAllDevices
();
/* delete the default stream for the device (call it before deleting the XDevice) */
void
DelDeviceStream
();
};
/*
...
...
@@ -206,9 +196,6 @@ public:
#ifdef USE_CUDA
/* get the handle of GPU */
cublasHandle_t
*
GetCudaHandle
(
const
int
devID
);
/* get the stream of cuda */
cudaStream_t
*
GetCudaStream
(
const
int
devID
);
#endif
/* get grid and block sizes that max potential */
...
...
@@ -228,10 +215,6 @@ public:
/* get the device information in string */
char
*
GetDevString
(
int
devID
);
/* delete the streams for all devices */
static
void
DelDeviceStream
();
};
/* managing the devices */
...
...
source/tensor/XQueue.cpp
查看文件 @
dd7a67bc
...
...
@@ -70,9 +70,6 @@ XQueue::XQueue(int mySize)
jobDequeuerArgs
=
new
TensorList
(
1
);
jobDequeuerBreak
=
false
;
runningJobCount
=
0
;
jobStream
=
NULL
;
jobStream1
=
NULL
;
jobStream2
=
NULL
;
MUTEX_INIT
(
enqueueMutex
);
MUTEX_INIT
(
dequeueMutex
);
...
...
@@ -85,9 +82,6 @@ XQueue::~XQueue()
{
delete
[]
queue
;
delete
jobDequeuerArgs
;
delete
jobStream
;
delete
jobStream1
;
delete
jobStream2
;
//if(isJobQueue)
// StopJobConsumer();
...
...
@@ -160,19 +154,6 @@ void XQueue::WaitForEmptyJobQueue()
while
(
runningJobCount
>
0
){
XSleep
(
10
);
}
if
(
jobStream
!=
NULL
){
CheckNTErrors
((
jobStream
->
IsFinished
()),
"None fineished jobs remain"
);
jobStream
->
Clear
();
}
if
(
jobStream1
!=
NULL
){
CheckNTErrors
((
jobStream1
->
IsFinished
()),
"None fineished jobs remain"
);
jobStream1
->
Clear
();
}
if
(
jobStream2
!=
NULL
){
CheckNTErrors
((
jobStream2
->
IsFinished
()),
"None fineished jobs remain"
);
jobStream2
->
Clear
();
}
}
int
devids
[
16
]
=
{
0
,
1
,
2
,
3
,
4
,
5
,
6
,
7
,
8
,
9
,
10
,
11
,
12
,
13
,
14
,
15
};
...
...
@@ -268,31 +249,4 @@ bool XQueue::GetJobBreak()
return
jobDequeuerBreak
;
}
/* get job stream */
XStream
*
XQueue
::
GetJobStream
(
int
n
)
{
if
(
n
==
0
)
return
jobStream
;
else
if
(
n
==
1
)
return
jobStream1
;
else
if
(
n
==
2
)
return
jobStream2
;
else
{
ShowNTErrors
(
"invalid stream id!"
);
}
return
NULL
;
}
/* make job streams */
void
XQueue
::
MakeJobStreams
(
int
devID
,
int
devID1
,
int
devID2
)
{
if
(
devID
!=
INVALID_DEVICE_ID
)
jobStream
=
new
XStream
(
0
,
devID
);
if
(
devID1
!=
INVALID_DEVICE_ID
)
jobStream1
=
new
XStream
(
0
,
devID1
);
if
(
devID2
!=
INVALID_DEVICE_ID
)
jobStream2
=
new
XStream
(
0
,
devID2
);
}
}
/* end of the nts (NiuTrans.Tensor) namespace */
source/tensor/XQueue.h
查看文件 @
dd7a67bc
...
...
@@ -33,7 +33,6 @@
#include "XGlobal.h"
#include "XThread.h"
#include "XStream.h"
#include "XDevice.h"
#include "XList.h"
...
...
@@ -110,11 +109,6 @@ private:
/* running job count */
int
runningJobCount
;
/* job streams (we think that three streams is enough :)) */
XStream
*
jobStream
;
XStream
*
jobStream1
;
XStream
*
jobStream2
;
public
:
/* constuctor */
XQueue
(
int
mySize
=
MAX_QUEUE_SIZE
);
...
...
@@ -149,12 +143,6 @@ public:
/* get the break flag */
bool
GetJobBreak
();
/* get job stream */
XStream
*
GetJobStream
(
int
n
=
0
);
/* make job streams */
void
MakeJobStreams
(
int
devID
=
INVALID_DEVICE_ID
,
int
devID1
=
INVALID_DEVICE_ID
,
int
devID2
=
INVALID_DEVICE_ID
);
};
}
/* end of the nts (NiuTrans.Tensor) namespace */
...
...
source/tensor/XStream.cpp
deleted
100644 → 0
查看文件 @
158ccc4a
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
* This is for streaming (on GPU), i.e., run jobs in different stream for
* GPU Async capabilities.
*
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2016-03-09
*
*/
#include "stdio.h"
#include "stdlib.h"
#include "XGlobal.h"
#include "XStream.h"
#include "XDevice.h"
/* the nts (NiuTrans.Tensor) namespace */
namespace
nts
{
/*
This class defines the stream used in pipelining jobs. E.g., one can put
a sequence of jobs in a stream and asynchronously do something else. Basically
we can use multiply streams to hide the data transfer cost on GPUs by using
job overlaps.
*/
/* constructor */
XStream
::
XStream
(
int
priority
,
int
myDevID
,
int
myMaxEventNum
)
{
devID
=
myDevID
;
#ifdef USE_CUDA
if
(
myDevID
>=
0
){
int
backupDevID
=
XDevice
::
GetGPUDevice
();
XDevice
::
SetGPUDevice
(
myDevID
);
events
=
new
cudaEvent_t
[
myMaxEventNum
];
XDevice
::
SetGPUDevice
(
backupDevID
);
maxEventNum
=
myMaxEventNum
;
usedEventNum
=
0
;
}
else
{
maxEventNum
=
0
;
usedEventNum
=
0
;
}
#endif
Create
(
priority
,
devID
);
}
/* deconstructor */
XStream
::~
XStream
()
{
Destroy
();
#ifdef USE_CUDA
delete
[]
events
;
#endif
}
/* create the stream */
void
XStream
::
Create
(
int
priority
,
int
myDevID
)
{
if
(
myDevID
<
0
)
return
;
#ifdef USE_CUDA
int
backupDevID
=
XDevice
::
GetGPUDevice
();
XDevice
::
SetGPUDevice
(
myDevID
);
//cudaStreamCreateWithPriority(&stream, cudaStreamDefault, priority);
CheckNTErrors
((
cudaStreamCreate
(
&
stream
)
==
cudaSuccess
),
"cannot create the cuda stream!"
);
XDevice
::
SetGPUDevice
(
backupDevID
);
#endif
devID
=
myDevID
;
}
/* destroy the stream */
void
XStream
::
Destroy
()
{
if
(
devID
<
0
)
return
;
#ifdef USE_CUDA
int
backupDevID
=
XDevice
::
GetGPUDevice
();
XDevice
::
SetGPUDevice
(
devID
);
cudaStreamDestroy
(
stream
);
XDevice
::
SetGPUDevice
(
backupDevID
);
Clear
();
#endif
}
/* clear it */
void
XStream
::
Clear
()
{
#ifdef USE_CUDA
int
backupDevID
=
XDevice
::
GetGPUDevice
();
XDevice
::
SetGPUDevice
(
devID
);
for
(
int
i
=
0
;
i
<
usedEventNum
;
i
++
){
cudaEventDestroy
(
events
[
i
]);
}
usedEventNum
=
0
;
XDevice
::
SetGPUDevice
(
backupDevID
);
#endif
}
/* judge if all the jobs in the stream have been finished */
bool
XStream
::
IsFinished
()
{
#ifdef USE_CUDA
if
(
cudaStreamQuery
(
stream
)
==
cudaSuccess
)
return
true
;
else
return
false
;
#else
return
true
;
#endif
}
void
XStream
::
StreamSynchronize
()
{
#ifdef USE_CUDA
int
devIDBackup
=
XDevice
::
GetGPUDevice
();
if
(
devID
!=
devIDBackup
)
XDevice
::
SetGPUDevice
(
devID
);
cudaStreamSynchronize
(
stream
);
if
(
devID
!=
devIDBackup
)
XDevice
::
SetGPUDevice
(
devIDBackup
);
#endif
}
void
XStream
::
ThreadSynchronize
()
{
#ifdef USE_CUDA
#if CUDART_VERSION < 10000
cudaThreadSynchronize
();
#else
ShowNTErrors
(
"TODO!"
);
#endif
#endif
}
void
XStream
::
DeviceSynchronize
(
int
devID
)
{
#ifdef USE_CUDA
int
devIDBackup
=
XDevice
::
GetGPUDevice
();
cudaGetDevice
(
&
devIDBackup
);
if
(
devID
!=
devIDBackup
)
XDevice
::
SetGPUDevice
(
devID
);
cudaDeviceSynchronize
();
if
(
devID
!=
devIDBackup
)
XDevice
::
SetGPUDevice
(
devIDBackup
);
#endif
}
/* make a dependency of two streams. i.e., current stream must wait for the last job finished in another stream */
void
XStream
::
MakeDependency
(
XStream
*
precedingStream
)
{
#ifdef USE_CUDA
cudaEvent_t
*
e
=
precedingStream
->
MakeEvent
();
cudaEventRecord
(
*
e
,
precedingStream
->
stream
);
cudaStreamWaitEvent
(
stream
,
*
e
,
0
);
#endif
}
/* get the stream */
#ifdef USE_CUDA
inline
cudaStream_t
*
XStream
::
Get
()
{
return
&
stream
;
}
/* make a event */
inline
cudaEvent_t
*
XStream
::
MakeEvent
()
{
int
backupDevID
=
XDevice
::
GetGPUDevice
();
XDevice
::
SetGPUDevice
(
devID
);
CheckNTErrors
((
usedEventNum
<
maxEventNum
),
"Too many events are required!"
);
cudaEvent_t
*
e
=
events
+
usedEventNum
++
;
cudaEventCreate
(
e
);
XDevice
::
SetGPUDevice
(
backupDevID
);
return
e
;
}
#endif
}
/* end of the nts (NiuTrans.Tensor) namespace */
source/tensor/XStream.h
deleted
100644 → 0
查看文件 @
158ccc4a
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2017, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
*
* This is for streaming (on GPU), i.e., run jobs in different stream for
* GPU Async capabilities.
*
* $Created by: XIAO Tong (xiaotong@mail.neu.edu.cn) 2016-03-09
*
*/
#ifndef __XSTREAM_H__
#define __XSTREAM_H__
/* the CUDA stuff */
#ifdef USE_CUDA
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <cuda_fp16.h>
#endif
/* the nts (NiuTrans.Tensor) namespace */
namespace
nts
{
#define MAX_CUDA_EVENT_NUM_IN_A_STREAM 128
/*
This class defines the stream used in pipelining jobs. E.g., one can put
a sequence of jobs in a stream and asychronously do something else. Basically
we can use multiply streams to hide the data transfer cost on GPUs by using
job overlaps.
*/
class
XStream
{
public
:
#ifdef USE_CUDA
/* the cuda stream */
cudaStream_t
stream
;
/* list of cuda events for synchronize different streams */
cudaEvent_t
*
events
;
/* max number of the events */
int
maxEventNum
;
/* number of used events */
int
usedEventNum
;
#else
/* virtual pointer */
void
*
stream
;
#endif
/* device that holds the stream */
int
devID
;
public
:
/* constructor */
XStream
(
int
priority
=
0
,
int
devID
=
0
,
int
maxEventNum
=
MAX_CUDA_EVENT_NUM_IN_A_STREAM
);
/* deconstructor */
~
XStream
();
/* create the stream */
void
Create
(
int
priority
=
0
,
int
devID
=
0
);
/* destroy the stream */
void
Destroy
();
/* clear it */
void
Clear
();
/* judge if all the jobs in the stream have been finished */
bool
IsFinished
();
/* stream synchronize */
void
StreamSynchronize
();
/* thread synchronize */
static
void
ThreadSynchronize
();
/* device synchronize */
static
void
DeviceSynchronize
(
int
devID
);
/* make a dependency of two streams. i.e., current stream must wait for the last job finished in another stream */
void
MakeDependency
(
XStream
*
precedingStream
);
#ifdef USE_CUDA
/* get the stream */
cudaStream_t
*
Get
();
/* make a event */
cudaEvent_t
*
MakeEvent
();
#endif
};
}
/* end of the nts (NiuTrans.Tensor) namespace */
#endif
source/tensor/XTensor.cpp
查看文件 @
dd7a67bc
...
...
@@ -772,10 +772,9 @@ MTYPE XTensor::GetOffset3D(int d0, int d1, int d2) const
}
/*
a vector with all entries of 0
>> stream - stream for the job pipeline
a tensor with all entries of 0
*/
void
XTensor
::
SetZeroAll
(
XStream
*
stream
)
void
XTensor
::
SetZeroAll
()
{
if
(
data
==
NULL
)
return
;
...
...
@@ -788,12 +787,7 @@ void XTensor::SetZeroAll(XStream* stream)
int
devIDBackup
=
0
;
cudaGetDevice
(
&
devIDBackup
);
cudaSetDevice
(
devID
);
if
(
stream
==
NULL
)
cudaMemset
(
data
,
0
,
size
);
else
cudaMemsetAsync
(
data
,
0
,
size
,
stream
->
stream
);
cudaMemset
(
data
,
0
,
size
);
cudaSetDevice
(
devIDBackup
);
#endif
}
...
...
@@ -807,13 +801,8 @@ void XTensor::SetZeroAll(XStream* stream)
#ifdef USE_CUDA
int
devIDBackup
=
0
;
cudaGetDevice
(
&
devIDBackup
);
cudaSetDevice
(
devID
);
if
(
stream
==
NULL
)
cudaMemset
(
data
,
0
,
unitNum
*
unitSize
);
else
cudaMemsetAsync
(
data
,
0
,
unitNum
*
unitSize
,
stream
->
stream
);
cudaSetDevice
(
devID
);
cudaMemset
(
data
,
0
,
unitNum
*
unitSize
);
cudaSetDevice
(
devIDBackup
);
#endif
}
...
...
source/tensor/XTensor.h
查看文件 @
dd7a67bc
...
...
@@ -31,7 +31,6 @@
#include <math.h>
#include "XGlobal.h"
#include "XPRunner.h"
#include "XStream.h"
#include "XHeap.h"
#include "XList.h"
#include "XDataType.h"
...
...
@@ -303,7 +302,7 @@ public:
MTYPE
GetOffset3D
(
int
d0
,
int
d1
,
int
d2
)
const
;
/* a tensor with all entries of 0 */
void
SetZeroAll
(
XStream
*
stream
=
NULL
);
void
SetZeroAll
();
/* set the tensor with an data array */
void
SetData
(
const
void
*
d
,
int
num
,
int
beg
=
0
);
...
...
source/tensor/XUtility.cpp
查看文件 @
dd7a67bc
...
...
@@ -311,44 +311,6 @@ void XMemCopy2D(void * t, size_t tPitch, int devIDT, const void * s, size_t sPit
#endif
}
void
XMemCopy2DAsync
(
void
*
t
,
size_t
tPitch
,
int
devIDT
,
const
void
*
s
,
size_t
sPitch
,
int
devIDS
,
size_t
mSize
,
int
n
,
XStream
*
stream
)
{
if
(
t
==
s
)
return
;
if
(
devIDT
<
0
&&
devIDS
<
0
)
{
for
(
int
i
=
0
;
i
<
n
;
i
++
)
memcpy
((
char
*
)
t
+
tPitch
*
i
,
(
char
*
)
s
+
sPitch
*
i
,
mSize
);
return
;
}
#ifdef USE_CUDA
else
{
CheckNTErrors
(
stream
!=
NULL
,
"No stream found!"
);
cudaStream_t
&
cstream
=
stream
->
stream
;
if
(
devIDT
>=
0
&&
devIDS
<
0
)
{
cudaError_t
error
=
cudaMemcpy2DAsync
(
t
,
tPitch
,
s
,
sPitch
,
mSize
,
n
,
cudaMemcpyHostToDevice
,
cstream
);
if
(
error
!=
cudaSuccess
){
ShowNTErrors
(
"cudaMemcpy2D error (cudaMemcpyHostToDevice)"
);
}
}
else
if
(
devIDT
<
0
&&
devIDS
>=
0
)
{
cudaError_t
error
=
cudaMemcpy2DAsync
(
t
,
tPitch
,
s
,
sPitch
,
mSize
,
n
,
cudaMemcpyDeviceToHost
,
cstream
);
if
(
error
!=
cudaSuccess
){
ShowNTErrors
(
"cudaMemcpy error (cudaMemcpyDeviceToHost)"
);
}
}
else
{
cudaError_t
error
=
cudaMemcpy2DAsync
(
t
,
tPitch
,
s
,
sPitch
,
mSize
,
n
,
cudaMemcpyDeviceToDevice
,
cstream
);
if
(
error
!=
cudaSuccess
)
{
ShowNTErrors
(
"cudaMemcpy error (cudaMemcpyDeviceToDevice)"
);
}
}
}
#else
ShowNTErrors
(
"Please specify USE_CUDA and recompile the code!"
);
#endif
}
void
*
XMemAlloc
(
int
devID
,
size_t
size
)
{
void
*
p
=
NULL
;
...
...
source/tensor/XUtility.h
查看文件 @
dd7a67bc
...
...
@@ -42,7 +42,6 @@ extern void XMemSet(void * p, int value, size_t size);
extern
void
XMemSet
(
int
devID
,
void
*
p
,
int
value
,
size_t
size
);
extern
void
XMemCopy
(
void
*
t
,
int
devIDT
,
const
void
*
s
,
int
devIDS
,
size_t
size
);
extern
void
XMemCopy2D
(
void
*
t
,
size_t
tPitch
,
int
devIDT
,
const
void
*
s
,
size_t
sPitch
,
int
devIDS
,
size_t
mSize
,
int
n
);
extern
void
XMemCopy2DAsync
(
void
*
t
,
size_t
tPitch
,
int
devIDT
,
const
void
*
s
,
size_t
sPitch
,
int
devIDS
,
size_t
mSize
,
int
n
,
XStream
*
stream
);
extern
void
*
XMemAlloc
(
int
devID
,
size_t
size
);
extern
void
*
XMemAllocOnDev
(
int
devID
,
size_t
size
);
extern
void
XMemFree
(
int
devID
,
void
*
p
);
...
...
source/tensor/core/arithmetic/MatrixMul2D.cpp
查看文件 @
dd7a67bc
...
...
@@ -42,12 +42,11 @@ where trans() return the transposed matrix if the flag is fired
>> alpha - a coefficient
>> beta - another coefficient
>> parallelRunner - parallel processing module
>> stream - the string for creating the job pipeline
*/
void
_MatrixMul2D
(
const
XTensor
*
a
,
MATRIX_TRANS_TYPE
transposedA
,
const
XTensor
*
b
,
MATRIX_TRANS_TYPE
transposedB
,
XTensor
*
c
,
DTYPE
alpha
,
DTYPE
beta
,
XPRunner
*
parallelRunner
,
XStream
*
stream
)
XPRunner
*
parallelRunner
)
{
CheckNTErrors
((
a
&&
b
&&
c
),
"Empty input tensors!"
);
CheckNTErrors
((
a
->
dataType
==
b
->
dataType
),
"Input tensors should have the same data type!"
);
...
...
@@ -69,7 +68,7 @@ void _MatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
#ifdef USE_CUDA
if
(
a
->
devID
>=
0
||
b
->
devID
>=
0
||
c
->
devID
>=
0
)
{
_CudaMatrixMul2D
(
a
,
transposedA
,
b
,
transposedB
,
c
,
alpha
,
beta
,
stream
);
_CudaMatrixMul2D
(
a
,
transposedA
,
b
,
transposedB
,
c
,
alpha
,
beta
);
return
;
}
#endif
...
...
source/tensor/core/arithmetic/MatrixMul2D.cu
查看文件 @
dd7a67bc
...
...
@@ -119,11 +119,10 @@ where trans() return the transposed matrix if the flag is fired
>> c - where we put a*b
>> alpha - a coefficient
>> beta - another coefficient
>> stream - the string for creating the job pipeline
*/
void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
const XTensor * b, MATRIX_TRANS_TYPE transposedB,
XTensor * c, DTYPE alpha, DTYPE beta
, XStream * stream
)
XTensor * c, DTYPE alpha, DTYPE beta)
{
int an = transposedA == X_TRANS ? a->dimSize[1] : a->dimSize[0];
int am = transposedA == X_TRANS ? a->dimSize[0] : a->dimSize[1];
...
...
@@ -152,10 +151,6 @@ void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA,
cublasHandle_t * handle = a->mem == NULL ? GDevs.GetCudaHandle(a->devID) : a->mem->GetCublasHandle();
/* !!!! might have problems */
if (stream != NULL)
cublasSetStream(*handle, stream->stream);
if (beta == 0)
c->SetZeroAll();
...
...
source/tensor/core/arithmetic/MatrixMul2D.cuh
查看文件 @
dd7a67bc
...
...
@@ -43,7 +43,7 @@ c = trans(a) * trans(b) * alpha + c * beta
where trans() return the transposed matrix if the flag is fired
*/
void _CudaMatrixMul2D(const XTensor * a, MATRIX_TRANS_TYPE transposedA, const XTensor * b, MATRIX_TRANS_TYPE transposedB, XTensor * c,
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0
, XStream * stream = NULL
);
DTYPE alpha = (DTYPE)1.0, DTYPE beta = 0);
#endif // USE_CUDA
...
...
source/tensor/core/arithmetic/MatrixMul2D.h
查看文件 @
dd7a67bc
...
...
@@ -32,7 +32,7 @@ c = trans(a) * trans(b) * alpha + c * beta
where trans() return the transposed matrix if the flag is fired
*/
void
_MatrixMul2D
(
const
XTensor
*
a
,
MATRIX_TRANS_TYPE
transposedA
,
const
XTensor
*
b
,
MATRIX_TRANS_TYPE
transposedB
,
XTensor
*
c
,
DTYPE
alpha
=
(
DTYPE
)
1
.
0
,
DTYPE
beta
=
0
,
XPRunner
*
parallelRunner
=
NULL
,
XStream
*
stream
=
NULL
);
DTYPE
alpha
=
(
DTYPE
)
1
.
0
,
DTYPE
beta
=
0
,
XPRunner
*
parallelRunner
=
NULL
);
}
// namespace nts(NiuTrans.Tensor)
...
...
source/tensor/core/movement/CopyValues.cpp
查看文件 @
dd7a67bc
...
...
@@ -32,9 +32,8 @@ copy s to t
>> s - source
>> t - target
>> stream - the stream for creating the job pipeline
*/
void
_CopyValues
(
const
XTensor
*
s
,
XTensor
*
t
,
XStream
*
stream
)
void
_CopyValues
(
const
XTensor
*
s
,
XTensor
*
t
)
{
if
(
s
->
data
==
NULL
&&
t
->
data
==
NULL
)
return
;
...
...
@@ -55,7 +54,7 @@ void _CopyValues(const XTensor * s, XTensor * t, XStream * stream)
#ifdef USE_CUDA
if
(
s
->
devID
>=
0
||
t
->
devID
>=
0
)
{
_CudaCopyValues
(
s
,
t
,
stream
);
_CudaCopyValues
(
s
,
t
);
return
;
}
#endif
...
...
@@ -82,9 +81,8 @@ copy s to t
>> sLen - length of the segment
>> t - target
>> tBeg - beginning of the segment on the target side
>> stream - the stream for creating the job pipeline
*/
void
_CopyValues
(
const
XTensor
*
s
,
const
int
sBeg
,
const
int
sLen
,
XTensor
*
t
,
const
int
tBeg
,
XStream
*
stream
)
void
_CopyValues
(
const
XTensor
*
s
,
const
int
sBeg
,
const
int
sLen
,
XTensor
*
t
,
const
int
tBeg
)
{
if
(
s
->
data
==
NULL
&&
t
->
data
==
NULL
)
return
;
...
...
@@ -108,13 +106,12 @@ void _CopyValues(const XTensor * s, const int sBeg, const int sLen, XTensor * t,
/*
copy s to t (rename _CopyValues)
>> s - source
>> t - target
>> stream - the stream for creating the job pipeline
>> s - source
>> t - target
*/
void
CopyValues
(
const
XTensor
&
s
,
XTensor
&
t
,
XStream
*
stream
)
void
CopyValues
(
const
XTensor
&
s
,
XTensor
&
t
)
{
_CopyValues
(
&
s
,
&
t
,
stream
);
_CopyValues
(
&
s
,
&
t
);
}
/*
...
...
@@ -122,16 +119,15 @@ copy s to t (return an XTensor structure)
make a new tensor to keep the result and return it
>> s - source
>> stream - the stream for creating the job pipeline
<< return - the copyed tensor t
*/
XTensor
CopyValues
(
const
XTensor
&
s
,
XStream
*
stream
)
XTensor
CopyValues
(
const
XTensor
&
s
)
{
XTensor
t
(
&
s
);
t
.
SetTMPFlag
();
/* call _CopyValues function */
_CopyValues
(
&
s
,
&
t
,
stream
);
_CopyValues
(
&
s
,
&
t
);
/* tensor connection */
if
(
s
.
enableGrad
)
{
...
...
source/tensor/core/movement/CopyValues.cu
查看文件 @
dd7a67bc
...
...
@@ -32,10 +32,9 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
copy a range of elements from a source vector to a target vector
>> s - source matrix
>> t - target matrix
>> stream - the stream for creating the job pipeline
<< return - succeed or not
*/
void _CudaCopyValues(const XTensor * s, XTensor * t
, XStream * stream
)
void _CudaCopyValues(const XTensor * s, XTensor * t)
{
CheckNTErrors(s != NULL && t != NULL, "The input tensor and output tensor must be nonempty!");
CheckNTErrors(s->dataType == t->dataType, "Unmatched data type!");
...
...
@@ -45,10 +44,7 @@ void _CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream)
/* dense -> dense */
if (!s->isSparse && !t->isSparse) {
if (stream == NULL)
XMemCopy(t->data, t->devID, s->data, s->devID, s->unitSize * s->unitNum);
else
XMemCopyAsync(t->data, t->devID, s->data, s->devID, s->unitSize * s->unitNum, stream->stream, stream->devID);
XMemCopy(t->data, t->devID, s->data, s->devID, s->unitSize * s->unitNum);
}
/* dense -> sparse */
else if (!s->isSparse && t->isSparse &&
...
...
@@ -72,11 +68,8 @@ void _CudaCopyValues(const XTensor * s, XTensor * t, XStream * stream)
int num = s->unitNumNonZero;
int size = sizeof(int) + num * (s->unitSize + sizeof(int));
if (stream == NULL)
XMemCopy(t->data, t->devID, s->data, s->devID, size);
else
XMemCopyAsync(t->data, t->devID, s->data, s->devID, size, stream->stream, stream->devID);
XMemCopy(t->data, t->devID, s->data, s->devID, size);
t->unitNumNonZero = num;
}
else {
...
...
source/tensor/core/movement/CopyValues.cuh
查看文件 @
dd7a67bc
...
...
@@ -29,7 +29,7 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
#ifdef USE_CUDA
/* copy all elements from a source matrix to a target matrix */
void _CudaCopyValues(const XTensor * s, XTensor * t
, XStream * stream = NULL
);
void _CudaCopyValues(const XTensor * s, XTensor * t);
#endif // USE_CUDA
...
...
source/tensor/core/movement/CopyValues.h
查看文件 @
dd7a67bc
...
...
@@ -27,19 +27,19 @@
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
/* copy s to t */
void
_CopyValues
(
const
XTensor
*
s
,
XTensor
*
t
,
XStream
*
stream
=
NULL
);
void
_CopyValues
(
const
XTensor
*
s
,
XTensor
*
t
);
/* copy a segment of s to t */
void
_CopyValues
(
const
XTensor
*
s
,
const
int
sBeg
,
const
int
sLen
,
XTensor
*
t
,
const
int
tBeg
,
XStream
*
stream
=
NULL
);
void
_CopyValues
(
const
XTensor
*
s
,
const
int
sBeg
,
const
int
sLen
,
XTensor
*
t
,
const
int
tBeg
);
/* copy s to t (rename _CopyValues) */
void
CopyValues
(
const
XTensor
&
s
,
XTensor
&
t
,
XStream
*
stream
=
NULL
);
void
CopyValues
(
const
XTensor
&
s
,
XTensor
&
t
);
/*
copy s to t (return an XTensor structure)
make a new tensor to keep the result and return it
*/
XTensor
CopyValues
(
const
XTensor
&
s
,
XStream
*
stream
=
NULL
);
XTensor
CopyValues
(
const
XTensor
&
s
);
}
// namespace nts(NiuTrans.Tensor)
...
...
source/tensor/core/shape/Split.cpp
查看文件 @
dd7a67bc
...
...
@@ -96,25 +96,11 @@ void _Split(const XTensor * s, XTensor * t, int whereToSplit, int splitNum)
}
}
else
{
#ifdef USE_CUDA
#ifdef STREAMED_MEMCPOPY
XStream
*
stream
=
GDevs
.
GPUs
[
t
->
devID
].
stream
;
for
(
int
k
=
0
;
k
<
splitNum
;
k
++
)
{
XMemCopy2DAsync
((
char
*
)
t
->
data
+
k
*
tStep
,
tPitch
,
t
->
devID
,
(
char
*
)
s
->
data
+
k
*
sStep
,
sPitch
,
s
->
devID
,
mSize
,
n
,
stream
);
}
stream
->
StreamSynchronize
();
#else
for
(
int
k
=
0
;
k
<
splitNum
;
k
++
)
{
XMemCopy2D
((
char
*
)
t
->
data
+
k
*
tStep
,
tPitch
,
t
->
devID
,
(
char
*
)
s
->
data
+
k
*
sStep
,
sPitch
,
s
->
devID
,
mSize
,
n
);
}
#endif
#else
ShowNTErrors
(
"Please specify USE_CUDA and recompile the code!"
);
#endif
}
}
else
{
...
...
@@ -321,27 +307,12 @@ void _Split(const XTensor * big, TensorList * smalls, int whereToSplit, int spli
}
}
else
{
#ifdef USE_CUDA
#ifdef STREAMED_MEMCPOPY
XStream
*
stream
=
GDevs
.
GPUs
[
big
->
devID
].
stream
;
for
(
int
k
=
0
;
k
<
splitNum
;
k
++
)
{
XTensor
*
t
=
(
XTensor
*
)
smalls
->
GetItem
(
k
);
XMemCopy2DAsync
((
char
*
)
t
->
data
+
k
*
tStep
,
tPitch
,
t
->
devID
,
(
char
*
)
big
->
data
+
k
*
sStep
,
sPitch
,
big
->
devID
,
mSize
,
n
,
stream
);
}
stream
->
StreamSynchronize
();
#else
for
(
int
k
=
0
;
k
<
splitNum
;
k
++
)
{
XTensor
*
t
=
(
XTensor
*
)
smalls
->
GetItem
(
k
);
XMemCopy2D
((
char
*
)
t
->
data
+
k
*
tStep
,
tPitch
,
t
->
devID
,
(
char
*
)
big
->
data
+
k
*
sStep
,
sPitch
,
big
->
devID
,
mSize
,
n
);
}
#endif
#else
ShowNTErrors
(
"Please specify USE_CUDA and recompile the code!"
);
#endif
}
}
/* splitting with fewer kernel/api calls??? (i'm not sure about it!! may remove this later) */
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论