Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
N
NiuTrans.Tensor
概览
Overview
Details
Activity
Cycle Analytics
版本库
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
问题
0
Issues
0
列表
Board
标记
里程碑
合并请求
0
Merge Requests
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
Snippets
成员
Collapse sidebar
Close sidebar
活动
图像
聊天
创建新问题
作业
提交
Issue Boards
Open sidebar
杨迪
NiuTrans.Tensor
Commits
454bd870
Commit
454bd870
authored
Jul 29, 2018
by
xiaotong
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
rename SumFilled as SumDim and add the new code
parent
dcba416c
隐藏空白字符变更
内嵌
并排
正在显示
11 个修改的文件
包含
445 行增加
和
137 行删除
+445
-137
source/network/Main.cpp
+63
-2
source/tensor/XTensor.h
+1
-1
source/tensor/XUtility.cpp
+51
-33
source/tensor/core/CHeader.h
+1
-1
source/tensor/core/arithmetic/Sum.cpp
+1
-1
source/tensor/core/arithmetic/SumDim.cpp
+113
-0
source/tensor/core/arithmetic/SumDim.cu
+172
-0
source/tensor/core/arithmetic/SumDim.cuh
+9
-2
source/tensor/core/arithmetic/SumDim.h
+34
-25
source/tensor/core/arithmetic/SumFilled.cpp
+0
-27
source/tensor/core/arithmetic/SumFilled.h
+0
-45
没有找到文件。
source/network/Main.cpp
查看文件 @
454bd870
...
@@ -31,6 +31,7 @@
...
@@ -31,6 +31,7 @@
//#include <crtdbg.h>
//#include <crtdbg.h>
void
TransposeTest
();
void
TransposeTest
();
void
SumDimTest
();
using
namespace
nts
;
using
namespace
nts
;
using
namespace
samplefnnlm
;
using
namespace
samplefnnlm
;
...
@@ -40,6 +41,9 @@ int main( int argc, const char ** argv )
...
@@ -40,6 +41,9 @@ int main( int argc, const char ** argv )
//TransposeTest();
//TransposeTest();
//return 0;
//return 0;
SumDimTest
();
return
0
;
if
(
argc
>
1
&&
!
strcmp
(
argv
[
1
],
"-test"
))
if
(
argc
>
1
&&
!
strcmp
(
argv
[
1
],
"-test"
))
1
;
//Test();
1
;
//Test();
else
if
(
argc
>
1
&&
!
strcmp
(
argv
[
1
],
"-fnnlm"
))
else
if
(
argc
>
1
&&
!
strcmp
(
argv
[
1
],
"-fnnlm"
))
...
@@ -94,6 +98,7 @@ void TransposeTest()
...
@@ -94,6 +98,7 @@ void TransposeTest()
//XMem mem1(1, UNI_FREE, MILLION * 64, 1024, MILLION * 64);
//XMem mem1(1, UNI_FREE, MILLION * 64, 1024, MILLION * 64);
XTensor
x
;
XTensor
x
;
XTensor
y
;
XTensor
y
;
XTensor
z
;
int
loops
=
2000
;
int
loops
=
2000
;
...
@@ -106,19 +111,25 @@ void TransposeTest()
...
@@ -106,19 +111,25 @@ void TransposeTest()
InitTensor3D
(
&
x
,
B
,
N
,
H
,
X_FLOAT
,
0
);
InitTensor3D
(
&
x
,
B
,
N
,
H
,
X_FLOAT
,
0
);
InitTensor4D
(
&
y
,
K
,
B
,
N
,
H
/
K
,
X_FLOAT
,
0
);
InitTensor4D
(
&
y
,
K
,
B
,
N
,
H
/
K
,
X_FLOAT
,
0
);
InitTensor3D
(
&
z
,
B
,
N
,
H
,
X_FLOAT
,
0
);
cudaEvent_t
ctime0
;
cudaEvent_t
ctime0
;
cudaEvent_t
ctime1
;
cudaEvent_t
ctime1
;
cudaEvent_t
ctime2
;
cudaEvent_t
ctime2
;
cudaEvent_t
ctime3
;
cudaEvent_t
ctime3
;
cudaEvent_t
ctime4
;
cudaEvent_t
ctime5
;
float
elapsedSplit
=
0.0
;
float
elapsedSplit
=
0.0
;
float
elapsedMerge
=
0.0
;
float
elapsedMerge
=
0.0
;
float
elapsedSum
=
0.0
;
cudaEventCreate
(
&
ctime0
);
cudaEventCreate
(
&
ctime0
);
cudaEventCreate
(
&
ctime1
);
cudaEventCreate
(
&
ctime1
);
cudaEventCreate
(
&
ctime2
);
cudaEventCreate
(
&
ctime2
);
cudaEventCreate
(
&
ctime3
);
cudaEventCreate
(
&
ctime3
);
cudaEventCreate
(
&
ctime4
);
cudaEventCreate
(
&
ctime5
);
cudaEventRecord
(
ctime0
,
0
);
cudaEventRecord
(
ctime0
,
0
);
...
@@ -142,7 +153,57 @@ void TransposeTest()
...
@@ -142,7 +153,57 @@ void TransposeTest()
cudaEventSynchronize
(
ctime3
);
cudaEventSynchronize
(
ctime3
);
cudaEventElapsedTime
(
&
elapsedMerge
,
ctime2
,
ctime3
);
cudaEventElapsedTime
(
&
elapsedMerge
,
ctime2
,
ctime3
);
fprintf
(
stderr
,
"split:%f merge:%f
\n
"
,
time1
-
time0
,
time3
-
time2
);
cudaEventRecord
(
ctime4
,
0
);
fprintf
(
stderr
,
"split:%f merge:%f
\n
"
,
elapsedSplit
,
elapsedMerge
);
double
time4
=
GetClock
();
for
(
int
i
=
0
;
i
<
loops
;
i
++
)
_Sum
(
&
x
,
&
z
,
&
x
);
double
time5
=
GetClock
();
cudaEventRecord
(
ctime5
,
0
);
cudaEventSynchronize
(
ctime5
);
cudaEventElapsedTime
(
&
elapsedSum
,
ctime4
,
ctime5
);
fprintf
(
stderr
,
"split:%f merge:%f sum:%f
\n
"
,
time1
-
time0
,
time3
-
time2
,
time5
-
time4
);
fprintf
(
stderr
,
"split:%f merge:%f sum:%f
\n
"
,
elapsedSplit
,
elapsedMerge
,
elapsedSum
);
#endif
#endif
}
}
/* test of tensor summation with broadcasting along a given dimension
   (z = x + y where y matches the last dimension of x) */
void SumDimTest()
{
    XTensor x;
    XTensor y;
    XTensor z;

    int a = 5;
    int b = 7;
    int c = 3;

    XDevice::SetGPUDevice(0);

    /* x and z are (a, b, c) tensors on GPU 0; y has the size of the
       last dimension of x, so it is broadcast over dimension 2 below */
    InitTensor3D(&x, a, b, c, X_FLOAT, 0);
    InitTensor1D(&y, c, X_FLOAT, 0);
    InitTensor3D(&z, a, b, c, X_FLOAT, 0);

    x.SetZeroAll();
    y.SetZeroAll();
    z.SetZeroAll();

    /* fill x with 0, 1, 2, ... and y with 0, -1, -2, ...
       (the same host buffer is reused for both uploads) */
    float * data = new float[x.unitNum];

    for(int i = 0; i < x.unitNum; i++)
        data[i] = (DTYPE)i;
    x.SetData(data, x.unitNum);

    for(int i = 0; i < y.unitNum; i++)
        data[i] = -(DTYPE)i;
    y.SetData(data, y.unitNum);

    /* z = x + y, broadcasting y over dimension 2 of x */
    _SumDim(&x, &y, &z, 2);

    z.Dump(stderr, "z:");

    delete[] data;
}
source/tensor/XTensor.h
查看文件 @
454bd870
...
@@ -45,7 +45,7 @@ namespace nts{
...
@@ -45,7 +45,7 @@ namespace nts{
struct
XLink
;
struct
XLink
;
/* define the maximum number of dimensions in a tensor */
/* define the maximum number of dimensions in a tensor */
#define MAX_TENSOR_DIM_NUM
6
#define MAX_TENSOR_DIM_NUM
8
#define USE_BATCHED_STRIDED_MAT_MUL
#define USE_BATCHED_STRIDED_MAT_MUL
#define MIN_TENSOR_SPLIT_NUM 0
#define MIN_TENSOR_SPLIT_NUM 0
#define MIN_TENSOR_SPLIT_LIST_NUM 1024
#define MIN_TENSOR_SPLIT_LIST_NUM 1024
...
...
source/tensor/XUtility.cpp
查看文件 @
454bd870
...
@@ -175,29 +175,38 @@ void XMemCopy(void * t, int devIDT, const void * s, int devIDS, size_t size)
...
@@ -175,29 +175,38 @@ void XMemCopy(void * t, int devIDT, const void * s, int devIDS, size_t size)
return
;
return
;
}
}
#ifdef USE_CUDA
#ifdef USE_CUDA
else
if
(
devIDT
>=
0
&&
devIDS
<
0
){
cudaError_t
error
=
cudaMemcpy
(
t
,
s
,
size
,
cudaMemcpyHostToDevice
);
if
(
error
!=
cudaSuccess
){
ShowNTErrors
(
"cudaMemcpy error (cudaMemcpyHostToDevice)"
);
}
}
else
if
(
devIDT
<
0
&&
devIDS
>=
0
){
cudaError_t
error
=
cudaMemcpy
(
t
,
s
,
size
,
cudaMemcpyDeviceToHost
);
if
(
error
!=
cudaSuccess
){
ShowNTErrors
(
"cudaMemcpy error (cudaMemcpyDeviceToHost)"
);
}
}
else
{
else
{
//if(devIDT == devIDS){
int
devID
=
devIDT
<
0
?
devIDS
:
devIDT
;
cudaError_t
error
=
cudaMemcpy
(
t
,
s
,
size
,
cudaMemcpyDeviceToDevice
);
int
devIDBackup
=
0
;
cudaGetDevice
(
&
devIDBackup
);
cudaSetDevice
(
devID
);
if
(
devIDT
>=
0
&&
devIDS
<
0
){
cudaError_t
error
=
cudaMemcpy
(
t
,
s
,
size
,
cudaMemcpyHostToDevice
);
if
(
error
!=
cudaSuccess
){
if
(
error
!=
cudaSuccess
){
ShowNTErrors
(
"cudaMemcpy error (cudaMemcpyDeviceToDevice)"
);
ShowNTErrors
(
"cudaMemcpy error (cudaMemcpyHostToDevice)"
);
}
}
else
if
(
devIDT
<
0
&&
devIDS
>=
0
){
cudaError_t
error
=
cudaMemcpy
(
t
,
s
,
size
,
cudaMemcpyDeviceToHost
);
if
(
error
!=
cudaSuccess
){
ShowNTErrors
(
"cudaMemcpy error (cudaMemcpyDeviceToHost)"
);
}
}
/*
}
}
else
{
else
{
CheckNTErrors((cudaMemcpyPeer(t, devIDT, s, devIDS, size) == cudaSuccess),
//if(devIDT == devIDS){
"cudaMemcpy error (cudaMemcpyDeviceToDevice)");
cudaError_t
error
=
cudaMemcpy
(
t
,
s
,
size
,
cudaMemcpyDeviceToDevice
);
}*/
if
(
error
!=
cudaSuccess
){
ShowNTErrors
(
"cudaMemcpy error (cudaMemcpyDeviceToDevice)"
);
}
/*}
else{
CheckNTErrors((cudaMemcpyPeer(t, devIDT, s, devIDS, size) == cudaSuccess),
"cudaMemcpy error (cudaMemcpyDeviceToDevice)");
}*/
}
cudaSetDevice
(
devIDBackup
);
}
}
#else
#else
ShowNTErrors
(
"Please specify USE_CUDA and recompile the code!"
);
ShowNTErrors
(
"Please specify USE_CUDA and recompile the code!"
);
...
@@ -270,23 +279,32 @@ void XMemCopy2D(void * t, size_t tPitch, int devIDT, const void * s, size_t sPit
...
@@ -270,23 +279,32 @@ void XMemCopy2D(void * t, size_t tPitch, int devIDT, const void * s, size_t sPit
return
;
return
;
}
}
#ifdef USE_CUDA
#ifdef USE_CUDA
else
if
(
devIDT
>=
0
&&
devIDS
<
0
)
{
else
{
cudaError_t
error
=
cudaMemcpy2D
(
t
,
tPitch
,
s
,
sPitch
,
mSize
,
n
,
cudaMemcpyHostToDevice
);
int
devID
=
devIDT
<
0
?
devIDS
:
devIDT
;
if
(
error
!=
cudaSuccess
){
int
devIDBackup
=
0
;
ShowNTErrors
(
"cudaMemcpy2D error (cudaMemcpyHostToDevice)"
);
cudaGetDevice
(
&
devIDBackup
);
cudaSetDevice
(
devID
);
if
(
devIDT
>=
0
&&
devIDS
<
0
)
{
cudaError_t
error
=
cudaMemcpy2D
(
t
,
tPitch
,
s
,
sPitch
,
mSize
,
n
,
cudaMemcpyHostToDevice
);
if
(
error
!=
cudaSuccess
){
ShowNTErrors
(
"cudaMemcpy2D error (cudaMemcpyHostToDevice)"
);
}
}
}
}
else
if
(
devIDT
<
0
&&
devIDS
>=
0
)
{
else
if
(
devIDT
<
0
&&
devIDS
>=
0
)
{
cudaError_t
error
=
cudaMemcpy2D
(
t
,
tPitch
,
s
,
sPitch
,
mSize
,
n
,
cudaMemcpyDeviceToHost
);
cudaError_t
error
=
cudaMemcpy2D
(
t
,
tPitch
,
s
,
sPitch
,
mSize
,
n
,
cudaMemcpyDeviceToHost
);
if
(
error
!=
cudaSuccess
){
if
(
error
!=
cudaSuccess
){
ShowNTErrors
(
"cudaMemcpy error (cudaMemcpyDeviceToHost)"
);
ShowNTErrors
(
"cudaMemcpy error (cudaMemcpyDeviceToHost)"
);
}
}
}
}
else
{
else
{
cudaError_t
error
=
cudaMemcpy2D
(
t
,
tPitch
,
s
,
sPitch
,
mSize
,
n
,
cudaMemcpyDeviceToDevice
);
cudaError_t
error
=
cudaMemcpy2D
(
t
,
tPitch
,
s
,
sPitch
,
mSize
,
n
,
cudaMemcpyDeviceToDevice
);
if
(
error
!=
cudaSuccess
)
{
if
(
error
!=
cudaSuccess
)
{
ShowNTErrors
(
"cudaMemcpy error (cudaMemcpyDeviceToDevice)"
);
ShowNTErrors
(
"cudaMemcpy error (cudaMemcpyDeviceToDevice)"
);
}
}
}
cudaSetDevice
(
devIDBackup
);
}
}
#else
#else
ShowNTErrors
(
"Please specify USE_CUDA and recompile the code!"
);
ShowNTErrors
(
"Please specify USE_CUDA and recompile the code!"
);
...
...
source/tensor/core/CHeader.h
查看文件 @
454bd870
...
@@ -64,7 +64,7 @@
...
@@ -64,7 +64,7 @@
#include "arithmetic/Sum.h"
#include "arithmetic/Sum.h"
#include "arithmetic/SumByColumnTV.h"
#include "arithmetic/SumByColumnTV.h"
#include "arithmetic/SumByColumnVT.h"
#include "arithmetic/SumByColumnVT.h"
#include "arithmetic/Sum
Filled
.h"
#include "arithmetic/Sum
Dim
.h"
#include "sort/TopK.h"
#include "sort/TopK.h"
#include "shape/Transpose.h"
#include "shape/Transpose.h"
#include "shape/Unsqueeze.h"
#include "shape/Unsqueeze.h"
...
...
source/tensor/core/arithmetic/Sum.cpp
查看文件 @
454bd870
...
@@ -67,7 +67,7 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
...
@@ -67,7 +67,7 @@ void _Sum(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta)
}
}
else
{
else
{
if
(
!
a
->
isSparse
&&
!
b
->
isSparse
)
{
if
(
!
a
->
isSparse
&&
!
b
->
isSparse
)
{
CheckNTErrors
(
!
c
->
isSparse
,
"Illegal use of sparse
matrix
in addition!"
);
CheckNTErrors
(
!
c
->
isSparse
,
"Illegal use of sparse
tensor
in addition!"
);
if
(
a
->
dataType
==
DEFAULT_DTYPE
&&
if
(
a
->
dataType
==
DEFAULT_DTYPE
&&
b
->
dataType
==
DEFAULT_DTYPE
&&
b
->
dataType
==
DEFAULT_DTYPE
&&
...
...
source/tensor/core/arithmetic/SumDim.cpp
0 → 100644
查看文件 @
454bd870
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-29
*/
#include "Sum.h"
#include "SumDim.h"
#include "SumDim.cuh"
#include "../movement/CopyValues.h"
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
/*
tensor summation

c = a + b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is summed with b by broadcasting

>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a+b*\beta. we save it in a if c is NULL
>> n - the dimension index
>> beta - the scaling factor
*/
void _SumDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta)
{
    CheckNTErrors(a && b && c, "Empty tensor input!");
    CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in addition!");
    CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
                  "Unmatched data types in addition!");
    CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in addition!");
    CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
    CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");

    /* beta == 0: b contributes nothing, so c is simply a copy of a */
    if(beta == 0){
        _CopyValues(a, c);
        return;
    }

    /* identical shapes: no broadcasting needed, fall back to plain summation */
    if(XTensor::IsSameShaped(a, b)){
        _Sum(a, b, c, beta);
        return;
    }

    if(a->devID >= 0 || b->devID >= 0 || c->devID >= 0){
        /* GPU route */
        _CudaSumDim(a, b, c, n, beta);
    }
    else{
        /* view a as (blockNum, blockSize, stride):
           stride    - product of the dimensions after n
           blockSize - size of dimension n (i.e., the size of b)
           blockNum  - product of the dimensions before n */
        int stride = 1;
        int blockSize = a->dimSize[n];
        int blockNum = 1;

        for(int i = a->order - 1; i >= 0; i--){
            if(i > n)
                stride *= a->dimSize[i];
            else if(i < n)
                blockNum *= a->dimSize[i];
        }

        if(a->dataType == DEFAULT_DTYPE){
            int num = a->unitNum;
            if(stride > 1){
                /* n is not the last dimension: each value of b is added to a
                   contiguous run of "stride" elements of a; j counts stride-runs,
                   so j % blockSize is the index along dimension n */
                for(int i = 0, j = 0; i < num; i += stride, j++){
                    DTYPE * ap = (DTYPE*)a->data + i;
                    DTYPE bv = *((DTYPE*)b->data + j % blockSize) * beta;
                    DTYPE * cp = (DTYPE*)c->data + i;
                    for(int k = 0; k < stride; k++)
                        cp[k] = ap[k] + bv;
                }
            }
            else if(stride == 1){
                /* n is the last dimension: b is added row by row;
                   the beta == 1 case skips the multiplication */
                DTYPE * bp = (DTYPE*)b->data;
                for(int i = 0; i < num; i += blockSize){
                    DTYPE * ap = (DTYPE*)a->data + i;
                    DTYPE * cp = (DTYPE*)c->data + i;
                    if(beta == 1.0F){
                        for(int j = 0; j < blockSize; j++)
                            cp[j] = ap[j] + bp[j];
                    }
                    else{
                        for(int j = 0; j < blockSize; j++)
                            cp[j] = ap[j] + bp[j] * beta;
                    }
                }
            }
            else{
                ShowNTErrors("Something is wrong!");
            }
        }
        else{
            ShowNTErrors("TODO!");
        }
    }
}
}
source/tensor/core/arithmetic/SumDim.cu
0 → 100644
查看文件 @
454bd870
/* NiuTrans.Tensor - an open-source tensor library
 * Copyright (C) 2018, Natural Language Processing Lab, Northeastern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-29
*/
#include "SumDim.cuh"
#include "../../XDevice.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
/*
tensor summation of a tensor and a row vector
c = a + b * \beta
where a is a tensor and b is a row vector
>> a - pointer to the data array of a
>> b - pointer to the data array of b
>> c - pointer to the data array of c
>> rowNum - number of rows of a and c
>> colNum - number of columns of a and c (i.e., the size of b)
>> beta - the scaling factor
*/
template <class T, bool betaFired>
__global__
void KernelAddWithRow(T * a, T * b, T * c, int rowNum, int colNum, T beta)
{
    __shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];

    int col = blockDim.x * blockIdx.x + threadIdx.x;
    int row = blockDim.y * blockIdx.y + threadIdx.y;

    /* one thread per column loads the b value into shared memory.
       NOTE: no early return before the barrier - __syncthreads() must be
       reached by every thread of the block, including out-of-range ones,
       otherwise the divergent barrier is undefined behavior */
    if(threadIdx.y == 0 && col < colNum)
        bv[threadIdx.x] = b[col];

    __syncthreads();

    if(col < colNum && row < rowNum){
        int offset = colNum * row + col;

        /* betaFired is a compile-time flag that skips the multiplication
           when beta == 1 */
        if(betaFired)
            c[offset] = a[offset] + bv[threadIdx.x] * beta;
        else
            c[offset] = a[offset] + bv[threadIdx.x];
    }
}
/*
tensor summation of a tensor and a column vector
c = a + b * \beta
where a is a tensor and b is a column vector
>> a - pointer to the data array of a
>> b - pointer to the data array of b
>> c - pointer to the data array of c
>> rowNum - number of rows of a and c (i.e., the size of b)
>> colNum - number of columns of a and c
>> blockSize - size of a block (matrix), i.e., rowNum * colNum
>> blockNum - number of matrices
>> beta - the scaling factor
*/
template <class T, bool betaFired>
__global__
void KernelAddWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize, int blockNum, T beta)
{
    __shared__ T bv[MAX_CUDA_THREAD_NUM_PER_BLOCK];

    int colIndex = blockDim.x * blockIdx.x + threadIdx.x;
    int row = blockDim.y * blockIdx.y + threadIdx.y;
    int col = colIndex % colNum;
    int block = colIndex / colNum;

    /* the first thread of each row loads b[row] into shared memory.
       NOTE: guard instead of returning early - __syncthreads() must be
       reached by every thread of the block, including out-of-range ones,
       otherwise the divergent barrier is undefined behavior */
    if(threadIdx.x == 0 && row < rowNum)
        bv[threadIdx.y] = b[row];

    __syncthreads();

    if(row < rowNum && block < blockNum){
        int offset = block * blockSize + row * colNum + col;

        /* betaFired is a compile-time flag that skips the multiplication
           when beta == 1 */
        if(betaFired)
            c[offset] = a[offset] + bv[threadIdx.y] * beta;
        else
            c[offset] = a[offset] + bv[threadIdx.y];
    }
}
/*
tensor summation (cuda version)
c = a + b * \beta
where the size of b is equal to the n-th dimension of a,
i.e., a is summed with b by broadcasting
>> a - a tensor
>> b - another tensor whose size is equal to that of dimension n of a
>> c - where we put a+b*\beta. we save it in a if c is NULL
>> n - the dimension index
>> beta - the scaling factor
*/
void _CudaSumDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta)
{
    CheckNTErrors(a && b && c, "Empty tensor input!");
    CheckNTErrors(a->unitNum == c->unitNum, "Unmatched tensors in addition!");
    CheckNTErrors(a->dataType == b->dataType && a->dataType == c->dataType,
                  "Unmatched data types in addition!");
    CheckNTErrors(a->order == c->order, "The input tensors do not have the same order in addition!");
    CheckNTErrors(!a->isSparse && !b->isSparse && !c->isSparse, "Dense tensors are required!");
    CheckNTErrors(a->dimSize[n] == b->unitNum, "Wrong tensor size!");

    /* view a as (blockNum, blockSize, stride):
       stride    - product of the dimensions after n
       blockSize - size of dimension n (i.e., the size of b)
       blockNum  - product of the dimensions before n */
    int stride = 1;
    int blockSize = a->dimSize[n];
    int blockNum = 1;

    for(int i = a->order - 1; i >= 0; i--){
        if(i > n)
            stride *= a->dimSize[i];
        else if(i < n)
            blockNum *= a->dimSize[i];
    }

    int cudaGrids[3];
    int cudaBlocks[3];

    /* NOTE(review): neither kernel launch below is followed by
       cudaGetLastError(), and the current device is not switched to
       a->devID beforehand - confirm the caller guarantees both */
    if (a->dataType == DEFAULT_DTYPE){
        if(stride > 1){
            /* n is not the last dimension: b behaves as a column vector;
               x covers stride * blockNum elements, y covers dimension n.
               betaFired == false skips the multiplication when beta == 1 */
            GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);

            if(beta == (DTYPE)1.0F)
                KernelAddWithCol<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
                                 blockSize, stride, blockSize * stride, blockNum, beta);
            else
                KernelAddWithCol<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
                                 blockSize, stride, blockSize * stride, blockNum, beta);
        }
        else if(stride == 1){
            /* n is the last dimension: b behaves as a row vector;
               x covers dimension n, y covers the leading dimensions */
            GDevs.GetCudaThread2D(a->devID, blockSize, blockNum, MAX_INT, cudaGrids, cudaBlocks);

            if(beta == (DTYPE)1.0F)
                KernelAddWithRow<DTYPE, false> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
                                 blockNum, blockSize, beta);
            else
                KernelAddWithRow<DTYPE, true> <<<dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1])>>>
                                ((DTYPE*)a->data, (DTYPE*)b->data, (DTYPE*)c->data,
                                 blockNum, blockSize, beta);
        }
        else{
            ShowNTErrors("Something is wrong!");
        }
    }
    else {
        ShowNTErrors("TODO!");
    }
}
} // namespace nts(NiuTrans.Tensor)
source/tensor/core/arithmetic/Sum
Filled.cu
→
source/tensor/core/arithmetic/Sum
Dim.cuh
查看文件 @
454bd870
...
@@ -16,13 +16,20 @@
...
@@ -16,13 +16,20 @@
*/
*/
/*
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-2
8
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-2
9
*/
*/
#include "SumFilled.cuh"
#ifndef __SUMDIM_CUH__
#define __SUMDIM_CUH__
#include "../../XTensor.h"
namespace nts { // namespace nts(NiuTrans.Tensor)
namespace nts { // namespace nts(NiuTrans.Tensor)
/* tensor summation c = a + b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is summed with b by broadcasting (cuda version) */
void _CudaSumDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE beta = (DTYPE)1.0);
} // namespace nts(NiuTrans.Tensor)
} // namespace nts(NiuTrans.Tensor)
#endif // __SUMDIM_CUH__
source/tensor/core/arithmetic/Sum
Filled.cu
h
→
source/tensor/core/arithmetic/Sum
Dim.
h
查看文件 @
454bd870
/* NiuTrans.Tensor - an open-source tensor library
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
* All rights reserved.
*
*
* Licensed under the Apache License, Version 2.0 (the "License");
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* You may obtain a copy of the License at
*
*
* http://www.apache.org/licenses/LICENSE-2.0
* http://www.apache.org/licenses/LICENSE-2.0
*
*
* Unless required by applicable law or agreed to in writing, software
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* See the License for the specific language governing permissions and
* limitations under the License.
* limitations under the License.
*/
*/
/*
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-28
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-29
*/
* It reached to 39 centigrade around 3:00 pm in Shenyang
*/
#ifndef __SUM
FILLED_CU
H__
#ifndef __SUM
DIM_
H__
#define __SUM
FILLED_CU
H__
#define __SUM
DIM_
H__
#include "../../XTensor.h"
#include "../../XTensor.h"
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
/* tensor summation c = a + b * \beta where each dimension of b is equal to that of a or has
/* tensor summation c = a + b * \beta where the size of b is equal to the n-th dimension of a,
a value of 1, i.e., a is summed with b by broadcasting */
i.e., a is summed with b by broadcasting */
void _CudaSumFilled(const XTensor * a, const XTensor * b, XTensor * c, DTYPE beta = (DTYPE)1.0);
void
_SumDim
(
const
XTensor
*
a
,
const
XTensor
*
b
,
XTensor
*
c
,
int
n
,
DTYPE
beta
=
(
DTYPE
)
1
.
0
);
/* tensor summation c = a + b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is summed with b by broadcasting. we keep the result in the input tensor a and return nothing */
void
_SumDim
(
XTensor
*
a
,
const
XTensor
*
b
,
int
n
,
DTYPE
beta
=
(
DTYPE
)
1
.
0
);
/* tensor summation c = a + b * \beta where the size of b is equal to the n-th dimension of a,
i.e., a is summed with b by broadcasting. We make a new tensor c to keep the result and return it */
XTensor
SumDim
(
const
XTensor
&
a
,
const
XTensor
&
b
,
int
n
,
DTYPE
beta
=
(
DTYPE
)
1
.
0
);
}
// namespace nts(NiuTrans.Tensor)
}
// namespace nts(NiuTrans.Tensor)
#endif // __SUM
FILLED_CU
H__
#endif // __SUM
DIM_
H__
source/tensor/core/arithmetic/SumFilled.cpp
deleted
100644 → 0
查看文件 @
dcba416c
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-28
*/
#include "SumFilled.h"
#include "SumFilled.cuh"
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
}
source/tensor/core/arithmetic/SumFilled.h
deleted
100644 → 0
查看文件 @
dcba416c
/* NiuTrans.Tensor - an open-source tensor library
* Copyright (C) 2018, Natural Language Processing Lab, Northestern University.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* $Created by: XIAO Tong (email: xiaotong@mail.neu.edu.cn) 2018-07-28
*/
#ifndef __SUMFILLED_H__
#define __SUMFILLED_H__
#include "../../XTensor.h"
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
/* tensor summation c = a + b * \beta where each dimension of b is equal to that of a or has
a value of 1, i.e., a is summed with b by broadcasting */
void
_SumFilled
(
const
XTensor
*
a
,
const
XTensor
*
b
,
XTensor
*
c
,
DTYPE
beta
=
(
DTYPE
)
1
.
0
);
/* tensor summation c = a + b * \beta where each dimension of b is equal to that of a or has
a value of 1, i.e., a is summed with b by broadcasting
keep the result in the input tensor a and return nothing */
void
_SumFilledMe
(
XTensor
*
a
,
const
XTensor
*
b
,
DTYPE
beta
=
(
DTYPE
)
1
.
0
);
/* tensor summation c = a + b * \beta where each dimension of b is equal to that of a or has
a value of 1, i.e., a is summed with b by broadcasting
make a new tensor c to keep the result and return it */
XTensor
SumFilled
(
const
XTensor
&
a
,
const
XTensor
&
b
,
DTYPE
beta
=
(
DTYPE
)
1
.
0
);
}
// namespace nts(NiuTrans.Tensor)
#endif // __SUMFILLED_H__
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论