NiuTrans.Tensor / Commits

Commit b3fa7a37
authored Nov 28, 2018 by xuchen

cpu implementation of the gather and spread function

parent 2fea6615
Showing 18 changed files with 336 additions and 111 deletions
source/network/XBackwardShape.cpp            +1   -1
source/sample/transformer/T2TTrainer.cpp     +5   -4
source/tensor/XTensor.cpp                    +12  -1
source/tensor/XTensor.h                      +3   -0
source/tensor/core/getandset/SetData.cpp     +48  -2
source/tensor/core/getandset/SetData.cu      +49  -2
source/tensor/core/getandset/SetData.cuh     +4   -0
source/tensor/core/getandset/SetData.h       +1   -1
source/tensor/core/math/Unary.cu             +7   -3
source/tensor/core/math/Unary.h              +1   -1
source/tensor/core/movement/Gather.cpp       +18  -0
source/tensor/core/movement/Gather.cu        +10  -10
source/tensor/core/movement/Spread.cpp       +29  -20
source/tensor/core/movement/Spread.cu        +8   -58
source/tensor/core/movement/Spread.cuh       +6   -1
source/tensor/core/movement/Spread.h         +4   -3
source/tensor/test/TGather.cpp               +6   -4
source/tensor/test/TSpread.cpp               +124 -0
source/network/XBackwardShape.cpp

@@ -109,7 +109,7 @@ void XShapeGrad::GradCopyIndexed(XTensor * node, bool isEfficent)
     XTensor * input = income.tails[0];
     XNoder::MakeGrad(input);
 
-    _SpreadForGather(input->grad, node->grad, dim, realSrcIndex, realIndexSize, realTgtIndex);
+    _Spread(input->grad, node->grad, dim, realSrcIndex, realIndexSize, realTgtIndex);
 
     delete[] realSrcIndex;
     delete[] realTgtIndex;
source/sample/transformer/T2TTrainer.cpp

@@ -728,13 +728,12 @@ int T2TTrainer::LoadBatchLM(FILE * file,
     int seqSize = 0;
 
+    MTYPE * batchEncOffsets = new MTYPE[batchEnc->unitNum];
     int * batchEncValues = new int[batchEnc->unitNum];
     MTYPE * paddingEncOffsets = new MTYPE[paddingEnc->unitNum];
     MTYPE * goldOffsets = new MTYPE[gold->unitNum];
     MTYPE * paddingDecOffsets = new MTYPE[paddingDec->unitNum];
 
     /* need to improve the implementation */
     memset(batchEncValues, 0, sizeof(int) * batchEnc->unitNum);
 
     int wGold = 0;
     //fprintf(tf, "batch %d(%d)\n", tc++, sc);

@@ -745,9 +744,10 @@ int T2TTrainer::LoadBatchLM(FILE * file,
         for(int w = 0; w < len; w++){
             int num = buf[seqOffset[s] + w];
             //batchEnc->Set2DInt(buf[seqOffset[s] + w], s - seq, w);
+            batchEncOffsets[wCount] = batchEnc->GetOffset2D(s - seq, w);
+            batchEncValues[wCount] = num;
             //paddingEnc->Set2D(1.0F, s - seq, w);
             //paddingDec->Set2D(1.0F, s - seq, w);
-            batchEncValues[(s - seq) * dims[1] + w] = num;
             paddingEncOffsets[wCount] = paddingEnc->GetOffset2D(s - seq, w);
             paddingDecOffsets[wCount] = paddingDec->GetOffset2D(s - seq, w);
             if(w > 0)

@@ -779,11 +779,12 @@ int T2TTrainer::LoadBatchLM(FILE * file,
         }
     }
 
-    batchEnc->SetData(batchEncValues, batchEnc->unitNum);
+    batchEnc->SetDataBatched(batchEncOffsets, batchEncValues, wCount);
     paddingEnc->SetDataBatched(paddingEncOffsets, 1.0F, wCount);
     paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCount);
     gold->SetDataBatched(goldOffsets, 1.0F, wGold);
 
+    delete[] batchEncOffsets;
     delete[] batchEncValues;
     delete[] paddingEncOffsets;
     delete[] paddingDecOffsets;
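The change above replaces per-element Set2D/Set2DInt calls with parallel offset and value arrays that are written in one SetDataBatched call. A minimal sketch of that batching pattern on plain std::vector data, with a hypothetical rowMajorOffset helper standing in for XTensor::GetOffset2D (illustrative only, not part of the commit):

    #include <cstddef>
    #include <utility>
    #include <vector>

    /* hypothetical stand-in for XTensor::GetOffset2D on a row-major (rows, cols) tensor */
    static size_t rowMajorOffset(size_t cols, size_t row, size_t col) {
        return row * cols + col;
    }

    /* collect (row, col) positions and values first, then write them in one pass,
       mirroring how LoadBatchLM now fills batchEncOffsets/batchEncValues before
       calling SetDataBatched */
    void fillBatched(std::vector<float> & data, size_t cols,
                     const std::vector<std::pair<size_t, size_t>> & positions,
                     const std::vector<float> & values)
    {
        std::vector<size_t> offsets;
        offsets.reserve(positions.size());
        for (const auto & p : positions)
            offsets.push_back(rowMajorOffset(cols, p.first, p.second));

        /* the single batched write (conceptually what SetDataBatched does) */
        for (size_t i = 0; i < offsets.size(); i++)
            data[offsets[i]] = values[i];
    }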
source/tensor/XTensor.cpp

@@ -790,7 +790,7 @@ void XTensor::SetDataRandn(DTYPE mean, DTYPE standardDeviation)
 /*
 set tensor items with an array of offsets
 >> offsets - offset for each data item
->> values - value for each data item
+>> value - value for data items
 >> num - number of the data items
 */
 void XTensor::SetDataBatched(MTYPE * offsets, DTYPE value, int num)

@@ -798,6 +798,17 @@ void XTensor::SetDataBatched(MTYPE * offsets, DTYPE value, int num)
     _SetDataWithOffset(this, offsets, value, num);
 }
 
+/*
+set tensor items with an array of values
+>> offsets - offset for each data item
+>> values - value for each data item
+>> num - number of the data items
+*/
+void XTensor::SetDataBatched(MTYPE * offsets, void * values, int num)
+{
+    _SetDataWithOffsetAndValue(this, offsets, values, num);
+}
+
 /* check whether the data array is the same as the answer
 >> d - input data. it must be on CPU
 >> num - number of data items
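The new SetDataBatched(MTYPE * offsets, void * values, int num) overload writes values[i] into the tensor's raw data at element offset offsets[i], while the older overload writes one constant value at every offset. A minimal CPU-only sketch of the two behaviours on a plain float buffer (assumption: offsets index elements, not bytes, as in the loop added to SetData.cpp below):

    #include <cstdint>

    /* write one constant at each offset (counterpart of SetDataBatched(offsets, value, num)) */
    void setWithOffset(float * data, const int64_t * offsets, float value, int64_t num) {
        for (int64_t i = 0; i < num; i++)
            data[offsets[i]] = value;
    }

    /* write a per-item value at each offset (counterpart of the new overload) */
    void setWithOffsetAndValue(float * data, const int64_t * offsets, const float * values, int64_t num) {
        for (int64_t i = 0; i < num; i++)
            data[offsets[i]] = values[i];
    }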
source/tensor/XTensor.h

@@ -282,6 +282,9 @@ public:
     /* set tensor items with an array of offsets */
     void SetDataBatched(MTYPE * offsets, DTYPE value, int num);
 
+    /* set tensor items with an array of values */
+    void SetDataBatched(MTYPE * offsets, void * values, int num);
+
     /* check whether the data array is the same as the answer */
     bool CheckData(const void * answer, int num, int beg = 0);
source/tensor/core/getandset/SetData.cpp

@@ -485,9 +485,55 @@ set the data with an array of values
 >> values - value for each data item
 >> num - number of the data items
 */
-void _SetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, DTYPE * values, MTYPE num)
+void _SetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * values, MTYPE num)
 {
-    ShowNTErrors("TODO!");
+    if (tensor->devID < 0) {
+        for (int i = 0; i < num; i++) {
+            if (tensor->dataType == X_INT)
+                *((int *)tensor->data + offsets[i]) = *((int *)values + i);
+            else if (tensor->dataType == X_FLOAT)
+                *((float *)tensor->data + offsets[i]) = *((float *)values + i);
+            else
+                ShowNTErrors("TO DO!!!");
+        }
+    }
+    else {
+#ifdef USE_CUDA
+        XMem * mem = tensor->mem;
+        MTYPE offsetSize = num * sizeof(MTYPE);
+        MTYPE valueSize;
+
+        if (tensor->dataType == X_INT)
+            valueSize = num * sizeof(int);
+        else if (tensor->dataType == X_FLOAT)
+            valueSize = num * sizeof(float);
+        else
+            ShowNTErrors("TO DO!!!");
+
+        MTYPE * offsetsCuda = mem != NULL ?
+                              (MTYPE*)mem->AllocBuf(mem->devID, offsetSize) :
+                              (MTYPE*)XMemAlloc(tensor->devID, offsetSize);
+        void * valuesCuda = mem != NULL ?
+                            mem->AllocBuf(mem->devID, valueSize) :
+                            XMemAlloc(tensor->devID, valueSize);
+
+        XMemCopy(offsetsCuda, tensor->devID, offsets, -1, offsetSize);
+        XMemCopy(valuesCuda, tensor->devID, values, -1, valueSize);
+
+        _CudaSetDataWithOffsetAndValue(tensor, offsetsCuda, valuesCuda, num);
+
+        if (mem != NULL) {
+            mem->ReleaseBuf(mem->devID, valueSize);
+            mem->ReleaseBuf(mem->devID, offsetSize);
+        }
+        else {
+            XMemFree(tensor->devID, offsetsCuda);
+            XMemFree(tensor->devID, valuesCuda);
+        }
+#else
+        ShowNTErrors("Please recompile the code with USE_CUDA");
+#endif
+    }
 }
 
 } // namespace nts(NiuTrans.Tensor)
source/tensor/core/getandset/SetData.cu

@@ -470,8 +470,8 @@ void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
 set the data with an array of offsets (kernel version)
 >> data - pointer to the data array
 >> offsets - offset for each data item
->> num - number of the data items
 >> value - value of the data items
+>> num - number of the data items
 */
 __global__
 void _KernelSetDataWithOffset(DTYPE * data, MTYPE * offsets, DTYPE value, MTYPE num)

@@ -487,8 +487,8 @@ void _KernelSetDataWithOffset(DTYPE * data, MTYPE * offsets, DTYPE value, MTYPE
 set the data with an array of offsets (cuda version)
 >> tensor - the tensor that keeps the data
 >> offsets - offset for each data item
->> num - number of the data items
 >> value - value of the data items
+>> num - number of the data items
 */
 void _CudaSetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYPE num)
 {

@@ -510,4 +510,51 @@ void _CudaSetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYPE num)
     BacktoCudaDev(tensor->devID, devIDBackup);
 }
 
+/*
+set the data with an array of offsets (kernel version)
+>> data - pointer to the data array
+>> offsets - offset for each data item
+>> value - value of the data items
+>> num - number of the data items
+>> dataType - the data type of the data and values
+*/
+__global__
+void _KernelSetDataWithOffset(void * data, MTYPE * offsets, void * values, MTYPE num, TENSOR_DATA_TYPE dataType)
+{
+    /* index */
+    int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (i < num) {
+        if (dataType == X_INT)
+            *((int *)data + offsets[i]) = *((int *)values + i);
+        else if (dataType == X_FLOAT)
+            *((float *)data + offsets[i]) = *((float *)values + i);
+    }
+}
+
+/*
+set the data with an array of values
+>> tensor - the tensor that keeps the data
+>> offsets - offset for each data item
+>> values - value of each data item
+>> num - number of the data items
+*/
+void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * values, MTYPE num)
+{
+    int gridSize[3];
+    int blockSize[3];
+
+    GDevs.GetCudaThread(tensor->devID, (int)num, gridSize, blockSize);
+
+    dim3 blocks(gridSize[0]);
+    dim3 threads(blockSize[0]);
+
+    int devIDBackup;
+    ProtectCudaDev(tensor->devID, devIDBackup);
+
+    _KernelSetDataWithOffset << <blocks, threads >> > (tensor->data, offsets, values, num, tensor->dataType);
+
+    BacktoCudaDev(tensor->devID, devIDBackup);
+}
+
 } // namespace nts(NiuTrans.Tensor)
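The host wrapper above asks GDevs.GetCudaThread for a launch configuration that covers num elements and then launches one thread per (offset, value) pair; the kernel's "if (i < num)" guard absorbs the overshoot. The usual arithmetic behind such a 1-D configuration is a ceiling division of the element count by the block size; a small sketch of that calculation (generic CUDA-launch arithmetic, not the project's GetCudaThread implementation):

    #include <cstdio>

    /* pick a 1-D grid so that blocks * threadsPerBlock >= num,
       relying on the kernel's "if (i < num)" guard for the last partial block */
    void pickLaunchConfig(long long num, int threadsPerBlock, long long & blocks) {
        blocks = (num + threadsPerBlock - 1) / threadsPerBlock;  /* ceiling division */
    }

    int main() {
        long long blocks = 0;
        pickLaunchConfig(1000000, 256, blocks);
        std::printf("blocks = %lld, threads per block = %d\n", blocks, 256);
        return 0;
    }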
source/tensor/core/getandset/SetData.cuh

@@ -52,6 +52,9 @@ void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper);
 /* set the data with an array of offsets */
 void _CudaSetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYPE num);
 
+/* set the data with an array of values */
+void _CudaSetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * value, MTYPE num);
+
 } // namespace nts(NiuTrans.Tensor)
 
 #endif // __SETDATA_CUH__
\ No newline at end of file
source/tensor/core/getandset/SetData.h

@@ -64,7 +64,7 @@ void _SetDataRandN(XTensor * tensor, DTYPE mean = 0.0F, DTYPE standardDeviation
 void _SetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYPE num);
 
 /* set the data with an array of values */
-void _SetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, DTYPE * values, MTYPE num);
+void _SetDataWithOffsetAndValue(XTensor * tensor, MTYPE * offsets, void * values, MTYPE num);
 
 } // namespace nts(NiuTrans.Tensor)
source/tensor/core/math/Unary.cu

@@ -26,7 +26,9 @@
 #include "Unary.h"
 #include "Unary.cuh"
 
-namespace nts {
+namespace nts { // namespace nts(NiuTrans.Tensor)
+
+#ifdef USE_CUDA
 __device__
 DTYPE cudasquare(DTYPE x)

@@ -114,4 +116,6 @@ SIMPLE_UNARY_FUNCTION_GPU(Sin, sin)
 SIMPLE_UNARY_FUNCTION_GPU(Cos, cos)
 SIMPLE_UNARY_FUNCTION_GPU(Tan, tan)
 
-}
-\ No newline at end of file
+#endif // USE_CUDA
+
+} // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
source/tensor/core/math/Unary.h

@@ -25,7 +25,7 @@
 #include "../../XTensor.h"
 
-namespace nts {
+namespace nts { // namespace nts(NiuTrans.Tensor)
 
 /* set every entry to its absolute value */
 void _Absolute(const XTensor * a, XTensor * b);
source/tensor/core/movement/Gather.cpp

@@ -70,6 +70,24 @@ void _Gather(XTensor * s, XTensor * t, XTensor * srcIndex)
         return;
     }
 #endif
 
+    int blockNum = 1;
+    int blockSize = 1;
+    int stride = 1;
+    int indexSize = 1;
+
+    stride = s->GetDim(-1);
+    indexSize = srcIndex->unitNum;
+
+    DTYPE * sData = (DTYPE*)s->data;
+    DTYPE * tData = (DTYPE*)t->data;
+    int * sIndexData = (int*)srcIndex->data;
+
+    for (int i = 0; i < indexSize; i++) {
+        int sIndex = sIndexData[i] * stride;
+        for (int j = 0; j < stride; j++)
+            tData[i * stride + j] = sData[sIndex + j];
+    }
 }
 
 /*
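The CPU path added here treats the source as blocks of stride elements (the size of the last dimension) and copies block srcIndex[i] of the source into block i of the target. A self-contained sketch of the same loop on plain arrays, with a tiny worked example (illustrative only, not part of the commit):

    #include <cstdio>

    /* copy row srcIndex[i] of a (rows x stride) source into row i of the target,
       mirroring the loop added to _Gather for the CPU case */
    void gatherRows(const float * src, float * tgt, const int * srcIndex,
                    int indexSize, int stride)
    {
        for (int i = 0; i < indexSize; i++) {
            int s = srcIndex[i] * stride;
            for (int j = 0; j < stride; j++)
                tgt[i * stride + j] = src[s + j];
        }
    }

    int main() {
        /* 3 x 3 source, gather rows 0 and 2 */
        float src[9] = {0, 0, 2,  2, 1, 3,  2, 2, 4};
        int index[2] = {0, 2};
        float tgt[6] = {0};
        gatherRows(src, tgt, index, 2, 3);
        for (int i = 0; i < 6; i++)
            std::printf("%g ", tgt[i]);   /* prints: 0 0 2 2 2 4 */
        std::printf("\n");
        return 0;
    }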
source/tensor/core/movement/Gather.cu

@@ -31,14 +31,14 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
 /*
 gather indexed sub-tensors(cuda version)
->> source - the data pointer of the source tensor
->> target - the data pointer of the target tensor
->> srcIndex - the index of the source tensor
+>> sData - the data pointer of the source tensor
+>> tData - the data pointer of the target tensor
+>> sIndex - the index of the source tensor
 >> indexSize - the size of the srcIndex
 >> stride - stride of a data block
 */
 __global__
-void KernelGather(DTYPE * source, DTYPE * target, int * srcIndex, int indexSize, int stride)
+void KernelGather(DTYPE * sData, DTYPE * tData, int * sIndex, int indexSize, int stride)
 {
     __shared__ DTYPE * sp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
     __shared__ DTYPE * cp[MAX_CUDA_THREAD_NUM_PER_BLOCK];

@@ -53,8 +53,8 @@ void KernelGather(DTYPE * source, DTYPE * target, int * srcIndex, int indexSize, int stride)
         return;
 
     if(threadIdx.y == 0){
-        sp[threadIdx.x] = source + srcIndex[i] * stride;
-        cp[threadIdx.x] = target + i * stride;
+        sp[threadIdx.x] = sData + sIndex[i] * stride;
+        cp[threadIdx.x] = tData + i * stride;
     }
 
     __syncthreads();

@@ -90,12 +90,12 @@ void _CudaGather(XTensor * s, XTensor * t, XTensor * srcIndex)
     dim3 blocks(cudaGrids[0], cudaGrids[1]);
     dim3 threads(cudaBlocks[0], cudaBlocks[1]);
 
-    DTYPE * source = (DTYPE*)s->data;
-    DTYPE * target = (DTYPE*)t->data;
+    DTYPE * sData = (DTYPE*)s->data;
+    DTYPE * tData = (DTYPE*)t->data;
 
-    int * si = (int *)srcIndex->data;
+    int * sIndex = (int *)srcIndex->data;
 
-    KernelGather<<<blocks, threads >>>(source, target, si, indexSize, stride);
+    KernelGather<<<blocks, threads >>>(sData, tData, sIndex, indexSize, stride);
 
     BacktoCudaDev(devID, devIDBackup);
 }
source/tensor/core/movement/Spread.cpp

@@ -143,31 +143,27 @@ And this is a special spread function for backward computation of gather function
        we have 4 sub-tensors of size (3, 2)
 >> srcIndex - index of the source sub-tensors
 >> indexSize - length of srcIndex (and collIndex)
->> collIndex - index of the gathered sub-tensors
 */
 void _SpreadForGather(XTensor * source, XTensor * collection, int dim,
-                      int * srcIndex, int indexSize, int * collIndex)
+                      int * srcIndex, int indexSize)
 {
     int order = source->order;
 
     CheckNTErrors(source->dataType == DEFAULT_DTYPE, "TODO!");
     CheckNTErrors(dim >= 0 && dim < order, "Illegal dimension!");
 
-    for(int i = 0; i < order; i++){
-        if(i < dim){
-            CheckNTErrors(collection->GetDim(i) == source->GetDim(i), "Illegal dimension!");
+    for(int i = 0; i < order; i++){
+        if(i == dim){
+            CheckNTErrors(collection->GetDim(i) == indexSize, "Illegal dimension!");
         }
-        else if(i > dim){
+        else {
             CheckNTErrors(collection->GetDim(i) == source->GetDim(i), "Illegal dimension!");
         }
-        else {
-            CheckNTErrors(collection->GetDim(i) == indexSize, "Illegal dimension!");
-        }
     }
 
 #ifdef USE_CUDA
     if(source->devID >= 0 && collection->devID >= 0) {
-        _CudaSpreadForGather(source, collection, dim, srcIndex, indexSize, collIndex);
+        _CudaSpreadForGather(source, collection, dim, srcIndex, indexSize);
         return;
     }
 #endif

@@ -190,7 +186,7 @@ void _SpreadForGather(XTensor * source, XTensor * collection, int dim,
     for(int i = 0; i < indexSize; i++){
         int src = srcIndex[i];
-        int tgt = collIndex[i];
+        int tgt = i;
         DTYPE * s = sData + src * stride;
         DTYPE * c = cData + tgt * stride;
         _AssignmentForGather(s, c, blockNum, blockSizeSrc, blockSizeColl, stride);

@@ -203,12 +199,7 @@ And this is a special spread function for backward computation of gather function
 >> source - the source tensor whose data would be modified
 >> collection - the collection whose data would be spread to source tensor
->> dim - the leading dimension to define "sub-tensors"
-       e.g., for a tensor of size (3, 2, 4) and dim = 2,
-       we have 4 sub-tensors of size (3, 2)
->> srcIndex - index of the source sub-tensors
->> indexSize - length of srcIndex (and collIndex)
->> collIndex - index of the gathered sub-tensors
+>> index - the tensor to save the index of the collection tensor
 */
 void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index)
 {

@@ -218,10 +209,10 @@ void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index)
     CheckNTErrors(source->dataType == DEFAULT_DTYPE, "TODO!");
 
     for(int i = 0; i < order; i++){
-        if(i < dim){
-            CheckNTErrors(collection->GetDim(i) == source->GetDim(i), "Illegal dimension!");
+        if(i == dim){
+            CheckNTErrors(collection->GetDim(i) == index->unitNum, "Illegal dimension!");
         }
-        else if(i > dim){
+        else {
             CheckNTErrors(collection->GetDim(i) == source->GetDim(i), "Illegal dimension!");
         }
     }

@@ -232,7 +223,24 @@ void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index)
         return;
     }
 #endif
 
+    int blockNum = 1;
+    int blockSize = 1;
+    int stride = 1;
+    int indexSize = 1;
+
+    stride = source->GetDim(-1);
+    indexSize = index->unitNum;
+
+    DTYPE * sData = (DTYPE*)source->data;
+    DTYPE * cData = (DTYPE*)collection->data;
+    int * sIndexData = (int*)index->data;
+
+    for (int i = 0; i < indexSize; i++) {
+        int sIndex = sIndexData[i] * stride;
+        for (int j = 0; j < stride; j++)
+            sData[sIndex + j] += cData[i * stride + j];
+    }
 }
 
 } // namespace nts(NiuTrans.Tensor)
\ No newline at end of file
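The new CPU branch of _SpreadForGather is the reverse of the gather loop above: block i of the collection is accumulated (with +=, not =) into block index[i] of the source, which is what the backward pass of gather needs. This also explains why the collIndex argument could be dropped throughout this commit: after a gather, collection row i always came from source row srcIndex[i], so the collection index is simply i (the "int tgt = i;" change). A self-contained sketch using the same numbers as TestSpread2 below (illustrative only, not part of the commit):

    #include <cstdio>

    /* add row i of a (indexSize x stride) collection into row index[i] of the source,
       mirroring the CPU loop added to _SpreadForGather */
    void spreadForGatherRows(float * src, const float * coll, const int * index,
                             int indexSize, int stride)
    {
        for (int i = 0; i < indexSize; i++) {
            int s = index[i] * stride;
            for (int j = 0; j < stride; j++)
                src[s + j] += coll[i * stride + j];
        }
    }

    int main() {
        /* the 3 x 3 source, 2 x 3 collection and index {0, 2} used by TestSpread2 */
        float src[9]  = {0, 0, 2,  2, 1, 3,  2, 2, 4};
        float coll[6] = {0, -1, 2,  1, 2, 0};
        int index[2]  = {0, 2};
        spreadForGatherRows(src, coll, index, 2, 3);
        for (int i = 0; i < 9; i++)
            std::printf("%g ", src[i]);   /* prints: 0 -1 4 2 1 3 3 4 4 */
        std::printf("\n");
        return 0;
    }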
source/tensor/core/movement/Spread.cu

@@ -29,6 +29,8 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
 
 #ifdef USE_CUDA
 
 /*
 This is core assignment for spread function.

@@ -240,54 +242,6 @@ Care of the operator "+=" instead of "=".
 >> stride - stride of a data block
 >> subtensorNum - number of sub-tensors
 >> srcIndex - index of the source sub-tensor
->> colIndex - index of the sub-tensor in the collection tensor
-*/
-__global__
-void KernelSpreadForGatherFuzed(DTYPE * sData, DTYPE * cData, int blockNum,
-                                int blockSizeSrc, int blockSizeColl, int stride,
-                                int subtensorNum,
-                                int * srcIndex, int * colIndex)
-{
-    __shared__ DTYPE * sp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
-    __shared__ DTYPE * cp[MAX_CUDA_THREAD_NUM_PER_BLOCK];
-
-    /* block id */
-    int i = blockDim.x * blockIdx.x + threadIdx.x;
-
-    /* offset in each block */
-    int offset = blockDim.y * blockIdx.y + threadIdx.y;
-
-    int blockId = i % blockNum;
-    int subtensorId = i / blockNum;
-
-    if(subtensorId >= subtensorNum || offset >= stride)
-        return;
-
-    if(threadIdx.y == 0){
-        sp[threadIdx.x] = sData + srcIndex[subtensorId] * stride;
-        cp[threadIdx.x] = cData + colIndex[subtensorId] * stride;
-    }
-
-    __syncthreads();
-
-    DTYPE * s = sp[threadIdx.x] + blockSizeSrc * blockId;
-    DTYPE * c = cp[threadIdx.x] + blockSizeColl * blockId;
-
-    s[offset] += c[offset];
-}
-
-/*
-This is core assignment for backward computation of gather function.
-Care of the operator "+=" instead of "=".
-
->> sData - the data pointer of the source tensor
->> cData - the data pointer of collection tensor
->> blockNum - number of data blocks
->> blockSizeSrc - size of source data block
->> blockSizeColl - size of source data block
->> stride - stride of a data block
->> subtensorNum - number of sub-tensors
->> srcIndex - index of the source sub-tensor
 */
 __global__
 void KernelSpreadForGatherFuzed(DTYPE * sData, DTYPE * cData, int blockNum,

@@ -334,10 +288,9 @@ And this is a special spread function for backward computation of gather function
        we have 4 sub-tensors of size (3, 2)
 >> srcIndex - index of the source sub-tensors
 >> indexSize - length of srcIndex (and collIndex)
->> collIndex - index of the gathered sub-tensors
 */
 void _CudaSpreadForGather(XTensor * source, XTensor * collection, int dim,
-                          int * srcIndex, int indexSize, int * collIndex)
+                          int * srcIndex, int indexSize)
 {
     int order = source->order;

@@ -372,7 +325,7 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, int dim,
         DTYPE * cData = (DTYPE*)collection->data;
         for(int i = 0; i < indexSize; i++) {
             int src = srcIndex[i];
-            int tgt = collIndex[i];
+            int tgt = i;
             DTYPE * s = sData + src * stride;
             DTYPE * c = cData + tgt * stride;

@@ -384,12 +337,8 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, int dim,
         int * si = mem != NULL ?
                    (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
                    (int*)XMemAlloc(source->devID, sizeof(int) * indexSize);
-        int * ci = mem != NULL ?
-                   (int*)mem->AllocBuf(mem->devID, sizeof(int) * indexSize) :
-                   (int*)XMemAlloc(collection->devID, sizeof(int) * indexSize);
 
         XMemCopy(si, source->devID, srcIndex, -1, sizeof(int) * indexSize);
-        XMemCopy(ci, collection->devID, collIndex, -1, sizeof(int) * indexSize);
 
         DTYPE * s = (DTYPE*)source->data;
         DTYPE * c = (DTYPE*)collection->data;

@@ -399,15 +348,13 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, int dim,
         dim3 blocks(cudaGrids[0], cudaGrids[1]);
         dim3 threads(cudaBlocks[0], cudaBlocks[1]);
 
-        KernelSpreadForGatherFuzed<<<blocks, threads >>>(s, c, blockNum, blockSizeSrc, blockSizeColl, stride, indexSize, si, ci);
+        KernelSpreadForGatherFuzed<<<blocks, threads >>>(s, c, blockNum, blockSizeSrc, blockSizeColl, stride, indexSize, si);
 
         if (mem != NULL) {
             mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
-            mem->ReleaseBuf(mem->devID, sizeof(int) * indexSize);
         }
         else {
             XMemFree(source->devID, si);
-            XMemFree(collection->devID, ci);
         }
     }
 }

@@ -454,6 +401,8 @@ void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcIndex)
     BacktoCudaDev(source->devID, devIDBackup);
 }
 
 #endif // USE_CUDA
 
 } // namespace nts(NiuTrans.Tensor)
 
 #endif // __SPREAD_CUH__
\ No newline at end of file
source/tensor/core/movement/Spread.cuh

@@ -26,17 +26,21 @@
 namespace nts { // namespace nts(NiuTrans.Tensor)
 
 #ifdef USE_CUDA
 
 /* spread a collection tensor to source tensor (cuda version) */
 void _CudaSpread(XTensor * source, XTensor * collection, int dim,
                  int * srcIndex, int indexSize, int * collIndex);
 
 /* special spread function for backward computation of gather function (cuda version) */
 void _CudaSpreadForGather(XTensor * source, XTensor * collection, int dim,
-                          int * srcIndex, int indexSize, int * collIndex);
+                          int * srcIndex, int indexSize);
 
+/* special spread function for backward computation of gather function (cuda version) */
+void _CudaSpreadForGather(XTensor * source, XTensor * collection, XTensor * srcIndex);
+
 #endif // USE_CUDA
 
 } // namespace nts(NiuTrans.Tensor)
 
 #endif // __SPREAD_CUH__
\ No newline at end of file
source/tensor/core/movement/Spread.h

@@ -32,12 +32,13 @@ void _Spread(XTensor * source, XTensor * collection, int dim,
 /* spread a collection tensor to source tensor (return a XTensor structure)
    make a new tensor to keep the result and return it */
-void Spread(XTensor * source, XTensor * collection, int dim,
-            int * srcIndex, int indexSize, int * collIndex);
+void Spread(XTensor * source, XTensor * collection,
+            XTensor * srcIndex, XTensor * collIndex, int dim);
 
 /* special spread function for backward computation of gather function */
 void _SpreadForGather(XTensor * source, XTensor * collection, int dim,
-                      int * srcIndex, int indexSize, int * collIndex);
+                      int * srcIndex, int indexSize);
 
+/* special spread function for backward computation of gather function */
+void _SpreadForGather(XTensor * source, XTensor * collection, XTensor * index);
source/tensor/test/TGather.cpp

@@ -202,7 +202,7 @@ bool TestGather2()
     _Gather(sGPU, tGPU, dim, srcIndex, indexSize);
 
     /* check results */
-    gpuTest = tGPU->CheckData(answer, tUnitNum) && tUserGPU.CheckData(answer, tUnitNum);
+    gpuTest = tGPU->CheckData(answer, tUnitNum);
 
     /* destroy variables */
     delete s;

@@ -287,11 +287,11 @@ bool TestGather3()
     /* call Gather function */
     _Gather(s, t, dim, srcIndex, indexSize);
-    //tUser = Gather(*s, *index);
+    tUser = Gather(*s, *index);
 
     /* check results */
-    cpuTest = t->CheckData(answer, tUnitNum);
-    //tUser2.CheckData(answer, tUnitNum);
+    cpuTest = t->CheckData(answer, tUnitNum) && tUser.CheckData(answer, tUnitNum);
 
 #ifdef USE_CUDA
     /* GPU test */

@@ -325,6 +325,7 @@ bool TestGather3()
     delete indexGPU;
     delete[] sDimSize;
     delete[] tDimSize;
+    delete[] indexDimSize;
 
     return cpuTest && gpuTest;
 #else

@@ -333,6 +334,7 @@ bool TestGather3()
     delete t;
     delete[] sDimSize;
     delete[] tDimSize;
+    delete[] indexDimSize;
 
     return cpuTest;
 #endif // USE_CUDA
source/tensor/test/TSpread.cpp

@@ -133,6 +133,130 @@ bool TestSpread1()
 #endif // USE_CUDA
 }
 
+/*
+case 2: test _SpreadForGather function
+spread a collection tensor to source tensor
+*/
+bool TestSpread2()
+{
+    /* a input tensor of size (3, 3) */
+    int sOrder = 2;
+    int * sDimSize = new int[sOrder];
+    sDimSize[0] = 3;
+    sDimSize[1] = 3;
+
+    int sUnitNum = 1;
+    for (int i = 0; i < sOrder; i++)
+        sUnitNum *= sDimSize[i];
+
+    /* a output tensor of size (2, 3) */
+    int tOrder = 2;
+    int * tDimSize = new int[tOrder];
+    tDimSize[0] = 2;
+    tDimSize[1] = 3;
+
+    int tUnitNum = 1;
+    for (int i = 0; i < tOrder; i++)
+        tUnitNum *= tDimSize[i];
+
+    /* a index tensor of size (2) */
+    int indexOrder = 1;
+    int * indexDimSize = new int[indexOrder];
+    indexDimSize[0] = 2;
+
+    int indexUnitNum = 1;
+    for (int i = 0; i < indexOrder; i++)
+        indexUnitNum *= indexDimSize[i];
+
+    DTYPE sData[3][3] = { {0.0F, 0.0F, 2.0F},
+                          {2.0F, 1.0F, 3.0F},
+                          {2.0F, 2.0F, 4.0F} };
+
+    DTYPE tData[2][3] = { {0.0F, -1.0F, 2.0F},
+                          {1.0F, 2.0F, 0.0F} };
+
+    DTYPE answer[3][3] = { {0.0F, -1.0F, 4.0F},
+                           {2.0F, 1.0F, 3.0F},
+                           {3.0F, 4.0F, 4.0F} };
+
+    int dim = 0;
+    int indexSize = 2;
+    int srcIndex[2] = {0, 2};
+
+    /* CPU test */
+    bool cpuTest = true;
+
+    /* create tensors */
+    XTensor * s1 = NewTensor(sOrder, sDimSize);
+    XTensor * s2 = NewTensor(sOrder, sDimSize);
+    XTensor * t = NewTensor(tOrder, tDimSize);
+    XTensor * index = NewTensor(indexOrder, indexDimSize, X_INT);
+
+    /* initialize variables */
+    s1->SetData(sData, sUnitNum);
+    s2->SetData(sData, sUnitNum);
+    t->SetData(tData, tUnitNum);
+    index->SetData(srcIndex, indexSize);
+
+    /* call _SpreadForGather function */
+    _SpreadForGather(s1, t, dim, srcIndex, indexSize);
+    _SpreadForGather(s2, t, index);
+
+    /* check results */
+    cpuTest = s1->CheckData(answer, tUnitNum) &&
+              s2->CheckData(answer, tUnitNum);
+
+#ifdef USE_CUDA
+    /* GPU test */
+    bool gpuTest = true;
+
+    /* create tensors */
+    XTensor * sGPU1 = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * sGPU2 = NewTensor(sOrder, sDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * tGPU = NewTensor(sOrder, tDimSize, X_FLOAT, 1.0F, 0);
+    XTensor * indexGPU = NewTensor(indexOrder, indexDimSize, X_INT, 1.0F, 0);
+
+    /* initialize variables */
+    sGPU1->SetData(sData, sUnitNum);
+    sGPU2->SetData(sData, sUnitNum);
+    tGPU->SetData(tData, tUnitNum);
+    indexGPU->SetData(srcIndex, indexSize);
+
+    /* call _SpreadForGather function */
+    _SpreadForGather(sGPU1, tGPU, dim, srcIndex, indexSize);
+    _SpreadForGather(sGPU2, tGPU, indexGPU);
+
+    /* check results */
+    gpuTest = sGPU1->CheckData(answer, tUnitNum) &&
+              sGPU2->CheckData(answer, tUnitNum);
+
+    /* destroy variables */
+    delete s1;
+    delete s2;
+    delete t;
+    delete index;
+    delete sGPU1;
+    delete sGPU2;
+    delete tGPU;
+    delete indexGPU;
+    delete[] sDimSize;
+    delete[] tDimSize;
+    delete[] indexDimSize;
+
+    return cpuTest && gpuTest;
+#else
+    /* destroy variables */
+    delete s1;
+    delete s2;
+    delete t;
+    delete[] sDimSize;
+    delete[] tDimSize;
+    delete[] indexDimSize;
+
+    return cpuTest;
+#endif // USE_CUDA
+}
+
 /* other cases */
 /*
 TODO!!