Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
N
NiuTrans.Tensor
概览
Overview
Details
Activity
Cycle Analytics
版本库
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
问题
8
Issues
8
列表
Board
标记
里程碑
合并请求
0
Merge Requests
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
Snippets
成员
Collapse sidebar
Close sidebar
活动
图像
聊天
创建新问题
作业
提交
Issue Boards
Open sidebar
NiuTrans
NiuTrans.Tensor
Commits
a26caf40
Commit
a26caf40
authored
Jul 30, 2018
by
xiaotong
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
better code of SumDim
parent
daa2f801
隐藏空白字符变更
内嵌
并排
正在显示
12 个修改的文件
包含
361 行增加
和
46 行删除
+361
-46
source/network/Main.cpp
+2
-2
source/network/XBackwardMath.cpp
+86
-0
source/network/XBackwardMath.h
+5
-0
source/sample/fnnlm/FNNLM.cpp
+3
-7
source/tensor/XName.cpp
+2
-0
source/tensor/XTensor.cpp
+30
-7
source/tensor/XTensor.h
+6
-3
source/tensor/core/arithmetic/Sum.cpp
+49
-5
source/tensor/core/arithmetic/SumDim.cpp
+1
-1
source/tensor/core/arithmetic/SumDim.cu
+7
-2
source/tensor/core/movement/CopyIndexed.cpp
+12
-6
source/tensor/core/shape/Unsqueeze.cu
+158
-13
没有找到文件。
source/network/Main.cpp
查看文件 @
a26caf40
...
...
@@ -41,8 +41,8 @@ int main( int argc, const char ** argv )
//TransposeTest();
//return 0;
SumDimTest
();
return
0
;
//
SumDimTest();
//
return 0;
if
(
argc
>
1
&&
!
strcmp
(
argv
[
1
],
"-test"
))
1
;
//Test();
...
...
source/network/XBackwardMath.cpp
查看文件 @
a26caf40
...
...
@@ -37,6 +37,8 @@ void XMathGrad::MakeGrad(XTensor * node)
if
(
operID
==
MATH_SUM
)
GradSum
(
node
);
else
if
(
operID
==
MATH_SUMDIM
)
GradSumDim
(
node
);
else
if
(
operID
==
MATH_MULTIPLY
)
GradMultiply
(
node
);
else
if
(
operID
==
MATH_MATRIXMUL
)
...
...
@@ -80,6 +82,90 @@ void XMathGrad::GradSum(XTensor * node)
}
/*
gradient for sum with one dimension
c = a + b * \beta
where the size of b is equal to dimension n of a, i.e., |b| = a.dimSize[n]
dE/da = dE/dc
dE/db = dE/dc * b.reduce(0,...,n-1,n+1,...) * \beta
*/
void
XMathGrad
::
GradSumDim
(
XTensor
*
node
)
{
XLink
&
income
=
node
->
income
;
CheckNTErrors
(
income
.
tailNum
==
2
,
"Wrong input tensor number for SUM!"
);
XTensor
*
a
=
income
.
tails
[
0
];
XTensor
*
b
=
income
.
tails
[
1
];
int
n
=
income
.
GetParamInt
(
0
);
DTYPE
beta
=
income
.
GetParam
(
1
);
XNoder
::
MakeGrad
(
a
);
XNoder
::
MakeGrad
(
b
);
_Sum
(
a
->
grad
,
node
->
grad
,
a
->
grad
);
int
order
=
a
->
order
;
int
dimSize
[
MAX_TENSOR_DIM_NUM
];
memcpy
(
dimSize
,
a
->
dimSize
,
sizeof
(
int
)
*
a
->
order
);
if
(
n
==
order
-
1
){
int
reshapedSize
[
MAX_TENSOR_DIM_NUM
];
reshapedSize
[
0
]
=
a
->
unitNum
/
dimSize
[
order
-
1
];
reshapedSize
[
1
]
=
dimSize
[
order
-
1
];
/* we reshape dE/dc to a matrix whose column number is equal to the
size of b. Then we can reduce the matrix into a row vector. */
node
->
grad
->
Reshape
(
2
,
reshapedSize
);
if
(
b
->
outgo
.
tailNum
>
1
){
XTensor
*
bGradTMP
=
NewTensorBuf
(
b
->
grad
,
b
->
devID
,
b
->
mem
);
_ReduceSum
(
node
->
grad
,
bGradTMP
,
0
);
_Sum
(
bGradTMP
,
b
->
grad
,
b
->
grad
);
DelTensorBuf
(
bGradTMP
);
}
else
_ReduceSum
(
node
->
grad
,
b
->
grad
,
0
);
node
->
grad
->
Reshape
(
order
,
dimSize
);
}
else
{
int
reshapedSize
[
MAX_TENSOR_DIM_NUM
];
reshapedSize
[
0
]
=
1
;
reshapedSize
[
1
]
=
dimSize
[
n
];
reshapedSize
[
2
]
=
1
;
for
(
int
i
=
0
;
i
<
order
;
i
++
){
if
(
i
<
n
)
reshapedSize
[
0
]
*=
dimSize
[
i
];
}
reshapedSize
[
2
]
=
a
->
unitNum
/
(
reshapedSize
[
0
]
*
reshapedSize
[
1
]);
/* we reshape dE/dc to a 3D tensor of size (x, y, z) where y = |b|.
Then reduce along with z and x to obtain dE/db. */
node
->
grad
->
Reshape
(
3
,
reshapedSize
);
XTensor
*
interGrad
=
NewTensorBuf
(
2
,
reshapedSize
,
b
->
devID
,
b
->
mem
,
b
->
dataType
,
b
->
denseRatio
);
_ReduceSum
(
node
->
grad
,
interGrad
,
2
);
if
(
b
->
outgo
.
tailNum
>
1
){
XTensor
*
bGradTMP
=
NewTensorBuf
(
b
->
grad
,
b
->
devID
,
b
->
mem
);
_ReduceSum
(
interGrad
,
bGradTMP
,
0
);
_Sum
(
bGradTMP
,
b
->
grad
,
b
->
grad
);
DelTensorBuf
(
bGradTMP
);
}
else
_ReduceSum
(
interGrad
,
b
->
grad
,
0
);
node
->
grad
->
Reshape
(
order
,
dimSize
);
DelTensorBuf
(
interGrad
);
}
node
->
visitMark
=
NODE_FINISHED
;
}
/*
gradient for multiply (dot production)
for
c = a * b
...
...
source/network/XBackwardMath.h
查看文件 @
a26caf40
...
...
@@ -44,6 +44,11 @@ private:
static
void
GradSum
(
XTensor
*
node
);
/* gradient for sum with one dimension: c = a + b * \beta
where the size of b is equal to that of one dimension of a */
static
void
GradSumDim
(
XTensor
*
node
);
/* gradient for multiply (dot production): c = a * b */
static
void
GradMultiply
(
XTensor
*
node
);
...
...
source/sample/fnnlm/FNNLM.cpp
查看文件 @
a26caf40
...
...
@@ -999,15 +999,11 @@ void ForwardAutoDiff(XTensor inputs[], XTensor &output, FNNModel &model)
hidden
=
Merge
(
hidden
,
2
,
0
);
/* hidden layers */
for
(
int
i
=
0
;
i
<
depth
;
i
++
){
b
=
Unsqueeze
(
model
.
hiddenB
[
i
],
0
,
batchSize
);
hidden
=
MMul
(
hidden
,
model
.
hiddenW
[
i
])
+
b
;
}
b
=
Unsqueeze
(
model
.
outputB
,
0
,
batchSize
);
for
(
int
i
=
0
;
i
<
depth
;
i
++
)
hidden
=
MMul
(
hidden
,
model
.
hiddenW
[
i
])
+
model
.
hiddenB
[
i
];
/* output layer */
output
=
LogSoftmax
(
MMul
(
hidden
,
model
.
outputW
)
+
b
,
1
);
output
=
LogSoftmax
(
MMul
(
hidden
,
model
.
outputW
)
+
model
.
outputB
,
1
);
//XLink::ShowNetwork(stderr, &output);
}
...
...
source/tensor/XName.cpp
查看文件 @
a26caf40
...
...
@@ -41,6 +41,8 @@ const char * GetOPName(int type)
return
"M_SIGN"
;
else
if
(
type
==
MATH_SUM
)
return
"M_SUM"
;
else
if
(
type
==
MATH_SUMDIM
)
return
"M_SUMDIM"
;
else
if
(
type
==
MATH_LOG
)
return
"M_LOG"
;
else
if
(
type
==
MATH_NORMALIZE
)
...
...
source/tensor/XTensor.cpp
查看文件 @
a26caf40
...
...
@@ -1885,12 +1885,13 @@ generate a XTensor which allocates data on the buffer
>> myDimSize - the size of each dimension
>> myMem - memory pool used to allocating the data array.
we actually allocate the data on the buffer associated with
the memory pool.
the memory pool
>> devID - device id
>> myDataType - unit size (e.g., int, float, and double)
>> myDenseRatio - how often an element has non-zero value
*/
XTensor
*
NewTensorBuf
(
const
int
myOrder
,
const
int
*
myDimSize
,
XMem
*
myMem
,
XTensor
*
NewTensorBuf
(
const
int
myOrder
,
const
int
*
myDimSize
,
int
devID
,
XMem
*
myMem
,
const
TENSOR_DATA_TYPE
myDataType
,
const
float
myDenseRatio
)
{
CheckNTErrors
(
myMem
!=
NULL
,
"No memory pool specified!"
);
...
...
@@ -1901,12 +1902,31 @@ XTensor * NewTensorBuf(const int myOrder, const int * myDimSize, XMem * myMem,
dims
[
0
]
=
-
abs
(
dims
[
0
]);
XTensor
*
tensor
=
NewTensor
(
myOrder
,
dims
,
myDataType
,
myDenseRatio
,
-
1
,
myMem
);
tensor
->
data
=
myMem
->
AllocBuf
(
myMem
->
devID
,
tensor
->
unitNum
*
tensor
->
unitSize
);
if
(
myMem
!=
NULL
)
tensor
->
data
=
myMem
->
AllocBuf
(
myMem
->
devID
,
tensor
->
unitNum
*
tensor
->
unitSize
);
else
tensor
->
data
=
XMemAlloc
(
devID
,
tensor
->
unitNum
*
tensor
->
unitSize
);
return
tensor
;
}
/*
generate a XTensor which allocates data on the buffer
>> reference - reference tensor
>> devID - device id
>> myMem - memory pool used to allocating the data array.
we actually allocate the data on the buffer associated with
the memory pool
*/
XTensor
*
NewTensorBuf
(
const
XTensor
*
reference
,
int
devID
,
XMem
*
myMem
)
{
return
NewTensorBuf
(
reference
->
order
,
reference
->
dimSize
,
devID
,
myMem
,
reference
->
dataType
,
reference
->
denseRatio
);
}
/*
generate a dense vector
>> num - number of entries
>> myDataType - unit size (e.g., int, float, and double)
...
...
@@ -2056,7 +2076,7 @@ XTensor * NewTensor(XTensor * a, bool isFilledData)
free the data space of a given tensor
>> tensor - pointer to the tensor
*/
void
DelTensor
(
const
XTensor
*
tensor
)
void
DelTensor
(
XTensor
*
tensor
)
{
delete
tensor
;
}
...
...
@@ -2065,10 +2085,13 @@ void DelTensor(const XTensor * tensor)
free the data space of a given tensor (on the buffer)
>> tensor - pointer to the tensor
*/
void
DelTensorBuf
(
const
XTensor
*
tensor
)
void
DelTensorBuf
(
XTensor
*
tensor
)
{
CheckNTErrors
(
tensor
->
mem
!=
NULL
,
"No memory pool found!"
);
tensor
->
mem
->
ReleaseBuf
(
tensor
->
devID
,
tensor
->
unitNum
*
tensor
->
unitSize
);
if
(
tensor
->
mem
!=
NULL
)
tensor
->
mem
->
ReleaseBuf
(
tensor
->
devID
,
tensor
->
unitNum
*
tensor
->
unitSize
);
else
XMemFree
(
tensor
->
devID
,
tensor
->
data
);
tensor
->
data
=
NULL
;
delete
tensor
;
}
...
...
source/tensor/XTensor.h
查看文件 @
a26caf40
...
...
@@ -391,9 +391,12 @@ XTensor * NewTensor(const int myOrder, const int * myDimSize, const TENSOR_DATA_
const
float
myDenseRatio
=
1
.
0
F
,
const
int
myDevID
=
-
1
,
XMem
*
myMem
=
NULL
);
/* generate a XTensor which allocates data on the buffer */
XTensor
*
NewTensorBuf
(
const
int
myOrder
,
const
int
*
myDimSize
,
XMem
*
myMem
,
XTensor
*
NewTensorBuf
(
const
int
myOrder
,
const
int
*
myDimSize
,
int
devID
,
XMem
*
myMem
,
const
TENSOR_DATA_TYPE
myDataType
=
X_FLOAT
,
const
float
myDenseRatio
=
1
.
0
F
);
/* generate a XTensor which allocates data on the buffer */
XTensor
*
NewTensorBuf
(
const
XTensor
*
reference
,
int
devID
,
XMem
*
myMem
);
/* generate a dense vector */
XTensor
*
NewTensor1D
(
const
int
num
,
const
TENSOR_DATA_TYPE
myDataType
=
X_FLOAT
,
const
int
myDevID
=
-
1
,
XMem
*
myMem
=
NULL
);
...
...
@@ -422,10 +425,10 @@ XTensor * NewTensor5D(const int d0, const int d1, const int d2, const int d3, co
XTensor
*
NewTensor
(
XTensor
*
a
,
bool
isFilledData
=
true
);
/* free the data space of a given tensor */
void
DelTensor
(
const
XTensor
*
tensor
);
void
DelTensor
(
XTensor
*
tensor
);
/* free the data space of a given tensor (on the buffer) */
void
DelTensorBuf
(
const
XTensor
*
tensor
);
void
DelTensorBuf
(
XTensor
*
tensor
);
}
/* end of the nts (NiuTrans.Tensor) namespace */
...
...
source/tensor/core/arithmetic/Sum.cpp
查看文件 @
a26caf40
...
...
@@ -24,6 +24,7 @@
#include "../../XUtility.h"
#include "Sum.h"
#include "Sum.cuh"
#include "SumDim.h"
namespace
nts
{
// namespace nts(NiuTrans.Tensor)
...
...
@@ -123,6 +124,33 @@ void _SumMe(XTensor * a, const XTensor * b, DTYPE beta)
{
_Sum
(
a
,
b
,
a
,
beta
);
}
/*
return a dimension if the sum is performed as SumDim (in more details in SumDim.h
>> a - a tensor
>> b - another tensor for sum
*/
int
GetSumDimIndex
(
const
XTensor
&
a
,
const
XTensor
&
b
)
{
if
(
a
.
order
<
b
.
order
)
return
-
1
;
int
hitCount
=
0
;
int
hitDim
=
-
1
;
for
(
int
i
=
0
;
i
<
b
.
order
;
i
++
){
if
(
b
.
dimSize
[
b
.
order
-
1
-
i
]
==
1
)
continue
;
else
if
(
b
.
dimSize
[
b
.
order
-
1
-
i
]
==
a
.
dimSize
[
a
.
order
-
1
-
i
]){
hitCount
++
;
hitDim
=
a
.
order
-
b
.
order
+
i
;
}
}
if
(
hitCount
==
1
)
return
hitDim
;
else
return
-
1
;
}
/*
tensor summation c = a + b * \beta (return a XTensor structure)
...
...
@@ -138,12 +166,28 @@ XTensor Sum(const XTensor &a, const XTensor &b, DTYPE beta)
XTensor
c
(
&
a
);
c
.
SetTMP
();
/* call _Sum function */
_Sum
(
&
a
,
&
b
,
&
c
,
beta
);
int
n
=
GetSumDimIndex
(
a
,
b
);
if
(
n
==
-
1
){
/* call _Sum function */
_Sum
(
&
a
,
&
b
,
&
c
,
beta
);
/* tensor connections */
XLink
::
MakeLink
(
&
a
,
&
b
,
&
c
,
MATH_SUM
);
XLink
::
AddParamToHead
(
&
c
,
beta
);
/* tensor connections */
XLink
::
MakeLink
(
&
a
,
&
b
,
&
c
,
MATH_SUM
);
XLink
::
AddParamToHead
(
&
c
,
beta
);
}
else
if
(
n
>=
0
&&
n
<
a
.
order
){
/* call _Sum function */
_SumDim
(
&
a
,
&
b
,
&
c
,
n
,
beta
);
/* tensor connections */
XLink
::
MakeLink
(
&
a
,
&
b
,
&
c
,
MATH_SUMDIM
);
XLink
::
AddParamToHeadInt
(
&
c
,
n
);
XLink
::
AddParamToHead
(
&
c
,
beta
);
}
else
{
ShowNTErrors
(
"Something is wrong!"
);
}
return
c
;
}
...
...
source/tensor/core/arithmetic/SumDim.cpp
查看文件 @
a26caf40
...
...
@@ -151,7 +151,7 @@ XTensor SumDim(const XTensor &a, const XTensor &b, int n, DTYPE beta)
c
.
SetTMP
();
/* call _Sum function */
_Sum
(
&
a
,
&
b
,
&
c
,
beta
);
_Sum
Dim
(
&
a
,
&
b
,
&
c
,
n
,
beta
);
/* tensor connections */
XLink
::
MakeLink
(
&
a
,
&
b
,
&
c
,
MATH_SUMDIM
);
...
...
source/tensor/core/arithmetic/SumDim.cu
查看文件 @
a26caf40
...
...
@@ -49,7 +49,7 @@ void KernelAddWithRow(T * a, T * b, T * c, int rowNum, int colNum, T beta)
return;
if(threadIdx.y == 0)
bv[threadIdx.x] =
b[col];
bv[threadIdx.x] = b[col];
__syncthreads();
...
...
@@ -89,7 +89,7 @@ void KernelAddWithCol(T * a, T * b, T * c, int rowNum, int colNum, int blockSize
return;
if(threadIdx.x == 0)
bv[threadIdx.y] =
b[row];
bv[threadIdx.y] = b[row];
__syncthreads();
...
...
@@ -138,6 +138,9 @@ void _CudaSumDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE
int cudaGrids[3];
int cudaBlocks[3];
int devIDBackup = 0;
ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == DEFAULT_DTYPE){
if(stride > 1){
GDevs.GetCudaThread2D(a->devID, stride * blockNum, blockSize, MAX_INT, cudaGrids, cudaBlocks);
...
...
@@ -168,6 +171,8 @@ void _CudaSumDim(const XTensor * a, const XTensor * b, XTensor * c, int n, DTYPE
else {
ShowNTErrors("TODO!");
}
BacktoCudaDev(a->devID, devIDBackup);
}
#endif
...
...
source/tensor/core/movement/CopyIndexed.cpp
查看文件 @
a26caf40
...
...
@@ -37,8 +37,8 @@ copy indexed sub-tensors
>> indexSize - length of srcIndex (and tgtIndex)
>> tgtIndex - index of the target sub-tensors
>> copyNum - number of the sub-tensors we copy for each source index,
e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5
e.g., for srcIndex = [1,4] and copyNum = 2,
we actually copy the source sub-tensors 1, 2, 4, 5
*/
void
_CopyIndexed
(
const
XTensor
*
s
,
XTensor
*
t
,
int
dim
,
int
*
srcIndex
,
int
indexSize
,
int
*
tgtIndex
,
int
copyNum
)
{
...
...
@@ -73,17 +73,23 @@ void _CopyIndexed(const XTensor * s, XTensor * t, int dim, int * srcIndex, int i
int
*
realSrcIndex
=
new
int
[
realIndexSize
];
int
*
realTgtIndex
=
new
int
[
realIndexSize
];
for
(
int
i
=
0
;
i
<
indexOffsetNum
;
i
++
)
{
int
base
=
i
*
indexSize
*
copyNum
;
int
baseSrc
=
i
*
leadDimSizeSrc
;
int
baseTgt
=
i
*
leadDimSizeTgt
;
for
(
int
j
=
0
;
j
<
indexSize
;
j
++
)
{
int
offset
=
base
+
j
*
copyNum
;
int
*
rsi
=
realSrcIndex
+
offset
;
int
*
rti
=
realTgtIndex
+
offset
;
for
(
int
k
=
0
;
k
<
copyNum
;
k
++
)
{
r
ealSrcIndex
[
i
*
indexSize
*
copyNum
+
j
*
copyNum
+
k
]
=
i
*
leadDimSiz
eSrc
+
srcIndex
[
j
]
+
k
;
r
ealTgtIndex
[
i
*
indexSize
*
copyNum
+
j
*
copyNum
+
k
]
=
i
*
leadDimSiz
eTgt
+
tgtIndex
[
j
]
+
k
;
r
si
[
k
]
=
bas
eSrc
+
srcIndex
[
j
]
+
k
;
r
ti
[
k
]
=
bas
eTgt
+
tgtIndex
[
j
]
+
k
;
}
}
}
for
(
int
i
=
0
;
i
<
indexSize
;
i
++
)
{
CheckNTErrors
((
srcIndex
[
i
]
<
blockNumSrc
),
"Index is out of
rang
e!"
);
CheckNTErrors
((
tgtIndex
[
i
]
<
blockNumTgt
),
"Index is out of
rang
e!"
);
CheckNTErrors
((
srcIndex
[
i
]
<
blockNumSrc
),
"Index is out of
scop
e!"
);
CheckNTErrors
((
tgtIndex
[
i
]
<
blockNumTgt
),
"Index is out of
scop
e!"
);
}
_CopyBlocks
(
s
->
data
,
blockSizeSrc
*
s
->
unitSize
,
realSrcIndex
,
realIndexSize
,
t
->
data
,
realTgtIndex
,
s
->
mem
,
s
->
devID
);
...
...
source/tensor/core/shape/Unsqueeze.cu
查看文件 @
a26caf40
...
...
@@ -32,12 +32,108 @@ namespace nts { // namespace nts(NiuTrans.Tensor)
insert a dimension by copying the blocks for n times (where n is the size of the inerted dimension)
>> s - pointer to the source data array
>> blockSize - size of a block
>> totalSize - total size of the blocks (i.e., blockSIze * n)
>> t - pointer to the target data array
>> n - number of blocks to copy data
*/
template<class T>
__global__
void KernelUnsqueezeFlat(void * s, int blockSize, int totalSize, void * t, int n)
{
/* index of data items */
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i >= blockSize)
return;
T value = ((T*)s)[i];
T * tData = (T*)t;
__syncthreads();
for (int k = i; k < totalSize; k += blockSize)
tData[k] = value;
}
/*
insert a dimension by copying the blocks for n times (where n is the size of the inerted dimension)
>> s - pointer to the source data array
>> blockSize - size of a block
>> totalSize - total size of the blocks (i.e., blockSIze * n)
>> t - pointer to the target data array
>> n - number of blocks to copy data
*/
template<class T>
__global__
void KernelUnsqueezeFlatBigram(void * s, int blockSize, int totalSize, void * t, int n)
{
/* index of data items */
int i = (blockDim.x * blockIdx.x + threadIdx.x) * 2;
if (i >= blockSize)
return;
T value = ((T*)s)[i];
T value2 = ((T*)s)[i + 1];
T * tData = (T*)t;
__syncthreads();
for (int k = i; k < totalSize; k += blockSize){
tData[k] = value;
tData[k + 1] = value2;
}
}
/*
insert a dimension by copying the blocks for n times (where n is the size of the inerted dimension)
>> s - pointer to the source data array
>> blockSize - size of a block
>> totalSize - total size of the blocks (i.e., blockSIze * n)
>> t - pointer to the target data array
>> n - number of blocks to copy data
*/
template<class T>
__global__
void KernelUnsqueezeFlat2D(void * s, int blockSize, int totalSize, void * t, int n)
{
__shared__ T data[MAX_CUDA_THREAD_NUM_PER_BLOCK];
__shared__ int offsets[MAX_CUDA_THREAD_NUM_PER_BLOCK];
/* index of data items */
int i = blockDim.x * blockIdx.x + threadIdx.x;
/* index of data items */
int j = blockDim.y * blockIdx.y + threadIdx.y;
if (i >= blockSize || j >= n)
return;
if(threadIdx.y == 0)
data[threadIdx.x] = ((T*)s)[i];
if(threadIdx.x == 0)
offsets[threadIdx.y] = blockSize * j;
__syncthreads();
((T*)t)[offsets[threadIdx.y] + i] = data[threadIdx.x];
}
/*
insert a dimension by copying the blocks for n times (where n is the size of the inerted dimension)
>> s - pointer to the source data array
>> blockSize - size of a block
>> blockNum - number of the blocks
>> totalSize - total size of the blocks (i.e., blockSIze * n)
>> t - pointer to the target data array
>> n - number of blocks to copy data
*/
template<class T>
__global__
void KernelUnsqueeze(void * s, int blockSize, int blockNum, void * t, int n)
void KernelUnsqueeze(void * s, int blockSize, int blockNum,
int totalSize,
void * t, int n)
{
/* index of data items */
int i = blockDim.x * blockIdx.x + threadIdx.x;
...
...
@@ -51,11 +147,10 @@ void KernelUnsqueeze(void * s, int blockSize, int blockNum, void * t, int n)
MTYPE offset = blockSize * j;
T value = ((T*)s)[offset + i];
T * tData = (T*)t + offset * n;
int length = blockSize * n;
__syncthreads();
for (int k = i; k <
length
; k += blockSize)
for (int k = i; k <
totalSize
; k += blockSize)
tData[k] = value;
}
...
...
@@ -83,21 +178,71 @@ void _CudaUnsqueeze(const XTensor * a, XTensor * b, int dim, int dSize)
int cudaGrids[3];
int cudaBlocks[3];
GDevs.GetCudaThread2D(a->devID, blockSize, blockNumA, MAX_INT, cudaGrids, cudaBlocks);
int devIDBackup = 0;
ProtectCudaDev(a->devID, devIDBackup);
if (a->dataType == X_FLOAT && a->dataType == X_FLOAT) {
KernelUnsqueeze<float> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
(a->data, blockSize, blockNumA, b->data, dSize);
if(blockNumA > 1){
GDevs.GetCudaThread2D(a->devID, blockSize, blockNumA, MAX_INT, cudaGrids, cudaBlocks);
if (a->dataType == X_FLOAT && a->dataType == X_FLOAT) {
KernelUnsqueeze<float> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
(a->data, blockSize, blockNumA, blockSize * dSize, b->data, dSize);
}
else if (a->dataType == X_INT && a->dataType == X_INT) {
KernelUnsqueeze<int> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
(a->data, blockSize, blockNumA, blockSize * dSize, b->data, dSize);
}
else {
ShowNTErrors("TODO!");
}
}
else if(blockNumA == 1 && blockSize < MAX_CUDA_THREAD_NUM_PER_BLOCK){
GDevs.GetCudaThread2D(a->devID, blockSize, dSize, MAX_CUDA_THREAD_NUM_PER_BLOCK/4, cudaGrids, cudaBlocks);
if (a->dataType == X_FLOAT && a->dataType == X_FLOAT) {
KernelUnsqueezeFlat2D<float> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
(a->data, blockSize, blockSize * dSize, b->data, dSize);
}
else if (a->dataType == X_INT && a->dataType == X_INT) {
KernelUnsqueezeFlat2D<int> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
(a->data, blockSize, blockSize * dSize, b->data, dSize);
}
else {
ShowNTErrors("TODO!");
}
}
else if(blockNumA == 1 && blockSize % 2 == 0){
GDevs.GetCudaThread(a->devID, blockSize/2, cudaGrids, cudaBlocks);
if (a->dataType == X_FLOAT && a->dataType == X_FLOAT) {
KernelUnsqueezeFlatBigram<float> << <dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >> >
(a->data, blockSize, blockSize * dSize, b->data, dSize);
}
else if (a->dataType == X_INT && a->dataType == X_INT) {
KernelUnsqueezeFlatBigram<int> << <dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >> >
(a->data, blockSize, blockSize * dSize, b->data, dSize);
}
else {
ShowNTErrors("TODO!");
}
}
else if (a->dataType == X_INT && a->dataType == X_INT) {
KernelUnsqueeze<int> << <dim3(cudaGrids[0], cudaGrids[1]), dim3(cudaBlocks[0], cudaBlocks[1]) >> >
(a->data, blockSize, blockNumA, b->data, dSize);
else if(blockNumA == 1){
GDevs.GetCudaThread(a->devID, blockSize, cudaGrids, cudaBlocks);
if (a->dataType == X_FLOAT && a->dataType == X_FLOAT) {
KernelUnsqueezeFlat<float> << <dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >> >
(a->data, blockSize, blockSize * dSize, b->data, dSize);
}
else if (a->dataType == X_INT && a->dataType == X_INT) {
KernelUnsqueezeFlat<int> << <dim3(cudaGrids[0]), dim3(cudaBlocks[0]) >> >
(a->data, blockSize, blockSize * dSize, b->data, dSize);
}
else {
ShowNTErrors("TODO!");
}
}
else
{
ShowNTErrors("
TODO
!");
else{
ShowNTErrors("
Something is wrong
!");
}
BacktoCudaDev(a->devID, devIDBackup);
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论