Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
N
NiuTrans.Tensor
概览
Overview
Details
Activity
Cycle Analytics
版本库
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
问题
0
Issues
0
列表
Board
标记
里程碑
合并请求
0
Merge Requests
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
Snippets
成员
Collapse sidebar
Close sidebar
活动
图像
聊天
创建新问题
作业
提交
Issue Boards
Open sidebar
杨迪
NiuTrans.Tensor
Commits
52a27964
Commit
52a27964
authored
Dec 28, 2018
by
xiaotong
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
better implementation of dropout
parent
a8304bed
显示空白字符变更
内嵌
并排
正在显示
9 个修改的文件
包含
133 行增加
和
14 行删除
+133
-14
source/network/XBackwardMath.cpp
+64
-0
source/network/XBackwardMath.h
+10
-0
source/sample/transformer/T2TDecoder.cpp
+4
-4
source/sample/transformer/T2TEncoder.cpp
+3
-3
source/sample/transformer/T2TModel.cpp
+2
-0
source/sample/transformer/T2TTrainer.cpp
+0
-0
source/sample/transformer/Transformer.cpp
+1
-0
source/tensor/function/Dropout.cpp
+48
-6
source/tensor/function/Dropout.h
+1
-1
没有找到文件。
source/network/XBackwardMath.cpp
查看文件 @
52a27964
...
...
@@ -87,6 +87,8 @@ void XMathGrad::MakeGrad(XTensor * node, bool isEfficient)
GradSum
(
node
,
isEfficient
);
else
if
(
operID
==
MATH_SUMDIM
)
GradSumDim
(
node
,
isEfficient
);
else
if
(
operID
==
MATH_SUMBROADCAST
)
GradSumBroadcast
(
node
,
isEfficient
);
else
if
(
operID
==
REDUCE_REDUCEMEAN
)
GradReduceMean
(
node
,
isEfficient
);
else
if
(
operID
==
REDUCE_REDUCESUM
)
...
...
@@ -817,6 +819,37 @@ void XMathGrad::GradMultiplyDim(XTensor * node, bool isEfficient)
}
/*
gradient for multiplication by broadcasting:
c = a * b
where some dimensions of b are of size 1
dE/da = dE/dc * b
dE/db = (dE/dc * a).reduce(0...n)
where (dE/dc * a).reduce(0...n) is the reduction along the dimensions
whose size is 1 in b. Note that there might be several reductions.
Only dE/da is computed here. If b is a variable or has incoming edges
(i.e., it would need dE/db), ShowNTErrors("TODO") is invoked because
that part is not implemented yet.
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
                 an efficient manner (not used by this function)
*/
void XMathGrad::GradMultiplyBroadcast(XTensor * node, bool isEfficient)
{
    XLink &income = node->income;
    CheckNTErrors(income.tailNum == 2, "Wrong input tensor number for MULTIPLYBROADCAST!");

    XTensor * a = income.tails[0];
    XTensor * b = income.tails[1];

    /* dE/da = dE/dc * b
       NOTE(review): the trailing 1.0F is presumably the coefficient that keeps
       the current content of a->grad, so the result is accumulated rather than
       overwritten - confirm against _MultiplyBroadcast */
    XNoder::MakeGrad(a);
    _MultiplyBroadcast(node->grad, b, a->grad, 1.0F);

    /* dE/db is not supported yet */
    if(b->isVar || b->income.tailNum > 0){
        ShowNTErrors("TODO");
    }
}
/*
gradient for negate
for
c = -a
...
...
@@ -1254,6 +1287,37 @@ void XMathGrad::GradSumDim(XTensor * node, bool isEfficient)
}
/*
gradient for sum by broadcasting:
c = a + b * \beta
where some dimensions of b are of size 1
dE/da = dE/dc
dE/db = dE/dc.reduce(0..n) * \beta
where dE/dc.reduce(0..n) is the reduction along the dimensions
whose size is 1 in b.
Only dE/da is computed here. If b is a variable or has incoming edges
(i.e., it would need dE/db), ShowNTErrors("TODO") is invoked because
that part is not implemented yet.
>> node - the node (c) for backward computation
>> isEfficient - indicates whether the computation is in
                 an efficient manner (not used by this function)
*/
void XMathGrad::GradSumBroadcast(XTensor * node, bool isEfficient)
{
    XLink &link = node->income;
    CheckNTErrors(link.tailNum == 2, "Wrong input tensor number for SUMBROADCAST!");

    XTensor * lhs = link.tails[0];
    XTensor * rhs = link.tails[1];

    /* \beta is only needed by the dE/db path, which is still to be done */
    DTYPE beta = link.GetParam(0);

    /* dE/da = dE/dc: add the gradient of c to the gradient of a */
    XNoder::MakeGrad(lhs);
    _Sum(lhs->grad, node->grad, lhs->grad);

    /* dE/db is not supported yet */
    const bool rhsNeedsGrad = rhs->isVar || rhs->income.tailNum > 0;
    if(rhsNeedsGrad){
        ShowNTErrors("TODO");
    }
}
/*
gradient for reduceMean
for
c = reduceMean(a, dim)
...
...
source/network/XBackwardMath.h
查看文件 @
52a27964
...
...
@@ -109,6 +109,11 @@ private:
static
void
GradMultiplyDim
(
XTensor
*
node
,
bool
isEfficient
);
/* gradient for multiplication by broadcasting: c = a * b
where some dimensions of b are of size 1 */
static
void
GradMultiplyBroadcast
(
XTensor
*
node
,
bool
isEfficient
);
/* gradient for negate */
static
void
GradNegate
(
XTensor
*
node
,
bool
isEfficient
);
...
...
@@ -143,6 +148,11 @@ private:
static
void
GradSumDim
(
XTensor
*
node
,
bool
isEfficient
);
/* gradient for sum by broadcasting: c = a + b * \beta
where some dimensions of b are of size 1 */
static
void
GradSumBroadcast
(
XTensor
*
node
,
bool
isEfficient
);
/* gradient for reduceMean */
static
void
GradReduceMean
(
XTensor
*
node
,
bool
isEfficient
);
...
...
source/sample/transformer/T2TDecoder.cpp
查看文件 @
52a27964
...
...
@@ -82,7 +82,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
/* dropout */
if
(
isTraining
&&
dropoutP
>
0
)
x
=
Dropout
(
x
,
dropoutP
);
x
=
Dropout
(
x
,
dropoutP
,
2
);
for
(
int
i
=
0
;
i
<
nlayer
;
i
++
){
XTensor
att
;
...
...
@@ -97,7 +97,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
/* dropout */
if
(
isTraining
&&
dropoutP
>
0
)
att
=
Dropout
(
att
,
dropoutP
);
att
=
Dropout
(
att
,
dropoutP
,
2
);
/* residual connection */
res
=
Sum
(
att
,
x
);
...
...
@@ -111,7 +111,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
/* dropout */
if
(
isTraining
&&
dropoutP
>
0
)
ende
=
Dropout
(
ende
,
dropoutP
);
ende
=
Dropout
(
ende
,
dropoutP
,
2
);
/* residual connection */
res
=
Sum
(
ende
,
x
);
...
...
@@ -125,7 +125,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
/* dropout */
if
(
isTraining
&&
dropoutP
>
0
)
fnn
=
Dropout
(
fnn
,
dropoutP
);
fnn
=
Dropout
(
fnn
,
dropoutP
,
2
);
/* residual connection */
res
=
Sum
(
fnn
,
x
);
...
...
source/sample/transformer/T2TEncoder.cpp
查看文件 @
52a27964
...
...
@@ -107,7 +107,7 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
/* dropout */
if
(
isTraining
&&
dropoutP
>
0
)
x
=
Dropout
(
x
,
dropoutP
);
x
=
Dropout
(
x
,
dropoutP
,
2
);
for
(
int
i
=
0
;
i
<
nlayer
;
i
++
){
XTensor
att
;
...
...
@@ -120,7 +120,7 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
/* dropout */
if
(
isTraining
&&
dropoutP
>
0
)
att
=
Dropout
(
att
,
dropoutP
);
att
=
Dropout
(
att
,
dropoutP
,
2
);
/* residual connection */
res
=
Sum
(
att
,
x
);
...
...
@@ -133,7 +133,7 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
/* dropout */
if
(
isTraining
&&
dropoutP
>
0
)
fnn
=
Dropout
(
fnn
,
dropoutP
);
fnn
=
Dropout
(
fnn
,
dropoutP
,
2
);
/* residual connection */
res
=
Sum
(
fnn
,
x
);
...
...
source/sample/transformer/T2TModel.cpp
查看文件 @
52a27964
...
...
@@ -274,7 +274,9 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
_Sum
(
&
maskEnc
,
padding3
,
&
maskEnc
);
encoding
=
MakeEncoder
(
inputEnc
,
maskEnc
,
isTraining
);
decoding
=
MakeDecoder
(
inputDec
,
encoding
,
maskDec
,
maskEncDec
,
isTraining
);
outputLayer
->
Make
(
decoding
,
output
);
delete
[]
dims
;
...
...
source/sample/transformer/T2TTrainer.cpp
查看文件 @
52a27964
source/sample/transformer/Transformer.cpp
查看文件 @
52a27964
...
...
@@ -60,6 +60,7 @@ int TransformerMain(int argc, const char ** argv)
LoadParamString
(
argc
,
args
,
"output"
,
outputFN
,
""
);
srand
((
unsigned
int
)
time
(
NULL
));
T2TTrainer
trainer
;
trainer
.
Init
(
argc
,
args
);
...
...
source/tensor/function/Dropout.cpp
查看文件 @
52a27964
...
...
@@ -39,7 +39,7 @@ for more details.
Here, the output is scaled by a factor of \frac{1}{1-p} so that we do not need
to mark the tensor with probability p in the inference phase. Instead we perform
the same inference procedure as that
with no use of dropout on the test data
.
the same inference procedure as that
on the test data with no use of dropout
.
>> x - input tensor
>> y - output tensor
...
...
@@ -138,12 +138,21 @@ the same inference procedure as that with no use of dropout on the test data.
>> x - input tensor
>> dropProb - probability to set an element to zero
>> leadingDim - the dimension along which we generate the random numbers and perform broadcasting
>> leadingDim2 - another dimension along which we generate the random numbers and perform broadcasting
<< return - tensor after dropout
*/
XTensor
Dropout
(
const
XTensor
&
x
,
DTYPE
dropProb
,
int
leadingDim
)
XTensor
Dropout
(
const
XTensor
&
x
,
DTYPE
dropProb
,
int
leadingDim
,
int
leadingDim2
)
{
CheckNTErrors
(
dropProb
>=
0.0
&&
dropProb
<=
1.0
,
"The probability must be 0-1!"
);
int
n
=
leadingDim
<
0
?
x
.
order
-
1
:
leadingDim
;
XTensor
mask
;
DTYPE
*
maskArray
=
NULL
;
if
(
leadingDim
<
0
&&
leadingDim2
<
0
){
ShowNTErrors
(
"TODO"
);
}
else
if
(
leadingDim2
<
0
){
int
n
=
leadingDim
;
CheckNTErrors
(
n
>=
0
&&
n
<
x
.
order
,
"Wrong leadingDim!"
);
...
...
@@ -151,7 +160,7 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim)
/* generate a mask tensor with probability p */
int
unitNum
=
x
.
dimSize
[
n
];
DTYPE
*
maskArray
=
new
DTYPE
[
unitNum
];
maskArray
=
new
DTYPE
[
unitNum
];
//srand((unsigned int)time(NULL));
for
(
int
i
=
0
;
i
<
unitNum
;
i
++
)
...
...
@@ -163,7 +172,41 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim)
delete
[]
maskArray
;
return
MultiplyDim
(
x
,
mask
,
n
,
0
);
return
MultiplyDim
(
x
,
mask
,
n
);
}
else
{
int
n
=
leadingDim
;
int
m
=
leadingDim2
;
CheckNTErrors
(
n
>=
0
&&
n
<
x
.
order
,
"Wrong leadingDim!"
);
CheckNTErrors
(
m
>=
0
&&
m
<
x
.
order
,
"Wrong leadingDim!"
);
DTYPE
scaleFactor
=
(
DTYPE
)
1.0
/
((
DTYPE
)
1.0
-
dropProb
);
/* generate a mask tensor with probability p */
int
unitNum
=
x
.
dimSize
[
n
]
*
x
.
dimSize
[
m
];
maskArray
=
new
DTYPE
[
unitNum
];
//srand((unsigned int)time(NULL));
for
(
int
i
=
0
;
i
<
unitNum
;
i
++
)
maskArray
[
i
]
=
RandomBernoulli
(
dropProb
,
scaleFactor
);
int
dims
[
MAX_TENSOR_DIM_NUM
];
for
(
int
i
=
0
;
i
<
x
.
order
;
i
++
)
dims
[
i
]
=
1
;
dims
[
n
]
=
x
.
GetDim
(
n
);
dims
[
m
]
=
x
.
GetDim
(
m
);
InitTensor
(
&
mask
,
x
.
order
,
dims
,
x
.
dataType
,
x
.
denseRatio
,
x
.
devID
,
x
.
mem
);
mask
.
SetData
(
maskArray
,
unitNum
);
delete
[]
maskArray
;
return
MultiplyBroadcast
(
x
,
mask
);
}
}
/*
...
...
@@ -182,7 +225,6 @@ XTensor DropoutWithoutBroadcast(const XTensor &x, DTYPE dropProb)
int
unitNum
=
x
.
unitNum
;
DTYPE
*
maskArray
=
new
DTYPE
[
unitNum
];
srand
((
unsigned
int
)
time
(
NULL
));
for
(
int
i
=
0
;
i
<
unitNum
;
i
++
)
maskArray
[
i
]
=
RandomBernoulli
(
dropProb
,
scaleFactor
);
...
...
source/tensor/function/Dropout.h
查看文件 @
52a27964
...
...
@@ -41,7 +41,7 @@ void _DropoutBackward(const XTensor * y, const XTensor * x,
unsigned
int
seed
,
DTYPE
dropProb
,
int
leadingDim
=
-
1
);
/* dropout function */
XTensor
Dropout
(
const
XTensor
&
x
,
DTYPE
dropProb
,
int
leadingDim
=
-
1
);
XTensor
Dropout
(
const
XTensor
&
x
,
DTYPE
dropProb
,
int
leadingDim
=
-
1
,
int
leadingDim2
=
-
1
);
/* dropout function without broadcast */
XTensor
DropoutWithoutBroadcast
(
const
XTensor
&
x
,
DTYPE
dropProb
);
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论