Skip to content
项目
群组
代码片段
帮助
当前项目
正在载入...
登录 / 注册
切换导航面板
N
NiuTrans.Tensor
概览
Overview
Details
Activity
Cycle Analytics
版本库
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
问题
0
Issues
0
列表
Board
标记
里程碑
合并请求
0
Merge Requests
0
CI / CD
CI / CD
流水线
作业
日程表
图表
维基
Wiki
代码片段
Snippets
成员
Collapse sidebar
Close sidebar
活动
图像
聊天
创建新问题
作业
提交
Issue Boards
Open sidebar
Emmay
NiuTrans.Tensor
Commits
0b43acf6
Commit
0b43acf6
authored
Feb 03, 2019
by
姜雨帆
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
update
parent
896e5231
显示空白字符变更
内嵌
并排
正在显示
21 个修改的文件
包含
427 行增加
和
67 行删除
+427
-67
source/network/XBackwardMath.cpp
+2
-0
source/sample/transformer/T2TAttention.cpp
+28
-1
source/sample/transformer/T2TAttention.h
+2
-1
source/sample/transformer/T2TDecoder.cpp
+38
-9
source/sample/transformer/T2TDecoder.h
+48
-1
source/sample/transformer/T2TEmbedding.cpp
+7
-1
source/sample/transformer/T2TEmbedding.h
+1
-1
source/sample/transformer/T2TEncoder.cpp
+5
-4
source/sample/transformer/T2TModel.cpp
+10
-7
source/sample/transformer/T2TOutput.cpp
+1
-1
source/sample/transformer/T2TTrainer.cpp
+125
-24
source/sample/transformer/T2TTrainer.h
+31
-0
source/sample/transformer/Transformer.cpp
+9
-4
source/tensor/XDevice.cpp
+5
-0
source/tensor/XDevice.h
+3
-0
source/tensor/XTensor.cpp
+6
-0
source/tensor/core/getandset/SetData.cpp
+34
-1
source/tensor/core/getandset/SetData.cu
+54
-5
source/tensor/core/getandset/SetData.cuh
+5
-1
source/tensor/core/getandset/SetData.h
+5
-1
source/tensor/function/Dropout.cpp
+8
-5
没有找到文件。
source/network/XBackwardMath.cpp
查看文件 @
0b43acf6
...
@@ -71,6 +71,8 @@ void XMathGrad::MakeGrad(XTensor * node, bool isEfficient)
...
@@ -71,6 +71,8 @@ void XMathGrad::MakeGrad(XTensor * node, bool isEfficient)
GradMultiply
(
node
,
isEfficient
);
GradMultiply
(
node
,
isEfficient
);
else
if
(
operID
==
MATH_MULTIPLYDIM
)
else
if
(
operID
==
MATH_MULTIPLYDIM
)
GradMultiplyDim
(
node
,
isEfficient
);
GradMultiplyDim
(
node
,
isEfficient
);
else
if
(
operID
==
MATH_MULTIPLYBROADCAST
)
GradMultiplyBroadcast
(
node
,
isEfficient
);
else
if
(
operID
==
MATH_NEGATE
)
else
if
(
operID
==
MATH_NEGATE
)
GradNegate
(
node
,
isEfficient
);
GradNegate
(
node
,
isEfficient
);
else
if
(
operID
==
MATH_NORMALIZE
)
else
if
(
operID
==
MATH_NORMALIZE
)
...
...
source/sample/transformer/T2TAttention.cpp
查看文件 @
0b43acf6
...
@@ -75,16 +75,19 @@ void T2TAttention::InitModel(int argc, char ** argv,
...
@@ -75,16 +75,19 @@ void T2TAttention::InitModel(int argc, char ** argv,
InitTensor2D
(
&
wq
,
d
,
dk
,
X_FLOAT
,
devID
,
mem
);
InitTensor2D
(
&
wq
,
d
,
dk
,
X_FLOAT
,
devID
,
mem
);
InitTensor2D
(
&
wv
,
d
,
dv
,
X_FLOAT
,
devID
,
mem
);
InitTensor2D
(
&
wv
,
d
,
dv
,
X_FLOAT
,
devID
,
mem
);
InitTensor2D
(
&
wa
,
d
,
d
,
X_FLOAT
,
devID
,
mem
);
InitTensor2D
(
&
wa
,
d
,
d
,
X_FLOAT
,
devID
,
mem
);
InitTensor2D
(
&
wbig
,
d
,
3
*
d
,
X_FLOAT
,
devID
,
mem
);
float
scale
=
1.0
F
;
float
scale
=
1.0
F
;
float
finfoutk
=
(
float
)
sqrt
(
6.0
F
*
scale
/
(
d
+
dk
));
float
finfoutk
=
(
float
)
sqrt
(
6.0
F
*
scale
/
(
d
+
dk
));
float
finfoutv
=
(
float
)
sqrt
(
6.0
F
*
scale
/
(
d
+
dv
));
float
finfoutv
=
(
float
)
sqrt
(
6.0
F
*
scale
/
(
d
+
dv
));
float
finfouta
=
(
float
)
sqrt
(
6.0
F
*
scale
/
(
d
+
d
));
float
finfouta
=
(
float
)
sqrt
(
6.0
F
*
scale
/
(
d
+
d
));
float
finfoutbig
=
(
float
)
sqrt
(
6.0
F
*
scale
/
(
d
+
3
*
d
));
wk
.
SetDataRand
(
-
finfoutk
,
finfoutk
);
wk
.
SetDataRand
(
-
finfoutk
,
finfoutk
);
wq
.
SetDataRand
(
-
finfoutk
,
finfoutk
);
wq
.
SetDataRand
(
-
finfoutk
,
finfoutk
);
wv
.
SetDataRand
(
-
finfoutv
,
finfoutv
);
wv
.
SetDataRand
(
-
finfoutv
,
finfoutv
);
wa
.
SetDataRand
(
-
finfouta
,
finfouta
);
wa
.
SetDataRand
(
-
finfouta
,
finfouta
);
wbig
.
SetDataRand
(
-
finfoutbig
,
finfoutbig
);
}
}
/*
/*
...
@@ -98,16 +101,40 @@ make the network
...
@@ -98,16 +101,40 @@ make the network
>> isTraining - indicates whether the model is used for training
>> isTraining - indicates whether the model is used for training
<< return - multi-attention result
<< return - multi-attention result
*/
*/
XTensor
T2TAttention
::
Make
(
XTensor
&
k
,
XTensor
&
q
,
XTensor
&
v
,
XTensor
&
mask
,
bool
isTraining
)
XTensor
T2TAttention
::
Make
(
XTensor
&
k
,
XTensor
&
q
,
XTensor
&
v
,
XTensor
&
mask
,
bool
isTraining
,
bool
selfatt
)
{
{
XTensor
k2
;
XTensor
k2
;
XTensor
q2
;
XTensor
q2
;
XTensor
v2
;
XTensor
v2
;
if
(
selfatt
){
XTensor
con
;
XList
split
;
con
=
MMul
(
k
,
wbig
);
int
d1
=
con
.
GetDim
(
0
);
int
d2
=
con
.
GetDim
(
1
);
int
d3
=
con
.
GetDim
(
2
)
/
3
;
InitTensor3D
(
&
k2
,
d1
,
d2
,
d3
,
X_FLOAT
,
devID
,
mem
);
InitTensor3D
(
&
q2
,
d1
,
d2
,
d3
,
X_FLOAT
,
devID
,
mem
);
InitTensor3D
(
&
v2
,
d1
,
d2
,
d3
,
X_FLOAT
,
devID
,
mem
);
split
.
Add
(
&
q2
);
split
.
Add
(
&
k2
);
split
.
Add
(
&
v2
);
Split
(
con
,
split
,
2
,
3
);
}
else
{
/* linear transofmration before self-attention */
/* linear transofmration before self-attention */
k2
=
MMul
(
k
,
wk
);
k2
=
MMul
(
k
,
wk
);
q2
=
MMul
(
q
,
wq
);
q2
=
MMul
(
q
,
wq
);
v2
=
MMul
(
v
,
wv
);
v2
=
MMul
(
v
,
wv
);
}
XTensor
kheads
;
XTensor
kheads
;
XTensor
qheads
;
XTensor
qheads
;
...
...
source/sample/transformer/T2TAttention.h
查看文件 @
0b43acf6
...
@@ -60,6 +60,7 @@ public:
...
@@ -60,6 +60,7 @@ public:
/* transformation after dot-product attention */
/* transformation after dot-product attention */
XTensor
wa
;
XTensor
wa
;
XTensor
wbig
;
/* size of transformed Q and K */
/* size of transformed Q and K */
int
dk
;
int
dk
;
...
@@ -95,7 +96,7 @@ public:
...
@@ -95,7 +96,7 @@ public:
int
myDevID
=
-
1
,
XMem
*
myMem
=
NULL
);
int
myDevID
=
-
1
,
XMem
*
myMem
=
NULL
);
/* make the network */
/* make the network */
XTensor
Make
(
XTensor
&
k
,
XTensor
&
q
,
XTensor
&
v
,
XTensor
&
mask
,
bool
isTraining
);
XTensor
Make
(
XTensor
&
k
,
XTensor
&
q
,
XTensor
&
v
,
XTensor
&
mask
,
bool
isTraining
,
bool
selfatt
);
};
};
}
}
...
...
source/sample/transformer/T2TDecoder.cpp
查看文件 @
0b43acf6
...
@@ -21,6 +21,8 @@
...
@@ -21,6 +21,8 @@
#include <math.h>
#include <math.h>
#include "T2TDecoder.h"
#include "T2TDecoder.h"
#include "T2TUtility.h"
#include "T2TLayerNormal.h"
#include "../../tensor/core/CHeader.h"
#include "../../tensor/core/CHeader.h"
namespace
transformer
namespace
transformer
...
@@ -53,16 +55,43 @@ void AttDecoder::InitModel(int argc, char ** argv,
...
@@ -53,16 +55,43 @@ void AttDecoder::InitModel(int argc, char ** argv,
bool
myIsMasked
,
int
myIgnored
,
bool
myIsMasked
,
int
myIgnored
,
int
myDevID
,
XMem
*
myMem
)
int
myDevID
,
XMem
*
myMem
)
{
{
AttEncoder
::
InitModel
(
argc
,
argv
,
myIsMasked
,
myIgnored
,
myDevID
,
myMem
);
//
AttEncoder::InitModel(argc, argv, myIsMasked, myIgnored, myDevID, myMem);
devID
=
myDevID
;
mem
=
myMem
;
ignored
=
myIgnored
;
LoadParamInt
(
argc
,
argv
,
"nlayer"
,
&
nlayer
,
6
);
LoadParamInt
(
argc
,
argv
,
"hsize"
,
&
hSize
,
DEFAULT_EMBEDDING_SIZE
);
LoadParamInt
(
argc
,
argv
,
"esize"
,
&
eSize
,
DEFAULT_EMBEDDING_SIZE
);
LoadParamInt
(
argc
,
argv
,
"vsizetgt"
,
&
vSize
,
-
1
);
LoadParamFloat
(
argc
,
argv
,
"dropout"
,
&
dropoutP
,
0
);
CheckNTErrors
(
nlayer
>=
1
,
"We have one encoding layer at least!"
);
CheckNTErrors
(
vSize
>
1
,
"set vocabulary size by
\"
-vsize
\"
"
);
/* embedding model */
embedder
.
InitModel
(
argc
,
argv
,
devID
,
mem
,
false
);
attentions
=
new
T2TAttention
[
nlayer
];
fnns
=
new
T2TFNN
[
nlayer
];
attLayerNorms
=
new
T2TLN
[
nlayer
];
fnnLayerNorms
=
new
T2TLN
[
nlayer
];
attentionsEnde
=
new
T2TAttention
[
nlayer
];
attentionsEnde
=
new
T2TAttention
[
nlayer
];
attEndeLayerNorms
=
new
T2TLN
[
nlayer
];
attEndeLayerNorms
=
new
T2TLN
[
nlayer
];
/* initialize the stacked layers */
/* initialize the stacked layers */
for
(
int
i
=
0
;
i
<
nlayer
;
i
++
){
for
(
int
i
=
0
;
i
<
nlayer
;
i
++
)
{
attentionsEnde
[
i
].
InitModel
(
argc
,
argv
,
myIsMasked
,
myIgnored
,
myDevID
,
myMem
);
attentions
[
i
].
InitModel
(
argc
,
argv
,
myIsMasked
,
myIgnored
,
myDevID
,
myMem
);
fnns
[
i
].
InitModel
(
argc
,
argv
,
myDevID
,
myMem
);
attLayerNorms
[
i
].
InitModel
(
argc
,
argv
,
myDevID
,
myMem
);
fnnLayerNorms
[
i
].
InitModel
(
argc
,
argv
,
myDevID
,
myMem
);
attentionsEnde
[
i
].
InitModel
(
argc
,
argv
,
true
,
myIgnored
,
myDevID
,
myMem
);
attEndeLayerNorms
[
i
].
InitModel
(
argc
,
argv
,
myDevID
,
myMem
);
attEndeLayerNorms
[
i
].
InitModel
(
argc
,
argv
,
myDevID
,
myMem
);
}
}
}
}
/*
/*
...
@@ -82,7 +111,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
...
@@ -82,7 +111,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
/* dropout */
/* dropout */
if
(
isTraining
&&
dropoutP
>
0
)
if
(
isTraining
&&
dropoutP
>
0
)
x
=
Dropout
(
x
,
dropoutP
,
2
);
x
=
Dropout
(
x
,
dropoutP
);
for
(
int
i
=
0
;
i
<
nlayer
;
i
++
){
for
(
int
i
=
0
;
i
<
nlayer
;
i
++
){
XTensor
att
;
XTensor
att
;
...
@@ -93,11 +122,11 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
...
@@ -93,11 +122,11 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
/******************/
/******************/
/* self attention */
/* self attention */
att
=
attentions
[
i
].
Make
(
x
,
x
,
x
,
mask
,
isTraining
);
att
=
attentions
[
i
].
Make
(
x
,
x
,
x
,
mask
,
isTraining
,
true
);
/* dropout */
/* dropout */
if
(
isTraining
&&
dropoutP
>
0
)
if
(
isTraining
&&
dropoutP
>
0
)
att
=
Dropout
(
att
,
dropoutP
,
2
);
att
=
Dropout
(
att
,
dropoutP
);
/* residual connection */
/* residual connection */
res
=
Sum
(
att
,
x
);
res
=
Sum
(
att
,
x
);
...
@@ -107,11 +136,11 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
...
@@ -107,11 +136,11 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
/*****************************/
/*****************************/
/* encoder-decoder attention */
/* encoder-decoder attention */
ende
=
attentionsEnde
[
i
].
Make
(
outputEnc
,
x
,
outputEnc
,
maskEncDec
,
isTraining
);
ende
=
attentionsEnde
[
i
].
Make
(
outputEnc
,
x
,
outputEnc
,
maskEncDec
,
isTraining
,
false
);
/* dropout */
/* dropout */
if
(
isTraining
&&
dropoutP
>
0
)
if
(
isTraining
&&
dropoutP
>
0
)
ende
=
Dropout
(
ende
,
dropoutP
,
2
);
ende
=
Dropout
(
ende
,
dropoutP
);
/* residual connection */
/* residual connection */
res
=
Sum
(
ende
,
x
);
res
=
Sum
(
ende
,
x
);
...
@@ -125,7 +154,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
...
@@ -125,7 +154,7 @@ XTensor AttDecoder::Make(XTensor &inputDec, XTensor &outputEnc, XTensor &mask, X
/* dropout */
/* dropout */
if
(
isTraining
&&
dropoutP
>
0
)
if
(
isTraining
&&
dropoutP
>
0
)
fnn
=
Dropout
(
fnn
,
dropoutP
,
2
);
fnn
=
Dropout
(
fnn
,
dropoutP
);
/* residual connection */
/* residual connection */
res
=
Sum
(
fnn
,
x
);
res
=
Sum
(
fnn
,
x
);
...
...
source/sample/transformer/T2TDecoder.h
查看文件 @
0b43acf6
...
@@ -27,9 +27,56 @@
...
@@ -27,9 +27,56 @@
namespace
transformer
namespace
transformer
{
{
class
AttDecoder
:
public
AttEncoder
class
AttDecoder
{
{
public
:
public
:
/* device id */
int
devID
;
/* memory pool */
XMem
*
mem
;
/* layer number */
int
nlayer
;
/* hidden layer size of the FNN layer */
int
hSize
;
/* embedding size */
int
eSize
;
/* vocabulary size */
int
vSize
;
/* dropout probability */
DTYPE
dropoutP
;
/* some positions can be ignored in attention. this is useful in lm where the first position needs
* special design for the attention model. */
int
ignored
;
/* embedding of word at each position */
T2TEmbedder
embedder
;
/* FNN model of each layer */
T2TFNN
*
fnns
;
/* attention model of each layer */
T2TAttention
*
attentions
;
/* layer normalization for fnn */
T2TLN
*
fnnLayerNorms
;
/* layer normalization for attention */
T2TLN
*
attLayerNorms
;
/* input tensor of the encoder */
XTensor
*
input
;
/* output tensor of the encoder */
XTensor
*
output
;
/* encoder-decoder attention model of each layer */
/* encoder-decoder attention model of each layer */
T2TAttention
*
attentionsEnde
;
T2TAttention
*
attentionsEnde
;
...
...
source/sample/transformer/T2TEmbedding.cpp
查看文件 @
0b43acf6
...
@@ -48,12 +48,18 @@ initialize the model
...
@@ -48,12 +48,18 @@ initialize the model
>> myDevID - device id
>> myDevID - device id
>> myMem - the memory pool
>> myMem - the memory pool
*/
*/
void
T2TEmbedder
::
InitModel
(
int
argc
,
char
**
argv
,
int
myDevID
,
XMem
*
myMem
)
void
T2TEmbedder
::
InitModel
(
int
argc
,
char
**
argv
,
int
myDevID
,
XMem
*
myMem
,
bool
isEnc
)
{
{
devID
=
myDevID
;
devID
=
myDevID
;
mem
=
myMem
;
mem
=
myMem
;
if
(
isEnc
){
LoadParamInt
(
argc
,
argv
,
"vsize"
,
&
vSize
,
-
1
);
LoadParamInt
(
argc
,
argv
,
"vsize"
,
&
vSize
,
-
1
);
}
else
{
LoadParamInt
(
argc
,
argv
,
"vsizetgt"
,
&
vSize
,
-
1
);
}
//LoadParamInt(argc, argv, "vsize", &vSize, -1);
LoadParamInt
(
argc
,
argv
,
"maxlen"
,
&
maxLength
,
512
);
LoadParamInt
(
argc
,
argv
,
"maxlen"
,
&
maxLength
,
512
);
LoadParamInt
(
argc
,
argv
,
"d"
,
&
eSize
,
DEFAULT_EMBEDDING_SIZE
);
LoadParamInt
(
argc
,
argv
,
"d"
,
&
eSize
,
DEFAULT_EMBEDDING_SIZE
);
LoadParamInt
(
argc
,
argv
,
"d"
,
&
d
,
DEFAULT_EMBEDDING_SIZE
);
LoadParamInt
(
argc
,
argv
,
"d"
,
&
d
,
DEFAULT_EMBEDDING_SIZE
);
...
...
source/sample/transformer/T2TEmbedding.h
查看文件 @
0b43acf6
...
@@ -71,7 +71,7 @@ public:
...
@@ -71,7 +71,7 @@ public:
~
T2TEmbedder
();
~
T2TEmbedder
();
/* initialize the model */
/* initialize the model */
void
InitModel
(
int
argc
,
char
**
argv
,
int
myDevID
=
-
1
,
XMem
*
myMem
=
NULL
);
void
InitModel
(
int
argc
,
char
**
argv
,
int
myDevID
=
-
1
,
XMem
*
myMem
=
NULL
,
bool
isEnc
=
true
);
/* make positional embeddings */
/* make positional embeddings */
void
MakePosEmbedding
(
int
eSize
,
int
d
,
int
length
);
void
MakePosEmbedding
(
int
eSize
,
int
d
,
int
length
);
...
...
source/sample/transformer/T2TEncoder.cpp
查看文件 @
0b43acf6
...
@@ -107,7 +107,7 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
...
@@ -107,7 +107,7 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
/* dropout */
/* dropout */
if
(
isTraining
&&
dropoutP
>
0
)
if
(
isTraining
&&
dropoutP
>
0
)
x
=
Dropout
(
x
,
dropoutP
,
2
);
x
=
Dropout
(
x
,
dropoutP
);
for
(
int
i
=
0
;
i
<
nlayer
;
i
++
){
for
(
int
i
=
0
;
i
<
nlayer
;
i
++
){
XTensor
att
;
XTensor
att
;
...
@@ -116,11 +116,11 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
...
@@ -116,11 +116,11 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
XTensor
res
;
XTensor
res
;
/* self attention */
/* self attention */
att
=
attentions
[
i
].
Make
(
x
,
x
,
x
,
mask
,
isTraining
);
att
=
attentions
[
i
].
Make
(
x
,
x
,
x
,
mask
,
isTraining
,
true
);
/* dropout */
/* dropout */
if
(
isTraining
&&
dropoutP
>
0
)
if
(
isTraining
&&
dropoutP
>
0
)
att
=
Dropout
(
att
,
dropoutP
,
2
);
att
=
Dropout
(
att
,
dropoutP
);
/* residual connection */
/* residual connection */
res
=
Sum
(
att
,
x
);
res
=
Sum
(
att
,
x
);
...
@@ -133,7 +133,7 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
...
@@ -133,7 +133,7 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, XTensor &maskEncDec, boo
/* dropout */
/* dropout */
if
(
isTraining
&&
dropoutP
>
0
)
if
(
isTraining
&&
dropoutP
>
0
)
fnn
=
Dropout
(
fnn
,
dropoutP
,
2
);
fnn
=
Dropout
(
fnn
,
dropoutP
);
/* residual connection */
/* residual connection */
res
=
Sum
(
fnn
,
x
);
res
=
Sum
(
fnn
,
x
);
...
@@ -160,3 +160,4 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool isTraining)
...
@@ -160,3 +160,4 @@ XTensor AttEncoder::Make(XTensor &input, XTensor &mask, bool isTraining)
}
}
}
}
source/sample/transformer/T2TModel.cpp
查看文件 @
0b43acf6
...
@@ -274,9 +274,10 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
...
@@ -274,9 +274,10 @@ void T2TModel::MakeMT(XTensor &inputEnc, XTensor &inputDec, XTensor &output, XTe
_Sum
(
&
maskEnc
,
padding3
,
&
maskEnc
);
_Sum
(
&
maskEnc
,
padding3
,
&
maskEnc
);
encoding
=
MakeEncoder
(
inputEnc
,
maskEnc
,
isTraining
);
encoding
=
MakeEncoder
(
inputEnc
,
maskEnc
,
isTraining
);
//encoding.Dump(stderr, "encoding",10);
decoding
=
MakeDecoder
(
inputDec
,
encoding
,
maskDec
,
maskEncDec
,
isTraining
);
decoding
=
MakeDecoder
(
inputDec
,
encoding
,
maskDec
,
maskEncDec
,
isTraining
);
//decoding.Dump(stderr, "decoding", 10);
outputLayer
->
Make
(
decoding
,
output
);
outputLayer
->
Make
(
decoding
,
output
);
delete
[]
dims
;
delete
[]
dims
;
...
@@ -300,9 +301,10 @@ void T2TModel::GetParams(XList &list)
...
@@ -300,9 +301,10 @@ void T2TModel::GetParams(XList &list)
list
.
Add
(
&
encoder
->
fnns
[
i
].
b1
);
list
.
Add
(
&
encoder
->
fnns
[
i
].
b1
);
list
.
Add
(
&
encoder
->
fnns
[
i
].
w2
);
list
.
Add
(
&
encoder
->
fnns
[
i
].
w2
);
list
.
Add
(
&
encoder
->
fnns
[
i
].
b2
);
list
.
Add
(
&
encoder
->
fnns
[
i
].
b2
);
list
.
Add
(
&
encoder
->
attentions
[
i
].
wk
);
//list.Add(&encoder->attentions[i].wk);
list
.
Add
(
&
encoder
->
attentions
[
i
].
wq
);
//list.Add(&encoder->attentions[i].wq);
list
.
Add
(
&
encoder
->
attentions
[
i
].
wv
);
//list.Add(&encoder->attentions[i].wv);
list
.
Add
(
&
encoder
->
attentions
[
i
].
wbig
);
list
.
Add
(
&
encoder
->
attentions
[
i
].
wa
);
list
.
Add
(
&
encoder
->
attentions
[
i
].
wa
);
list
.
Add
(
&
encoder
->
fnnLayerNorms
[
i
].
w
);
list
.
Add
(
&
encoder
->
fnnLayerNorms
[
i
].
w
);
list
.
Add
(
&
encoder
->
fnnLayerNorms
[
i
].
b
);
list
.
Add
(
&
encoder
->
fnnLayerNorms
[
i
].
b
);
...
@@ -324,9 +326,10 @@ void T2TModel::GetParams(XList &list)
...
@@ -324,9 +326,10 @@ void T2TModel::GetParams(XList &list)
list
.
Add
(
&
decoder
->
attentionsEnde
[
i
].
wa
);
list
.
Add
(
&
decoder
->
attentionsEnde
[
i
].
wa
);
list
.
Add
(
&
decoder
->
attEndeLayerNorms
[
i
].
w
);
list
.
Add
(
&
decoder
->
attEndeLayerNorms
[
i
].
w
);
list
.
Add
(
&
decoder
->
attEndeLayerNorms
[
i
].
b
);
list
.
Add
(
&
decoder
->
attEndeLayerNorms
[
i
].
b
);
list
.
Add
(
&
decoder
->
attentions
[
i
].
wk
);
//list.Add(&decoder->attentions[i].wk);
list
.
Add
(
&
decoder
->
attentions
[
i
].
wq
);
//list.Add(&decoder->attentions[i].wq);
list
.
Add
(
&
decoder
->
attentions
[
i
].
wv
);
//list.Add(&decoder->attentions[i].wv);
list
.
Add
(
&
decoder
->
attentions
[
i
].
wbig
);
list
.
Add
(
&
decoder
->
attentions
[
i
].
wa
);
list
.
Add
(
&
decoder
->
attentions
[
i
].
wa
);
list
.
Add
(
&
decoder
->
fnnLayerNorms
[
i
].
w
);
list
.
Add
(
&
decoder
->
fnnLayerNorms
[
i
].
w
);
list
.
Add
(
&
decoder
->
fnnLayerNorms
[
i
].
b
);
list
.
Add
(
&
decoder
->
fnnLayerNorms
[
i
].
b
);
...
...
source/sample/transformer/T2TOutput.cpp
查看文件 @
0b43acf6
...
@@ -56,7 +56,7 @@ void T2TOutput::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
...
@@ -56,7 +56,7 @@ void T2TOutput::InitModel(int argc, char ** argv, int myDevID, XMem * myMem)
float
minmax
=
0
;
float
minmax
=
0
;
LoadParamInt
(
argc
,
argv
,
"vsize"
,
&
vSize
,
-
1
);
LoadParamInt
(
argc
,
argv
,
"vsize
tgt
"
,
&
vSize
,
-
1
);
LoadParamInt
(
argc
,
argv
,
"d"
,
&
inSize
,
DEFAULT_EMBEDDING_SIZE
);
LoadParamInt
(
argc
,
argv
,
"d"
,
&
inSize
,
DEFAULT_EMBEDDING_SIZE
);
LoadParamInt
(
argc
,
argv
,
"d"
,
&
hSize
,
DEFAULT_EMBEDDING_SIZE
);
LoadParamInt
(
argc
,
argv
,
"d"
,
&
hSize
,
DEFAULT_EMBEDDING_SIZE
);
LoadParamFloat
(
argc
,
argv
,
"outputminmax"
,
&
minmax
,
0.08
F
);
LoadParamFloat
(
argc
,
argv
,
"outputminmax"
,
&
minmax
,
0.08
F
);
...
...
source/sample/transformer/T2TTrainer.cpp
查看文件 @
0b43acf6
...
@@ -41,12 +41,15 @@ T2TTrainer::T2TTrainer()
...
@@ -41,12 +41,15 @@ T2TTrainer::T2TTrainer()
seqLen2
=
NULL
;
seqLen2
=
NULL
;
nseqBuf
=
0
;
nseqBuf
=
0
;
nextSeq
=
-
1
;
nextSeq
=
-
1
;
nextBatch
=
-
1
;
argNum
=
0
;
argNum
=
0
;
argArray
=
NULL
;
argArray
=
NULL
;
buf
=
NULL
;
buf
=
NULL
;
buf2
=
NULL
;
buf2
=
NULL
;
bufBatch
=
NULL
;
bufSize
=
0
;
bufSize
=
0
;
bufBatchSize
=
0
;
seqOffset
=
NULL
;
seqOffset
=
NULL
;
}
}
...
@@ -55,6 +58,7 @@ T2TTrainer::~T2TTrainer()
...
@@ -55,6 +58,7 @@ T2TTrainer::~T2TTrainer()
{
{
delete
[]
buf
;
delete
[]
buf
;
delete
[]
buf2
;
delete
[]
buf2
;
delete
[]
bufBatch
;
delete
[]
seqLen
;
delete
[]
seqLen
;
delete
[]
seqLen2
;
delete
[]
seqLen2
;
delete
[]
seqOffset
;
delete
[]
seqOffset
;
...
@@ -117,9 +121,11 @@ void T2TTrainer::Init(int argc, char ** argv)
...
@@ -117,9 +121,11 @@ void T2TTrainer::Init(int argc, char ** argv)
LoadParamBool
(
argc
,
argv
,
"smallbatch"
,
&
isSmallBatch
,
true
);
LoadParamBool
(
argc
,
argv
,
"smallbatch"
,
&
isSmallBatch
,
true
);
LoadParamBool
(
argc
,
argv
,
"bigbatch"
,
&
isBigBatch
,
false
);
LoadParamBool
(
argc
,
argv
,
"bigbatch"
,
&
isBigBatch
,
false
);
LoadParamBool
(
argc
,
argv
,
"debug"
,
&
isDebugged
,
false
);
LoadParamBool
(
argc
,
argv
,
"debug"
,
&
isDebugged
,
false
);
LoadParamBool
(
argc
,
argv
,
"randbatch"
,
&
isRandomBatch
,
false
);
buf
=
new
int
[
bufSize
];
buf
=
new
int
[
bufSize
];
buf2
=
new
int
[
bufSize
];
buf2
=
new
int
[
bufSize
];
bufBatch
=
new
BatchNode
[
bufSize
];
seqLen
=
new
int
[
bufSize
];
seqLen
=
new
int
[
bufSize
];
seqLen2
=
new
int
[
bufSize
];
seqLen2
=
new
int
[
bufSize
];
seqOffset
=
new
int
[
bufSize
];
seqOffset
=
new
int
[
bufSize
];
...
@@ -172,6 +178,9 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
...
@@ -172,6 +178,9 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
double
startT
=
GetClockSec
();
double
startT
=
GetClockSec
();
FILE
*
fileen
=
fopen
(
"enc.txt"
,
"w"
);
FILE
*
filede
=
fopen
(
"dec.txt"
,
"w"
);
for
(
epoch
=
1
;
epoch
<=
nepoch
;
epoch
++
){
for
(
epoch
=
1
;
epoch
<=
nepoch
;
epoch
++
){
#ifndef WIN32
#ifndef WIN32
if
(
isShuffled
)
if
(
isShuffled
)
...
@@ -205,6 +214,10 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
...
@@ -205,6 +214,10 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
CheckNTErrors
(
batchEnc
.
order
==
2
,
"wrong tensor order of the sequence batch"
);
CheckNTErrors
(
batchEnc
.
order
==
2
,
"wrong tensor order of the sequence batch"
);
//batchEnc.Dump(stderr, "enc",1);
//batchDec.Dump(stderr, "dec",1);
//paddingDec.Dump(stderr, "paddec");
/* output probabilities */
/* output probabilities */
XTensor
output
;
XTensor
output
;
...
@@ -222,17 +235,18 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
...
@@ -222,17 +235,18 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
LabelSmooth
(
&
gold
,
&
goldSmoothed
,
labelSmoothingP
);
LabelSmooth
(
&
gold
,
&
goldSmoothed
,
labelSmoothingP
);
/* make paddings for the output */
/* make paddings for the output */
if
(
output
.
GetDim
(
0
)
>
1
)
if
(
output
.
GetDim
(
0
)
>
0
)
PadOutput
(
&
output
,
&
gold
,
&
paddingDec
);
PadOutput
(
&
output
,
&
gold
,
&
paddingDec
);
/* get probabilities */
/* get probabilities */
float
prob
=
GetProb
(
&
output
,
&
gold
,
NULL
);
float
prob
=
GetProb
(
&
output
,
&
gold
,
NULL
);
//printf("%f\n", prob);
//float prob = 0;
DTYPE
lossLocal
=
-
prob
/
wc
;
DTYPE
lossLocal
=
-
prob
/
wc
;
bool
doUpdate
=
(
!
IsNAN
(
lossLocal
)
&&
!
IsINF
(
lossLocal
)
&&
lossLocal
<
1e3
F
);
bool
doUpdate
=
(
!
IsNAN
(
lossLocal
)
&&
!
IsINF
(
lossLocal
)
&&
lossLocal
<
1e3
F
);
XTensor
&
g
=
labelSmoothingP
>
0
?
goldSmoothed
:
gold
;
XTensor
&
g
=
labelSmoothingP
>
0
?
goldSmoothed
:
gold
;
//doUpdate = false;
if
(
doUpdate
)
{
if
(
doUpdate
)
{
/* recale the output for normalized loss */
/* recale the output for normalized loss */
...
@@ -292,6 +306,9 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
...
@@ -292,6 +306,9 @@ void T2TTrainer::Train(const char * fn, const char * validFN, const char * model
MakeCheckpoint
(
model
,
validFN
,
modelFN
,
"epoch"
,
epoch
);
MakeCheckpoint
(
model
,
validFN
,
modelFN
,
"epoch"
,
epoch
);
}
}
fclose
(
fileen
);
fclose
(
filede
);
double
elapsed
=
GetClockSec
()
-
startT
;
double
elapsed
=
GetClockSec
()
-
startT
;
epoch
=
MIN
(
epoch
,
nepoch
);
epoch
=
MIN
(
epoch
,
nepoch
);
...
@@ -434,11 +451,11 @@ void T2TTrainer::MakeCheckpoint(T2TModel * model, const char * validFN, const ch
...
@@ -434,11 +451,11 @@ void T2TTrainer::MakeCheckpoint(T2TModel * model, const char * validFN, const ch
sprintf
(
fn2
,
"%s.%s.%03d.output"
,
modelFN
,
label
,
id
);
sprintf
(
fn2
,
"%s.%s.%03d.output"
,
modelFN
,
label
,
id
);
model
->
Dump
(
fn
);
model
->
Dump
(
fn
);
if
(
validFN
!=
NULL
){
//
if(validFN != NULL){
T2TTrainer
trainer
;
//
T2TTrainer trainer;
trainer
.
Init
(
argNum
,
argArray
);
//
trainer.Init(argNum, argArray);
trainer
.
Test
(
validFN
,
fn2
,
model
);
//
trainer.Test(validFN, fn2, model);
}
//
}
delete
[]
fn
;
delete
[]
fn
;
delete
[]
fn2
;
delete
[]
fn2
;
...
@@ -473,7 +490,8 @@ int T2TTrainer::LoadBuf(FILE * file, bool isSorted, int step)
...
@@ -473,7 +490,8 @@ int T2TTrainer::LoadBuf(FILE * file, bool isSorted, int step)
int
wordCount
=
0
;
int
wordCount
=
0
;
while
(
fgets
(
line
,
MAX_SEQUENCE_LENGTH
-
1
,
file
)){
while
(
fgets
(
line
,
MAX_SEQUENCE_LENGTH
-
1
,
file
)){
int
len
=
(
int
)
strlen
(
line
);
int
len
=
(
int
)
strlen
(
line
);
if
(
line
[
0
]
==
'b'
)
break
;
while
(
line
[
len
-
1
]
==
'\r'
||
line
[
len
-
1
]
==
'\n'
){
while
(
line
[
len
-
1
]
==
'\r'
||
line
[
len
-
1
]
==
'\n'
){
line
[
len
-
1
]
=
0
;
line
[
len
-
1
]
=
0
;
len
--
;
len
--
;
...
@@ -544,9 +562,14 @@ int T2TTrainer::LoadBuf(FILE * file, bool isSorted, int step)
...
@@ -544,9 +562,14 @@ int T2TTrainer::LoadBuf(FILE * file, bool isSorted, int step)
node
.
offset
=
i
;
node
.
offset
=
i
;
node
.
p
=
buf
+
offset
;
node
.
p
=
buf
+
offset
;
node
.
size
=
0
;
node
.
size
=
0
;
for
(
int
j
=
0
;
j
<
step
;
j
++
)
int
max
=
0
;
for
(
int
j
=
0
;
j
<
step
;
j
++
){
node
.
size
+=
seqLen
[
i
+
j
];
node
.
size
+=
seqLen
[
i
+
j
];
node
.
value
=
seqLen
[
i
];
max
=
MAX
(
max
,
seqLen
[
i
+
j
]);
}
//node.value = seqLen[i+1]+seqLen[i];
//node.value = MAX(seqLen[i+1],seqLen[i]);
node
.
value
=
max
;
count
++
;
count
++
;
offset
+=
node
.
size
;
offset
+=
node
.
size
;
}
}
...
@@ -768,6 +791,12 @@ int T2TTrainer::LoadBatchLM(FILE * file,
...
@@ -768,6 +791,12 @@ int T2TTrainer::LoadBatchLM(FILE * file,
return
sc
;
return
sc
;
}
}
int
CompareBatchNode
(
const
void
*
a
,
const
void
*
b
)
{
return
((
BatchNode
*
)
b
)
->
key
-
((
BatchNode
*
)
a
)
->
key
;
}
/*
/*
load a batch of sequences (for MT)
load a batch of sequences (for MT)
>> file - the handle to the data file
>> file - the handle to the data file
...
@@ -797,10 +826,70 @@ int T2TTrainer::LoadBatchMT(FILE * file,
...
@@ -797,10 +826,70 @@ int T2TTrainer::LoadBatchMT(FILE * file,
int
devID
,
XMem
*
mem
,
int
devID
,
XMem
*
mem
,
bool
isTraining
)
bool
isTraining
)
{
{
if
(
nextSeq
<
0
||
nextSeq
>=
nseqBuf
)
//if (nextSeq < 0 || nextSeq >= nseqBuf)
// LoadBuf(file, isSorted, 2);
if
(
nextBatch
<
0
||
nextBatch
>=
bufBatchSize
)
{
LoadBuf
(
file
,
isSorted
,
2
);
LoadBuf
(
file
,
isSorted
,
2
);
int
seq
=
MAX
(
nextSeq
,
0
);
int
seq
=
0
;
bufBatchSize
=
0
;
nextBatch
=
0
;
/* we segment the buffer into batches */
while
(
seq
<
nseqBuf
)
{
int
wcEnc
=
0
;
int
wcDec
=
0
;
int
wnEnc
=
0
;
int
wnDec
=
0
;
int
maxEnc
=
0
;
int
maxDec
=
0
;
int
sc
=
0
;
while
(
seq
+
sc
<
nseqBuf
)
{
/* source-side sequence */
wnEnc
=
seqLen
[
seq
+
sc
];
/* target-side sequence */
wnDec
=
isDoubledEnd
?
seqLen
[
seq
+
sc
+
1
]
:
seqLen
[
seq
+
sc
+
1
]
-
1
;
int
tcEnc
=
isBigBatch
?
(
wcEnc
+
wnEnc
)
:
MAX
(
maxEnc
,
wnEnc
)
*
(
sc
+
2
)
/
2
;
int
tcDec
=
isBigBatch
?
(
wcDec
+
wnDec
)
:
MAX
(
maxDec
,
wnDec
)
*
(
sc
+
2
)
/
2
;
if
(
sc
!=
0
&&
sc
>
sBatch
*
2
&&
(
tcEnc
>
wBatch
||
tcDec
>
wBatch
))
break
;
wcEnc
+=
wnEnc
;
sc
+=
1
;
if
(
maxEnc
<
wnEnc
)
maxEnc
=
wnEnc
;
wcDec
+=
wnDec
;
sc
+=
1
;
if
(
maxDec
<
wnDec
)
maxDec
=
wnDec
;
}
BatchNode
&
batch
=
bufBatch
[
bufBatchSize
];
batch
.
beg
=
seq
;
batch
.
end
=
seq
+
sc
;
batch
.
maxEnc
=
maxEnc
;
batch
.
maxDec
=
maxDec
;
batch
.
key
=
rand
();
bufBatchSize
++
;
seq
=
seq
+
sc
;
}
if
(
isRandomBatch
)
qsort
(
bufBatch
,
bufBatchSize
,
sizeof
(
BatchNode
),
CompareBatchNode
);
}
/*int seq = MAX(nextSeq, 0);
int wcEnc = 0;
int wcEnc = 0;
int wcDec = 0;
int wcDec = 0;
int wnEnc = 0;
int wnEnc = 0;
...
@@ -813,10 +902,8 @@ int T2TTrainer::LoadBatchMT(FILE * file,
...
@@ -813,10 +902,8 @@ int T2TTrainer::LoadBatchMT(FILE * file,
while(seq + sc < nseqBuf){
while(seq + sc < nseqBuf){
/* source-side sequence */
wnEnc = seqLen[seq + sc];
wnEnc = seqLen[seq + sc];
/* target-side sequence */
wnDec = isDoubledEnd ? seqLen[seq + sc + 1] : seqLen[seq + sc + 1] - 1;
wnDec = isDoubledEnd ? seqLen[seq + sc + 1] : seqLen[seq + sc + 1] - 1;
int tcEnc = isBigBatch ? (wcEnc + wnEnc): MAX(maxEnc, wnEnc) * (sc + 2) / 2;
int tcEnc = isBigBatch ? (wcEnc + wnEnc): MAX(maxEnc, wnEnc) * (sc + 2) / 2;
...
@@ -841,8 +928,18 @@ int T2TTrainer::LoadBatchMT(FILE * file,
...
@@ -841,8 +928,18 @@ int T2TTrainer::LoadBatchMT(FILE * file,
nextSeq = seq + sc;
nextSeq = seq + sc;
if(sc <= 0)
if(sc <= 0)
return 0;*/
if
(
bufBatchSize
<=
0
)
return
0
;
return
0
;
BatchNode
&
batch
=
bufBatch
[
nextBatch
++
];
int
seq
=
batch
.
beg
;
int
sc
=
batch
.
end
-
batch
.
beg
;
int
maxEnc
=
batch
.
maxEnc
;
int
maxDec
=
batch
.
maxDec
;
CheckNTErrors
(
sc
%
2
==
0
,
"The input samples must be paired"
);
int
sCount
=
sc
/
2
;
int
sCount
=
sc
/
2
;
int
seqSize
=
0
;
int
seqSize
=
0
;
int
dimsDec
[
3
]
=
{
sCount
,
maxDec
,
vsDec
};
int
dimsDec
[
3
]
=
{
sCount
,
maxDec
,
vsDec
};
...
@@ -861,13 +958,14 @@ int T2TTrainer::LoadBatchMT(FILE * file,
...
@@ -861,13 +958,14 @@ int T2TTrainer::LoadBatchMT(FILE * file,
int
wCountEnc
=
0
;
int
wCountEnc
=
0
;
int
wCountDec
=
0
;
int
wCountDec
=
0
;
int
wCountPad
=
0
;
int
wGold
=
0
;
int
wGold
=
0
;
wCount
=
0
;
wCount
=
0
;
int
*
batchEncValues
=
new
int
[
batchEnc
->
unitNum
];
int
*
batchEncValues
=
new
int
[
batchEnc
->
unitNum
];
int
*
batchDecValues
=
new
int
[
batchDec
->
unitNum
];
int
*
batchDecValues
=
new
int
[
batchDec
->
unitNum
];
//MTYPE * paddingEncOffsets = new MTYPE[sc * maxEnc / 2];
//MTYPE * paddingEncOffsets = new MTYPE[sc * maxEnc / 2];
//
MTYPE * paddingDecOffsets = new MTYPE[sc * maxDec / 2];
MTYPE
*
paddingDecOffsets
=
new
MTYPE
[
sc
*
maxDec
/
2
];
MTYPE
*
goldOffsets
=
new
MTYPE
[
sc
*
maxDec
/
2
];
MTYPE
*
goldOffsets
=
new
MTYPE
[
sc
*
maxDec
/
2
];
memset
(
batchEncValues
,
0
,
sizeof
(
int
)
*
batchEnc
->
unitNum
);
memset
(
batchEncValues
,
0
,
sizeof
(
int
)
*
batchEnc
->
unitNum
);
...
@@ -901,7 +999,10 @@ int T2TTrainer::LoadBatchMT(FILE * file,
...
@@ -901,7 +999,10 @@ int T2TTrainer::LoadBatchMT(FILE * file,
int
num
=
buf
[
seqOffset
[
s
]
+
w
];
int
num
=
buf
[
seqOffset
[
s
]
+
w
];
batchDecValues
[
batchDec
->
GetOffset2D
(
sent
,
w
)]
=
num
;
batchDecValues
[
batchDec
->
GetOffset2D
(
sent
,
w
)]
=
num
;
//paddingDecOffsets[wCountDec] = paddingDec->GetOffset2D(sent, w);
//paddingDecOffsets[wCountDec] = paddingDec->GetOffset2D(sent, w);
if
(
w
<
len
-
1
){
paddingDecOffsets
[
wCountPad
++
]
=
paddingDec
->
GetOffset2D
(
sent
,
w
);
wCount
++
;
}
if
(
w
>
0
)
if
(
w
>
0
)
goldOffsets
[
wGold
++
]
=
gold
->
GetOffset3D
(
sent
,
w
-
1
,
buf
[
seqOffset
[
s
]
+
w
]);
goldOffsets
[
wGold
++
]
=
gold
->
GetOffset3D
(
sent
,
w
-
1
,
buf
[
seqOffset
[
s
]
+
w
]);
...
@@ -911,7 +1012,7 @@ int T2TTrainer::LoadBatchMT(FILE * file,
...
@@ -911,7 +1012,7 @@ int T2TTrainer::LoadBatchMT(FILE * file,
else
else
goldOffsets
[
wGold
++
]
=
gold
->
GetOffset3D
(
sent
,
w
,
buf
[
seqOffset
[
s
]
+
w
+
1
]);
goldOffsets
[
wGold
++
]
=
gold
->
GetOffset3D
(
sent
,
w
,
buf
[
seqOffset
[
s
]
+
w
+
1
]);
}
}
wCount
++
;
//
wCount++;
wCountDec
++
;
wCountDec
++
;
if
(
seqs
!=
NULL
)
if
(
seqs
!=
NULL
)
seqs
[
seqSize
++
]
=
buf
[
seqOffset
[
s
]
+
w
];
seqs
[
seqSize
++
]
=
buf
[
seqOffset
[
s
]
+
w
];
...
@@ -924,19 +1025,19 @@ int T2TTrainer::LoadBatchMT(FILE * file,
...
@@ -924,19 +1025,19 @@ int T2TTrainer::LoadBatchMT(FILE * file,
}
}
batchDec
->
SetData
(
batchDecValues
,
batchDec
->
unitNum
);
batchDec
->
SetData
(
batchDecValues
,
batchDec
->
unitNum
);
//paddingDec->SetDataBatched(paddingDecOffsets, 1.0F, wCountDec
);
paddingDec
->
SetDataBatched
(
paddingDecOffsets
,
1.0
F
,
wCountPad
);
XTensor
*
tmp2
=
NewTensorBuf
(
paddingDec
,
devID
,
mem
);
//
XTensor * tmp2 = NewTensorBuf(paddingDec, devID, mem);
_ConvertDataType
(
batchDec
,
tmp2
);
//
_ConvertDataType(batchDec, tmp2);
_NotEqual
(
tmp2
,
paddingDec
,
0
);
//
_NotEqual(tmp2, paddingDec, 0);
DelTensorBuf
(
tmp2
);
//
DelTensorBuf(tmp2);
gold
->
SetDataBatched
(
goldOffsets
,
1.0
F
,
wGold
);
gold
->
SetDataBatched
(
goldOffsets
,
1.0
F
,
wGold
);
delete
[]
batchEncValues
;
delete
[]
batchEncValues
;
delete
[]
batchDecValues
;
delete
[]
batchDecValues
;
//delete[] paddingEncOffsets;
//delete[] paddingEncOffsets;
//
delete[] paddingDecOffsets;
delete
[]
paddingDecOffsets
;
delete
[]
goldOffsets
;
delete
[]
goldOffsets
;
return
sc
;
return
sc
;
...
...
source/sample/transformer/T2TTrainer.h
查看文件 @
0b43acf6
...
@@ -33,6 +33,25 @@ using namespace nts;
...
@@ -33,6 +33,25 @@ using namespace nts;
namespace
transformer
namespace
transformer
{
{
/* batch descriptor: bookkeeping information for one training batch */
struct BatchNode
{
    /* beginning position */
    int beg;

    /* end position */
    int end;

    /* maximum word number on the encoder side */
    int maxEnc;

    /* maximum word number on the decoder side */
    int maxDec;

    /* a key used for sorting batches */
    int key;
};
/* trainer of the T2T model */
/* trainer of the T2T model */
class
T2TTrainer
class
T2TTrainer
{
{
...
@@ -49,9 +68,15 @@ public:
...
@@ -49,9 +68,15 @@ public:
/* another buffer */
/* another buffer */
int
*
buf2
;
int
*
buf2
;
/* batch buf */
BatchNode
*
bufBatch
;
/* buffer size */
/* buffer size */
int
bufSize
;
int
bufSize
;
/* size of batch buffer */
int
bufBatchSize
;
/* length of each sequence */
/* length of each sequence */
int
*
seqLen
;
int
*
seqLen
;
...
@@ -67,6 +92,9 @@ public:
...
@@ -67,6 +92,9 @@ public:
/* offset for next sequence in the buffer */
/* offset for next sequence in the buffer */
int
nextSeq
;
int
nextSeq
;
/* offset for next batch */
int
nextBatch
;
/* indicates whether the sequence is sorted by length */
/* indicates whether the sequence is sorted by length */
bool
isLenSorted
;
bool
isLenSorted
;
...
@@ -142,6 +170,9 @@ public:
...
@@ -142,6 +170,9 @@ public:
/* counterpart of "isSmallBatch" */
/* counterpart of "isSmallBatch" */
bool
isBigBatch
;
bool
isBigBatch
;
/* randomize batches */
bool
isRandomBatch
;
/* indicates whether we intend to debug the net */
/* indicates whether we intend to debug the net */
bool
isDebugged
;
bool
isDebugged
;
...
...
source/sample/transformer/Transformer.cpp
查看文件 @
0b43acf6
...
@@ -59,23 +59,28 @@ int TransformerMain(int argc, const char ** argv)
...
@@ -59,23 +59,28 @@ int TransformerMain(int argc, const char ** argv)
LoadParamString
(
argc
,
args
,
"test"
,
testFN
,
""
);
LoadParamString
(
argc
,
args
,
"test"
,
testFN
,
""
);
LoadParamString
(
argc
,
args
,
"output"
,
outputFN
,
""
);
LoadParamString
(
argc
,
args
,
"output"
,
outputFN
,
""
);
srand
((
unsigned
int
)
time
(
NULL
));
T2TTrainer
trainer
;
T2TTrainer
trainer
;
trainer
.
Init
(
argc
,
args
);
trainer
.
Init
(
argc
,
args
);
T2TModel
model
;
T2TModel
model
;
model
.
InitModel
(
argc
,
args
);
model
.
InitModel
(
argc
,
args
);
//if(strcmp(modelFN, ""))
//model.Read(modelFN);
/* learn model parameters */
/* learn model parameters */
if
(
strcmp
(
trainFN
,
""
))
if
(
strcmp
(
trainFN
,
""
))
trainer
.
Train
(
trainFN
,
testFN
,
strcmp
(
modelFN
,
""
)
?
modelFN
:
"checkpoint.model"
,
&
model
);
trainer
.
Train
(
trainFN
,
testFN
,
strcmp
(
modelFN
,
""
)
?
modelFN
:
"checkpoint.model"
,
&
model
);
/* save the final model */
/* save the final model */
if
(
strcmp
(
modelFN
,
""
)
&&
strcmp
(
trainFN
,
""
))
//
if(strcmp(modelFN, "") && strcmp(trainFN, ""))
model
.
Dump
(
modelFN
);
//
model.Dump(modelFN);
/* load the model if necessary */
/* load the model if necessary */
if
(
strcmp
(
modelFN
,
""
))
//
if(strcmp(modelFN, ""))
model
.
Read
(
modelFN
);
//
model.Read(modelFN);
T2TTrainer
tester
;
T2TTrainer
tester
;
tester
.
Init
(
argc
,
args
);
tester
.
Init
(
argc
,
args
);
...
...
source/tensor/XDevice.cpp
查看文件 @
0b43acf6
...
@@ -60,6 +60,7 @@ XDevice::~XDevice()
...
@@ -60,6 +60,7 @@ XDevice::~XDevice()
cublasDestroy
(
cublasHandle
);
cublasDestroy
(
cublasHandle
);
if
(
stream
!=
NULL
)
if
(
stream
!=
NULL
)
delete
stream
;
delete
stream
;
curandDestroyGenerator
(
gen
);
#endif
#endif
}
}
...
@@ -82,6 +83,10 @@ void XDevice::Init(int myDevID)
...
@@ -82,6 +83,10 @@ void XDevice::Init(int myDevID)
cudaDeviceProp
prop
;
cudaDeviceProp
prop
;
cudaSetDevice
(
myDevID
);
cudaSetDevice
(
myDevID
);
curandCreateGenerator
(
&
gen
,
CURAND_RNG_PSEUDO_DEFAULT
);
curandSetPseudoRandomGeneratorSeed
(
gen
,
seed
);
if
(
cudaGetDeviceProperties
(
&
prop
,
devID
)
!=
cudaSuccess
){
if
(
cudaGetDeviceProperties
(
&
prop
,
devID
)
!=
cudaSuccess
){
XPRINT1
(
0
,
stderr
,
"cannot get GPU(%d) information."
,
devID
);
XPRINT1
(
0
,
stderr
,
"cannot get GPU(%d) information."
,
devID
);
exit
(
1
);
exit
(
1
);
...
...
source/tensor/XDevice.h
查看文件 @
0b43acf6
...
@@ -112,6 +112,9 @@ public:
...
@@ -112,6 +112,9 @@ public:
/* specify if the handle is initialized */
/* specify if the handle is initialized */
bool
isHandleReady
;
bool
isHandleReady
;
/* generator of random numbers */
curandGenerator_t
gen
;
#endif
#endif
...
...
source/tensor/XTensor.cpp
查看文件 @
0b43acf6
...
@@ -1614,11 +1614,17 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg,
...
@@ -1614,11 +1614,17 @@ void XTensor::Dump(FILE * file, const char * label, const int n, const int beg,
else
if
(
dataType
==
X_INT
)
{
else
if
(
dataType
==
X_INT
)
{
int
end
=
MIN
(
n
>
0
?
beg
+
n
:
beg
+
unitNum
,
unitNum
);
int
end
=
MIN
(
n
>
0
?
beg
+
n
:
beg
+
unitNum
,
unitNum
);
for
(
int
i
=
beg
;
i
<
end
;
i
++
){
for
(
int
i
=
beg
;
i
<
end
;
i
++
){
if
((
i
%
(
dimSize
[
1
])
==
0
)
&&
(
i
!=
0
))
{
fprintf
(
file
,
"
\n
"
);
}
int
f
=
((
int
*
)
d
)[
i
];
int
f
=
((
int
*
)
d
)[
i
];
if
(
i
==
beg
)
if
(
i
==
beg
)
fprintf
(
file
,
"%d"
,
f
);
fprintf
(
file
,
"%d"
,
f
);
else
else
fprintf
(
file
,
" %d"
,
f
);
fprintf
(
file
,
" %d"
,
f
);
//if((i%(dimSize[1]-1) == 0)&&(i!=0)) {
//fprintf(file, " \n");
//}
}
}
}
}
else
else
...
...
source/tensor/core/getandset/SetData.cpp
查看文件 @
0b43acf6
...
@@ -387,7 +387,7 @@ generate data items with a uniform distribution in [lower, upper]
...
@@ -387,7 +387,7 @@ generate data items with a uniform distribution in [lower, upper]
>> lower - lower value of the range
>> lower - lower value of the range
>> upper - upper value of the range
>> upper - upper value of the range
*/
*/
void
_SetDataRand
(
XTensor
*
tensor
,
DTYPE
lower
,
DTYPE
upper
)
void
_SetDataRand
(
const
XTensor
*
tensor
,
DTYPE
lower
,
DTYPE
upper
)
{
{
CheckNTErrors
(
upper
>
lower
,
"the high value must be greater than low value!"
);
CheckNTErrors
(
upper
>
lower
,
"the high value must be greater than low value!"
);
...
@@ -432,6 +432,39 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
...
@@ -432,6 +432,39 @@ void _SetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
}
}
/*
/*
generate data items with a uniform distribution in [lower, upper] and set
the item to a pre-defined value if the item >= p, set the item to 0 otherwise
>> tensor - the tensor whose data array would be initialized
>> lower - lower value of the range
>> upper - upper value of the range
>> p - the threshold
>> value - the value we intend to assign to the item
*/
void
_SetDataRandP
(
const
XTensor
*
tensor
,
DTYPE
lower
,
DTYPE
upper
,
DTYPE
p
,
DTYPE
value
)
{
CheckNTErrors
(
tensor
->
dataType
==
DEFAULT_DTYPE
,
"TODO"
);
if
(
tensor
->
devID
<
0
)
{
_SetDataRand
(
tensor
,
lower
,
upper
);
DTYPE
*
data
=
(
DTYPE
*
)
tensor
->
data
;
for
(
int
i
=
0
;
i
<
tensor
->
unitNum
;
i
++
)
{
if
(
data
[
i
]
>=
p
)
data
[
i
]
=
value
;
else
data
[
i
]
=
0
;
}
}
else
{
#ifdef USE_CUDA
_CudaSetDataRandP
(
tensor
,
lower
,
upper
,
p
,
value
);
#else
ShowNTErrors
(
"Please recompile the code by specifying USE_CUDA"
);
#endif // USE_CUDA
}
}
/*
generate data items with a normal distribution with specified mean and standard deviation
generate data items with a normal distribution with specified mean and standard deviation
>> tensor - the tensor that keeps the data
>> tensor - the tensor that keeps the data
>> mean - mean or expectation of the distribution
>> mean - mean or expectation of the distribution
...
...
source/tensor/core/getandset/SetData.cu
查看文件 @
0b43acf6
...
@@ -186,6 +186,26 @@ void KernelSetDataRandDouble(double * d, int size, DTYPE lower, DTYPE variance)
...
@@ -186,6 +186,26 @@ void KernelSetDataRandDouble(double * d, int size, DTYPE lower, DTYPE variance)
}
}
/*
set data items to a pre-defined value if the item >= p, set the item to 0 otherwise
>> d - pointer to the data array
>> size - size of the array
>> p - the threshold
>> value - the value assigned to the items that are >= p
*/
__global__
void KernelSetDataPCut(DTYPE * d, int size, DTYPE p, DTYPE value)
{
    /* one thread per data item */
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < size) {
        if (d[i] >= p)
            d[i] = value;
        else
            d[i] = 0;
    }
}
}
/*
set data items along with a given dimension (and keep the remaining items unchanged) - kernel version
set data items along with a given dimension (and keep the remaining items unchanged) - kernel version
>> tensor - the tensor whose data array would be initialized
>> tensor - the tensor whose data array would be initialized
>> beg - the beginning position
>> beg - the beginning position
...
@@ -437,7 +457,7 @@ generate data items with a uniform distribution in [lower, upper]
...
@@ -437,7 +457,7 @@ generate data items with a uniform distribution in [lower, upper]
>> lower - lower value of the range
>> lower - lower value of the range
>> upper - upper value of the range
>> upper - upper value of the range
*/
*/
void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
void _CudaSetDataRand(
const
XTensor * tensor, DTYPE lower, DTYPE upper)
{
{
CheckNTErrors(upper > lower, "the high value must be greater than low value!");
CheckNTErrors(upper > lower, "the high value must be greater than low value!");
...
@@ -452,17 +472,46 @@ void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
...
@@ -452,17 +472,46 @@ void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper)
int devIDBackup;
int devIDBackup;
ProtectCudaDev(tensor->devID, devIDBackup);
ProtectCudaDev(tensor->devID, devIDBackup);
curandGenerator_t gen;
curandGenerator_t & gen = GDevs.GPUs[tensor->devID].gen;
curandCreateGenerator (&gen, CURAND_RNG_PSEUDO_DEFAULT);
curandSetPseudoRandomGeneratorSeed(gen, time(NULL));
curandGenerateUniform(gen , (float*)tensor->data , tensor->unitNum);
curandGenerateUniform(gen , (float*)tensor->data , tensor->unitNum);
curandDestroyGenerator(gen);
DTYPE variance = upper - lower;
DTYPE variance = upper - lower;
if(variance != 1.0F || lower != 0){
if (tensor->dataType == X_FLOAT)
if (tensor->dataType == X_FLOAT)
KernelSetDataRandFloat <<<blocks, threads >>>((float*) tensor->data, tensor->unitNum, lower, variance);
KernelSetDataRandFloat <<<blocks, threads >>>((float*) tensor->data, tensor->unitNum, lower, variance);
else if (tensor->dataType == X_DOUBLE)
else if (tensor->dataType == X_DOUBLE)
KernelSetDataRandDouble <<<blocks, threads >>>((double*)tensor->data, tensor->unitNum, lower, variance);
KernelSetDataRandDouble <<<blocks, threads >>>((double*)tensor->data, tensor->unitNum, lower, variance);
}
BacktoCudaDev(tensor->devID, devIDBackup);
}
/*
generate data items with a uniform distribution in [lower, upper] and set
the item to a pre-defined value if the item >= p, set the item to 0 otherwise
>> tensor - the tensor whose data array would be initialized
>> lower - lower value of the range
>> upper - upper value of the range
>> p - the threshold
>> value - the value we intend to assign to the item
*/
void _CudaSetDataRandP(const XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value)
{
    /* fill the tensor with uniform random numbers first */
    _CudaSetDataRand(tensor, lower, upper);

    int gridSize[3];
    int blockSize[3];

    GDevs.GetCudaThread(tensor->devID, tensor->unitNum, gridSize, blockSize);

    dim3 blocks(gridSize[0]);
    dim3 threads(blockSize[0]);

    int devIDBackup;
    ProtectCudaDev(tensor->devID, devIDBackup);

    /* then cut the items against the threshold p.
       NOTE(review): the (float*) cast assumes X_FLOAT data — confirm the
       caller guarantees this (the CPU counterpart checks DEFAULT_DTYPE). */
    KernelSetDataPCut << <blocks, threads >> >((float*)tensor->data, tensor->unitNum, p, value);

    BacktoCudaDev(tensor->devID, devIDBackup);
}
...
...
source/tensor/core/getandset/SetData.cuh
查看文件 @
0b43acf6
...
@@ -47,7 +47,11 @@ void _CudaSetDataIndexed(XTensor * source, XTensor * modify, int dim, int index)
...
@@ -47,7 +47,11 @@ void _CudaSetDataIndexed(XTensor * source, XTensor * modify, int dim, int index)
void _CudaSetDataLowTri(XTensor * tensor, DTYPE p, int shift);
void _CudaSetDataLowTri(XTensor * tensor, DTYPE p, int shift);
/* generate data items with a uniform distribution in [lower, upper] */
/* generate data items with a uniform distribution in [lower, upper] */
void _CudaSetDataRand(XTensor * tensor, DTYPE lower, DTYPE upper);
void _CudaSetDataRand(const XTensor * tensor, DTYPE lower, DTYPE upper);
/* generate data items with a uniform distribution in [lower, upper] and set
the item to a pre-defined value if the item >= p, set the item to 0 otherwise */
void _CudaSetDataRandP(const XTensor * tensor, DTYPE lower, DTYPE upper, DTYPE p, DTYPE value);
/* set the data with an array of offsets */
/* set the data with an array of offsets */
void _CudaSetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYPE num);
void _CudaSetDataWithOffset(XTensor * tensor, MTYPE * offsets, DTYPE value, MTYPE num);
...
...
source/tensor/core/getandset/SetData.h
查看文件 @
0b43acf6
...
@@ -55,7 +55,11 @@ void _SetDataIndexed(XTensor * source, XTensor * modify, int dim, int index);
...
@@ -55,7 +55,11 @@ void _SetDataIndexed(XTensor * source, XTensor * modify, int dim, int index);
void
_SetDataLowTri
(
XTensor
*
tensor
,
DTYPE
p
,
int
shift
);
void
_SetDataLowTri
(
XTensor
*
tensor
,
DTYPE
p
,
int
shift
);
/* generate data items with a uniform distribution in [lower, upper] */
/* generate data items with a uniform distribution in [lower, upper] */
void
_SetDataRand
(
XTensor
*
tensor
,
DTYPE
lower
,
DTYPE
upper
);
void
_SetDataRand
(
const
XTensor
*
tensor
,
DTYPE
lower
,
DTYPE
upper
);
/* generate data items with a uniform distribution in [lower, upper] and set
the item to a pre-defined value if the item >= p, set the item to 0 otherwise */
void
_SetDataRandP
(
const
XTensor
*
tensor
,
DTYPE
lower
,
DTYPE
upper
,
DTYPE
p
,
DTYPE
value
);
/* generate data items with a normal distribution with specified mean and standard deviation */
/* generate data items with a normal distribution with specified mean and standard deviation */
void
_SetDataRandN
(
XTensor
*
tensor
,
DTYPE
mean
=
0
.
0
F
,
DTYPE
standardDeviation
=
1
.
0
F
);
void
_SetDataRandN
(
XTensor
*
tensor
,
DTYPE
mean
=
0
.
0
F
,
DTYPE
standardDeviation
=
1
.
0
F
);
...
...
source/tensor/function/Dropout.cpp
查看文件 @
0b43acf6
...
@@ -26,6 +26,7 @@
...
@@ -26,6 +26,7 @@
#include "../core/arithmetic/Multiply.h"
#include "../core/arithmetic/Multiply.h"
#include "../core/arithmetic/MultiplyDim.h"
#include "../core/arithmetic/MultiplyDim.h"
#include "../core/math/ScaleAndShift.h"
#include "../core/math/ScaleAndShift.h"
#include "../core/getandset/SetData.h"
namespace
nts
{
// namespace nts(NiuTrans.Tensor
namespace
nts
{
// namespace nts(NiuTrans.Tensor
...
@@ -147,17 +148,21 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim, int leadingDim
...
@@ -147,17 +148,21 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim, int leadingDim
XTensor
mask
;
XTensor
mask
;
DTYPE
*
maskArray
=
NULL
;
DTYPE
*
maskArray
=
NULL
;
DTYPE
scaleFactor
=
(
DTYPE
)
1.0
/
((
DTYPE
)
1.0
-
dropProb
);
if
(
leadingDim
<
0
&&
leadingDim2
<
0
){
if
(
leadingDim
<
0
&&
leadingDim2
<
0
){
ShowNTErrors
(
"TODO"
);
XTensor
mask
;
InitTensor
(
&
mask
,
&
x
);
_SetDataRandP
(
&
mask
,
0
,
1.0
F
,
dropProb
,
scaleFactor
);
return
Multiply
(
x
,
mask
);
}
}
else
if
(
leadingDim2
<
0
){
else
if
(
leadingDim2
<
0
){
int
n
=
leadingDim
;
int
n
=
leadingDim
;
CheckNTErrors
(
n
>=
0
&&
n
<
x
.
order
,
"Wrong leadingDim!"
);
CheckNTErrors
(
n
>=
0
&&
n
<
x
.
order
,
"Wrong leadingDim!"
);
DTYPE
scaleFactor
=
(
DTYPE
)
1.0
/
((
DTYPE
)
1.0
-
dropProb
);
/* generate a mask tensor with probability p */
/* generate a mask tensor with probability p */
int
unitNum
=
x
.
dimSize
[
n
];
int
unitNum
=
x
.
dimSize
[
n
];
maskArray
=
new
DTYPE
[
unitNum
];
maskArray
=
new
DTYPE
[
unitNum
];
...
@@ -181,8 +186,6 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim, int leadingDim
...
@@ -181,8 +186,6 @@ XTensor Dropout(const XTensor &x, DTYPE dropProb, int leadingDim, int leadingDim
CheckNTErrors
(
n
>=
0
&&
n
<
x
.
order
,
"Wrong leadingDim!"
);
CheckNTErrors
(
n
>=
0
&&
n
<
x
.
order
,
"Wrong leadingDim!"
);
CheckNTErrors
(
m
>=
0
&&
m
<
x
.
order
,
"Wrong leadingDim!"
);
CheckNTErrors
(
m
>=
0
&&
m
<
x
.
order
,
"Wrong leadingDim!"
);
DTYPE
scaleFactor
=
(
DTYPE
)
1.0
/
((
DTYPE
)
1.0
-
dropProb
);
/* generate a mask tensor with probability p */
/* generate a mask tensor with probability p */
int
unitNum
=
x
.
dimSize
[
n
]
*
x
.
dimSize
[
m
];
int
unitNum
=
x
.
dimSize
[
n
]
*
x
.
dimSize
[
m
];
maskArray
=
new
DTYPE
[
unitNum
];
maskArray
=
new
DTYPE
[
unitNum
];
...
...
编写
预览
Markdown
格式
0%
重试
或
添加新文件
添加附件
取消
您添加了
0
人
到此讨论。请谨慎行事。
请先完成此评论的编辑!
取消
请
注册
或者
登录
后发表评论