NiuTrans.Tensor

Commit 90dc67f2
authored Aug 05, 2018 by xiaotong
fix bugs in back propagation and transformer
parent 50c3670f
Showing 9 changed files with 100 additions and 115 deletions.
source/network/XBackwardMath.cpp            +66  -95
source/sample/transformer/T2TAttention.cpp   +4   -3
source/sample/transformer/T2TEmbedding.cpp   +3   -3
source/sample/transformer/T2TFNN.cpp         +3   -2
source/sample/transformer/T2TOutput.cpp      +2   -1
source/sample/transformer/T2TTrainer.cpp     +4   -4
source/tensor/core/getandset/SetData.cpp     +1   -1
source/tensor/function/Softmax.cpp           +4   -4
source/tensor/function/Softmax.cu           +13   -2
source/network/XBackwardMath.cpp (view file @ 90dc67f2)
@@ -468,21 +468,19 @@ void XMathGrad::GradPower(XTensor * node)
     CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for POWER!");
 
     XTensor * a = income.tails[0];
-    XTensor * b = NewTensor(a);
-    XTensor * c = NewTensor(a);
+    XTensor * b = NewTensorBuf(a, a->devID, a->mem);
     DTYPE p = income.GetParam(0);
 
     XNoder::MakeGrad(a);
 
-    _Power(a, b, (p - 1)/p);
-    _ScaleAndShift(b, c, p);
-    _Multiply(node->grad, c, a->grad, 1.0F);
+    _Power(a, b, p - 1.0F);
+    _ScaleAndShiftMe(b, p);
+    _Multiply(node->grad, b, a->grad, 1.0F);
 
-    node->visitMark = NODE_FINISHED;
+    DelTensor(b);
 
-    delete b;
-    delete c;
+    node->visitMark = NODE_FINISHED;
 }
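Editorial note, not part of the commit: for $c = a^{p}$ the chain rule gives

$\frac{\partial E}{\partial a} = \frac{\partial E}{\partial c}\cdot p\,a^{p-1}$

so the old exponent $(p-1)/p$ computed the wrong power of $a$. The new code builds $a^{p-1}$ with _Power, scales it by $p$ in place with _ScaleAndShiftMe, and multiplies by the incoming gradient, which also removes the second temporary c.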
@@ -499,16 +497,16 @@ void XMathGrad::GradNegate(XTensor * node)
     CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for NEGATE!");
 
     XTensor * a = income.tails[0];
-    XTensor * b = NewTensor(a);
+    XTensor * b = NewTensorBuf(a, a->devID, a->mem);
 
     XNoder::MakeGrad(a);
 
     _ScaleAndShift(node->grad, b, -1.0F);
     _Sum(a->grad, b, a->grad);
 
-    node->visitMark = NODE_FINISHED;
+    DelTensorBuf(b);
 
-    delete b;
+    node->visitMark = NODE_FINISHED;
 }
@@ -525,18 +523,14 @@ void XMathGrad::GradScaleAndShift(XTensor * node)
     CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SCALEANDSHIFT!");
 
     XTensor * a = income.tails[0];
-    XTensor * b = NewTensor(a);
     DTYPE scale = income.GetParam(0);
 
     XNoder::MakeGrad(a);
 
-    _ScaleAndShift(node->grad, b, scale);
-    _Sum(a->grad, b, a->grad);
+    _Sum(a->grad, node->grad, a->grad, scale);
 
     node->visitMark = NODE_FINISHED;
-
-    delete b;
 }
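Worked equation for the simplification above: for $c = \mathrm{scale}\cdot a + \mathrm{shift}$,

$\frac{\partial E}{\partial a} = \mathrm{scale}\cdot\frac{\partial E}{\partial c}$

so the accumulation a->grad += scale * node->grad can be done by a single _Sum call with a scaling coefficient; the temporary b and the extra _ScaleAndShift pass were unnecessary.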
@@ -582,9 +576,7 @@ void XMathGrad::GradDiv(XTensor * node)
     XTensor * a = income.tails[0];
     XTensor * b = income.tails[1];
-    XTensor * c = NewTensor(b);
-    XTensor * d = NewTensor(b);
-    XTensor * e = NewTensor(b);
+    XTensor * ab2 = NewTensorBuf(a, a->devID, a->mem);
 
     XNoder::MakeGrad(a);
     XNoder::MakeGrad(b);
@@ -592,16 +584,15 @@ void XMathGrad::GradDiv(XTensor * node)
     CheckNTErrors(XTensor::IsSameShaped(a, b), "Wrong sized input tensors!");
 
     _Div(node->grad, b, a->grad, 1.0F);
 
-    _Power(b, c, -2.0F);
-    _Multiply(a, c, d);
-    _ScaleAndShift(d, e, -1.0F);
-    _Multiply(node->grad, e, b->grad, 1.0F);
+    _Power(b, ab2, -2.0F);
+    _Multiply(a, ab2, ab2);
+    _ScaleAndShiftMe(ab2, -1.0F);
+    _Multiply(node->grad, ab2, b->grad, 1.0F);
 
-    node->visitMark = NODE_FINISHED;
+    DelTensorBuf(ab2);
 
-    delete c;
-    delete d;
-    delete e;
+    node->visitMark = NODE_FINISHED;
 }
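For $c = a/b$ the quotient rule gives

$\frac{\partial E}{\partial a} = \frac{\partial E}{\partial c}\cdot\frac{1}{b}, \qquad \frac{\partial E}{\partial b} = \frac{\partial E}{\partial c}\cdot\left(-\frac{a}{b^{2}}\right)$

The rewrite builds $-a\,b^{-2}$ in the single pooled buffer ab2, reusing it as both operand and result of _Multiply, instead of chaining the three heap temporaries c, d, and e.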
@@ -618,16 +609,16 @@ void XMathGrad::GradExp(XTensor * node)
     CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for EXP!");
 
     XTensor * a = income.tails[0];
-    XTensor * b = NewTensor(a);
+    XTensor * b = NewTensorBuf(a, a->devID, a->mem);
 
     XNoder::MakeGrad(a);
 
     _Exp(a, b);
     _Multiply(node->grad, b, a->grad, 1.0F);
 
-    node->visitMark = NODE_FINISHED;
+    DelTensorBuf(b);
 
-    delete b;
+    node->visitMark = NODE_FINISHED;
 }
@@ -644,16 +635,16 @@ void XMathGrad::GradSin(XTensor * node)
     CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SIN!");
 
     XTensor * a = income.tails[0];
-    XTensor * b = NewTensor(a);
+    XTensor * b = NewTensorBuf(a, a->devID, a->mem);
 
     XNoder::MakeGrad(a);
 
     _Cos(a, b);
     _Multiply(node->grad, b, a->grad, 1.0F);
 
-    node->visitMark = NODE_FINISHED;
+    DelTensorBuf(b);
 
-    delete b;
+    node->visitMark = NODE_FINISHED;
 }
@@ -670,19 +661,17 @@ void XMathGrad::GradCos(XTensor * node)
     CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for COS!");
 
     XTensor * a = income.tails[0];
-    XTensor * b = NewTensor(a);
-    XTensor * c = NewTensor(a);
+    XTensor * b = NewTensorBuf(a, a->devID, a->mem);
 
     XNoder::MakeGrad(a);
 
     _Sin(a, b);
-    _ScaleAndShift(b, c, -1.0F);
-    _Multiply(node->grad, c, a->grad, 1.0F);
+    _ScaleAndShiftMe(b, -1.0F);
+    _Multiply(node->grad, b, a->grad, 1.0F);
 
-    node->visitMark = NODE_FINISHED;
+    DelTensorBuf(b);
 
-    delete b;
-    delete c;
+    node->visitMark = NODE_FINISHED;
 }
@@ -699,19 +688,17 @@ void XMathGrad::GradTan(XTensor * node)
     CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for TAN!");
 
     XTensor * a = income.tails[0];
-    XTensor * b = NewTensor(a);
-    XTensor * c = NewTensor(a);
+    XTensor * b = NewTensorBuf(a, a->devID, a->mem);
 
     XNoder::MakeGrad(a);
 
     _Cos(a, b);
-    _Power(b, c, -2.0F);
-    _Multiply(node->grad, c, a->grad, 1.0F);
+    _PowerMe(b, -2.0F);
+    _Multiply(node->grad, b, a->grad, 1.0F);
 
-    node->visitMark = NODE_FINISHED;
+    DelTensorBuf(b);
 
-    delete b;
-    delete c;
+    node->visitMark = NODE_FINISHED;
 }
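The three trigonometric backward passes implement the standard derivatives

$\frac{d}{da}\sin a = \cos a, \qquad \frac{d}{da}\cos a = -\sin a, \qquad \frac{d}{da}\tan a = \frac{1}{\cos^{2} a}$

with the negation and the power $-2$ now applied in place (_ScaleAndShiftMe, _PowerMe) on a single pooled buffer.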
@@ -817,16 +804,16 @@ void XMathGrad::GradAbsolute(XTensor * node)
     CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for ABSOLUTE!");
 
     XTensor * a = income.tails[0];
-    XTensor * b = NewTensor(a);
+    XTensor * b = NewTensorBuf(a, a->devID, a->mem);
 
     XNoder::MakeGrad(a);
 
     _Sign(a, b);
     _Multiply(node->grad, b, a->grad, 1.0F);
 
-    node->visitMark = NODE_FINISHED;
+    DelTensorBuf(b);
 
-    delete b;
+    node->visitMark = NODE_FINISHED;
 }
@@ -842,17 +829,9 @@ void XMathGrad::GradSign(XTensor * node)
     XLink &income = node->income;
     CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for SIGN!");
 
     XTensor * a = income.tails[0];
-    XTensor * b = NewTensor(a);
     XNoder::MakeGrad(a);
-    b->SetZeroAll();
-    _Sum(a->grad, b, a->grad);
 
-    node->visitMark = NODE_FINISHED;
+    // we do nothing here
 
-    delete b;
+    node->visitMark = NODE_FINISHED;
 }
@@ -868,17 +847,9 @@ void XMathGrad::GradRound(XTensor * node)
     XLink &income = node->income;
     CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for ROUND!");
 
     XTensor * a = income.tails[0];
-    XTensor * b = NewTensor(a);
     XNoder::MakeGrad(a);
-    b->SetZeroAll();
-    _Sum(a->grad, b, a->grad);
 
-    node->visitMark = NODE_FINISHED;
+    // we do nothing here
 
-    delete b;
+    node->visitMark = NODE_FINISHED;
 }
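Dropping the tensor work in GradSign and GradRound is safe because both functions are piecewise constant: $\mathrm{sign}(a)$ and $\mathrm{round}(a)$ have derivative $0$ almost everywhere, so the old code's _Sum of an all-zero tensor into a->grad was a no-op. The backward pass now does nothing beyond allocating the gradient.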
@@ -894,7 +865,7 @@ void XMathGrad::GradClip(XTensor * node)
     CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for CLIP!");
 
     XTensor * a = income.tails[0];
-    XTensor * b = NewTensor(a);
+    XTensor * b = NewTensorBuf(a, a->devID, a->mem);
 
     DTYPE lower = income.GetParam(0);
     DTYPE upper = income.GetParam(1);
@@ -904,9 +875,9 @@ void XMathGrad::GradClip(XTensor * node)
     _ClipBackward(node, a, node->grad, a->grad, lower, upper);
     _Sum(a->grad, b, a->grad);
 
-    node->visitMark = NODE_FINISHED;
+    DelTensorBuf(b);
 
-    delete b;
+    node->visitMark = NODE_FINISHED;
 }
@@ -923,21 +894,20 @@ void XMathGrad::GradReduceMean(XTensor * node)
     CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for Reduce!");
 
     XTensor * a = income.tails[0];
-    XTensor * b = NewTensor(a);
-    XTensor * c = NewTensor(a);
+    XTensor * b = NewTensorBuf(a, a->devID, a->mem);
 
     int dim = income.GetParamInt(0);
     int n = a->GetDim(dim);
 
     XNoder::MakeGrad(a);
 
     _Unsqueeze(node->grad, b, dim, n);
-    _ScaleAndShift(b, c, 1.0F/n);
-    _Sum(a->grad, c, a->grad);
+    _ScaleAndShiftMe(b, 1.0F/n);
+    _Sum(a->grad, b, a->grad);
 
-    node->visitMark = NODE_FINISHED;
+    DelTensorBuf(b);
 
-    delete b;
-    delete c;
+    node->visitMark = NODE_FINISHED;
 }
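For a reduction $c = \frac{1}{n}\sum_{i} a_{i}$ along a dimension of size $n$,

$\frac{\partial E}{\partial a_{i}} = \frac{1}{n}\cdot\frac{\partial E}{\partial c}$

which is exactly what _Unsqueeze (broadcast the incoming gradient back over dim) followed by the in-place scaling by $1/n$ computes. GradReduceSum below is the same pattern without the $1/n$ factor.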
@@ -954,18 +924,19 @@ void XMathGrad::GradReduceSum(XTensor * node)
     CheckNTErrors(income.tailNum == 1, "Wrong input tensor number for Reduce!");
 
     XTensor * a = income.tails[0];
-    XTensor * b = NewTensor(a);
+    XTensor * b = NewTensorBuf(a, a->devID, a->mem);
 
     int dim = income.GetParamInt(0);
     int n = a->GetDim(dim);
 
     XNoder::MakeGrad(a);
 
     _Unsqueeze(node->grad, b, dim, n);
     _Sum(a->grad, b, a->grad);
 
-    node->visitMark = NODE_FINISHED;
+    DelTensor(b);
 
-    delete b;
+    node->visitMark = NODE_FINISHED;
 }
@@ -984,9 +955,9 @@ void XMathGrad::GradReduceSumSquared(XTensor * node)
     XTensor * a = income.tails[0];
     XTensor * b = income.tails[1];
-    XTensor * c = NewTensor(a);
-    XTensor * d = NewTensor(b);
-    XTensor * e = NewTensor(c);
+    XTensor * c = NewTensorBuf(a, a->devID, a->mem);
+    XTensor * d = NewTensorBuf(b, b->devID, b->mem);
+    XTensor * e = NewTensorBuf(a, a->devID, a->mem);
 
     int dim = income.GetParamInt(0);
     int n = a->GetDim(dim);
@@ -999,11 +970,11 @@ void XMathGrad::GradReduceSumSquared(XTensor * node)
     _Multiply(e, c, a->grad, 1.0F);
     _Multiply(node->grad, d, b->grad, 1.0F);
 
-    node->visitMark = NODE_FINISHED;
+    DelTensorBuf(c);
+    DelTensorBuf(d);
+    DelTensorBuf(e);
 
-    delete c;
-    delete d;
-    delete e;
+    node->visitMark = NODE_FINISHED;
 }
@@ -1022,9 +993,9 @@ void XMathGrad::GradReduceVariance(XTensor * node)
     XTensor * a = income.tails[0];
     XTensor * b = income.tails[1];
-    XTensor * c = NewTensor(a);
-    XTensor * d = NewTensor(b);
-    XTensor * e = NewTensor(a);
+    XTensor * c = NewTensorBuf(a, a->devID, a->mem);
+    XTensor * d = NewTensorBuf(b, b->devID, b->mem);
+    XTensor * e = NewTensorBuf(a, a->devID, a->mem);
 
     int dim = income.GetParamInt(0);
     int n = a->GetDim(dim);
@@ -1037,11 +1008,11 @@ void XMathGrad::GradReduceVariance(XTensor * node)
     _Multiply(e, c, a->grad, 1.0F);
     _Multiply(node->grad, d, b->grad, 1.0F);
 
-    node->visitMark = NODE_FINISHED;
+    DelTensorBuf(c);
+    DelTensorBuf(d);
+    DelTensorBuf(e);
 
-    delete c;
-    delete d;
-    delete e;
+    node->visitMark = NODE_FINISHED;
 }
 }
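The recurring change in this file replaces heap-allocated temporaries (NewTensor / delete) with buffers drawn from the tensor's memory pool. A minimal sketch of the pattern as it is used above; the semantics of NewTensorBuf/DelTensorBuf are assumed from their call sites here, not checked against the headers:

/* sketch only, not part of the commit: the pooled-buffer pattern */
void GradUnarySketch(XTensor * node, XTensor * a)
{
    /* temporary shaped like a, taken from a's memory pool */
    XTensor * b = NewTensorBuf(a, a->devID, a->mem);

    _Exp(a, b);                               /* fill the temporary */
    _Multiply(node->grad, b, a->grad, 1.0F);  /* accumulate into a's gradient */

    DelTensorBuf(b);                          /* return the buffer to the pool */
}

One detail worth flagging: GradPower and GradReduceSum above free their pooled buffers with DelTensor rather than DelTensorBuf; if the two deleters are not interchangeable, that looks like a leftover inconsistency in this commit.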
source/sample/transformer/T2TAttention.cpp (view file @ 90dc67f2)
@@ -66,8 +66,9 @@ void T2TAttention::InitModel(int argc, const char ** argv, int myDevID, XMem * m
     InitTensor2D(&wq, d, dk, X_FLOAT, devID, mem);
     InitTensor2D(&wv, d, dv, X_FLOAT, devID, mem);
 
-    float finfoutk = sqrt(6/(d + dk));
-    float finfoutv = sqrt(6/(d + dv));
+    float scale = 1.0F;
+    float finfoutk = (float)sqrt(6.0F * scale/(d + dk));
+    float finfoutv = (float)sqrt(6.0F * scale/(d + dv));
 
     wk.SetDataRand(-finfoutk, finfoutk);
     wq.SetDataRand(-finfoutk, finfoutk);
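The bug fixed here is C++ integer division: with int dimensions d and dk, 6/(d + dk) truncates to 0 whenever d + dk > 6, so the old bound was sqrt(0) = 0 and the weights were effectively initialized to all zeros. A standalone illustration with hypothetical sizes, not from the repo:

// Hypothetical values chosen for illustration only.
#include <cstdio>
#include <cmath>

int main()
{
    int d = 512, dk = 64;
    float bad  = (float)sqrt(6/(d + dk));     /* 6/576 is integer division -> 0    */
    float good = (float)sqrt(6.0F/(d + dk));  /* float division -> roughly 0.102   */
    printf("bad=%f good=%f\n", bad, good);
    return 0;
}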
@@ -107,7 +108,7 @@ XTensor T2TAttention::Make(XTensor &k, XTensor &q, XTensor &v)
     XTensor scalar;
 
     /* scalar = softmax(Q * K^T / sqrt(dk)) * V */
-    scalar = Softmax(Linear(BMMul(qheads, X_NOTRANS, kheads, X_TRANS), 1/sqrt((float)dk)), -1);
+    scalar = Softmax(Linear(BMMul(qheads, X_NOTRANS, kheads, X_TRANS), 1/(float)sqrt((float)dk)), -1);
 
     att = BMMul(scalar, vheads);
 
     /* concatenate the heads */
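For reference, the rescaled line implements scaled dot-product attention,

$\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\frac{QK^{\top}}{\sqrt{d_k}}\right)V$

and the added (float) cast on sqrt makes the whole $1/\sqrt{d_k}$ factor explicitly single-precision.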
source/sample/transformer/T2TEmbedding.cpp (view file @ 90dc67f2)
@@ -62,7 +62,7 @@ void T2TEmbedder::InitModel(int argc, const char ** argv, int myDevID, XMem * my
     InitTensor2D(&w, vSize, eSize, X_FLOAT, devID, mem);
 
-    w.SetDataRandn(0, 1/sqrt((float)eSize));
+    w.SetDataRandn(0, 1/(float)sqrt((float)eSize));
 
     /* create the positional embedding matrix */
     MakePosEmbedding(eSize, d, maxLength);
@@ -84,11 +84,11 @@ void T2TEmbedder::MakePosEmbedding(int eSize, int d, int length)
         for(int k = 0; k < eSize; k++){
             if(k % 2 == 0){
                 int i = k/2;
-                dp[k] = sin(pos/pow(10000.0F, 2.0F * i/d));
+                dp[k] = (float)sin(pos/pow(10000.0F, 2.0F * i/d));
             }
             else{
                 int i = (k - 1)/2;
-                dp[k] = cos(pos/pow(10000.0F, 2.0F * i/d));
+                dp[k] = (float)cos(pos/pow(10000.0F, 2.0F * i/d));
             }
         }
     }
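This loop fills the sinusoidal positional embeddings of the Transformer paper,

$PE(pos, 2i) = \sin\!\left(\frac{pos}{10000^{2i/d}}\right), \qquad PE(pos, 2i+1) = \cos\!\left(\frac{pos}{10000^{2i/d}}\right)$

The added (float) casts only silence the double-to-float narrowing from sin/cos; the computed values are unchanged.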
source/sample/transformer/T2TFNN.cpp (view file @ 90dc67f2)
@@ -67,8 +67,9 @@ void T2TFNN::InitModel(int argc, const char ** argv, int myDevID, XMem * myMem)
     InitTensor2D(&w2, hSize, outSize, X_FLOAT, devID, mem);
     InitTensor1D(&b2, outSize, X_FLOAT, devID, mem);
 
-    float finfout1 = sqrt(6/(inSize + hSize));
-    float finfout2 = sqrt(6/(hSize + outSize));
+    float scale = 1.0F;
+    float finfout1 = (float)sqrt(6.0F * scale/(inSize + hSize));
+    float finfout2 = (float)sqrt(6.0F * scale/(hSize + outSize));
 
     w1.SetDataRand(-finfout1, finfout1);
     b1.SetZeroAll();
source/sample/transformer/T2TOutput.cpp (view file @ 90dc67f2)
@@ -63,7 +63,8 @@ void T2TOutput::InitModel(int argc, const char ** argv, int myDevID, XMem * myMe
     InitTensor2D(&w, hSize, vSize, X_FLOAT, devID, mem);
 
-    float finfout = sqrt(6/(hSize + vSize));
+    float scale = 1.0F;
+    float finfout = (float)sqrt(6.0F * scale/(hSize + vSize));
 
     w.SetDataRand(-finfout, finfout);
 }
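All three initializer fixes (attention, FNN, output) use the Xavier/Glorot uniform bound

$W \sim U(-a, a), \qquad a = \sqrt{\frac{6}{\mathrm{fan}_{in} + \mathrm{fan}_{out}}}$

With integer operands, 6/(hSize + vSize) truncated to 0, collapsing the bound; writing 6.0F forces float division. The new scale variable appears to be a hook for adjusting the bound later; at 1.0F it leaves the value unchanged.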
source/sample/transformer/T2TTrainer.cpp (view file @ 90dc67f2)
@@ -112,8 +112,8 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
     net.Backward(output, batch, CROSSENTROPY);
 
     /* learning rate */
-    lr = (1/sqrt((float)d)) * MIN(pow(step + 1, -0.5), (step + 1) * pow(nwarmup, -1.5));
-    lr = 0.000005F;
+    lr = (1/(float)sqrt((float)d)) * (float)MIN(pow(step + 1, -0.5), (step + 1) * pow(nwarmup, -1.5));
+    lr = 0.000002F;
 
     /* update the parameters */
     Update(model, lr);
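The first assignment is the warmup schedule from "Attention Is All You Need",

$lr = d^{-0.5}\cdot\min\!\left((step+1)^{-0.5},\ (step+1)\cdot nwarmup^{-1.5}\right)$

now with explicit float casts around sqrt and MIN. Note that the computed value is immediately overwritten by the hard-coded constant on the next line (lowered here from 5e-6 to 2e-6), which looks like a temporary debugging override rather than part of the schedule.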
@@ -132,7 +132,7 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
     if (step % 1 == 0) {
         double elapsed = GetClockSec() - startT;
-        XPRINT6(0, stderr, "[INFO] lr=%e, elapsed=%.1fs, step=%d, epoch=%d, ngram=%d, ppl=%.3f\n",
+        XPRINT6(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, ppl=%.3f\n",
                 lr, elapsed, step, epoch + 1, wordCountTotal, exp(loss/wordCount));
     }
 }
@@ -142,7 +142,7 @@ void T2TTrainer::Train(const char * fn, T2TModel * model)
     double elapsed = GetClockSec() - startT;
-    XPRINT6(0, stderr, "[INFO] lr=%e, elapsed=%.1fs, step=%d, epoch=%d, ngram=%d, ppl=%.3f\n",
+    XPRINT6(0, stderr, "[INFO] lr=%.2e, elapsed=%.1fs, step=%d, epoch=%d, word=%d, ppl=%.3f\n",
            lr, elapsed, step, epoch, wordCountTotal, exp(loss/wordCount));
     XPRINT3(0, stderr, "[INFO] training finished (took %.1fs, step=%d and epoch=%d)\n",
            elapsed, step, epoch);
source/tensor/core/getandset/SetData.cpp (view file @ 90dc67f2)
@@ -71,7 +71,7 @@ void _SetDataFanInOut(XTensor * tensor, DTYPE gain)
     }
 
     DTYPE std = gain * (float)sqrt(2.0/(fanIn + fanOut));
-    DTYPE a = sqrt(3.0) * std;
+    DTYPE a = (DTYPE)sqrt(3.0) * std;
 
     _SetDataRand(tensor, -a, a);
 }
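The $\sqrt{3}$ factor converts a target standard deviation into a uniform bound: $U(-a, a)$ has variance $a^{2}/3$, so matching a standard deviation $\sigma$ requires

$a = \sqrt{3}\,\sigma$

The added (DTYPE) cast only resolves the double-returning sqrt(3.0) against DTYPE; the math is unchanged.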
source/tensor/function/Softmax.cpp (view file @ 90dc67f2)
@@ -103,10 +103,10 @@ void _Softmax(const XTensor * x, XTensor * y, int leadDim)
     else{
         for(int i = 0; i < n; i++){
             DTYPE r = (DTYPE)exp(ip[i * m + j] - mp[j])/sp[j];
-            if (IsNAN(r))
-                r = DTYPE_MIN;
-            if (IsINF(r))
-                r = DTYPE_MIN;
+            if (r > (DTYPE)1.0F)
+                r = (DTYPE)1.0F;
+            else if (r < 0)
+                r = 0;
             op[i * m + j] = r;
         }
     }
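The same guard is applied in the two CUDA kernels below: a softmax output is a probability, so any value outside [0, 1] (from rounding or overflow) is clamped back into range, replacing the old IsNAN/IsINF checks. A minimal sketch of the shared logic, assuming DTYPE is float as in the surrounding code:

/* sketch only: clamp a softmax output into [0, 1], mirroring the guard
   added in Softmax.cpp and Softmax.cu */
static inline float ClampProb(float r)
{
    if (r > 1.0F)
        r = 1.0F;        /* rounding/overflow above 1 */
    else if (r < 0.0F)
        r = 0.0F;        /* should not occur for exp()/sum, but safe */
    return r;
}

One behavioral difference worth noting: a NaN fails both comparisons and now passes through unclamped, whereas the old code mapped it to DTYPE_MIN.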
source/tensor/function/Softmax.cu (view file @ 90dc67f2)
@@ -85,7 +85,13 @@ void KernelSoftmaxComputeTensor(DTYPE * x, DTYPE * max, DTYPE * sum, DTYPE * y,
     if(i < strideSizeTotal && j < strideNum){
         int offset = int(i / stride) * blockSize + j * stride + i2[threadIdx.x];
-        y[offset] = exp(x[offset] - xMax[threadIdx.x])/xSum[threadIdx.x];
+        DTYPE r = exp(x[offset] - xMax[threadIdx.x])/xSum[threadIdx.x];
+        if (r > (DTYPE)1.0F)
+            r = (DTYPE)1.0F;
+        else if (r < 0)
+            r = 0;
+        y[offset] = r;
     }
 }
@@ -194,7 +200,12 @@ void KernelSoftmaxComputeTensorUseBroadcast(DTYPE * input, DTYPE * max, DTYPE *
     maxData = broadcast(maxData);
     if (i < strideNum){
         int offset = int(j / stride) * blockSize + i * stride + i2;
-        output[offset] = exp(input[offset] - maxData) / sumData;
+        DTYPE r = exp(input[offset] - maxData) / sumData;
+        if (r > (DTYPE)1.0F)
+            r = (DTYPE)1.0F;
+        else if (r < 0)
+            r = 0;
+        output[offset] = r;
     }
 }
 }