单韦乔 / Toy-MT-Introduction / Commits / 6d86f2c0

Commit 6d86f2c0 authored Mar 07, 2020 by liuhui
update slides
parent be087cd2
Showing 1 changed file with 22 additions and 22 deletions.

Section05-Neural-Networks-and-Language-Modeling/section05.tex  +22 −22
@@ -3040,7 +3040,7 @@ J(\textbf{w}_t) = L(\textbf{x}_i,\tilde{\textbf{y}}_i;\textbf{w}_t)
 \item<2-> \textbf{小批量梯度下降(Mini-batch Gradient Descent)}:
 \begin{displaymath}
-J(\textbf{w}_t) = \frac{1}{m}\sum_{i=j}^{j+m} L(\textbf{x}_i,\tilde{\textbf{y}}_i;\textbf{w}_t)
+J(\textbf{w}_t) = \frac{1}{m}\sum_{i=j}^{j+m-1} L(\textbf{x}_i,\tilde{\textbf{y}}_i;\textbf{w}_t)
 \end{displaymath}
 每次随机使用若干样本进行参数更新(数量不会特别大),算是一种折中方案,当今最常用的方法之一
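The substantive fix in this hunk is the summation's upper bound: summing from i = j to j+m covers m+1 samples, while the 1/m normalizer assumes exactly m, so j+m-1 is the correct bound for mini-batch gradient descent. A minimal C++ sketch of the corrected average (the perSampleLoss vector and function name are illustrative, not from the slides):

#include <cstdio>
#include <cstddef>
#include <vector>

// Average loss over the mini-batch of m samples starting at index j,
// mirroring the corrected bound: J(w_t) = (1/m) * sum_{i=j}^{j+m-1} L_i.
// perSampleLoss[i] stands in for L(x_i, y~_i; w_t) and is illustrative only.
double miniBatchLoss(const std::vector<double>& perSampleLoss,
                     std::size_t j, std::size_t m) {
    double sum = 0.0;
    for (std::size_t i = j; i < j + m; ++i) // i = j .. j+m-1: exactly m terms
        sum += perSampleLoss[i];
    return sum / static_cast<double>(m);    // the 1/m factor matches m terms
}

int main() {
    std::vector<double> loss = {0.9, 0.8, 0.7, 0.6, 0.5};
    // a batch of m = 2 starting at j = 1 averages loss[1] and loss[2] only
    std::printf("J = %f\n", miniBatchLoss(loss, 1, 2)); // 0.75
    return 0;
}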
@@ -3279,7 +3279,7 @@ $+2x^2+x+1)$ & \ \ $(x^4+2x^3+2x^2+x+1)$ & $+6x+1$ \\
 \begin{pgfonlayer}{background}
 \node [rectangle,inner sep=0.2em,fill=red!20] [fit = (neuron01) (neuron04)] (layer01) {};
-\node [anchor=east] (layer01label) at (layer01.west) {\scriptsize{层 $l-1$}};
+\node [anchor=east] (layer01label) at (layer01.west) {\scriptsize{层 $k-1$}};
 \end{pgfonlayer}
 %%% layer 2
@@ -3295,7 +3295,7 @@ $+2x^2+x+1)$ & \ \ $(x^4+2x^3+2x^2+x+1)$ & $+6x+1$ \\
 \begin{pgfonlayer}{background}
 \node [rectangle,inner sep=0.2em,fill=ugreen!20] [fit = (neuron11) (neuron14)] (layer02) {};
-\node [anchor=east] (layer02label) at (layer02.west) {\scriptsize{层 $l$}};
+\node [anchor=east] (layer02label) at (layer02.west) {\scriptsize{层 $k$}};
 \end{pgfonlayer}
 %%% layer 3
@@ -3312,7 +3312,7 @@ $+2x^2+x+1)$ & \ \ $(x^4+2x^3+2x^2+x+1)$ & $+6x+1$ \\
 \begin{pgfonlayer}{background}
 \node [rectangle,inner sep=0.2em,fill=blue!20] [fit = (neuron21) (neuron24)] (layer03) {};
-\node [anchor=east] (layer03label) at (layer03.west) {\scriptsize{层 $l+1$}};
+\node [anchor=east] (layer03label) at (layer03.west) {\scriptsize{层 $k+1$}};
 \end{pgfonlayer}
 %%% output layer
@@ -3329,12 +3329,12 @@ $+2x^2+x+1)$ & \ \ $(x^4+2x^3+2x^2+x+1)$ & $+6x+1$ \\
 \begin{pgfonlayer}{background}
 \node [rectangle,inner sep=0.2em,fill=ugreen!20] [fit = (neuron31) (neuron34)] (layer04) {};
-\node [anchor=east] (layer04label) at (layer04.west) {\scriptsize{层 $L$ (输出)}};
+\node [anchor=east] (layer04label) at (layer04.west) {\scriptsize{层 $K$ (输出)}};
 \end{pgfonlayer}
 \visible<2->{
 \node [neuronnode,draw=red,fill=red!20!white,inner sep=1pt] (neuron12new) at (2 * \neuronsep,3em) {};
-\node [anchor=east] (neuronsamplelabel) at ([yshift=-1em]layer02label.south east) {\alert{\textbf{\tiny{第 $l$ 层, 第 $i$ 个神经元}}}};
+\node [anchor=east] (neuronsamplelabel) at ([yshift=-1em]layer02label.south east) {\alert{\textbf{\tiny{第 $k$ 层, 第 $i$ 个神经元}}}};
 \draw [->,dashed,very thick,red] ([xshift=-0.2em,yshift=0.2em]neuronsamplelabel.east) .. controls +(30:1) and +(220:1) .. ([xshift=-0em,yshift=-0em]neuron12new.210);
 }
@@ -3350,17 +3350,17 @@ $+2x^2+x+1)$ & \ \ $(x^4+2x^3+2x^2+x+1)$ & $+6x+1$ \\
 }
 \visible<3->{
-\node [anchor=west,align=left] (line01) at ([xshift=1em,yshift=1em]layer04.east) {\footnotesize{$h_{i}^{k}$:第 $l$ 层, 第 $i$ 个神经元的输出}};
+\node [anchor=west,align=left] (line01) at ([xshift=1em,yshift=1em]layer04.east) {\footnotesize{$h_{i}^{k}$:第 $k$ 层, 第 $i$ 个神经元的输出}};
 \node [anchor=north west,align=left] (line02) at (line01.south west) {\footnotesize{$\textbf{h}^{k}$:第 $k$ 层的输出}};
 \node [anchor=north west,align=left] (line03) at (line02.south west) {\footnotesize{$\textbf{s}^{k}$:第 $k$ 层的线性变换 $\textbf{s}^k=\textbf{h}^{k-1}\textbf{w}^k$}};
-\node [anchor=north west,align=left] (line04) at (line03.south west) {\footnotesize{$f^{k}$:第 $k$ 层的激活函数 $\textbf{h}^k=f^l(\textbf{s}^k)$}};
+\node [anchor=north west,align=left] (line04) at (line03.south west) {\footnotesize{$f^{k}$:第 $k$ 层的激活函数 $\textbf{h}^k=f^k(\textbf{s}^k)$}};
 }
 \visible<4->{
 \node [anchor=north west,align=left] (line05) at (line04.south west) {\footnotesize{$\textbf{h}^{K}$:网络最后的输出}};
 }
 \visible<5->{
 \node [anchor=north west,align=left] (line06) at (line05.south west) {\footnotesize{$w_{j,i}^{k}$:第 $k-1$ 层神经元 $j$ 与}\\\footnotesize{第 $k$ 层神经元 $i$ 的连接权重}};
-\node [anchor=north west,align=left] (line07) at (line06.south west) {\footnotesize{$\textbf{w}^{k}$:第 $k-1$ 层与第 $k1$ 层的}\\\footnotesize{连接权重}};
+\node [anchor=north west,align=left] (line07) at (line06.south west) {\footnotesize{$\textbf{w}^{k}$:第 $k-1$ 层与第 $k$ 层的}\\\footnotesize{连接权重}};
 }
 \end{scope}
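Taken together, the notation this hunk repairs (layer index k, output layer K) amounts to the slide's forward recursion; restated in the file's own symbols, taking \textbf{h}^0 as the network input to match the h[0] usage in the code hunk below:

\begin{displaymath}
\textbf{s}^k = \textbf{h}^{k-1}\textbf{w}^k, \qquad \textbf{h}^k = f^k(\textbf{s}^k), \qquad k = 1,\dots,K
\end{displaymath}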
@@ -3646,27 +3646,27 @@ $+2x^2+x+1)$ & \ \ $(x^4+2x^3+2x^2+x+1)$ & $+6x+1$ \\
 \visible<2->{
 \texttt{} \\
-\texttt{CrossEntropyBackward(dldh[4], y, gold);} \\
+\texttt{CrossEntropyBackward(dh[4], y, gold);} \\
 \texttt{SoftmaxBackward(y, s[4], dh[4], ds[4]);} \\
 \texttt{MMul(h[3], {\tiny X\_TRANS}, ds[4], {\tiny X\_NOTRANS}, dw[4]);} \\
+\texttt{MMul(ds[4], {\tiny X\_NOTRANS}, w[4], {\tiny X\_TRANS}, dh[3]);} \\
 }
 \visible<3->{
 \texttt{} \\
 \texttt{dh[2] = dh[3];} \\
-\texttt{dh[1] = dh[3];} \\
+\texttt{ReluBackward(h[2], s[2], dh[2], ds[2]);} \\
-\texttt{MMul(h[1], {\tiny X\_TRANS}, ds[2], {\tiny X\_NOTRANS}, dw[2]);} \\
+\texttt{MMul(ds[2], {\tiny X\_NOTRANS}, w[2], {\tiny X\_TRANS}, dh[2]);} \\
 }
 \visible<4->{
 \texttt{} \\
-\texttt{ReluBackward(h[2], s[2], dh[2], ds[2]);} \\
+\texttt{dh[1] = dh[1] + dh[3];} \\
-\texttt{MMul(h[1], {\tiny X\_TRANS}, ds[2], {\tiny X\_NOTRANS}, dw[2]);} \\
+\texttt{} \\
-\texttt{ReluBackward(h[1], s[1], dh[1], ds[1]);} \\
+\texttt{MMul(h[0], {\tiny X\_TRANS}, ds[1], {\tiny X\_NOTRANS}, dw[1]);} \\
 }
 \visible<5->{
-\texttt{...} // 继续反向传播 \\
 \texttt{} \\
 \texttt{for(unsigned i = 0; i < 5; i++) \{} \\
 \texttt{} \ \ \ \ ... // 通过 \alert{\texttt{dw[i]}} 访问参数的梯度 \\
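This hunk retimes the backward-pass walkthrough for the graph h1 → h2 = Relu(h1 * w2) → h3 = h2 + h1 → h4 = Softmax(h3 * w4), drawn in the next hunk. As a cross-check of the chain-rule steps the slides animate, here is a minimal scalar C++ sketch of the residual portion; the scalar relu/reluBackward helpers are illustrative stand-ins, not the slides' tensor-style ReluBackward/MMul API:

#include <cstdio>

// Scalar stand-ins for the slide's ReluBackward / MMul steps (illustrative).
double relu(double x) { return x > 0 ? x : 0; }
double reluBackward(double s, double dh) { return s > 0 ? dh : 0; } // dh * relu'(s)

int main() {
    double h0 = 1.5, w1 = 0.8, w2 = -0.6;
    // forward: h1 = relu(h0*w1); h2 = relu(h1*w2); h3 = h2 + h1 (residual)
    double s1 = h0 * w1, h1 = relu(s1);
    double s2 = h1 * w2, h2 = relu(s2);
    double h3 = h2 + h1;

    // backward, seeding dL/dh3 = 1 for illustration
    double dh3 = 1.0;
    double dh2 = dh3;                   // residual node copies its gradient: dh[2] = dh[3]
    double ds2 = reluBackward(s2, dh2); // cf. ReluBackward(h[2], s[2], dh[2], ds[2])
    double dw2 = h1 * ds2;              // cf. MMul(h[1], X_TRANS, ds[2], X_NOTRANS, dw[2])
    double dh1 = ds2 * w2;              // propagate through w2, cf. MMul(ds[2], ..., w[2], ...)
    dh1 = dh1 + dh3;                    // second residual path: dh[1] = dh[1] + dh[3]
    double ds1 = reluBackward(s1, dh1);
    double dw1 = h0 * ds1;              // gradient for w1

    std::printf("h3=%.3f dw1=%.3f dw2=%.3f\n", h3, dw1, dw2);
    return 0;
}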
@@ -3685,15 +3685,15 @@ $+2x^2+x+1)$ & \ \ $(x^4+2x^3+2x^2+x+1)$ & $+6x+1$ \\
 \node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.2em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h3) at ([yshift=1.5em]h2.north) {\tiny{h2 = Relu(h1 * w2)}};
 \node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.2em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h4) at ([yshift=1.5em]h3.north) {\tiny{h3 = h2 + h1}};
-\visible<1-3>{\draw [->,thick] (h1.north) -- (h2.south);}
+\visible<1-4>{\draw [->,thick] (h1.north) -- (h2.south);}
-\visible<1-3>{\draw [->,thick] (h2.north) -- (h3.south);}
+\visible<1-2>{\draw [->,thick] (h2.north) -- (h3.south);}
 \visible<1-2>{\draw [->,thick] (h3.north) -- (h4.south);}
-\visible<1-2>{\draw [->,thick,rounded corners] (h2.east) -- ([xshift=0.5em]h2.east) -- ([xshift=0.5em,yshift=0.5em]h3.north east) -- ([xshift=-2em,yshift=0.5em]h3.north east) -- ([xshift=-2em,yshift=1.5em]h3.north east);}
+\visible<1-3>{\draw [->,thick,rounded corners] (h2.east) -- ([xshift=0.5em]h2.east) -- ([xshift=0.5em,yshift=0.5em]h3.north east) -- ([xshift=-2em,yshift=0.5em]h3.north east) -- ([xshift=-2em,yshift=1.5em]h3.north east);}
-\visible<4->{\draw [<-,very thick,red] (h1.north) -- (h2.south);}
+\visible<5>{\draw [<-,very thick,red] (h1.north) -- (h2.south);}
-\visible<4->{\draw [<-,very thick,red] (h2.north) -- (h3.south);}
+\visible<3->{\draw [<-,very thick,red] (h2.north) -- (h3.south);}
 \visible<3->{\draw [<-,very thick,red] (h3.north) -- (h4.south);}
-\visible<3->{\draw [<-,very thick,red,rounded corners] (h2.east) -- ([xshift=0.5em]h2.east) -- ([xshift=0.5em,yshift=0.5em]h3.north east) -- ([xshift=-2em,yshift=0.5em]h3.north east) -- ([xshift=-2em,yshift=1.5em]h3.north east);}
+\visible<4->{\draw [<-,very thick,red,rounded corners] (h2.east) -- ([xshift=0.5em]h2.east) -- ([xshift=0.5em,yshift=0.5em]h3.north east) -- ([xshift=-2em,yshift=0.5em]h3.north east) -- ([xshift=-2em,yshift=1.5em]h3.north east);}
 \node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8.0em,minimum height=1.2em,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (slayer) at ([yshift=1.5em]h4.north) {\tiny{h4 = Softmax(h3 * w4) (output)}};
 \node [anchor=south] (losslabel) at (slayer.north) {\scriptsize{\textbf{Cross Entropy Loss}}};
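The retimed overlays track the backward sweep through the residual node h3 = h2 + h1 in this graph. Since the partial derivative of a sum with respect to each addend is 1, the rule the red arrows animate is, in the notation of the code hunk above:

\begin{displaymath}
\frac{\partial h_3}{\partial h_2} = \frac{\partial h_3}{\partial h_1} = 1
\quad\Rightarrow\quad
dh_2 = dh_3, \qquad dh_1 \leftarrow dh_1 + dh_3
\end{displaymath}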