new updates

d5ac0ef2 · xiaotong · 90ed014e · d5ac0ef2 · d5ac0ef2 · d5ac0ef2
Commit d5ac0ef2 authored Jan 04, 2020 by xiaotong
--- a/Section04-Phrasal-and-Syntactic-Models/section04-test.tex
+++ b/Section04-Phrasal-and-Syntactic-Models/section04-test.tex
@@ -149,373 +149,24 @@
 \subsection{引入双语句法信息}

 %%%------------------------------------------------------------------------------------------------------------
-%%%  树到树规则抽取
-\begin{frame}{引入双语句法信息}
+%%%  翻译特征
+\begin{frame}{特征}
 \begin{itemize}
-\item 对于树到树模型，源语和目标语端都有句法树，需要使用树片段到树片段的映射来描述翻译过程，这种映射关系被描述为树到树翻译规则。这里，把\\
-\vspace{-1.3em}
-\begin{eqnarray}
-\langle\ \textrm{VP}, \textrm{VP}\ \rangle & \to & \langle\ \textrm{VP(PP}_{1}\ \textrm{VP(VV(表示) NN}_{2})), \nonumber \\
-& & \ \ \textrm{VP(VBZ(was) VP(VBZ}_{2}\ \textrm{PP}_{1}))\ \rangle \nonumber
-\end{eqnarray}
-表示为\alert{树片段到树片段}的映射形式\\
-\vspace{-1.3em}
-\begin{eqnarray}
-& & \textrm{VP(PP}_{1}\ \textrm{VP(VV(表示) NN}_{2})) \nonumber \\
-& \to & \textrm{VP(VBZ(was) VP(VBZ}_{2}\ \textrm{PP}_{1})) \nonumber
-\end{eqnarray}
-
-\item<2-> 可以通过扩展GHKM方法进行树到树规则抽取
+\item 与短语和层次短语模型一样，句法模型也使用判别式模型进行建模 - $\textrm{P}(d,\textbf{t}|\textbf{s}) = \frac{\exp(\sum_{i=1}^{M} \lambda_i \cdot h_i(d,\textbf{s},\textbf{t}))}{\sum_{d',\textbf{t}'}\exp(\sum_{i=1}^{M} \lambda_i \cdot h_i(d',\textbf{s},\textbf{t}'))}$。其中特征权重$\{\lambda_i\}$可以使用最小错误率训练进行调优，特征函数$\{h_i\}$需要用户定义。
+\item<2-> 这里，所有规则满足$\langle\  \alpha_h, \beta_h\ \rangle \to \langle\ \alpha_r, \beta_r, \sim\ \rangle$的形式
    \begin{itemize}
-    \item 双语端进行可信节点的识别，之后找到节点之间的对应
-    \item 基于对应的节点获得树片段的对应，即抽取树到树规则
-    \item 规则组合、SPMT等方法同样适用
-    \end{itemize}
-
-\end{itemize}
-
-\end{frame}
-
-%%%------------------------------------------------------------------------------------------------------------
-%%%  方法1：利用词对齐归纳句法映射
-\begin{frame}{方法1：利用词对齐归纳树到树规则}
-\begin{itemize}
-\item 简单直接的方法是把GHKM方法扩展到双语的情况，利用词对齐归纳树到树映射
-    \begin{itemize}
-    \item<3-> 但是词对齐的错误往往会导致很多规则无法抽取
-    \end{itemize}
-\end{itemize}
-
-\begin{minipage}[c][5cm][t]{0.47\textwidth}
-\begin{center}
-\begin{tikzpicture}
-\begin{scope}
-\begin{scope}[scale=0.65, level distance=27pt]
-\Tree[.S
-        [.NP
-            [.DT \node(ew1){the}; ]
-            [.NNS \node(ew2){imports}; ]
-        ]
-        [.VP
-            [.VBZ \node(ew3){have}; ]
-            [.ADVP
-                [.RB \node(ew4){drastically}; ]
-                [.VBN \node(ew5){fallen}; ]
-            ]
-        ]
-     ]
-\end{scope}
-
-\begin{scope}[scale=0.65, level distance=27pt, grow'=up, xshift=-13pt, yshift=-3.5in, sibling distance=22pt]
-\Tree[.IP
-        [.NN \node(cw1){进口}; ]
-        [.VP
-            [.AD \node(cw2){大幅度}; ]
-            [.VP
-                [.VV \node(cw3){下降}; ]
-                [.AS \node(cw4){了}; ]
-            ]
-        ]
-     ]
-\end{scope}
-
-\visible<2->{
-\draw[-, dashed] (cw1) -- (ew2);
-\draw[-, dashed] (cw2) -- (ew4);
-\draw[-, dashed] (cw3) -- (ew5);
-\draw[-, dashed] (cw4) .. controls +(north:1.0) and +(south:1.6) .. (ew1);
-}
-
-\visible<3->{
-\draw[-, red, dashed,thick] (cw4) .. controls +(north:1.0) and +(south:1.6) .. (ew1);
-}
-
-
-\end{scope}
-\end{tikzpicture}
-\end{center}
-\end{minipage}
-\begin{minipage}[c][5cm][t]{0.50\textwidth}
-\visible<2->{
-\begin{tabular}{l l}
-\multicolumn{2}{l}{\textbf{\scriptsize{抽取得到的规则}}} \\
-\hline
-\scriptsize{$r_1$} & \scriptsize{AS(了) $\rightarrow$ DT(the)} \\
-\scriptsize{$r_2$} & \scriptsize{NN(进口) $\rightarrow$ NNS(imports)} \\
-\scriptsize{$r_3$} & \scriptsize{AD(大幅度) $\rightarrow$ RB(drastically)} \\
-\scriptsize{$r_4$} & \scriptsize{VV(下降) $\rightarrow$ VBN(fallen)} \\
-\scriptsize{$r_5$} & \scriptsize{IP(NN$_1$ VP(AD$_2$ VP(VV$_3$ AS$_4$)) $\rightarrow$} \\
-\multicolumn{2}{l}{\tiny{S(NP(DT$_4$ NNS$_1$) VP(VBZ(have) ADVP(RB$_2$ VBN$_3$))}} \\
-\end{tabular}
-}
-
-\visible<3->{
-\vspace{0.5em}
-
-\begin{tabular}{l l}
-\multicolumn{2}{l}{\textbf{\scriptsize{无法得到的规则}}} \\
-\hline
-\scriptsize{$r_{?}$} & \scriptsize{AS(了) $\rightarrow$ VBZ(have)} \\
-\scriptsize{$r_{?}$} & \scriptsize{NN(进口) $\rightarrow$} \\
-                     & \scriptsize{NP(DT(the) NNS(imports))} \\
-\scriptsize{$r_{?}$} & \scriptsize{IP(NN$_1$ VP$_2$) $\rightarrow$ S(NP$_1$ VP$_2$)} \\
-\end{tabular}
-}
-\end{minipage}
-
-\end{frame}
-
-%%%------------------------------------------------------------------------------------------------------------
-%%%  方法2：直接进行节点对齐然后归纳句法映射
-\begin{frame}{方法2：利用节点对齐抽取树到树规则}
-\begin{itemize}
-\item 另一种思路是直接获取源语言树节点到目标语树节点的对应关系，然后直接抽取规则，这样可避免词对齐错误
-    \begin{itemize}
-    \item 节点对其可以更准确的捕捉双语结构的对应
-    \end{itemize}
-\end{itemize}
-
-\begin{minipage}[c][5cm][t]{0.47\textwidth}
-\begin{center}
-\begin{tikzpicture}
-
-\only<1>{
-\begin{scope}
-\begin{scope}[scale=0.65, level distance=27pt]
-\Tree[.S
-        [.NP
-            [.DT \node(ew1){the}; ]
-            [.NNS \node(ew2){imports}; ]
-        ]
-        [.VP
-            [.VBZ \node(ew3){have}; ]
-            [.ADVP
-                [.RB \node(ew4){drastically}; ]
-                [.VBN \node(ew5){fallen}; ]
-            ]
-        ]
-     ]
-\end{scope}
-
-\begin{scope}[scale=0.65, level distance=27pt, grow'=up, xshift=-13pt, yshift=-3.5in, sibling distance=22pt]
-\Tree[.IP
-        [.NN \node(cw1){进口}; ]
-        [.VP
-            [.AD \node(cw2){大幅度}; ]
-            [.VP
-                [.VV \node(cw3){下降}; ]
-                [.AS \node(cw4){了}; ]
-            ]
-        ]
-     ]
-\end{scope}
-
-\draw[-, dashed] (cw1) -- (ew2);
-\draw[-, dashed] (cw2) -- (ew4);
-\draw[-, dashed] (cw3) -- (ew5);
-\draw[-, dashed] (cw4) .. controls +(north:1.0) and +(south:1.6) .. (ew1);
-
-\end{scope}
-}
-
-\begin{scope}
-
-\visible<2->{
-\begin{scope}[scale=0.65, level distance=27pt]
-\Tree[.\node[draw](en1){S};
-        [.\node[draw](en2){NP};
-            [.DT the ]
-            [.NNS imports ]
-        ]
-        [.\node[draw](en3){VP};
-            [.\node[draw](en4){VBZ}; have ]
-            [.ADVP
-                [.\node[draw](en5){RB}; drastically ]
-                [.\node[draw](en6){VBN}; fallen ]
-            ]
-        ]
-     ]
-\end{scope}
-
-\begin{scope}[scale=0.65, level distance=27pt, grow'=up, xshift=-13pt, yshift=-3.5in, sibling distance=22pt]
-\Tree[.\node[draw](cn1){\ \ IP\ \ };
-        [.\node[draw](cn2){NN}; 进口 ]
-        [.\node[draw](cn3){VP};
-            [.\node[draw](cn4){AD}; 大幅度 ]
-            [.VP
-                [.\node[draw](cn5){VV}; 下降 ]
-                [.\node[draw](cn6){AS}; 了 ]
-            ]
-        ]
-     ]
-\end{scope}
-}
-
-\visible<3->{
-\draw[latex-latex, dotted, thick, red] (cn4.east) .. controls +(east:0.5) and +(west:0.5) .. (en5.west);
-\draw[latex-latex, dotted, thick, red] (cn5.east) .. controls +(east:0.5) and +(south:0.5) .. (en6.south west);
-\draw[latex-latex, dotted, thick, red] (cn6.north west) .. controls +(north:1.5) and +(south:2.5) .. (en4.south west);
-\draw[latex-latex, dotted, thick, red] (cn3.north west) -- (en3.south west);
-\draw[latex-latex, dotted, thick, red] (cn2.west) .. controls +(west:0.6) and +(west:0.6) .. (en2.west);
-\draw[latex-latex, dotted, thick, red] (cn1.north west) .. controls +(north:4) and +(south:5.5) .. (en1.south west);
-}
-
-\end{scope}
-
-\end{tikzpicture}
-\end{center}
-\end{minipage}
-\begin{minipage}[c][5cm][t]{0.50\textwidth}
-\only<1>{
-\begin{tabular}{l l}
-\multicolumn{2}{l}{\textbf{\scriptsize{抽取得到的规则(词对齐)}}} \\
-\hline
-\scriptsize{$r_1$} & \scriptsize{AS(了) $\rightarrow$ DT(the)} \\
-\scriptsize{$r_2$} & \scriptsize{NN(进口) $\rightarrow$ NNS(imports)} \\
-\scriptsize{$r_3$} & \scriptsize{AD(大幅度) $\rightarrow$ RB(drastically)} \\
-\scriptsize{$r_4$} & \scriptsize{VV(下降) $\rightarrow$ VBN(fallen)} \\
-\scriptsize{$r_5$} & \scriptsize{IP(NN$_1$ VP(AD$_2$ VP(VV$_3$ AS$_4$)) $\rightarrow$} \\
-\multicolumn{2}{l}{\tiny{S(NP(DT$_4$ NNS$_1$) VP(VBZ(have) ADVP(RB$_2$ VBN$_3$))}} \\
-\end{tabular}
-}
-
-\visible<4->{
-\begin{tabular}{l l}
-\multicolumn{2}{l}{\textbf{\scriptsize{抽取得到的规则(子树对齐)}}} \\
-\hline
-{\color{gray!70} \scriptsize{$r_1$}} & {\color{gray!70} \scriptsize{AS(了) $\rightarrow$ DT(the)}} \\
-{\color{gray!70} \scriptsize{$r_2$}} & {\color{gray!70}\scriptsize{NN(进口) $\rightarrow$ NNS(imports)}} \\
-\scriptsize{$r_3$} & \scriptsize{AD(大幅度) $\rightarrow$ RB(drastically)} \\
-\scriptsize{$r_4$} & \scriptsize{VV(下降) $\rightarrow$ VBN(fallen)} \\
-{\color{gray!70} \scriptsize{$r_5$}} & {\color{gray!70} \scriptsize{IP(NN$_1$ VP(AD$_2$ VP(VV$_3$ AS$_4$)) $\rightarrow$}} \\
-\multicolumn{2}{l}{{\color{gray!70} \tiny{S(NP(DT$_4$ NNS$_1$) VP(VBZ(have) ADVP(RB$_2$ VBN$_3$))}}} \\
-\alert{\scriptsize{$r_6$}} & \alert{\scriptsize{AS(了) $\rightarrow$ VBZ(have)}} \\
-\alert{\scriptsize{$r_7$}} & \alert{\scriptsize{NN(进口) $\rightarrow$ }} \\
-                           & \alert{\scriptsize{NP(DT(the) NNS(imports))}}\\
-\alert{\scriptsize{$r_8$}} & \alert{\scriptsize{VP(AD$_1$ VP(VV$_2$ AS$_3$)) $\rightarrow$}} \\
-                           & \alert{\scriptsize{VP(VBZ$_3$ ADVP(RB$_1$ VBN$_2$)}} \\
-\alert{\scriptsize{$r_9$}} & \alert{\scriptsize{IP(NN$_1$ VP$_2$) $\rightarrow$ S(NP$_1$ VP$_2$)}} \\
-\end{tabular}
-}
-\end{minipage}
-
-\end{frame}
-
-%%%------------------------------------------------------------------------------------------------------------
-%%%  抽取更多的规则：节点对齐矩阵
-\begin{frame}{节点对齐矩阵}
-\begin{itemize}
-\item 节点对齐的自动获取：1）基于分类模型的方法；2）无指导节点对齐的方法
-\item 使用节点对齐的另一个好处是，我们可以直接用节点对齐矩阵进行规则抽取，而不是用单一的对齐结果
-    \begin{itemize}
-    \item 对齐矩阵可以帮助抽取更多样的规则
+    \item $\alpha_h$和$\beta_h$是规则左部的源语和目标语部分，对应树结构的根节点
+    \item $\alpha_r$和$\beta_r$是规则右部的源语和目标语部分，对应树结构
+    \item $\sim$表示$\alpha_r$和$\beta_r$中叶子非终结符的对应
+    \item 此外，定义$r(\alpha_r)$和$r(\beta_r)$为源语和目标语树结构的叶子节点序列。例如，对于规则$\langle\ \textrm{VP}, \textrm{VP}\ \rangle \to \langle\ \textrm{VP(PP}_{1}\ \textrm{VP(VV(表示) NN}_{2})), \textrm{VP(VBZ(was) VP(VBZ}_{2}\ \textrm{PP}_{1}))\ \rangle$，有 \\
+        
+        \vspace{-1.5em}
+        \begin{eqnarray}
+        r(\alpha_r) & = & \textrm{PP}_1\ \textrm{表示 NN}_2 \nonumber \\
+        r(\beta_r) & = & \textrm{was}\ \textrm{VBZ}_2\ \textrm{PP}_1\nonumber
+        \end{eqnarray}
    \end{itemize}
 \end{itemize}
-
-\vspace{-0.2em}
-\centering
-\begin{tikzpicture}
-
-\begin{scope}[scale=0.7]
-
-\begin{scope}[sibling distance=17pt, level distance=25pt]
-\Tree[.\node(en1){VP$^{[1]}$};
-        [.\node(en2){VBZ$^{[2]}$}; have ]
-        [.\node(en3){ADVP$^{[3]}$};
-            [.\node(en4){RB$^{[4]}$}; drastically ]
-            [.\node(en5){VBN$^{[5]}$}; fallen ]
-        ]
-     ]
-\end{scope}
-
-\begin{scope}[grow'=up, yshift=-2.7in, sibling distance=32pt, level distance=25pt]
-\Tree[.\node(cn1){VP$^{[1]}$};
-        [.\node(cn2){AD$^{[2]}$}; 大幅度 ]
-        [.\node(cn3){VP$^{[3]}$};
-            [.\node(cn4){VV$^{[4]}$}; 下降 ]
-            [.\node(cn5){AS$^{[5]}$}; 了 ]
-        ]
-     ]
-\end{scope}
-
-\begin{scope}[xshift=1.7in, yshift=-0.4in]
-\node[anchor=west, rotate=60] at (0.8,-0.6) {VP$^{[1]}$};
-\node[anchor=west, rotate=60] at (1.8,-0.6) {VBZ$^{[2]}$};
-\node[anchor=west, rotate=60] at (2.8,-0.6) {ADVP$^{[3]}$};
-\node[anchor=west, rotate=60] at (3.8,-0.6) {RB$^{[4]}$};
-\node[anchor=west, rotate=60] at (4.8,-0.6) {VBN$^{[5]}$};
-
-\node[] at (6.2,-1) {VP$^{[1]}$};
-\node[] at (6.2,-2) {AD$^{[2]}$};
-\node[] at (6.2,-3) {VP$^{[3]}$};
-\node[] at (6.2,-4) {VV$^{[4]}$};
-\node[] at (6.2,-5) {AS$^{[5]}$};
-
-\foreach \i in {1,...,5}{
-    \foreach \j in {-5,...,-1}{
-        \node[fill=blue,scale=0.2] at (\i,\j) {};
-    }
-}
-
-\visible<2-3>{
-\node[fill=blue, scale=1.2] at (1,-1) {};
-\node[fill=blue, scale=1.2] at (4,-2) {};
-\node[fill=blue, scale=1.2] at (2,-5) {};
-}
-
-\visible<2>{
-\node[fill=blue, scale=1.2] at (5,-4) {};
-}
-
-\visible<3>{
-\node[fill=red, scale=1.2] at (5,-4) {};
-}
-
-\visible<4-5>{
-\node[fill=blue, scale=1.1] at (1,-1) {};
-\node[fill=blue, scale=0.5] at (1,-3) {};
-\node[fill=blue, scale=0.6] at (2,-2) {};
-\node[fill=blue, scale=0.7] at (2,-3) {};
-\node[fill=blue, scale=0.7] at (2,-5) {};
-\node[fill=blue, scale=0.4] at (3,-1) {};
-\node[fill=blue, scale=0.6] at (3,-2) {};
-\node[fill=blue, scale=0.5] at (3,-3) {};
-\node[fill=blue, scale=0.9] at (4,-2) {};
-\node[fill=blue, scale=0.7] at (5,-3) {};
-\node[fill=blue, scale=0.4] at (5,-5) {};
-}
-
-\visible<4>{
-\node[fill=blue, scale=0.6] at (3,-4) {};
-\node[fill=blue, scale=0.8] at (5,-4) {};
-}
-
-\visible<5>{
-\node[fill=red, scale=0.6] at (3,-4) {};
-\node[fill=red, scale=0.8] at (5,-4) {};
-}
-
-\visible<2-3>{
-\node[] at (4,-5.8) {\footnotesize{{\color{blue} $\blacksquare$} = extractable node-pair}};
-}
-
-\visible<4-5>{
-\node[] at (4,-5.8) {\footnotesize{{\color{blue} $\blacksquare$} = possible alignment}};
-}
-
-\end{scope}
-
-\visible<3>{\draw[<->, red, thick] (cn4.east) .. controls +(east:0.9) and +(west:0.9) .. (en5.west);}
-
-\visible<5>{\draw[<->, red, dotted, very thick] (cn4.east) .. controls +(east:0.9) and +(west:0.9) .. (en5.west);}
-\visible<5>{\draw[<->, red, dotted, very thick] (cn4.west) .. controls +(west:1.0) and +(west:2) .. (en3.west);}
-
-
-
-\end{scope}
-
-\end{tikzpicture}
-
 \end{frame}

 %%%------------------------------------------------------------------------------------------------------------

--- a/Section04-Phrasal-and-Syntactic-Models/section04.tex
+++ b/Section04-Phrasal-and-Syntactic-Models/section04.tex
@@ -4287,6 +4287,8 @@ NP-BAR(NN$_1$ NP-BAR$_2$) $\to$ NN$_1$ NP-BAR$_2$
 \end{scope}

 \begin{scope}[xshift=1.7in, yshift=-0.4in]
+
+{\footnotesize
 \node[anchor=west, rotate=60] at (0.8,-0.6) {VP$^{[1]}$};
 \node[anchor=west, rotate=60] at (1.8,-0.6) {VBZ$^{[2]}$};
 \node[anchor=west, rotate=60] at (2.8,-0.6) {ADVP$^{[3]}$};
@@ -4298,6 +4300,7 @@ NP-BAR(NN$_1$ NP-BAR$_2$) $\to$ NN$_1$ NP-BAR$_2$
 \node[] at (6.2,-3) {VP$^{[3]}$};
 \node[] at (6.2,-4) {VV$^{[4]}$};
 \node[] at (6.2,-5) {AS$^{[5]}$};
+}

 \foreach \i in {1,...,5}{
    \foreach \j in {-5,...,-1}{
@@ -4370,8 +4373,23 @@ NP-BAR(NN$_1$ NP-BAR$_2$) $\to$ NN$_1$ NP-BAR$_2$

 %%%------------------------------------------------------------------------------------------------------------
 %%%  翻译特征
-\begin{frame}{翻译特征}
-% NiuTrans Manual
+\begin{frame}{特征}
+\begin{itemize}
+\item 与短语和层次短语模型一样，句法模型也使用判别式模型进行建模 - $\textrm{P}(d,\textbf{t}|\textbf{s}) = \frac{\exp(\sum_{i=1}^{M} \lambda_i \cdot h_i(d,\textbf{s},\textbf{t}))}{\sum_{d',\textbf{t}'}\exp(\sum_{i=1}^{M} \lambda_i \cdot h_i(d',\textbf{s},\textbf{t}'))}$。其中特征权重$\{\lambda_i\}$可以使用最小错误率训练进行调优，特征函数$\{h_i\}$需要用户定义。
+\item<2-> 这里，所有规则满足$\langle\  \alpha_h, \beta_h\ \rangle \to \langle\ \alpha_r, \beta_r, \sim\ \rangle$的形式
+    \begin{itemize}
+    \item $\alpha_h$和$\beta_h$是规则左部的源语和目标语部分，对应树结构的根节点
+    \item $\alpha_r$和$\beta_r$是规则右部的源语和目标语部分，对应树结构
+    \item $\sim$表示$\alpha_r$和$\beta_r$中叶子非终结符的对应
+    \item 此外，定义$r(\alpha_r)$和$r(\beta_r)$为源语和目标语树结构的叶子节点序列。例如，对于规则$\langle\ \textrm{VP}, \textrm{VP}\ \rangle \to \langle\ \textrm{VP(PP}_{1}\ \textrm{VP(VV(表示) NN}_{2})), \textrm{VP(VBZ(was) VP(VBZ}_{2}\ \textrm{PP}_{1}))\ \rangle$，有 \\
+
+        \vspace{-1.5em}
+        \begin{eqnarray}
+        r(\alpha_r) & = & \textrm{PP}_1\ \textrm{表示 NN}_2 \nonumber \\
+        r(\beta_r) & = & \textrm{was}\ \textrm{VBZ}_2\ \textrm{PP}_1\nonumber
+        \end{eqnarray}
+    \end{itemize}
+\end{itemize}
 \end{frame}

 %%%------------------------------------------------------------------------------------------------------------

--- a/Section06-Neural-Machine-Translation/section06.tex
+++ b/Section06-Neural-Machine-Translation/section06.tex
@@ -1600,7 +1600,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设，端到端学习 \\
 \end{center}

 {\scriptsize\begin{tabular}{l}
-    *$x_t$: 上一层的输出，$h_t$: 同一层上一时刻的隐藏状态\\
+    *$x_t$: 前一层的输出，$h_t$: 同一层上一时刻的隐藏状态\\
    *$c_t$: 同一层上一时刻的记忆
 \end{tabular}}
 \end{frame}
@@ -2546,7 +2546,7 @@ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|\textbf{s}_1, \alert{\textbf{C}})$ 
        \item 翻译出``world''的时候``世界''的权重很大
    \end{enumerate}
    \item 互译的词通常都会产生较大的注意力权重
-    \item 注意力的权重包含了词对齐的信息
+    \item 注意力的权重一定程度上反映了词语间的对应关系
    \end{itemize}
    \begin{center}
        \hspace*{\fill}
@@ -2823,8 +2823,8 @@ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|\textbf{s}_1, \alert{\textbf{C}})$ 
        \begin{itemize}
            \item LSTM遗忘门偏置初始为1，也就是始终选择遗忘记忆$c$，可以有效防止初始时$c$里包含的错误信号传播后面所有时刻
            \item 网络的其他偏置一般都初始化成0，可以有效防止加入过大或过小的偏置后使得激活函数的输出跑到``饱和区''，也就是梯度接近0的区域，使得训练一开始就无法跳出局部极小
-            \item 网络的权重矩阵$W$一般使用Xavier参数初始化方法，可以有效稳定训练过程，特别是对于比较``深''的网络$$W \sim \mathcal{U}(-\sqrt{\frac{6}{d_{\mathrm{in}}+d_{\mathrm{out}}}},\sqrt{\frac{6}{d_{\mathrm{in}}+d_{\mathrm{out}}}})$$
-            \item $d_{\mathrm{in}}$和$d_{\mathrm{out}}$分别是$W$的输入和输出的维度大小，经典的论文\\
+            \item<2-> 网络的权重矩阵$W$一般使用Xavier参数初始化方法，可以有效稳定训练过程，特别是对于比较``深''的网络$$W \sim \mathcal{U}(-\sqrt{\frac{6}{d_{\mathrm{in}}+d_{\mathrm{out}}}},\sqrt{\frac{6}{d_{\mathrm{in}}+d_{\mathrm{out}}}})$$
+            $d_{\mathrm{in}}$和$d_{\mathrm{out}}$分别是$W$的输入和输出的维度大小，参考论文\\
            \textbf{Understanding the difficulty of training deep feedforward neural networks}\\
            \textbf{Glorot, X., \& Bengio, Y., 2010, In Proc of AISTATS}
        \end{itemize}
@@ -2844,7 +2844,7 @@ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|\textbf{s}_1, \alert{\textbf{C}})$ 
            \end{tabular}
        \end{center}
        \item 因此需要快速得到模型看一下初步效果，选择Adam
-        \item 若是需要在一个任务上得到最优的结果，选择SGD
+        \item<2-> 若是需要在一个任务上得到最优的结果，选择SGD
        \begin{itemize}
            \item 需要注意的是，训练RNN的时候，我们通常会遇到梯度爆炸的问题，也就是梯度突然变得很大，这种情况下需要使用``梯度裁剪''来防止梯度$\pi$超过阈值$$\pi'=\pi \cdot \frac{\mathrm{threshold}}{\max(\mathrm{threshold},\parallel \pi \parallel_2)}$$
            \item 其中$\mathrm{threshold}$是手工设定的梯度大小阈值，$\parallel \cdot \parallel_2$是L2范数
@@ -2858,9 +2858,11 @@ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|\textbf{s}_1, \alert{\textbf{C}})$ 
        \item 不同优化器需要的学习率不同，比如Adam一般使用$0.001$或$0.0001$，而SGD则在$0.1\sim 1$之间挑选
        \item 但是无论使用哪个优化器，为了保证训练又快又好，我们通常都需要根据当前的更新次数来调整学习率的大小
        \begin{itemize}
-            \item 学习率预热：模型训练初期，梯度通常很大，直接使用很大的学习率很容易让模型跑偏，因此需要学习率有一个从小到大的过程
-            \item 学习率衰减：模型训练接近收敛的时候，使用大学习率会很容易让模型错过局部极小，因此需要学习率逐渐变小来逼近局部最小
+            \item<2-> 学习率预热：模型训练初期，梯度通常很大，直接使用很大的学习率很容易让模型跑偏，因此需要学习率有一个从小到大的过程
+            \item<2-> 学习率衰减：模型训练接近收敛的时候，使用大学习率会很容易让模型错过局部极小，因此需要学习率逐渐变小来逼近局部最小
        \end{itemize}
+        
+        \visible<2->{
        \begin{center}
            \begin{tikzpicture}
            \footnotesize{
@@ -2880,14 +2882,15 @@ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|\textbf{s}_1, \alert{\textbf{C}})$ 
            }
            \end{tikzpicture}
        \end{center}
+        }
    \end{itemize}
 \end{frame}

 \begin{frame}{训练 - 加速}
    \begin{itemize}
-        \item 万事俱备，只是为什么训练这么慢？\visible<2>{\alert{- RNN需要等前面所有时刻都完成计算以后才能开始计算当前时刻的输出}}
-        \item 我有钱，是不是多买几台设备会更快？\visible<2>{\alert{- 可以，但是需要技巧，而且也不是无限增长的}}
-        \item<2> 使用多个设备并行计算进行加速的两种方法
+        \item 万事俱备，只是为什么训练这么慢？\visible<2->{\alert{- RNN需要等前面所有时刻都完成计算以后才能开始计算当前时刻的输出}}
+        \item 我有钱，是不是多买几台设备会更快？\visible<2->{\alert{- 可以，但是需要技巧，而且也不是无限增长的}}
+        \item<3> 使用多个设备并行计算进行加速的两种方法
        \begin{itemize}
            \item 数据并行：把``输入''分到不同设备上并行计算
            \item 模型并行：把``模型''分到不同设备上并行计算
@@ -2901,7 +2904,8 @@ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|\textbf{s}_1, \alert{\textbf{C}})$ 
                模型并行 & \specialcell{l}{可以对很大的模型进行\\运算} & \specialcell{l}{只能有限并行，比如\\多少层就多少个设备} \\
            \end{tabular}
        \end{center}
-        \item<2> 这两种方法可以一起使用！！！
+        \vspace{0.5em}
+        \item<3> 这两种方法可以一起使用！！！
    \end{itemize}
 \end{frame}

@@ -4578,7 +4582,7 @@ PE_{(pos,2i+1)} = cos(pos/10000^{2i/d_{model}})
 }
 \visible<3->{
 \filldraw [fill=blue!20,draw,thick,fill opacity=0.85] ([xshift=-0.9em,yshift=0.5em]a15.north west) -- ([xshift=0.5em,yshift=-0.9em]a51.south east) --  ([xshift=0.5em,yshift=0.5em]a55.north east) -- ([xshift=-0.9em,yshift=0.5em]a15.north west);
-\node[anchor=west] (labelmask) at ([xshift=0.3em,yshift=0.5em]a23.north east) {Mask};
+\node[anchor=west] (labelmask) at ([xshift=0.3em,yshift=0.5em]a23.north east) {Masked};
 \node [rounded corners=0.3em,anchor=west,fill=blue!20] (mask) at ([xshift=0.1em]add.east) {\large{$Mask$}};
 }