Merge branch 'master' of http://47.105.50.196/NiuTrans/Toy-MT-Introduction

b8e87296 · Lee · c6facf66 · 8884b1c0 · b8e87296 · b8e87296
Commit b8e87296 authored Dec 20, 2019 by Lee
--- a/Section06-Neural-Machine-Translation/section06-test.tex
+++ b/Section06-Neural-Machine-Translation/section06-test.tex
@@ -145,119 +145,70 @@
 \subsection{注意力机制}

 %%%------------------------------------------------------------------------------------------------------------
-\begin{frame}{模块1：词嵌入层}
+%%% 做个小结
+\begin{frame}{我们赶上了好时代 ...}
 \begin{itemize}
-\item 词嵌入
+\item 神经机器翻译的火爆这几年有目共睹，好事情！！！
+    \begin{itemize}
+    \item 在\url{https://arxiv.org}上搜索neural machine translation
+    \item ACL、EMNLP等顶会神经机器翻译论文数量近些年几乎呈线性增长
+    \item 神经机器翻译系统在各大比赛中霸榜，开源机器翻译满天飞，大厂秀肌肉，小作坊刷存在感
+    \end{itemize}
+    \vspace{0.3em}
+\item<2-> 这里只介绍了最基本的概念，NMT的内容远不止这些
+    \begin{itemize}
+    \item 各种专题：解码、压缩、先验知识、低资源翻译、无指导方法、篇章级翻译等等等等
+    \item 推荐一个survey，有些基础的可以参考一下，很全面 \\
+    ``Neural Machine Translation: A Review'' by Felix Stahlberg\\
+    \url{https://arxiv.org/abs/1912.02047}
+    \item 如何搭建一个优秀的NMT系统？- 有许多技巧 \\
+             下一章介绍
+    \item 回忆一下第一章介绍的NMT开源系统，可以试试
+    \end{itemize}
 \end{itemize}
-%%% 图
+\end{frame}
+
+%%%------------------------------------------------------------------------------------------------------------
+%%% last slide
+\begin{frame}{结束~}
+
+\vspace{2em}
+
 \begin{center}
-    \hspace*{-0.6cm}
-    \begin{tikzpicture}
-        \setlength{\base}{0.9cm}
-
-        \tikzstyle{rnnnode} = [rounded corners=1pt,minimum height=0.5\base,minimum width=1\base,draw,inner sep=0pt,outer sep=0pt]
-        \tikzstyle{wordnode} = [font=\tiny]
-
-        % RNN translation model
-        \begin{scope}[local bounding box=RNNMT]
-            % RNN Encoder
-            \coordinate (eemb0) at (0,0);
-            \foreach \x [count=\y from 0] in {1,2,...,3}
-                \node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=west] (eemb\x) at ([xshift=0.4\base]eemb\y.east) {\tiny{$e_x()$}};
-            \foreach \x in {1,2,...,3}
-                \node[rnnnode,fill=blue!30!white,anchor=south] (enc\x) at ([yshift=0.3\base]eemb\x.north) {};
-                \node[] (enclabel1) at (enc1) {\tiny{$\textbf{h}_{m-2}$}};
-                \node[] (enclabel2) at (enc2) {\tiny{$\textbf{h}_{m-1}$}};
-                \node[rnnnode,fill=purple!30!white] (enclabel3) at (enc3) {\tiny{$\textbf{h}_{m}$}};
-            \node[wordnode,left=0.4\base of enc1] (init1) {$\cdots$};
-            \node[wordnode,left=0.4\base of eemb1] (init2) {$\cdots$};
-
-            \node[wordnode,below=0pt of eemb1] () {走};
-            \node[wordnode,below=0pt of eemb2] () {吗};
-            \node[wordnode,below=0pt of eemb3] () {$\langle$eos$\rangle$};
-
-            % RNN Decoder
-            \foreach \x in {1,2,...,3}
-                \node[rnnnode,minimum height=0.5\base,fill=green!30!white,anchor=south] (demb\x) at ([yshift=\base]enc\x.north) {\tiny{$e_y()$}};
-            \foreach \x in {1,2,...,3}
-                \node[rnnnode,fill=blue!30!white,anchor=south] (dec\x) at ([yshift=0.3\base]demb\x.north) {{\tiny{$\textbf{s}_\x$}}};
-            \foreach \x in {1,2,...,3}
-                \node[rnnnode,minimum height=0.5\base,fill=red!30!white,anchor=south] (softmax\x) at ([yshift=0.3\base]dec\x.north) {\tiny{Softmax}};
-            \node[wordnode,right=0.4\base of demb3] (end1) {$\cdots$};
-            \node[wordnode,right=0.4\base of dec3] (end2) {$\cdots$};
-            \node[wordnode,right=0.4\base of softmax3] (end3) {$\cdots$};
-
-            % Decoder input words
-            \node[wordnode,below=0pt of demb1] (decwordin) {$\langle$sos$\rangle$};
-            \ExtractX{$(demb2.south)$}
-            \ExtractY{$(decwordin.base)$}
-            \node[wordnode,anchor=base] () at (\XCoord,\YCoord) {Do};
-            \ExtractX{$(demb3.south)$}
-            \ExtractY{$(decwordin.base)$}
-            \node[wordnode,anchor=base] () at (\XCoord,\YCoord) {you};
-
-            % Decoder output words
-            \node[wordnode,above=0pt of softmax1] (decwordout) {Do};
-            \ExtractX{$(softmax2.north)$}
-            \ExtractY{$(decwordout.base)$}
-            \node[wordnode,anchor=base] () at (\XCoord,\YCoord) {you};
-            \ExtractX{$(softmax3.north)$}
-            \ExtractY{$(decwordout.base)$}
-            \node[wordnode,anchor=base] () at (\XCoord,\YCoord) {know};
-
-            % Connections
-            \draw[-latex'] (init1.east) to (enc1.west);
-            \draw[-latex'] (dec3.east) to (end2.west);
-            \foreach \x in {1,2,...,3}
-                \draw[-latex'] (eemb\x) to (enc\x);
-            \foreach \x in {1,2,...,3}
-                \draw[-latex'] (demb\x) to (dec\x);
-            \foreach \x in {1,2,...,3}
-                \draw[-latex'] (dec\x.north) to (softmax\x.south);
-            \foreach \x [count=\y from 2] in {1,2}
-            {
-                \draw[-latex'] (enc\x.east) to (enc\y.west);
-                \draw[-latex'] (dec\x.east) to (dec\y.west);
-            }
-
-            \coordinate (bridge) at ([yshift=0.4\base]enc2.north west);
-            \draw[-latex'] (enc3.north) .. controls +(north:0.3\base) and +(east:\base) .. (bridge) .. controls +(west:2.7\base) and +(west:0.3\base) .. (dec1.west);
-        \end{scope}
-
-        \begin{scope}
-            \coordinate (start) at (5.8\base,0.3\base);
-            \node [anchor=south west] (one) at (start) {\scriptsize{$\begin{bmatrix} 0 \\ 0 \\ 0 \\ \vdots \\ 0 \\ {\color{ugreen} 1} \\ 0 \\ 0 \end{bmatrix}$}};
-            \node [anchor=north] (w) at ([yshift=3pt]one.south) {\scriptsize{\color{ugreen} you}};
-            \node [anchor=north west] (words) at ([xshift=10pt]one.north east) {\scriptsize{$\begin{matrix} \langle\textrm{eos}\rangle \\ \langle\textrm{sos}\rangle \\ \textrm{Do} \\ \vdots \\ \textrm{know} \\ \textrm{you} \\ \textrm{?} \\ \textrm{have} \end{matrix}$}};
-            \node [anchor=north west] (mat) at ([xshift=-6pt]words.north east) {\scriptsize{$
-                \begin{bmatrix} 
-                    .1 & -4 & \cdots & 2 \\ 
-                    5 & 2 & \cdots & .2 \\ 
-                    2 & .1 & \cdots & .3 \\ 
-                    \vdots & \vdots & \ddots & \vdots \\ 
-                    0 & .8 & \cdots & 4 \\ 
-                    -1 & -2 & \cdots & -3 \\ 
-                    .7 &  .5 & \cdots & 3 \\ 
-                    -2 & .3 & \cdots & .1
-                \end{bmatrix}$
-            }};
-
-            \begin{pgfonlayer}{background}
-                \node [draw=ugreen,fill=green!20!white,rounded corners=0.3em,minimum width=3.8cm,minimum height=0.9em,anchor=south west] (emb) at ([shift={(1.25cm,0.8cm)}]start) {};
-            \end{pgfonlayer}
-
-            \draw [decorate,decoration={brace,mirror}] ([shift={(6pt,2pt)}]mat.south west) to node [auto,swap,font=\scriptsize] {词嵌入矩阵} ([shift={(-6pt,2pt)}]mat.south east);
-
-            \draw [-latex'] ([xshift=-2pt,yshift=-0.65cm]one.east) to ([yshift=-0.65cm]words.west);
-            \draw [-latex'] (emb.east) -| ([yshift=0.4cm]mat.north east);
-            \draw [-latex'] ([yshift=-0.4cm]w.south) to ([yshift=2pt]w.south);
-
-            \node [draw=ugreen,densely dashed,thick,rounded corners=3pt,fit=(one) (words) (mat) (w)] (input) {};
-        \end{scope}
-
-        \draw [->,thick,densely dashed,ugreen] ([yshift=-0.2em]demb3.east) to [out=0,in=180] ([yshift=-1cm]input.west);
-    \end{tikzpicture}
+
+\begin{tikzpicture}
+
+\begin{scope}
+\small{
+\node [anchor=south west,minimum width=15em] (source) at (0,0) {\Large{\textbf{source}: 谢谢 大家 ！}};
+\node [anchor=south west,minimum width=15em] (target) at ([yshift=12em]source.north west) {\Large{\textbf{target}: Thank You !}};
+\node [anchor=center,minimum width=9.6em,minimum height=1.8em,draw,rounded corners=0.3em] (hidden) at ([yshift=6em]source.north) {};
+\node [anchor=west,minimum width=1.5em,minimum size=1.5em,fill=ugreen!20] (cell01) at ([xshift=0.2em]hidden.west) {\footnotesize{.2}};
+\node [anchor=west,minimum width=1.5em,minimum size=1.5em,fill=ugreen!10] (cell02) at (cell01.east) {\footnotesize{-1}};
+\node [anchor=west,minimum width=1.5em,minimum size=1.5em,fill=ugreen!70] (cell03) at (cell02.east) {\footnotesize{6}};
+\node [anchor=west,minimum width=1.5em,minimum size=1.5em,fill=ugreen!50] (cell04) at (cell03.east) {\footnotesize{5}};
+\node [anchor=west,minimum width=1.5em,minimum size=1.5em,fill=ugreen!30] (cell05) at (cell04.east) {\footnotesize{.7}};
+\node [anchor=west,minimum width=1.5em,minimum size=1.5em,fill=ugreen!10] (cell06) at (cell05.east) {\footnotesize{-2}};
+
+\filldraw [fill=red!20,draw=white] (source.north west) -- (source.north east) -- ([xshift=-0.2em,yshift=-0.1em]hidden.south east) -- ([xshift=0.2em,yshift=-0.1em]hidden.south west);
+\filldraw [fill=blue!20,draw=white] (target.south west) -- (target.south east) -- ([xshift=-0.2em,yshift=0.1em]hidden.north east) -- ([xshift=0.2em,yshift=0.1em]hidden.north west);
+
+\draw [->,thick] (source.north west) -- ([xshift=0.2em,yshift=-0.1em]hidden.south west);
+\draw [->,thick] (source.north east) -- ([xshift=-0.2em,yshift=-0.1em]hidden.south east);
+\draw [->,thick] ([xshift=0.2em,yshift=0.1em]hidden.north west) -- (target.south west);
+\draw [->,thick] ([xshift=-0.2em,yshift=0.1em]hidden.north east) -- (target.south east);
+}
+
+\node [anchor=south] (enclabel) at ([yshift=2em]source.north) {\large{Encoder}};
+\node [anchor=north] (declabel) at ([yshift=-2em]target.south) {\large{Decoder}};
+\end{scope}
+
+\end{tikzpicture}
+
+\vspace{2em}
+
 \end{center}
+
 \end{frame}

 %%%------------------------------------------------------------------------------------------------------------

--- a/Section06-Neural-Machine-Translation/section06.tex
+++ b/Section06-Neural-Machine-Translation/section06.tex
@@ -1133,10 +1133,14 @@ NLP问题的隐含结构假设 & 无隐含结构假设，端到端学习 \\
 %%% 词嵌入
 \begin{frame}{模块1：词嵌入层}
 \begin{itemize}
-\item 把输入的词转换成唯一对应的词表大小的0-1向量
-\item 根据0-1向量，从词嵌入矩阵中取出对应的词嵌入$e_y$
-\item 取出的词嵌入$e_y$作为循环神经网络的输入
+\item 词嵌入的作用是把离散化的单词表示转换为连续空间上的分布式表示
+    \begin{itemize}
+    \item 把输入的词转换成唯一对应的词表大小的0-1向量
+    \item 根据0-1向量，从词嵌入矩阵中取出对应的词嵌入$e()$
+    \item 取出的词嵌入$e()$作为循环神经网络的输入
+\end{itemize}
 \end{itemize}
+\vspace{-1em}
 %%% 图
 \begin{center}
    \hspace*{-0.6cm}
@@ -1237,8 +1241,9 @@ NLP问题的隐含结构假设 & 无隐含结构假设，端到端学习 \\
            \draw [decorate,decoration={brace,mirror}] ([shift={(6pt,2pt)}]mat.south west) to node [auto,swap,font=\scriptsize] {词嵌入矩阵} ([shift={(-6pt,2pt)}]mat.south east);

            \draw [-latex'] ([xshift=-2pt,yshift=-0.65cm]one.east) to ([yshift=-0.65cm]words.west);
-            \draw [-latex'] (emb.east) -| ([yshift=0.4cm]mat.north east);
+            \draw [-latex'] (emb.east) -| ([yshift=0.4cm]mat.north east) node [pos=1,above] {\scriptsize{RNN输入}};
            \draw [-latex'] ([yshift=-0.4cm]w.south) to ([yshift=2pt]w.south);
+            \node [anchor=north] (wlabel) at ([yshift=-0.6em]w.south) {\scriptsize{输入的单词}};

            \node [draw=ugreen,densely dashed,thick,rounded corners=3pt,fit=(one) (words) (mat) (w)] (input) {};
        \end{scope}
@@ -1252,9 +1257,11 @@ NLP问题的隐含结构假设 & 无隐含结构假设，端到端学习 \\
 %%% 输出
 \begin{frame}{模块2：输出层}
 \begin{itemize}
-\item 循环网络输出$s$经过权重矩阵$W$变换成词表大小的向量
-\item 获得的向量经过Softmax变换得到不同词作为输出的概率
-\item 一般选取概率最高的词作为模型最终的输出
+\item 输出层需要得到每个目标语单词的生成概率，进而选取概率最高的词作为输出。但RNN中的隐藏层并不会输出单词概率，而是输出$s$，其每一行对应一个单词表示
+    \begin{itemize}
+    \item $s$经过权重矩阵$W$变成$\hat{s}$，其隐藏层维度变换成词表的大小
+    \item $\hat{s}$经过Softmax变换得到不同词作为输出的概率，即单词$i$的概率$p_i = \textrm{Softmax}(i) = \frac{e^{\hat{s}_i}}{\sum_{j} e^{\hat{s}_{j}}} $
+    \end{itemize}
 \end{itemize}
 %%% 图
 \begin{center}
@@ -1391,16 +1398,9 @@ NLP问题的隐含结构假设 & 无隐含结构假设，端到端学习 \\
 %%%------------------------------------------------------------------------------------------------------------
 %%% LSTM
 \begin{frame}{模块3：循环单元 - 长短时记忆模型(LSTM)}
-\only<1>{遗忘门根据输入的$x_t$和$h_t$决定保留多大比例的$c_t$\\[5pt]}
-\only<2>{输入门根据$x_t$和$h_t$计算需要保存的记忆$\hat{c}_t$和其比例\\[5pt]}
-\only<3>{把$c_t$和$\hat{c}_t$组合得到新的记忆$c_{t+1}$\\[5pt]}
-\only<4>{输出门根据$x_t$，$h_t$和$c_{t+1}$得到新的隐藏状态$h_{t+1}$\\[5pt]}
-\only<5>{如此反复，不断更新$c$和$h$直到不再有新的$x$输入\\[5pt]}
-{\scriptsize\begin{tabular}{rl}
-    *$x_t$:&上一层的输出\\
-    *$h_t$:&同一层上一时刻的隐藏状态\\
-    *$c_t$:&同一层上一时刻的记忆
-\end{tabular}}
+\begin{itemize}
+\item LSTM是最常用的循环单元结构，它是一种典型的记忆网络，通过``门''单元来动态地选择遗忘多少以前的信息
+\end{itemize}
 %%% 图
 \begin{center}
    \begin{tikzpicture}
@@ -1576,13 +1576,18 @@ NLP问题的隐含结构假设 & 无隐含结构假设，端到端学习 \\
        \end{scope}
    \end{tikzpicture}
 \end{center}
+
+{\scriptsize\begin{tabular}{l}
+    *$x_t$: 上一层的输出，$h_t$: 同一层上一时刻的隐藏状态\\
+    *$c_t$: 同一层上一时刻的记忆
+\end{tabular}}
 \end{frame}

 %%%------------------------------------------------------------------------------------------------------------
 %%% GRU
-\begin{frame}{改进 - 门循环单元(GRU)}
+\begin{frame}{另一种循环单元 - 门循环单元(GRU)}
 \begin{itemize}
-\item GRU
+\item GRU是LSTM的一个变种，它把隐藏状态$h$和记忆$c$合并成一个隐藏状态$h$，同时使用了更少的``门''单元，大大提升了计算效率
 \end{itemize}
 %%% 图
 \begin{center}
@@ -1730,6 +1735,11 @@ NLP问题的隐含结构假设 & 无隐含结构假设，端到端学习 \\
        \end{scope}
    \end{tikzpicture}
 \end{center}
+
+{\scriptsize\begin{tabular}{l}
+    *$x_t$: 上一层的输出\\
+    *$h_t$: 同一层上一时刻的隐藏状态
+\end{tabular}}
 \end{frame}

 %%%------------------------------------------------------------------------------------------------------------
@@ -2653,6 +2663,50 @@ $\textrm{``you''} = \argmax_{y} \textrm{P}(y|\textbf{s}_1, \alert{\textbf{C}})$ 
 %%%------------------------------------------------------------------------------------------------------------
 %%% 训练
 \begin{frame}{训练}
+    \begin{itemize}
+        \item 有了一个NMT模型，我们应该怎么使用梯度下降算法来训练一个``聪明''的翻译模型呢？
+        \begin{itemize}
+            \item 参数初始化
+            \only<2>{：模型结构是确定了，但是我们初始化参数还有很多需要注意的地方，否则训练不了一个优秀的模型
+                \begin{itemize}
+                    \item LSTM遗忘门偏置初始为1，也就是始终选择遗忘记忆$c$，可以有效防止初始时$c$里包含的错误信号传播到后面所有时刻
+                    \item 其他参数一般使用Xavier参数初始化方法，可以有效稳定训练过程，特别是对于比较``深''的网络
+                \end{itemize}
+            }
+            \item 优化器选择
+            \only<3-5>{：训练RNN我们通常会使用Adam或者SGD两种优化器，它们各有优劣
+                \begin{center}
+                    \footnotesize
+                    \begin{tabular}{c|c|c}
+                        & 使用 & 性能 \\
+                        \hline
+                        Adam & 一套配置包打天下 & 不算差，但没到极限 \\
+                        SGD & 换一个任务就得调 & 效果杠杠的 \\
+                    \end{tabular}
+                \end{center}
+            }
+            \item 学习率调度
+            \only<4>{
+                \begin{itemize}
+                    \item 不同优化器需要的学习率不同，比如Adam一般使用$0.001$或$0.0001$，而SGD则在$0.1\sim 1$之间挑选
+                    \item 但是无论使用哪个优化器，为了达到最好效果，我们通常都需要根据当前的更新次数来调整学习率的大小
+                \end{itemize}
+            }
+            \only<5>{
+                \begin{itemize}
+                    \item 学习率预热
+                    \item 学习率衰减
+                \end{itemize}
+            }
+            \item 多设备并行
+            \only<6->{
+                \begin{itemize}
+                    \item 万事俱备，只是为什么训练这么慢？\only<7->{\alert{- RNN需要等前面所有时刻都完成计算以后才能开始计算当前时刻的输出}}
+                    \item 我有钱，是不是多买几台设备会更快？\only<7->{\alert{- 可以，但是需要技巧，而且也不是无限增长的}}
+                \end{itemize}
+            }
+        \end{itemize}
+    \end{itemize}
 \end{frame}

 %%%------------------------------------------------------------------------------------------------------------
@@ -4844,7 +4898,7 @@ x_{l+1} = x_l+\mathcal{F}(x_l)
 \begin{itemize}
 \item \textbf{优化器}：使用Adam优化器，$\beta_1$=0.9，$\beta_2$=0.98，$\epsilon=10^{-9}$。关于学习率的设置，引入了warmup策略，在训练初期，学习率从一个较小的初始值逐渐增大，当到达一定的步数，学习率再逐渐减小
    \begin{displaymath}
-    lrate=d_{model}^{-0.5}\cdot min(step^{-0.5},step\cdot warmup\_steps^{-1.5})
+    lrate=d_{\mathrm{model}}^{-0.5}\cdot \min(step^{-0.5},step\cdot \mathrm{warmup\_steps}^{-1.5})
    \end{displaymath}
    这样做可以减缓在训练初期的不稳定现象，保持分布平稳，warmup\_steps通常设置为4000

@@ -5029,7 +5083,7 @@ x_{l+1} = x_l+\mathcal{F}(x_l)
 \item 需要考虑的问题：
    \begin{itemize}
    \item 古文短，现代文长，过翻译或者欠翻译对性能影响很大，如何对长度进行更精确的建模
-    \item 不同时期、问题的文字差异性很大，如何进行自动适应和风格迁移
+    \item 不同时代语言差异性大，如何进行自动适应和风格迁移
    \end{itemize}

 \end{itemize}
@@ -5044,7 +5098,7 @@ x_{l+1} = x_l+\mathcal{F}(x_l)
 \begin{frame}{NMT应用}
 \begin{itemize}
 \item 古文翻译实例
-\vspace{0.5em}
+\vspace{0.0em}

 \begin{tcolorbox}[size=normal,left=2mm,right=1mm,colback=red!5!white,colframe=red!75!black]
 {
@@ -5087,7 +5141,7 @@ x_{l+1} = x_l+\mathcal{F}(x_l)
    \begin{itemize}
    \item 除了古文翻译，对联也可以用机器翻译系统生成，只需将输入输出变为对联的上联和下联
    \end{itemize}
-\vspace{-0.5em}
+\vspace{-0.8em}

 \begin{center}
 \begin{tikzpicture}
@@ -5152,16 +5206,16 @@ x_{l+1} = x_l+\mathcal{F}(x_l)
 \node [lnode,anchor=west] (l1) at (0,0) {上联：翠竹千支歌盛世};
 \node [rnode,anchor=west] (l2) at ([xshift=1em]l1.east) {下联：红梅万点报新春};

-\node [lnode,anchor=north] (l3) at ([yshift=-1em]l1.south) {上联：一帆风顺年年好};
+\node [lnode,anchor=north] (l3) at ([yshift=-0.8em]l1.south) {上联：一帆风顺年年好};
 \node [rnode,anchor=west] (l4) at ([xshift=1em]l3.east) {下联：万事如意步步高};

-\node [lnode,anchor=north] (l5) at ([yshift=-1em]l3.south) {上联：佳节迎春春生笑脸};
+\node [lnode,anchor=north] (l5) at ([yshift=-0.8em]l3.south) {上联：佳节迎春春生笑脸};
 \node [rnode,anchor=west] (l6) at ([xshift=1em]l5.east) {下联：新年纳福富华满堂};

-\node [lnode,anchor=north] (l7) at ([yshift=-1em]l5.south) {上联：腊梅吐芳迎红日};
+\node [lnode,anchor=north] (l7) at ([yshift=-0.8em]l5.south) {上联：腊梅吐芳迎红日};
 \node [rnode,anchor=west] (l8) at ([xshift=1em]l7.east) {下联：绿柳展枝舞春风};

-\node [lnode,anchor=north] (l9) at ([yshift=-1em]l7.south) {上联：雪兆丰年丛岭翠};
+\node [lnode,anchor=north] (l9) at ([yshift=-0.8em]l7.south) {上联：雪兆丰年丛岭翠};
 \node [rnode,anchor=west] (l10) at ([xshift=1em]l9.east) {下联：春回大地满园红};

 \end{scope}
@@ -5183,7 +5237,7 @@ x_{l+1} = x_l+\mathcal{F}(x_l)
    \begin{itemize}
    \item 还可以用机器翻译系统来写诗。如藏头诗，给定诗句的第一个字，生成一首完整的诗。还可以根据意境生成诗句
    \end{itemize}
-\vspace{0.3em}
+\vspace{0.0em}

 \begin{center}
 \begin{tikzpicture}
@@ -5220,22 +5274,77 @@ x_{l+1} = x_l+\mathcal{F}(x_l)
 \end{frame}

 %%%------------------------------------------------------------------------------------------------------------
-\section{Transformer}
+%%% 做个小结
+\begin{frame}{我们赶上了好时代 ...}
+\begin{itemize}
+\item 神经机器翻译的火爆这几年有目共睹，好事情！！！
+    \begin{itemize}
+    \item 在\url{https://arxiv.org}上搜索neural machine translation
+    \item ACL、EMNLP等顶会神经机器翻译论文数量近些年几乎呈线性增长
+    \item 神经机器翻译系统在各大比赛中霸榜，开源机器翻译满天飞，大厂秀肌肉，小作坊刷存在感
+    \end{itemize}
+    \vspace{0.3em}
+\item<2-> 这里只介绍了最基本的概念，NMT的内容远不止这些
+    \begin{itemize}
+    \item 各种专题：解码、压缩、先验知识、低资源翻译、无指导方法、篇章级翻译等等等等
+    \item 推荐一个survey，有些基础的可以参考一下，很全面 \\
+    ``Neural Machine Translation: A Review'' by Felix Stahlberg\\
+    \url{https://arxiv.org/abs/1912.02047}
+    \item 如何搭建一个优秀的NMT系统？- 有许多技巧 \\
+             下一章介绍
+    \item 回忆一下第一章介绍的NMT开源系统，可以试试
+    \end{itemize}
+\end{itemize}
+\end{frame}

 %%%------------------------------------------------------------------------------------------------------------
-\subsection{多头自注意力模型}
+%%% open source NMT
+\begin{frame}{一些开源NMT系统}
+\end{frame}

 %%%------------------------------------------------------------------------------------------------------------
-\subsection{训练}
+%%% last slide
+\begin{frame}{结束~}

-%%%------------------------------------------------------------------------------------------------------------
-\subsection{推断}
+\vspace{2em}

-%%%------------------------------------------------------------------------------------------------------------
-\subsection{深层网络}
+\begin{center}
+
+\begin{tikzpicture}
+
+\begin{scope}
+\small{
+\node [anchor=south west,minimum width=15em] (source) at (0,0) {\Large{\textbf{source}: 谢谢 大家 ！}};
+\node [anchor=south west,minimum width=15em] (target) at ([yshift=12em]source.north west) {\Large{\textbf{target}: Thank You !}};
+\node [anchor=center,minimum width=9.6em,minimum height=1.8em,draw,rounded corners=0.3em] (hidden) at ([yshift=6em]source.north) {};
+\node [anchor=west,minimum width=1.5em,minimum size=1.5em,fill=ugreen!20] (cell01) at ([xshift=0.2em]hidden.west) {\footnotesize{.2}};
+\node [anchor=west,minimum width=1.5em,minimum size=1.5em,fill=ugreen!10] (cell02) at (cell01.east) {\footnotesize{-1}};
+\node [anchor=west,minimum width=1.5em,minimum size=1.5em,fill=ugreen!70] (cell03) at (cell02.east) {\footnotesize{6}};
+\node [anchor=west,minimum width=1.5em,minimum size=1.5em,fill=ugreen!50] (cell04) at (cell03.east) {\footnotesize{5}};
+\node [anchor=west,minimum width=1.5em,minimum size=1.5em,fill=ugreen!30] (cell05) at (cell04.east) {\footnotesize{.7}};
+\node [anchor=west,minimum width=1.5em,minimum size=1.5em,fill=ugreen!10] (cell06) at (cell05.east) {\footnotesize{-2}};
+
+\filldraw [fill=red!20,draw=white] (source.north west) -- (source.north east) -- ([xshift=-0.2em,yshift=-0.1em]hidden.south east) -- ([xshift=0.2em,yshift=-0.1em]hidden.south west);
+\filldraw [fill=blue!20,draw=white] (target.south west) -- (target.south east) -- ([xshift=-0.2em,yshift=0.1em]hidden.north east) -- ([xshift=0.2em,yshift=0.1em]hidden.north west);
+
+\draw [->,thick] (source.north west) -- ([xshift=0.2em,yshift=-0.1em]hidden.south west);
+\draw [->,thick] (source.north east) -- ([xshift=-0.2em,yshift=-0.1em]hidden.south east);
+\draw [->,thick] ([xshift=0.2em,yshift=0.1em]hidden.north west) -- (target.south west);
+\draw [->,thick] ([xshift=-0.2em,yshift=0.1em]hidden.north east) -- (target.south east);
+}
+
+\node [anchor=south] (enclabel) at ([yshift=2em]source.north) {\large{Encoder}};
+\node [anchor=north] (declabel) at ([yshift=-2em]target.south) {\large{Decoder}};
+\end{scope}
+
+\end{tikzpicture}
+
+\vspace{2em}
+
+\end{center}
+
+\end{frame}

-%%%------------------------------------------------------------------------------------------------------------
-\section{其它应用}

 \end{CJK}
 \end{document}