create section 7 tex files

8f648718 · xiaotong · df79f520 · 8f648718 · 8f648718
Commit 8f648718 authored Mar 04, 2020 by xiaotong
--- a/.gitignore
+++ b/.gitignore
@@ -23,3 +23,4 @@ Section04-Phrasal-and-Syntactic-Models/section04.pdf
 Section04-Phrasal-and-Syntactic-Models/section04-test.pdf
 Book/mt-book.pdf
 Section07-Making-A-Strong-NMT-System/section07.pdf
+Section07-Towards-Strong-NMT-Systems/section07.pdf
--- a/Section07-Making-A-Strong-NMT-System/section07.tex
+++ b/Section07-Making-A-Strong-NMT-System/section07.tex
@@ -134,41 +134,176 @@
 \section{神经机器翻译并不简单}

 %%%------------------------------------------------------------------------------------------------------------
-%%% 影响神经机器翻译性能的因素
-\begin{frame}{神经机器翻译并不简单}
-\end{frame}
+%%% 如何入门？
+\begin{frame}{小白入门（神经）机器翻译}
+\begin{itemize}
+\item 一个小白入门神经机器翻译的经历

-%%%------------------------------------------------------------------------------------------------------------
-%%% 影响神经机器翻译性能的因素
-\begin{frame}{数据、模型}
+\begin{center}
+\begin{tikzpicture}
+
+\draw [very thick,->] (-2em,0) -- (21em,0);
+
+\visible<2->{
+\node [anchor=south] (xiaobai) at (-2em,0.3em) {\footnotesize{\textbf{小白}}};
+\node [anchor=south] (xiaobaibai) at (21em,0.3em) {\footnotesize{\textbf{小白白}}};
+}
+
+\node [circle,fill=ublue,minimum size=0.4em,inner sep=0] (point1) at (1em,0) {};
+\node [anchor=north,text width=6em,align=left] (box1) at ([yshift=-1em]point1.south) {\scriptsize{我喜欢NLP和MT，尝试下}};
+
+\node [circle,fill=ublue,minimum size=0.4em,inner sep=0] (point2) at (6em,0) {};
+\node [anchor=south,text width=6em,align=left] (box2) at ([yshift=1em]point2.north) {\scriptsize{上网看视频，找博客，阅读一些书籍}};
+
+\node [circle,fill=ublue,minimum size=0.4em,inner sep=0] (point3) at (11em,0) {};
+\node [anchor=north,text width=6em,align=left] (box3) at ([yshift=-1em]point3.south) {\scriptsize{学各种深度学习开源框架，阅读开源NMT系统代码}};
+
+\node [circle,fill=ublue,minimum size=0.4em,inner sep=0] (point4) at (16em,0) {};
+\node [anchor=south,text width=8em,align=left] (box4) at ([yshift=1em]point4.north) {\scriptsize{找一些数据，跑一下系统，有结果！耶！}};
+
+\end{tikzpicture}
+\end{center}
+
+\item<3-> 小白白还想成为大白，甚至大白白、大大白，还是使用同样的套路。\visible<4->{\alert{但是}，只是变得更白了，离大白还很远！}
+
+\begin{center}
+\begin{tikzpicture}
+
+\draw [very thick,->] (-2em,0) -- (21em,0);
+
+\visible<4->{
+\node [anchor=south] (xiaobai) at (-2em,0.3em) {\footnotesize{\textbf{小白白}}};
+\node [anchor=south] (xiaobaibai) at (21em,0.3em) {\footnotesize{\textbf{小白白白}}};
+}
+
+\node [anchor=north,align=center] (box1) at (9em,-0.5em) {\footnotesize{(博客、书、实验)$\times 10$}};
+
+\end{tikzpicture}
+\end{center}
+
+\end{itemize}
 \end{frame}

 %%%------------------------------------------------------------------------------------------------------------
-%%% 搭建神经机器翻译系统的流程
-\begin{frame}{NMT搭建流程}
+%%% 为什么成为机器翻译高手还很难
+\begin{frame}{为什么小白没成为高手？}
+
+\begin{itemize}
+
+\item 实力表
+
+\begin{center}
+\begin{tikzpicture}
+
+\draw [very thick,->] (0em,0) -- (20em,0);
+
+\node [anchor=south,minimum width=2em,minimum height=0.6em,fill=red!20] (player1) at (2em,0.05em) {};
+\node [anchor=north] (label1) at (player1.south) {\scriptsize{小白}};
+
+\node [anchor=south,minimum width=2em,minimum height=0.9em,fill=red!20] (player2) at (5em,0.05em) {};
+\node [anchor=north] (label2) at (player2.south) {\scriptsize{小白白}};
+
+\node [anchor=south,minimum width=2em,minimum height=1.0em,fill=red!20] (player3) at (8em,0.05em) {};
+\node [anchor=north] (label3) at (player3.south) {\scriptsize{小白白白}};
+
+\visible<2->{
+\node [anchor=south,minimum width=2em,minimum height=3em,fill=blue!20] (player4) at (12em,0.05em) {};
+\node [anchor=north] (label4) at (player4.south) {\scriptsize{大白}};
+}
+
+\visible<3->{
+\node [anchor=south,minimum width=2em,minimum height=6em,fill=green!20] (player5) at (16em,0.05em) {};
+\node [anchor=north] (label5) at (player5.south) {\scriptsize{超级赛亚人}};
+}
+
+\end{tikzpicture}
+\end{center}
+
+\item<4-> 搞了半天，还是个入门选手，当遇到很多问题时还是无从下手
+    \begin{itemize}
+    \item 论文写不出来，没有idea
+    \item 实验一弄就不好使，性能就是不涨
+    \item 自己搭的系统还很挫，离谷歌翻译还很遥远
+    \end{itemize}
+\item<5-> 原因很简单：我们使用的系统还很初级，离state-of-the-art有距离，导致视野也会受限。\\
+- 怎么办？\alert{参加一次比赛吧}
+
+\end{itemize}
+
 \end{frame}

 %%%------------------------------------------------------------------------------------------------------------
-%%% 架构选择
-\begin{frame}{RNN、Transformer}
+%%% 影响机器翻译系统性能的因素
+\begin{frame}{Towards State-of-the-Art}
+
+\begin{itemize}
+\item 注意，机器翻译终究是面向应用的研究领域。因此，研发高性能的系统是我们追求的目标。一般来说，影响机器翻译系统性能的因素有以下几个
+    \begin{enumerate}
+    \item \textbf{数据}：大规模、高质量的数据是前提
+    \item \textbf{技术}：算法和模型要足够先进
+    \item \textbf{打磨}：需要对各个模块进行细致的打磨，说白了，功夫下不到，有数据和技术也是白搭
+    \end{enumerate}
+\vspace{0.5em}
+\item<2-> 参加一次机器翻译的评测比赛是实现上述目标的一个很好的方法，比如，WMT News Translation Track
+    \begin{enumerate}
+    \item 评测提供数据，规模和质量都能保证，因此可以在相对公平的数据基础上进行技术对比
+    \item 评测系统都会有技术报告，因此可以相对容易地复现以前的结果，而且很容易了解最新的动态
+    \item 评测本身就是驱动力（大家都希望取得好成绩），因此会不断打磨细节
+    \end{enumerate}
+\end{itemize}
+
 \end{frame}

 %%%------------------------------------------------------------------------------------------------------------
-%%% 本章的核心问题
-\begin{frame}{核心问题}
+%%% 参加比赛的流程
+\begin{frame}{基本流程}
+
+\begin{itemize}
+\item 机器翻译比赛现在已经非常友好了（如WMT、CCMT等），有些比赛甚至都不需要提前报名，注册个账户就可以刷榜
+    \begin{enumerate}
+    \item 从官方网站了解比赛计划
+    \item 发放训练数据
+    \item 数据预处理及加工
+    \item 选型，搭建第一个初级版本
+    \item 和自己较劲（如果能够对比以前的参赛系统就更好了），改进系统，尝试各种技术
+    \item 发放测试数据，运行系统提交最终结果
+    \item 公布结果
+    \item 撰写评测报告
+    \item 参加评测研讨会，学习交流
+    \end{enumerate}
+    
+\item<2-> 当然，这里并不是要描述整个机器翻译的流程。我们重点还是关注有哪些技术可以使神经机器翻译变得更强，为相关研究提供\alert{合理的基线系统}
+\end{itemize}

-\vspace{6em}
+\end{frame}

- \begin{tcolorbox}[enhanced,size=normal,left=2mm,right=1mm,colback=blue!5!white,colframe=blue!75!black,drop fuzzy shadow]
-{\LARGE
-\textbf{如何搭建一个可以进行实战的}\\
-\\
-\textbf{神经机器翻译系统？}
-}
-\end{tcolorbox}
+%%%------------------------------------------------------------------------------------------------------------
+%%% 选型
+\begin{frame}{必要的准备 - 选型}
+
+\begin{itemize}
+\item 这里我们以WMT、CCMT的汉英新闻翻译任务为例，介绍如何搭建一套性能更加强劲的神经机器翻译系统
+    \begin{itemize}
+    \item 翻译品质为评价指标（如BLEU），翻译速度等暂不考虑
+    \item 假设设备是充分的
+    \item 假设开发系统和模型训练时间也是充分的
+    \end{itemize}
+\item<2-> 面临的一个问题是选择什么样的系统架构。当然，这个问题可以边做边调整，不过这里我们选择Transformer作为基本框架（见第六章）。因为，
+    \begin{itemize}
+    \item Transformer已经被证明是当今性能最好的NMT模型之一
+    \item 在WMT2019和CCMT2019的评测中，Transformer已经成为了各个参赛队伍的标配
+    \item Transformer是很多benchmark上的优胜者
+    \end{itemize}
+\item<3-> 当然，后面讨论的内容绝大多数与神经机器翻译架构无关，这些可以被推广到其它类型的系统上
+    \begin{itemize}
+    \item \textbf{基础技术}：确保可以得到一个不太差的系统
+    \item \textbf{进阶技术}：接近甚至达到State-of-the-art
+    \end{itemize}
+\end{itemize}

 \end{frame}

+
 %%%------------------------------------------------------------------------------------------------------------
 %%% 本章的内容
 \begin{frame}{Outline}