Update slides

de3158ef · 姜雨帆 · 9c16b856 · de3158ef
Commit de3158ef authored Feb 16, 2020 by 姜雨帆
--- a/Section04-Phrasal-and-Syntactic-Models/section04.tex
+++ b/Section04-Phrasal-and-Syntactic-Models/section04.tex
@@ -936,7 +936,7 @@
 \end{tikzpicture}
 \end{center}
-\item<2-> 显然上图中的短语并不是语言学上的短语。这里有：\\
+\item<2-> 显然上图中的短语并\alert{不是}语言学上的短语。这里有：\\
 \vspace{0.3em}
 \begin{beamerboxesrounded}[upper=uppercolblue,lower=lowercolblue,shadow=true]{定义 - 短语}
@@ -1068,6 +1068,32 @@ $d$是一个$(\textbf{s},\textbf{t})$上基于短语的翻译推导，$\textrm{P
 \end{frame}
 %%%------------------------------------------------------------------------------------------------------------
+%%% 数学模型
+\begin{frame}{数学模型（续）}
+\begin{itemize}
+\item 但是，上式提到的翻译推导的样本空间是巨大的，很难枚举所有推导并进行求和。通常使用采样的方法选取搜索空间的一部分样本代表整个搜索空间
+\vspace{0.3em}
+\begin{center}
+\begin{tikzpicture}
+\node [anchor=west] (s1) at (0,0) {$\textrm{P}(\textbf{t}|\textbf{s}) = $};
+\node [anchor=west,inner sep=3pt,fill=red!20] (s2) at ([xshift=0.1em]s1.east) {$\sum_{d} \textrm{P}(d,\textbf{t}|\textbf{s})$};
+\node [anchor=west,inner sep=3pt,fill=green!20,minimum width=7.5em] (s3) at ([xshift=2.5em,yshift=1.5em]s2.east) {$\textrm{Max}\  \textrm{P}(d,\textbf{t}|\textbf{s})$};
+\node [anchor=west,inner sep=3pt,fill=green!20,minimum width=7.5em] (s4) at ([xshift=2.5em,yshift=-1.5em]s2.east) {$\sum_{d_{nbest}} \textrm{P}(d,\textbf{t}|\textbf{s})$};
+\draw[->,thick] ([xshift=-0.1em]s3.west) -- ([xshift=0.1em,yshift=0.3em]s2.east);
+\draw[->,thick] ([xshift=-0.1em]s4.west) -- ([xshift=0.1em,yshift=-0.3em]s2.east);
+\end{tikzpicture}
+\end{center}
+\vspace{0.3em}
+如1-best（Viterbi）或者n-best的和来近似所有的和
+\item<2-> 若采用Viterbi的方法，机器翻译也可看作对于输入的源语言句子$\textbf{s}$，找到最佳翻译推导$\hat{d}$
+\begin{displaymath}
+\hat{d} = \argmax_{d} \textrm{P}(d,\textbf{t}|\textbf{s})
+\end{displaymath}
+在后面的内容中出现的 $\hat{d}$ 和 $\hat{t}$ 都可以看作是等价的
+\end{itemize}
+\end{frame}
+%%%------------------------------------------------------------------------------------------------------------
 %%% 翻译推导的建模
 \begin{frame}{对翻译推导进行建模}
 \vspace{-0.4em}
@@ -2062,7 +2088,7 @@ $X$ & $\to$ & $\bar{s},$ & $\bar{t}$ & (R3)\\
 \item 参考前一个短语，来判断当前短语是否需要进行调序，调序距离设为$\textrm{start}_i-\textrm{end}_{i-1}-1$
 \begin{itemize}
-\item $\textrm{start}_i$是指翻译成第$i$个目标语短语的源语短语中的第一个词，而$\textrm{end}_i$为源于短语最后一个词
+\item $\textrm{start}_i$是指翻译成第$i$个目标语短语的源语短语中的第一个词，而$\textrm{end}_i$为源语短语最后一个词（$\textrm{end}_0$为0）
 \end{itemize}
 \vspace{0.0em}
 \visible<2->{
@@ -2087,18 +2113,21 @@ $X$ & $\to$ & $\bar{s},$ & $\bar{t}$ & (R3)\\
 \path[<->, thick] (s1.south) edge (t2.north);
 \path[<->, thick] (s2.south) edge (t1.north);
-\node[anchor=west] (target) at ([xshift=3em,yshift=-1em]n5.east) {\scriptsize{目标短语}};
+\node[anchor=west] (target) at ([xshift=3em,yshift=2.3em]n5.east) {\scriptsize{目标短语}};
 \node[anchor=west] (source) at ([xshift=0.7em]target.east) {\scriptsize{源短语}};
 \node[anchor=west] (distance) at ([xshift=0.7em]source.east) {\scriptsize{距离}};
 \node[anchor=north] (t1) at ([yshift=-0.1em]target.south) {1};
-\node[anchor=north] (t2) at ([yshift=0.1em]t1.south) {2};
+\node[anchor=north] (t2) at ([yshift=-1.8em]t1.south) {2};
 \node[anchor=north] (so1) at ([yshift=-0.1em]source.south) {5};
-\node[anchor=north] (so2) at ([yshift=0.1em]so1.south) {1-4};
+\node[anchor=north] (so2) at ([yshift=-1.8em]so1.south) {1-4};
 \node[anchor=north] (d1) at ([yshift=-0.1em]distance.south) {+4};
-\node[anchor=north] (d2) at ([yshift=0.1em]d1.south) {-5};
+\node[anchor=north] (d2) at ([yshift=-1.8em]d1.south) {-5};
+\node[anchor=north west,fill=red!20] (m1) at ([xshift=-1em,yshift=-0.0em]t1.south west) {\scriptsize{$\textrm{start}_1-\textrm{end}_{0}-1 = 5 - 0 - 1$}};
+\node[anchor=north west,fill=red!20] (m2) at ([xshift=-1em,yshift=-0.0em]t2.south west) {\scriptsize{$\textrm{start}_2-\textrm{end}_{1}-1 = 1 - 5 - 1$}};
 \draw[-] (target.south west)--([xshift=1.6in]target.south west);
@@ -2108,21 +2137,22 @@ $X$ & $\to$ & $\bar{s},$ & $\bar{t}$ & (R3)\\
 \draw[-,thick] (s2.north west)--([yshift=0.3in]s2.north west);
 \draw[->,densely dotted,thick] ([yshift=0.3in]s2.north west)--([xshift=-0.3in,yshift=0.3in]s2.north west);
-\node[anchor=south] (ld1) at ([xshift=-0.1em,yshift=0.4em]n1.north) {\scriptsize{d=-5}};
+\node[anchor=south] (ld1) at ([xshift=-0.1em,yshift=0.4em]n1.north) {\scriptsize{x=-5}};
-\node[anchor=south] (ld2) at ([xshift=6em,yshift=0.4em]n1.north) {\scriptsize{d=+4}};
+\node[anchor=south] (ld2) at ([xshift=6em,yshift=0.4em]n1.north) {\scriptsize{x=+4}};
 \end{scope}
 \end{tikzpicture}
 \end{center}
 }
 \vspace{0.3em}
-\item<3-> 代价函数选择指数衰减函数$d(x)=\alpha^{|x|}$，其中$\alpha$通过近似估计得到，$\alpha \in$[0,1]
+\item<3-> 代价函数选择指数衰减函数$c(x)=\alpha^{|x|}$，其中$\alpha$通过近似估计得到，$\alpha \in [0,1]$
 \begin{itemize}
 \item<3-> 调序距离越大，调序代价越大
 \end{itemize}
 \end{itemize}
 \end{frame}
 %%%------------------------------------------------------------------------------------------------------------
 %%% 调序模型2：MSD模型
 \begin{frame}{调序模型2：MSD模型}
@@ -2252,7 +2282,7 @@ $X$ & $\to$ & $\bar{s},$ & $\bar{t}$ & (R3)\\
 \item 来详细的分析一下
 \vspace{-1em}
 \begin{displaymath}
-\textrm{P}(\textbf{o}|\textbf{s},\textbf{t},\textbf{a}) = \prod_{i=1}^{K} \textrm{P}(o_i| \bar{s}_{a_i}, \bar{t}_i, a_{i-1}, a_i)
+\Pr(\textbf{o}|\textbf{s},\textbf{t},\textbf{a}) = \prod_{i=1}^{K} \Pr(o_i| \bar{s}_{a_i}, \bar{t}_i, a_{i-1}, a_i)
 \end{displaymath}
 \vspace{-1em}
    \begin{itemize}
@@ -2274,11 +2304,11 @@ o_i = \left\{ \begin{array}{ll}
 	\end{itemize}
 \vspace{0.1em}
 \begin{displaymath}
-f_{M-pre}(d) = \prod_{i=1}^{K} \textrm{P}(o_i = M| \bar{s}_{a_i}, \bar{t}_i, a_{i-1}, a_i)
+f_{\textrm{M-pre}}(d) = \prod_{i=1}^{K} \Pr(o_i = M| \bar{s}_{a_i}, \bar{t}_i, a_{i-1}, a_i)
 \end{displaymath}
 \vspace{-0.8em}
 	\begin{itemize}
-	\item 我们还可以得到$f_{S-pre}(d)$和$f_{D-pre}(d)$，此外将$a_{i-1}$换成$a_{i+1}$，还可以得到每个短语与后面短语的调序类型
+	\item 我们还可以得到$f_{\textrm{S-pre}}(d)$和$f_{\textrm{D-pre}}(d)$，此外将$a_{i-1}$换成$a_{i+1}$，还可以得到每个短语与后面短语的调序类型
 	\end{itemize}
 \end{itemize}
@@ -2472,13 +2502,9 @@ f_{M-pre}(d) = \prod_{i=1}^{K} \textrm{P}(o_i = M| \bar{s}_{a_i}, \bar{t}_i, a_{
 	\begin{itemize}
 	\item 但是，对于非连续的调序该模型无法处理
 	\end{itemize}
-%	\begin{itemize}
-%	\item 我们还可以得到$f_{S-pre}(d)$和$f_{D-pre}(d)$，此外将$a_{i-1}$换成$a_{i+1}$，还可以得到每个短语与后面短语的调序类型
-%	\end{itemize}
 \item 对于每一种翻译推导$d$，基于最大熵的调序模型的得分计算公式如下
 \begin{displaymath}
-f_{ME}(d) = \prod_{<o,X_1,X_2> \in d} \textrm{P}(o|X_1, X_2)
+f_{\textrm{ME}}(d) = \prod_{<o,X_1,X_2> \in d} \Pr(o|X_1, X_2)
 \end{displaymath}
 \end{itemize}
 \end{frame}
@@ -2501,7 +2527,7 @@ f_{ME}(d) = \prod_{<o,X_1,X_2> \in d} \textrm{P}(o|X_1, X_2)
 \item 把每个子模型当作一个特征，为每个模型添加一个权重，然后使用对数线性模型对这些子模型进行建模，对数线性模型的形式如下：
 \vspace{-0.8em}
 \begin{displaymath}
-\textrm{P}(d,\textbf{t}|\textbf{s}) = \exp(\sum_{i=1}^{M} \lambda_i \cdot h_i(d,\textbf{s},\textbf{t}))
+\textrm{P}(d,\textbf{t}|\textbf{s}) \propto \exp(\sum_{i=1}^{M} \lambda_i \cdot h_i(d,\textbf{s},\textbf{t}))
 \end{displaymath}
 \vspace{-1.2em}
 	\begin{itemize}
@@ -2509,7 +2535,7 @@ f_{ME}(d) = \prod_{<o,X_1,X_2> \in d} \textrm{P}(o|X_1, X_2)
 	\end{itemize}
 \vspace{0.8em}
 \begin{displaymath}
-\textrm{P}(d,\textbf{t}|\textbf{s}) = \prod_{(\bar{s},\bar{t}) \in d} \textrm{P}(\bar{t}|\bar{s})^{\lambda_{1}} \times f(d)^{\lambda_{2}} \times \Pr\nolimits_{lm}(\mathbf{t})^{\lambda_{lm}}
+\textrm{P}(d,\textbf{t}|\textbf{s}) = \prod_{(\bar{s},\bar{t}) \in d} \Pr(\bar{t}|\bar{s})^{\lambda_{1}} \times f(d)^{\lambda_{2}} \times \Pr\nolimits_{\textrm{lm}}(\mathbf{t})^{\lambda_{lm}}
 \end{displaymath}
 \item 可以引入更多的特征来提高翻译质量（下面介绍）
 \end{itemize}
@@ -2520,8 +2546,8 @@ f_{ME}(d) = \prod_{<o,X_1,X_2> \in d} \textrm{P}(o|X_1, X_2)
 \begin{frame}{特征}
 % 给出特征列表
 \begin{itemize}
-\item \textbf{特征1-2： 短语翻译概率}，即正向翻译概率$\textrm{P}(\bar{s}|\bar{t})$和反向翻译概率$\textrm{P}(\bar{t}|\bar{s})$。是基于短语的统计机器翻译模型中最主要的特征。
+\item \textbf{特征1-2： 短语翻译概率}，即正向翻译概率$\Pr(\bar{t}|\bar{s})$和反向翻译概率$\Pr(\bar{s}|\bar{t})$。是基于短语的统计机器翻译模型中最主要的特征。
-\item \textbf{特征3-4： 词汇翻译概率}，即正向词汇翻译概率$\Pr_{lex}(\bar{t}|\bar{s})$和反向词汇翻译概率$\Pr_{lex}(\bar{s}|\bar{t})$。用来描述短语对中源语端单词和目标语端单词的对应关系
+\item \textbf{特征3-4： 词汇翻译概率}，即正向词汇翻译概率$\Pr_{\textrm{lex}}(\bar{t}|\bar{s})$和反向词汇翻译概率$\Pr_{\textrm{lex}}(\bar{s}|\bar{t})$。用来描述短语对中源语端单词和目标语端单词的对应关系
 \item<2-> \textbf{特征5： $n$-gram语言模型}，即$\textrm{P}_{\textrm{lm}}(\textbf{t})$。度量译文的流畅度，可以使用大规模目标语单语数据得到。
 \item<2-> \textbf{特征6：译文长度}，即$|\textbf{t}|$。避免模型倾向于短译文，同时让系统自动学习对译文长度的偏好。
 \item<2-> \textbf{特征7：翻译规则数量}。这个特征是为了避免模型仅仅使用少量特征构成翻译推导(因为翻译概率相乘，因子少结果一般会大一些)，同时让系统自动学习对使用规则数量的偏好。
@@ -2535,23 +2561,19 @@ f_{ME}(d) = \prod_{<o,X_1,X_2> \in d} \textrm{P}(o|X_1, X_2)
 \begin{itemize}
 \item \textbf{特征8：源语言被翻译为空的单词数量}。注意，空翻译规则(或特征)有时也被称作evil feature，这类特征在一些数据集上对BLEU有很好的提升作用，但是会造成人工评价的下降，因此需要谨慎使用。
 \item \textbf{特征9：基于最大熵的调序模型}，$f_{\textrm{ME}}(d)$。
-\item \textbf{特征10：基于MSD的调序模型}，包括与前一个短语的调序$f_{M-pre}(d)$、$f_{S-pre}(d)$、$f_{D-pre}(d)$, 和后一个短语的调序$f_{M-fol}(d)$、$f_{S-fol}(d)$、$f_{D-fol}(d)$
+\item \textbf{特征10：基于MSD的调序模型}，包括与前一个短语的调序$f_{\textrm{M-pre}}(d)$、$f_{\textrm{S-pre}}(d)$、$f_{\textrm{D-pre}}(d)$, 和后一个短语的调序$f_{\textrm{M-fol}}(d)$、$f_{\textrm{S-fol}}(d)$、$f_{\textrm{D-fol}}(d)$
 \item \textbf{最终模型得分}
 \vspace{0.3em}
 \begin{center}
 \begin{tikzpicture}
 \begin{scope}[minimum height = 15pt]
-\node[anchor=west,minimum width=3em] (x1) at (0, 0) {\footnotesize{$\textrm{P}(d,\textbf{t}|\textbf{s}) = \prod_{(\bar{s},\bar{t}) \in d} score(\bar{s},\bar{t}) \times f_{ME}(d)^{\lambda_{ME}} \times f_{MSD}(d)^{\lambda_{MSD}} \times$}};
+\node[anchor=west,minimum width=3em] (x1) at (0, 0) {\footnotesize{$\textrm{P}(d,\textbf{t}|\textbf{s}) = \prod_{(\bar{s},\bar{t}) \in d} score(\bar{s},\bar{t}) \times f_{\textrm{ME}}(d)^{\lambda_{ME}} \times f_{\textrm{MSD}}(d)^{\lambda_{MSD}} \times$}};
-\node[anchor=north west] (x2) at ([xshift=4em,yshift=0.1em]x1.south west) {\footnotesize{$\Pr\nolimits_{lm}(\mathbf{t})^{\lambda_{lm}} \times \exp(\lambda_{TWB} \cdot length(\mathbf{t})) / Z(\mathbf{s})$}};
+\node[anchor=north west] (x2) at ([xshift=4em,yshift=0.1em]x1.south west) {\footnotesize{$\Pr\nolimits_{\textrm{lm}}(\mathbf{t})^{\lambda_{lm}} \times \exp(\lambda_{TWB} \cdot length(\mathbf{t})) / Z(\mathbf{s})$}};
-\node[anchor=north west] (x3) at ([yshift=-1.8em]x1.south west) {\footnotesize{$score(\bar{s},\bar{t}) = \textrm{P}(\bar{t}|\bar{s})^{\lambda_{1}} \times \textrm{P}(\bar{s}|\bar{t})^{\lambda_{2}} \times \Pr\nolimits_{lex}(\bar{t}|\bar{s})^{\lambda_{3}} \times \Pr\nolimits_{lex}(\bar{s}|\bar{t})^{\lambda_{4}} \times$}};
+\node[anchor=north west] (x3) at ([yshift=-1.8em]x1.south west) {\footnotesize{$score(\bar{s},\bar{t}) = \Pr(\bar{t}|\bar{s})^{\lambda_{1}} \times \Pr(\bar{s}|\bar{t})^{\lambda_{2}} \times \Pr\nolimits_{\textrm{lex}}(\bar{t}|\bar{s})^{\lambda_{3}} \times \Pr\nolimits_{\textrm{lex}}(\bar{s}|\bar{t})^{\lambda_{4}} \times$}};
 \node[anchor=north west] (x4) at ([xshift=5em,yshift=0.1em]x3.south west) {\footnotesize{$\exp(\lambda_{PB}) \times \exp(\lambda_{WDB} \cdot \delta(\bar{s} \to null))$}};
 \end{scope}
 \end{tikzpicture}
 \end{center}
-%\begin{displaymath}
-%\textrm{P}(d,\textbf{t}|\textbf{s}) = \prod_{(\bar{s},\bar{t}) \in d} score(\bar{s},\bar{t}) \times f_{ME}(d)^{\lambda_{ME}} \times f_{MSD}(d)^{\lambda_{MSD}} \times \nonumber \\
-%%\Pr\nolimits_{lm}(\mathbf{t})^{\lambda_{lm}} \times \exp(\lambda_{TWB} \cdot length(\mathbf{t})) / Z(\mathbf{s})
-%\end{displaymath}
 \end{itemize}
 \end{frame}
@@ -2563,24 +2585,24 @@ f_{ME}(d) = \prod_{<o,X_1,X_2> \in d} \textrm{P}(o|X_1, X_2)
 \item 对于训练样本$S=\{(f_1,r_1),...,(f_s,r_s)\}$，有
 	\begin{itemize}
 	\item $f_s$为样本中的第$s$个源语句子，$r_s$为相应的译文，通常使用$R=\{r_1,...,r_s\}$来表示训练样本的参考译文
-	\item 针对每个源语句子，解码器可以生成一个n-best结果$\{t_{ij}\}$
+	\item 针对每个源语句子，解码器可以生成一个n-best结果$\{d_{ij}\}$
 	\end{itemize}
-\item 对于模型参数$\lambda$，最佳的翻译结果为$T^*=\{t_{1}^{*},..,t_{i}^{*}\}$
+\item 对于模型参数$\lambda$，最佳的翻译推导为$D^*=\{d_{1}^{*},...,d_{s}^{*}\}$
 \vspace{-0.5em}
 \begin{displaymath}
-t_{i}^{*} = \argmin_{t_{ij}} \sum_{k=1}^{M} \lambda_k \cdot h_k(t_{ij})
+d_{i}^{*} = \argmax_{d_{ij}} \sum_{k=1}^{M} \lambda_k \cdot h_k(d_{ij})
 \end{displaymath}
 \vspace{-0.9em}
 \item<2-> 最小错误率训练（MERT）
 \vspace{0.1em}
 	\begin{itemize}
-	\item<2-> 定义一个错误函数Err$(T^*, R)$来衡量译文$T^*$与参考答案$R$之间的差距，通过调整权重$\lambda$来最小化错误率
+	\item<2-> 定义一个错误函数Err$(D^*, R)$来衡量推导$D^*$得到的译文与参考答案$R$之间的差距，通过调整权重$\lambda$来最小化错误率
 	\item<2-> 常见的错误函数有词错误率（WER）、位置无关错误率（PER）、BLEU值以及NIST值
 	\end{itemize}
 \vspace{0.3em}
 \visible<2->{
 \begin{displaymath}
-\mathbf{\lambda}^*  =  \argmin_{\mathbf{\lambda}} \mathbf{Err}(T^*, R)
+\mathbf{\lambda}^*  =  \argmin_{\mathbf{\lambda}} \mathbf{Err}(D^*, R)
 \end{displaymath}
 }
 \end{itemize}
@@ -2594,11 +2616,11 @@ t_{i}^{*} = \argmin_{t_{ij}} \sum_{k=1}^{M} \lambda_k \cdot h_k(t_{ij})
 \item 如何得到最优的$\lambda^*$
 	\begin{itemize}
 	\item 最简单的方法是枚举所有可能的$\lambda$值，但是这样做效率很低。可以只考虑最优译文发生变化的点:）
-	\item 对于每个训练样本，假设有2-best个译文$\mathbf{t}=\{t_1,t_2\}$，每个译文$t$的得分modelscore($t$)可以表示成关于权重$\lambda_i$的函数
+	\item 对于每个训练样本，假设有2-best个推导$\mathbf{d}=\{d_1,d_2\}$，每个推导$d$的得分modelscore($d$)可以表示成关于权重$\lambda_i$的函数
 	\end{itemize}
 \vspace{0.2em}
 \begin{displaymath}
-\textrm{modelscore}(t) = \lambda_i \cdot h_i(t) + \sum_{k{\ne}i}^{M} \lambda_k \cdot h_k(t) = a \cdot \lambda_i + b
+\textrm{modelscore}(d) = \lambda_i \cdot h_i(d) + \sum_{k{\ne}i}^{M} \lambda_k \cdot h_k(d) = a \cdot \lambda_i + b
 \end{displaymath}
 \vspace{-0.7em}
 \begin{center}
@@ -2612,8 +2634,8 @@ t_{i}^{*} = \argmin_{t_{ij}} \sum_{k=1}^{M} \lambda_k \cdot h_k(t_{ij})
 \visible<1-2>{
 \draw[thick] ([yshift=1em]x0.center) -- ([xshift=8em,yshift=5em]x0.center);
 \draw[thick] ([yshift=2em]x0.center) -- ([xshift=8em,yshift=4em]x0.center);
-\node[anchor=north] (e1) at ([xshift=6em,yshift=6em]x0.south) {\footnotesize{$t_1$}};
+\node[anchor=north] (e1) at ([xshift=6em,yshift=6em]x0.south) {\footnotesize{$d_1$}};
-\node[anchor=north] (e2) at ([xshift=7em,yshift=4em]x0.south) {\footnotesize{$t_2$}};
+\node[anchor=north] (e2) at ([xshift=7em,yshift=4em]x0.south) {\footnotesize{$d_2$}};
 \node[anchor=north,rotate=90] (e2) at ([xshift=-1.3em,yshift=3.6em]x0.south) {\footnotesize{model score}};
 }
@@ -2631,8 +2653,8 @@ t_{i}^{*} = \argmin_{t_{ij}} \sum_{k=1}^{M} \lambda_k \cdot h_k(t_{ij})
 \draw[thick,dotted] ([xshift=4em]x0.center) -- ([xshift=4em,yshift=5.5em]x0.center);
-\node[anchor=north] (e1) at ([xshift=2em,yshift=3em]x0.north) {\footnotesize{$t^*=t_1$}};
+\node[anchor=north] (e1) at ([xshift=2em,yshift=3em]x0.north) {\footnotesize{$d^*=d_1$}};
-\node[anchor=north] (e2) at ([xshift=6.2em,yshift=5em]x0.north) {\footnotesize{$t^*=t_2$}};
+\node[anchor=north] (e2) at ([xshift=6.2em,yshift=5em]x0.north) {\footnotesize{$d^*=d_2$}};
 \node[anchor=north,rotate=90] (e2) at ([xshift=-1.3em,yshift=3.6em]x0.south) {\footnotesize{BLEU}};
 \draw[decorate,decoration={brace,amplitude=0.4em},red,thick] ([xshift=3.8em,yshift=0.5em]x0.south) -- ([xshift=8.2em,yshift=0.5em]x0.south);
@@ -2666,6 +2688,7 @@ t_{i}^{*} = \argmin_{t_{ij}} \sum_{k=1}^{M} \lambda_k \cdot h_k(t_{ij})
 \end{itemize}
 \end{frame}
 %%%------------------------------------------------------------------------------------------------------------
 \subsection{栈解码}
@@ -2674,14 +2697,14 @@ t_{i}^{*} = \argmin_{t_{ij}} \sum_{k=1}^{M} \lambda_k \cdot h_k(t_{ij})
 \begin{frame}{解码问题}
 % 定义解码是啥
 \begin{itemize}
-\item 解码是根据模型以及输入原文，找到得分最高的译文 ${d}^*$
+\item 解码是根据模型以及输入原文，找到得分最高的推导 ${d}^*$
 \begin{displaymath}
-\mathbf{d}^*  =  \argmax_{\mathbf{t}} \sum_{d \in D(\mathbf{s}, \mathbf{t})} \textrm{P}(\mathbf{t}, d|\mathbf{s})
+d^*  =  \argmax_{d \in D} \Pr(d, \mathbf{t}|\mathbf{s})
 \end{displaymath}
 \vspace{-0.8em}
 	\begin{itemize}
 	\item 其中 $D$表示所有可能的推导构成的搜索空间。
-	\item $\textrm{P}(\mathbf{t}, d|\mathbf{s})$表示前面提到的所有特征的得分
+	\item $\Pr(d, \mathbf{t}|\mathbf{s})$表示前面提到的所有特征的得分
 	\end{itemize}
 \item 实际解码过程中，通常按从左到右的顺序生成译文，递增的计算翻译概率，同时对已翻译的原文进行标记
 \vspace{1em}
@@ -3728,6 +3751,148 @@ d = r_1 \circ r_2 \circ r_3 \circ r_4
 %%%------------------------------------------------------------------------------------------------------------
 %%%  CYK解码
+\begin{frame}{CYK算法}
+% 看NiuTrans Manual
+\begin{itemize}
+\item 我们来看一个CYK算法的具体例子，给定一个上下文无关文法以及一个单词\alert{aabbc}，来判断该单词是否属于此文法，解析流程如下
+\vspace{-0.5em}
+\begin{center}
+\begin{tikzpicture}
+\tikzstyle{alignmentnode} = [rectangle,fill=blue!30,minimum size=0.45em,text=white,inner sep=0.1pt]
+\tikzstyle{selectnode} = [rectangle,fill=green!20,minimum height=1.5em,minimum width=1.5em,inner sep=1.2pt]
+\tikzstyle{srcnode} = [anchor=south west]
+\begin{scope}[scale=0.85]
+\node[srcnode] (c1) at (0,0) {\small{a}};
+\node[srcnode] (c2) at ([xshift=1em]c1.south east) {\small{a}};
+\node[srcnode] (c3) at ([xshift=1em]c2.south east) {\small{b}};
+\node[srcnode] (c4) at ([xshift=1em]c3.south east) {\small{b}};
+\node[srcnode] (c5) at ([xshift=1em]c4.south east) {\small{c}};
+\node[anchor=south east] (g1) at ([xshift=1em,yshift=3.0em]c1.north west) {\small{$\textrm{S} \to \textrm{AB}\ \ \ \textrm{A} \to \textrm{CD}\  \vert \  \textrm{CF}\ \ \ \textrm{B} \to \textrm{c}\  \vert \  \textrm{BE}$}};
+\node[anchor=north west] (g2) at ([yshift=0.1em]g1.south west) {\small{$\textrm{C} \to \textrm{a}\ \ \ \  \textrm{D} \to \textrm{b}\ \ \ \ \textrm{E} \to \textrm{c}\ \ \ \ \textrm{F} \to \textrm{AD}$}};
+\begin{pgfonlayer}{background}
+\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=green!10,drop shadow,draw=ugreen] [fit = (g1) (g2)] (gl1) {};
+\end{pgfonlayer}
+\node [anchor=center,alignmentnode] (alig11) at ([yshift=-2em]c1.south) {};
+\node [anchor=center,alignmentnode] (alig21) at ([yshift=-2em]alig11.center) {};
+\node [anchor=center,alignmentnode] (alig31) at ([yshift=-2em]alig21.center) {};
+\node [anchor=center,alignmentnode] (alig41) at ([yshift=-2em]alig31.center) {};
+\node [anchor=center,alignmentnode] (alig51) at ([yshift=-2em]alig41.center) {};
+\node [anchor=center,alignmentnode] (alig12) at ([yshift=-2em]c2.south) {};
+\node [anchor=center,alignmentnode] (alig22) at ([yshift=-2em]alig12.center) {};
+\node [anchor=center,alignmentnode] (alig32) at ([yshift=-2em]alig22.center) {};
+\node [anchor=center,alignmentnode] (alig42) at ([yshift=-2em]alig32.center) {};
+\node [anchor=center,alignmentnode] (alig13) at ([yshift=-2em]c3.south) {};
+\node [anchor=center,alignmentnode] (alig23) at ([yshift=-2em]alig13.center) {};
+\node [anchor=center,alignmentnode] (alig33) at ([yshift=-2em]alig23.center) {};
+\node [anchor=center,alignmentnode] (alig14) at ([yshift=-2em]c4.south) {};
+\node [anchor=center,alignmentnode] (alig24) at ([yshift=-2em]alig14.center) {};
+\node [anchor=center,alignmentnode] (alig15) at ([yshift=-2em]c5.south) {};
+\node [anchor=center] (x1) at ([yshift=1.2em]alig11.center) {\tiny{1}};
+\node [anchor=center] (x1) at ([yshift=1.2em]alig12.center) {\tiny{2}};
+\node [anchor=center] (x1) at ([yshift=1.2em]alig13.center) {\tiny{3}};
+\node [anchor=center] (x1) at ([yshift=1.2em]alig14.center) {\tiny{4}};
+\node [anchor=center] (x1) at ([yshift=1.2em]alig15.center) {\tiny{5}};
+\node [anchor=center] (y1) at ([xshift=-1.2em]alig11.center) {\tiny{1}};
+\node [anchor=center] (y1) at ([xshift=-1.2em]alig21.center) {\tiny{2}};
+\node [anchor=center] (y1) at ([xshift=-1.2em]alig31.center) {\tiny{3}};
+\node [anchor=center] (y1) at ([xshift=-1.2em]alig41.center) {\tiny{4}};
+\node [anchor=center] (y1) at ([xshift=-1.2em]alig51.center) {\tiny{5}};
+\node[anchor=west] (l1) at ([xshift=-19.2em,yshift=2em]alig11.west) {\small{1.首先建立一个5*5的上三角矩阵}};
+\node[anchor=west] (l2) at ([xshift=0.8em,yshift=-1.5em]l1.west) {\small{cell[i][j]代表范围内所有语法成分}};
+\visible<2->{
+\node[anchor=west] (l3) at ([yshift=-4em]l1.west) {\small{2.从叶子到根找到所有可能的推导}};
+\node[anchor=west] (l4) at ([xshift=0.8em,yshift=-1.5em]l3.west) {\small{如cell[1][3] = cell[1][1] + cell[2][3]}};
+\node[anchor=west] (l5) at ([xshift=5.45em,yshift=-1.5em]l4.west) {\small{或cell[1][2] + cell[3][3]}};
+}
+\visible<10->{
+\node[anchor=west] (l6) at ([yshift=-5em]l3.west) {\small{3.如果最后一个cell包含了起始符}};
+\node[anchor=west] (l7) at ([xshift=0.8em,yshift=-1.5em]l6.west) {\small{则该单词可以由文法推导得到}};
+}
+\visible<2>{
+\node [anchor=center,selectnode,fill=black!10] (alig11) at (alig11.center) {\footnotesize{C}};
+\node [anchor=center,selectnode,fill=red!30] (c1) at (c1.center) {\small{a}};
+}
+\visible<3->{
+\node [anchor=center,selectnode,fill=blue!30] (alig11) at (alig11.center) {\footnotesize{C}};
+}
+\visible<3>{
+\node [anchor=center,selectnode,fill=black!10] (alig12) at (alig12.center) {\footnotesize{C}};
+\node [anchor=center,selectnode,fill=red!30] (c2) at (c2.center) {\small{a}};
+}
+\visible<4->{
+\node [anchor=center,selectnode,fill=blue!30] (alig12) at (alig12.center) {\footnotesize{C}};
+\node [anchor=center,selectnode,fill=blue!30] (alig13) at (alig13.center) {\footnotesize{D}};
+\node [anchor=center,selectnode,fill=blue!30] (alig14) at (alig14.center) {\footnotesize{D}};
+\node [anchor=center,selectnode,fill=blue!30] (alig15) at (alig15.center) {\footnotesize{B,E}};
+}
+\visible<5>{
+\node [anchor=center,selectnode,fill=red!30] (alig11) at (alig11.center) {\footnotesize{C}};
+\node [anchor=center,selectnode,fill=red!30] (alig12) at (alig12.center) {\footnotesize{C}};
+\node [anchor=center,selectnode,fill=black!10] (alig21) at (alig21.center) {\footnotesize{}};
+}
+\visible<6->{
+\node [anchor=center,selectnode,fill=blue!30] (alig21) at (alig21.center) {\footnotesize{}};
+}
+\visible<6>{
+\node [anchor=center,selectnode,fill=red!30] (alig13) at (alig13.center) {\footnotesize{D}};
+\node [anchor=center,selectnode,fill=red!30] (alig12) at (alig12.center) {\footnotesize{C}};
+\node [anchor=center,selectnode,fill=black!10] (alig22) at (alig22.center) {\footnotesize{A}};
+}
+\visible<7->{
+\node [anchor=center,selectnode,fill=blue!30] (alig22) at (alig22.center) {\footnotesize{A}};
+\node [anchor=center,selectnode,fill=blue!30] (alig23) at (alig23.center) {\footnotesize{}};
+\node [anchor=center,selectnode,fill=blue!30] (alig24) at (alig24.center) {\footnotesize{}};
+}
+\visible<8>{
+\node [anchor=center,selectnode,fill=red!30] (alig11) at (alig11.center) {\footnotesize{C}};
+\node [anchor=center,selectnode,fill=red!30] (alig22) at (alig22.center) {\footnotesize{A}};
+\node [anchor=center,selectnode,fill=black!10] (alig31) at (alig31.center) {\footnotesize{}};
+}
+\visible<9>{
+\node [anchor=center,selectnode,fill=red!30] (alig21) at (alig21.center) {\footnotesize{}};
+\node [anchor=center,selectnode,fill=red!30] (alig13) at (alig13.center) {\footnotesize{D}};
+\node [anchor=center,selectnode,fill=black!10] (alig31) at (alig31.center) {\footnotesize{}};
+}
+\visible<10->{
+\node [anchor=center,selectnode,fill=blue!30] (alig31) at (alig31.center) {\footnotesize{}};
+\node [anchor=center,selectnode,fill=blue!30] (alig32) at (alig32.center) {\footnotesize{F}};
+\node [anchor=center,selectnode,fill=blue!30] (alig33) at (alig33.center) {\footnotesize{}};
+\node [anchor=center,selectnode,fill=blue!30] (alig41) at (alig41.center) {\footnotesize{A}};
+\node [anchor=center,selectnode,fill=blue!30] (alig42) at (alig42.center) {\footnotesize{}};
+\node [anchor=center,selectnode,fill=blue!30] (alig51) at (alig51.center) {\footnotesize{S}};
+}
+\end{scope}
+\end{tikzpicture}
+\end{center}
+\end{itemize}
+\end{frame}
+%%%------------------------------------------------------------------------------------------------------------
+%%%  CYK解码
 \begin{frame}{CYK解码（续）}
 % 看NiuTrans Manual
 \begin{itemize}