Commit de3158ef by 姜雨帆

Update slides

parent 9c16b856
...@@ -936,7 +936,7 @@ ...@@ -936,7 +936,7 @@
\end{tikzpicture} \end{tikzpicture}
\end{center} \end{center}
\item<2-> 显然上图中的短语并不是语言学上的短语。这里有:\\ \item<2-> 显然上图中的短语并\alert{不是}语言学上的短语。这里有:\\
\vspace{0.3em} \vspace{0.3em}
\begin{beamerboxesrounded}[upper=uppercolblue,lower=lowercolblue,shadow=true]{定义 - 短语} \begin{beamerboxesrounded}[upper=uppercolblue,lower=lowercolblue,shadow=true]{定义 - 短语}
...@@ -1068,6 +1068,32 @@ $d$是一个$(\textbf{s},\textbf{t})$上基于短语的翻译推导,$\textrm{P ...@@ -1068,6 +1068,32 @@ $d$是一个$(\textbf{s},\textbf{t})$上基于短语的翻译推导,$\textrm{P
\end{frame} \end{frame}
%%%------------------------------------------------------------------------------------------------------------ %%%------------------------------------------------------------------------------------------------------------
%%% 数学模型
\begin{frame}{数学模型(续)}
\begin{itemize}
\item 但是,上式提到的翻译推导的样本空间是巨大的,很难枚举所有推导并进行求和。通常使用采样的方法选取搜索空间的一部分样本代表整个搜索空间
\vspace{0.3em}
\begin{center}
\begin{tikzpicture}
\node [anchor=west] (s1) at (0,0) {$\textrm{P}(\textbf{t}|\textbf{s}) = $};
\node [anchor=west,inner sep=3pt,fill=red!20] (s2) at ([xshift=0.1em]s1.east) {$\sum_{d} \textrm{P}(d,\textbf{t}|\textbf{s})$};
\node [anchor=west,inner sep=3pt,fill=green!20,minimum width=7.5em] (s3) at ([xshift=2.5em,yshift=1.5em]s2.east) {$\textrm{Max}\ \textrm{P}(d,\textbf{t}|\textbf{s})$};
\node [anchor=west,inner sep=3pt,fill=green!20,minimum width=7.5em] (s4) at ([xshift=2.5em,yshift=-1.5em]s2.east) {$\sum_{d_{nbest}} \textrm{P}(d,\textbf{t}|\textbf{s})$};
\draw[->,thick] ([xshift=-0.1em]s3.west) -- ([xshift=0.1em,yshift=0.3em]s2.east);
\draw[->,thick] ([xshift=-0.1em]s4.west) -- ([xshift=0.1em,yshift=-0.3em]s2.east);
\end{tikzpicture}
\end{center}
\vspace{0.3em}
如1-best(Viterbi)或者n-best的和来近似所有的和
\item<2-> 若采用Viterbi的方法,机器翻译也可看作对于输入的源语言句子$\textbf{s}$,找到最佳翻译推导$\hat{d}$
\begin{displaymath}
\hat{d} = \argmax_{d} \textrm{P}(d,\textbf{t}|\textbf{s})
\end{displaymath}
在后面的内容中出现的 $\hat{d}$ 和 $\hat{t}$ 都可以看作是等价的
\end{itemize}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% 翻译推导的建模 %%% 翻译推导的建模
\begin{frame}{对翻译推导进行建模} \begin{frame}{对翻译推导进行建模}
\vspace{-0.4em} \vspace{-0.4em}
...@@ -2062,7 +2088,7 @@ $X$ & $\to$ & $\bar{s},$ & $\bar{t}$ & (R3)\\ ...@@ -2062,7 +2088,7 @@ $X$ & $\to$ & $\bar{s},$ & $\bar{t}$ & (R3)\\
\item 参考前一个短语,来判断当前短语是否需要进行,调序距离设为$\textrm{start}_i-\textrm{end}_{i-1}-1$ \item 参考前一个短语,来判断当前短语是否需要进行,调序距离设为$\textrm{start}_i-\textrm{end}_{i-1}-1$
\begin{itemize} \begin{itemize}
\item $\textrm{start}_i$是指翻译成第$i$个目标语短语的源语短语中的第一个词,而$\textrm{end}_i$为源于短语最后一个词 \item $\textrm{start}_i$是指翻译成第$i$个目标语短语的源语短语中的第一个词,而$\textrm{end}_i$为源语短语最后一个词($\textrm{end}_0$为0)
\end{itemize} \end{itemize}
\vspace{0.0em} \vspace{0.0em}
\visible<2->{ \visible<2->{
...@@ -2087,18 +2113,21 @@ $X$ & $\to$ & $\bar{s},$ & $\bar{t}$ & (R3)\\ ...@@ -2087,18 +2113,21 @@ $X$ & $\to$ & $\bar{s},$ & $\bar{t}$ & (R3)\\
\path[<->, thick] (s1.south) edge (t2.north); \path[<->, thick] (s1.south) edge (t2.north);
\path[<->, thick] (s2.south) edge (t1.north); \path[<->, thick] (s2.south) edge (t1.north);
\node[anchor=west] (target) at ([xshift=3em,yshift=-1em]n5.east) {\scriptsize{目标短语}}; \node[anchor=west] (target) at ([xshift=3em,yshift=2.3em]n5.east) {\scriptsize{目标短语}};
\node[anchor=west] (source) at ([xshift=0.7em]target.east) {\scriptsize{源短语}}; \node[anchor=west] (source) at ([xshift=0.7em]target.east) {\scriptsize{源短语}};
\node[anchor=west] (distance) at ([xshift=0.7em]source.east) {\scriptsize{距离}}; \node[anchor=west] (distance) at ([xshift=0.7em]source.east) {\scriptsize{距离}};
\node[anchor=north] (t1) at ([yshift=-0.1em]target.south) {1}; \node[anchor=north] (t1) at ([yshift=-0.1em]target.south) {1};
\node[anchor=north] (t2) at ([yshift=0.1em]t1.south) {2}; \node[anchor=north] (t2) at ([yshift=-1.8em]t1.south) {2};
\node[anchor=north] (so1) at ([yshift=-0.1em]source.south) {5}; \node[anchor=north] (so1) at ([yshift=-0.1em]source.south) {5};
\node[anchor=north] (so2) at ([yshift=0.1em]so1.south) {1-4}; \node[anchor=north] (so2) at ([yshift=-1.8em]so1.south) {1-4};
\node[anchor=north] (d1) at ([yshift=-0.1em]distance.south) {+4}; \node[anchor=north] (d1) at ([yshift=-0.1em]distance.south) {+4};
\node[anchor=north] (d2) at ([yshift=0.1em]d1.south) {-5}; \node[anchor=north] (d2) at ([yshift=-1.8em]d1.south) {-5};
\node[anchor=north west,fill=red!20] (m1) at ([xshift=-1em,yshift=-0.0em]t1.south west) {\scriptsize{$\textrm{start}_1-\textrm{end}_{0}-1$ = 5 - 0 - 1}};
\node[anchor=north west,fill=red!20] (m2) at ([xshift=-1em,yshift=-0.0em]t2.south west) {\scriptsize{$\textrm{start}_2-\textrm{end}_{1}-1$ = 1 - 5 - 1}};
\draw[-] (target.south west)--([xshift=1.6in]target.south west); \draw[-] (target.south west)--([xshift=1.6in]target.south west);
...@@ -2108,21 +2137,22 @@ $X$ & $\to$ & $\bar{s},$ & $\bar{t}$ & (R3)\\ ...@@ -2108,21 +2137,22 @@ $X$ & $\to$ & $\bar{s},$ & $\bar{t}$ & (R3)\\
\draw[-,thick] (s2.north west)--([yshift=0.3in]s2.north west); \draw[-,thick] (s2.north west)--([yshift=0.3in]s2.north west);
\draw[->,densely dotted,thick] ([yshift=0.3in]s2.north west)--([xshift=-0.3in,yshift=0.3in]s2.north west); \draw[->,densely dotted,thick] ([yshift=0.3in]s2.north west)--([xshift=-0.3in,yshift=0.3in]s2.north west);
\node[anchor=south] (ld1) at ([xshift=-0.1em,yshift=0.4em]n1.north) {\scriptsize{d=-5}}; \node[anchor=south] (ld1) at ([xshift=-0.1em,yshift=0.4em]n1.north) {\scriptsize{x=-5}};
\node[anchor=south] (ld2) at ([xshift=6em,yshift=0.4em]n1.north) {\scriptsize{d=+4}}; \node[anchor=south] (ld2) at ([xshift=6em,yshift=0.4em]n1.north) {\scriptsize{x=+4}};
\end{scope} \end{scope}
\end{tikzpicture} \end{tikzpicture}
\end{center} \end{center}
} }
\vspace{0.3em} \vspace{0.3em}
\item<3-> 代价函数选择指数衰减函数$d(x)=\alpha^{|x|}$,其中$\alpha$通过近似估计得到,$\alpha \in$[0,1] \item<3-> 代价函数选择指数衰减函数$c(x)=\alpha^{|x|}$,其中$\alpha$通过近似估计得到,$\alpha \in$[0,1]
\begin{itemize} \begin{itemize}
\item<3-> 调序距离越大,调序代价越大 \item<3-> 调序距离越大,调序代价越大
\end{itemize} \end{itemize}
\end{itemize} \end{itemize}
\end{frame} \end{frame}
%%%------------------------------------------------------------------------------------------------------------ %%%------------------------------------------------------------------------------------------------------------
%%% 调序模型2:MSD模型 %%% 调序模型2:MSD模型
\begin{frame}{调序模型2:MSD模型} \begin{frame}{调序模型2:MSD模型}
...@@ -2252,7 +2282,7 @@ $X$ & $\to$ & $\bar{s},$ & $\bar{t}$ & (R3)\\ ...@@ -2252,7 +2282,7 @@ $X$ & $\to$ & $\bar{s},$ & $\bar{t}$ & (R3)\\
\item 来详细的分析一下 \item 来详细的分析一下
\vspace{-1em} \vspace{-1em}
\begin{displaymath} \begin{displaymath}
\textrm{P}(\textbf{o}|\textbf{s},\textbf{t},\textbf{a}) = \prod_{i=1}^{K} \textrm{P}(o_i| \bar{s}_{a_i}, \bar{t}_i, a_{i-1}, a_i) \Pr(\textbf{o}|\textbf{s},\textbf{t},\textbf{a}) = \prod_{i=1}^{K} \Pr(o_i| \bar{s}_{a_i}, \bar{t}_i, a_{i-1}, a_i)
\end{displaymath} \end{displaymath}
\vspace{-1em} \vspace{-1em}
\begin{itemize} \begin{itemize}
...@@ -2274,11 +2304,11 @@ o_i = \left\{ \begin{array}{ll} ...@@ -2274,11 +2304,11 @@ o_i = \left\{ \begin{array}{ll}
\end{itemize} \end{itemize}
\vspace{0.1em} \vspace{0.1em}
\begin{displaymath} \begin{displaymath}
f_{M-pre}(d) = \prod_{i=1}^{K} \textrm{P}(o_i = M| \bar{s}_{a_i}, \bar{t}_i, a_{i-1}, a_i) f_{\textrm{M-pre}}(d) = \prod_{i=1}^{K} \Pr(o_i = M| \bar{s}_{a_i}, \bar{t}_i, a_{i-1}, a_i)
\end{displaymath} \end{displaymath}
\vspace{-0.8em} \vspace{-0.8em}
\begin{itemize} \begin{itemize}
\item 我们还可以得到$f_{S-pre}(d)$和$f_{D-pre}(d)$,此外将$a_{i-1}$换成$a_{i+1}$,还可以得到每个短语与后面短语的调序类型 \item 我们还可以得到$f_{\textrm{S-pre}}(d)$和$f_{\textrm{D-pre}}(d)$,此外将$a_{i-1}$换成$a_{i+1}$,还可以得到每个短语与后面短语的调序类型
\end{itemize} \end{itemize}
\end{itemize} \end{itemize}
...@@ -2472,13 +2502,9 @@ f_{M-pre}(d) = \prod_{i=1}^{K} \textrm{P}(o_i = M| \bar{s}_{a_i}, \bar{t}_i, a_{ ...@@ -2472,13 +2502,9 @@ f_{M-pre}(d) = \prod_{i=1}^{K} \textrm{P}(o_i = M| \bar{s}_{a_i}, \bar{t}_i, a_{
\begin{itemize} \begin{itemize}
\item 但是,对于非连续的调序该模型无法处理 \item 但是,对于非连续的调序该模型无法处理
\end{itemize} \end{itemize}
% \begin{itemize}
% \item 我们还可以得到$f_{S-pre}(d)$和$f_{D-pre}(d)$,此外将$a_{i-1}$换成$a_{i+1}$,还可以得到每个短语与后面短语的调序类型
% \end{itemize}
\item 对于每一种翻译推导$d$,基于最大熵的调序模型的得分计算公式如下 \item 对于每一种翻译推导$d$,基于最大熵的调序模型的得分计算公式如下
\begin{displaymath} \begin{displaymath}
f_{ME}(d) = \prod_{<o,X_1,X_2> \in d} \textrm{P}(o|X_1, X_2) f_{\textrm{ME}}(d) = \prod_{<o,X_1,X_2> \in d} \Pr(o|X_1, X_2)
\end{displaymath} \end{displaymath}
\end{itemize} \end{itemize}
\end{frame} \end{frame}
...@@ -2501,7 +2527,7 @@ f_{ME}(d) = \prod_{<o,X_1,X_2> \in d} \textrm{P}(o|X_1, X_2) ...@@ -2501,7 +2527,7 @@ f_{ME}(d) = \prod_{<o,X_1,X_2> \in d} \textrm{P}(o|X_1, X_2)
\item 把每个子模型当作一个特征,为每个模型添加一个权重,然后使用对数线性模型对这些子模型进行建模,对数线性模型的形式如下: \item 把每个子模型当作一个特征,为每个模型添加一个权重,然后使用对数线性模型对这些子模型进行建模,对数线性模型的形式如下:
\vspace{-0.8em} \vspace{-0.8em}
\begin{displaymath} \begin{displaymath}
\textrm{P}(d,\textbf{t}|\textbf{s}) = \exp(\sum_{i=1}^{M} \lambda_i \cdot h_i(d,\textbf{s},\textbf{t})) \textrm{P}(d,\textbf{t}|\textbf{s}) \propto \exp(\sum_{i=1}^{M} \lambda_i \cdot h_i(d,\textbf{s},\textbf{t}))
\end{displaymath} \end{displaymath}
\vspace{-1.2em} \vspace{-1.2em}
\begin{itemize} \begin{itemize}
...@@ -2509,7 +2535,7 @@ f_{ME}(d) = \prod_{<o,X_1,X_2> \in d} \textrm{P}(o|X_1, X_2) ...@@ -2509,7 +2535,7 @@ f_{ME}(d) = \prod_{<o,X_1,X_2> \in d} \textrm{P}(o|X_1, X_2)
\end{itemize} \end{itemize}
\vspace{0.8em} \vspace{0.8em}
\begin{displaymath} \begin{displaymath}
\textrm{P}(d,\textbf{t}|\textbf{s}) = \prod_{(\bar{s},\bar{t}) \in d} \textrm{P}(\bar{t}|\bar{s})^{\lambda_{1}} \times f(d)^{\lambda_{2}} \times \Pr\nolimits_{lm}(\mathbf{t})^{\lambda_{lm}} \textrm{P}(d,\textbf{t}|\textbf{s}) = \prod_{(\bar{s},\bar{t}) \in d} \Pr(\bar{t}|\bar{s})^{\lambda_{1}} \times f(d)^{\lambda_{2}} \times \Pr\nolimits_{\textrm{lm}}(\mathbf{t})^{\lambda_{lm}}
\end{displaymath} \end{displaymath}
\item 可以引入更多的特征来提高翻译质量(下面介绍) \item 可以引入更多的特征来提高翻译质量(下面介绍)
\end{itemize} \end{itemize}
...@@ -2520,8 +2546,8 @@ f_{ME}(d) = \prod_{<o,X_1,X_2> \in d} \textrm{P}(o|X_1, X_2) ...@@ -2520,8 +2546,8 @@ f_{ME}(d) = \prod_{<o,X_1,X_2> \in d} \textrm{P}(o|X_1, X_2)
\begin{frame}{特征} \begin{frame}{特征}
% 给出特征列表 % 给出特征列表
\begin{itemize} \begin{itemize}
\item \textbf{特征1-2: 短语翻译概率},即正向翻译概率$\textrm{P}(\bar{s}|\bar{t})$和反向翻译概率$\textrm{P}(\bar{t}|\bar{s})$。是基于短语的统计机器翻译模型中最主要的特征。 \item \textbf{特征1-2: 短语翻译概率},即正向翻译概率$\Pr(\bar{s}|\bar{t})$和反向翻译概率$\Pr(\bar{t}|\bar{s})$。是基于短语的统计机器翻译模型中最主要的特征。
\item \textbf{特征3-4: 词汇翻译概率},即正向词汇翻译概率$\Pr_{lex}(\bar{t}|\bar{s})$和反向词汇翻译概率$\Pr_{lex}(\bar{s}|\bar{t})$。用来描述短语对中源语端单词和目标语端单词的对应关系 \item \textbf{特征3-4: 词汇翻译概率},即正向词汇翻译概率$\Pr_{\textrm{lex}}(\bar{t}|\bar{s})$和反向词汇翻译概率$\Pr_{\textrm{lex}}(\bar{s}|\bar{t})$。用来描述短语对中源语端单词和目标语端单词的对应关系
\item<2-> \textbf{特征5: $n$-gram语言模型},即$\textrm{P}_{\textrm{lm}}(\textbf{t})$。度量译文的流畅度,可以使用大规模目标语单语数据得到。 \item<2-> \textbf{特征5: $n$-gram语言模型},即$\textrm{P}_{\textrm{lm}}(\textbf{t})$。度量译文的流畅度,可以使用大规模目标语单语数据得到。
\item<2-> \textbf{特征6:译文长度},即$|\textbf{t}|$。避免模型倾向于短译文,同时让系统自动学习对译文长度的偏好。 \item<2-> \textbf{特征6:译文长度},即$|\textbf{t}|$。避免模型倾向于短译文,同时让系统自动学习对译文长度的偏好。
\item<2-> \textbf{特征7:翻译规则数量}。这个特征是为了避免模型仅仅使用少量特征构成翻译推导(因为翻译概率相乘,因子少结果一般会大一些),同时让系统自动学习对使用规则数量的偏好。 \item<2-> \textbf{特征7:翻译规则数量}。这个特征是为了避免模型仅仅使用少量特征构成翻译推导(因为翻译概率相乘,因子少结果一般会大一些),同时让系统自动学习对使用规则数量的偏好。
...@@ -2535,23 +2561,19 @@ f_{ME}(d) = \prod_{<o,X_1,X_2> \in d} \textrm{P}(o|X_1, X_2) ...@@ -2535,23 +2561,19 @@ f_{ME}(d) = \prod_{<o,X_1,X_2> \in d} \textrm{P}(o|X_1, X_2)
\begin{itemize} \begin{itemize}
\item \textbf{特征8:源语言被翻译为空的单词数量}。注意,空翻译规则(或特征)有时也被称作evil feature,这类特征在一些数据集上对BLEU有很好的提升作用,但是会造成人工评价的下降,因此需要谨慎使用。 \item \textbf{特征8:源语言被翻译为空的单词数量}。注意,空翻译规则(或特征)有时也被称作evil feature,这类特征在一些数据集上对BLEU有很好的提升作用,但是会造成人工评价的下降,因此需要谨慎使用。
\item \textbf{特征9:基于最大熵的调序模型}$f_{ME}(d)$ \item \textbf{特征9:基于最大熵的调序模型}$f_{ME}(d)$
\item \textbf{特征10:基于MSD的调序模型},包括与前一个短语的调序$f_{M-pre}(d)$、$f_{S-pre}(d)$、$f_{D-pre}(d)$, 和后一个短语的调序$f_{M-fol}(d)$、$f_{S-fol}(d)$、$f_{D-fol}(d)$ \item \textbf{特征10:基于MSD的调序模型},包括与前一个短语的调序$f_{\textrm{M-pre}}(d)$、$f_{\textrm{S-pre}}(d)$、$f_{\textrm{D-pre}}(d)$, 和后一个短语的调序$f_{\textrm{M-fol}}(d)$、$f_{\textrm{S-fol}}(d)$、$f_{\textrm{D-fol}}(d)$
\item \textbf{最终模型得分} \item \textbf{最终模型得分}
\vspace{0.3em} \vspace{0.3em}
\begin{center} \begin{center}
\begin{tikzpicture} \begin{tikzpicture}
\begin{scope}[minimum height = 15pt] \begin{scope}[minimum height = 15pt]
\node[anchor=west,minimum width=3em] (x1) at (0, 0) {\footnotesize{$\textrm{P}(d,\textbf{t}|\textbf{s}) = \prod_{(\bar{s},\bar{t}) \in d} score(\bar{s},\bar{t}) \times f_{ME}(d)^{\lambda_{ME}} \times f_{MSD}(d)^{\lambda_{MSD}} \times$}}; \node[anchor=west,minimum width=3em] (x1) at (0, 0) {\footnotesize{$\textrm{P}(d,\textbf{t}|\textbf{s}) = \prod_{(\bar{s},\bar{t}) \in d} score(\bar{s},\bar{t}) \times f_{\textrm{ME}}(d)^{\lambda_{ME}} \times f_{\textrm{MSD}}(d)^{\lambda_{MSD}} \times$}};
\node[anchor=north west] (x2) at ([xshift=4em,yshift=0.1em]x1.south west) {\footnotesize{$\Pr\nolimits_{lm}(\mathbf{t})^{\lambda_{lm}} \times \exp(\lambda_{TWB} \cdot length(\mathbf{t})) / Z(\mathbf{s})$}}; \node[anchor=north west] (x2) at ([xshift=4em,yshift=0.1em]x1.south west) {\footnotesize{$\Pr\nolimits_{\textrm{lm}}(\mathbf{t})^{\lambda_{lm}} \times \exp(\lambda_{TWB} \cdot length(\mathbf{t})) / Z(\mathbf{s})$}};
\node[anchor=north west] (x3) at ([yshift=-1.8em]x1.south west) {\footnotesize{$score(\bar{s},\bar{t}) = \textrm{P}(\bar{t}|\bar{s})^{\lambda_{1}} \times \textrm{P}(\bar{s}|\bar{t})^{\lambda_{2}} \times \Pr\nolimits_{lex}(\bar{t}|\bar{s})^{\lambda_{3}} \times \Pr\nolimits_{lex}(\bar{s}|\bar{t})^{\lambda_{4}} \times$}}; \node[anchor=north west] (x3) at ([yshift=-1.8em]x1.south west) {\footnotesize{$score(\bar{s},\bar{t}) = \Pr(\bar{t}|\bar{s})^{\lambda_{1}} \times \Pr(\bar{s}|\bar{t})^{\lambda_{2}} \times \Pr\nolimits_{\textrm{lex}}(\bar{t}|\bar{s})^{\lambda_{3}} \times \Pr\nolimits_{\textrm{lex}}(\bar{s}|\bar{t})^{\lambda_{4}} \times$}};
\node[anchor=north west] (x4) at ([xshift=5em,yshift=0.1em]x3.south west) {\footnotesize{$\exp(\lambda_{PB}) \times \exp(\lambda_{WDB} \cdot \delta(\bar{s} \to null))$}}; \node[anchor=north west] (x4) at ([xshift=5em,yshift=0.1em]x3.south west) {\footnotesize{$\exp(\lambda_{PB}) \times \exp(\lambda_{WDB} \cdot \delta(\bar{s} \to null))$}};
\end{scope} \end{scope}
\end{tikzpicture} \end{tikzpicture}
\end{center} \end{center}
%\begin{displaymath}
%\textrm{P}(d,\textbf{t}|\textbf{s}) = \prod_{(\bar{s},\bar{t}) \in d} score(\bar{s},\bar{t}) \times f_{ME}(d)^{\lambda_{ME}} \times f_{MSD}(d)^{\lambda_{MSD}} \times \nonumber \\
%%\Pr\nolimits_{lm}(\mathbf{t})^{\lambda_{lm}} \times \exp(\lambda_{TWB} \cdot length(\mathbf{t})) / Z(\mathbf{s})
%\end{displaymath}
\end{itemize} \end{itemize}
\end{frame} \end{frame}
...@@ -2563,24 +2585,24 @@ f_{ME}(d) = \prod_{<o,X_1,X_2> \in d} \textrm{P}(o|X_1, X_2) ...@@ -2563,24 +2585,24 @@ f_{ME}(d) = \prod_{<o,X_1,X_2> \in d} \textrm{P}(o|X_1, X_2)
\item 对于训练样本$S=\{(f_1,r_1),...,(f_s,r_s)\}$,有 \item 对于训练样本$S=\{(f_1,r_1),...,(f_s,r_s)\}$,有
\begin{itemize} \begin{itemize}
\item $f_s$为样本中的第$s$个源语句子,$r_s$为相应的译文,通常使用$R=\{r_1,...r_s\}$来表示训练样本的参考译文 \item $f_s$为样本中的第$s$个源语句子,$r_s$为相应的译文,通常使用$R=\{r_1,...r_s\}$来表示训练样本的参考译文
\item 针对每个源语句子,解码器可以生成一个n-best结果$\{t_{ij}\}$ \item 针对每个源语句子,解码器可以生成一个n-best结果$\{d_{ij}\}$
\end{itemize} \end{itemize}
\item 对于模型参数$\lambda$,最佳的翻译结果为$T^*=\{t_{1}^{*},..,t_{i}^{*}\}$ \item 对于模型参数$\lambda$,最佳的翻译推导为$D^*=\{d_{1}^{*},..,d_{i}^{*}\}$
\vspace{-0.5em} \vspace{-0.5em}
\begin{displaymath} \begin{displaymath}
t_{i}^{*} = \argmin_{t_{ij}} \sum_{k=1}^{M} \lambda_k \cdot h_k(t_{ij}) d_{i}^{*} = \argmax_{d_{ij}} \sum_{k=1}^{M} \lambda_k \cdot h_k(d_{ij})
\end{displaymath} \end{displaymath}
\vspace{-0.9em} \vspace{-0.9em}
\item<2-> 最小错误率训练(MERT) \item<2-> 最小错误率训练(MERT)
\vspace{0.1em} \vspace{0.1em}
\begin{itemize} \begin{itemize}
\item<2-> 定义一个错误函数Err$(T^*, R)$来衡量译文$T^*$与参考答案$R$之间的差距,通过调整权重$\lambda$来最小化错误率 \item<2-> 定义一个错误函数Err$(D^*, R)$来衡量推导$D^*$得到的译文与参考答案$R$之间的差距,通过调整权重$\lambda$来最小化错误率
\item<2-> 常见的错误函数有词错误率(WER)、位置错误率(PER)、BLEU值以及NIST值 \item<2-> 常见的错误函数有词错误率(WER)、位置错误率(PER)、BLEU值以及NIST值
\end{itemize} \end{itemize}
\vspace{0.3em} \vspace{0.3em}
\visible<2->{ \visible<2->{
\begin{displaymath} \begin{displaymath}
\mathbf{\lambda}^* = \argmin_{\mathbf{\lambda}} \mathbf{Err}(T^*, R) \mathbf{\lambda}^* = \argmin_{\mathbf{\lambda}} \mathbf{Err}(D^*, R)
\end{displaymath} \end{displaymath}
} }
\end{itemize} \end{itemize}
...@@ -2594,11 +2616,11 @@ t_{i}^{*} = \argmin_{t_{ij}} \sum_{k=1}^{M} \lambda_k \cdot h_k(t_{ij}) ...@@ -2594,11 +2616,11 @@ t_{i}^{*} = \argmin_{t_{ij}} \sum_{k=1}^{M} \lambda_k \cdot h_k(t_{ij})
\item 如何得到最优的$\lambda^*$ \item 如何得到最优的$\lambda^*$
\begin{itemize} \begin{itemize}
\item 最简单的方法是枚举所有可能的$\lambda$值,但是这样做效率很低。可以只考虑最优译文发生变化的点:) \item 最简单的方法是枚举所有可能的$\lambda$值,但是这样做效率很低。可以只考虑最优译文发生变化的点:)
\item 对于每个训练样本,假设有2-best个译文$\mathbf{t}=\{t_1,t_2\}$,每个译文$t$的得分modelscore($t$)可以表示成关于权重$\lambda_i$的函数 \item 对于每个训练样本,假设有2-best个推导$\mathbf{d}=\{d_1,d_2\}$,每个推导$d$的得分modelscore($d$)可以表示成关于权重$\lambda_i$的函数
\end{itemize} \end{itemize}
\vspace{0.2em} \vspace{0.2em}
\begin{displaymath} \begin{displaymath}
\textrm{modelscore}(t) = \lambda_i \cdot h_i(t) + \sum_{k{\ne}i}^{M} \lambda_k \cdot h_k(t) = a \cdot \lambda_i + b \textrm{modelscore}(d) = \lambda_i \cdot h_i(d) + \sum_{k{\ne}i}^{M} \lambda_k \cdot h_k(d) = a \cdot \lambda_i + b
\end{displaymath} \end{displaymath}
\vspace{-0.7em} \vspace{-0.7em}
\begin{center} \begin{center}
...@@ -2612,8 +2634,8 @@ t_{i}^{*} = \argmin_{t_{ij}} \sum_{k=1}^{M} \lambda_k \cdot h_k(t_{ij}) ...@@ -2612,8 +2634,8 @@ t_{i}^{*} = \argmin_{t_{ij}} \sum_{k=1}^{M} \lambda_k \cdot h_k(t_{ij})
\visible<1-2>{ \visible<1-2>{
\draw[thick] ([yshift=1em]x0.center) -- ([xshift=8em,yshift=5em]x0.center); \draw[thick] ([yshift=1em]x0.center) -- ([xshift=8em,yshift=5em]x0.center);
\draw[thick] ([yshift=2em]x0.center) -- ([xshift=8em,yshift=4em]x0.center); \draw[thick] ([yshift=2em]x0.center) -- ([xshift=8em,yshift=4em]x0.center);
\node[anchor=north] (e1) at ([xshift=6em,yshift=6em]x0.south) {\footnotesize{$t_1$}}; \node[anchor=north] (e1) at ([xshift=6em,yshift=6em]x0.south) {\footnotesize{$d_1$}};
\node[anchor=north] (e2) at ([xshift=7em,yshift=4em]x0.south) {\footnotesize{$t_2$}}; \node[anchor=north] (e2) at ([xshift=7em,yshift=4em]x0.south) {\footnotesize{$d_2$}};
\node[anchor=north,rotate=90] (e2) at ([xshift=-1.3em,yshift=3.6em]x0.south) {\footnotesize{model score}}; \node[anchor=north,rotate=90] (e2) at ([xshift=-1.3em,yshift=3.6em]x0.south) {\footnotesize{model score}};
} }
...@@ -2631,8 +2653,8 @@ t_{i}^{*} = \argmin_{t_{ij}} \sum_{k=1}^{M} \lambda_k \cdot h_k(t_{ij}) ...@@ -2631,8 +2653,8 @@ t_{i}^{*} = \argmin_{t_{ij}} \sum_{k=1}^{M} \lambda_k \cdot h_k(t_{ij})
\draw[thick,dotted] ([xshift=4em]x0.center) -- ([xshift=4em,yshift=5.5em]x0.center); \draw[thick,dotted] ([xshift=4em]x0.center) -- ([xshift=4em,yshift=5.5em]x0.center);
\node[anchor=north] (e1) at ([xshift=2em,yshift=3em]x0.north) {\footnotesize{$t^*=t_1$}}; \node[anchor=north] (e1) at ([xshift=2em,yshift=3em]x0.north) {\footnotesize{$d^*=d_1$}};
\node[anchor=north] (e2) at ([xshift=6.2em,yshift=5em]x0.north) {\footnotesize{$t^*=t_2$}}; \node[anchor=north] (e2) at ([xshift=6.2em,yshift=5em]x0.north) {\footnotesize{$d^*=d_2$}};
\node[anchor=north,rotate=90] (e2) at ([xshift=-1.3em,yshift=3.6em]x0.south) {\footnotesize{BLEU}}; \node[anchor=north,rotate=90] (e2) at ([xshift=-1.3em,yshift=3.6em]x0.south) {\footnotesize{BLEU}};
\draw[decorate,decoration={brace,amplitude=0.4em},red,thick] ([xshift=3.8em,yshift=0.5em]x0.south) -- ([xshift=8.2em,yshift=0.5em]x0.south); \draw[decorate,decoration={brace,amplitude=0.4em},red,thick] ([xshift=3.8em,yshift=0.5em]x0.south) -- ([xshift=8.2em,yshift=0.5em]x0.south);
...@@ -2666,6 +2688,7 @@ t_{i}^{*} = \argmin_{t_{ij}} \sum_{k=1}^{M} \lambda_k \cdot h_k(t_{ij}) ...@@ -2666,6 +2688,7 @@ t_{i}^{*} = \argmin_{t_{ij}} \sum_{k=1}^{M} \lambda_k \cdot h_k(t_{ij})
\end{itemize} \end{itemize}
\end{frame} \end{frame}
%%%------------------------------------------------------------------------------------------------------------ %%%------------------------------------------------------------------------------------------------------------
\subsection{栈解码} \subsection{栈解码}
...@@ -2674,14 +2697,14 @@ t_{i}^{*} = \argmin_{t_{ij}} \sum_{k=1}^{M} \lambda_k \cdot h_k(t_{ij}) ...@@ -2674,14 +2697,14 @@ t_{i}^{*} = \argmin_{t_{ij}} \sum_{k=1}^{M} \lambda_k \cdot h_k(t_{ij})
\begin{frame}{解码问题} \begin{frame}{解码问题}
% 定义解码是啥 % 定义解码是啥
\begin{itemize} \begin{itemize}
\item 解码是根据模型以及输入原文,找到得分最高的译文 ${d}^*$ \item 解码是根据模型以及输入原文,找到得分最高的推导 ${d}^*$
\begin{displaymath} \begin{displaymath}
\mathbf{d}^* = \argmax_{\mathbf{t}} \sum_{d \in D(\mathbf{s}, \mathbf{t})} \textrm{P}(\mathbf{t}, d|\mathbf{s}) d^* = \argmax_{d \in D(\mathbf{s}, \mathbf{t})} \Pr(d, \mathbf{t}|\mathbf{s})
\end{displaymath} \end{displaymath}
\vspace{-0.8em} \vspace{-0.8em}
\begin{itemize} \begin{itemize}
\item 其中 $D$表示所有可能的推导构成的搜索空间。 \item 其中 $D$表示所有可能的推导构成的搜索空间。
\item $\textrm{P}(\mathbf{t}, d|\mathbf{s})$表示前面提到的所有特征的得分 \item $\Pr(d, \mathbf{t}|\mathbf{s})$表示前面提到的所有特征的得分
\end{itemize} \end{itemize}
\item 实际解码过程中,通常按从左到右的顺序生成译文,递增的计算翻译概率,同时对已翻译的原文进行标记 \item 实际解码过程中,通常按从左到右的顺序生成译文,递增的计算翻译概率,同时对已翻译的原文进行标记
\vspace{1em} \vspace{1em}
...@@ -3728,6 +3751,148 @@ d = r_1 \circ r_2 \circ r_3 \circ r_4 ...@@ -3728,6 +3751,148 @@ d = r_1 \circ r_2 \circ r_3 \circ r_4
%%%------------------------------------------------------------------------------------------------------------ %%%------------------------------------------------------------------------------------------------------------
%%% CYK解码 %%% CYK解码
\begin{frame}{CYK算法}
% 看NiuTrans Manual
\begin{itemize}
\item 我们来看一个CYK算法的具体例子,给定一个上下文无关文法以及一个单词\alert{aabbc},来判断该单词是否属于此文法,解析流程如下
\vspace{-0.5em}
\begin{center}
\begin{tikzpicture}
\tikzstyle{alignmentnode} = [rectangle,fill=blue!30,minimum size=0.45em,text=white,inner sep=0.1pt]
\tikzstyle{selectnode} = [rectangle,fill=green!20,minimum height=1.5em,minimum width=1.5em,inner sep=1.2pt]
\tikzstyle{srcnode} = [anchor=south west]
\begin{scope}[scale=0.85]
\node[srcnode] (c1) at (0,0) {\small{a}};
\node[srcnode] (c2) at ([xshift=1em]c1.south east) {\small{a}};
\node[srcnode] (c3) at ([xshift=1em]c2.south east) {\small{b}};
\node[srcnode] (c4) at ([xshift=1em]c3.south east) {\small{b}};
\node[srcnode] (c5) at ([xshift=1em]c4.south east) {\small{c}};
\node[anchor=south east] (g1) at ([xshift=1em,yshift=3.0em]c1.north west) {\small{$\textrm{S} \to \textrm{AB}\ \ \ \textrm{A} \to \textrm{CD}\ \vert \ \textrm{CF}\ \ \ \textrm{B} \to \textrm{c}\ \vert \ \textrm{BE}$}};
\node[anchor=north west] (g2) at ([yshift=0.1em]g1.south west) {\small{$\textrm{C} \to \textrm{a}\ \ \ \ \textrm{D} \to \textrm{b}\ \ \ \ \textrm{E} \to \textrm{c}\ \ \ \ \textrm{F} \to \textrm{AD}$}};
\begin{pgfonlayer}{background}
\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=green!10,drop shadow,draw=ugreen] [fit = (g1) (g2)] (gl1) {};
\end{pgfonlayer}
\node [anchor=center,alignmentnode] (alig11) at ([yshift=-2em]c1.south) {};
\node [anchor=center,alignmentnode] (alig21) at ([yshift=-2em]alig11.center) {};
\node [anchor=center,alignmentnode] (alig31) at ([yshift=-2em]alig21.center) {};
\node [anchor=center,alignmentnode] (alig41) at ([yshift=-2em]alig31.center) {};
\node [anchor=center,alignmentnode] (alig51) at ([yshift=-2em]alig41.center) {};
\node [anchor=center,alignmentnode] (alig12) at ([yshift=-2em]c2.south) {};
\node [anchor=center,alignmentnode] (alig22) at ([yshift=-2em]alig12.center) {};
\node [anchor=center,alignmentnode] (alig32) at ([yshift=-2em]alig22.center) {};
\node [anchor=center,alignmentnode] (alig42) at ([yshift=-2em]alig32.center) {};
\node [anchor=center,alignmentnode] (alig13) at ([yshift=-2em]c3.south) {};
\node [anchor=center,alignmentnode] (alig23) at ([yshift=-2em]alig13.center) {};
\node [anchor=center,alignmentnode] (alig33) at ([yshift=-2em]alig23.center) {};
\node [anchor=center,alignmentnode] (alig14) at ([yshift=-2em]c4.south) {};
\node [anchor=center,alignmentnode] (alig24) at ([yshift=-2em]alig14.center) {};
\node [anchor=center,alignmentnode] (alig15) at ([yshift=-2em]c5.south) {};
\node [anchor=center] (x1) at ([yshift=1.2em]alig11.center) {\tiny{1}};
\node [anchor=center] (x1) at ([yshift=1.2em]alig12.center) {\tiny{2}};
\node [anchor=center] (x1) at ([yshift=1.2em]alig13.center) {\tiny{3}};
\node [anchor=center] (x1) at ([yshift=1.2em]alig14.center) {\tiny{4}};
\node [anchor=center] (x1) at ([yshift=1.2em]alig15.center) {\tiny{5}};
\node [anchor=center] (y1) at ([xshift=-1.2em]alig11.center) {\tiny{1}};
\node [anchor=center] (y1) at ([xshift=-1.2em]alig21.center) {\tiny{2}};
\node [anchor=center] (y1) at ([xshift=-1.2em]alig31.center) {\tiny{3}};
\node [anchor=center] (y1) at ([xshift=-1.2em]alig41.center) {\tiny{4}};
\node [anchor=center] (y1) at ([xshift=-1.2em]alig51.center) {\tiny{5}};
\node[anchor=west] (l1) at ([xshift=-19.2em,yshift=2em]alig11.west) {\small{1.首先建立一个5*5的上三角矩阵}};
\node[anchor=west] (l2) at ([xshift=0.8em,yshift=-1.5em]l1.west) {\small{cell[i][j]代表范围内所有语法成分}};
\visible<2->{
\node[anchor=west] (l3) at ([yshift=-4em]l1.west) {\small{2.从叶子到根找到所有可能的推导}};
\node[anchor=west] (l4) at ([xshift=0.8em,yshift=-1.5em]l3.west) {\small{如cell[1][3] = cell[1][1] + cell[2][3]}};
\node[anchor=west] (l5) at ([xshift=5.45em,yshift=-1.5em]l4.west) {\small{或cell[1][2] + cell[3][3]}};
}
\visible<10->{
\node[anchor=west] (l6) at ([yshift=-5em]l3.west) {\small{3.如果最后一个cell包含了起始符}};
\node[anchor=west] (l7) at ([xshift=0.8em,yshift=-1.5em]l6.west) {\small{则该单词可以由文法推导得到}};
}
\visible<2>{
\node [anchor=center,selectnode,fill=black!10] (alig11) at (alig11.center) {\footnotesize{C}};
\node [anchor=center,selectnode,fill=red!30] (c1) at (c1.center) {\small{a}};
}
\visible<3->{
\node [anchor=center,selectnode,fill=blue!30] (alig11) at (alig11.center) {\footnotesize{C}};
}
\visible<3>{
\node [anchor=center,selectnode,fill=black!10] (alig12) at (alig12.center) {\footnotesize{C}};
\node [anchor=center,selectnode,fill=red!30] (c2) at (c2.center) {\small{a}};
}
\visible<4->{
\node [anchor=center,selectnode,fill=blue!30] (alig12) at (alig12.center) {\footnotesize{C}};
\node [anchor=center,selectnode,fill=blue!30] (alig13) at (alig13.center) {\footnotesize{D}};
\node [anchor=center,selectnode,fill=blue!30] (alig14) at (alig14.center) {\footnotesize{D}};
\node [anchor=center,selectnode,fill=blue!30] (alig15) at (alig15.center) {\footnotesize{B,E}};
}
\visible<5>{
\node [anchor=center,selectnode,fill=red!30] (alig11) at (alig11.center) {\footnotesize{C}};
\node [anchor=center,selectnode,fill=red!30] (alig12) at (alig12.center) {\footnotesize{C}};
\node [anchor=center,selectnode,fill=black!10] (alig21) at (alig21.center) {\footnotesize{}};
}
\visible<6->{
\node [anchor=center,selectnode,fill=blue!30] (alig21) at (alig21.center) {\footnotesize{}};
}
\visible<6>{
\node [anchor=center,selectnode,fill=red!30] (alig13) at (alig13.center) {\footnotesize{D}};
\node [anchor=center,selectnode,fill=red!30] (alig12) at (alig12.center) {\footnotesize{C}};
\node [anchor=center,selectnode,fill=black!10] (alig22) at (alig22.center) {\footnotesize{A}};
}
\visible<7->{
\node [anchor=center,selectnode,fill=blue!30] (alig22) at (alig22.center) {\footnotesize{A}};
\node [anchor=center,selectnode,fill=blue!30] (alig23) at (alig23.center) {\footnotesize{}};
\node [anchor=center,selectnode,fill=blue!30] (alig24) at (alig24.center) {\footnotesize{}};
}
\visible<8>{
\node [anchor=center,selectnode,fill=red!30] (alig11) at (alig11.center) {\footnotesize{C}};
\node [anchor=center,selectnode,fill=red!30] (alig22) at (alig22.center) {\footnotesize{A}};
\node [anchor=center,selectnode,fill=black!10] (alig31) at (alig31.center) {\footnotesize{}};
}
\visible<9>{
\node [anchor=center,selectnode,fill=red!30] (alig21) at (alig21.center) {\footnotesize{}};
\node [anchor=center,selectnode,fill=red!30] (alig13) at (alig13.center) {\footnotesize{D}};
\node [anchor=center,selectnode,fill=black!10] (alig31) at (alig31.center) {\footnotesize{}};
}
\visible<10->{
\node [anchor=center,selectnode,fill=blue!30] (alig31) at (alig31.center) {\footnotesize{}};
\node [anchor=center,selectnode,fill=blue!30] (alig32) at (alig32.center) {\footnotesize{F}};
\node [anchor=center,selectnode,fill=blue!30] (alig33) at (alig33.center) {\footnotesize{}};
\node [anchor=center,selectnode,fill=blue!30] (alig41) at (alig41.center) {\footnotesize{A}};
\node [anchor=center,selectnode,fill=blue!30] (alig42) at (alig42.center) {\footnotesize{}};
\node [anchor=center,selectnode,fill=blue!30] (alig51) at (alig51.center) {\footnotesize{S}};
}
\end{scope}
\end{tikzpicture}
\end{center}
\end{itemize}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% CYK解码
\begin{frame}{CYK解码(续)} \begin{frame}{CYK解码(续)}
% 看NiuTrans Manual % 看NiuTrans Manual
\begin{itemize} \begin{itemize}
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论