Commit b64ef0db by zengxin

合并分支 'zengxin' 到 'caorunzhe'

Zengxin

查看合并请求 !272
parents 822f6186 ac98a9da
......@@ -775,7 +775,7 @@
\end{pgfonlayer}
}
\node [anchor=west,ugreen] (P) at ([xshift=4em,yshift=-0.7em]corpus.east){P($t|s$)};
\node [anchor=west,ugreen] (P) at ([xshift=4em,yshift=-0.7em]corpus.east){P($\mathbf{t}|\mathbf{s}$)};
\node [anchor=south] (modellabel) at (P.north) {{\color{ublue} {\scriptsize \textbf{翻译模型}}}};
\begin{pgfonlayer}{background}
......@@ -808,18 +808,18 @@
\end{tikzpicture}
\item<3-> \textbf{步骤1}:构建单词翻译表 - 翻译词典\\
\small{对于任意的源语言单词$x$,要获得它所有可能的译文$Y$。给定一个互译句对$(s,t)$,对于$y \in Y$,定义$\textrm{P}(x \leftrightarrow y; s, t)$表示$x$和$y$在$(s,t)$中互译的概率,我们用$x$和$y$的联合概率表示:
\small{对于任意的源语言单词$x$,要获得它所有可能的译文$Y$。给定一个互译句对$(\mathbf{s},\mathbf{t})$,对于$y \in Y$,定义$\textrm{P}(x \leftrightarrow y; \mathbf{s}, \mathbf{t})$表示$x$和$y$在$(\mathbf{s},\mathbf{t})$中互译的概率,我们用$x$和$y$的联合概率表示:
\vspace{-2.0em}
\begin{eqnarray}
\textrm{P}(x \leftrightarrow y; s,t) & \equiv & \textrm{P}(x,y;s,t) \nonumber \\
& = & \frac{c(x,y;s,t)}{\sum_{x',y'} c(x',y';s,t)} \nonumber
\textrm{P}(x \leftrightarrow y; \mathbf{s},\mathbf{t}) & \equiv & \textrm{P}(x,y;\mathbf{s},\mathbf{t}) \nonumber \\
& = & \frac{c(x,y;\mathbf{s},\mathbf{t})}{\sum_{x',y'} c(x',y';\mathbf{s},\mathbf{t})} \nonumber
\end{eqnarray}
\vspace{-0.5em}
$c(x,y;s,t)$表示$(x,y)$在$(s,t)$中共现的次数; $\sum_{x',y'} c(x',y';s,t)$表示$(s,t)$中任意源/译文单词共现的总次数
$c(x,y;\mathbf{s},\mathbf{t})$表示$(x,y)$在$(\mathbf{s},\mathbf{t})$中共现的次数; $\sum_{x',y'} c(x',y';\mathbf{s},\mathbf{t})$表示$(\mathbf{s},\mathbf{t})$中任意源/译文单词共现的总次数
}
\end{itemize}
......@@ -831,7 +831,7 @@
\begin{frame}{实现一个简单的机器翻译系统:学习单词翻译概率(2)}
\vspace{-1em}
\begin{eqnarray}
\textrm{P}(x,y;s,t) & = & \frac{c(x,y;s,t)}{\sum_{x',y'} c(x',y';s,t)} \nonumber
\textrm{P}(x,y;\mathbf{s},\mathbf{t}) & = & \frac{c(x,y;\mathbf{s},\mathbf{t})}{\sum_{x',y'} c(x',y';\mathbf{s},\mathbf{t})} \nonumber
\end{eqnarray}
\vspace{-0.5em}
......@@ -839,7 +839,7 @@
\begin{flushleft}
\begin{tikzpicture}
\node [anchor=west] (s) at (0,0) {$s=$};
\node [anchor=west] (s) at (0,0) {$\mathbf{s}=$};
\node [anchor=center] (sw1) at ([xshift=1em]s.east) {机器};
\visible<1,4->{
\node [anchor=center] (sw2) at ([xshift=1.3em]sw1.east) {翻译};
......@@ -849,7 +849,7 @@
\node [anchor=center] (sw4) at ([xshift=1.0em]sw3.east) {翻译};
}
\node [anchor=north west] (t) at (s.south west) {$t=$};
\node [anchor=north west] (t) at (s.south west) {$\mathbf{t}=$};
\node [anchor=center] (tw1) at ([xshift=1.8em]t.east) {machine};
\visible<1,3,5->{
\node [anchor=center] (tw2) at ([xshift=2.2em]tw1.east) {translation};
......@@ -880,8 +880,8 @@
\begin{itemize}
\item $c(\textrm{'翻译'},\textrm{'translation'};s,t)=\only<1>{?}\visible<2->{1}\visible<3->{+1}\visible<4->{+1}\visible<5->{+1=4}$
\item<6-> $\sum_{x',y'} c(x',y';s,t)= \textrm{使劲数...} = 63\visible<7->{ = 9 \times 7 = |s| \times |t|}$
\item $c(\textrm{'翻译'},\textrm{'translation'};\mathbf{s},\mathbf{t})=\only<1>{?}\visible<2->{1}\visible<3->{+1}\visible<4->{+1}\visible<5->{+1=4}$
\item<6-> $\sum_{x',y'} c(x',y';\mathbf{s},\mathbf{t})= \textrm{使劲数...} = 63\visible<7->{ = 9 \times 7 = |\mathbf{s}| \times |\mathbf{t}|}$
\vspace{0.3em}
\begin{itemize}
\item<7-> $|\cdot|$表示句子长度
......@@ -889,14 +889,14 @@
\vspace{0.3em}
\item<8-> '翻译'和'translation'的互译概率为
\begin{displaymath}
\textrm{P}(\textrm{'翻译'},\textrm{'translation'};s,t) = 4/63
\textrm{P}(\textrm{'翻译'},\textrm{'translation'};\mathbf{s},\mathbf{t}) = 4/63
\end{displaymath}
\vspace{-0.5em}
类似的
\vspace{-0.5em}
\begin{eqnarray}
\textrm{P}(\textrm{'机器'},\textrm{'translation'};s,t) & = & 2/63 \nonumber \\
\textrm{P}(\textrm{'机器'},\textrm{'look'};s,t) & = & 0/63 \nonumber
\textrm{P}(\textrm{'机器'},\textrm{'translation'};\mathbf{s},\mathbf{t}) & = & 2/63 \nonumber \\
\textrm{P}(\textrm{'机器'},\textrm{'look'};\mathbf{s},\mathbf{t}) & = & 0/63 \nonumber
\end{eqnarray}
\end{itemize}
......@@ -909,12 +909,12 @@
\begin{itemize}
\item 很多时候,我们有多个互译句对$(s^{[1]},t^{[1]}),...,(s^{[n]},t^{[n]})$,称之为\alert{双语平行数据(语料)}。翻译概率可以被定义为
\item 很多时候,我们有多个互译句对$(\mathbf{s}^{[1]},\mathbf{t}^{[1]}),...,(\mathbf{s}^{[n]},\mathbf{t}^{[n]})$,称之为\alert{双语平行数据(语料)}。翻译概率可以被定义为
\vspace{-1em}
\begin{eqnarray}
\textrm{P}(x,y) & = & \frac{\sum_{i=1}^{n}c(x,y;s^{[i]},t^{[i]})}{\sum_{i=1}^{n} \sum_{x',y'} c(x',y';s^{[i]},t^{[i]})} \nonumber
\textrm{P}(x,y) & = & \frac{\sum_{i=1}^{n}c(x,y;\mathbf{s}^{[i]},\mathbf{t}^{[i]})}{\sum_{i=1}^{n} \sum_{x',y'} c(x',y';\mathbf{s}^{[i]},\mathbf{t}^{[i]})} \nonumber
\end{eqnarray}
\item<2-> 说白了就是计算$(x,y)$的频次时,在每个句子上累加
......@@ -922,11 +922,11 @@
\begin{flushleft}
\begin{tikzpicture}
\node [anchor=west] (s1) at (0,0) {$s_1=$ 机器 翻译 就 是 用 计算机 进行 翻译};
\node [anchor=north west] (t1) at ([yshift=0.4em]s1.south west) {$t_1=$ Machine translation is just translation by computer};
\node [anchor=west] (s1) at (0,0) {$\mathbf{s}^{[1]}=$ 机器 翻译 就 是 用 计算机 进行 翻译};
\node [anchor=north west] (t1) at ([yshift=0.4em]s1.south west) {$\mathbf{t}^{[1]}=$ Machine translation is just translation by computer};
\node [anchor=north west] (s2) at (t1.south west) {$s_2=$ 那 人工 翻译 呢 ?};
\node [anchor=north west] (t2) at ([yshift=0.4em]s2.south west) {$t_2=$ So , what is human translation ?};
\node [anchor=north west] (s2) at (t1.south west) {$\mathbf{s}^{[2]}=$ 那 人工 翻译 呢 ?};
\node [anchor=north west] (t2) at ([yshift=0.4em]s2.south west) {$\mathbf{t}^{[2]}=$ So , what is human translation ?};
\end{tikzpicture}
\end{flushleft}
......@@ -936,8 +936,8 @@
{\footnotesize
\begin{eqnarray}
& & \textrm{P}(\textrm{'翻译'},\textrm{'translation'}) \nonumber \\
& = & \frac{c(\textrm{'翻译'},\textrm{'translation'};s^{[1]},t^{[1]})+c(\textrm{'翻译'},\textrm{'translation'};s^{[2]},t^{[2]})}{\sum_{x',y'} c(x',y';s^{[1]},t^{[1]}) + \sum_{x',y'} c(x',y';s^{[2]},t^{[2]})} \nonumber \\
\visible<3->{& = & \frac{4 + 1}{|s^{[1]}| \times |t^{[1]}| + |s^{[2]}| \times |t^{[2]}|} = \frac{4 + 1}{9 \times 7 + 5 \times 7} = \frac{5}{98}} \nonumber
& = & \frac{c(\textrm{'翻译'},\textrm{'translation'};\mathbf{s}^{[1]},\mathbf{t}^{[1]})+c(\textrm{'翻译'},\textrm{'translation'};\mathbf{s}^{[2]},\mathbf{t}^{[2]})}{\sum_{x',y'} c(x',y';\mathbf{s}^{[1]},\mathbf{t}^{[1]}) + \sum_{x',y'} c(x',y';\mathbf{s}^{[2]},\mathbf{t}^{[2]})} \nonumber \\
\visible<3->{& = & \frac{4 + 1}{|\mathbf{s}^{[1]}| \times |\mathbf{t}^{[1]}| + |\mathbf{s}^{[2]}| \times |\mathbf{t}^{[2]}|} = \frac{4 + 1}{9 \times 7 + 5 \times 7} = \frac{5}{98}} \nonumber
\end{eqnarray}
}
......@@ -964,7 +964,7 @@
\end{pgfonlayer}
}
\node [anchor=west,ugreen] (P) at ([xshift=4em,yshift=-0.7em]corpus.east){P($t|s$)};
\node [anchor=west,ugreen] (P) at ([xshift=4em,yshift=-0.7em]corpus.east){P($\mathbf{t}|\mathbf{s}$)};
\node [anchor=south] (modellabel) at (P.north) {{\color{ublue} {\scriptsize \textbf{翻译模型}}}};
\begin{pgfonlayer}{background}
......@@ -992,14 +992,14 @@
\end{center}
\begin{itemize}
\item \textbf{步骤2}: 对任意的句对$(s,t)$计算句子级翻译概率$\textrm{P}(t|s)$ \\
\item \textbf{步骤2}: 对任意的句对$(\mathbf{s},\mathbf{t})$计算句子级翻译概率$\textrm{P}(\mathbf{t}|\mathbf{s})$ \\
\vspace{0.5em}
\visible<2->{
用一种比较简单的思路:定义$(s,t)$上的一种分数$g(s,t)$
用一种比较简单的思路:定义$(\mathbf{s},\mathbf{t})$上的一种分数$g(\mathbf{s},\mathbf{t})$
\begin{itemize}
\item $g(s,t)$的值越大翻译质量越好
\item $g(s,t)$的值越小翻译质量越差
\item $g(\mathbf{s},\mathbf{t})$的值越大翻译质量越好
\item $g(\mathbf{s},\mathbf{t})$的值越小翻译质量越差
\end{itemize}
}
......@@ -1009,10 +1009,10 @@
于是,我们进一步定义
\begin{displaymath}
\textrm{P}(t|s) = \frac{g(s,t)}{\sum_{t'}g(s,t')}
\textrm{P}(\mathbf{t}|\mathbf{s}) = \frac{g(\mathbf{s},\mathbf{t})}{\sum_{\mathbf{t}'}g(\mathbf{s},\mathbf{t}')}
\end{displaymath}
实际上就是对$g(s,t)$在所有可能的译文集合上做归一化,使其具有概率意义
实际上就是对$g(\mathbf{s},\mathbf{t})$在所有可能的译文集合上做归一化,使其具有概率意义
}
\end{itemize}
......@@ -1030,11 +1030,11 @@
\item 两个问题
\begin{enumerate}
\item 如何计算$g(s,t)$ \visible<2->{- 最关键的建模问题,马上开始}
\item 如何计算$\sum_{t'} g(s,t')$ \visible<2->{- 实际上\alert{不用计算},后面再说}
\item 如何计算$g(\mathbf{s},\mathbf{t})$ \visible<2->{- 最关键的建模问题,马上开始}
\item 如何计算$\sum_{\mathbf{t}'} g(\mathbf{s},\mathbf{t}')$ \visible<2->{- 实际上\alert{不用计算},后面再说}
\end{enumerate}
\item<3-> \textbf{$g(s,t)$建模: }根据本章第一页的假设,$s$和$t$之间存在一种单词间的对应,我们称之为\alert{词对齐}关系
\item<3-> \textbf{$g(\mathbf{s},\mathbf{t})$建模: }根据本章第一页的假设,$\mathbf{s}$和$\mathbf{t}$之间存在一种单词间的对应,我们称之为\alert{词对齐}关系
\begin{center}
\begin{tikzpicture}
......@@ -1045,7 +1045,7 @@
\node [anchor=west] (s3) at ([xshift=0.5em]s2.east) {你\footnotesize{$_3$}};
\node [anchor=west] (s4) at ([xshift=0.5em]s3.east) {感到\footnotesize{$_4$}};
\node [anchor=west] (s5) at ([xshift=0.5em]s4.east) {满意\footnotesize{$_5$}};
\node [anchor=east] (s) at (s1.west) {$s=$};
\node [anchor=east] (s) at (s1.west) {$\mathbf{s}=$};
\end{scope}
\begin{scope}[yshift=-3.0em]
......@@ -1054,7 +1054,7 @@
\node [anchor=west] (t3) at ([xshift=0.3em,yshift=0.1em]t2.east) {satisfied\footnotesize{$_3$}};
\node [anchor=west] (t4) at ([xshift=0.3em]t3.east) {with\footnotesize{$_4$}};
\node [anchor=west] (t5) at ([xshift=0.3em,yshift=-0.2em]t4.east) {you\footnotesize{$_5$}};
\node [anchor=east] (t) at ([xshift=-0.3em]t1.west) {$t=$};
\node [anchor=east] (t) at ([xshift=-0.3em]t1.west) {$\mathbf{t}=$};
\end{scope}
......@@ -1082,9 +1082,9 @@
\begin{frame}{实现一个简单的机器翻译系统:句子级翻译模型(3)}
\begin{itemize}
\item 给定一个句对$(s,t)$,及它们之间的(最优)词对齐$\hat{A}$,可以定义模型得分为:
\item 给定一个句对$(\mathbf{s},\mathbf{t})$,及它们之间的(最优)词对齐$\hat{A}$,可以定义模型得分为:
\begin{displaymath}
g(s,t) \equiv \prod_{(j,i) \in \hat{A}} \textrm{P}(s_j,t_i)
g(\mathbf{s},\mathbf{t}) \equiv \prod_{(j,i) \in \hat{A}} \textrm{P}(s_j,t_i)
\end{displaymath}
显然每个单词翻译概率都高,那么整句的模型得分也高
......@@ -1097,7 +1097,7 @@ g(s,t) \equiv \prod_{(j,i) \in \hat{A}} \textrm{P}(s_j,t_i)
\node [anchor=west] (s3) at ([xshift=0.5em]s2.east) {你\footnotesize{$_3$}};
\node [anchor=west] (s4) at ([xshift=0.5em]s3.east) {感到\footnotesize{$_4$}};
\node [anchor=west] (s5) at ([xshift=0.5em]s4.east) {满意\footnotesize{$_5$}};
\node [anchor=east] (s) at (s1.west) {$s=$};
\node [anchor=east] (s) at (s1.west) {$\mathbf{s}=$};
\end{scope}
\begin{scope}[yshift=-3.0em]
......@@ -1106,7 +1106,7 @@ g(s,t) \equiv \prod_{(j,i) \in \hat{A}} \textrm{P}(s_j,t_i)
\node [anchor=west] (t3) at ([xshift=0.3em,yshift=0.1em]t2.east) {satisfied\footnotesize{$_3$}};
\node [anchor=west] (t4) at ([xshift=0.3em]t3.east) {with\footnotesize{$_4$}};
\node [anchor=west] (t5) at ([xshift=0.3em,yshift=-0.2em]t4.east) {you\footnotesize{$_5$}};
\node [anchor=east] (t) at ([xshift=-0.3em]t1.west) {$t=$};
\node [anchor=east] (t) at ([xshift=-0.3em]t1.west) {$\mathbf{t}=$};
\end{scope}
......@@ -1122,7 +1122,7 @@ g(s,t) \equiv \prod_{(j,i) \in \hat{A}} \textrm{P}(s_j,t_i)
\vspace{-2.5em}
\begin{eqnarray}
g(s,t) & = & \textrm{P}(\textrm{'我','I'}) \times \textrm{P}(\textrm{'对','with'}) \times \textrm{P}(\textrm{'你','you'}) \times \nonumber \\
g(\mathbf{s},\mathbf{t}) & = & \textrm{P}(\textrm{'我','I'}) \times \textrm{P}(\textrm{'对','with'}) \times \textrm{P}(\textrm{'你','you'}) \times \nonumber \\
& & \textrm{P}(\textrm{'感到','am'}) \times \textrm{P}(\textrm{'满意','satisfied'}) \nonumber
\end{eqnarray}
......@@ -1141,13 +1141,13 @@ g(s,t) & = & \textrm{P}(\textrm{'我','I'}) \times \textrm{P}(\textrm{'对','with'
\begin{frame}{实现一个简单的机器翻译系统:句子级翻译模型(4)}
\begin{itemize}
\item \textbf{但是},这样设计的$g(s,t)$没有考虑词序的信息。相同译词出现在不同的位置,得分相同 - 无法选择流畅的译文
\item \textbf{但是},这样设计的$g(\mathbf{s},\mathbf{t})$没有考虑词序的信息。相同译词出现在不同的位置,得分相同 - 无法选择流畅的译文
\vspace{0.5em}
\begin{tabular}{l | l }
& \footnotesize{$\prod\limits_{(j,i) \in \hat{A}} \textrm{P}(s_j,t_i)$} \visible<2->{\alert{\footnotesize{$\times\textrm{P}_{lm}(t)$}}} \\ \hline
& \footnotesize{$\prod\limits_{(j,i) \in \hat{A}} \textrm{P}(s_j,t_i)$} \visible<2->{\alert{\footnotesize{$\times\textrm{P}_{lm}(\mathbf{t})$}}} \\ \hline
\begin{tikzpicture}
......@@ -1160,7 +1160,7 @@ g(s,t) & = & \textrm{P}(\textrm{'我','I'}) \times \textrm{P}(\textrm{'对','with'
\node [anchor=west] (s3) at ([xshift=0.5em]s2.east) {你\footnotesize{$_3$}};
\node [anchor=west] (s4) at ([xshift=0.5em]s3.east) {感到\footnotesize{$_4$}};
\node [anchor=west] (s5) at ([xshift=0.5em]s4.east) {满意\footnotesize{$_5$}};
\node [anchor=east] (s) at (s1.west) {$s=$};
\node [anchor=east] (s) at (s1.west) {$\mathbf{s}=$};
\end{scope}
\begin{scope}[yshift=-2.6em]
......@@ -1169,7 +1169,7 @@ g(s,t) & = & \textrm{P}(\textrm{'我','I'}) \times \textrm{P}(\textrm{'对','with'
\node [anchor=west] (t3) at ([xshift=0.3em,yshift=0.1em]t2.east) {satisfied\footnotesize{$_3$}};
\node [anchor=west] (t4) at ([xshift=0.3em]t3.east) {with\footnotesize{$_4$}};
\node [anchor=west] (t5) at ([xshift=0.3em,yshift=-0.2em]t4.east) {you\footnotesize{$_5$}};
\node [anchor=east] (t) at ([xshift=-0.3em]t1.west) {$t'=$};
\node [anchor=east] (t) at ([xshift=-0.3em]t1.west) {$\mathbf{t}'=$};
\end{scope}
......@@ -1197,7 +1197,7 @@ g(s,t) & = & \textrm{P}(\textrm{'我','I'}) \times \textrm{P}(\textrm{'对','with'
\node [anchor=west] (s3) at ([xshift=0.5em]s2.east) {你\footnotesize{$_3$}};
\node [anchor=west] (s4) at ([xshift=0.5em]s3.east) {感到\footnotesize{$_4$}};
\node [anchor=west] (s5) at ([xshift=0.5em]s4.east) {满意\footnotesize{$_5$}};
\node [anchor=east] (s) at (s1.west) {$s=$};
\node [anchor=east] (s) at (s1.west) {$\mathbf{s}=$};
\end{scope}
\begin{scope}[yshift=-2.6em]
......@@ -1206,7 +1206,7 @@ g(s,t) & = & \textrm{P}(\textrm{'我','I'}) \times \textrm{P}(\textrm{'对','with'
\node [anchor=center] (t3) at ([yshift=-1.7em]s3.south) {you\footnotesize{$_3$}};
\node [anchor=center] (t4) at ([yshift=-1.7em]s4.south) {am\footnotesize{$_4$}};
\node [anchor=center] (t5) at ([yshift=-1.6em]s5.south) {satisfied\footnotesize{$_5$}};
\node [anchor=center] (t) at ([xshift=-1.3em]t1.west) {$t''=$};
\node [anchor=center] (t) at ([xshift=-1.3em]t1.west) {$\mathbf{t}''=$};
\end{scope}
......@@ -1225,7 +1225,7 @@ g(s,t) & = & \textrm{P}(\textrm{'我','I'}) \times \textrm{P}(\textrm{'对','with'
\end{tabular}
\item<2-> \textbf{解决方案}:引入语言模型$\textrm{P}_{lm}(t)$来度量译文的流畅度
\item<2-> \textbf{解决方案}:引入语言模型$\textrm{P}_{lm}(\mathbf{t})$来度量译文的流畅度
$\textrm{P}_{\textrm{2-gram}}(w_1...w_m)=\textrm{P}(w_1) \times \textrm{P}(w_2 | w_1) \times \textrm{P}(w_3 | w_2) ... \times \textrm{P}(w_m | w_{m-1})$
......@@ -1234,7 +1234,7 @@ $\textrm{P}_{\textrm{2-gram}}(w_1...w_m)=\textrm{P}(w_1) \times \textrm{P}(w_2 |
\vspace{-1em}
\begin{displaymath}
g(s,t)=\prod\nolimits_{(j,i) \in \hat{A}} \textrm{P}(s_j,t_i) \times \textrm{P}_{lm}(t)
g(\mathbf{s},\mathbf{t})=\prod\nolimits_{(j,i) \in \hat{A}} \textrm{P}(s_j,t_i) \times \textrm{P}_{lm}(\mathbf{t})
\end{displaymath}
\end{itemize}
......@@ -1260,7 +1260,7 @@ g(s,t)=\prod\nolimits_{(j,i) \in \hat{A}} \textrm{P}(s_j,t_i) \times \textrm{P}_
\end{pgfonlayer}
}
\node [anchor=west,ugreen] (P) at ([xshift=4em,yshift=-0.7em]corpus.east){P($t|s$)};
\node [anchor=west,ugreen] (P) at ([xshift=4em,yshift=-0.7em]corpus.east){P($\mathbf{t}|\mathbf{s}$)};
\node [anchor=south] (modellabel) at (P.north) {{\color{ublue} {\scriptsize \textbf{翻译模型}}}};
\begin{pgfonlayer}{background}
......@@ -1287,16 +1287,16 @@ g(s,t)=\prod\nolimits_{(j,i) \in \hat{A}} \textrm{P}(s_j,t_i) \times \textrm{P}_
\end{center}
\begin{itemize}
\item \textbf{步骤3:解码 - }对任意的$s$,找到翻译概率最大的译文$\hat{t}$
\item \textbf{步骤3:解码 - }对任意的$\mathbf{s}$,找到翻译概率最大的译文$\hat{\mathbf{t}}$
\begin{displaymath}
\hat{t} = \argmax_{t} \textrm{P}(t|s)
\hat{\mathbf{t}} = \argmax_{\mathbf{t}} \textrm{P}(\mathbf{t}|\mathbf{s})
\end{displaymath}
这里$\argmax_{a} f(a)$表示找到使$f(a)$达到最大的$a$输出
这里$\argmax_{\mathbf{a}} f(\mathbf{a})$表示找到使$f(\mathbf{a})$达到最大的$\mathbf{a}$输出
\item<2-> 现在我们可以对任意的$(s,t)$计算$\textrm{P}(t|s) = \frac{g(s,t)}{\sum_{t'}g(s,t')}$
\item<2-> 现在我们可以对任意的$(\mathbf{s},\mathbf{t})$计算$\textrm{P}(\mathbf{t}|\mathbf{s}) = \frac{g(\mathbf{s},\mathbf{t})}{\sum_{\mathbf{t}'}g(\mathbf{s},\mathbf{t}')}$
\begin{itemize}
\item 给定$s$,$\sum_{t'}g(s,t')$是个常数(因为$\sum_{t'}g(s,t')$的变量只有$s$)
\item 给定$\mathbf{s}$,$\sum_{\mathbf{t}'}g(\mathbf{s},\mathbf{t}')$是个常数(因为$\sum_{\mathbf{t}'}g(\mathbf{s},\mathbf{t}')$的变量只有$\mathbf{s}$)
\item \textbf{这样,我们得到解码步骤的形式化描述为}
\end{itemize}
......@@ -1304,8 +1304,8 @@ g(s,t)=\prod\nolimits_{(j,i) \in \hat{A}} \textrm{P}(s_j,t_i) \times \textrm{P}_
\vspace{-1em}
\begin{eqnarray}
\hat{t} & = & \argmax_{t} \frac{g(s,t)}{\sum_{t'}g(s,t')} \nonumber \\
& = & \argmax_{t} g(s,t) \nonumber
\hat{\mathbf{t}} & = & \argmax_{\mathbf{t}} \frac{g(\mathbf{s},\mathbf{t})}{\sum_{\mathbf{t}'}g(\mathbf{s},\mathbf{t}')} \nonumber \\
& = & \argmax_{\mathbf{t}} g(\mathbf{s},\mathbf{t}) \nonumber
\end{eqnarray}
\end{itemize}
......@@ -1317,15 +1317,15 @@ g(s,t)=\prod\nolimits_{(j,i) \in \hat{A}} \textrm{P}(s_j,t_i) \times \textrm{P}_
\begin{frame}{实现一个简单的机器翻译系统:解码(2)}
\vspace{0.5em}
\begin{itemize}
\item \textbf{解码的核心问题}是在所有可能的翻译结果中找到使$g(s,t)$达到最大的译文\\
\item \textbf{解码的核心问题}是在所有可能的翻译结果中找到使$g(\mathbf{s},\mathbf{t})$达到最大的译文\\
\vspace{-1em}
\begin{minipage}[t]{0.58\linewidth}
\begin{itemize}
\item 若$s$有$m$个词,每个词有$n$个翻译候选 - 共有$n^m$种组合
\item 若$\mathbf{s}$有$m$个词,每个词有$n$个翻译候选 - 共有$n^m$种组合
\vspace{-0.5em}
\item<2-> 词的翻译候选可以任意调序
\vspace{-0.5em}
\item<3-> $s$对应可能的译文至少有$n^m \cdot m!$
\item<3-> $\mathbf{s}$对应可能的译文至少有$n^m \cdot m!$
\end{itemize}
\end{minipage}
\hfill
......@@ -1408,8 +1408,8 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
{\scriptsize
\node [anchor=north west,inner sep=2pt,align=left] (line1) at (0,0) {\textrm{\textbf{Function} \textsc{WordDecoding}($s$)}};
\node [anchor=north west,inner sep=2pt,align=left] (line2) at ([yshift=-1pt]line1.south west) {\textrm{1: $\pi = $\textsc{GetTransOptions}($s$)}};
\node [anchor=north west,inner sep=2pt,align=left] (line1) at (0,0) {\textrm{\textbf{Function} \textsc{WordDecoding}($\mathbf{s}$)}};
\node [anchor=north west,inner sep=2pt,align=left] (line2) at ([yshift=-1pt]line1.south west) {\textrm{1: $\pi = $\textsc{GetTransOptions}($\mathbf{s}$)}};
\node [anchor=north west,inner sep=2pt,align=left] (line3) at ([yshift=-1pt]line2.south west) {\textrm{2: $best = \phi$}};
\node [anchor=north west,inner sep=2pt,align=left] (line4) at ([yshift=-1pt]line3.south west) {\textrm{3: \textbf{for} $i$ in $[1,m]$ \textbf{do}}};
\node [anchor=north west,inner sep=2pt,align=left] (line5) at ([yshift=-1pt]line4.south west) {\textrm{4: \hspace{1em} $h = \phi$}};
......@@ -1421,7 +1421,7 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
\node [anchor=north west,inner sep=2pt,align=left] (line11) at ([yshift=-1pt]line10.south west) {\textrm{10: \textbf{return} $best.translation$}};
\node [anchor=south west,inner sep=2pt,align=left] (head1) at ([yshift=1pt]line1.north west) {输出: 找到的最佳译文};
\node [anchor=south west,inner sep=2pt,align=left] (head2) at ([yshift=1pt]head1.north west) {输入: 源语句子$s=s_1...s_m$};
\node [anchor=south west,inner sep=2pt,align=left] (head2) at ([yshift=1pt]head1.north west) {输入: 源语句子$\mathbf{s}=s_1...s_m$};
}
......@@ -1602,8 +1602,8 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
{\tiny
\node [anchor=north west,inner sep=2pt,align=left] (line1) at (0,0) {\textrm{\textbf{Function} \textsc{WordDecoding}($s$)}};
\node [anchor=north west,inner sep=2pt,align=left] (line2) at ([yshift=-3pt]line1.south west) {\textrm{1: $\pi = $\textsc{GetTransOptions}($s$)}};
\node [anchor=north west,inner sep=2pt,align=left] (line1) at (0,0) {\textrm{\textbf{Function} \textsc{WordDecoding}($\mathbf{s}$)}};
\node [anchor=north west,inner sep=2pt,align=left] (line2) at ([yshift=-3pt]line1.south west) {\textrm{1: $\pi = $\textsc{GetTransOptions}($\mathbf{s}$)}};
\node [anchor=north west,inner sep=2pt,align=left] (line3) at ([yshift=-3pt]line2.south west) {\textrm{2: $best = \phi$}};
\node [anchor=north west,inner sep=2pt,align=left] (line4) at ([yshift=-3pt]line3.south west) {\textrm{3: \textbf{for} $i$ in $[1,m]$ \textbf{do}}};
\node [anchor=north west,inner sep=2pt,align=left] (line5) at ([yshift=-3pt]line4.south west) {\textrm{4: \hspace{1em} $h = \phi$}};
......@@ -1620,7 +1620,7 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
}
\node [anchor=south west,inner sep=2pt,align=left] (head1) at ([yshift=3pt]line1.north west) {输出: 找到的最佳译文};
\node [anchor=south west,inner sep=2pt,align=left] (head2) at ([yshift=3pt]head1.north west) {输入: 源语句子$s=s_1...s_m$};
\node [anchor=south west,inner sep=2pt,align=left] (head2) at ([yshift=3pt]head1.north west) {输入: 源语句子$\mathbf{s}=s_1...s_m$};
}
......@@ -1900,7 +1900,7 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
}
\visible<6->{
\node [anchor=north west] (glabel) at (hlabel.south west) {$g(s,t)$};
\node [anchor=north west] (glabel) at (hlabel.south west) {$g(\mathbf{s},\mathbf{t})$};
\node [anchor=west] (translabel) at (glabel.east) {翻译结果};
\draw [-] (glabel.north east) -- ([yshift=-1.9in]glabel.north east);
\draw [-] (glabel.south west) -- ([xshift=3.5in]glabel.south west);
......@@ -2206,7 +2206,7 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
\begin{frame}{机器翻译的统计建模}
\begin{itemize}
\item \textbf{一个人在做翻译时}:对于给定的源语言句子$s$,可以翻译为一个(或者若干个)正确的译文$\hat{t}$
\item \textbf{一个人在做翻译时}:对于给定的源语言句子$\mathbf{s}$,可以翻译为一个(或者若干个)正确的译文$\hat{\mathbf{t}}$
\begin{itemize}
\item 也就是说除了正确的译文,其它的翻译都是不正确的
\end{itemize}
......@@ -2214,20 +2214,20 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
\begin{center}
\begin{tikzpicture}
\node [draw,red,fill=red!10,thick,anchor=center,circle,inner sep=3.5pt] (s) at (0,0) {\black{$s$}};
\node [draw,ublue,fill=blue!10,thick,anchor=center,circle,inner sep=2pt] (t) at ([xshift=1in]s.east) {\black{$\hat{t}$}};
\node [draw,red,fill=red!10,thick,anchor=center,circle,inner sep=3.5pt] (s) at (0,0) {\black{$\mathbf{s}$}};
\node [draw,ublue,fill=blue!10,thick,anchor=center,circle,inner sep=2pt] (t) at ([xshift=1in]s.east) {\black{$\hat{\mathbf{t}}$}};
\draw [->,thick,] (s.north east) .. controls +(north east:1em) and +(north west:1em).. (t.north west) node[pos=0.5,below] {\tiny{正确翻译}};
\end{tikzpicture}
\end{center}
\item<2-> \textbf{统计机器翻译的思想是}:对于$s$,所有可能的目标语词串$t$都是可能的译文。每一对($s$,$t$)都有一个概率值$\textrm{P}(t|s)$ 来描述$s$ 翻译为$t$的好与坏
\item<2-> \textbf{统计机器翻译的思想是}:对于$\mathbf{s}$,所有可能的目标语词串$\mathbf{t}$都是可能的译文。每一对($\mathbf{s}$,$\mathbf{t}$)都有一个概率值$\textrm{P}(\mathbf{t}|\mathbf{s})$ 来描述$\mathbf{s}$ 翻译为$\mathbf{t}$的好与坏
\begin{center}
\begin{tikzpicture}
\node [draw,red,fill=red!10,thick,anchor=center,circle,inner sep=3.5pt] (s) at (0,0) {\black{$s$}};
\node [draw,red,fill=red!10,thick,anchor=center,circle,inner sep=3.5pt] (s) at (0,0) {\black{$\mathbf{s}$}};
\node [draw,ublue,fill=blue!10,thick,anchor=center,circle,inner sep=2pt] (t1) at ([xshift=1in]s.east) {\black{$t_1$}};
\node [draw,ublue,fill=blue!10,thick,anchor=center,circle,inner sep=2pt] (t2) at ([xshift=3em,yshift=2em]t1.north east) {\black{$t_2$}};
\node [draw,ublue,fill=blue!10,thick,anchor=center,circle,inner sep=2pt] (t3) at ([xshift=1em,yshift=4em]t1.north east) {\black{$t_3$}};
......@@ -2237,10 +2237,10 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
\node [draw,dashed,ublue,fill=blue!10,thick,anchor=center,circle,minimum size=18pt] (t6) at ([xshift=3em]t2.east) {};
\node [draw,dashed,ublue,fill=blue!10,thick,anchor=center,circle,minimum size=18pt] (t7) at ([xshift=3em]t4.east) {};
\draw [->,thick,] (s.north east) .. controls +(north east:1em) and +(north west:1em).. (t1.north west) node[pos=0.5,below] {\tiny{P ($t_1|s$)=0.1}};
\draw [->,thick,] (s.60) .. controls +(50:4em) and +(west:1em).. (t2.west) node[pos=0.5,below] {\tiny{P($t_2|s$)=0.2}};
\draw [->,thick,] (s.north) .. controls +(70:4em) and +(west:1em).. (t3.west) node[pos=0.5,above,xshift=-1em] {\tiny{P($t_3|s$)=0.3}};
\draw [->,thick,] (s.south east) .. controls +(300:3em) and +(south west:1em).. (t4.south west) node[pos=0.5,below] {\tiny{P($t_4|s$)=0.1}};
\draw [->,thick,] (s.north east) .. controls +(north east:1em) and +(north west:1em).. (t1.north west) node[pos=0.5,below] {\tiny{P ($t_1|\mathbf{s}$)=0.1}};
\draw [->,thick,] (s.60) .. controls +(50:4em) and +(west:1em).. (t2.west) node[pos=0.5,below] {\tiny{P($t_2|\mathbf{s}$)=0.2}};
\draw [->,thick,] (s.north) .. controls +(70:4em) and +(west:1em).. (t3.west) node[pos=0.5,above,xshift=-1em] {\tiny{P($t_3|\mathbf{s}$)=0.3}};
\draw [->,thick,] (s.south east) .. controls +(300:3em) and +(south west:1em).. (t4.south west) node[pos=0.5,below] {\tiny{P($t_4|\mathbf{s}$)=0.1}};
\end{tikzpicture}
\end{center}
......@@ -2254,13 +2254,13 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
\begin{frame}{噪声信道模型}
\begin{itemize}
\item \textbf{噪声信道模型}:源语言句子$s$(信宿)是由目标语句子$t$(信源)经过一个有噪声的信道得到的。如果知道了$s$和信道的性质,我们可以通过$\textrm{P}(t|s)$得到可能的信源的概率。\\
\item \textbf{噪声信道模型}:源语言句子$\mathbf{s}$(信宿)是由目标语句子$\mathbf{t}$(信源)经过一个有噪声的信道得到的。如果知道了$\mathbf{s}$和信道的性质,我们可以通过$\textrm{P}(\mathbf{t}|\mathbf{s})$得到可能的信源的概率。\\
\begin{center}
\begin{tikzpicture}
\node [draw,red,fill=red!10,thick,anchor=center,circle,inner sep=3.5pt] (s) at (0,0) {\black{$s$}};
\node [draw,ublue,fill=blue!10,thick,anchor=center,circle,inner sep=3.3pt] (t) at ([xshift=1.5in]s.east) {\black{$t$}};
\node [draw,red,fill=red!10,thick,anchor=center,circle,inner sep=3.5pt] (s) at (0,0) {\black{$\mathbf{s}$}};
\node [draw,ublue,fill=blue!10,thick,anchor=center,circle,inner sep=3.3pt] (t) at ([xshift=1.5in]s.east) {\black{$\mathbf{t}$}};
\draw [<->,thick,] (s.east) -- (t.west) node [pos=0.5,draw,fill=white] {噪声信道};
\node [anchor=east] at (s.west) {\scriptsize{信宿}};
......@@ -2272,13 +2272,13 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
而通过上述过程找到最可能的信源的过程被称之为\alert{解码}
\begin{displaymath}
\hat{t} = \argmax_{t} \textrm{P}(t|s)
\hat{\mathbf{t}} = \argmax_{\mathbf{t}} \textrm{P}(\mathbf{t}|\mathbf{s})
\end{displaymath}
\item<2-> \textbf{贝叶斯变换}
\begin{eqnarray}
\textrm{P}(t|s) & = & \frac{\textrm{P}(s,t)}{\textrm{P}(s)} \nonumber \\
& = & \frac{\textrm{P}(s|t) \textrm{P}(t)}{\textrm{P}(s)} \nonumber
\textrm{P}(\mathbf{t}|\mathbf{s}) & = & \frac{\textrm{P}(\mathbf{s},\mathbf{t})}{\textrm{P}(\mathbf{s})} \nonumber \\
& = & \frac{\textrm{P}(\mathbf{s}|\mathbf{t}) \textrm{P}(\mathbf{t})}{\textrm{P}(\mathbf{s})} \nonumber
\end{eqnarray}
\end{itemize}
......@@ -2291,26 +2291,26 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
\begin{center}
\begin{tikzpicture}
\node [anchor=west] (p) at (0,0) {$\textrm{P}(t|s)$};
\node [anchor=west] (p) at (0,0) {$\textrm{P}(\mathbf{t}|\mathbf{s})$};
\node [anchor=west] (eqiv) at (p.east) {=};
\node [anchor=south west,inner sep=2pt] (transmodel) at ([yshift=-2pt]eqiv.north east) {$\textrm{P}(s|t)$};
\node [anchor=west,inner sep=2pt] (lmmodel) at ([xshift=2pt]transmodel.east) {$\textrm{P}(t)$};
\node [anchor=north west,inner sep=2pt] (sp) at ([yshift=4pt,xshift=1.5em]eqiv.south east) {$\textrm{P}(s)$};
\node [anchor=south west,inner sep=2pt] (transmodel) at ([yshift=-2pt]eqiv.north east) {$\textrm{P}(\mathbf{s}|\mathbf{t})$};
\node [anchor=west,inner sep=2pt] (lmmodel) at ([xshift=2pt]transmodel.east) {$\textrm{P}(\mathbf{t})$};
\node [anchor=north west,inner sep=2pt] (sp) at ([yshift=4pt,xshift=1.5em]eqiv.south east) {$\textrm{P}(\mathbf{s})$};
\visible<2->{
\node [anchor=south west,fill=red!20,inner sep=2pt] (transmodel) at ([yshift=-2pt]eqiv.north east) {$\textrm{P}(s|t)$};
\node [anchor=west,fill=blue!20,inner sep=2pt] (lmmodel) at ([xshift=2pt]transmodel.east) {$\textrm{P}(t)$};
\node [anchor=north west,fill=green!20,inner sep=2pt] (sp) at ([yshift=4pt,xshift=1.5em]eqiv.south east) {$\textrm{P}(s)$};
\node [anchor=south west,fill=red!20,inner sep=2pt] (transmodel) at ([yshift=-2pt]eqiv.north east) {$\textrm{P}(\mathbf{s}|\mathbf{t})$};
\node [anchor=west,fill=blue!20,inner sep=2pt] (lmmodel) at ([xshift=2pt]transmodel.east) {$\textrm{P}(\mathbf{t})$};
\node [anchor=north west,fill=green!20,inner sep=2pt] (sp) at ([yshift=4pt,xshift=1.5em]eqiv.south east) {$\textrm{P}(\mathbf{s})$};
}
\draw [-] ([yshift=-4pt]transmodel.south west) -- ([yshift=-4pt]lmmodel.south east);
\visible<2->{
\node [anchor=south east,fill=red!20,draw,align=left] (tmmark) at ([xshift=0.5in,yshift=0.4in]p.north) {\footnotesize{给定信源$t$,得到信宿$s$ }\\\footnotesize{的概率,称作\textbf{翻译模型}}};
\node [anchor=south east,fill=red!20,draw,align=left] (tmmark) at ([xshift=0.5in,yshift=0.4in]p.north) {\footnotesize{给定信源$\mathbf{t}$,得到信宿$\mathbf{s}$ }\\\footnotesize{的概率,称作\textbf{翻译模型}}};
\node [anchor=west,fill=blue!20,draw,align=left] (lmmark) at ([xshift=0.5in]tmmark.east) {\footnotesize{信源$t$出现的概率}\\\footnotesize{称作\textbf{语言模型}}};
\node [anchor=west,fill=blue!20,draw,align=left] (lmmark) at ([xshift=0.5in]tmmark.east) {\footnotesize{信源$\mathbf{t}$出现的概率}\\\footnotesize{称作\textbf{语言模型}}};
\node [anchor=west,fill=green!20,draw,align=left] (smark) at ([xshift=0.2in,yshift=-0.3in]lmmodel.east) {\footnotesize{信宿$s$出现的概率}\\\footnotesize{给定$s$时$\textrm{P}(s)$是\textbf{常量}}};
\node [anchor=west,fill=green!20,draw,align=left] (smark) at ([xshift=0.2in,yshift=-0.3in]lmmodel.east) {\footnotesize{信宿$\mathbf{s}$出现的概率}\\\footnotesize{给定$\mathbf{s}$时$\textrm{P}(\mathbf{s})$是\textbf{常量}}};
\draw [->,thick] (transmodel.north) .. controls +(north:1.5em) and + (south:1.5em) .. (tmmark.south);
\draw [->,thick] (lmmodel.north) .. controls +(north:1.5em) and + (south:1.3em) .. (lmmark.south);
......@@ -2326,11 +2326,11 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
\vspace{-0.5em}
\begin{eqnarray}
\hat{t} & = & \argmax_{t} \frac{\textrm{P}(s|t) \textrm{P}(t)}{\textrm{P}(s)} \nonumber \\
& = & \argmax_{t} \textrm{P}(s|t) \textrm{P}(t) \nonumber
\hat{\mathbf{t}} & = & \argmax_{\mathbf{t}} \frac{\textrm{P}(\mathbf{s}|\mathbf{t}) \textrm{P}(\mathbf{t})}{\textrm{P}(\mathbf{s})} \nonumber \\
& = & \argmax_{\mathbf{t}} \textrm{P}(\mathbf{s}|\mathbf{t}) \textrm{P}(\mathbf{t}) \nonumber
\end{eqnarray}
即,在所有可能的译文中找到使翻译模型$\textrm{P}(s|t)$和语言模型$\textrm{P}(t)$乘积最大的译文
即,在所有可能的译文中找到使翻译模型$\textrm{P}(\mathbf{s}|\mathbf{t})$和语言模型$\textrm{P}(\mathbf{t})$乘积最大的译文
\end{itemize}
\
......@@ -2344,28 +2344,28 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
\begin{frame}{基本问题}
\begin{displaymath}
\hat{t} = \argmax_{t} \textrm{P}(s|t) \textrm{P}(t)
\hat{\mathbf{t}} = \argmax_{\mathbf{t}} \textrm{P}(\mathbf{s}|\mathbf{t}) \textrm{P}(\mathbf{t})
\end{displaymath}
\begin{itemize}
\item \textbf{三个基本问题}
\begin{enumerate}
\item \textbf{建模}:如何描述计算$\textrm{P}(s|t)$和$\textrm{P}(t)$的计算方式
\item \textbf{训练}:如何获得计算$\textrm{P}(s|t)$和$\textrm{P}(t)$所需的参数
\item \textbf{建模}:如何描述计算$\textrm{P}(\mathbf{s}|\mathbf{t})$和$\textrm{P}(\mathbf{t})$的计算方式
\item \textbf{训练}:如何获得计算$\textrm{P}(\mathbf{s}|\mathbf{t})$和$\textrm{P}(\mathbf{t})$所需的参数
\item \textbf{解码}:如何完成搜索最优解的过程$\argmax$
\end{enumerate}
\item<2-> 回忆一下本章开始的实例,是不是有似曾相识的感觉?
\vspace{0.5em}
\begin{center}
\begin{tikzpicture}
\node [anchor=west] (e1) at (0,0) {$g(s,t)$};
\node [anchor=west] (e1) at (0,0) {$g(\mathbf{s},\mathbf{t})$};
\node [anchor=west] (e2) at (e1.east) {$=$};
\node [anchor=west,inner sep=2pt,fill=red!20] (e3) at (e2.east) {$\prod\nolimits_{(j,i) \in \hat{A}} \textrm{P}(s_j,t_i)$};
\node [anchor=west,inner sep=1pt] (e4) at (e3.east) {$\times$};
\node [anchor=west,inner sep=3pt,fill=blue!20] (e5) at (e4.east) {$\textrm{P}_{lm}(t)$};
\node [anchor=north west,inner sep=1pt] (n1) at ([xshift=2.5em,yshift=-1em]e1.south west) {$\textrm{P}(s|t)$};
\node [anchor=west,inner sep=3pt,fill=blue!20] (e5) at (e4.east) {$\textrm{P}_{lm}(\mathbf{t})$};
\node [anchor=north west,inner sep=1pt] (n1) at ([xshift=2.5em,yshift=-1em]e1.south west) {$\textrm{P}(\mathbf{s}|\mathbf{t})$};
\node [anchor=north] (n1part2) at ([yshift=0.3em]n1.south) {\scriptsize{\textbf{翻译模型}}};
\node [anchor=west,inner sep=1pt] (n2) at ([xshift=2em]n1.east) {$\textrm{P}(t)$};
\node [anchor=west,inner sep=1pt] (n2) at ([xshift=2em]n1.east) {$\textrm{P}(\mathbf{t})$};
\node [anchor=north] (n2part2) at ([yshift=0.3em]n2.south) {\scriptsize{\textbf{语言模型}}};
\draw [->,thick] (e3.south) .. controls +(south:1em) and +(north:1em) .. (n1.north);
\draw [->,thick] (e5.south) .. controls +(south:1em) and +(70:1em) .. (n2.north);
......@@ -2389,13 +2389,13 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
\label{ibmmodelingstart}
\begin{itemize}
\item \textbf{$\textrm{P}(t)$和解码}在前面的内容中有介绍,下面重点求解$\textrm{P}(s|t)$,即:
\item \textbf{$\textrm{P}(\mathbf{t})$和解码}在前面的内容中有介绍,下面重点求解$\textrm{P}(\mathbf{s}|\mathbf{t})$,即:
\begin{itemize}
\item \textbf{翻译模型建模} - $\textrm{P}(s|t)$的计算方法
\item \textbf{翻译模型参数估计} - 计算$\textrm{P}(s|t)$所需的参数
\item \textbf{翻译模型建模} - $\textrm{P}(\mathbf{s}|\mathbf{t})$的计算方法
\item \textbf{翻译模型参数估计} - 计算$\textrm{P}(\mathbf{s}|\mathbf{t})$所需的参数
\end{itemize}
\vspace{0.5em}
\item<2-> \textbf{IBM模型的假设}:$s=s_1...s_m$和$t=t_1...t_n$之间有单词一级的对应,称作\alert{单词对齐}或者\alert{词对齐}。此外:
\item<2-> \textbf{IBM模型的假设}:$\mathbf{s}=s_1...s_m$和$\mathbf{t}=t_1...t_n$之间有单词一级的对应,称作\alert{单词对齐}或者\alert{词对齐}。此外:
\begin{itemize}
\item \textbf{约束}:一个源语言单词只能对应一个目标语单词
\vspace{0.5em}
......@@ -2462,10 +2462,10 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
%%%------------------------------------------------------------------------------------------------------------
%%% modeling P(s|t)
\begin{frame}{建模 - $\textrm{P}(s|t)$}
\begin{frame}{建模 - $\textrm{P}(\mathbf{s}|\mathbf{t})$}
\begin{itemize}
\item 给定$s$和$t$,它们之间的\alert{词对齐}被记为$a=a_1...a_m$
\item 给定$\mathbf{s}$和$\mathbf{t}$,它们之间的\alert{词对齐}被记为$\mathbf{a}=a_1...a_m$
\begin{itemize}
\item $a_j$表示第$j$个源语单词$s_j$对应的目标语单词的位置
\begin{center}
......@@ -2486,15 +2486,15 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
\end{tikzpicture}
\end{center}
\end{itemize}
\item<2-> \textbf{\alert{建模!!}}: $\textrm{P}(s|t)$被表示为所有可能的词对齐的生成概率\\
\item<2-> \textbf{\alert{建模!!}}: $\textrm{P}(\mathbf{s}|\mathbf{t})$被表示为所有可能的词对齐的生成概率\\
\vspace{-0.5em}
\begin{displaymath}
\textrm{P}(s|t) = \sum_{a} \textrm{P}(s,a|t)
\textrm{P}(\mathbf{s}|\mathbf{t}) = \sum_{\mathbf{a}} \textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})
\end{displaymath}\\
\vspace{-0.5em}
\visible<3->{
每一种$a$对应一个$\textrm{P}(s,a|t)$
每一种$\mathbf{a}$对应一个$\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})$
\vspace{-0.8em}
\begin{center}
......@@ -2639,7 +2639,7 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
\visible<4->{
\node [anchor=south east,inner sep=0pt] (p) at (t0.north west) {\small{{\color{ugreen} P(}}};
\node [anchor=south west,inner sep=0pt] (p2) at ([yshift=0.2em]t2.north east) {\small{{\color{ugreen} )}}};
\node [anchor=west] (eq) at (p2.east) {\normalsize{= \ P($s|t$)}};
\node [anchor=west] (eq) at (p2.east) {\normalsize{= \ P($\mathbf{s}|\mathbf{t}$)}};
}
}
\end{scope}
......@@ -2652,11 +2652,11 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
%%%------------------------------------------------------------------------------------------------------------
%%% modeling P(s,a|t)
\begin{frame}{建模 - $\textrm{P}(s,a|t)$}
\begin{frame}{建模 - $\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})$}
\begin{itemize}
\item \alert{\textbf{进一步建模!!}}:对于源语句子$s=s_1...s_m$($m$个词)、目标语译文$t=t_0...t_n$($n$个词)和词对齐$a=a_1...a_m$,按如下方式计算$\textrm{P}(s,a|t)$
\item \alert{\textbf{进一步建模!!}}:对于源语句子$\mathbf{s}=s_1...s_m$($m$个词)、目标语译文$\mathbf{t}=t_0...t_n$($n$个词)和词对齐$\mathbf{a}=a_1...a_m$,按如下方式计算$\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})$
\begin{itemize}
\item 符号定义:$s_x^y=s_x...s_y$, $a_x^y=a_x...a_y$
......@@ -2668,23 +2668,23 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
\begin{center}
\begin{tikzpicture}
\node [anchor=west] (eq1) at (0,0) {$\textrm{P}(s,a|t)=$};
\node [anchor=west,inner sep=1pt,minimum height=2.64em] (eq2) at (eq1.east) {$\textrm{P}(m|t)$};
\node [anchor=west] (eq1) at (0,0) {$\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})=$};
\node [anchor=west,inner sep=1pt,minimum height=2.64em] (eq2) at (eq1.east) {$\textrm{P}(m|\mathbf{t})$};
\node [anchor=west,inner sep=1pt,minimum height=2.64em] (eq3) at ([xshift=1pt]eq2.east) {$\prod\limits_{j=1}^{m}$};
\node [anchor=west,inner sep=1pt,minimum height=2.64em] (eq4) at ([xshift=1pt]eq3.east) {$\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,t)$};
\node [anchor=west,inner sep=1pt,minimum height=2.64em] (eq5) at ([xshift=1pt]eq4.east) {$\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,t)$};
\node [anchor=west,inner sep=1pt,minimum height=2.64em] (eq4) at ([xshift=1pt]eq3.east) {$\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,\mathbf{t})$};
\node [anchor=west,inner sep=1pt,minimum height=2.64em] (eq5) at ([xshift=1pt]eq4.east) {$\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,\mathbf{t})$};
\visible<2->{
\node [anchor=west,inner sep=1pt,minimum height=2.64em,fill=red!20] (eq2) at (eq1.east) {$\textrm{P}(m|t)$};
\node [anchor=west,inner sep=1pt,minimum height=2.64em,fill=red!20] (eq2) at (eq1.east) {$\textrm{P}(m|\mathbf{t})$};
}
\visible<3->{
\node [anchor=west,inner sep=1pt,minimum height=2.64em,fill=blue!20] (eq3) at ([xshift=1pt]eq2.east) {$\prod\limits_{j=1}^{m}$};
}
\visible<4->{
\node [anchor=west,inner sep=1pt,minimum height=2.64em,fill=green!20] (eq4) at ([xshift=1pt]eq3.east) {$\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,t)$};
\node [anchor=west,inner sep=1pt,minimum height=2.64em,fill=green!20] (eq4) at ([xshift=1pt]eq3.east) {$\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,\mathbf{t})$};
}
\visible<5->{
\node [anchor=west,inner sep=1pt,minimum height=2.64em,fill=purple!20] (eq5) at ([xshift=1pt]eq4.east) {$\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,t)$};
\node [anchor=west,inner sep=1pt,minimum height=2.64em,fill=purple!20] (eq5) at ([xshift=1pt]eq4.east) {$\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,\mathbf{t})$};
}
\visible<2->{
......@@ -2706,12 +2706,12 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
\vspace{-0.0em}
\begin{itemize}
\item<2-> \textbf{生成模型}:给定译文$t$生成源文$s$和对齐$a$
\item<2-> \textbf{生成模型}:给定译文$\mathbf{t}$生成源文$\mathbf{s}$和对齐$\mathbf{a}$
\begin{enumerate}
\item<2-> 根据译文$t$选择源文的长度$m$
\item<2-> 根据译文$\mathbf{t}$选择源文的长度$m$
\item<3-> 循环源文的每个位置$j$
\item<4-> 根据译文$t$、源文长度$m$、已经生成的源语单词$s_{1}^{j-1}$和对齐$a_{1}^{j-1}$,生成第$j$个位置的对齐结果$a_j$
\item<5-> 根据译文$t$、源文长度$m$、已经生成的源语单词$s_{1}^{j-1}$和对齐$a_{1}^{j}$,生成第$j$个位置的源语言单词$s_j$(注意:这时$a_j$已经生成了)
\item<4-> 根据译文$\mathbf{t}$、源文长度$m$、已经生成的源语单词$s_{1}^{j-1}$和对齐$a_{1}^{j-1}$,生成第$j$个位置的对齐结果$a_j$
\item<5-> 根据译文$\mathbf{t}$、源文长度$m$、已经生成的源语单词$s_{1}^{j-1}$和对齐$a_{1}^{j}$,生成第$j$个位置的源语言单词$s_j$(注意:这时$a_j$已经生成了)
\end{enumerate}
\end{itemize}
......@@ -2721,9 +2721,9 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
%%%------------------------------------------------------------------------------------------------------------
%%% generation of s and a given t - a running example
\begin{frame}{实例 - $\textrm{P}(s,a|t)$}
\begin{frame}{实例 - $\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})$}
$s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-1\}
$\mathbf{s}$ = 在 桌子 上 \ \ \ \ \ $\mathbf{t}$ = $t_0$ on the table \ \ \ \ \ $\mathbf{a}$ = \{1-0,2-3,3-1\}
\begin{center}
\begin{tikzpicture}
......@@ -2789,7 +2789,7 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
{\small
\begin{eqnarray}
\textrm{P}(s,a|t) & = & \textrm{P}(m|t) \prod\limits_{j=1}^{m} \textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,t) \textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,t) \nonumber \\
\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t}) & = & \textrm{P}(m|\mathbf{t}) \prod\limits_{j=1}^{m} \textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,\mathbf{t}) \textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,\mathbf{t}) \nonumber \\
& \visible<2->{=} & \visible<2->{\textrm{P}(m=3 \mid \textrm{'$t_0$ on the table'})} \visible<3->{\times} \nonumber \\
& & \visible<3->{\textrm{P}(a_1=0 \mid \phi,\phi,3,\textrm{'$t_0$ on the table'})} \visible<4->{\times} \nonumber \\
& & \visible<4->{\textrm{P}(s_1=\textrm{在} \mid \textrm{\{1-0\}},\phi,3,\textrm{'$t_0$ on the table'})} \visible<5->{\times} \nonumber \\
......@@ -2813,14 +2813,14 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\vspace{-1.0em}
\begin{eqnarray}
\textrm{P}(s|t) & = & \sum_{a} \textrm{P}(s,a|t) \nonumber \\
\textrm{P}(s,a|t) & = & \textrm{P}(m|t) \prod\limits_{j=1}^{m} \textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,t) \textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,t) \nonumber
\textrm{P}(\mathbf{s}|\mathbf{t}) & = & \sum_{\mathbf{a}} \textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t}) \nonumber \\
\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t}) & = & \textrm{P}(m|\mathbf{t}) \prod\limits_{j=1}^{m} \textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,\mathbf{t}) \textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,\mathbf{t}) \nonumber
\end{eqnarray}
\item \textbf{两个严重问题}
\begin{enumerate}
\item 第一个公式:如何遍历所有的对齐$a$
\item 第二个公式:如何计算$\textrm{P}(m|t)$、$\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,t)$和$\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,t)$
\item 第一个公式:如何遍历所有的对齐$\mathbf{a}$
\item 第二个公式:如何计算$\textrm{P}(m|\mathbf{t})$、$\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,\mathbf{t})$和$\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,\mathbf{t})$
\end{enumerate}
\item<2-> Brown等人(1993)的解决方法:对问题进行化简
......@@ -2848,21 +2848,21 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\begin{enumerate}
\item 源语长度概率为常数$\epsilon$
\begin{displaymath}
\textrm{P}(m|t) \equiv \epsilon
\textrm{P}(m|\mathbf{t}) \equiv \epsilon
\end{displaymath}
\item 对齐概率$\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,t)$仅依赖于译文长度$l+1$(均匀分布)
\item 对齐概率$\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,\mathbf{t})$仅依赖于译文长度$l+1$(均匀分布)
\begin{displaymath}
\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,t) \equiv \frac{1}{l+1}
\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,\mathbf{t}) \equiv \frac{1}{l+1}
\end{displaymath}
\item 源语单词$s_j$生成概率$\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,t)$仅依赖与其对齐的译文单词$t_{a_j}$,即词汇翻译概率$f(s_j|t_{a_j})$ ($\sum_{s_j} f(s_j|t_{a_j}) = 1$)
\item 源语单词$s_j$生成概率$\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,\mathbf{t})$仅依赖与其对齐的译文单词$t_{a_j}$,即词汇翻译概率$f(s_j|t_{a_j})$ ($\sum_{s_j} f(s_j|t_{a_j}) = 1$)
\begin{displaymath}
\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,t) \equiv f(s_j|t_{a_j})
\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,\mathbf{t}) \equiv f(s_j|t_{a_j})
\end{displaymath}
\end{enumerate}
\item<2-> \textbf{核心思想是}把复杂参数化简为简单参数
\begin{itemize}
\item 比如:$\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,t) \equiv \frac{1}{l+1}$把参数空间$(a_{1}^{j},s_{1}^{j-1},m,t)$化简为$l$
\item 比如:$\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,\mathbf{t}) \equiv \frac{1}{l+1}$把参数空间$(a_{1}^{j},s_{1}^{j-1},m,\mathbf{t})$化简为$l$
\item \alert{优点}: 模型大大化简;\alert{缺点}:化简导致模型不准确
\end{itemize}
......@@ -2882,12 +2882,12 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\begin{center}
\begin{tikzpicture}
\node [anchor=west] (eq1) at (0,0) {$\textrm{P}(s,a|t)$};
\node [anchor=west] (eq1) at (0,0) {$\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})$};
\node [anchor=west] (eq1part2) at (eq1.east) {$=$};
\node [anchor=west,inner sep=1pt] (eq2) at (eq1part2.east) {$\textrm{P}(m|t)$};
\node [anchor=west,inner sep=1pt] (eq3) at ([xshift=1pt]eq2.east) {$\prod\limits_{j=1}^{m}$};
\node [anchor=west,inner sep=1pt] (eq4) at ([xshift=1pt]eq3.east) {$\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,t)$};
\node [anchor=west,inner sep=1pt] (eq5) at ([xshift=1pt]eq4.east) {$\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,t)$};
\node [anchor=west,inner sep=1pt] (eq2) at (eq1part2.east) {$\textrm{P}(m|\mathbf{t})$};
\node [anchor=west,inner sep=1pt] (eq3) at ([xshift=0pt]eq2.east) {$\prod\limits_{j=1}^{m}$};
\node [anchor=west,inner sep=1pt] (eq4) at ([xshift=0pt]eq3.east) {$\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,\mathbf{t})$};
\node [anchor=west,inner sep=1pt] (eq5) at ([xshift=0pt]eq4.east) {$\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,\mathbf{t})$};
\node [anchor=east,rotate=90] (yes1) at (eq2.south) {$\equiv$};
\node [anchor=east,rotate=90] (yes2) at (eq4.south) {$\equiv$};
......@@ -2906,13 +2906,13 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\end{tikzpicture}
\end{center}
\item<2-> \textbf{将上式代入$\textrm{P}(s|t)=\sum_a \textrm{P}(s,a|t)$}
\item<2-> \textbf{将上式代入$\textrm{P}(\mathbf{s}|\mathbf{t})=\sum_{\mathbf{a}} \textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})$}
\vspace{-1.0em}
\begin{eqnarray}
\textrm{P}(s|t) & = & \sum\limits_{a} \textrm{P}(s,a|t) \nonumber \\
& = & \sum\limits_{a} \frac{\epsilon}{(l+1)^{m}} \prod\limits_{j=1}^{m} f(s_j|t_{a_j}) \nonumber
\textrm{P}(\mathbf{s}|\mathbf{t}) & = & \sum\limits_{\mathbf{a}} \textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t}) \nonumber \\
& = & \sum\limits_{\mathbf{a}} \frac{\epsilon}{(l+1)^{m}} \prod\limits_{j=1}^{m} f(s_j|t_{a_j}) \nonumber
\end{eqnarray}
\end{itemize}
......@@ -2925,14 +2925,14 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\begin{itemize}
\item $\textrm{P}(s|t) = \sum\limits_{a} \frac{\epsilon}{(l+1)^{m}} \prod\limits_{j=1}^{m} f(s_j|t_{a_j})$中需要遍历所有的对齐,即$\sum_{a}$。这个过程可以被重新表示为
\item $\textrm{P}(\mathbf{s}|\mathbf{t}) = \sum\limits_{\mathbf{a}} \frac{\epsilon}{(l+1)^{m}} \prod\limits_{j=1}^{m} f(s_j|t_{a_j})$中需要遍历所有的对齐,即$\sum_{\mathbf{a}}$。这个过程可以被重新表示为
\vspace{-0.5em}
\begin{center}
\begin{tikzpicture}
\node [anchor=west] (eq1) at (0,0) {$\textrm{P}(s|t)$};
\node [anchor=west] (eq1) at (0,0) {$\textrm{P}(\mathbf{s}|\mathbf{t})$};
\node [anchor=west] (eq2) at (eq1.east) {$=$};
\node [anchor=west,inner sep=2pt] (eq3) at (eq2.east) {$\sum\limits_{a_1=0}^{l}$};
\node [anchor=west,inner sep=0] (eq4) at (eq3.east) {...};
......@@ -2968,7 +2968,7 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\end{center}
\begin{enumerate}
\item<2-> 遍历所有的对齐$a$,$a$由\{$a_1$,...,$a_m$\}组成,每个$a_j \in \{a_1,...,a_m\}$从译文开始位置(0)循环到截止位置($l$)
\item<2-> 遍历所有的对齐$\mathbf{a}$,$\mathbf{a}$由\{$a_1$,...,$a_m$\}组成,每个$a_j \in \{a_1,...,a_m\}$从译文开始位置(0)循环到截止位置($l$)
\vspace{0.5em}
\begin{center}
......@@ -3003,7 +3003,7 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\end{center}
\vspace{0.5em}
\item<5-> 对于每个$a$累加对齐概率$\textrm{P}(s,a|t)$
\item<5-> 对于每个$\mathbf{a}$累加对齐概率$\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})$
\end{enumerate}
\end{itemize}
......@@ -3026,28 +3026,28 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\begin{scope}
\node [anchor=west] (s1) at (0,0) {$s$ = 在\ \ 桌子\ \ 上};
\node [anchor=west] (t1) at ([yshift=-2em]s1.west) {$t$ = on\ \ the\ \ table};
\node [anchor=west] (s1) at (0,0) {$\mathbf{s}$ = 在\ \ 桌子\ \ 上};
\node [anchor=west] (t1) at ([yshift=-2em]s1.west) {$\mathbf{t}$ = on\ \ the\ \ table};
\draw [->,double,thick,ublue] ([yshift=0.2em]s1.south) -- ([yshift=-0.8em]s1.south);
\end{scope}
\begin{scope}[xshift=1.5in]
\node [anchor=west] (s2) at (0,0) {$s$ = 在\ \ 桌子\ \ 上};
\node [anchor=west] (t2) at ([yshift=-2em]s2.west) {$t'$ = table \ on\ \ the};
\node [anchor=west] (s2) at (0,0) {$\mathbf{s}$ = 在\ \ 桌子\ \ 上};
\node [anchor=west] (t2) at ([yshift=-2em]s2.west) {$\mathbf{t}'$ = table \ on\ \ the};
\draw [->,double,thick,ublue] ([yshift=0.2em]s2.south) -- ([yshift=-0.8em]s2.south);
\end{scope}
\node [anchor=north] (score11) at ([yshift=-2.0em]s1.south) {$\textrm{P}(s|t)$};
\node [anchor=north] (score12) at ([yshift=-2.0em]s2.south) {$\textrm{P}(s|t')$};
\node [anchor=north] (score11) at ([yshift=-2.0em]s1.south) {$\textrm{P}(\mathbf{s}|\mathbf{t})$};
\node [anchor=north] (score12) at ([yshift=-2.0em]s2.south) {$\textrm{P}(\mathbf{s}|\mathbf{t}')$};
\node [anchor=west] (comp1) at ([xshift=2.3em]score11.east) {\large{$\mathbf{=}$}};
\node [anchor=east] (label1) at ([xshift=-1em,yshift=0.1em]score11.west) {\textbf{IBM模型1:}};
\visible<2->{
\node [anchor=north] (score21) at ([yshift=0.2em]score11.south) {$\textrm{P}(s|t)$};
\node [anchor=north] (score22) at ([yshift=0.2em]score12.south) {$\textrm{P}(s|t')$};
\node [anchor=north] (score21) at ([yshift=0.2em]score11.south) {$\textrm{P}(\mathbf{s}|\mathbf{t})$};
\node [anchor=north] (score22) at ([yshift=0.2em]score12.south) {$\textrm{P}(\mathbf{s}|\mathbf{t}')$};
\node [anchor=west] (comp2) at ([xshift=2.3em]score21.east) {\large{$\mathbf{>}$}};
\node [anchor=east] (label2) at ([xshift=-1em,yshift=0.1em]score21.west) {\textbf{理想:}};
}
......@@ -3064,7 +3064,7 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,\mathbf{t}) \equiv a(a_j|j,m,l)
\end{displaymath}
其它假设与IBM模型1相同,即$\textrm{P}(m|t) \equiv \epsilon$$\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,t) \equiv f(s_j|t_{a_j})$
其它假设与IBM模型1相同,即$\textrm{P}(m|\mathbf{t}) \equiv \epsilon$$\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,\mathbf{t}) \equiv f(s_j|t_{a_j})$
\end{itemize}
......@@ -3083,12 +3083,12 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\begin{center}
\begin{tikzpicture}
\node [anchor=west] (eq1) at (0,0) {$\textrm{P}(s,a|t)$};
\node [anchor=west] (eq1) at (0,0) {$\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})$};
\node [anchor=west] (eq1part2) at (eq1.east) {$=$};
\node [anchor=west,inner sep=1pt] (eq2) at (eq1part2.east) {$\textrm{P}(m|t)$};
\node [anchor=west,inner sep=1pt] (eq3) at ([xshift=1pt]eq2.east) {$\prod\limits_{j=1}^{m}$};
\node [anchor=west,inner sep=1pt] (eq4) at ([xshift=1pt]eq3.east) {$\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,t)$};
\node [anchor=west,inner sep=1pt] (eq5) at ([xshift=1pt]eq4.east) {$\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,t)$};
\node [anchor=west,inner sep=1pt] (eq2) at (eq1part2.east) {$\textrm{P}(m|\mathbf{t})$};
\node [anchor=west,inner sep=1pt] (eq3) at ([xshift=0pt]eq2.east) {$\prod\limits_{j=1}^{m}$};
\node [anchor=west,inner sep=1pt] (eq4) at ([xshift=0pt]eq3.east) {$\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,\mathbf{t})$};
\node [anchor=west,inner sep=1pt] (eq5) at ([xshift=0pt]eq4.east) {$\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,\mathbf{t})$};
\node [anchor=east,rotate=90] (yes1) at (eq2.south) {$\equiv$};
\node [anchor=east,rotate=90] (yes2) at (eq4.south) {$\equiv$};
......@@ -3104,14 +3104,14 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\end{tikzpicture}
\end{center}
\item<2-> \textbf{将上式代入$\textrm{P}(s|t)=\sum_a \textrm{P}(s,a|t)$}
\item<2-> \textbf{将上式代入$\textrm{P}(\mathbf{s}|\mathbf{t})=\sum_{\mathbf{a}} \textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})$}
\vspace{-0.5em}
\begin{center}
\begin{tikzpicture}
\node [anchor=west] (eq1) at (0,0) {$\textrm{P}(s|t)$};
\node [anchor=west] (eq1) at (0,0) {$\textrm{P}(\mathbf{s}|\mathbf{t})$};
\node [anchor=west] (eq2) at (eq1.east) {$=$};
\node [anchor=west,inner sep=2pt] (eq3) at (eq2.east) {$\sum\limits_{a_1=0}^{l}$};
\node [anchor=west,inner sep=0] (eq4) at ([xshift=-0.2em]eq3.east) {...};
......@@ -3142,8 +3142,8 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\vspace{-0.5em}
\begin{enumerate}
\item 遍历所有的对齐$a$
\item 对于每个$a$累加对齐概率$\textrm{P}(s,a|t)$,即计算$\epsilon \prod\limits_{j=1}^{m} a(a_j|j,m,l) f(s_j|t_{a_j})$
\item 遍历所有的对齐$\mathbf{a}$
\item 对于每个$\mathbf{a}$累加对齐概率$\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})$,即计算$\epsilon \prod\limits_{j=1}^{m} a(a_j|j,m,l) f(s_j|t_{a_j})$
\end{enumerate}
\end{itemize}
......@@ -3158,8 +3158,8 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\vspace{-1.0em}
\begin{eqnarray}
\textrm{\textbf{IBM模型1}}: \textrm{P}(s|t) & = & \frac{\epsilon}{(l+1)^{m}} \sum\limits_{a_1=0}^{l} ... \sum\limits_{a_m=0}^{l} \prod\limits_{j=1}^{m} f(s_j|t_{a_j}) \nonumber \\
\textrm{\textbf{IBM模型2}}: \textrm{P}(s|t) & = & \epsilon \sum\limits_{a_1=0}^{l} ... \sum\limits_{a_m=0}^{l} \prod\limits_{j=1}^{m} a(a_j|j,m,l) f(s_j|t_{a_j}) \nonumber
\textrm{\textbf{IBM模型1}}: \textrm{P}(\mathbf{s}|\mathbf{t}) & = & \frac{\epsilon}{(l+1)^{m}} \sum\limits_{a_1=0}^{l} ... \sum\limits_{a_m=0}^{l} \prod\limits_{j=1}^{m} f(s_j|t_{a_j}) \nonumber \\
\textrm{\textbf{IBM模型2}}: \textrm{P}(\mathbf{s}|\mathbf{t}) & = & \epsilon \sum\limits_{a_1=0}^{l} ... \sum\limits_{a_m=0}^{l} \prod\limits_{j=1}^{m} a(a_j|j,m,l) f(s_j|t_{a_j}) \nonumber
\end{eqnarray}
\begin{itemize}
......@@ -3227,17 +3227,17 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\vspace{-1.0em}
\begin{eqnarray}
\textrm{\textbf{IBM模型1}}: \textrm{P}(s|t) & = & \frac{\epsilon}{(l+1)^{m}} \sum\limits_{a_1=0}^{l} ... \sum\limits_{a_m=0}^{l} \prod\limits_{j=1}^{m} f(s_j|t_{a_j}) \nonumber \\
\textrm{\textbf{IBM模型2}}: \textrm{P}(s|t) & = & \epsilon \sum\limits_{a_1=0}^{l} ... \sum\limits_{a_m=0}^{l} \prod\limits_{j=1}^{m} a(a_j|j,m,l) f(s_j|t_{a_j}) \nonumber
\textrm{\textbf{IBM模型1}}: \textrm{P}(\mathbf{s}|\mathbf{t}) & = & \frac{\epsilon}{(l+1)^{m}} \sum\limits_{a_1=0}^{l} ... \sum\limits_{a_m=0}^{l} \prod\limits_{j=1}^{m} f(s_j|t_{a_j}) \nonumber \\
\textrm{\textbf{IBM模型2}}: \textrm{P}(\mathbf{s}|\mathbf{t}) & = & \epsilon \sum\limits_{a_1=0}^{l} ... \sum\limits_{a_m=0}^{l} \prod\limits_{j=1}^{m} a(a_j|j,m,l) f(s_j|t_{a_j}) \nonumber
\end{eqnarray}
\begin{spacing}{1.2}
\begin{itemize}
\item 对于翻译模型$\textrm{P}(s|t)$,再来回顾一下统计机器翻译的三个基本问题
\item 对于翻译模型$\textrm{P}(\mathbf{s}|\mathbf{t})$,再来回顾一下统计机器翻译的三个基本问题
\begin{enumerate}
\item \textbf{建模}:如何描述$\textrm{P}(s|t)$ \visible<2->{\alert{$\gets$ 已解!见上面两个公式}}
\item \textbf{解码}:给定模型参数$\epsilon$、$a(a_j|j,m,l)$和$f(s_j|t_{a_j})$,如何利用上面的公式计算$\textrm{P}(s|t)$(语言模型计算暂不讨论),并找到最佳译文$\hat{t}$ \visible<2->{\alert{$\gets$ 下面讨论}}
\item \textbf{建模}:如何描述$\textrm{P}(\mathbf{s}|\mathbf{t})$ \visible<2->{\alert{$\gets$ 已解!见上面两个公式}}
\item \textbf{解码}:给定模型参数$\epsilon$、$a(a_j|j,m,l)$和$f(s_j|t_{a_j})$,如何利用上面的公式计算$\textrm{P}(\mathbf{s}|\mathbf{t})$(语言模型计算暂不讨论),并找到最佳译文$\hat{\mathbf{t}}$ \visible<2->{\alert{$\gets$ 下面讨论}}
\item \textbf{训练}:如何从数据中自动学习模型参数$\epsilon$、$a(a_j|j,m,l)$和$f(s_j|t_{a_j})$ \visible<2->{\alert{$\gets$ 下面讨论}}
\end{enumerate}
......@@ -3281,7 +3281,7 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\begin{center}
\begin{tikzpicture}
\node [anchor=west] (model) at (0,0) {\footnotesize{$\textrm{P}(s|t) = \epsilon \sum\limits_{a_1=0}^{l} ... \sum\limits_{a_m=0}^{l} \prod\limits_{j=1}^{m} a(a_j|j,m,l) f(s_j|t_{a_j})$}};
\node [anchor=west] (model) at (0,0) {\footnotesize{$\textrm{P}(\mathbf{s}|\mathbf{t}) = \epsilon \sum\limits_{a_1=0}^{l} ... \sum\limits_{a_m=0}^{l} \prod\limits_{j=1}^{m} a(a_j|j,m,l) f(s_j|t_{a_j})$}};
\node [anchor=east] (modellabel) at ([yshift=0.1em]model.west) {\footnotesize{\textbf{问题的统计描述:}}};
\node [anchor=north west] (paras) at (model.south west) {\footnotesize{$\epsilon = ?;\ \ \forall a_j,j,m,l: a(a_j|j,m,l) = ?, f(s_j|t_{a_j}) = ?$}};
\node [anchor=east] (paraslabel) at ([yshift=0.1em]paras.west) {\footnotesize{\textbf{模型的参数:}}};
......@@ -3300,9 +3300,9 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\begin{center}
\begin{tikzpicture}
\node [anchor=west] (scoring) at (0,0) {\footnotesize{对任意的$s$和$t$,(高效地)计算$\textrm{P}(s|t)$(同时计算$\textrm{P}(t)$)}};
\node [anchor=west] (scoring) at (0,0) {\footnotesize{对任意的$\mathbf{s}$和$\mathbf{t}$,(高效地)计算$\textrm{P}(\mathbf{s}|\mathbf{t})$(同时计算$\textrm{P}(\mathbf{t})$)}};
\node [anchor=east] (scoringlabel) at ([yshift=0.1em]scoring.west) {\footnotesize{\textbf{模型得分计算:}}};
\node [anchor=north west] (search) at (scoring.south west) {\footnotesize{对所有可能的$t$,找到模型得分($\textrm{P}(s|t)\textrm{P}(t)$)最高}};
\node [anchor=north west] (search) at (scoring.south west) {\footnotesize{对所有可能的$\mathbf{t}$,找到模型得分($\textrm{P}(\mathbf{s}|\mathbf{t})\textrm{P}(\mathbf{t})$)最高}};
\node [anchor=north west] (searchpart2) at ([yshift=0.3em]search.south west) {\footnotesize{的译文输出}};
\node [anchor=east] (searchlabel) at ([yshift=0.1em]search.west) {\footnotesize{\textbf{搜索:}}};
......@@ -3317,7 +3317,7 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\begin{itemize}
\item \textbf{搜索(解码)问题}前面的实例已经描述了一种解法(见{\color{ublue} \hyperref[simpledecodingalgorithm]{\hspace{-0.2em}~\ref{simpledecodingalgorithm}}}):自左向右添加译文单词 + 剪枝技术。这里不再讨论,可以自行学习
\item \textbf{剩下的问题是}:对于任意的$s$和$t$,如何\alert{高效地}计算$\textrm{P}(s|t)$
\item \textbf{剩下的问题是}:对于任意的$\mathbf{s}$和$\mathbf{t}$,如何\alert{高效地}计算$\textrm{P}(\mathbf{s}|\mathbf{t})$
\end{itemize}
\end{itemize}
......@@ -3333,7 +3333,7 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\item $O((l+1)^m \cdot m)$ - IBM模型得分的直接计算几乎不可能!
\begin{displaymath}
\textrm{P}(s|t) = \frac{\epsilon}{(l+1)^{m}} \underbrace{\sum\limits_{a_1=0}^{l} ... \sum\limits_{a_m=0}^{l}}_{(l+1)^m\textrm{次循环}} \underbrace{\prod\limits_{j=1}^{m} f(s_j|t_{a_j})}_{m\textrm{次循环}}
\textrm{P}(\mathbf{s}|\mathbf{t}) = \frac{\epsilon}{(l+1)^{m}} \underbrace{\sum\limits_{a_1=0}^{l} ... \sum\limits_{a_m=0}^{l}}_{(l+1)^m\textrm{次循环}} \underbrace{\prod\limits_{j=1}^{m} f(s_j|t_{a_j})}_{m\textrm{次循环}}
\end{displaymath}
\item<2-> $O(l \cdot m)$ - 实际上我们可以做得更好
......@@ -3355,7 +3355,7 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
}
\visible<3->{
\node [anchor=west] (eq2) at ([xshift=5em,yshift=-4.5em]eq1.west) {$\textrm{P}(s|t) = \frac{\epsilon}{(l+1)^{m}} $};
\node [anchor=west] (eq2) at ([xshift=5em,yshift=-4.5em]eq1.west) {$\textrm{P}(\mathbf{s}|\mathbf{t}) = \frac{\epsilon}{(l+1)^{m}} $};
\node [anchor=west,inner sep=2pt] (eq2part2) at ([xshift=-0.3em]eq2.east) {$\prod\limits_{j=1}^{m} \sum\limits_{i=0}^{l} f(s_j|t_i)$};
\node [anchor=east] (eq2label) at ([xshift=-0em,yshift=0.2em]eq2.west) {\small{IBM模型1:}};
......@@ -3363,7 +3363,7 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
}
\visible<4->{
\node [anchor=west] (eq3) at ([xshift=5em,yshift=-7.5em]eq1.west) {$\textrm{P}(s|t) = \epsilon$};
\node [anchor=west] (eq3) at ([xshift=5em,yshift=-7.5em]eq1.west) {$\textrm{P}(\mathbf{s}|\mathbf{t}) = \epsilon$};
\node [anchor=west,inner sep=2pt] (eq3part2) at ([xshift=-0.3em]eq3.east) {$\prod\limits_{j=1}^{m} \sum\limits_{i=0}^{l} a(i|j,m,l) f(s_j|t_i)$};
\node [anchor=east] (eq3label) at ([xshift=-0em,yshift=0.2em]eq3.west) {\small{类似的,IBM模型2:}};
}
......@@ -3525,7 +3525,7 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\begin{itemize}
\item<2-> \textbf{IBM模型的训练:对于给定的句对$(s,t)$,最大化翻译概率$\textrm{P}(s|t)$}。这里用符号$\textrm{P}_{\theta}(s|t)$表示概率由参数$\theta$决定
\item<2-> \textbf{IBM模型的训练:对于给定的句对$(\mathbf{s},\mathbf{t})$,最大化翻译概率$\textrm{P}(\mathbf{s}|\mathbf{t})$}。这里用符号$\textrm{P}_{\theta}(\mathbf{s}|\mathbf{t})$表示概率由参数$\theta$决定
\begin{center}
\begin{tikzpicture}
......@@ -3534,11 +3534,11 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\node [anchor=west] (eq2) at ([yshift=-0.2em]eq1.east) {=};
\node [anchor=west,inner sep=2pt] (eq3) at ([yshift=-0.0em]eq2.east) {$\argmax$};
\node [anchor=north,inner sep=1pt] (eq3part2) at ([yshift=-0.2em]eq3.south) {\scriptsize{$\theta$}};
\node [anchor=west,inner sep=2pt] (eq4) at ([xshift=0.1em]eq3.east) {$\textrm{P}_{\theta}(s|t)$};
\node [anchor=west,inner sep=2pt] (eq4) at ([xshift=0.1em]eq3.east) {$\textrm{P}_{\theta}(\mathbf{s}|\mathbf{t})$};
\visible<3->{
\node [anchor=west,inner sep=2pt,fill=red!20,minimum height=1.35em] (eq3) at ([yshift=-0.0em]eq2.east) {$\argmax$};
\node [anchor=west,inner sep=2pt,fill=green!20] (eq4) at ([xshift=0.1em]eq3.east) {$\textrm{P}_{\theta}(s|t)$};
\node [anchor=west,inner sep=2pt,fill=green!20] (eq4) at ([xshift=0.1em]eq3.east) {$\textrm{P}_{\theta}(\mathbf{s}|\mathbf{t})$};
\node [anchor=north,draw,inner sep=3pt,fill=red!20] (eq3label) at ([yshift=-1.5em]eq3.south west) {\footnotesize{\textbf{求最优参数}}};
\node [anchor=north,draw,inner sep=3pt,fill=green!20] (eq4label) at ([yshift=-1.5em]eq4.south east) {\footnotesize{\textbf{目标函数}}};
......@@ -3561,23 +3561,23 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\begin{itemize}
\item $\textrm{P}(s|t)$可以被看做是$(s,t)$上的\alert{似然}函数($L(s,t;\theta)$)。所谓\alert{极大似然估计},就是要找到使$L(s,t;\theta)$达到最大的$\theta$
\item $\textrm{P}(\mathbf{s}|\mathbf{t})$可以被看做是$(\mathbf{s},\mathbf{t})$上的\alert{似然}函数($L(\mathbf{s},\mathbf{t};\theta)$)。所谓\alert{极大似然估计},就是要找到使$L(\mathbf{s},\mathbf{t};\theta)$达到最大的$\theta$
\vspace{-0.5em}
\begin{displaymath}
\{\hat{\theta}\} \subseteq \{\argmax_{\theta \in \Theta} L(s,t;\theta)\}
\{\hat{\theta}\} \subseteq \{\argmax_{\theta \in \Theta} L(\mathbf{s},\mathbf{t};\theta)\}
\end{displaymath}
\vspace{-0.3em}
$L(s,t;\theta)$表示$L(\cdot)$依赖模型参数$\theta$(注意分号),$\{\hat{\theta}\}$表示可能有多组结果,$\Theta$表示参数空间
$L(\mathbf{s},\mathbf{t};\theta)$表示$L(\cdot)$依赖模型参数$\theta$(注意分号),$\{\hat{\theta}\}$表示可能有多组结果,$\Theta$表示参数空间
\vspace{0.5em}
\item<2-> 先不用考虑上面的公式。我们还是回归到原始问题:如何找到一组$\theta$使$\textrm{P}_{\theta}(s|t)$达到最大?\\
\item<2-> 先不用考虑上面的公式。我们还是回归到原始问题:如何找到一组$\theta$使$\textrm{P}_{\theta}(\mathbf{s}|\mathbf{t})$达到最大?\\
\begin{itemize}
\item \textbf{求函数最大值问题}。比如,我们可以对$\textrm{P}_{\theta}(s|t)$求导,令导数为零,得到极值点
\item \textbf{求函数最大值问题}。比如,我们可以对$\textrm{P}_{\theta}(\mathbf{s}|\mathbf{t})$求导,令导数为零,得到极值点
\end{itemize}
\end{itemize}
......@@ -3641,7 +3641,7 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
%%%------------------------------------------------------------------------------------------------------------
%%% maximizing P(s|t)
\begin{frame}{最大化$\textrm{P}(s|t)$}
\begin{frame}{最大化$\textrm{P}(\mathbf{s}|\mathbf{t})$}
\begin{itemize}
......@@ -3710,7 +3710,7 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\begin{itemize}
\item \textbf{含有约束的优化问题}: 不好解\\
\textbf{目标:} $\max(\textrm{P}_{\theta}(s|t))$ + \textbf{约束:} $\forall t_y: \sum_{s_x} f(s_x|t_y)=1$
\textbf{目标:} $\max(\textrm{P}_{\theta}(\mathbf{s}|\mathbf{t}))$ + \textbf{约束:} $\forall t_y: \sum_{s_x} f(s_x|t_y)=1$
\vspace{0.3em}
\item<2-> \textbf{解决方法}: 含有约束优化 $\Rightarrow$ 不含约束优化\\
......@@ -3746,7 +3746,7 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\begin{center}
\begin{tabular}{c | c}
\makebox[0.35\textwidth][c]{\textbf{原始问题}} & \makebox[0.35\textwidth][c]{\textbf{转化后的问题}} \\ \hline
$\max (\textrm{P}(s|t))$ & $\max (L(f,\lambda))$ \\
$\max (\textrm{P}(\mathbf{s}|\mathbf{t}))$ & $\max (L(f,\lambda))$ \\
s.t. $\forall t_y: \sum_{s_x} f(s_x|t_y) =1 $ & \\
\end{tabular}
\end{center}
......@@ -4022,11 +4022,11 @@ f(s_u|t_v) = \frac{\lambda_{t_v}^{-1} \epsilon}{(l+1)^{m}} \cdot \frac{\sum\limi
}
\visible<2->{
\node [anchor=south west,inner sep=2pt] (label1) at (eq4.north west) {\textbf{\scriptsize{翻译概率$\textrm{P}(s|t)$}}};
\node [anchor=south west,inner sep=2pt] (label1) at (eq4.north west) {\textbf{\scriptsize{翻译概率$\textrm{P}(\mathbf{s}|\mathbf{t})$}}};
}
\visible<3->{
\node [anchor=south west,inner sep=2pt] (label2) at (eq5.north west) {\textbf{\scriptsize{配对的总次数}}};
\node [anchor=south west,inner sep=2pt] (label2part2) at ([yshift=-3pt]label2.north west) {\textbf{\scriptsize{$(s_u,t_v)$在句对$(s,t)$}}};
\node [anchor=south west,inner sep=2pt] (label2part2) at ([yshift=-3pt]label2.north west) {\textbf{\scriptsize{$(s_u,t_v)$在句对$(\mathbf{s},\mathbf{t})$}}};
}
\visible<4->{
\node [anchor=south west,inner sep=2pt] (label3) at (eq6.north west) {\textbf{\scriptsize{有的$t_i$的相对值}}};
......@@ -4035,7 +4035,7 @@ f(s_u|t_v) = \frac{\lambda_{t_v}^{-1} \epsilon}{(l+1)^{m}} \cdot \frac{\sum\limi
\visible<2->{
\node [anchor=east,rotate=90] (neweq1) at ([yshift=-0em]eq4.south) {=};
\node [anchor=north,inner sep=1pt] (neweq1full) at (neweq1.west) {\large{$\textrm{P}(s|t)$}};
\node [anchor=north,inner sep=1pt] (neweq1full) at (neweq1.west) {\large{$\textrm{P}(\mathbf{s}|\mathbf{t})$}};
}
\visible<5->{
......@@ -4123,16 +4123,16 @@ $x_3$ & 5 & 0.2 & 1.0 \\ \hline
\end{center}
\begin{itemize}
\item<5-> \textbf{定义}:在$\textrm{P}(s|t)$中,$t_v$翻译(连接)到$s_u$的期望频次为
\item<5-> \textbf{定义}:在$\textrm{P}(\mathbf{s}|\mathbf{t})$中,$t_v$翻译(连接)到$s_u$的期望频次为
\vspace{-0.5em}
\begin{displaymath}
c_{\mathbb{E}}(s_u|t_v;s,t) \equiv \sum\limits_{j=1}^{m} \delta(s_j,s_u) \sum\limits_{i=0}^{l} \delta(t_i,t_v) \cdot \frac{f(s_u|t_v)}{\sum_{i=0}^{l}f(s_u|t_i)}
c_{\mathbb{E}}(s_u|t_v;\mathbf{s},\mathbf{t}) \equiv \sum\limits_{j=1}^{m} \delta(s_j,s_u) \sum\limits_{i=0}^{l} \delta(t_i,t_v) \cdot \frac{f(s_u|t_v)}{\sum_{i=0}^{l}f(s_u|t_i)}
\end{displaymath}
\vspace{-0.8em}
\item<6-> \textbf{重写$f(s_u|t_v)$}!!!
\begin{center}
\begin{tikzpicture}
\node [anchor=west,draw,red,thick,inner sep=5pt] (p) at (0,0) {\black{$f(s_u|t_v) = \lambda_{t_v}^{-1} \cdot \textrm{P}(s|t) \cdot c_{\mathbb{E}}(s_u|t_v;s,t)$}};
\node [anchor=west,draw,red,thick,inner sep=5pt] (p) at (0,0) {\black{$f(s_u|t_v) = \lambda_{t_v}^{-1} \cdot \textrm{P}(\mathbf{s}|\mathbf{t}) \cdot c_{\mathbb{E}}(s_u|t_v;\mathbf{s},\mathbf{t})$}};
\end{tikzpicture}
\end{center}
\end{itemize}
......@@ -4144,13 +4144,13 @@ $x_3$ & 5 & 0.2 & 1.0 \\ \hline
\begin{frame}{通过期望频次计算$f(s_u|t_v)$}
\begin{itemize}
\item \textbf{一个小trick}: 令$\lambda_{t_v}^{'}=\frac{\lambda_{t_v}}{\textrm{P}(s|t)}$
\item \textbf{一个小trick}: 令$\lambda_{t_v}^{'}=\frac{\lambda_{t_v}}{\textrm{P}(\mathbf{s}|\mathbf{t})}$
\vspace{-1.0em}
\begin{eqnarray}
f(s_u|t_v) & = & \lambda_{t_v}^{-1} \cdot \textrm{P}(s|t) \cdot c_{\mathbb{E}}(s_u|t_v;s,t) \nonumber \\
& = & (\lambda_{t_v}^{'})^{-1} \cdot c_{\mathbb{E}}(s_u|t_v;s,t) \nonumber
f(s_u|t_v) & = & \lambda_{t_v}^{-1} \cdot \textrm{P}(\mathbf{s}|\mathbf{t}) \cdot c_{\mathbb{E}}(s_u|t_v;\mathbf{s},\mathbf{t}) \nonumber \\
& = & (\lambda_{t_v}^{'})^{-1} \cdot c_{\mathbb{E}}(s_u|t_v;\mathbf{s},\mathbf{t}) \nonumber
\end{eqnarray}
\item<2-> \textbf{$\lambda_{t_v}^{'}$究竟是什么?} - 回忆一下IBM模型对$f(\cdot|\cdot)$的约束
......@@ -4163,7 +4163,7 @@ f(s_u|t_v) & = & \lambda_{t_v}^{-1} \cdot \textrm{P}(s|t) \cdot c_{\mathbb{E}}(s
\vspace{-0.3em}
\begin{displaymath}
\lambda_{t_v}^{'}=\sum_{s_u} c_{\mathbb{E}}(s_u|t_v;s,t)
\lambda_{t_v}^{'}=\sum_{s_u} c_{\mathbb{E}}(s_u|t_v;\mathbf{s},\mathbf{t})
\end{displaymath}
\vspace{-0.6em}
......@@ -4173,8 +4173,8 @@ f(s_u|t_v) & = & \lambda_{t_v}^{-1} \cdot \textrm{P}(s|t) \cdot c_{\mathbb{E}}(s
\begin{tikzpicture}
\node [anchor=west] (eq1) at (0,0) {$f(s_u|t_v) =$};
\draw [-] (eq1.east) -- ([xshift=8em]eq1.east);
\node [anchor=south west] (eq2) at ([xshift=1em]eq1.east) {$c_{\mathbb{E}}(s_u|t_v;s,t)$};
\node [anchor=north west] (eq3) at (eq1.east) {$\sum_{s_u} c_{\mathbb{E}}(s_u|t_v;s,t)$};
\node [anchor=south west] (eq2) at ([xshift=1em]eq1.east) {$c_{\mathbb{E}}(s_u|t_v;\mathbf{s},\mathbf{t})$};
\node [anchor=north west] (eq3) at (eq1.east) {$\sum_{s_u} c_{\mathbb{E}}(s_u|t_v;\mathbf{s},\mathbf{t})$};
\begin{pgfonlayer}{background}
\node[rectangle,draw,red,thick,inner sep=0] [fit = (eq1) (eq2) (eq3)] {};
......@@ -4190,9 +4190,9 @@ f(s_u|t_v) & = & \lambda_{t_v}^{-1} \cdot \textrm{P}(s|t) \cdot c_{\mathbb{E}}(s
%%% scale it up to the full corpus
\begin{frame}{在整个数据集上计算}
\begin{itemize}
\item \textbf{更真实的情况}:我们拥有一系列互译的句对(称作\alert{平行语料}),记为$\{(s^{[1]},t^{[1]}),(s^{[2]},t^{[2]}),...,(s^{[N]},t^{[N]})\}$。对于这$N$个训练用句对,定义$f(s_u|t_v)$的期望频次为
\item \textbf{更真实的情况}:我们拥有一系列互译的句对(称作\alert{平行语料}),记为$\{(\mathbf{s}^{[1]},\mathbf{t}^{[1]}),(\mathbf{s}^{[2]},\mathbf{t}^{[2]}),...,(\mathbf{s}^{[N]},\mathbf{t}^{[N]})\}$。对于这$N$个训练用句对,定义$f(s_u|t_v)$的期望频次为
\begin{displaymath}
c_{\mathbb{E}}(s_u|t_v) = \sum_{i=1}^{N} c_{\mathbb{E}}(s_u|t_v;s^{[i]},t^{[i]})
c_{\mathbb{E}}(s_u|t_v) = \sum_{i=1}^{N} c_{\mathbb{E}}(s_u|t_v;\mathbf{s}^{[i]},\mathbf{t}^{[i]})
\end{displaymath}
\item<2-> \textbf{于是}
\begin{center}
......@@ -4200,8 +4200,8 @@ f(s_u|t_v) & = & \lambda_{t_v}^{-1} \cdot \textrm{P}(s|t) \cdot c_{\mathbb{E}}(s
\node [anchor=west,inner sep=2pt] (eq1) at (0,0) {$f(s_u|t_v)$};
\node [anchor=west] (eq2) at (eq1.east) {$=$\ };
\draw [-] ([xshift=0.3em]eq2.east) -- ([xshift=11.6em]eq2.east);
\node [anchor=south west] (eq3) at ([xshift=1em]eq2.east) {$\sum_{i=1}^{N} c_{\mathbb{E}}(s_u|t_v;s^{[i]},t^{[i]})$};
\node [anchor=north west] (eq4) at (eq2.east) {$\sum_{s_u} \sum_{i=1}^{N} c_{\mathbb{E}}(s_u|t_v;s^{[i]},t^{[i]})$};
\node [anchor=south west] (eq3) at ([xshift=1em]eq2.east) {$\sum_{i=1}^{N} c_{\mathbb{E}}(s_u|t_v;\mathbf{s}^{[i]},\mathbf{t}^{[i]})$};
\node [anchor=north west] (eq4) at (eq2.east) {$\sum_{s_u} \sum_{i=1}^{N} c_{\mathbb{E}}(s_u|t_v;\mathbf{s}^{[i]},\mathbf{t}^{[i]})$};
\visible<4->{
\node [anchor=south] (label1) at ([yshift=-6em,xshift=3em]eq1.north west) {利用这个公式计算};
......@@ -4250,17 +4250,17 @@ f(s_u|t_v) & = & \lambda_{t_v}^{-1} \cdot \textrm{P}(s|t) \cdot c_{\mathbb{E}}(s
\label{ibmtraining}
\begin{beamerboxesrounded}[upper=uppercolblue,lower=lowercolblue,shadow=true]{IBM模型1的训练(EM算法)}
输入: 平行语料$\{(s^{[1]},t^{[1]}),...,(s^{[N]},t^{[N]})\}$\\
输入: 平行语料$\{(\mathbf{s}^{[1]},\mathbf{t}^{[1]}),...,(\mathbf{s}^{[N]},\mathbf{t}^{[N]})\}$\\
输出:参数$f(\cdot|\cdot)$的最优值\\
1: \textbf{Function} \textsc{TrainItWithEM}($\{(s^{[1]},t^{[1]}),...,(s^{[N]},t^{[N]})\}$) \\
1: \textbf{Function} \textsc{TrainItWithEM}($\{(\mathbf{s}^{[1]},\mathbf{t}^{[1]}),...,(\mathbf{s}^{[N]},\mathbf{t}^{[N]})\}$) \\
2: \ \ Initialize $f(\cdot|\cdot)$ \hspace{5em} $\rhd$ 比如给$f(\cdot|\cdot)$一个均匀分布\\
3: \ \ Loop until $f(\cdot|\cdot)$ converges\\
4: \ \ \ \ \textbf{foreach} $k = 1$ to $N$ \textbf{do}\\
5: \ \ \ \ \ \ \ \footnotesize{$c_{\mathbb{E}}(s_u|t_v;s^{[k]},t^{[k]}) = \sum\limits_{j=1}^{|s^{[k]}|} \delta(s_j,s_u) \sum\limits_{i=0}^{|t^{[k]}|} \delta(t_i,t_v) \cdot \frac{f(s_u|t_v)}{\sum_{i=0}^{l}f(s_u|t_i)}$}\normalsize{}\\
6: \ \ \ \ \textbf{foreach} $t_v$ appears at least one of $\{t^{[1]},...,t^{[N]}\}$ \textbf{do}\\
7: \ \ \ \ \ \ \ $\lambda_{t_v}^{'} = \sum_{s_u} \sum_{k=1}^{N} c_{\mathbb{E}}(s_u|t_v;s^{[k]},t^{[k]})$\\
8: \ \ \ \ \ \ \ \textbf{foreach} $s_u$ appears at least one of $\{s^{[1]},...,s^{[N]}\}$ \textbf{do}\\
9: \ \ \ \ \ \ \ \ \ $f(s_u|t_v) = \sum_{k=1}^{N} c_{\mathbb{E}}(s_u|t_v;s^{[k]},t^{[k]}) \cdot (\lambda_{t_v}^{'})^{-1}$\\
5: \ \ \ \ \ \ \ \footnotesize{$c_{\mathbb{E}}(s_u|t_v;\mathbf{s}^{[k]},\mathbf{t}^{[k]}) = \sum\limits_{j=1}^{|\mathbf{s}^{[k]}|} \delta(s_j,s_u) \sum\limits_{i=0}^{|\mathbf{t}^{[k]}|} \delta(t_i,t_v) \cdot \frac{f(s_u|t_v)}{\sum_{i=0}^{|\mathbf{t}^{[k]}|}f(s_u|t_i)}$}\normalsize{}\\
6: \ \ \ \ \textbf{foreach} $t_v$ appearing in at least one of $\{\mathbf{t}^{[1]},...,\mathbf{t}^{[N]}\}$ \textbf{do}\\
7: \ \ \ \ \ \ \ $\lambda_{t_v}^{'} = \sum_{s_u} \sum_{k=1}^{N} c_{\mathbb{E}}(s_u|t_v;\mathbf{s}^{[k]},\mathbf{t}^{[k]})$\\
8: \ \ \ \ \ \ \ \textbf{foreach} $s_u$ appearing in at least one of $\{\mathbf{s}^{[1]},...,\mathbf{s}^{[N]}\}$ \textbf{do}\\
9: \ \ \ \ \ \ \ \ \ $f(s_u|t_v) = \sum_{k=1}^{N} c_{\mathbb{E}}(s_u|t_v;\mathbf{s}^{[k]},\mathbf{t}^{[k]}) \cdot (\lambda_{t_v}^{'})^{-1}$\\
10: \ \textbf{return} $f(\cdot|\cdot)$
\end{beamerboxesrounded}
\vspace{-0.3em}
......@@ -4280,15 +4280,15 @@ f(s_u|t_v) & = & \lambda_{t_v}^{-1} \cdot \textrm{P}(s|t) \cdot c_{\mathbb{E}}(s
\end{itemize}
\end{itemize}
\begin{enumerate}
\item \textbf{E-Step} (对于句对$(s,t)$$m=|s|,l=|t|$)
\item \textbf{E-Step} (对于句对$(\mathbf{s},\mathbf{t})$,$m=|\mathbf{s}|,l=|\mathbf{t}|$)
\begin{eqnarray}
c_{\mathbb{E}}(s_u|t_v;s,t) & = & \sum_{j=1}^{m} \sum_{i=0}^{l} \frac{f(s_u|t_v)a(i|j,m,l)\delta(s_j,s_u)\delta(t_i,t_v)}{\sum_{k=0}^{l} f(s_u|t_k)a(k|j,m,l)} \nonumber \\
c_{\mathbb{E}}(i|j,m,l;s,t) & = & \frac{f(s_j|t_i)a(i|j,m,l)}{\sum_{k=0}^{l} f(s_j|t_k)a(k|j,m,l)} \nonumber
c_{\mathbb{E}}(s_u|t_v;\mathbf{s},\mathbf{t}) & = & \sum_{j=1}^{m} \sum_{i=0}^{l} \frac{f(s_u|t_v)a(i|j,m,l)\delta(s_j,s_u)\delta(t_i,t_v)}{\sum_{k=0}^{l} f(s_u|t_k)a(k|j,m,l)} \nonumber \\
c_{\mathbb{E}}(i|j,m,l;\mathbf{s},\mathbf{t}) & = & \frac{f(s_j|t_i)a(i|j,m,l)}{\sum_{k=0}^{l} f(s_j|t_k)a(k|j,m,l)} \nonumber
\end{eqnarray}
\item \textbf{M-Step}
\begin{eqnarray}
f(s_u|t_v) & = & \frac{\sum_{k=0}^{K} c_{\mathbb{E}}(s_u|t_v;s^{[k]},t^{[k]})}{\sum_{s_u} \sum_{k=0}^{K} c_{\mathbb{E}}(s_u|t_v;s^{[k]},t^{[k]})} \nonumber \\
a(i|j,m,l) & = & \frac{\sum_{k=0}^{K} c_{\mathbb{E}}(i|j;s^{[k]},t^{[k]})}{\sum_{i} \sum_{k=0}^{K} c_{\mathbb{E}}(i|j;s^{[k]},t^{[k]})} \nonumber
f(s_u|t_v) & = & \frac{\sum_{k=0}^{K} c_{\mathbb{E}}(s_u|t_v;\mathbf{s}^{[k]},\mathbf{t}^{[k]})}{\sum_{s_u} \sum_{k=0}^{K} c_{\mathbb{E}}(s_u|t_v;\mathbf{s}^{[k]},\mathbf{t}^{[k]})} \nonumber \\
a(i|j,m,l) & = & \frac{\sum_{k=0}^{K} c_{\mathbb{E}}(i|j,m,l;\mathbf{s}^{[k]},\mathbf{t}^{[k]})}{\sum_{i} \sum_{k=0}^{K} c_{\mathbb{E}}(i|j,m,l;\mathbf{s}^{[k]},\mathbf{t}^{[k]})} \nonumber
\end{eqnarray}
\end{enumerate}
\end{frame}
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论