Commit e10537de by 曹润柘

合并分支 'caorunzhe' 到 'master'

Caorunzhe

查看合并请求 !273
parents 35ddc010 b64ef0db
......@@ -104,7 +104,7 @@
%\visible<3->
{
% coverage score formula node
\node [anchor=north west] (formula) at ([xshift=-0.3\hnode,yshift=-1.5\hnode]attn11.south) {\small{不同$\textbf{C}_i$所对应的源语言词的权重是不同的}};
\node [anchor=north west] (formula) at ([xshift=-0.3\hnode,yshift=-1.5\hnode]attn11.south) {\small{不同$\textbf{C}_j$所对应的源语言词的权重是不同的}};
\node [anchor=north west] (example) at (formula.south west) {\footnotesize{$\textbf{C}_2=0.4 \times \textbf{h}(\textrm{``你''}) + 0.4 \times \textbf{h}(\textrm{``什么''}) +$}};
\node [anchor=north west] (example2) at ([yshift=0.4em]example.south west) {\footnotesize{$\ \ \ \ \ \ \ \ 0 \times \textbf{h}(\textrm{``都''}) + 0.1 \times \textbf{h}(\textrm{``没''}) + \ldots$}};
}
......
......@@ -304,8 +304,8 @@
\visible<3->{
\begin{center}
\begin{tikzpicture}
\node [anchor=south west, fill=red, minimum width=1.5cm, minimum height=2.3cm] (mt) at (1,0) {{\color{white} \textbf{机器}}};
\node [anchor=south west, fill=ugreen, minimum width=1.5cm, minimum height=2.7cm] (human) at ([xshift=0.5cm]mt.south east) {{\color{white} \textbf{}}};
\node [anchor=south west, fill=red!50, minimum width=1.5cm, minimum height=2.3cm] (mt) at (1,0) {{\color{white} \textbf{机器}}};
\node [anchor=south west, fill=blue!50, minimum width=1.5cm, minimum height=2.7cm] (human) at ([xshift=0.5cm]mt.south east) {{\color{white} \textbf{人}}};
\node [anchor=south] (mtscore) at (mt.north) {3.9};
\node [anchor=south] (humanscore) at (human.north) {4.7};
\draw [->,thick] ([xshift=-0.5cm]mt.south west) -- ([xshift=0.5cm]human.south east);
......@@ -321,8 +321,8 @@
\visible<4->{
\begin{center}
\begin{tikzpicture}
\node [anchor=south west, fill=red, minimum width=1.5cm, minimum height=1.5cm] (mt) at (1,0) {{\color{white} \textbf{机器}}};
\node [anchor=south west, fill=ugreen, minimum width=1.5cm, minimum height=2.7cm] (human) at ([xshift=0.5cm]mt.south east) {{\color{white} \textbf{}}};
\node [anchor=south west, fill=red!50, minimum width=1.5cm, minimum height=1.5cm] (mt) at (1,0) {{\color{white} \textbf{机器}}};
\node [anchor=south west, fill=blue!50, minimum width=1.5cm, minimum height=2.7cm] (human) at ([xshift=0.5cm]mt.south east) {{\color{white} \textbf{人}}};
\node [anchor=south] (mtscore) at (mt.north) {47\%};
\node [anchor=south] (humanscore) at (human.north) {100\%};
\draw [->,thick] ([xshift=-0.5cm]mt.south west) -- ([xshift=0.5cm]human.south east);
......
......@@ -775,7 +775,7 @@
\end{pgfonlayer}
}
\node [anchor=west,ugreen] (P) at ([xshift=4em,yshift=-0.7em]corpus.east){P($t|s$)};
\node [anchor=west,ugreen] (P) at ([xshift=4em,yshift=-0.7em]corpus.east){P($\mathbf{t}|\mathbf{s}$)};
\node [anchor=south] (modellabel) at (P.north) {{\color{ublue} {\scriptsize \textbf{翻译模型}}}};
\begin{pgfonlayer}{background}
......@@ -808,18 +808,18 @@
\end{tikzpicture}
\item<3-> \textbf{步骤1}:构建单词翻译表 - 翻译词典\\
\small{对于任意的源语言单词$x$,要获得它所有可能的译文$Y$。给定一个互译句对$(s,t)$,对于$y \in Y$,定义$\textrm{P}(x \leftrightarrow y; s, t)$表示$x$$y$$(x,y)$中互译的概率,我们用$x$$y$的联合概率表示:
\small{对于任意的源语言单词$x$,要获得它所有可能的译文$Y$。给定一个互译句对$(\mathbf{s},\mathbf{t})$,对于$y \in Y$,定义$\textrm{P}(x \leftrightarrow y; \mathbf{s}, \mathbf{t})$表示$x$和$y$在$(\mathbf{s},\mathbf{t})$中互译的概率,我们用$(x,y)$的联合概率表示:
\vspace{-2.0em}
\begin{eqnarray}
\textrm{P}(x \leftrightarrow y; s,t) & \equiv & \textrm{P}(x,y;s,t) \nonumber \\
& = & \frac{c(x,y;s,t)}{\sum_{x',y'} c(x',y';s,t)} \nonumber
\textrm{P}(x \leftrightarrow y; \mathbf{s},\mathbf{t}) & \equiv & \textrm{P}(x,y;\mathbf{s},\mathbf{t}) \nonumber \\
& = & \frac{c(x,y;\mathbf{s},\mathbf{t})}{\sum_{x',y'} c(x',y';\mathbf{s},\mathbf{t})} \nonumber
\end{eqnarray}
\vspace{-0.5em}
$c(x,y;s,t)$表示$(x,y)$$(s,t)$中共现的次数; $\sum_{x',y'} c(x',y';s,t)$表示$(s,t)$中任意源/译文单词共现的总次数
$c(x,y;\mathbf{s},\mathbf{t})$表示$(x,y)$在$(\mathbf{s},\mathbf{t})$中共现的次数; $\sum_{x',y'} c(x',y';\mathbf{s},\mathbf{t})$表示$(\mathbf{s},\mathbf{t})$中任意源/译文单词共现的总次数
}
\end{itemize}
......@@ -831,7 +831,7 @@
\begin{frame}{实现一个简单的机器翻译系统:学习单词翻译概率(2)}
\vspace{-1em}
\begin{eqnarray}
\textrm{P}(x,y;s,t) & = & \frac{c(x,y;s,t)}{\sum_{x',y'} c(x',y';s,t)} \nonumber
\textrm{P}(x,y;\mathbf{s},\mathbf{t}) & = & \frac{c(x,y;\mathbf{s},\mathbf{t})}{\sum_{x',y'} c(x',y';\mathbf{s},\mathbf{t})} \nonumber
\end{eqnarray}
\vspace{-0.5em}
......@@ -839,7 +839,7 @@
\begin{flushleft}
\begin{tikzpicture}
\node [anchor=west] (s) at (0,0) {$s=$};
\node [anchor=west] (s) at (0,0) {$\mathbf{s}=$};
\node [anchor=center] (sw1) at ([xshift=1em]s.east) {机器};
\visible<1,4->{
\node [anchor=center] (sw2) at ([xshift=1.3em]sw1.east) {翻译};
......@@ -849,7 +849,7 @@
\node [anchor=center] (sw4) at ([xshift=1.0em]sw3.east) {翻译};
}
\node [anchor=north west] (t) at (s.south west) {$t=$};
\node [anchor=north west] (t) at (s.south west) {$\mathbf{t}=$};
\node [anchor=center] (tw1) at ([xshift=1.8em]t.east) {machine};
\visible<1,3,5->{
\node [anchor=center] (tw2) at ([xshift=2.2em]tw1.east) {translation};
......@@ -880,8 +880,8 @@
\begin{itemize}
\item $c(\textrm{'翻译'},\textrm{'translation'};s,t)=\only<1>{?}\visible<2->{1}\visible<3->{+1}\visible<4->{+1}\visible<5->{+1=4}$
\item<6-> $\sum_{x',y'} c(x',y';s,t)= \textrm{使劲数...} = 63\visible<7->{ = 9 \times 7 = |s| \times |t|}$
\item $c(\textrm{'翻译'},\textrm{'translation'};\mathbf{s},\mathbf{t})=\only<1>{?}\visible<2->{1}\visible<3->{+1}\visible<4->{+1}\visible<5->{+1=4}$
\item<6-> $\sum_{x',y'} c(x',y';\mathbf{s},\mathbf{t})= \textrm{使劲数...} = 63\visible<7->{ = 9 \times 7 = |\mathbf{s}| \times |\mathbf{t}|}$
\vspace{0.3em}
\begin{itemize}
\item<7-> $|\cdot|$表示句子长度
......@@ -889,14 +889,14 @@
\vspace{0.3em}
\item<8-> '翻译'和'translation'的互译概率为
\begin{displaymath}
\textrm{P}(\textrm{'翻译'},\textrm{'translation'};s,t) = 4/63
\textrm{P}(\textrm{'翻译'},\textrm{'translation'};\mathbf{s},\mathbf{t}) = 4/63
\end{displaymath}
\vspace{-0.5em}
类似的
\vspace{-0.5em}
\begin{eqnarray}
\textrm{P}(\textrm{'机器'},\textrm{'translation'};s,t) & = & 2/63 \nonumber \\
\textrm{P}(\textrm{'机器'},\textrm{'look'};s,t) & = & 0/63 \nonumber
\textrm{P}(\textrm{'机器'},\textrm{'translation'};\mathbf{s},\mathbf{t}) & = & 2/63 \nonumber \\
\textrm{P}(\textrm{'机器'},\textrm{'look'};\mathbf{s},\mathbf{t}) & = & 0/63 \nonumber
\end{eqnarray}
\end{itemize}
......@@ -909,12 +909,12 @@
\begin{itemize}
\item 很多时候,我们有多个互译句对$(s^{[1]},t^{[1]}),...,(s^{[n]},t^{[n]})$,称之为\alert{双语平行数据(语料)}。翻译概率可以被定义为
\item 很多时候,我们有多个互译句对$(\mathbf{s}^{[1]},\mathbf{t}^{[1]}),...,(\mathbf{s}^{[n]},\mathbf{t}^{[n]})$,称之为\alert{双语平行数据(语料)}。翻译概率可以被定义为
\vspace{-1em}
\begin{eqnarray}
\textrm{P}(x,y) & = & \frac{\sum_{i=1}^{n}c(x,y;s^{[i]},t^{[i]})}{\sum_{i=1}^{n} \sum_{x',y'} c(x',y';s^{[i]},t^{[i]})} \nonumber
\textrm{P}(x,y) & = & \frac{\sum_{i=1}^{n}c(x,y;\mathbf{s}^{[i]},\mathbf{t}^{[i]})}{\sum_{i=1}^{n} \sum_{x',y'} c(x',y';\mathbf{s}^{[i]},\mathbf{t}^{[i]})} \nonumber
\end{eqnarray}
\item<2-> 说白了就是计算$(x,y)$的频次时,在每个句子上累加
......@@ -922,11 +922,11 @@
\begin{flushleft}
\begin{tikzpicture}
\node [anchor=west] (s1) at (0,0) {$s_1=$ 机器 翻译 就 是 用 计算机 进行 翻译};
\node [anchor=north west] (t1) at ([yshift=0.4em]s1.south west) {$t_1=$ Machine translation is just translation by computer};
\node [anchor=west] (s1) at (0,0) {$\mathbf{s}^{[1]}=$ 机器 翻译 就 是 用 计算机 进行 翻译};
\node [anchor=north west] (t1) at ([yshift=0.4em]s1.south west) {$\mathbf{t}^{[1]}=$ Machine translation is just translation by computer};
\node [anchor=north west] (s2) at (t1.south west) {$s_2=$ 那 人工 翻译 呢 ?};
\node [anchor=north west] (t2) at ([yshift=0.4em]s2.south west) {$t_2=$ So , what is human translation ?};
\node [anchor=north west] (s2) at (t1.south west) {$\mathbf{s}^{[2]}=$ 那 人工 翻译 呢 ?};
\node [anchor=north west] (t2) at ([yshift=0.4em]s2.south west) {$\mathbf{t}^{[2]}=$ So , what is human translation ?};
\end{tikzpicture}
\end{flushleft}
......@@ -936,8 +936,8 @@
{\footnotesize
\begin{eqnarray}
& & \textrm{P}(\textrm{'翻译'},\textrm{'translation'}) \nonumber \\
& = & \frac{c(\textrm{'翻译'},\textrm{'translation'};s^{[1]},t^{[1]})+c(\textrm{'翻译'},\textrm{'translation'};s^{[2]},t^{[2]})}{\sum_{x',y'} c(x',y';s^{[1]},t^{[1]}) + \sum_{x',y'} c(x',y';s^{[2]},t^{[2]})} \nonumber \\
\visible<3->{& = & \frac{4 + 1}{|s^{[1]}| \times |t^{[1]}| + |s^{[2]}| \times |t^{[2]}|} = \frac{4 + 1}{9 \times 7 + 5 \times 7} = \frac{5}{98}} \nonumber
& = & \frac{c(\textrm{'翻译'},\textrm{'translation'};\mathbf{s}^{[1]},\mathbf{t}^{[1]})+c(\textrm{'翻译'},\textrm{'translation'};\mathbf{s}^{[2]},\mathbf{t}^{[2]})}{\sum_{x',y'} c(x',y';\mathbf{s}^{[1]},\mathbf{t}^{[1]}) + \sum_{x',y'} c(x',y';\mathbf{s}^{[2]},\mathbf{t}^{[2]})} \nonumber \\
\visible<3->{& = & \frac{4 + 1}{|\mathbf{s}^{[1]}| \times |\mathbf{t}^{[1]}| + |\mathbf{s}^{[2]}| \times |\mathbf{t}^{[2]}|} = \frac{4 + 1}{9 \times 7 + 5 \times 7} = \frac{5}{98}} \nonumber
\end{eqnarray}
}
......@@ -964,7 +964,7 @@
\end{pgfonlayer}
}
\node [anchor=west,ugreen] (P) at ([xshift=4em,yshift=-0.7em]corpus.east){P($t|s$)};
\node [anchor=west,ugreen] (P) at ([xshift=4em,yshift=-0.7em]corpus.east){P($\mathbf{t}|\mathbf{s}$)};
\node [anchor=south] (modellabel) at (P.north) {{\color{ublue} {\scriptsize \textbf{翻译模型}}}};
\begin{pgfonlayer}{background}
......@@ -992,14 +992,14 @@
\end{center}
\begin{itemize}
\item \textbf{步骤2}: 对任意的句对$(s,t)$计算句子级翻译概率$\textrm{P}(t|s)$ \\
\item \textbf{步骤2}: 对任意的句对$(\mathbf{s},\mathbf{t})$计算句子级翻译概率$\textrm{P}(\mathbf{t}|\mathbf{s})$ \\
\vspace{0.5em}
\visible<2->{
用一种比较简单的思路:定义$(s,t)$上的一种分数$g(s,t)$
用一种比较简单的思路:定义$(\mathbf{s},\mathbf{t})$上的一种分数$g(\mathbf{s},\mathbf{t})$
\begin{itemize}
\item $g(s,t)$的值越大翻译质量越好
\item $g(s,t)$的值越小翻译质量越差
\item $g(\mathbf{s},\mathbf{t})$的值越大翻译质量越好
\item $g(\mathbf{s},\mathbf{t})$的值越小翻译质量越差
\end{itemize}
}
......@@ -1009,10 +1009,10 @@
于是,我们进一步定义
\begin{displaymath}
\textrm{P}(t|s) = \frac{g(s,t)}{\sum_{t'}g(s,t')}
\textrm{P}(\mathbf{t}|\mathbf{s}) = \frac{g(\mathbf{s},\mathbf{t})}{\sum_{\mathbf{t}'}g(\mathbf{s},\mathbf{t}')}
\end{displaymath}
实际上就是对$g(s,t)$在所有可能的译文集合上做归一化,使其具有概率意义
实际上就是对$g(\mathbf{s},\mathbf{t})$在所有可能的译文集合上做归一化,使其具有概率意义
}
\end{itemize}
......@@ -1030,11 +1030,11 @@
\item 两个问题
\begin{enumerate}
\item 如何计算$g(s,t)$ \visible<2->{- 最关键的建模问题,马上开始}
\item 如何计算$\sum_{t'} g(s,t')$ \visible<2->{- 实际上\alert{不用计算},后面再说}
\item 如何计算$g(\mathbf{s},\mathbf{t})$ \visible<2->{- 最关键的建模问题,马上开始}
\item 如何计算$\sum_{\mathbf{t}'} g(\mathbf{s},\mathbf{t}')$ \visible<2->{- 实际上\alert{不用计算},后面再说}
\end{enumerate}
\item<3-> \textbf{$g(s,t)$建模: }根据本章第一页的假设,$s$$t$之间存在一种单词间的对应,我们称之为\alert{词对齐}关系
\item<3-> \textbf{$g(\mathbf{s},\mathbf{t})$建模: }根据本章第一页的假设,$\mathbf{s}$和$\mathbf{t}$之间存在一种单词间的对应,我们称之为\alert{词对齐}关系
\begin{center}
\begin{tikzpicture}
......@@ -1045,7 +1045,7 @@
\node [anchor=west] (s3) at ([xshift=0.5em]s2.east) {\footnotesize{$_3$}};
\node [anchor=west] (s4) at ([xshift=0.5em]s3.east) {感到\footnotesize{$_4$}};
\node [anchor=west] (s5) at ([xshift=0.5em]s4.east) {满意\footnotesize{$_5$}};
\node [anchor=east] (s) at (s1.west) {$s=$};
\node [anchor=east] (s) at (s1.west) {$\mathbf{s}=$};
\end{scope}
\begin{scope}[yshift=-3.0em]
......@@ -1054,7 +1054,7 @@
\node [anchor=west] (t3) at ([xshift=0.3em,yshift=0.1em]t2.east) {satisfied\footnotesize{$_3$}};
\node [anchor=west] (t4) at ([xshift=0.3em]t3.east) {with\footnotesize{$_4$}};
\node [anchor=west] (t5) at ([xshift=0.3em,yshift=-0.2em]t4.east) {you\footnotesize{$_5$}};
\node [anchor=east] (t) at ([xshift=-0.3em]t1.west) {$t=$};
\node [anchor=east] (t) at ([xshift=-0.3em]t1.west) {$\mathbf{t}=$};
\end{scope}
......@@ -1082,9 +1082,9 @@
\begin{frame}{实现一个简单的机器翻译系统:句子级翻译模型(3)}
\begin{itemize}
\item 给定一个句对$(s,t)$,及它们之间的(最优)词对齐$\hat{A}$,可以定义模型得分为:
\item 给定一个句对$(\mathbf{s},\mathbf{t})$,及它们之间的(最优)词对齐$\hat{A}$,可以定义模型得分为:
\begin{displaymath}
g(s,t) \equiv \prod_{(j,i) \in \hat{A}} \textrm{P}(s_j,t_i)
g(\mathbf{s},\mathbf{t}) \equiv \prod_{(j,i) \in \hat{A}} \textrm{P}(s_j,t_i)
\end{displaymath}
显然每个单词翻译概率都高,那么整句的模型得分也高
......@@ -1097,7 +1097,7 @@ g(s,t) \equiv \prod_{(j,i) \in \hat{A}} \textrm{P}(s_j,t_i)
\node [anchor=west] (s3) at ([xshift=0.5em]s2.east) {\footnotesize{$_3$}};
\node [anchor=west] (s4) at ([xshift=0.5em]s3.east) {感到\footnotesize{$_4$}};
\node [anchor=west] (s5) at ([xshift=0.5em]s4.east) {满意\footnotesize{$_5$}};
\node [anchor=east] (s) at (s1.west) {$s=$};
\node [anchor=east] (s) at (s1.west) {$\mathbf{s}=$};
\end{scope}
\begin{scope}[yshift=-3.0em]
......@@ -1106,7 +1106,7 @@ g(s,t) \equiv \prod_{(j,i) \in \hat{A}} \textrm{P}(s_j,t_i)
\node [anchor=west] (t3) at ([xshift=0.3em,yshift=0.1em]t2.east) {satisfied\footnotesize{$_3$}};
\node [anchor=west] (t4) at ([xshift=0.3em]t3.east) {with\footnotesize{$_4$}};
\node [anchor=west] (t5) at ([xshift=0.3em,yshift=-0.2em]t4.east) {you\footnotesize{$_5$}};
\node [anchor=east] (t) at ([xshift=-0.3em]t1.west) {$t=$};
\node [anchor=east] (t) at ([xshift=-0.3em]t1.west) {$\mathbf{t}=$};
\end{scope}
......@@ -1122,7 +1122,7 @@ g(s,t) \equiv \prod_{(j,i) \in \hat{A}} \textrm{P}(s_j,t_i)
\vspace{-2.5em}
\begin{eqnarray}
g(s,t) & = & \textrm{P}(\textrm{'我','I'}) \times \textrm{P}(\textrm{'对','with'}) \times \textrm{P}(\textrm{'你','you'}) \times \nonumber \\
g(\mathbf{s},\mathbf{t}) & = & \textrm{P}(\textrm{'我','I'}) \times \textrm{P}(\textrm{'对','with'}) \times \textrm{P}(\textrm{'你','you'}) \times \nonumber \\
& & \textrm{P}(\textrm{'感到','am'}) \times \textrm{P}(\textrm{'满意','satisfied'}) \nonumber
\end{eqnarray}
......@@ -1141,13 +1141,13 @@ g(s,t) & = & \textrm{P}(\textrm{'我','I'}) \times \textrm{P}(\textrm{'对','with'
\begin{frame}{实现一个简单的机器翻译系统:句子级翻译模型(4)}
\begin{itemize}
\item \textbf{但是},这样设计的$g(s,t)$没有考虑词序的信息。相同译词出现在不同的位置,得分相同 - 无法选择流畅的译文
\item \textbf{但是},这样设计的$g(\mathbf{s},\mathbf{t})$没有考虑词序的信息。相同译词出现在不同的位置,得分相同 - 无法选择流畅的译文
\vspace{0.5em}
\begin{tabular}{l | l }
& \footnotesize{$\prod\limits_{(j,i) \in \hat{A}} \textrm{P}(s_j,t_i)$} \visible<2->{\alert{\footnotesize{$\times\textrm{P}_{lm}(t)$}}} \\ \hline
& \footnotesize{$\prod\limits_{(j,i) \in \hat{A}} \textrm{P}(s_j,t_i)$} \visible<2->{\alert{\footnotesize{$\times\textrm{P}_{lm}(\mathbf{t})$}}} \\ \hline
\begin{tikzpicture}
......@@ -1160,7 +1160,7 @@ g(s,t) & = & \textrm{P}(\textrm{'我','I'}) \times \textrm{P}(\textrm{'对','with'
\node [anchor=west] (s3) at ([xshift=0.5em]s2.east) {\footnotesize{$_3$}};
\node [anchor=west] (s4) at ([xshift=0.5em]s3.east) {感到\footnotesize{$_4$}};
\node [anchor=west] (s5) at ([xshift=0.5em]s4.east) {满意\footnotesize{$_5$}};
\node [anchor=east] (s) at (s1.west) {$s=$};
\node [anchor=east] (s) at (s1.west) {$\mathbf{s}=$};
\end{scope}
\begin{scope}[yshift=-2.6em]
......@@ -1169,7 +1169,7 @@ g(s,t) & = & \textrm{P}(\textrm{'我','I'}) \times \textrm{P}(\textrm{'对','with'
\node [anchor=west] (t3) at ([xshift=0.3em,yshift=0.1em]t2.east) {satisfied\footnotesize{$_3$}};
\node [anchor=west] (t4) at ([xshift=0.3em]t3.east) {with\footnotesize{$_4$}};
\node [anchor=west] (t5) at ([xshift=0.3em,yshift=-0.2em]t4.east) {you\footnotesize{$_5$}};
\node [anchor=east] (t) at ([xshift=-0.3em]t1.west) {$t'=$};
\node [anchor=east] (t) at ([xshift=-0.3em]t1.west) {$\mathbf{t}'=$};
\end{scope}
......@@ -1197,7 +1197,7 @@ g(s,t) & = & \textrm{P}(\textrm{'我','I'}) \times \textrm{P}(\textrm{'对','with'
\node [anchor=west] (s3) at ([xshift=0.5em]s2.east) {\footnotesize{$_3$}};
\node [anchor=west] (s4) at ([xshift=0.5em]s3.east) {感到\footnotesize{$_4$}};
\node [anchor=west] (s5) at ([xshift=0.5em]s4.east) {满意\footnotesize{$_5$}};
\node [anchor=east] (s) at (s1.west) {$s=$};
\node [anchor=east] (s) at (s1.west) {$\mathbf{s}=$};
\end{scope}
\begin{scope}[yshift=-2.6em]
......@@ -1206,7 +1206,7 @@ g(s,t) & = & \textrm{P}(\textrm{'我','I'}) \times \textrm{P}(\textrm{'对','with'
\node [anchor=center] (t3) at ([yshift=-1.7em]s3.south) {you\footnotesize{$_3$}};
\node [anchor=center] (t4) at ([yshift=-1.7em]s4.south) {am\footnotesize{$_4$}};
\node [anchor=center] (t5) at ([yshift=-1.6em]s5.south) {satisfied\footnotesize{$_5$}};
\node [anchor=center] (t) at ([xshift=-1.3em]t1.west) {$t''=$};
\node [anchor=center] (t) at ([xshift=-1.3em]t1.west) {$\mathbf{t}''=$};
\end{scope}
......@@ -1225,7 +1225,7 @@ g(s,t) & = & \textrm{P}(\textrm{'我','I'}) \times \textrm{P}(\textrm{'对','with'
\end{tabular}
\item<2-> \textbf{解决方案}:引入语言模型$\textrm{P}_{lm}(t)$来度量译文的流畅度
\item<2-> \textbf{解决方案}:引入语言模型$\textrm{P}_{lm}(\mathbf{t})$来度量译文的流畅度
$\textrm{P}_{\textrm{2-gram}}(w_1...w_m)=\textrm{P}(w_1) \times \textrm{P}(w_2 | w_1) \times \textrm{P}(w_3 | w_2) ... \times \textrm{P}(w_m | w_{m-1})$
......@@ -1234,7 +1234,7 @@ $\textrm{P}_{\textrm{2-gram}}(w_1...w_m)=\textrm{P}(w_1) \times \textrm{P}(w_2 |
\vspace{-1em}
\begin{displaymath}
g(s,t)=\prod\nolimits_{(j,i) \in \hat{A}} \textrm{P}(s_j,t_i) \times \textrm{P}_{lm}(t)
g(\mathbf{s},\mathbf{t})=\prod\nolimits_{(j,i) \in \hat{A}} \textrm{P}(s_j,t_i) \times \textrm{P}_{lm}(\mathbf{t})
\end{displaymath}
\end{itemize}
......@@ -1260,7 +1260,7 @@ g(s,t)=\prod\nolimits_{(j,i) \in \hat{A}} \textrm{P}(s_j,t_i) \times \textrm{P}_
\end{pgfonlayer}
}
\node [anchor=west,ugreen] (P) at ([xshift=4em,yshift=-0.7em]corpus.east){P($t|s$)};
\node [anchor=west,ugreen] (P) at ([xshift=4em,yshift=-0.7em]corpus.east){P($\mathbf{t}|\mathbf{s}$)};
\node [anchor=south] (modellabel) at (P.north) {{\color{ublue} {\scriptsize \textbf{翻译模型}}}};
\begin{pgfonlayer}{background}
......@@ -1287,16 +1287,16 @@ g(s,t)=\prod\nolimits_{(j,i) \in \hat{A}} \textrm{P}(s_j,t_i) \times \textrm{P}_
\end{center}
\begin{itemize}
\item \textbf{步骤3:解码 - }对任意的$s$,找到翻译概率最大的译文$\hat{t}$
\item \textbf{步骤3:解码 - }对任意的$\mathbf{s}$,找到翻译概率最大的译文$\hat{\mathbf{t}}$
\begin{displaymath}
\hat{t} = \argmax_{t} \textrm{P}(t|s)
\hat{\mathbf{t}} = \argmax_{\mathbf{t}} \textrm{P}(\mathbf{t}|\mathbf{s})
\end{displaymath}
这里$\argmax_{a} f(a)$表示找到使$f(a)$达到最大的$a$输出
这里$\argmax_{\mathbf{a}} f(\mathbf{a})$表示找到使$f(\mathbf{a})$达到最大的$\mathbf{a}$作为输出
\item<2-> 现在我们可以对任意的$(s,t)$计算$\textrm{P}(t|s) = \frac{g(s,t)}{\sum_{t'}g(s,t')}$
\item<2-> 现在我们可以对任意的$(\mathbf{s},\mathbf{t})$计算$\textrm{P}(\mathbf{t}|\mathbf{s}) = \frac{g(\mathbf{s},\mathbf{t})}{\sum_{\mathbf{t}'}g(\mathbf{s},\mathbf{t}')}$
\begin{itemize}
\item 给定$s$$\sum_{t'}g(s,t')$是个常数(因为$\sum_{t'}g(s,t')$的变量只有$s$)
\item 给定$\mathbf{s}$$\sum_{\mathbf{t}'}g(\mathbf{s},\mathbf{t}')$是个常数(因为$\sum_{\mathbf{t}'}g(\mathbf{s},\mathbf{t}')$的变量只有$\mathbf{s}$)
\item \textbf{这样,我们得到解码步骤的形式化描述为}
\end{itemize}
......@@ -1304,8 +1304,8 @@ g(s,t)=\prod\nolimits_{(j,i) \in \hat{A}} \textrm{P}(s_j,t_i) \times \textrm{P}_
\vspace{-1em}
\begin{eqnarray}
\hat{t} & = & \argmax_{t} \frac{g(s,t)}{\sum_{t'}g(s,t')} \nonumber \\
& = & \argmax_{t} g(s,t) \nonumber
\hat{\mathbf{t}} & = & \argmax_{\mathbf{t}} \frac{g(\mathbf{s},\mathbf{t})}{\sum_{\mathbf{t}'}g(\mathbf{s},\mathbf{t}')} \nonumber \\
& = & \argmax_{\mathbf{t}} g(\mathbf{s},\mathbf{t}) \nonumber
\end{eqnarray}
\end{itemize}
......@@ -1317,15 +1317,15 @@ g(s,t)=\prod\nolimits_{(j,i) \in \hat{A}} \textrm{P}(s_j,t_i) \times \textrm{P}_
\begin{frame}{实现一个简单的机器翻译系统:解码(2)}
\vspace{0.5em}
\begin{itemize}
\item \textbf{解码的核心问题}是在所有可能的翻译结果中找到使$g(s,t)$达到最大的译文\\
\item \textbf{解码的核心问题}是在所有可能的翻译结果中找到使$g(\mathbf{s},\mathbf{t})$达到最大的译文\\
\vspace{-1em}
\begin{minipage}[t]{0.58\linewidth}
\begin{itemize}
\item$s$$m$个词,每个词有$n$个翻译候选 - 共有$n^m$种组合
\item$\mathbf{s}$$m$个词,每个词有$n$个翻译候选 - 共有$n^m$种组合
\vspace{-0.5em}
\item<2-> 词的翻译候选可以任意调序
\vspace{-0.5em}
\item<3-> $s$对应可能的译文至少有$n^m \cdot m!$
\item<3-> $\mathbf{s}$对应可能的译文至少有$n^m \cdot m!$
\end{itemize}
\end{minipage}
\hfill
......@@ -1408,8 +1408,8 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
{\scriptsize
\node [anchor=north west,inner sep=2pt,align=left] (line1) at (0,0) {\textrm{\textbf{Function} \textsc{WordDecoding}($s$)}};
\node [anchor=north west,inner sep=2pt,align=left] (line2) at ([yshift=-1pt]line1.south west) {\textrm{1: $\pi = $\textsc{GetTransOptions}($s$)}};
\node [anchor=north west,inner sep=2pt,align=left] (line1) at (0,0) {\textrm{\textbf{Function} \textsc{WordDecoding}($\mathbf{s}$)}};
\node [anchor=north west,inner sep=2pt,align=left] (line2) at ([yshift=-1pt]line1.south west) {\textrm{1: $\pi = $\textsc{GetTransOptions}($\mathbf{s}$)}};
\node [anchor=north west,inner sep=2pt,align=left] (line3) at ([yshift=-1pt]line2.south west) {\textrm{2: $best = \phi$}};
\node [anchor=north west,inner sep=2pt,align=left] (line4) at ([yshift=-1pt]line3.south west) {\textrm{3: \textbf{for} $i$ in $[1,m]$ \textbf{do}}};
\node [anchor=north west,inner sep=2pt,align=left] (line5) at ([yshift=-1pt]line4.south west) {\textrm{4: \hspace{1em} $h = \phi$}};
......@@ -1421,7 +1421,7 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
\node [anchor=north west,inner sep=2pt,align=left] (line11) at ([yshift=-1pt]line10.south west) {\textrm{10: \textbf{return} $best.translation$}};
\node [anchor=south west,inner sep=2pt,align=left] (head1) at ([yshift=1pt]line1.north west) {输出: 找到的最佳译文};
\node [anchor=south west,inner sep=2pt,align=left] (head2) at ([yshift=1pt]head1.north west) {输入: 源语句子$s=s_1...s_m$};
\node [anchor=south west,inner sep=2pt,align=left] (head2) at ([yshift=1pt]head1.north west) {输入: 源语句子$\mathbf{s}=s_1...s_m$};
}
......@@ -1602,8 +1602,8 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
{\tiny
\node [anchor=north west,inner sep=2pt,align=left] (line1) at (0,0) {\textrm{\textbf{Function} \textsc{WordDecoding}($s$)}};
\node [anchor=north west,inner sep=2pt,align=left] (line2) at ([yshift=-3pt]line1.south west) {\textrm{1: $\pi = $\textsc{GetTransOptions}($s$)}};
\node [anchor=north west,inner sep=2pt,align=left] (line1) at (0,0) {\textrm{\textbf{Function} \textsc{WordDecoding}($\mathbf{s}$)}};
\node [anchor=north west,inner sep=2pt,align=left] (line2) at ([yshift=-3pt]line1.south west) {\textrm{1: $\pi = $\textsc{GetTransOptions}($\mathbf{s}$)}};
\node [anchor=north west,inner sep=2pt,align=left] (line3) at ([yshift=-3pt]line2.south west) {\textrm{2: $best = \phi$}};
\node [anchor=north west,inner sep=2pt,align=left] (line4) at ([yshift=-3pt]line3.south west) {\textrm{3: \textbf{for} $i$ in $[1,m]$ \textbf{do}}};
\node [anchor=north west,inner sep=2pt,align=left] (line5) at ([yshift=-3pt]line4.south west) {\textrm{4: \hspace{1em} $h = \phi$}};
......@@ -1620,7 +1620,7 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
}
\node [anchor=south west,inner sep=2pt,align=left] (head1) at ([yshift=3pt]line1.north west) {输出: 找到的最佳译文};
\node [anchor=south west,inner sep=2pt,align=left] (head2) at ([yshift=3pt]head1.north west) {输入: 源语句子$s=s_1...s_m$};
\node [anchor=south west,inner sep=2pt,align=left] (head2) at ([yshift=3pt]head1.north west) {输入: 源语句子$\mathbf{s}=s_1...s_m$};
}
......@@ -1900,7 +1900,7 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
}
\visible<6->{
\node [anchor=north west] (glabel) at (hlabel.south west) {$g(s,t)$};
\node [anchor=north west] (glabel) at (hlabel.south west) {$g(\mathbf{s},\mathbf{t})$};
\node [anchor=west] (translabel) at (glabel.east) {翻译结果};
\draw [-] (glabel.north east) -- ([yshift=-1.9in]glabel.north east);
\draw [-] (glabel.south west) -- ([xshift=3.5in]glabel.south west);
......@@ -2206,7 +2206,7 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
\begin{frame}{机器翻译的统计建模}
\begin{itemize}
\item \textbf{一个人在做翻译时}:对于给定的源语言句子$s$,可以了翻译为一个(或者若干个)正确的译文$\hat{t}$
\item \textbf{一个人在做翻译时}:对于给定的源语言句子$\mathbf{s}$,可以翻译为一个(或者若干个)正确的译文$\hat{\mathbf{t}}$
\begin{itemize}
\item 也就是说除了正确的译文,其它的翻译都是不正确的
\end{itemize}
......@@ -2214,20 +2214,20 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
\begin{center}
\begin{tikzpicture}
\node [draw,red,fill=red!10,thick,anchor=center,circle,inner sep=3.5pt] (s) at (0,0) {\black{$s$}};
\node [draw,ublue,fill=blue!10,thick,anchor=center,circle,inner sep=2pt] (t) at ([xshift=1in]s.east) {\black{$\hat{t}$}};
\node [draw,red,fill=red!10,thick,anchor=center,circle,inner sep=3.5pt] (s) at (0,0) {\black{$\mathbf{s}$}};
\node [draw,ublue,fill=blue!10,thick,anchor=center,circle,inner sep=2pt] (t) at ([xshift=1in]s.east) {\black{$\hat{\mathbf{t}}$}};
\draw [->,thick,] (s.north east) .. controls +(north east:1em) and +(north west:1em).. (t.north west) node[pos=0.5,below] {\tiny{正确翻译}};
\end{tikzpicture}
\end{center}
\item<2-> \textbf{统计机器翻译的思想是}:对于$s$,所有可能的目标语词串$t$都是可能的译文。每一对($s$,$t$)都有一个概率值$\textrm{P}(t|s)$ 来描述$s$ 翻译为$t$的好与坏
\item<2-> \textbf{统计机器翻译的思想是}:对于$\mathbf{s}$,所有可能的目标语词串$\mathbf{t}$都是可能的译文。每一对($\mathbf{s}$,$\mathbf{t}$)都有一个概率值$\textrm{P}(\mathbf{t}|\mathbf{s})$ 来描述$\mathbf{s}$ 翻译为$\mathbf{t}$的好与坏
\begin{center}
\begin{tikzpicture}
\node [draw,red,fill=red!10,thick,anchor=center,circle,inner sep=3.5pt] (s) at (0,0) {\black{$s$}};
\node [draw,red,fill=red!10,thick,anchor=center,circle,inner sep=3.5pt] (s) at (0,0) {\black{$\mathbf{s}$}};
\node [draw,ublue,fill=blue!10,thick,anchor=center,circle,inner sep=2pt] (t1) at ([xshift=1in]s.east) {\black{$t_1$}};
\node [draw,ublue,fill=blue!10,thick,anchor=center,circle,inner sep=2pt] (t2) at ([xshift=3em,yshift=2em]t1.north east) {\black{$t_2$}};
\node [draw,ublue,fill=blue!10,thick,anchor=center,circle,inner sep=2pt] (t3) at ([xshift=1em,yshift=4em]t1.north east) {\black{$t_3$}};
......@@ -2237,10 +2237,10 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
\node [draw,dashed,ublue,fill=blue!10,thick,anchor=center,circle,minimum size=18pt] (t6) at ([xshift=3em]t2.east) {};
\node [draw,dashed,ublue,fill=blue!10,thick,anchor=center,circle,minimum size=18pt] (t7) at ([xshift=3em]t4.east) {};
\draw [->,thick,] (s.north east) .. controls +(north east:1em) and +(north west:1em).. (t1.north west) node[pos=0.5,below] {\tiny{P ($t_1|s$)=0.1}};
\draw [->,thick,] (s.60) .. controls +(50:4em) and +(west:1em).. (t2.west) node[pos=0.5,below] {\tiny{P($t_2|s$)=0.2}};
\draw [->,thick,] (s.north) .. controls +(70:4em) and +(west:1em).. (t3.west) node[pos=0.5,above,xshift=-1em] {\tiny{P($t_3|s$)=0.3}};
\draw [->,thick,] (s.south east) .. controls +(300:3em) and +(south west:1em).. (t4.south west) node[pos=0.5,below] {\tiny{P($t_4|s$)=0.1}};
\draw [->,thick,] (s.north east) .. controls +(north east:1em) and +(north west:1em).. (t1.north west) node[pos=0.5,below] {\tiny{P ($t_1|\mathbf{s}$)=0.1}};
\draw [->,thick,] (s.60) .. controls +(50:4em) and +(west:1em).. (t2.west) node[pos=0.5,below] {\tiny{P($t_2|\mathbf{s}$)=0.2}};
\draw [->,thick,] (s.north) .. controls +(70:4em) and +(west:1em).. (t3.west) node[pos=0.5,above,xshift=-1em] {\tiny{P($t_3|\mathbf{s}$)=0.3}};
\draw [->,thick,] (s.south east) .. controls +(300:3em) and +(south west:1em).. (t4.south west) node[pos=0.5,below] {\tiny{P($t_4|\mathbf{s}$)=0.1}};
\end{tikzpicture}
\end{center}
......@@ -2254,13 +2254,13 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
\begin{frame}{噪声信道模型}
\begin{itemize}
\item \textbf{噪声信道模型}:源语言句子$s$(信宿)是由目标语句子$t$(信源)经过一个有噪声的信道得到的。如果知道了$s$和信道的性质,我们可以通过$\textrm{P}(t|s)$得到可能的信源的概率。\\
\item \textbf{噪声信道模型}:源语言句子$\mathbf{s}$(信宿)是由目标语句子$\mathbf{t}$(信源)经过一个有噪声的信道得到的。如果知道了$\mathbf{s}$和信道的性质,我们可以通过$\textrm{P}(\mathbf{t}|\mathbf{s})$得到可能的信源的概率。\\
\begin{center}
\begin{tikzpicture}
\node [draw,red,fill=red!10,thick,anchor=center,circle,inner sep=3.5pt] (s) at (0,0) {\black{$s$}};
\node [draw,ublue,fill=blue!10,thick,anchor=center,circle,inner sep=3.3pt] (t) at ([xshift=1.5in]s.east) {\black{$t$}};
\node [draw,red,fill=red!10,thick,anchor=center,circle,inner sep=3.5pt] (s) at (0,0) {\black{$\mathbf{s}$}};
\node [draw,ublue,fill=blue!10,thick,anchor=center,circle,inner sep=3.3pt] (t) at ([xshift=1.5in]s.east) {\black{$\mathbf{t}$}};
\draw [<->,thick,] (s.east) -- (t.west) node [pos=0.5,draw,fill=white] {噪声信道};
\node [anchor=east] at (s.west) {\scriptsize{信宿}};
......@@ -2272,13 +2272,13 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
而通过上述过程找到最可能的信源的过程称为\alert{解码}
\begin{displaymath}
\hat{t} = \argmax_{t} \textrm{P}(t|s)
\hat{\mathbf{t}} = \argmax_{\mathbf{t}} \textrm{P}(\mathbf{t}|\mathbf{s})
\end{displaymath}
\item<2-> \textbf{贝叶斯变换}
\begin{eqnarray}
\textrm{P}(t|s) & = & \frac{\textrm{P}(s,t)}{\textrm{P}(s)} \nonumber \\
& = & \frac{\textrm{P}(s|t) \textrm{P}(t)}{\textrm{P}(s)} \nonumber
\textrm{P}(\mathbf{t}|\mathbf{s}) & = & \frac{\textrm{P}(\mathbf{s},\mathbf{t})}{\textrm{P}(\mathbf{s})} \nonumber \\
& = & \frac{\textrm{P}(\mathbf{s}|\mathbf{t}) \textrm{P}(\mathbf{t})}{\textrm{P}(\mathbf{s})} \nonumber
\end{eqnarray}
\end{itemize}
......@@ -2291,26 +2291,26 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
\begin{center}
\begin{tikzpicture}
\node [anchor=west] (p) at (0,0) {$\textrm{P}(t|s)$};
\node [anchor=west] (p) at (0,0) {$\textrm{P}(\mathbf{t}|\mathbf{s})$};
\node [anchor=west] (eqiv) at (p.east) {=};
\node [anchor=south west,inner sep=2pt] (transmodel) at ([yshift=-2pt]eqiv.north east) {$\textrm{P}(s|t)$};
\node [anchor=west,inner sep=2pt] (lmmodel) at ([xshift=2pt]transmodel.east) {$\textrm{P}(t)$};
\node [anchor=north west,inner sep=2pt] (sp) at ([yshift=4pt,xshift=1.5em]eqiv.south east) {$\textrm{P}(s)$};
\node [anchor=south west,inner sep=2pt] (transmodel) at ([yshift=-2pt]eqiv.north east) {$\textrm{P}(\mathbf{s}|\mathbf{t})$};
\node [anchor=west,inner sep=2pt] (lmmodel) at ([xshift=2pt]transmodel.east) {$\textrm{P}(\mathbf{t})$};
\node [anchor=north west,inner sep=2pt] (sp) at ([yshift=4pt,xshift=1.5em]eqiv.south east) {$\textrm{P}(\mathbf{s})$};
\visible<2->{
\node [anchor=south west,fill=red!20,inner sep=2pt] (transmodel) at ([yshift=-2pt]eqiv.north east) {$\textrm{P}(s|t)$};
\node [anchor=west,fill=blue!20,inner sep=2pt] (lmmodel) at ([xshift=2pt]transmodel.east) {$\textrm{P}(t)$};
\node [anchor=north west,fill=green!20,inner sep=2pt] (sp) at ([yshift=4pt,xshift=1.5em]eqiv.south east) {$\textrm{P}(s)$};
\node [anchor=south west,fill=red!20,inner sep=2pt] (transmodel) at ([yshift=-2pt]eqiv.north east) {$\textrm{P}(\mathbf{s}|\mathbf{t})$};
\node [anchor=west,fill=blue!20,inner sep=2pt] (lmmodel) at ([xshift=2pt]transmodel.east) {$\textrm{P}(\mathbf{t})$};
\node [anchor=north west,fill=green!20,inner sep=2pt] (sp) at ([yshift=4pt,xshift=1.5em]eqiv.south east) {$\textrm{P}(\mathbf{s})$};
}
\draw [-] ([yshift=-4pt]transmodel.south west) -- ([yshift=-4pt]lmmodel.south east);
\visible<2->{
\node [anchor=south east,fill=red!20,draw,align=left] (tmmark) at ([xshift=0.5in,yshift=0.4in]p.north) {\footnotesize{给定信源$t$,得到信宿$s$ }\\\footnotesize{的概率,称作\textbf{翻译模型}}};
\node [anchor=south east,fill=red!20,draw,align=left] (tmmark) at ([xshift=0.5in,yshift=0.4in]p.north) {\footnotesize{给定信源$\mathbf{t}$,得到信宿$\mathbf{s}$ }\\\footnotesize{的概率,称作\textbf{翻译模型}}};
\node [anchor=west,fill=blue!20,draw,align=left] (lmmark) at ([xshift=0.5in]tmmark.east) {\footnotesize{信源$t$出现的概率}\\\footnotesize{称作\textbf{语言模型}}};
\node [anchor=west,fill=blue!20,draw,align=left] (lmmark) at ([xshift=0.5in]tmmark.east) {\footnotesize{信源$\mathbf{t}$出现的概率}\\\footnotesize{称作\textbf{语言模型}}};
\node [anchor=west,fill=green!20,draw,align=left] (smark) at ([xshift=0.2in,yshift=-0.3in]lmmodel.east) {\footnotesize{信宿$s$出现的概率}\\\footnotesize{给定$s$$\textrm{P}(s)$\textbf{常量}}};
\node [anchor=west,fill=green!20,draw,align=left] (smark) at ([xshift=0.2in,yshift=-0.3in]lmmodel.east) {\footnotesize{信宿$\mathbf{s}$出现的概率}\\\footnotesize{给定$\mathbf{s}$$\textrm{P}(\mathbf{s})$\textbf{常量}}};
\draw [->,thick] (transmodel.north) .. controls +(north:1.5em) and + (south:1.5em) .. (tmmark.south);
\draw [->,thick] (lmmodel.north) .. controls +(north:1.5em) and + (south:1.3em) .. (lmmark.south);
......@@ -2326,11 +2326,11 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
\vspace{-0.5em}
\begin{eqnarray}
\hat{t} & = & \argmax_{t} \frac{\textrm{P}(s|t) \textrm{P}(t)}{\textrm{P}(s)} \nonumber \\
& = & \argmax_{t} \textrm{P}(s|t) \textrm{P}(t) \nonumber
\hat{\mathbf{t}} & = & \argmax_{\mathbf{t}} \frac{\textrm{P}(\mathbf{s}|\mathbf{t}) \textrm{P}(\mathbf{t})}{\textrm{P}(\mathbf{s})} \nonumber \\
& = & \argmax_{\mathbf{t}} \textrm{P}(\mathbf{s}|\mathbf{t}) \textrm{P}(\mathbf{t}) \nonumber
\end{eqnarray}
即,在所有可能的译文中找到使翻译模型$\textrm{P}(s|t)$和语言模型$\textrm{P}(t)$乘积最大的译文
即,在所有可能的译文中找到使翻译模型$\textrm{P}(\mathbf{s}|\mathbf{t})$和语言模型$\textrm{P}(\mathbf{t})$乘积最大的译文
\end{itemize}
\
......@@ -2344,28 +2344,28 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
\begin{frame}{基本问题}
\begin{displaymath}
\hat{t} = \argmax_{t} \textrm{P}(s|t) \textrm{P}(t)
\hat{\mathbf{t}} = \argmax_{\mathbf{t}} \textrm{P}(\mathbf{s}|\mathbf{t}) \textrm{P}(\mathbf{t})
\end{displaymath}
\begin{itemize}
\item \textbf{三个基本问题}
\begin{enumerate}
\item \textbf{建模}:如何描述计算$\textrm{P}(s|t)$$\textrm{P}(t)$的计算方式
\item \textbf{训练}:如何获得计算$\textrm{P}(s|t)$$\textrm{P}(t)$所需的参数
\item \textbf{建模}:如何描述计算$\textrm{P}(\mathbf{s}|\mathbf{t})$$\textrm{P}(\mathbf{t})$的计算方式
\item \textbf{训练}:如何获得计算$\textrm{P}(\mathbf{s}|\mathbf{t})$$\textrm{P}(\mathbf{t})$所需的参数
\item \textbf{解码}:如何完成搜索最优解的过程$\argmax$
\end{enumerate}
\item<2-> 回忆一下本章开始的实例,是不是有似曾相识的感觉?
\vspace{0.5em}
\begin{center}
\begin{tikzpicture}
\node [anchor=west] (e1) at (0,0) {$g(s,t)$};
\node [anchor=west] (e1) at (0,0) {$g(\mathbf{s},\mathbf{t})$};
\node [anchor=west] (e2) at (e1.east) {$=$};
\node [anchor=west,inner sep=2pt,fill=red!20] (e3) at (e2.east) {$\prod\nolimits_{(j,i) \in \hat{A}} \textrm{P}(s_j,t_i)$};
\node [anchor=west,inner sep=1pt] (e4) at (e3.east) {$\times$};
\node [anchor=west,inner sep=3pt,fill=blue!20] (e5) at (e4.east) {$\textrm{P}_{lm}(t)$};
\node [anchor=north west,inner sep=1pt] (n1) at ([xshift=2.5em,yshift=-1em]e1.south west) {$\textrm{P}(s|t)$};
\node [anchor=west,inner sep=3pt,fill=blue!20] (e5) at (e4.east) {$\textrm{P}_{lm}(\mathbf{t})$};
\node [anchor=north west,inner sep=1pt] (n1) at ([xshift=2.5em,yshift=-1em]e1.south west) {$\textrm{P}(\mathbf{s}|\mathbf{t})$};
\node [anchor=north] (n1part2) at ([yshift=0.3em]n1.south) {\scriptsize{\textbf{翻译模型}}};
\node [anchor=west,inner sep=1pt] (n2) at ([xshift=2em]n1.east) {$\textrm{P}(t)$};
\node [anchor=west,inner sep=1pt] (n2) at ([xshift=2em]n1.east) {$\textrm{P}(\mathbf{t})$};
\node [anchor=north] (n2part2) at ([yshift=0.3em]n2.south) {\scriptsize{\textbf{语言模型}}};
\draw [->,thick] (e3.south) .. controls +(south:1em) and +(north:1em) .. (n1.north);
\draw [->,thick] (e5.south) .. controls +(south:1em) and +(70:1em) .. (n2.north);
......@@ -2389,13 +2389,13 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
\label{ibmmodelingstart}
\begin{itemize}
\item \textbf{$\textrm{P}(t)$和解码}在前面的内容中有介绍,下面重点求解$\textrm{P}(s|t)$,即:
\item \textbf{$\textrm{P}(\mathbf{t})$和解码}在前面的内容中有介绍,下面重点求解$\textrm{P}(\mathbf{s}|\mathbf{t})$,即:
\begin{itemize}
\item \textbf{翻译模型建模} - $\textrm{P}(s|t)$的计算方法
\item \textbf{翻译模型参数估计} - 计算$\textrm{P}(s|t)$所需的参数
\item \textbf{翻译模型建模} - $\textrm{P}(\mathbf{s}|\mathbf{t})$的计算方法
\item \textbf{翻译模型参数估计} - 计算$\textrm{P}(\mathbf{s}|\mathbf{t})$所需的参数
\end{itemize}
\vspace{0.5em}
\item<2-> \textbf{IBM模型的假设}$s=s_1...s_m$$t=t_1...t_n$之间有单词一级的对应,称作\alert{单词对齐}或者\alert{词对齐}。此外:
\item<2-> \textbf{IBM模型的假设}$\mathbf{s}=s_1...s_m$$\mathbf{t}=t_1...t_n$之间有单词一级的对应,称作\alert{单词对齐}或者\alert{词对齐}。此外:
\begin{itemize}
\item \textbf{约束}:一个源语言单词只能对应一个目标语单词
\vspace{0.5em}
......@@ -2462,10 +2462,10 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
%%%------------------------------------------------------------------------------------------------------------
%%% modeling P(s|t)
\begin{frame}{建模 - $\textrm{P}(s|t)$}
\begin{frame}{建模 - $\textrm{P}(\mathbf{s}|\mathbf{t})$}
\begin{itemize}
\item 给定$s$$t$,它们之间的\alert{词对齐}被记为$a=a_1...a_m$
\item 给定$\mathbf{s}$$\mathbf{t}$,它们之间的\alert{词对齐}被记为$\mathbf{a}=a_1...a_m$
\begin{itemize}
\item $a_j$表示第$j$个源语单词$s_j$对应的目标语单词的位置
\begin{center}
......@@ -2486,15 +2486,15 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
\end{tikzpicture}
\end{center}
\end{itemize}
\item<2-> \textbf{\alert{建模!!}}: $\textrm{P}(s|t)$被表示为所有可能的词对齐的生成概率\\
\item<2-> \textbf{\alert{建模!!}}: $\textrm{P}(\mathbf{s}|\mathbf{t})$被表示为所有可能的词对齐的生成概率\\
\vspace{-0.5em}
\begin{displaymath}
\textrm{P}(s|t) = \sum_{a} \textrm{P}(s,a|t)
\textrm{P}(\mathbf{s}|\mathbf{t}) = \sum_{\mathbf{a}} \textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})
\end{displaymath}\\
\vspace{-0.5em}
\visible<3->{
每一种$a$对应一个$\textrm{P}(s,a|t)$
每一种$\mathbf{a}$对应一个$\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})$
\vspace{-0.8em}
\begin{center}
......@@ -2639,7 +2639,7 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
\visible<4->{
\node [anchor=south east,inner sep=0pt] (p) at (t0.north west) {\small{{\color{ugreen} P(}}};
\node [anchor=south west,inner sep=0pt] (p2) at ([yshift=0.2em]t2.north east) {\small{{\color{ugreen} )}}};
\node [anchor=west] (eq) at (p2.east) {\normalsize{= \ P($s|t$)}};
\node [anchor=west] (eq) at (p2.east) {\normalsize{= \ P($\mathbf{s}|\mathbf{t}$)}};
}
}
\end{scope}
......@@ -2652,11 +2652,11 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
%%%------------------------------------------------------------------------------------------------------------
%%% modeling P(s,a|t)
\begin{frame}{建模 - $\textrm{P}(s,a|t)$}
\begin{frame}{建模 - $\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})$}
\begin{itemize}
\item \alert{\textbf{进一步建模!!}}:对于源语句子$s=s_1...s_m$($m$个词)、目标语译文$t=t_0...t_n$($n$个词)和词对齐$a=a_1...a_m$,按如下方式计算$\textrm{P}(s,a|t)$
\item \alert{\textbf{进一步建模!!}}:对于源语句子$\mathbf{s}=s_1...s_m$($m$个词)、目标语译文$\mathbf{t}=t_0...t_n$($n$个词)和词对齐$\mathbf{a}=a_1...a_m$,按如下方式计算$\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})$
\begin{itemize}
\item 符号定义:$s_x^y=s_x...s_y$, $a_x^y=a_x...a_y$
......@@ -2668,23 +2668,23 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
\begin{center}
\begin{tikzpicture}
\node [anchor=west] (eq1) at (0,0) {$\textrm{P}(s,a|t)=$};
\node [anchor=west,inner sep=1pt,minimum height=2.64em] (eq2) at (eq1.east) {$\textrm{P}(m|t)$};
\node [anchor=west] (eq1) at (0,0) {$\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})=$};
\node [anchor=west,inner sep=1pt,minimum height=2.64em] (eq2) at (eq1.east) {$\textrm{P}(m|\mathbf{t})$};
\node [anchor=west,inner sep=1pt,minimum height=2.64em] (eq3) at ([xshift=1pt]eq2.east) {$\prod\limits_{j=1}^{m}$};
\node [anchor=west,inner sep=1pt,minimum height=2.64em] (eq4) at ([xshift=1pt]eq3.east) {$\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,t)$};
\node [anchor=west,inner sep=1pt,minimum height=2.64em] (eq5) at ([xshift=1pt]eq4.east) {$\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,t)$};
\node [anchor=west,inner sep=1pt,minimum height=2.64em] (eq4) at ([xshift=1pt]eq3.east) {$\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,\mathbf{t})$};
\node [anchor=west,inner sep=1pt,minimum height=2.64em] (eq5) at ([xshift=1pt]eq4.east) {$\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,\mathbf{t})$};
\visible<2->{
\node [anchor=west,inner sep=1pt,minimum height=2.64em,fill=red!20] (eq2) at (eq1.east) {$\textrm{P}(m|t)$};
\node [anchor=west,inner sep=1pt,minimum height=2.64em,fill=red!20] (eq2) at (eq1.east) {$\textrm{P}(m|\mathbf{t})$};
}
\visible<3->{
\node [anchor=west,inner sep=1pt,minimum height=2.64em,fill=blue!20] (eq3) at ([xshift=1pt]eq2.east) {$\prod\limits_{j=1}^{m}$};
}
\visible<4->{
\node [anchor=west,inner sep=1pt,minimum height=2.64em,fill=green!20] (eq4) at ([xshift=1pt]eq3.east) {$\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,t)$};
\node [anchor=west,inner sep=1pt,minimum height=2.64em,fill=green!20] (eq4) at ([xshift=1pt]eq3.east) {$\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,\mathbf{t})$};
}
\visible<5->{
\node [anchor=west,inner sep=1pt,minimum height=2.64em,fill=purple!20] (eq5) at ([xshift=1pt]eq4.east) {$\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,t)$};
\node [anchor=west,inner sep=1pt,minimum height=2.64em,fill=purple!20] (eq5) at ([xshift=1pt]eq4.east) {$\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,\mathbf{t})$};
}
\visible<2->{
......@@ -2706,12 +2706,12 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
\vspace{-0.0em}
\begin{itemize}
\item<2-> \textbf{生成模型}:给定译文$t$生成源文$s$和对齐$a$
\item<2-> \textbf{生成模型}:给定译文$\mathbf{t}$生成源文$\mathbf{s}$和对齐$\mathbf{a}$
\begin{enumerate}
\item<2-> 根据译文$t$选择源文的长度$m$
\item<2-> 根据译文$\mathbf{t}$选择源文的长度$m$
\item<3-> 循环源文的每个位置$j$
\item<4-> 根据译文$t$、源文长度$m$、已经生成的源语单词$s_{1}^{j-1}$和对齐$a_{1}^{j-1}$,生成第$j$个位置的对齐结果$a_j$
\item<5-> 根据译文$t$、源文长度$m$、已经生成的源语单词$s_{1}^{j-1}$和对齐$a_{1}^{j}$,生成第$j$个位置的源语言单词$s_j$(注意:这时$a_j$已经生成了)
\item<4-> 根据译文$\mathbf{t}$、源文长度$m$、已经生成的源语单词$s_{1}^{j-1}$和对齐$a_{1}^{j-1}$,生成第$j$个位置的对齐结果$a_j$
\item<5-> 根据译文$\mathbf{t}$、源文长度$m$、已经生成的源语单词$s_{1}^{j-1}$和对齐$a_{1}^{j}$,生成第$j$个位置的源语言单词$s_j$(注意:这时$a_j$已经生成了)
\end{enumerate}
\end{itemize}
......@@ -2721,9 +2721,9 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
%%%------------------------------------------------------------------------------------------------------------
%%% generation of s and a given t - a running example
\begin{frame}{实例 - $\textrm{P}(s,a|t)$}
\begin{frame}{实例 - $\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})$}
$s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-1\}
$\mathbf{s}$ = 在 桌子 上 \ \ \ \ \ $\mathbf{t}$ = $t_0$ on the table \ \ \ \ \ $\mathbf{a}$ = \{1-0,2-3,3-1\}
\begin{center}
\begin{tikzpicture}
......@@ -2789,7 +2789,7 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
{\small
\begin{eqnarray}
\textrm{P}(s,a|t) & = & \textrm{P}(m|t) \prod\limits_{j=1}^{m} \textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,t) \textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,t) \nonumber \\
\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t}) & = & \textrm{P}(m|\mathbf{t}) \prod\limits_{j=1}^{m} \textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,\mathbf{t}) \textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,\mathbf{t}) \nonumber \\
& \visible<2->{=} & \visible<2->{\textrm{P}(m=3 \mid \textrm{'$t_0$ on the table'})} \visible<3->{\times} \nonumber \\
& & \visible<3->{\textrm{P}(a_1=0 \mid \phi,\phi,3,\textrm{'$t_0$ on the table'})} \visible<4->{\times} \nonumber \\
& & \visible<4->{\textrm{P}(f_1=\textrm{} \mid \textrm{\{1-0\}},\phi,3,\textrm{'$t_0$ on the table'})} \visible<5->{\times} \nonumber \\
......@@ -2813,14 +2813,14 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\vspace{-1.0em}
\begin{eqnarray}
\textrm{P}(s|t) & = & \sum_{a} \textrm{P}(s,a|t) \nonumber \\
\textrm{P}(s,a|t) & = & \textrm{P}(m|t) \prod\limits_{j=1}^{m} \textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,t) \textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,t) \nonumber
\textrm{P}(\mathbf{s}|\mathbf{t}) & = & \sum_{\mathbf{a}} \textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t}) \nonumber \\
\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t}) & = & \textrm{P}(m|\mathbf{t}) \prod\limits_{j=1}^{m} \textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,\mathbf{t}) \textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,\mathbf{t}) \nonumber
\end{eqnarray}
\item \textbf{两个严重问题}
\begin{enumerate}
\item 第一个公式:如何遍历所有的对齐$a$
\item 第二个公式:如何计算$\textrm{P}(m|t)$$\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,t)$$\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,t)$
\item 第一个公式:如何遍历所有的对齐$\mathbf{a}$
\item 第二个公式:如何计算$\textrm{P}(m|\mathbf{t})$$\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,\mathbf{t})$$\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,\mathbf{t})$
\end{enumerate}
\item<2-> Brown等人(1993)的解决方法:对问题进行化简
......@@ -2848,21 +2848,21 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\begin{enumerate}
\item 源语长度概率为常数$\epsilon$
\begin{displaymath}
\textrm{P}(m|t) \equiv \epsilon
\textrm{P}(m|\mathbf{t}) \equiv \epsilon
\end{displaymath}
\item 对齐概率$\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,t)$仅依赖于译文长度$l+1$(均匀分布)
\item 对齐概率$\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,\mathbf{t})$仅依赖于译文长度$l+1$(均匀分布)
\begin{displaymath}
\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,t) \equiv \frac{1}{l+1}
\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,\mathbf{t}) \equiv \frac{1}{l+1}
\end{displaymath}
\item 源语单词$s_j$生成概率$\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,t)$仅依赖与其对齐的译文单词$t_{a_j}$,即词汇翻译概率$f(s_j|t_{a_j})$ ($\sum_{s_j} f(s_j|t_{a_j}) = 1$)
\item 源语单词$s_j$生成概率$\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,\mathbf{t})$仅依赖与其对齐的译文单词$t_{a_j}$,即词汇翻译概率$f(s_j|t_{a_j})$ ($\sum_{s_j} f(s_j|t_{a_j}) = 1$)
\begin{displaymath}
\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,t) \equiv f(s_j|t_{a_j})
\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,\mathbf{t}) \equiv f(s_j|t_{a_j})
\end{displaymath}
\end{enumerate}
\item<2-> \textbf{核心思想是}把复杂参数化简为简单参数
\begin{itemize}
\item 比如:$\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,t) \equiv \frac{1}{l-1}$把参数空间$(a_{1}^{j},s_{1}^{j-1},m,t)$化简为$l$
\item 比如:$\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,\mathbf{t}) \equiv \frac{1}{l+1}$把参数空间$(a_{1}^{j},s_{1}^{j-1},m,\mathbf{t})$化简为$l$
\item \alert{优点}: 模型大大化简;\alert{缺点}:化简导致模型不准确
\end{itemize}
......@@ -2882,12 +2882,12 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\begin{center}
\begin{tikzpicture}
\node [anchor=west] (eq1) at (0,0) {$\textrm{P}(s,a|t)$};
\node [anchor=west] (eq1) at (0,0) {$\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})$};
\node [anchor=west] (eq1part2) at (eq1.east) {$=$};
\node [anchor=west,inner sep=1pt] (eq2) at (eq1part2.east) {$\textrm{P}(m|t)$};
\node [anchor=west,inner sep=1pt] (eq3) at ([xshift=1pt]eq2.east) {$\prod\limits_{j=1}^{m}$};
\node [anchor=west,inner sep=1pt] (eq4) at ([xshift=1pt]eq3.east) {$\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,t)$};
\node [anchor=west,inner sep=1pt] (eq5) at ([xshift=1pt]eq4.east) {$\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,t)$};
\node [anchor=west,inner sep=1pt] (eq2) at (eq1part2.east) {$\textrm{P}(m|\mathbf{t})$};
\node [anchor=west,inner sep=1pt] (eq3) at ([xshift=0pt]eq2.east) {$\prod\limits_{j=1}^{m}$};
\node [anchor=west,inner sep=1pt] (eq4) at ([xshift=0pt]eq3.east) {$\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,\mathbf{t})$};
\node [anchor=west,inner sep=1pt] (eq5) at ([xshift=0pt]eq4.east) {$\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,\mathbf{t})$};
\node [anchor=east,rotate=90] (yes1) at (eq2.south) {$\equiv$};
\node [anchor=east,rotate=90] (yes2) at (eq4.south) {$\equiv$};
......@@ -2906,13 +2906,13 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\end{tikzpicture}
\end{center}
\item<2-> \textbf{将上式代入$\textrm{P}(s|t)=\sum_a \textrm{P}(s,a|t)$}
\item<2-> \textbf{将上式代入$\textrm{P}(\mathbf{s}|\mathbf{t})=\sum_{\mathbf{a}} \textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})$}
\vspace{-1.0em}
\begin{eqnarray}
\textrm{P}(s|t) & = & \sum\limits_{a} \textrm{P}(s,a|t) \nonumber \\
& = & \sum\limits_{a} \frac{\epsilon}{(l+1)^{m}} \prod\limits_{j=1}^{m} f(s_j|t_{a_j}) \nonumber
\textrm{P}(\mathbf{s}|\mathbf{t}) & = & \sum\limits_{\mathbf{a}} \textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t}) \nonumber \\
& = & \sum\limits_{\mathbf{a}} \frac{\epsilon}{(l+1)^{m}} \prod\limits_{j=1}^{m} f(s_j|t_{a_j}) \nonumber
\end{eqnarray}
\end{itemize}
......@@ -2925,14 +2925,14 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\begin{itemize}
\item $\textrm{P}(s|t) = \sum\limits_{a} \frac{\epsilon}{(l+1)^{m}} \prod\limits_{j=1}^{m} f(s_j|t_{a_j})$中需要对遍历所有的对齐,即$\sum_{a}$。这个过程可以被重新表示为
\item $\textrm{P}(\mathbf{s}|\mathbf{t}) = \sum\limits_{\mathbf{a}} \frac{\epsilon}{(l+1)^{m}} \prod\limits_{j=1}^{m} f(s_j|t_{a_j})$中需要遍历所有的对齐,即$\sum_{\mathbf{a}}$。这个过程可以被重新表示为
\vspace{-0.5em}
\begin{center}
\begin{tikzpicture}
\node [anchor=west] (eq1) at (0,0) {$\textrm{P}(s|t)$};
\node [anchor=west] (eq1) at (0,0) {$\textrm{P}(\mathbf{s}|\mathbf{t})$};
\node [anchor=west] (eq2) at (eq1.east) {$=$};
\node [anchor=west,inner sep=2pt] (eq3) at (eq2.east) {$\sum\limits_{a_1=0}^{l}$};
\node [anchor=west,inner sep=0] (eq4) at (eq3.east) {...};
......@@ -2968,7 +2968,7 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\end{center}
\begin{enumerate}
\item<2-> 遍历所有的的对齐$a$$a$\{$a_1$,...,$a_m$\}组成,每个$a_j \in \{a_1,...,a_m\}$从第译文开始位置(0)循环到截止位置($l$)
\item<2-> 遍历所有的对齐$\mathbf{a}$。$\mathbf{a}$由\{$a_1$,...,$a_m$\}组成,每个$a_j \in \{a_1,...,a_m\}$从译文开始位置(0)循环到截止位置($l$)
\vspace{0.5em}
\begin{center}
......@@ -3003,7 +3003,7 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\end{center}
\vspace{0.5em}
\item<5-> 对于每个$a$累加对齐概率$\textrm{P}(s,a|t)$
\item<5-> 对于每个$\mathbf{a}$累加对齐概率$\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})$
\end{enumerate}
\end{itemize}
......@@ -3026,28 +3026,28 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\begin{scope}
\node [anchor=west] (s1) at (0,0) {$s$ = 在\ \ 桌子\ \ };
\node [anchor=west] (t1) at ([yshift=-2em]s1.west) {$t$ = on\ \ the\ \ table};
\node [anchor=west] (s1) at (0,0) {$\mathbf{s}$ = 在\ \ 桌子\ \ };
\node [anchor=west] (t1) at ([yshift=-2em]s1.west) {$\mathbf{t}$ = on\ \ the\ \ table};
\draw [->,double,thick,ublue] ([yshift=0.2em]s1.south) -- ([yshift=-0.8em]s1.south);
\end{scope}
\begin{scope}[xshift=1.5in]
\node [anchor=west] (s2) at (0,0) {$s$ = 在\ \ 桌子\ \ };
\node [anchor=west] (t2) at ([yshift=-2em]s2.west) {$t'$ = table \ on\ \ the};
\node [anchor=west] (s2) at (0,0) {$\mathbf{s}$ = 在\ \ 桌子\ \ };
\node [anchor=west] (t2) at ([yshift=-2em]s2.west) {$\mathbf{t}'$ = table \ on\ \ the};
\draw [->,double,thick,ublue] ([yshift=0.2em]s2.south) -- ([yshift=-0.8em]s2.south);
\end{scope}
\node [anchor=north] (score11) at ([yshift=-2.0em]s1.south) {$\textrm{P}(s|t)$};
\node [anchor=north] (score12) at ([yshift=-2.0em]s2.south) {$\textrm{P}(s|t')$};
\node [anchor=north] (score11) at ([yshift=-2.0em]s1.south) {$\textrm{P}(\mathbf{s}|\mathbf{t})$};
\node [anchor=north] (score12) at ([yshift=-2.0em]s2.south) {$\textrm{P}(\mathbf{s}|\mathbf{t}')$};
\node [anchor=west] (comp1) at ([xshift=2.3em]score11.east) {\large{$\mathbf{=}$}};
\node [anchor=east] (label1) at ([xshift=-1em,yshift=0.1em]score11.west) {\textbf{IBM模型1:}};
\visible<2->{
\node [anchor=north] (score21) at ([yshift=0.2em]score11.south) {$\textrm{P}(s|t)$};
\node [anchor=north] (score22) at ([yshift=0.2em]score12.south) {$\textrm{P}(s|t')$};
\node [anchor=north] (score21) at ([yshift=0.2em]score11.south) {$\textrm{P}(\mathbf{s}|\mathbf{t})$};
\node [anchor=north] (score22) at ([yshift=0.2em]score12.south) {$\textrm{P}(\mathbf{s}|\mathbf{t}')$};
\node [anchor=west] (comp2) at ([xshift=2.3em]score21.east) {\large{$\mathbf{>}$}};
\node [anchor=east] (label2) at ([xshift=-1em,yshift=0.1em]score21.west) {\textbf{理想:}};
}
......@@ -3064,7 +3064,7 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,t) \equiv a(a_j|j,m,l)
\end{displaymath}
其它假设与IBM模型1相同,即$\textrm{P}(m|t) \equiv \epsilon$$\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,t) \equiv f(s_j|t_{a_j})$
其它假设与IBM模型1相同,即$\textrm{P}(m|\mathbf{t}) \equiv \epsilon$$\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,\mathbf{t}) \equiv f(s_j|t_{a_j})$
\end{itemize}
......@@ -3083,12 +3083,12 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\begin{center}
\begin{tikzpicture}
\node [anchor=west] (eq1) at (0,0) {$\textrm{P}(s,a|t)$};
\node [anchor=west] (eq1) at (0,0) {$\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})$};
\node [anchor=west] (eq1part2) at (eq1.east) {$=$};
\node [anchor=west,inner sep=1pt] (eq2) at (eq1part2.east) {$\textrm{P}(m|t)$};
\node [anchor=west,inner sep=1pt] (eq3) at ([xshift=1pt]eq2.east) {$\prod\limits_{j=1}^{m}$};
\node [anchor=west,inner sep=1pt] (eq4) at ([xshift=1pt]eq3.east) {$\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,t)$};
\node [anchor=west,inner sep=1pt] (eq5) at ([xshift=1pt]eq4.east) {$\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,t)$};
\node [anchor=west,inner sep=1pt] (eq2) at (eq1part2.east) {$\textrm{P}(m|\mathbf{t})$};
\node [anchor=west,inner sep=1pt] (eq3) at ([xshift=0pt]eq2.east) {$\prod\limits_{j=1}^{m}$};
\node [anchor=west,inner sep=1pt] (eq4) at ([xshift=0pt]eq3.east) {$\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,\mathbf{t})$};
\node [anchor=west,inner sep=1pt] (eq5) at ([xshift=0pt]eq4.east) {$\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,\mathbf{t})$};
\node [anchor=east,rotate=90] (yes1) at (eq2.south) {$\equiv$};
\node [anchor=east,rotate=90] (yes2) at (eq4.south) {$\equiv$};
......@@ -3104,14 +3104,14 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\end{tikzpicture}
\end{center}
\item<2-> \textbf{将上式代入$\textrm{P}(s|t)=\sum_a \textrm{P}(s,a|t)$}
\item<2-> \textbf{将上式代入$\textrm{P}(\mathbf{s}|\mathbf{t})=\sum_{\mathbf{a}} \textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})$}
\vspace{-0.5em}
\begin{center}
\begin{tikzpicture}
\node [anchor=west] (eq1) at (0,0) {$\textrm{P}(s|t)$};
\node [anchor=west] (eq1) at (0,0) {$\textrm{P}(\mathbf{s}|\mathbf{t})$};
\node [anchor=west] (eq2) at (eq1.east) {$=$};
\node [anchor=west,inner sep=2pt] (eq3) at (eq2.east) {$\sum\limits_{a_1=0}^{l}$};
\node [anchor=west,inner sep=0] (eq4) at ([xshift=-0.2em]eq3.east) {...};
......@@ -3142,8 +3142,8 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\vspace{-0.5em}
\begin{enumerate}
\item 遍历所有的的对齐$a$
\item 对于每个$a$累加对齐概率$\textrm{P}(s,a|t)$,即计算$\epsilon \prod\limits_{j=1}^{m} a(a_j|j,m,l) f(s_j|t_{a_j})$
\item 遍历所有的对齐$\mathbf{a}$
\item 对于每个$\mathbf{a}$累加对齐概率$\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})$,即计算$\epsilon \prod\limits_{j=1}^{m} a(a_j|j,m,l) f(s_j|t_{a_j})$
\end{enumerate}
\end{itemize}
......@@ -3158,8 +3158,8 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\vspace{-1.0em}
\begin{eqnarray}
\textrm{\textbf{IBM模型1}}: \textrm{P}(s|t) & = & \frac{\epsilon}{(l+1)^{m}} \sum\limits_{a_1=0}^{l} ... \sum\limits_{a_m=0}^{l} \prod\limits_{j=1}^{m} f(s_j|t_{a_j}) \nonumber \\
\textrm{\textbf{IBM模型2}}: \textrm{P}(s|t) & = & \epsilon \sum\limits_{a_1=0}^{l} ... \sum\limits_{a_m=0}^{l} \prod\limits_{j=1}^{m} a(a_j|j,m,l) f(s_j|t_{a_j}) \nonumber
\textrm{\textbf{IBM模型1}}: \textrm{P}(\mathbf{s}|\mathbf{t}) & = & \frac{\epsilon}{(l+1)^{m}} \sum\limits_{a_1=0}^{l} ... \sum\limits_{a_m=0}^{l} \prod\limits_{j=1}^{m} f(s_j|t_{a_j}) \nonumber \\
\textrm{\textbf{IBM模型2}}: \textrm{P}(\mathbf{s}|\mathbf{t}) & = & \epsilon \sum\limits_{a_1=0}^{l} ... \sum\limits_{a_m=0}^{l} \prod\limits_{j=1}^{m} a(a_j|j,m,l) f(s_j|t_{a_j}) \nonumber
\end{eqnarray}
\begin{itemize}
......@@ -3227,17 +3227,17 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\vspace{-1.0em}
\begin{eqnarray}
\textrm{\textbf{IBM模型1}}: \textrm{P}(s|t) & = & \frac{\epsilon}{(l+1)^{m}} \sum\limits_{a_1=0}^{l} ... \sum\limits_{a_m=0}^{l} \prod\limits_{j=1}^{m} f(s_j|t_{a_j}) \nonumber \\
\textrm{\textbf{IBM模型2}}: \textrm{P}(s|t) & = & \epsilon \sum\limits_{a_1=0}^{l} ... \sum\limits_{a_m=0}^{l} \prod\limits_{j=1}^{m} a(a_j|j,m,l) f(s_j|t_{a_j}) \nonumber
\textrm{\textbf{IBM模型1}}: \textrm{P}(\mathbf{s}|\mathbf{t}) & = & \frac{\epsilon}{(l+1)^{m}} \sum\limits_{a_1=0}^{l} ... \sum\limits_{a_m=0}^{l} \prod\limits_{j=1}^{m} f(s_j|t_{a_j}) \nonumber \\
\textrm{\textbf{IBM模型2}}: \textrm{P}(\mathbf{s}|\mathbf{t}) & = & \epsilon \sum\limits_{a_1=0}^{l} ... \sum\limits_{a_m=0}^{l} \prod\limits_{j=1}^{m} a(a_j|j,m,l) f(s_j|t_{a_j}) \nonumber
\end{eqnarray}
\begin{spacing}{1.2}
\begin{itemize}
\item 对于翻译模型$\textrm{P}(s|t)$,再来回顾一下统计机器翻译的三个基本问题
\item 对于翻译模型$\textrm{P}(\mathbf{s}|\mathbf{t})$,再来回顾一下统计机器翻译的三个基本问题
\begin{enumerate}
\item \textbf{建模}:如何描述$\textrm{P}(s|t)$ \visible<2->{\alert{$\gets$ 已解!见上面两个公式}}
\item \textbf{解码}:给定模型参数$\epsilon$$a(a_j|j,m,l)$$f(s_j|t_{a_j})$,如何利用上面的公式计算$\textrm{P}(s|t)$(语言模型计算暂不讨论),并找到最佳译文$\hat{t}$ \visible<2->{\alert{$\gets$ 下面讨论}}
\item \textbf{建模}:如何描述$\textrm{P}(\mathbf{s}|\mathbf{t})$ \visible<2->{\alert{$\gets$ 已解!见上面两个公式}}
\item \textbf{解码}:给定模型参数$\epsilon$$a(a_j|j,m,l)$$f(s_j|t_{a_j})$,如何利用上面的公式计算$\textrm{P}(\mathbf{s}|\mathbf{t})$(语言模型计算暂不讨论),并找到最佳译文$\hat{\mathbf{t}}$ \visible<2->{\alert{$\gets$ 下面讨论}}
\item \textbf{训练}:如何从数据中自动学习模型参数$\epsilon$、$a(a_j|j,m,l)$和$f(s_j|t_{a_j})$ \visible<2->{\alert{$\gets$ 下面讨论}}
\end{enumerate}
......@@ -3281,7 +3281,7 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\begin{center}
\begin{tikzpicture}
\node [anchor=west] (model) at (0,0) {\footnotesize{$\textrm{P}(s|t) = \epsilon \sum\limits_{a_1=0}^{l} ... \sum\limits_{a_m=0}^{l} \prod\limits_{j=1}^{m} a(a_j|j,m,l) f(s_j|t_{a_j})$}};
\node [anchor=west] (model) at (0,0) {\footnotesize{$\textrm{P}(\mathbf{s}|\mathbf{t}) = \epsilon \sum\limits_{a_1=0}^{l} ... \sum\limits_{a_m=0}^{l} \prod\limits_{j=1}^{m} a(a_j|j,m,l) f(s_j|t_{a_j})$}};
\node [anchor=east] (modellabel) at ([yshift=0.1em]model.west) {\footnotesize{\textbf{问题的统计描述:}}};
\node [anchor=north west] (paras) at (model.south west) {\footnotesize{$\epsilon = ?;\ \ \forall a_j,j,m,l: a(a_j|j,m,l) = ?, f(s_j|t_{a_j}) = ?$}};
\node [anchor=east] (paraslabel) at ([yshift=0.1em]paras.west) {\footnotesize{\textbf{模型的参数:}}};
......@@ -3300,9 +3300,9 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\begin{center}
\begin{tikzpicture}
\node [anchor=west] (scoring) at (0,0) {\footnotesize{对任意的$s$$t$,(高效地)计算$\textrm{P}(s|t)$(同时计算$\textrm{P}(t)$)}};
\node [anchor=west] (scoring) at (0,0) {\footnotesize{对任意的$\mathbf{s}$$\mathbf{t}$,(高效地)计算$\textrm{P}(\mathbf{s}|\mathbf{t})$(同时计算$\textrm{P}(\mathbf{t})$)}};
\node [anchor=east] (scoringlabel) at ([yshift=0.1em]scoring.west) {\footnotesize{\textbf{模型得分计算:}}};
\node [anchor=north west] (search) at (scoring.south west) {\footnotesize{对所有可能的$t$,找到模型得分($\textrm{P}(s|t)\textrm{P}(t)$)最高}};
\node [anchor=north west] (search) at (scoring.south west) {\footnotesize{对所有可能的$\mathbf{t}$,找到模型得分($\textrm{P}(\mathbf{s}|\mathbf{t})\textrm{P}(\mathbf{t})$)最高}};
\node [anchor=north west] (searchpart2) at ([yshift=0.3em]search.south west) {\footnotesize{的译文输出}};
\node [anchor=east] (searchlabel) at ([yshift=0.1em]search.west) {\footnotesize{\textbf{搜索:}}};
......@@ -3317,7 +3317,7 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\begin{itemize}
\item \textbf{搜索(解码)问题}前面的实例已经描述了一种解法(见{\color{ublue} \hyperref[simpledecodingalgorithm]{\hspace{-0.2em}~\ref{simpledecodingalgorithm}}}):自左向右添加译文单词 + 剪枝技术。这里不再讨论,可以自行学习
\item \textbf{剩下的问题是}:对于任意的$s$$t$,如何\alert{高效地}计算$\textrm{P}(s|t)$
\item \textbf{剩下的问题是}:对于任意的$\mathbf{s}$$\mathbf{t}$,如何\alert{高效地}计算$\textrm{P}(\mathbf{s}|\mathbf{t})$
\end{itemize}
\end{itemize}
......@@ -3333,7 +3333,7 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\item $O((l+1)^m \cdot m)$ - IBM模型得分的直接计算几乎不可能!
\begin{displaymath}
\textrm{P}(s|t) = \frac{\epsilon}{(l+1)^{m}} \underbrace{\sum\limits_{a_1=0}^{l} ... \sum\limits_{a_m=0}^{l}}_{(l+1)^m\textrm{次循环}} \underbrace{\prod\limits_{j=1}^{m} f(s_j|t_{a_j})}_{m\textrm{次循环}}
\textrm{P}(\mathbf{s}|\mathbf{t}) = \frac{\epsilon}{(l+1)^{m}} \underbrace{\sum\limits_{a_1=0}^{l} ... \sum\limits_{a_m=0}^{l}}_{(l+1)^m\textrm{次循环}} \underbrace{\prod\limits_{j=1}^{m} f(s_j|t_{a_j})}_{m\textrm{次循环}}
\end{displaymath}
\item<2-> $O(l \cdot m)$ - 实际上我们可以做的更好
......@@ -3355,7 +3355,7 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
}
\visible<3->{
\node [anchor=west] (eq2) at ([xshift=5em,yshift=-4.5em]eq1.west) {$\textrm{P}(s|t) = \frac{\epsilon}{(l+1)^{m}} $};
\node [anchor=west] (eq2) at ([xshift=5em,yshift=-4.5em]eq1.west) {$\textrm{P}(\mathbf{s}|\mathbf{t}) = \frac{\epsilon}{(l+1)^{m}} $};
\node [anchor=west,inner sep=2pt] (eq2part2) at ([xshift=-0.3em]eq2.east) {$\prod\limits_{j=1}^{m} \sum\limits_{i=0}^{l} f(s_j|t_i)$};
\node [anchor=east] (eq2label) at ([xshift=-0em,yshift=0.2em]eq2.west) {\small{IBM模型1:}};
......@@ -3363,7 +3363,7 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
}
\visible<4->{
\node [anchor=west] (eq3) at ([xshift=5em,yshift=-7.5em]eq1.west) {$\textrm{P}(s|t) = \epsilon$};
\node [anchor=west] (eq3) at ([xshift=5em,yshift=-7.5em]eq1.west) {$\textrm{P}(\mathbf{s}|\mathbf{t}) = \epsilon$};
\node [anchor=west,inner sep=2pt] (eq3part2) at ([xshift=-0.3em]eq3.east) {$\prod\limits_{j=1}^{m} \sum\limits_{i=0}^{l} a(i|j,m,l) f(s_j|t_i)$};
\node [anchor=east] (eq3label) at ([xshift=-0em,yshift=0.2em]eq3.west) {\small{类似的,IBM模型2:}};
}
......@@ -3525,7 +3525,7 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\begin{itemize}
\item<2-> \textbf{IBM模型的训练:对于给定的句对$(s,t)$,最大化翻译概率$\textrm{P}(s|t)$}。这里用符号$\textrm{P}_{\theta}(s|t)$表示概率由参数$\theta$决定
\item<2-> \textbf{IBM模型的训练:对于给定的句对$(\mathbf{s},\mathbf{t})$,最大化翻译概率$\textrm{P}(\mathbf{s}|\mathbf{t})$}。这里用符号$\textrm{P}_{\theta}(\mathbf{s}|\mathbf{t})$表示概率由参数$\theta$决定
\begin{center}
\begin{tikzpicture}
......@@ -3534,11 +3534,11 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\node [anchor=west] (eq2) at ([yshift=-0.2em]eq1.east) {=};
\node [anchor=west,inner sep=2pt] (eq3) at ([yshift=-0.0em]eq2.east) {$\argmax$};
\node [anchor=north,inner sep=1pt] (eq3part2) at ([yshift=-0.2em]eq3.south) {\scriptsize{$\theta$}};
\node [anchor=west,inner sep=2pt] (eq4) at ([xshift=0.1em]eq3.east) {$\textrm{P}_{\theta}(s|t)$};
\node [anchor=west,inner sep=2pt] (eq4) at ([xshift=0.1em]eq3.east) {$\textrm{P}_{\theta}(\mathbf{s}|\mathbf{t})$};
\visible<3->{
\node [anchor=west,inner sep=2pt,fill=red!20,minimum height=1.35em] (eq3) at ([yshift=-0.0em]eq2.east) {$\argmax$};
\node [anchor=west,inner sep=2pt,fill=green!20] (eq4) at ([xshift=0.1em]eq3.east) {$\textrm{P}_{\theta}(s|t)$};
\node [anchor=west,inner sep=2pt,fill=green!20] (eq4) at ([xshift=0.1em]eq3.east) {$\textrm{P}_{\theta}(\mathbf{s}|\mathbf{t})$};
\node [anchor=north,draw,inner sep=3pt,fill=red!20] (eq3label) at ([yshift=-1.5em]eq3.south west) {\footnotesize{\textbf{求最优参数}}};
\node [anchor=north,draw,inner sep=3pt,fill=green!20] (eq4label) at ([yshift=-1.5em]eq4.south east) {\footnotesize{\textbf{目标函数}}};
......@@ -3561,23 +3561,23 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\begin{itemize}
\item $\textrm{P}(s|t)$可以被看做是$(s,t)$上的\alert{似然}函数($L(s,t;\theta)$)。所谓\alert{极大似然估计},就是要找到使$L(s,t;\theta)$达到最大的$\theta$
\item $\textrm{P}(\mathbf{s}|\mathbf{t})$可以被看做是$(\mathbf{s},\mathbf{t})$上的\alert{似然}函数($L(\mathbf{s},\mathbf{t};\theta)$)。所谓\alert{极大似然估计},就是要找到使$L(\mathbf{s},\mathbf{t};\theta)$达到最大的$\theta$
\vspace{-0.5em}
\begin{displaymath}
\{\hat{\theta}\} \subseteq \{\argmax_{\theta \in \Theta} L(s,t;\theta)\}
\{\hat{\theta}\} \subseteq \{\argmax_{\theta \in \Theta} L(\mathbf{s},\mathbf{t};\theta)\}
\end{displaymath}
\vspace{-0.3em}
$L(s,t;\theta)$表示$L(\cdot)$依赖模型参数$\theta$(注意分号),$\{\hat{\theta}\}$表示可能有多组结果,$\Theta$表示参数空间
$L(\mathbf{s},\mathbf{t};\theta)$表示$L(\cdot)$依赖模型参数$\theta$(注意分号),$\{\hat{\theta}\}$表示可能有多组结果,$\Theta$表示参数空间
\vspace{0.5em}
\item<2-> 先不用考虑上面的公式。我们还是回归到原始问题:如何找到一组$\theta$使$\textrm{P}_{\theta}(s|t)$达到最大?\\
\item<2-> 先不用考虑上面的公式。我们还是回归到原始问题:如何找到一组$\theta$使$\textrm{P}_{\theta}(\mathbf{s}|\mathbf{t})$达到最大?\\
\begin{itemize}
\item \textbf{求函数最大值问题}。比如,我们可以对$\textrm{P}_{\theta}(s|t)$求导,令导数为零,得到极值点
\item \textbf{求函数最大值问题}。比如,我们可以对$\textrm{P}_{\theta}(\mathbf{s}|\mathbf{t})$求导,令导数为零,得到极值点
\end{itemize}
\end{itemize}
......@@ -3641,7 +3641,7 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
%%%------------------------------------------------------------------------------------------------------------
%%% maximizing P(s|t)
\begin{frame}{最大化$\textrm{P}(s|t)$}
\begin{frame}{最大化$\textrm{P}(\mathbf{s}|\mathbf{t})$}
\begin{itemize}
......@@ -3710,7 +3710,7 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\begin{itemize}
\item \textbf{含有约束的优化问题}: 不好解\\
\textbf{目标:} $\max(\textrm{P}_{\theta}(s|t))$ + \textbf{约束:} $\forall t_y: \sum_{s_x} \textrm{P}(s_x|t_y)=1$
\textbf{目标:} $\max(\textrm{P}_{\theta}(\mathbf{s}|\mathbf{t}))$ + \textbf{约束:} $\forall t_y: \sum_{s_x} \textrm{P}(s_x|t_y)=1$
\vspace{0.3em}
\item<2-> \textbf{解决方法}: 含有约束优化 $\Rightarrow$ 不含约束优化\\
......@@ -3746,7 +3746,7 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\begin{center}
\begin{tabular}{c | c}
\makebox[0.35\textwidth][c]{\textbf{原始问题}} & \makebox[0.35\textwidth][c]{\textbf{转化后的问题}} \\ \hline
$\max (\textrm{P}(s|t))$ & $\max (L(f,\lambda))$ \\
$\max (\textrm{P}(\mathbf{s}|\mathbf{t}))$ & $\max (L(f,\lambda))$ \\
s.t. $\forall t_y: \sum_{s_x} f(s_x|t_y) =1 $ & \\
\end{tabular}
\end{center}
......@@ -4022,11 +4022,11 @@ f(s_u|t_v) = \frac{\lambda_{t_v}^{-1} \epsilon}{(l+1)^{m}} \cdot \frac{\sum\limi
}
\visible<2->{
\node [anchor=south west,inner sep=2pt] (label1) at (eq4.north west) {\textbf{\scriptsize{翻译概率$\textrm{P}(s|t)$}}};
\node [anchor=south west,inner sep=2pt] (label1) at (eq4.north west) {\textbf{\scriptsize{翻译概率$\textrm{P}(\mathbf{s}|\mathbf{t})$}}};
}
\visible<3->{
\node [anchor=south west,inner sep=2pt] (label2) at (eq5.north west) {\textbf{\scriptsize{配对的总次数}}};
\node [anchor=south west,inner sep=2pt] (label2part2) at ([yshift=-3pt]label2.north west) {\textbf{\scriptsize{$(s_u,t_v)$在句对$(s,t)$}}};
\node [anchor=south west,inner sep=2pt] (label2part2) at ([yshift=-3pt]label2.north west) {\textbf{\scriptsize{$(s_u,t_v)$在句对$(\mathbf{s},\mathbf{t})$}}};
}
\visible<4->{
\node [anchor=south west,inner sep=2pt] (label3) at (eq6.north west) {\textbf{\scriptsize{有的$t_i$的相对值}}};
......@@ -4035,7 +4035,7 @@ f(s_u|t_v) = \frac{\lambda_{t_v}^{-1} \epsilon}{(l+1)^{m}} \cdot \frac{\sum\limi
\visible<2->{
\node [anchor=east,rotate=90] (neweq1) at ([yshift=-0em]eq4.south) {=};
\node [anchor=north,inner sep=1pt] (neweq1full) at (neweq1.west) {\large{$\textrm{P}(s|t)$}};
\node [anchor=north,inner sep=1pt] (neweq1full) at (neweq1.west) {\large{$\textrm{P}(\mathbf{s}|\mathbf{t})$}};
}
\visible<5->{
......@@ -4123,16 +4123,16 @@ $x_3$ & 5 & 0.2 & 1.0 \\ \hline
\end{center}
\begin{itemize}
\item<5-> \textbf{定义}:在$\textrm{P}(s|t)$中,$t_v$翻译(连接)到$s_u$的期望频次为
\item<5-> \textbf{定义}:在$\textrm{P}(\mathbf{s}|\mathbf{t})$中,$t_v$翻译(连接)到$s_u$的期望频次为
\vspace{-0.5em}
\begin{displaymath}
c_{\mathbb{E}}(s_u|t_v;s,t) \equiv \sum\limits_{j=1}^{m} \delta(s_j,s_u) \sum\limits_{i=0}^{l} \delta(t_i,t_v) \cdot \frac{f(s_u|t_v)}{\sum_{i=0}^{l}f(s_u|t_i)}
c_{\mathbb{E}}(s_u|t_v;\mathbf{s},\mathbf{t}) \equiv \sum\limits_{j=1}^{m} \delta(s_j,s_u) \sum\limits_{i=0}^{l} \delta(t_i,t_v) \cdot \frac{f(s_u|t_v)}{\sum_{i=0}^{l}f(s_u|t_i)}
\end{displaymath}
\vspace{-0.8em}
\item<6-> \textbf{重写$f(s_u|t_v)$}!!!
\begin{center}
\begin{tikzpicture}
\node [anchor=west,draw,red,thick,inner sep=5pt] (p) at (0,0) {\black{$f(s_u|t_v) = \lambda_{t_v}^{-1} \cdot \textrm{P}(s|t) \cdot c_{\mathbb{E}}(s_u|t_v;s,t)$}};
\node [anchor=west,draw,red,thick,inner sep=5pt] (p) at (0,0) {\black{$f(s_u|t_v) = \lambda_{t_v}^{-1} \cdot \textrm{P}(\mathbf{s}|\mathbf{t}) \cdot c_{\mathbb{E}}(s_u|t_v;\mathbf{s},\mathbf{t})$}};
\end{tikzpicture}
\end{center}
\end{itemize}
......@@ -4144,13 +4144,13 @@ $x_3$ & 5 & 0.2 & 1.0 \\ \hline
\begin{frame}{通过期望频次计算$f(s_u|t_v)$}
\begin{itemize}
\item \textbf{一个小trick}: 令$\lambda_{t_v}^{'}=\frac{\lambda_{t_v}}{\textrm{P}(s|t)}$
\item \textbf{一个小trick}: 令$\lambda_{t_v}^{'}=\frac{\lambda_{t_v}}{\textrm{P}(\mathbf{s}|\mathbf{t})}$
\vspace{-1.0em}
\begin{eqnarray}
f(s_u|t_v) & = & \lambda_{t_v}^{-1} \cdot \textrm{P}(s|t) \cdot c_{\mathbb{E}}(s_u|t_v;s,t) \nonumber \\
& = & (\lambda_{t_v}^{'})^{-1} \cdot c_{\mathbb{E}}(s_u|t_v;s,t) \nonumber
f(s_u|t_v) & = & \lambda_{t_v}^{-1} \cdot \textrm{P}(\mathbf{s}|\mathbf{t}) \cdot c_{\mathbb{E}}(s_u|t_v;\mathbf{s},\mathbf{t}) \nonumber \\
& = & (\lambda_{t_v}^{'})^{-1} \cdot c_{\mathbb{E}}(s_u|t_v;\mathbf{s},\mathbf{t}) \nonumber
\end{eqnarray}
\item<2-> \textbf{$\lambda_{t_v}^{'}$究竟是什么?} - 回忆一下IBM模型对$f(\cdot|\cdot)$的约束
......@@ -4163,7 +4163,7 @@ f(s_u|t_v) & = & \lambda_{t_v}^{-1} \cdot \textrm{P}(s|t) \cdot c_{\mathbb{E}}(s
\vspace{-0.3em}
\begin{displaymath}
\lambda_{t_v}^{'}=\sum_{s_u} c_{\mathbb{E}}(s_u|t_v;s,t)
\lambda_{t_v}^{'}=\sum_{s_u} c_{\mathbb{E}}(s_u|t_v;\mathbf{s},\mathbf{t})
\end{displaymath}
\vspace{-0.6em}
......@@ -4173,8 +4173,8 @@ f(s_u|t_v) & = & \lambda_{t_v}^{-1} \cdot \textrm{P}(s|t) \cdot c_{\mathbb{E}}(s
\begin{tikzpicture}
\node [anchor=west] (eq1) at (0,0) {$f(s_u|t_v) =$};
\draw [-] (eq1.east) -- ([xshift=8em]eq1.east);
\node [anchor=south west] (eq2) at ([xshift=1em]eq1.east) {$c_{\mathbb{E}}(s_u|t_v;s,t)$};
\node [anchor=north west] (eq3) at (eq1.east) {$\sum_{s_u} c_{\mathbb{E}}(s_u|t_v;s,t)$};
\node [anchor=south west] (eq2) at ([xshift=1em]eq1.east) {$c_{\mathbb{E}}(s_u|t_v;\mathbf{s},\mathbf{t})$};
\node [anchor=north west] (eq3) at (eq1.east) {$\sum_{s_u} c_{\mathbb{E}}(s_u|t_v;\mathbf{s},\mathbf{t})$};
\begin{pgfonlayer}{background}
\node[rectangle,draw,red,thick,inner sep=0] [fit = (eq1) (eq2) (eq3)] {};
......@@ -4190,9 +4190,9 @@ f(s_u|t_v) & = & \lambda_{t_v}^{-1} \cdot \textrm{P}(s|t) \cdot c_{\mathbb{E}}(s
%%% scale it up to the full corpus
\begin{frame}{在整个数据集上计算}
\begin{itemize}
\item \textbf{更真实的情况}:我们拥有一系列互译的句对(称作\alert{平行语料}),记为$\{(s^{[1]},t^{[1]}),(s^{[2]},t^{[2]}),...,(s^{[N]},t^{[N]})\}$。对于这$N$个训练用句对,定义$f(s_u|t_v)$的期望频次为
\item \textbf{更真实的情况}:我们拥有一系列互译的句对(称作\alert{平行语料}),记为$\{(\mathbf{s}^{[1]},\mathbf{t}^{[1]}),(\mathbf{s}^{[2]},\mathbf{t}^{[2]}),...,(\mathbf{s}^{[N]},\mathbf{t}^{[N]})\}$。对于这$N$个训练用句对,定义$f(s_u|t_v)$的期望频次为
\begin{displaymath}
c_{\mathbb{E}}(s_u|t_v) = \sum_{i=1}^{N} c_{\mathbb{E}}(s_u|t_v;s^{[i]},t^{[i]})
c_{\mathbb{E}}(s_u|t_v) = \sum_{i=1}^{N} c_{\mathbb{E}}(s_u|t_v;\mathbf{s}^{[i]},\mathbf{t}^{[i]})
\end{displaymath}
\item<2-> \textbf{于是}
\begin{center}
......@@ -4200,8 +4200,8 @@ f(s_u|t_v) & = & \lambda_{t_v}^{-1} \cdot \textrm{P}(s|t) \cdot c_{\mathbb{E}}(s
\node [anchor=west,inner sep=2pt] (eq1) at (0,0) {$f(s_u|t_v)$};
\node [anchor=west] (eq2) at (eq1.east) {$=$\ };
\draw [-] ([xshift=0.3em]eq2.east) -- ([xshift=11.6em]eq2.east);
\node [anchor=south west] (eq3) at ([xshift=1em]eq2.east) {$\sum_{i=1}^{N} c_{\mathbb{E}}(s_u|t_v;s^{[i]},t^{[i]})$};
\node [anchor=north west] (eq4) at (eq2.east) {$\sum_{s_u} \sum_{i=1}^{N} c_{\mathbb{E}}(s_u|t_v;s^{[i]},t^{[i]})$};
\node [anchor=south west] (eq3) at ([xshift=1em]eq2.east) {$\sum_{i=1}^{N} c_{\mathbb{E}}(s_u|t_v;\mathbf{s}^{[i]},\mathbf{t}^{[i]})$};
\node [anchor=north west] (eq4) at (eq2.east) {$\sum_{s_u} \sum_{i=1}^{N} c_{\mathbb{E}}(s_u|t_v;\mathbf{s}^{[i]},\mathbf{t}^{[i]})$};
\visible<4->{
\node [anchor=south] (label1) at ([yshift=-6em,xshift=3em]eq1.north west) {利用这个公式计算};
......@@ -4250,17 +4250,17 @@ f(s_u|t_v) & = & \lambda_{t_v}^{-1} \cdot \textrm{P}(s|t) \cdot c_{\mathbb{E}}(s
\label{ibmtraining}
\begin{beamerboxesrounded}[upper=uppercolblue,lower=lowercolblue,shadow=true]{IBM模型1的训练(EM算法)}
输入: 平行语料$\{(s^{[1]},t^{[1]}),...,(s^{[N]},t^{[N]})\}$\\
输入: 平行语料$\{(\mathbf{s}^{[1]},\mathbf{t}^{[1]}),...,(\mathbf{s}^{[N]},\mathbf{t}^{[N]})\}$\\
输出:参数$f(\cdot|\cdot)$的最优值\\
1: \textbf{Function} \textsc{TrainItWithEM}($\{(s^{[1]},t^{[1]}),...,(s^{[N]},t^{[N]})\}$) \\
1: \textbf{Function} \textsc{TrainItWithEM}($\{(\mathbf{s}^{[1]},\mathbf{t}^{[1]}),...,(\mathbf{s}^{[N]},\mathbf{t}^{[N]})\}$) \\
2: \ \ Initialize $f(\cdot|\cdot)$ \hspace{5em} $\rhd$ 比如给$f(\cdot|\cdot)$一个均匀分布\\
3: \ \ Loop until $f(\cdot|\cdot)$ converges\\
4: \ \ \ \ \textbf{foreach} $k = 1$ to $N$ \textbf{do}\\
5: \ \ \ \ \ \ \ \footnotesize{$c_{\mathbb{E}}(s_u|t_v;s^{[k]},t^{[k]}) = \sum\limits_{j=1}^{|s^{[k]}|} \delta(s_j,s_u) \sum\limits_{i=0}^{|t^{[k]}|} \delta(t_i,t_v) \cdot \frac{f(s_u|t_v)}{\sum_{i=0}^{l}f(s_u|t_i)}$}\normalsize{}\\
6: \ \ \ \ \textbf{foreach} $t_v$ appears at least one of $\{t^{[1]},...,t^{[N]}\}$ \textbf{do}\\
7: \ \ \ \ \ \ \ $\lambda_{t_v}^{'} = \sum_{s_u} \sum_{k=1}^{N} c_{\mathbb{E}}(s_u|t_v;s^{[k]},t^{[k]})$\\
8: \ \ \ \ \ \ \ \textbf{foreach} $s_u$ appears at least one of $\{s^{[1]},...,s^{[N]}\}$ \textbf{do}\\
9: \ \ \ \ \ \ \ \ \ $f(s_u|t_v) = \sum_{k=1}^{N} c_{\mathbb{E}}(s_u|t_v;s^{[k]},t^{[k]}) \cdot (\lambda_{t_v}^{'})^{-1}$\\
5: \ \ \ \ \ \ \ \footnotesize{$c_{\mathbb{E}}(s_u|t_v;\mathbf{s}^{[k]},\mathbf{t}^{[k]}) = \sum\limits_{j=1}^{|\mathbf{s}^{[k]}|} \delta(s_j,s_u) \sum\limits_{i=0}^{|\mathbf{t}^{[k]}|} \delta(t_i,t_v) \cdot \frac{f(s_u|t_v)}{\sum_{i=0}^{|\mathbf{t}^{[k]}|}f(s_u|t_i)}$}\normalsize{}\\
6: \ \ \ \ \textbf{foreach} $t_v$ that appears in at least one of $\{\mathbf{t}^{[1]},...,\mathbf{t}^{[N]}\}$ \textbf{do}\\
7: \ \ \ \ \ \ \ $\lambda_{t_v}^{'} = \sum_{s_u} \sum_{k=1}^{N} c_{\mathbb{E}}(s_u|t_v;\mathbf{s}^{[k]},\mathbf{t}^{[k]})$\\
8: \ \ \ \ \ \ \ \textbf{foreach} $s_u$ that appears in at least one of $\{\mathbf{s}^{[1]},...,\mathbf{s}^{[N]}\}$ \textbf{do}\\
9: \ \ \ \ \ \ \ \ \ $f(s_u|t_v) = \sum_{k=1}^{N} c_{\mathbb{E}}(s_u|t_v;\mathbf{s}^{[k]},\mathbf{t}^{[k]}) \cdot (\lambda_{t_v}^{'})^{-1}$\\
10: \ \textbf{return} $f(\cdot|\cdot)$
\end{beamerboxesrounded}
\vspace{-0.3em}
......@@ -4280,15 +4280,15 @@ f(s_u|t_v) & = & \lambda_{t_v}^{-1} \cdot \textrm{P}(s|t) \cdot c_{\mathbb{E}}(s
\end{itemize}
\end{itemize}
\begin{enumerate}
\item \textbf{E-Step} (对于句对$(s,t)$$m=|s|,l=|t|$)
\item \textbf{E-Step} (对于句对$(\mathbf{s},\mathbf{t})$$m=|\mathbf{s}|,l=|\mathbf{t}|$)
\begin{eqnarray}
c_{\mathbb{E}}(s_u|t_v;s,t) & = & \sum_{j=1}^{m} \sum_{i=0}^{l} \frac{f(s_u|t_v)a(i|j,m,l)\delta(s_j,s_u)\delta(t_i,t_v)}{\sum_{k=0}^{l} f(s_u|t_k)a(k|j,m,l)} \nonumber \\
c_{\mathbb{E}}(i|j,m,l;s,t) & = & \frac{f(s_j|t_i)a(i|j,m,l)}{\sum_{k=0}^{l} f(s_j|t_k)a(k|j,m,l)} \nonumber
c_{\mathbb{E}}(s_u|t_v;\mathbf{s},\mathbf{t}) & = & \sum_{j=1}^{m} \sum_{i=0}^{l} \frac{f(s_u|t_v)a(i|j,m,l)\delta(s_j,s_u)\delta(t_i,t_v)}{\sum_{k=0}^{l} f(s_u|t_k)a(k|j,m,l)} \nonumber \\
c_{\mathbb{E}}(i|j,m,l;\mathbf{s},\mathbf{t}) & = & \frac{f(s_j|t_i)a(i|j,m,l)}{\sum_{k=0}^{l} f(s_j|t_k)a(k|j,m,l)} \nonumber
\end{eqnarray}
\item \textbf{M-Step}
\begin{eqnarray}
f(s_u|t_v) & = & \frac{\sum_{k=0}^{K} c_{\mathbb{E}}(s_u|t_v;s^{[k]},t^{[k]})}{\sum_{s_u} \sum_{k=0}^{K} c_{\mathbb{E}}(s_u|t_v;s^{[k]},t^{[k]})} \nonumber \\
a(i|j,m,l) & = & \frac{\sum_{k=0}^{K} c_{\mathbb{E}}(i|j;s^{[k]},t^{[k]})}{\sum_{i} \sum_{k=0}^{K} c_{\mathbb{E}}(i|j;s^{[k]},t^{[k]})} \nonumber
f(s_u|t_v) & = & \frac{\sum_{k=1}^{N} c_{\mathbb{E}}(s_u|t_v;\mathbf{s}^{[k]},\mathbf{t}^{[k]})}{\sum_{s_u} \sum_{k=1}^{N} c_{\mathbb{E}}(s_u|t_v;\mathbf{s}^{[k]},\mathbf{t}^{[k]})} \nonumber \\
a(i|j,m,l) & = & \frac{\sum_{k=1}^{N} c_{\mathbb{E}}(i|j,m,l;\mathbf{s}^{[k]},\mathbf{t}^{[k]})}{\sum_{i} \sum_{k=1}^{N} c_{\mathbb{E}}(i|j,m,l;\mathbf{s}^{[k]},\mathbf{t}^{[k]})} \nonumber
\end{eqnarray}
\end{enumerate}
\end{frame}
......
......@@ -3706,8 +3706,8 @@ d = r_1 \circ r_2 \circ r_3 \circ r_4
\subsection{基于chart的解码}
%%%------------------------------------------------------------------------------------------------------------
%%% CYK解码
\begin{frame}{CYK解码}
%%% CKY解码
\begin{frame}{CKY解码}
% 看NiuTrans Manual
\begin{itemize}
\item 基于层次短语的翻译解码与基于短语的模型类似,都是要找到使$\textrm{score}(d)$达到最大的翻译推导$d$
......@@ -3717,8 +3717,8 @@ d = r_1 \circ r_2 \circ r_3 \circ r_4
\end{displaymath}
\vspace{-0.8em}
\begin{itemize}
\item 由于翻译推导由SCFG构成,使用CYK算法进行解码
\item CYK算法解码是一个用来判定任意给定的字符串 是否属于一个上下文无关文法的算法,具体流程如下
\item 由于翻译推导由SCFG构成,使用CKY算法进行解码
\item CKY算法是一个用来判定任意给定的字符串是否属于一个上下文无关文法的算法,具体流程如下
\end{itemize}
\vspace{0.5em}
\begin{center}
......@@ -3740,16 +3740,16 @@ d = r_1 \circ r_2 \circ r_3 \circ r_4
\end{tikzpicture}
\end{center}
\vspace{0.3em}
%\item 由于对文法中的非终结符进行了限制,可以直接使用CYK算法进行解码,无需转换成乔姆斯基范式
%\item 由于对文法中的非终结符进行了限制,可以直接使用CKY算法进行解码,无需转换成乔姆斯基范式
\end{itemize}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% CYK解码
\begin{frame}{CYK算法}
%%% CKY解码
\begin{frame}{CKY算法}
% 看NiuTrans Manual
\begin{itemize}
\item CYK算法通过遍历不同\alert{span}来判断字符串是否符合文法
\item CKY算法通过遍历不同\alert{span}来判断字符串是否符合文法
\begin{itemize}
\item 输入:源语串\textbf{s =} $s_1 ... s_J$,以及CNF文法$G$
\item 输出:判断字符串是否符合$G$
......@@ -3762,7 +3762,7 @@ d = r_1 \circ r_2 \circ r_3 \circ r_4
\tikzstyle{srcnode} = [anchor=south west]
\begin{scope}[scale=0.85]
\node[srcnode] (c1) at (0,0) {\small{\textbf{Function} CYK-Algorithm($\textbf{s},G$)}};
\node[srcnode] (c1) at (0,0) {\small{\textbf{Function} CKY-Algorithm($\textbf{s},G$)}};
\node[srcnode,anchor=north west] (c21) at ([xshift=1.5em,yshift=0.4em]c1.south west) {\small{\textbf{for} $j=0$ to $J - 1$}};
\node[srcnode,anchor=north west] (c22) at ([xshift=1.5em,yshift=0.4em]c21.south west) {\small{$span[j,j+1]$.Add($A \to a \in G$)}};
\node[srcnode,anchor=north west] (c3) at ([xshift=-1.5em,yshift=0.4em]c22.south west) {\small{\textbf{for} $l$ = 1 to $J$}};
......@@ -3810,11 +3810,11 @@ d = r_1 \circ r_2 \circ r_3 \circ r_4
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% CYK解码
\begin{frame}{CYK算法}
%%% CKY解码
\begin{frame}{CKY算法}
% 看NiuTrans Manual
\begin{itemize}
\item 我们来看一个CYK算法的具体例子,给定一个上下无关文法以及一个单词\alert{aabbc},来判断该单词是否属于此文法,解析流程如下
\item 我们来看一个CKY算法的具体例子,给定一个上下文无关文法以及一个单词\alert{aabbc},来判断该单词是否属于此文法,解析流程如下
\vspace{-0.3em}
\begin{center}
\begin{tikzpicture}
......@@ -3946,11 +3946,11 @@ d = r_1 \circ r_2 \circ r_3 \circ r_4
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% CYK解码
\begin{frame}{CYK解码(续)}
%%% CKY解码
\begin{frame}{CKY解码(续)}
% 看NiuTrans Manual
\begin{itemize}
\item 实际上,在层次短语解码的时候,不能直接使用CYK算法,需要先转化为乔姆斯基范式,才能进行解码
\item 实际上,在层次短语解码的时候,不能直接使用CKY算法,需要先转化为乔姆斯基范式,才能进行解码
\begin{itemize}
\item<2-> 对于每个源语句子,使用短语规则表初始化它的span
\item<3-> 自底向上对span中的每个子span进行重新组合(正、反向)
......@@ -4166,7 +4166,7 @@ d = r_1 \circ r_2 \circ r_3 \circ r_4
% 实验结果
\begin{itemize}
\item 从实验结果中可以看出,基于层次短语的翻译模型性能要优于基于短语的翻译模型
\item 选择使用层次短语信息实际上增加了模型的复杂度,但是可以通过借鉴基于短语的翻译模型模型以及CYK解码和立方剪枝等技术来解决
\item 选择使用层次短语信息实际上增加了模型的复杂度,但是可以通过借鉴基于短语的翻译模型以及CKY解码和立方剪枝等技术来解决
\item 可以考虑加入更多句法信息来进一步提升模型性能
\end{itemize}
%\vspace{-1em}
......@@ -6785,7 +6785,7 @@ NP-BAR(NN$_1$ NP-BAR$_2$) $\to$ NN$_1$ NP-BAR$_2$
搜索空间 & 与输入的源语句法树 & 所有推导$D$ \\
& 兼容的推导$D_{\textrm{tree}}$ & \\ \hline
适用模型 & 树到串、树到树 & 所有句法模型 \\ \hline
解码算法 & chart解码 & CYK + 规则二叉化 \\ \hline
解码算法 & chart解码 & CKY + 规则二叉化 \\ \hline
速度 && 一般较慢
\end{tabular}
......@@ -7358,7 +7358,7 @@ NP-BAR(NN$_1$ NP-BAR$_2$) $\to$ NN$_1$ NP-BAR$_2$
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% 基于串的解码 - CYK + 规则二叉化
%%% 基于串的解码 - CKY + 规则二叉化
\begin{frame}{基于串的解码 - CKY + 规则二叉化}
\begin{itemize}
......
......@@ -5031,6 +5031,10 @@ GPT-2 (Transformer) & Radford et al. & 2019 & 35.7
\node [anchor=west,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e2) at ([xshift=1em]e1.east) {\scriptsize{$\textbf{e}_2$}};
\node [anchor=west,inner sep=4pt] (sep5) at ([xshift=1em]e2.east) {\scriptsize{...}};
\node [anchor=west,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e3) at ([xshift=1em]sep5.east) {\scriptsize{$\textbf{e}_m$}};
\node [anchor=south] (word1) at ([yshift=-1.5em]e1.south) {\footnotesize {Once}};
\node [anchor=south] (word2) at ([yshift=-1.6em]e2.south) {\footnotesize {upon}};
\node [anchor=south] (wordseq) at ([yshift=-1.5em]sep5.south) {\footnotesize{...}};
\node [anchor=south] (word3) at ([yshift=-1.5em]e3.south) {\footnotesize {island}};
\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t1) at ([xshift=-2em,yshift=1em]Lstm5.north) {\scriptsize{$\textbf{h}_1$}};
\node [anchor=west,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t2) at ([xshift=1em]t1.east) {\scriptsize{$\textbf{h}_2$}};
......@@ -5130,6 +5134,12 @@ GPT-2 (Transformer) & Radford et al. & 2019 & 35.7
\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e4) at ([yshift=-1em]Trm3.south) {\scriptsize{$\textbf{e}_4$}};
\node [anchor=north,inner sep=4pt] (sep5) at ([yshift=-1em]sep.south) {\scriptsize{...}};
\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e5) at ([yshift=-1em]Trm4.south) {\scriptsize{$\textbf{e}_m$}};
\node [anchor=south] (word1) at ([yshift=-1.5em]e1.south) {\footnotesize {Once}};
\node [anchor=south] (word2) at ([yshift=-1.6em]e2.south) {\footnotesize {upon}};
\node [anchor=south] (word3) at ([yshift=-1.5em]e3.south) {\footnotesize {a}};
\node [anchor=south] (word4) at ([yshift=-1.5em]e4.south) {\footnotesize {time}};
\node [anchor=south] (wordseq) at ([yshift=-2.0em]sep5.south) {\footnotesize{...}};
\node [anchor=south] (word5) at ([yshift=-1.5em]e5.south) {\footnotesize {island}};
\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t1) at ([yshift=1em]Trm5.north) {\scriptsize{$\textbf{h}_1$}};
\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t2) at ([yshift=1em]Trm6.north) {\scriptsize{$\textbf{h}_2$}};
......@@ -5214,6 +5224,12 @@ GPT-2 (Transformer) & Radford et al. & 2019 & 35.7
\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e4) at ([yshift=-1em]Trm3.south) {\scriptsize{$\textbf{e}_4$}};
\node [anchor=north,inner sep=4pt] (sep5) at ([yshift=-1em]sep.south) {\scriptsize{...}};
\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e5) at ([yshift=-1em]Trm4.south) {\scriptsize{$\textbf{e}_m$}};
\node [anchor=south] (word1) at ([yshift=-1.5em]e1.south) {\footnotesize {Once}};
\node [anchor=south] (word2) at ([yshift=-1.7em]e2.south) {\footnotesize {[MASK]}};
\node [anchor=south] (word3) at ([yshift=-1.5em]e3.south) {\footnotesize {a}};
\node [anchor=south] (word4) at ([yshift=-1.5em]e4.south) {\footnotesize {time}};
\node [anchor=south] (wordseq) at ([yshift=-2.0em]sep5.south) {\footnotesize{...}};
\node [anchor=south] (word5) at ([yshift=-1.5em]e5.south) {\footnotesize {island}};
\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t1) at ([yshift=1em]Trm5.north) {\scriptsize{$\textbf{h}_1$}};
\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t2) at ([yshift=1em]Trm6.north) {\scriptsize{$\textbf{h}_2$}};
......
......@@ -520,12 +520,12 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\node[rnnnode,fill=blue!30!white,right=\base of rnn3] (rnn4) {};
\node[rnnnode,fill=green!30!white,below=\base of rnn4] (emb4) {};
\node[wordnode,below=0pt of emb4] (word4) {EOS};
\node[wordnode,below=0pt of emb4] (word4) {$\langle$eos$\rangle$};
\draw[-latex'] (emb4.north) to (rnn4.south);
\draw[-latex'] (rnn3.east) to (rnn4.west);
}
\visible<4->{
\draw[decoration={mirror,brace},decorate] (word1.south west) to node [auto,anchor=north,align=center] {编码器} ([yshift=-0.2em]word4.south east);
\draw[decoration={mirror,brace},decorate] ([yshift=-0.2em]word1.south west) to node [auto,anchor=north,align=center] {编码器} ([yshift=-0.2em]word4.south east);
}
\visible<5->{
\node[rnnnode,fill=purple] (repr) at (rnn4) {};
......@@ -535,7 +535,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\visible<6->{
\node[rnnnode,fill=blue!30!white,right=\base of rnn4] (rnn5) {};
\node[rnnnode,fill=green!30!white,below=\base of rnn5] (emb5) {};
\node[wordnode,below=0pt of emb5] (word5) {SOS};
\node[wordnode,below=0pt of emb5] (word5) {$\langle$sos$\rangle$};
\draw[-latex'] (emb5.north) to (rnn5.south);
\draw[-latex'] (rnn4.east) to (rnn5.west);
\node[rnnnode,fill=red!30!white,above=\base of rnn5] (softmax1) {};
......@@ -578,7 +578,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\node[wordnode,anchor=base] (word8) at (\XCoord,\YCoord) {fine};
\ExtractX{$(emb8)$}
\ExtractY{$(out1.base)$}
\node[wordnode,anchor=base] (out4) at (\XCoord,\YCoord) {EOS};
\node[wordnode,anchor=base] (out4) at (\XCoord,\YCoord) {$\langle$eos$\rangle$};
\draw[-latex'] (emb8.north) to (rnn8.south);
\draw[-latex'] (rnn7.east) to (rnn8.west);
\draw[-latex'] (rnn8.north) to (softmax4.south);
......@@ -720,7 +720,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\node [anchor=north,rnnnode,fill=blue!30!white] (e2) at ([yshift=-1em]node12.south) {\tiny{}};
\node [anchor=north,rnnnode,fill=blue!30!white] (e3) at ([yshift=-1em]node13.south) {\tiny{}};
\node [anchor=north,rnnnode,fill=blue!30!white] (e4) at ([yshift=-1em]node14.south) {\tiny{}};
\node [anchor=north,inner sep=2pt] (w1) at ([yshift=-1em]e1.south) {\tiny{$<$s$>$}};
\node [anchor=north,inner sep=2pt] (w1) at ([yshift=-1em]e1.south) {\tiny{$\langle$sos$\rangle$}};
\node [anchor=north,inner sep=2pt] (w2) at ([yshift=-1em]e2.south) {\tiny{}};
\node [anchor=north,inner sep=2pt] (w3) at ([yshift=-1em]e3.south) {\tiny{我们}};
\node [anchor=north,inner sep=2pt] (w4) at ([yshift=-1em]e4.south) {\tiny{开始}};
......@@ -1072,10 +1072,10 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\draw[-latex'] (enc3.north) .. controls +(north:0.3\base) and +(east:\base) .. (bridge) .. controls +(west:2.7\base) and +(west:0.3\base) .. (dec1.west);
\visible<2->{
\node [anchor=east] (line1) at ([xshift=-3em,yshift=0.5em]softmax1.west) {\scriptsize{基于RNN的隐层状态$\textbf{s}_i$}};
\node [anchor=east] (line1) at ([xshift=-3em,yshift=0.5em]softmax1.west) {\scriptsize{基于RNN的隐层状态$\textbf{s}_j$}};
\node [anchor=north west] (line2) at ([yshift=0.3em]line1.south west) {\scriptsize{预测目标词的概率}};
\node [anchor=north west] (line3) at ([yshift=0.3em]line2.south west) {\scriptsize{通常,用Softmax函数}};
\node [anchor=north west] (line4) at ([yshift=0.3em]line3.south west) {\scriptsize{实现 $\textrm{P}(y_i|...)$}};
\node [anchor=north west] (line4) at ([yshift=0.3em]line3.south west) {\scriptsize{实现 $\textrm{P}(y_j|...)$}};
}
\visible<3->{
......@@ -1833,7 +1833,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\node[rnnnode,minimum height=0.5\base,fill=red!30!white,anchor=south] (softmax\x) at ([yshift=0.5\base]dec\x.north) {};
% Decoder input words
\node[wordnode,below=0pt of demb1] (decwordin) {EOS};
\node[wordnode,below=0pt of demb1] (decwordin) {$\langle$sos$\rangle$};
\ExtractX{$(demb2.south)$}
\ExtractY{$(decwordin.base)$}
\node[wordnode,anchor=base] () at (\XCoord,\YCoord) {Do};
......@@ -1890,7 +1890,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\node[wordnode,anchor=base] () at (\XCoord,\YCoord) {Station};
\ExtractX{$(softmax10.north)$}
\ExtractY{$(decwordout.base)$}
\node[wordnode,anchor=base] () at (\XCoord,\YCoord) {EOS};
\node[wordnode,anchor=base] () at (\XCoord,\YCoord) {$\langle$eos$\rangle$};
% Connections
\draw[-latex'] (init.east) to (enc1.west);
......@@ -1971,7 +1971,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\node[wordnode,below=0pt of eemb7] () {怎么};
\node[wordnode,below=0pt of eemb8] () {};
\node[wordnode,below=0pt of eemb9] () {};
\node[wordnode,below=0pt of eemb10] () {EOS};
\node[wordnode,below=0pt of eemb10] () {$\langle$eos$\rangle$};
% RNN Decoder
\foreach \x in {1,2,...,10}
......@@ -2041,7 +2041,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\node[wordnode,anchor=base] () at (\XCoord,\YCoord) {Station};
\ExtractX{$(softmax10.north)$}
\ExtractY{$(decwordout.base)$}
\node[wordnode,anchor=base] () at (\XCoord,\YCoord) {EOS};
\node[wordnode,anchor=base] () at (\XCoord,\YCoord) {$\langle$eos$\rangle$};
% Connections
\draw[-latex'] (init1.east) to (enc11.west);
......@@ -2187,7 +2187,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\begin{itemize}
\item 在注意力机制中,每个目标语单词的生成会使用一个动态的源语表示,而非一个统一的固定表示
\begin{itemize}
\item 这里$\textbf{C}_i$表示第$i$个目标语单词所使用的源语表示
\item 这里$\textbf{C}_j$表示第$j$个目标语单词所使用的源语表示
\end{itemize}
\end{itemize}
......@@ -2286,15 +2286,15 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
%%%------------------------------------------------------------------------------------------------------------
%%% C_i的定义
\begin{frame}{上下文向量$\textbf{C}_i$}
\begin{frame}{上下文向量$\textbf{C}_j$}
\begin{itemize}
\item 对于目标语位置$i$$\textbf{C}_i$是目标语$i$使用的上下文向量
\item 对于目标语位置$j$$\textbf{C}_j$是目标语$j$使用的上下文向量
\begin{itemize}
\item $\textbf{h}_j$表示编码器第$j$个位置的隐层状态
\item $\textbf{s}_i$表示解码器第$i$个位置的隐层状态
\item<2-> $\alpha_{i,j}$表示注意力权重,表示目标语第$i$个位置与源语第$j$个位置之间的相关性大小
\item<2-> $a(\cdot)$表示注意力函数,计算$\textbf{s}_{i-1}$$\textbf{h}_j$之间的相关性
\item<3-> $\textbf{C}_i$是所有源语编码表示$\{\textbf{h}_j\}$的加权求和,权重为$\{\alpha_{i,j}\}$
\item $\textbf{h}_i$表示编码器第$i$个位置的隐层状态
\item $\textbf{s}_j$表示解码器第$j$个位置的隐层状态
\item<2-> $\alpha_{i,j}$表示注意力权重,表示目标语第$j$个位置与源语第$i$个位置之间的相关性大小
\item<2-> $a(\cdot)$表示注意力函数,计算$\textbf{s}_{j-1}$$\textbf{h}_i$之间的相关性
\item<3-> $\textbf{C}_j$是所有源语编码表示$\{\textbf{h}_i\}$的加权求和,权重为$\{\alpha_{i,j}\}$
\end{itemize}
\end{itemize}
......@@ -2306,23 +2306,23 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h1) at (0,0) {\scriptsize{$\textbf{h}_1$}};
\node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h2) at ([xshift=1em]h1.east) {\scriptsize{$\textbf{h}_2$}};
\node [anchor=west,inner sep=0pt,minimum width=3em] (h3) at ([xshift=0.5em]h2.east) {\scriptsize{...}};
\node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h4) at ([xshift=0.5em]h3.east) {\scriptsize{$\textbf{h}_n$}};
\node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h4) at ([xshift=0.5em]h3.east) {\scriptsize{$\textbf{h}_m$}};
\node [anchor=south,circle,minimum size=1.0em,draw,ublue,thick] (sum) at ([yshift=2em]h2.north east) {};
\draw [thick,-,ublue] (sum.north) -- (sum.south);
\draw [thick,-,ublue] (sum.west) -- (sum.east);
\node [anchor=south,draw,fill=green!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (th1) at ([yshift=2em,xshift=-1em]sum.north west) {\scriptsize{$\textbf{s}_{i-1}$}};
\node [anchor=west,draw,fill=green!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (th2) at ([xshift=2em]th1.east) {\scriptsize{$\textbf{s}_{i}$}};
\node [anchor=south,draw,fill=green!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (th1) at ([yshift=2em,xshift=-1em]sum.north west) {\scriptsize{$\textbf{s}_{j-1}$}};
\node [anchor=west,draw,fill=green!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (th2) at ([xshift=2em]th1.east) {\scriptsize{$\textbf{s}_{j}$}};
\draw [->] (h1.north) .. controls +(north:0.8) and +(west:1) .. (sum.190) node [pos=0.3,left] {\tiny{$\alpha_{i,1}$}};
\draw [->] (h2.north) .. controls +(north:0.6) and +(220:0.2) .. (sum.220) node [pos=0.2,right] {\tiny{$\alpha_{i,2}$}};
\draw [->] (h4.north) .. controls +(north:0.8) and +(east:1) .. (sum.-10) node [pos=0.1,left] (alphan) {\tiny{$\alpha_{i,n}$}};
\draw [->] (h1.north) .. controls +(north:0.8) and +(west:1) .. (sum.190) node [pos=0.3,left] {\tiny{$\alpha_{1,j}$}};
\draw [->] (h2.north) .. controls +(north:0.6) and +(220:0.2) .. (sum.220) node [pos=0.2,right] {\tiny{$\alpha_{2,j}$}};
\draw [->] (h4.north) .. controls +(north:0.8) and +(east:1) .. (sum.-10) node [pos=0.1,left] (alphan) {\tiny{$\alpha_{m,j}$}};
\draw [->] ([xshift=-1.5em]th1.west) -- ([xshift=-0.1em]th1.west);
\draw [->] ([xshift=0.1em]th1.east) -- ([xshift=-0.1em]th2.west);
\draw [->] ([xshift=0.1em]th2.east) -- ([xshift=1.5em]th2.east);
\draw [->] (sum.north) .. controls +(north:0.8) and +(west:0.2) .. ([yshift=-0.4em,xshift=-0.1em]th2.west) node [pos=0.2,right] (ci) {\scriptsize{$\textbf{C}_{i}$}};
\draw [->] (sum.north) .. controls +(north:0.8) and +(west:0.2) .. ([yshift=-0.4em,xshift=-0.1em]th2.west) node [pos=0.2,right] (ci) {\scriptsize{$\textbf{C}_{j}$}};
\node [anchor=south,inner sep=1pt] (output) at ([yshift=0.8em]th2.north) {\tiny{输出层}};
\draw [->] ([yshift=0.1em]th2.north) -- ([yshift=-0.1em]output.south);
......@@ -2334,11 +2334,11 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\node [anchor=north] (enc42) at ([yshift=0.5em]enc4.south) {\tiny{(位置$4$)}};
\visible<2->{
\node [anchor=west] (math1) at ([xshift=5em,yshift=1em]th2.east) {$\textbf{C}_i = \sum_{j} \alpha_{i,j} \textbf{h}_j \ \ $};
\node [anchor=west] (math1) at ([xshift=5em,yshift=1em]th2.east) {$\textbf{C}_j = \sum_{i} \alpha_{i,j} \textbf{h}_i \ \ $};
}
\visible<3->{
\node [anchor=north west] (math2) at ([yshift=-2em]math1.south west) {$\alpha_{i,j} = \frac{\exp(\beta_{i,j})}{\sum_{j'} \exp(\beta_{i,j'})}$};
\node [anchor=north west] (math3) at ([yshift=-0em]math2.south west) {$\beta_{i,j} = a(\textbf{s}_{i-1}, \textbf{h}_j)$};
\node [anchor=north west] (math2) at ([yshift=-2em]math1.south west) {$\alpha_{i,j} = \frac{\exp(\beta_{i,j})}{\sum_{i'} \exp(\beta_{i',j})}$};
\node [anchor=north west] (math3) at ([yshift=-0em]math2.south west) {$\beta_{i,j} = a(\textbf{s}_{j-1}, \textbf{h}_i)$};
}
\begin{pgfonlayer}{background}
......@@ -2418,7 +2418,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\node[srcnode] (src3) at ([xshift=0.5\hnode]src2.south west) {\scriptsize{learned}};
\node[srcnode] (src4) at ([xshift=0.5\hnode]src3.south west) {\scriptsize{nothing}};
\node[srcnode] (src5) at ([xshift=0.5\hnode]src4.south west) {\scriptsize{?}};
\node[srcnode] (src6) at ([xshift=0.5\hnode]src5.south west) {\scriptsize{EOS}};
\node[srcnode] (src6) at ([xshift=0.5\hnode]src5.south west) {\scriptsize{$\langle$eos$\rangle$}};
% target
\node[tgtnode] (tgt1) at (-6.0*0.5*\hnode,-1.05*\hnode+7.5*0.5*\hnode) {\scriptsize{}};
......@@ -2428,7 +2428,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\node[tgtnode] (tgt5) at ([yshift=-0.5\hnode]tgt4.north east) {\scriptsize{}};
\node[tgtnode] (tgt6) at ([yshift=-0.5\hnode]tgt5.north east) {\scriptsize{}};
\node[tgtnode] (tgt7) at ([yshift=-0.5\hnode]tgt6.north east) {\scriptsize{?}};
\node[tgtnode] (tgt8) at ([yshift=-0.5\hnode]tgt7.north east) {\scriptsize{EOS}};
\node[tgtnode] (tgt8) at ([yshift=-0.5\hnode]tgt7.north east) {\scriptsize{$\langle$eos$\rangle$}};
\end{scope}
......@@ -2464,7 +2464,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\visible<3->{
% coverage score formula node
\node [anchor=north west] (formula) at ([xshift=-0.3\hnode,yshift=-1.5\hnode]attn11.south) {\small{不同$\textbf{C}_i$所对应的源语言词的权重是不同的}};
\node [anchor=north west] (formula) at ([xshift=-0.3\hnode,yshift=-1.5\hnode]attn11.south) {\small{不同$\textbf{C}_j$所对应的源语言词的权重是不同的}};
\node [anchor=north west] (example) at (formula.south west) {\footnotesize{$\textbf{C}_2=0.4 \times \textbf{h}(\textrm{``你''}) + 0.4 \times \textbf{h}(\textrm{``什么''}) +$}};
\node [anchor=north west] (example2) at ([yshift=0.4em]example.south west) {\footnotesize{$\ \ \ \ \ \ \ \ 0 \times \textbf{h}(\textrm{``都''}) + 0.1 \times \textbf{h}(\textrm{``没''}) + ...$}};
}
......@@ -2526,7 +2526,7 @@ $\textrm{``you''} = \argmax_{y_2} \textrm{P}(y_2|\textbf{s}_1, y_1)$ & $\textrm{
\item 再来看一下注意力权重的定义。这个过程实际上是对$a(\cdot,\cdot)$做指数归一化:\\
\vspace{-0.3em}
\begin{displaymath}
\alpha_{i,j} = \frac{\exp(a(\textbf{s}_{i-1}, \textbf{h}_j))}{\sum_{j'} \exp(a(\textbf{s}_{i-1}, \textbf{h}_{j'}))}
\alpha_{i,j} = \frac{\exp(a(\textbf{s}_{j-1}, \textbf{h}_i))}{\sum_{i'} \exp(a(\textbf{s}_{j-1}, \textbf{h}_{i'}))}
\end{displaymath}
\item<2-> 注意力函数$a(\textbf{s},\textbf{h})$的目的是捕捉$\textbf{s}$$\textbf{h}$之间的\alert{相似性},这也可以被看作是目标语表示和源语言表示的一种``统一化'',即把源语言和目标语表示在同一个语义空间,进而语义相近的内容有更大的相似性。\visible<3->{定义$a(\textbf{s},\textbf{h})$的方式:}
......@@ -2572,7 +2572,7 @@ $\textrm{``you''} = \argmax_{y_2} \textrm{P}(y_2|\textbf{s}_1, y_1)$ & $\textrm{
ymin=-0.5,ymax=5.5,
xmin=-0.5,xmax=2.5,
ytick={0,1,...,5},
yticklabels={The,New,York,Times,comments,EOS},
yticklabels={The,New,York,Times,comments,$\langle$eos$\rangle$},
yticklabel style={font=\scriptsize},
xtick={0,1,2},
xticklabels={纽约时报,发表,评论},
......@@ -2593,7 +2593,7 @@ $\textrm{``you''} = \argmax_{y_2} \textrm{P}(y_2|\textbf{s}_1, y_1)$ & $\textrm{
ymin=-0.5,ymax=5.5,
xmin=-0.5,xmax=3.5,
ytick={0,1,...,5},
yticklabels={I,came,to,this,world,EOS},
yticklabels={I,came,to,this,world,$\langle$eos$\rangle$},
yticklabel style={font=\scriptsize},
xtick={0,1,2,3},
xticklabels={我,来到,这个,世界},
......@@ -2715,7 +2715,7 @@ $\textrm{``you''} = \argmax_{y_2} \textrm{P}(y_2|\textbf{s}_1, y_1)$ & $\textrm{
%%% 如何进一步理解注意力机制 - 回到机器翻译任务
\begin{frame}{重新解释注意力机制(续)}
\begin{itemize}
\item 回到机器翻译,如果把目标语状态$\textbf{s}_{i-1}$看做query,而把源语言所有位置的最上层RNN表示$\textbf{h}_{j}$看做{\color{ugreen} \textbf{key}}{\color{red} \textbf{value}}
\item 回到机器翻译,如果把目标语状态$\textbf{s}_{j-1}$看做query,而把源语言所有位置的最上层RNN表示$\textbf{h}_{i}$看做{\color{ugreen} \textbf{key}}{\color{red} \textbf{value}}
\end{itemize}
\vspace{-1.5em}
......@@ -3084,7 +3084,7 @@ $\textrm{``you''} = \argmax_{y_2} \textrm{P}(y_2|\textbf{s}_1, y_1)$ & $\textrm{
% step 6
\visible<6->{
\node[rnnnode] (rnn34) at ([xshift=2\base]rnn33) {};
\node[wordnode,anchor=south] (o4) at ([yshift=\base]rnn34.north) {EOS};
\node[wordnode,anchor=south] (o4) at ([yshift=\base]rnn34.north) {$\langle$eos$\rangle$};
\draw[-latex'] (rnn33) to (rnn34);
\draw[-latex'] (rnn24) to (rnn34);
\draw[-latex'] (rnn34) to (o4);
......@@ -3136,7 +3136,7 @@ $\textrm{``you''} = \argmax_{y_2} \textrm{P}(y_2|\textbf{s}_1, y_1)$ & $\textrm{
\hat{\textbf{y}} = \argmax_{\textbf{y}} \log\textrm{P}(\textbf{y}|\textbf{x}) = \argmax_{\textbf{y}} \sum_{j=1}^{n} \log\textrm{P}(y_j|\textbf{y}_{<j}, \textbf{x})
\end{displaymath}
\item<2-> 由于$y_i$的生成需要依赖$y_{i-1}$,因此无法同时生成$\{y_1,...,y_n\}$。常用的方法是自左向右逐个单词生成
\item<2-> 由于$y_j$的生成需要依赖$y_{j-1}$,因此无法同时生成$\{y_1,...,y_n\}$。常用的方法是自左向右逐个单词生成
\end{itemize}
......@@ -3156,7 +3156,7 @@ $\textrm{``you''} = \argmax_{y_2} \textrm{P}(y_2|\textbf{s}_1, y_1)$ & $\textrm{
\node [rnnnode,anchor=west,fill=green!20] (e3) at ([xshift=1em]e2.east) {\tiny{$e_x()$}};
\node [anchor=north,inner sep=2pt] (w1) at ([yshift=-0.6em]e1.south) {\tiny{}};
\node [anchor=north,inner sep=2pt] (w2) at ([yshift=-0.8em]e2.south) {\tiny{...}};
\node [anchor=north,inner sep=2pt] (w3) at ([yshift=-0.6em]e3.south) {\tiny{EOS}};
\node [anchor=north,inner sep=2pt] (w3) at ([yshift=-0.6em]e3.south) {\tiny{$\langle$eos$\rangle$}};
\draw [->] (w1.north) -- ([yshift=-0.1em]e1.south);
\draw [->] (w3.north) -- ([yshift=-0.1em]e3.south);
......@@ -3202,7 +3202,7 @@ $\textrm{``you''} = \argmax_{y_2} \textrm{P}(y_2|\textbf{s}_1, y_1)$ & $\textrm{
\node [anchor=west,inner sep=2pt] (o5) at ([xshift=0.3em]o4.east) {\tiny{...}};
}
\visible<4->{
\node [anchor=north,inner sep=2pt] (wt1) at ([yshift=-0.6em]t1.south) {\tiny{EOS}};
\node [anchor=north,inner sep=2pt] (wt1) at ([yshift=-0.6em]t1.south) {\tiny{$\langle$sos$\rangle$}};
}
\visible<7->{
\node [anchor=north,inner sep=2pt] (wt2) at ([yshift=-0.6em]t2.south) {\tiny{Have}};
......@@ -3355,7 +3355,7 @@ $\textrm{``you''} = \argmax_{y_2} \textrm{P}(y_2|\textbf{s}_1, y_1)$ & $\textrm{
\node [anchor=west,inner sep=2pt] (o4) at ([xshift=0.3em]o3.east) {\tiny{...}};
}
\node [wnode,anchor=north] (wt1) at ([yshift=-0.8em]t1.south) {\tiny{EOS}};
\node [wnode,anchor=north] (wt1) at ([yshift=-0.8em]t1.south) {\tiny{$\langle$sos$\rangle$}};
\visible<6->{
\node [wnode,anchor=north] (wt2) at ([yshift=-0.8em]t2.south) {\tiny{Have}};
......@@ -3546,7 +3546,7 @@ $\textrm{``you''} = \argmax_{y_2} \textrm{P}(y_2|\textbf{s}_1, y_1)$ & $\textrm{
% words
\node[wnode,below=0pt of encemb1] (encword1) {};
\node[wnode,below=0pt of encemb2] (encword2) {什么};
\node[wnode,below=0pt of encemb4] (encword4) {EOS};
\node[wnode,below=0pt of encemb4] (encword4) {$\langle$eos$\rangle$};
% connections
\draw[-latex'] (enc11) to (enc12);
......@@ -3645,7 +3645,7 @@ $\textrm{``you''} = \argmax_{y_2} \textrm{P}(y_2|\textbf{s}_1, y_1)$ & $\textrm{
\node[rnnnode,fill=blue!20,above=\base of dec54] (softmax4) {};
% words
\node[wnode,below=0pt of decemb1] (decinword1) {SOS};
\node[wnode,below=0pt of decemb1] (decinword1) {$\langle$sos$\rangle$};
\node[wnode,below=0pt of decemb2] (decinword2) {Have};
\node[wnode,below=0pt of decemb4] (decinword4) {?};
......@@ -3655,7 +3655,7 @@ $\textrm{``you''} = \argmax_{y_2} \textrm{P}(y_2|\textbf{s}_1, y_1)$ & $\textrm{
\node[wnode,anchor=base] (decoutword2) at (\XCoord,\YCoord) {you};
\ExtractX{$(softmax4.north)$}
\ExtractY{$(decoutword1.base)$}
\node[wnode,anchor=base] (decoutword4) at (\XCoord,\YCoord) {EOS};
\node[wnode,anchor=base] (decoutword4) at (\XCoord,\YCoord) {$\langle$eos$\rangle$};
% connections
\draw[-latex'] (dec11) to (dec12);
......@@ -3810,7 +3810,7 @@ $\textrm{``you''} = \argmax_{y_2} \textrm{P}(y_2|\textbf{s}_1, y_1)$ & $\textrm{
\node [anchor=north,rnnnode,fill=blue!30!white] (e2) at ([yshift=-2em]node12.south) {\tiny{}};
\node [anchor=north,rnnnode,fill=blue!30!white] (e3) at ([yshift=-2em]node13.south) {\tiny{}};
\node [anchor=north,rnnnode,fill=blue!30!white] (e4) at ([yshift=-2em]node14.south) {\tiny{}};
\node [anchor=north,inner sep=2pt] (w1) at ([yshift=-1em]e1.south) {\tiny{$<$s$>$}};
\node [anchor=north,inner sep=2pt] (w1) at ([yshift=-1em]e1.south) {\tiny{$\langle$sos$\rangle$}};
\node [anchor=north,inner sep=2pt] (w2) at ([yshift=-1em]e2.south) {\tiny{}};
\node [anchor=north,inner sep=2pt] (w3) at ([yshift=-1em]e3.south) {\tiny{我们}};
\node [anchor=north,inner sep=2pt] (w4) at ([yshift=-1em]e4.south) {\tiny{开始}};
......@@ -4100,9 +4100,9 @@ $\textrm{``you''} = \argmax_{y_2} \textrm{P}(y_2|\textbf{s}_1, y_1)$ & $\textrm{
\node [outputnode,anchor=south] (o1) at ([yshift=1em]res5.north) {\tiny{$\textbf{Output layer}$}};
\node [inputnode,anchor=north west] (input2) at ([yshift=-1em]sa2.south west) {\tiny{$\textbf{Embedding}$}};
\node [posnode,anchor=north east] (pos2) at ([yshift=-1em]sa2.south east) {\tiny{$\textbf{Position}$}};
\node [anchor=north] (outputs) at ([yshift=-3em]sa2.south) {\tiny{$\textbf{解码器输入: $<$SOS$>$ I am fine}$}};
\node [anchor=north] (outputs) at ([yshift=-3em]sa2.south) {\tiny{$\textbf{解码器输入: $<$sos$>$ I am fine}$}};
\node [anchor=east] (decoder) at ([xshift=-1em,yshift=-1.5em]o1.west) {\scriptsize{\textbf{解码器}}};
\node [anchor=north] (decoutputs) at ([yshift=1.5em]o1.north) {\tiny{$\textbf{解码器输出: I am fine $<$EOS$>$ }$}};
\node [anchor=north] (decoutputs) at ([yshift=1.5em]o1.north) {\tiny{$\textbf{解码器输出: I am fine $<$eos$>$ }$}};
\draw [->] (sa2.north) -- (res3.south);
\draw [->] (res3.north) -- (ed1.south);
......@@ -4127,6 +4127,9 @@ $\textrm{``you''} = \argmax_{y_2} \textrm{P}(y_2|\textbf{s}_1, y_1)$ & $\textrm{
\node [rectangle,inner sep=0.7em,rounded corners=1pt,very thick,dotted,draw=ugreen!70] [fit = (sa1) (res1) (ffn1) (res2)] (box0) {};
\node [rectangle,inner sep=0.7em,rounded corners=1pt,very thick,dotted,draw=red!60] [fit = (sa2) (res3) (res5)] (box1) {};
\node [ugreen,font=\scriptsize] (count) at ([xshift=-1.5em,yshift=-1em]encoder.south) {$6\times$};
\node [red,font=\scriptsize] (count) at ([xshift=10.8em,yshift=0em]decoder.south) {$\times 6$};
\end{scope}
\end{tikzpicture}
\end{center}
......@@ -4180,9 +4183,9 @@ $\textrm{``you''} = \argmax_{y_2} \textrm{P}(y_2|\textbf{s}_1, y_1)$ & $\textrm{
\node [outputnode,anchor=south] (o1) at ([yshift=1em]res5.north) {\tiny{$\textbf{Output layer}$}};
\node [inputnode,anchor=north west] (input2) at ([yshift=-1em]sa2.south west) {\tiny{$\textbf{Embedding}$}};
\node [posnode,anchor=north east] (pos2) at ([yshift=-1em]sa2.south east) {\tiny{$\textbf{Position}$}};
\node [anchor=north] (outputs) at ([yshift=-3em]sa2.south) {\tiny{$\textbf{解码器输入: $<$SOS$>$ I am fine}$}};
\node [anchor=north] (outputs) at ([yshift=-3em]sa2.south) {\tiny{$\textbf{解码器输入: $<$sos$>$ I am fine}$}};
\node [anchor=east] (decoder) at ([xshift=-1em,yshift=-1.5em]o1.west) {\scriptsize{\textbf{解码器}}};
\node [anchor=north] (decoutputs) at ([yshift=1.5em]o1.north) {\tiny{$\textbf{解码器输出: I am fine $<$EOS$>$ }$}};
\node [anchor=north] (decoutputs) at ([yshift=1.5em]o1.north) {\tiny{$\textbf{解码器输出: I am fine $<$eos$>$ }$}};
\draw [->] (sa2.north) -- (res3.south);
\draw [->] (res3.north) -- (ed1.south);
......@@ -4414,9 +4417,9 @@ PE_{(pos,2i+1)} = cos(pos/10000^{2i/d_{model}})
\node [outputnode,anchor=south] (o1) at ([yshift=1em]res5.north) {\tiny{$\textbf{Output layer}$}};
\node [inputnode,anchor=north west] (input2) at ([yshift=-1em]sa2.south west) {\tiny{$\textbf{Embedding}$}};
\node [posnode,anchor=north east] (pos2) at ([yshift=-1em]sa2.south east) {\tiny{$\textbf{Position}$}};
\node [anchor=north] (outputs) at ([yshift=-3em]sa2.south) {\tiny{$\textbf{解码器输入: $<$SOS$>$ I am fine}$}};
\node [anchor=north] (outputs) at ([yshift=-3em]sa2.south) {\tiny{$\textbf{解码器输入: $<$sos$>$ I am fine}$}};
\node [anchor=east] (decoder) at ([xshift=-1em,yshift=-1.5em]o1.west) {\scriptsize{\textbf{解码器}}};
\node [anchor=north] (decoutputs) at ([yshift=1.5em]o1.north) {\tiny{$\textbf{解码器输出: I am fine $<$EOS$>$ }$}};
\node [anchor=north] (decoutputs) at ([yshift=1.5em]o1.north) {\tiny{$\textbf{解码器输出: I am fine $<$eos$>$ }$}};
\draw [->] (sa2.north) -- (res3.south);
\draw [->] (res3.north) -- (ed1.south);
......@@ -4591,7 +4594,7 @@ PE_{(pos,2i+1)} = cos(pos/10000^{2i/d_{model}})
\node[srcnode] (src3) at ([xshift=0.5\hnode]src2.south west) {\scriptsize{learned}};
\node[srcnode] (src4) at ([xshift=0.5\hnode]src3.south west) {\scriptsize{nothing}};
\node[srcnode] (src5) at ([xshift=0.5\hnode]src4.south west) {\scriptsize{?}};
\node[srcnode] (src6) at ([xshift=0.5\hnode]src5.south west) {\scriptsize{EOS}};
\node[srcnode] (src6) at ([xshift=0.5\hnode]src5.south west) {\scriptsize{$\langle$eos$\rangle$}};
% target
\node[tgtnode] (tgt1) at (-6.0*0.5*\hnode,-1.05*\hnode+5.5*0.5*\hnode) {\scriptsize{Have}};
......@@ -4599,7 +4602,7 @@ PE_{(pos,2i+1)} = cos(pos/10000^{2i/d_{model}})
\node[tgtnode] (tgt3) at ([yshift=-0.5\hnode]tgt2.north east) {\scriptsize{learned}};
\node[tgtnode] (tgt4) at ([yshift=-0.5\hnode]tgt3.north east) {\scriptsize{nothing}};
\node[tgtnode] (tgt5) at ([yshift=-0.5\hnode]tgt4.north east) {\scriptsize{?}};
\node[tgtnode] (tgt6) at ([yshift=-0.5\hnode]tgt5.north east) {\scriptsize{EOS}};
\node[tgtnode] (tgt6) at ([yshift=-0.5\hnode]tgt5.north east) {\scriptsize{$\langle$eos$\rangle$}};
\node [rounded corners=0.3em,fill=yellow!30] (qk) at ([xshift=2.5em,yshift=5em]a55.north) {\large{$\frac{QK^{T}}{\sqrt{d_k}}$}};
\node [rounded corners=0.3em,anchor=west] (add) at ([xshift=0.1em]qk.east) {\large{+}};
......@@ -4630,7 +4633,7 @@ PE_{(pos,2i+1)} = cos(pos/10000^{2i/d_{model}})
\node[srcnode] (src3) at ([xshift=0.5\hnode]src2.south west) {\scriptsize{learned}};
\node[srcnode] (src4) at ([xshift=0.5\hnode]src3.south west) {\scriptsize{nothing}};
\node[srcnode] (src5) at ([xshift=0.5\hnode]src4.south west) {\scriptsize{?}};
\node[srcnode] (src6) at ([xshift=0.5\hnode]src5.south west) {\scriptsize{EOS}};
\node[srcnode] (src6) at ([xshift=0.5\hnode]src5.south west) {\scriptsize{$\langle$eos$\rangle$}};
% target
\node[tgtnode] (tgt1) at (5.4*0.5*\hnode,-1.05*\hnode+5.5*0.5*\hnode) {\scriptsize{Have}};
......@@ -4638,7 +4641,7 @@ PE_{(pos,2i+1)} = cos(pos/10000^{2i/d_{model}})
\node[tgtnode] (tgt3) at ([yshift=-0.5\hnode]tgt2.north east) {\scriptsize{learned}};
\node[tgtnode] (tgt4) at ([yshift=-0.5\hnode]tgt3.north east) {\scriptsize{nothing}};
\node[tgtnode] (tgt5) at ([yshift=-0.5\hnode]tgt4.north east) {\scriptsize{?}};
\node[tgtnode] (tgt6) at ([yshift=-0.5\hnode]tgt5.north east) {\scriptsize{EOS}};
\node[tgtnode] (tgt6) at ([yshift=-0.5\hnode]tgt5.north east) {\scriptsize{$\langle$eos$\rangle$}};
\node [rounded corners=0.3em,anchor=west,fill=green!30] (softmax) at ([xshift=-6em]left.east) {\large{Softmax}};
......@@ -4800,9 +4803,9 @@ PE_{(pos,2i+1)} = cos(pos/10000^{2i/d_{model}})
\node [outputnode,anchor=south] (o1) at ([yshift=1em]res5.north) {\tiny{$\textbf{Output layer}$}};
\node [inputnode,anchor=north west] (input2) at ([yshift=-1em]sa2.south west) {\tiny{$\textbf{Embedding}$}};
\node [posnode,anchor=north east] (pos2) at ([yshift=-1em]sa2.south east) {\tiny{$\textbf{Position}$}};
\node [anchor=north] (outputs) at ([yshift=-3em]sa2.south) {\tiny{$\textbf{解码器输入: $<$SOS$>$ I am fine}$}};
\node [anchor=north] (outputs) at ([yshift=-3em]sa2.south) {\tiny{$\textbf{解码器输入: $<$sos$>$ I am fine}$}};
\node [anchor=east] (decoder) at ([xshift=-1em,yshift=-1.5em]o1.west) {\scriptsize{\textbf{解码器}}};
\node [anchor=north] (decoutputs) at ([yshift=1.5em]o1.north) {\tiny{$\textbf{解码器输出: I am fine $<$EOS$>$ }$}};
\node [anchor=north] (decoutputs) at ([yshift=1.5em]o1.north) {\tiny{$\textbf{解码器输出: I am fine $<$eos$>$ }$}};
\draw [->] (sa2.north) -- (res3.south);
\draw [->] (res3.north) -- (ed1.south);
......@@ -5030,9 +5033,9 @@ x_{l+1} = x_l+\mathcal{F}(x_l)
\node [outputnode,anchor=south] (o1) at ([yshift=1em]res5.north) {\tiny{$\textbf{Output layer}$}};
\node [inputnode,anchor=north west] (input2) at ([yshift=-1em]sa2.south west) {\tiny{$\textbf{Embedding}$}};
\node [posnode,anchor=north east] (pos2) at ([yshift=-1em]sa2.south east) {\tiny{$\textbf{Position}$}};
\node [anchor=north] (outputs) at ([yshift=-3em]sa2.south) {\tiny{$\textbf{解码器输入: $<$SOS$>$ I am fine}$}};
\node [anchor=north] (outputs) at ([yshift=-3em]sa2.south) {\tiny{$\textbf{解码器输入: $<$sos$>$ I am fine}$}};
\node [anchor=east] (decoder) at ([xshift=-1em,yshift=-1.5em]o1.west) {\scriptsize{\textbf{解码器}}};
\node [anchor=north] (decoutputs) at ([yshift=1.5em]o1.north) {\tiny{$\textbf{解码器输出: I am fine $<$EOS$>$ }$}};
\node [anchor=north] (decoutputs) at ([yshift=1.5em]o1.north) {\tiny{$\textbf{解码器输出: I am fine $<$eos$>$ }$}};
\draw [->] (sa2.north) -- (res3.south);
\draw [->] (res3.north) -- (ed1.south);
......@@ -5170,7 +5173,7 @@ x_{l+1} = x_l+\mathcal{F}(x_l)
\node [rnnnode,anchor=west,fill=green!20] (e3) at ([xshift=1em]e2.east) {\tiny{$e_x()$}};
\node [anchor=north,inner sep=2pt] (w1) at ([yshift=-0.6em]e1.south) {\tiny{}};
\node [anchor=north,inner sep=2pt] (w2) at ([yshift=-0.6em]e2.south) {\tiny{}};
\node [anchor=north,inner sep=2pt] (w3) at ([yshift=-0.6em]e3.south) {\tiny{EOS}};
\node [anchor=north,inner sep=2pt] (w3) at ([yshift=-0.6em]e3.south) {\tiny{$\langle$eos$\rangle$}};
\node [anchor=south] (dot1) at ([xshift=0.4em,yshift=-0.7em]h1.south) {\tiny{...}};
\node [anchor=south] (dot2) at ([xshift=-0.4em,yshift=-0.7em]h3.south) {\tiny{...}};
......@@ -5212,7 +5215,7 @@ x_{l+1} = x_l+\mathcal{F}(x_l)
\node [anchor=south,fill=black!5!white,minimum height=1.1em,minimum width=13em,inner sep=2pt,rounded corners=1pt,draw] (loss) at ([xshift=1.8em,yshift=1em]o2.north) {\scriptsize{\textbf{Cross Entropy Loss}}};
}
\visible<3->{
\node [anchor=north,inner sep=2pt] (wt1) at ([yshift=-0.6em]t1.south) {\tiny{EOS}};
\node [anchor=north,inner sep=2pt] (wt1) at ([yshift=-0.6em]t1.south) {\tiny{$\langle$sos$\rangle$}};
\node [anchor=north,inner sep=2pt] (wt2) at ([yshift=-0.6em]t2.south) {\tiny{How}};
\node [anchor=north,inner sep=2pt] (wt3) at ([yshift=-0.8em]t3.south) {\tiny{are}};
\node [anchor=north,inner sep=2pt] (wt4) at ([yshift=-0.8em]t4.south) {\tiny{you}};
......@@ -5413,7 +5416,7 @@ x_{l+1} = x_l+\mathcal{F}(x_l)
\node [rnnnode,anchor=west,fill=green!20] (e3) at ([xshift=1em]e2.east) {\tiny{$e_x()$}};
\node [anchor=north,inner sep=2pt] (w1) at ([yshift=-0.6em]e1.south) {\tiny{}};
\node [anchor=north,inner sep=2pt] (w2) at ([yshift=-0.6em]e2.south) {\tiny{}};
\node [anchor=north,inner sep=2pt] (w3) at ([yshift=-0.6em]e3.south) {\tiny{EOS}};
\node [anchor=north,inner sep=2pt] (w3) at ([yshift=-0.6em]e3.south) {\tiny{$\langle$eos$\rangle$}};
%\node [anchor=south] (dot1) at ([xshift=0.4em,yshift=-0.7em]h1.south) {\tiny{...}};
%\node [anchor=south] (dot2) at ([xshift=-0.4em,yshift=-0.7em]h3.south) {\tiny{...}};
......@@ -5473,7 +5476,7 @@ x_{l+1} = x_l+\mathcal{F}(x_l)
%\node [anchor=west,inner sep=2pt] (o5) at ([xshift=0.3em]o4.east) {\tiny{...}};
}
\visible<4->{
\node [anchor=north,inner sep=2pt] (wt1) at ([yshift=-0.6em]t1.south) {\tiny{EOS}};
\node [anchor=north,inner sep=2pt] (wt1) at ([yshift=-0.6em]t1.south) {\tiny{$\langle$sos$\rangle$}};
}
\visible<6->{
\node [anchor=north,inner sep=2pt] (wt2) at ([yshift=-0.6em]t2.south) {\tiny{How}};
......@@ -5497,7 +5500,7 @@ x_{l+1} = x_l+\mathcal{F}(x_l)
\visible<8->{
\node [anchor=center,inner sep=2pt] (wo3) at ([yshift=1.2em]o3.north) {\tiny{you}};
\node [anchor=south,inner sep=2pt] (wos3) at (wo3.north) {\tiny{\textbf{[step 3]}}};
\node [anchor=center,inner sep=2pt] (wo4) at ([yshift=1.2em]o4.north) {\tiny{EOS}};
\node [anchor=center,inner sep=2pt] (wo4) at ([yshift=1.2em]o4.north) {\tiny{$\langle$eos$\rangle$}};
\node [anchor=south,inner sep=2pt] (wos4) at (wo4.north) {\tiny{\textbf{[step 4]}}};
}
......@@ -5606,7 +5609,7 @@ x_{l+1} = x_l+\mathcal{F}(x_l)
\node [anchor=north,rnnnode,fill=blue!30!white] (e2) at ([yshift=-2em]node12.south) {\tiny{}};
\node [anchor=north,rnnnode,fill=blue!30!white] (e3) at ([yshift=-2em]node13.south) {\tiny{}};
\node [anchor=north,rnnnode,fill=blue!30!white] (e4) at ([yshift=-2em]node14.south) {\tiny{}};
\node [anchor=north,inner sep=2pt] (w1) at ([yshift=-1em]e1.south) {\tiny{$<$s$>$}};
\node [anchor=north,inner sep=2pt] (w1) at ([yshift=-1em]e1.south) {\tiny{$\langle$sos$\rangle$}};
\node [anchor=north,inner sep=2pt] (w2) at ([yshift=-1em]e2.south) {\tiny{}};
\node [anchor=north,inner sep=2pt] (w3) at ([yshift=-1em]e3.south) {\tiny{我们}};
\node [anchor=north,inner sep=2pt] (w4) at ([yshift=-1em]e4.south) {\tiny{开始}};
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论