Commit e10537de by 曹润柘

合并分支 'caorunzhe' 到 'master'

Caorunzhe

查看合并请求 !273
parents 35ddc010 b64ef0db
......@@ -104,7 +104,7 @@
%\visible<3->
{
% coverage score formula node
\node [anchor=north west] (formula) at ([xshift=-0.3\hnode,yshift=-1.5\hnode]attn11.south) {\small{不同$\textbf{C}_i$所对应的源语言词的权重是不同的}};
\node [anchor=north west] (formula) at ([xshift=-0.3\hnode,yshift=-1.5\hnode]attn11.south) {\small{不同$\textbf{C}_j$所对应的源语言词的权重是不同的}};
\node [anchor=north west] (example) at (formula.south west) {\footnotesize{$\textbf{C}_2=0.4 \times \textbf{h}(\textrm{``你''}) + 0.4 \times \textbf{h}(\textrm{``什么''}) +$}};
\node [anchor=north west] (example2) at ([yshift=0.4em]example.south west) {\footnotesize{$\ \ \ \ \ \ \ \ 0 \times \textbf{h}(\textrm{``都''}) + 0.1 \times \textbf{h}(\textrm{``没''}) + \ldots$}};
}
......
......@@ -304,8 +304,8 @@
\visible<3->{
\begin{center}
\begin{tikzpicture}
\node [anchor=south west, fill=red, minimum width=1.5cm, minimum height=2.3cm] (mt) at (1,0) {{\color{white} \textbf{机器}}};
\node [anchor=south west, fill=ugreen, minimum width=1.5cm, minimum height=2.7cm] (human) at ([xshift=0.5cm]mt.south east) {{\color{white} \textbf{}}};
\node [anchor=south west, fill=red!50, minimum width=1.5cm, minimum height=2.3cm] (mt) at (1,0) {{\color{white} \textbf{机器}}};
\node [anchor=south west, fill=blue!50, minimum width=1.5cm, minimum height=2.7cm] (human) at ([xshift=0.5cm]mt.south east) {{\color{white} \textbf{人}}};
\node [anchor=south] (mtscore) at (mt.north) {3.9};
\node [anchor=south] (humanscore) at (human.north) {4.7};
\draw [->,thick] ([xshift=-0.5cm]mt.south west) -- ([xshift=0.5cm]human.south east);
......@@ -321,8 +321,8 @@
\visible<4->{
\begin{center}
\begin{tikzpicture}
\node [anchor=south west, fill=red, minimum width=1.5cm, minimum height=1.5cm] (mt) at (1,0) {{\color{white} \textbf{机器}}};
\node [anchor=south west, fill=ugreen, minimum width=1.5cm, minimum height=2.7cm] (human) at ([xshift=0.5cm]mt.south east) {{\color{white} \textbf{}}};
\node [anchor=south west, fill=red!50, minimum width=1.5cm, minimum height=1.5cm] (mt) at (1,0) {{\color{white} \textbf{机器}}};
\node [anchor=south west, fill=blue!50, minimum width=1.5cm, minimum height=2.7cm] (human) at ([xshift=0.5cm]mt.south east) {{\color{white} \textbf{人}}};
\node [anchor=south] (mtscore) at (mt.north) {47\%};
\node [anchor=south] (humanscore) at (human.north) {100\%};
\draw [->,thick] ([xshift=-0.5cm]mt.south west) -- ([xshift=0.5cm]human.south east);
......
......@@ -775,7 +775,7 @@
\end{pgfonlayer}
}
\node [anchor=west,ugreen] (P) at ([xshift=4em,yshift=-0.7em]corpus.east){P($t|s$)};
\node [anchor=west,ugreen] (P) at ([xshift=4em,yshift=-0.7em]corpus.east){P($\mathbf{t}|\mathbf{s}$)};
\node [anchor=south] (modellabel) at (P.north) {{\color{ublue} {\scriptsize \textbf{翻译模型}}}};
\begin{pgfonlayer}{background}
......@@ -808,18 +808,18 @@
\end{tikzpicture}
\item<3-> \textbf{步骤1}:构建单词翻译表 - 翻译词典\\
\small{对于任意的源语言单词$x$,要获得它所有可能的译文$Y$。给定一个互译句对$(s,t)$,对于$y \in Y$,定义$\textrm{P}(x \leftrightarrow y; s, t)$表示$x$$y$$(x,y)$中互译的概率,我们用$x$$y$的联合概率表示:
\small{对于任意的源语言单词$x$,要获得它所有可能的译文$Y$。给定一个互译句对$(\mathbf{s},\mathbf{t})$,对于$y \in Y$,定义$\textrm{P}(x \leftrightarrow y; \mathbf{s}, \mathbf{t})$表示$x$和$y$在$(\mathbf{s},\mathbf{t})$中互译的概率,我们用$(x,y)$的联合概率表示:
\vspace{-2.0em}
\begin{eqnarray}
\textrm{P}(x \leftrightarrow y; s,t) & \equiv & \textrm{P}(x,y;s,t) \nonumber \\
& = & \frac{c(x,y;s,t)}{\sum_{x',y'} c(x',y';s,t)} \nonumber
\textrm{P}(x \leftrightarrow y; \mathbf{s},\mathbf{t}) & \equiv & \textrm{P}(x,y;\mathbf{s},\mathbf{t}) \nonumber \\
& = & \frac{c(x,y;\mathbf{s},\mathbf{t})}{\sum_{x',y'} c(x',y';\mathbf{s},\mathbf{t})} \nonumber
\end{eqnarray}
\vspace{-0.5em}
$c(x,y;s,t)$表示$(x,y)$$(s,t)$中共现的次数; $\sum_{x',y'} c(x',y';s,t)$表示$(s,t)$中任意源/译文单词共现的总次数
$c(x,y;\mathbf{s},\mathbf{t})$表示$(x,y)$在$(\mathbf{s},\mathbf{t})$中共现的次数; $\sum_{x',y'} c(x',y';\mathbf{s},\mathbf{t})$表示$(\mathbf{s},\mathbf{t})$中任意源/译文单词共现的总次数
}
\end{itemize}
......@@ -831,7 +831,7 @@
\begin{frame}{实现一个简单的机器翻译系统:学习单词翻译概率(2)}
\vspace{-1em}
\begin{eqnarray}
\textrm{P}(x,y;s,t) & = & \frac{c(x,y;s,t)}{\sum_{x',y'} c(x',y';s,t)} \nonumber
\textrm{P}(x,y;\mathbf{s},\mathbf{t}) & = & \frac{c(x,y;\mathbf{s},\mathbf{t})}{\sum_{x',y'} c(x',y';\mathbf{s},\mathbf{t})} \nonumber
\end{eqnarray}
\vspace{-0.5em}
......@@ -839,7 +839,7 @@
\begin{flushleft}
\begin{tikzpicture}
\node [anchor=west] (s) at (0,0) {$s=$};
\node [anchor=west] (s) at (0,0) {$\mathbf{s}=$};
\node [anchor=center] (sw1) at ([xshift=1em]s.east) {机器};
\visible<1,4->{
\node [anchor=center] (sw2) at ([xshift=1.3em]sw1.east) {翻译};
......@@ -849,7 +849,7 @@
\node [anchor=center] (sw4) at ([xshift=1.0em]sw3.east) {翻译};
}
\node [anchor=north west] (t) at (s.south west) {$t=$};
\node [anchor=north west] (t) at (s.south west) {$\mathbf{t}=$};
\node [anchor=center] (tw1) at ([xshift=1.8em]t.east) {machine};
\visible<1,3,5->{
\node [anchor=center] (tw2) at ([xshift=2.2em]tw1.east) {translation};
......@@ -880,8 +880,8 @@
\begin{itemize}
\item $c(\textrm{'翻译'},\textrm{'translation'};s,t)=\only<1>{?}\visible<2->{1}\visible<3->{+1}\visible<4->{+1}\visible<5->{+1=4}$
\item<6-> $\sum_{x',y'} c(x',y';s,t)= \textrm{使劲数...} = 63\visible<7->{ = 9 \times 7 = |s| \times |t|}$
\item $c(\textrm{'翻译'},\textrm{'translation'};\mathbf{s},\mathbf{t})=\only<1>{?}\visible<2->{1}\visible<3->{+1}\visible<4->{+1}\visible<5->{+1=4}$
\item<6-> $\sum_{x',y'} c(x',y';\mathbf{s},\mathbf{t})= \textrm{使劲数...} = 63\visible<7->{ = 9 \times 7 = |\mathbf{s}| \times |\mathbf{t}|}$
\vspace{0.3em}
\begin{itemize}
\item<7-> $|\cdot|$表示句子长度
......@@ -889,14 +889,14 @@
\vspace{0.3em}
\item<8-> '翻译'和'translation'的互译概率为
\begin{displaymath}
\textrm{P}(\textrm{'翻译'},\textrm{'translation'};s,t) = 4/63
\textrm{P}(\textrm{'翻译'},\textrm{'translation'};\mathbf{s},\mathbf{t}) = 4/63
\end{displaymath}
\vspace{-0.5em}
类似的
\vspace{-0.5em}
\begin{eqnarray}
\textrm{P}(\textrm{'机器'},\textrm{'translation'};s,t) & = & 2/63 \nonumber \\
\textrm{P}(\textrm{'机器'},\textrm{'look'};s,t) & = & 0/63 \nonumber
\textrm{P}(\textrm{'机器'},\textrm{'translation'};\mathbf{s},\mathbf{t}) & = & 2/63 \nonumber \\
\textrm{P}(\textrm{'机器'},\textrm{'look'};\mathbf{s},\mathbf{t}) & = & 0/63 \nonumber
\end{eqnarray}
\end{itemize}
......@@ -909,12 +909,12 @@
\begin{itemize}
\item 很多时候,我们有多个互译句对$(s^{[1]},t^{[1]}),...,(s^{[n]},t^{[n]})$,称之为\alert{双语平行数据(语料)}。翻译概率可以被定义为
\item 很多时候,我们有多个互译句对$(\mathbf{s}^{[1]},\mathbf{t}^{[1]}),...,(\mathbf{s}^{[n]},\mathbf{t}^{[n]})$,称之为\alert{双语平行数据(语料)}。翻译概率可以被定义为
\vspace{-1em}
\begin{eqnarray}
\textrm{P}(x,y) & = & \frac{\sum_{i=1}^{n}c(x,y;s^{[i]},t^{[i]})}{\sum_{i=1}^{n} \sum_{x',y'} c(x',y';s^{[i]},t^{[i]})} \nonumber
\textrm{P}(x,y) & = & \frac{\sum_{i=1}^{n}c(x,y;\mathbf{s}^{[i]},\mathbf{t}^{[i]})}{\sum_{i=1}^{n} \sum_{x',y'} c(x',y';\mathbf{s}^{[i]},\mathbf{t}^{[i]})} \nonumber
\end{eqnarray}
\item<2-> 说白了就是计算$(x,y)$的频次时,在每个句子上累加
......@@ -922,11 +922,11 @@
\begin{flushleft}
\begin{tikzpicture}
\node [anchor=west] (s1) at (0,0) {$s_1=$ 机器 翻译 就 是 用 计算机 进行 翻译};
\node [anchor=north west] (t1) at ([yshift=0.4em]s1.south west) {$t_1=$ Machine translation is just translation by computer};
\node [anchor=west] (s1) at (0,0) {$\mathbf{s}^{[1]}=$ 机器 翻译 就 是 用 计算机 进行 翻译};
\node [anchor=north west] (t1) at ([yshift=0.4em]s1.south west) {$\mathbf{t}^{[1]}=$ Machine translation is just translation by computer};
\node [anchor=north west] (s2) at (t1.south west) {$s_2=$ 那 人工 翻译 呢 ?};
\node [anchor=north west] (t2) at ([yshift=0.4em]s2.south west) {$t_2=$ So , what is human translation ?};
\node [anchor=north west] (s2) at (t1.south west) {$\mathbf{s}^{[2]}=$ 那 人工 翻译 呢 ?};
\node [anchor=north west] (t2) at ([yshift=0.4em]s2.south west) {$\mathbf{t}^{[2]}=$ So , what is human translation ?};
\end{tikzpicture}
\end{flushleft}
......@@ -936,8 +936,8 @@
{\footnotesize
\begin{eqnarray}
& & \textrm{P}(\textrm{'翻译'},\textrm{'translation'}) \nonumber \\
& = & \frac{c(\textrm{'翻译'},\textrm{'translation'};s^{[1]},t^{[1]})+c(\textrm{'翻译'},\textrm{'translation'};s^{[2]},t^{[2]})}{\sum_{x',y'} c(x',y';s^{[1]},t^{[1]}) + \sum_{x',y'} c(x',y';s^{[2]},t^{[2]})} \nonumber \\
\visible<3->{& = & \frac{4 + 1}{|s^{[1]}| \times |t^{[1]}| + |s^{[2]}| \times |t^{[2]}|} = \frac{4 + 1}{9 \times 7 + 5 \times 7} = \frac{5}{98}} \nonumber
& = & \frac{c(\textrm{'翻译'},\textrm{'translation'};\mathbf{s}^{[1]},\mathbf{t}^{[1]})+c(\textrm{'翻译'},\textrm{'translation'};\mathbf{s}^{[2]},\mathbf{t}^{[2]})}{\sum_{x',y'} c(x',y';\mathbf{s}^{[1]},\mathbf{t}^{[1]}) + \sum_{x',y'} c(x',y';\mathbf{s}^{[2]},\mathbf{t}^{[2]})} \nonumber \\
\visible<3->{& = & \frac{4 + 1}{|\mathbf{s}^{[1]}| \times |\mathbf{t}^{[1]}| + |\mathbf{s}^{[2]}| \times |\mathbf{t}^{[2]}|} = \frac{4 + 1}{9 \times 7 + 5 \times 7} = \frac{5}{98}} \nonumber
\end{eqnarray}
}
......@@ -964,7 +964,7 @@
\end{pgfonlayer}
}
\node [anchor=west,ugreen] (P) at ([xshift=4em,yshift=-0.7em]corpus.east){P($t|s$)};
\node [anchor=west,ugreen] (P) at ([xshift=4em,yshift=-0.7em]corpus.east){P($\mathbf{t}|\mathbf{s}$)};
\node [anchor=south] (modellabel) at (P.north) {{\color{ublue} {\scriptsize \textbf{翻译模型}}}};
\begin{pgfonlayer}{background}
......@@ -992,14 +992,14 @@
\end{center}
\begin{itemize}
\item \textbf{步骤2}: 对任意的句对$(s,t)$计算句子级翻译概率$\textrm{P}(t|s)$ \\
\item \textbf{步骤2}: 对任意的句对$(\mathbf{s},\mathbf{t})$计算句子级翻译概率$\textrm{P}(\mathbf{t}|\mathbf{s})$ \\
\vspace{0.5em}
\visible<2->{
用一种比较简单的思路:定义$(s,t)$上的一种分数$g(s,t)$
用一种比较简单的思路:定义$(\mathbf{s},\mathbf{t})$上的一种分数$g(\mathbf{s},\mathbf{t})$
\begin{itemize}
\item $g(s,t)$的值越大翻译质量越好
\item $g(s,t)$的值越小翻译质量越差
\item $g(\mathbf{s},\mathbf{t})$的值越大翻译质量越好
\item $g(\mathbf{s},\mathbf{t})$的值越小翻译质量越差
\end{itemize}
}
......@@ -1009,10 +1009,10 @@
于是,我们进一步定义
\begin{displaymath}
\textrm{P}(t|s) = \frac{g(s,t)}{\sum_{t'}g(s,t')}
\textrm{P}(\mathbf{t}|\mathbf{s}) = \frac{g(\mathbf{s},\mathbf{t})}{\sum_{\mathbf{t}'}g(\mathbf{s},\mathbf{t}')}
\end{displaymath}
实际上就是对$g(s,t)$在所有可能的译文集合上做归一化,使其具有概率意义
实际上就是对$g(\mathbf{s},\mathbf{t})$在所有可能的译文集合上做归一化,使其具有概率意义
}
\end{itemize}
......@@ -1030,11 +1030,11 @@
\item 两个问题
\begin{enumerate}
\item 如何计算$g(s,t)$ \visible<2->{- 最关键的建模问题,马上开始}
\item 如何计算$\sum_{t'} g(s,t')$ \visible<2->{- 实际上\alert{不用计算},后面再说}
\item 如何计算$g(\mathbf{s},\mathbf{t})$ \visible<2->{- 最关键的建模问题,马上开始}
\item 如何计算$\sum_{\mathbf{t}'} g(\mathbf{s},\mathbf{t}')$ \visible<2->{- 实际上\alert{不用计算},后面再说}
\end{enumerate}
\item<3-> \textbf{$g(s,t)$建模: }根据本章第一页的假设,$s$$t$之间存在一种单词间的对应,我们称之为\alert{词对齐}关系
\item<3-> \textbf{$g(\mathbf{s},\mathbf{t})$建模: }根据本章第一页的假设,$\mathbf{s}$和$\mathbf{t}$之间存在一种单词间的对应,我们称之为\alert{词对齐}关系
\begin{center}
\begin{tikzpicture}
......@@ -1045,7 +1045,7 @@
\node [anchor=west] (s3) at ([xshift=0.5em]s2.east) {\footnotesize{$_3$}};
\node [anchor=west] (s4) at ([xshift=0.5em]s3.east) {感到\footnotesize{$_4$}};
\node [anchor=west] (s5) at ([xshift=0.5em]s4.east) {满意\footnotesize{$_5$}};
\node [anchor=east] (s) at (s1.west) {$s=$};
\node [anchor=east] (s) at (s1.west) {$\mathbf{s}=$};
\end{scope}
\begin{scope}[yshift=-3.0em]
......@@ -1054,7 +1054,7 @@
\node [anchor=west] (t3) at ([xshift=0.3em,yshift=0.1em]t2.east) {satisfied\footnotesize{$_3$}};
\node [anchor=west] (t4) at ([xshift=0.3em]t3.east) {with\footnotesize{$_4$}};
\node [anchor=west] (t5) at ([xshift=0.3em,yshift=-0.2em]t4.east) {you\footnotesize{$_5$}};
\node [anchor=east] (t) at ([xshift=-0.3em]t1.west) {$t=$};
\node [anchor=east] (t) at ([xshift=-0.3em]t1.west) {$\mathbf{t}=$};
\end{scope}
......@@ -1082,9 +1082,9 @@
\begin{frame}{实现一个简单的机器翻译系统:句子级翻译模型(3)}
\begin{itemize}
\item 给定一个句对$(s,t)$,及它们之间的(最优)词对齐$\hat{A}$,可以定义模型得分为:
\item 给定一个句对$(\mathbf{s},\mathbf{t})$,及它们之间的(最优)词对齐$\hat{A}$,可以定义模型得分为:
\begin{displaymath}
g(s,t) \equiv \prod_{(j,i) \in \hat{A}} \textrm{P}(s_j,t_i)
g(\mathbf{s},\mathbf{t}) \equiv \prod_{(j,i) \in \hat{A}} \textrm{P}(s_j,t_i)
\end{displaymath}
显然每个单词翻译概率都高,那么整句的模型得分也高
......@@ -1097,7 +1097,7 @@ g(s,t) \equiv \prod_{(j,i) \in \hat{A}} \textrm{P}(s_j,t_i)
\node [anchor=west] (s3) at ([xshift=0.5em]s2.east) {\footnotesize{$_3$}};
\node [anchor=west] (s4) at ([xshift=0.5em]s3.east) {感到\footnotesize{$_4$}};
\node [anchor=west] (s5) at ([xshift=0.5em]s4.east) {满意\footnotesize{$_5$}};
\node [anchor=east] (s) at (s1.west) {$s=$};
\node [anchor=east] (s) at (s1.west) {$\mathbf{s}=$};
\end{scope}
\begin{scope}[yshift=-3.0em]
......@@ -1106,7 +1106,7 @@ g(s,t) \equiv \prod_{(j,i) \in \hat{A}} \textrm{P}(s_j,t_i)
\node [anchor=west] (t3) at ([xshift=0.3em,yshift=0.1em]t2.east) {satisfied\footnotesize{$_3$}};
\node [anchor=west] (t4) at ([xshift=0.3em]t3.east) {with\footnotesize{$_4$}};
\node [anchor=west] (t5) at ([xshift=0.3em,yshift=-0.2em]t4.east) {you\footnotesize{$_5$}};
\node [anchor=east] (t) at ([xshift=-0.3em]t1.west) {$t=$};
\node [anchor=east] (t) at ([xshift=-0.3em]t1.west) {$\mathbf{t}=$};
\end{scope}
......@@ -1122,7 +1122,7 @@ g(s,t) \equiv \prod_{(j,i) \in \hat{A}} \textrm{P}(s_j,t_i)
\vspace{-2.5em}
\begin{eqnarray}
g(s,t) & = & \textrm{P}(\textrm{'我','I'}) \times \textrm{P}(\textrm{'对','with'}) \times \textrm{P}(\textrm{'你','you'}) \times \nonumber \\
g(\mathbf{s},\mathbf{t}) & = & \textrm{P}(\textrm{'我','I'}) \times \textrm{P}(\textrm{'对','with'}) \times \textrm{P}(\textrm{'你','you'}) \times \nonumber \\
& & \textrm{P}(\textrm{'感到','am'}) \times \textrm{P}(\textrm{'满意','satisfied'}) \nonumber
\end{eqnarray}
......@@ -1141,13 +1141,13 @@ g(s,t) & = & \textrm{P}(\textrm{'我','I'}) \times \textrm{P}(\textrm{'对','with'
\begin{frame}{实现一个简单的机器翻译系统:句子级翻译模型(4)}
\begin{itemize}
\item \textbf{但是},这样设计的$g(s,t)$没有考虑词序的信息。相同译词出现在不同的位置,得分相同 - 无法选择流畅的译文
\item \textbf{但是},这样设计的$g(\mathbf{s},\mathbf{t})$没有考虑词序的信息。相同译词出现在不同的位置,得分相同 - 无法选择流畅的译文
\vspace{0.5em}
\begin{tabular}{l | l }
& \footnotesize{$\prod\limits_{(j,i) \in \hat{A}} \textrm{P}(s_j,t_i)$} \visible<2->{\alert{\footnotesize{$\times\textrm{P}_{lm}(t)$}}} \\ \hline
& \footnotesize{$\prod\limits_{(j,i) \in \hat{A}} \textrm{P}(s_j,t_i)$} \visible<2->{\alert{\footnotesize{$\times\textrm{P}_{lm}(\mathbf{t})$}}} \\ \hline
\begin{tikzpicture}
......@@ -1160,7 +1160,7 @@ g(s,t) & = & \textrm{P}(\textrm{'我','I'}) \times \textrm{P}(\textrm{'对','with'
\node [anchor=west] (s3) at ([xshift=0.5em]s2.east) {\footnotesize{$_3$}};
\node [anchor=west] (s4) at ([xshift=0.5em]s3.east) {感到\footnotesize{$_4$}};
\node [anchor=west] (s5) at ([xshift=0.5em]s4.east) {满意\footnotesize{$_5$}};
\node [anchor=east] (s) at (s1.west) {$s=$};
\node [anchor=east] (s) at (s1.west) {$\mathbf{s}=$};
\end{scope}
\begin{scope}[yshift=-2.6em]
......@@ -1169,7 +1169,7 @@ g(s,t) & = & \textrm{P}(\textrm{'我','I'}) \times \textrm{P}(\textrm{'对','with'
\node [anchor=west] (t3) at ([xshift=0.3em,yshift=0.1em]t2.east) {satisfied\footnotesize{$_3$}};
\node [anchor=west] (t4) at ([xshift=0.3em]t3.east) {with\footnotesize{$_4$}};
\node [anchor=west] (t5) at ([xshift=0.3em,yshift=-0.2em]t4.east) {you\footnotesize{$_5$}};
\node [anchor=east] (t) at ([xshift=-0.3em]t1.west) {$t'=$};
\node [anchor=east] (t) at ([xshift=-0.3em]t1.west) {$\mathbf{t}'=$};
\end{scope}
......@@ -1197,7 +1197,7 @@ g(s,t) & = & \textrm{P}(\textrm{'我','I'}) \times \textrm{P}(\textrm{'对','with'
\node [anchor=west] (s3) at ([xshift=0.5em]s2.east) {\footnotesize{$_3$}};
\node [anchor=west] (s4) at ([xshift=0.5em]s3.east) {感到\footnotesize{$_4$}};
\node [anchor=west] (s5) at ([xshift=0.5em]s4.east) {满意\footnotesize{$_5$}};
\node [anchor=east] (s) at (s1.west) {$s=$};
\node [anchor=east] (s) at (s1.west) {$\mathbf{s}=$};
\end{scope}
\begin{scope}[yshift=-2.6em]
......@@ -1206,7 +1206,7 @@ g(s,t) & = & \textrm{P}(\textrm{'我','I'}) \times \textrm{P}(\textrm{'对','with'
\node [anchor=center] (t3) at ([yshift=-1.7em]s3.south) {you\footnotesize{$_3$}};
\node [anchor=center] (t4) at ([yshift=-1.7em]s4.south) {am\footnotesize{$_4$}};
\node [anchor=center] (t5) at ([yshift=-1.6em]s5.south) {satisfied\footnotesize{$_5$}};
\node [anchor=center] (t) at ([xshift=-1.3em]t1.west) {$t''=$};
\node [anchor=center] (t) at ([xshift=-1.3em]t1.west) {$\mathbf{t}''=$};
\end{scope}
......@@ -1225,7 +1225,7 @@ g(s,t) & = & \textrm{P}(\textrm{'我','I'}) \times \textrm{P}(\textrm{'对','with'
\end{tabular}
\item<2-> \textbf{解决方案}:引入语言模型$\textrm{P}_{lm}(t)$来度量译文的流畅度
\item<2-> \textbf{解决方案}:引入语言模型$\textrm{P}_{lm}(\mathbf{t})$来度量译文的流畅度
$\textrm{P}_{\textrm{2-gram}}(w_1...w_m)=\textrm{P}(w_1) \times \textrm{P}(w_2 | w_1) \times \textrm{P}(w_3 | w_2) ... \times \textrm{P}(w_m | w_{m-1})$
......@@ -1234,7 +1234,7 @@ $\textrm{P}_{\textrm{2-gram}}(w_1...w_m)=\textrm{P}(w_1) \times \textrm{P}(w_2 |
\vspace{-1em}
\begin{displaymath}
g(s,t)=\prod\nolimits_{(j,i) \in \hat{A}} \textrm{P}(s_j,t_i) \times \textrm{P}_{lm}(t)
g(\mathbf{s},\mathbf{t})=\prod\nolimits_{(j,i) \in \hat{A}} \textrm{P}(s_j,t_i) \times \textrm{P}_{lm}(\mathbf{t})
\end{displaymath}
\end{itemize}
......@@ -1260,7 +1260,7 @@ g(s,t)=\prod\nolimits_{(j,i) \in \hat{A}} \textrm{P}(s_j,t_i) \times \textrm{P}_
\end{pgfonlayer}
}
\node [anchor=west,ugreen] (P) at ([xshift=4em,yshift=-0.7em]corpus.east){P($t|s$)};
\node [anchor=west,ugreen] (P) at ([xshift=4em,yshift=-0.7em]corpus.east){P($\mathbf{t}|\mathbf{s}$)};
\node [anchor=south] (modellabel) at (P.north) {{\color{ublue} {\scriptsize \textbf{翻译模型}}}};
\begin{pgfonlayer}{background}
......@@ -1287,16 +1287,16 @@ g(s,t)=\prod\nolimits_{(j,i) \in \hat{A}} \textrm{P}(s_j,t_i) \times \textrm{P}_
\end{center}
\begin{itemize}
\item \textbf{步骤3:解码 - }对任意的$s$,找到翻译概率最大的译文$\hat{t}$
\item \textbf{步骤3:解码 - }对任意的$\mathbf{s}$,找到翻译概率最大的译文$\hat{\mathbf{t}}$
\begin{displaymath}
\hat{t} = \argmax_{t} \textrm{P}(t|s)
\hat{\mathbf{t}} = \argmax_{\mathbf{t}} \textrm{P}(\mathbf{t}|\mathbf{s})
\end{displaymath}
这里$\argmax_{a} f(a)$表示找到使$f(a)$达到最大的$a$输出
这里$\argmax_{\mathbf{a}} f(\mathbf{a})$表示找到使$f(\mathbf{a})$达到最大的$\mathbf{a}$作为输出
\item<2-> 现在我们可以对任意的$(s,t)$计算$\textrm{P}(t|s) = \frac{g(s,t)}{\sum_{t'}g(s,t')}$
\item<2-> 现在我们可以对任意的$(\mathbf{s},\mathbf{t})$计算$\textrm{P}(\mathbf{t}|\mathbf{s}) = \frac{g(\mathbf{s},\mathbf{t})}{\sum_{\mathbf{t}'}g(\mathbf{s},\mathbf{t}')}$
\begin{itemize}
\item 给定$s$$\sum_{t'}g(s,t')$是个常数(因为$\sum_{t'}g(s,t')$的变量只有$s$)
\item 给定$\mathbf{s}$$\sum_{\mathbf{t}'}g(\mathbf{s},\mathbf{t}')$是个常数(因为$\sum_{\mathbf{t}'}g(\mathbf{s},\mathbf{t}')$的变量只有$\mathbf{s}$)
\item \textbf{这样,我们得到解码步骤的形式化描述为}
\end{itemize}
......@@ -1304,8 +1304,8 @@ g(s,t)=\prod\nolimits_{(j,i) \in \hat{A}} \textrm{P}(s_j,t_i) \times \textrm{P}_
\vspace{-1em}
\begin{eqnarray}
\hat{t} & = & \argmax_{t} \frac{g(s,t)}{\sum_{t'}g(s,t')} \nonumber \\
& = & \argmax_{t} g(s,t) \nonumber
\hat{\mathbf{t}} & = & \argmax_{\mathbf{t}} \frac{g(\mathbf{s},\mathbf{t})}{\sum_{\mathbf{t}'}g(\mathbf{s},\mathbf{t}')} \nonumber \\
& = & \argmax_{\mathbf{t}} g(\mathbf{s},\mathbf{t}) \nonumber
\end{eqnarray}
\end{itemize}
......@@ -1317,15 +1317,15 @@ g(s,t)=\prod\nolimits_{(j,i) \in \hat{A}} \textrm{P}(s_j,t_i) \times \textrm{P}_
\begin{frame}{实现一个简单的机器翻译系统:解码(2)}
\vspace{0.5em}
\begin{itemize}
\item \textbf{解码的核心问题}是在所有可能的翻译结果中找到使$g(s,t)$达到最大的译文\\
\item \textbf{解码的核心问题}是在所有可能的翻译结果中找到使$g(\mathbf{s},\mathbf{t})$达到最大的译文\\
\vspace{-1em}
\begin{minipage}[t]{0.58\linewidth}
\begin{itemize}
\item$s$$m$个词,每个词有$n$个翻译候选 - 共有$n^m$种组合
\item$\mathbf{s}$$m$个词,每个词有$n$个翻译候选 - 共有$n^m$种组合
\vspace{-0.5em}
\item<2-> 词的翻译候选可以任意调序
\vspace{-0.5em}
\item<3-> $s$对应可能的译文至少有$n^m \cdot m!$
\item<3-> $\mathbf{s}$对应可能的译文至少有$n^m \cdot m!$
\end{itemize}
\end{minipage}
\hfill
......@@ -1408,8 +1408,8 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
{\scriptsize
\node [anchor=north west,inner sep=2pt,align=left] (line1) at (0,0) {\textrm{\textbf{Function} \textsc{WordDecoding}($s$)}};
\node [anchor=north west,inner sep=2pt,align=left] (line2) at ([yshift=-1pt]line1.south west) {\textrm{1: $\pi = $\textsc{GetTransOptions}($s$)}};
\node [anchor=north west,inner sep=2pt,align=left] (line1) at (0,0) {\textrm{\textbf{Function} \textsc{WordDecoding}($\mathbf{s}$)}};
\node [anchor=north west,inner sep=2pt,align=left] (line2) at ([yshift=-1pt]line1.south west) {\textrm{1: $\pi = $\textsc{GetTransOptions}($\mathbf{s}$)}};
\node [anchor=north west,inner sep=2pt,align=left] (line3) at ([yshift=-1pt]line2.south west) {\textrm{2: $best = \phi$}};
\node [anchor=north west,inner sep=2pt,align=left] (line4) at ([yshift=-1pt]line3.south west) {\textrm{3: \textbf{for} $i$ in $[1,m]$ \textbf{do}}};
\node [anchor=north west,inner sep=2pt,align=left] (line5) at ([yshift=-1pt]line4.south west) {\textrm{4: \hspace{1em} $h = \phi$}};
......@@ -1421,7 +1421,7 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
\node [anchor=north west,inner sep=2pt,align=left] (line11) at ([yshift=-1pt]line10.south west) {\textrm{10: \textbf{return} $best.translation$}};
\node [anchor=south west,inner sep=2pt,align=left] (head1) at ([yshift=1pt]line1.north west) {输出: 找到的最佳译文};
\node [anchor=south west,inner sep=2pt,align=left] (head2) at ([yshift=1pt]head1.north west) {输入: 源语句子$s=s_1...s_m$};
\node [anchor=south west,inner sep=2pt,align=left] (head2) at ([yshift=1pt]head1.north west) {输入: 源语句子$\mathbf{s}=s_1...s_m$};
}
......@@ -1602,8 +1602,8 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
{\tiny
\node [anchor=north west,inner sep=2pt,align=left] (line1) at (0,0) {\textrm{\textbf{Function} \textsc{WordDecoding}($s$)}};
\node [anchor=north west,inner sep=2pt,align=left] (line2) at ([yshift=-3pt]line1.south west) {\textrm{1: $\pi = $\textsc{GetTransOptions}($s$)}};
\node [anchor=north west,inner sep=2pt,align=left] (line1) at (0,0) {\textrm{\textbf{Function} \textsc{WordDecoding}($\mathbf{s}$)}};
\node [anchor=north west,inner sep=2pt,align=left] (line2) at ([yshift=-3pt]line1.south west) {\textrm{1: $\pi = $\textsc{GetTransOptions}($\mathbf{s}$)}};
\node [anchor=north west,inner sep=2pt,align=left] (line3) at ([yshift=-3pt]line2.south west) {\textrm{2: $best = \phi$}};
\node [anchor=north west,inner sep=2pt,align=left] (line4) at ([yshift=-3pt]line3.south west) {\textrm{3: \textbf{for} $i$ in $[1,m]$ \textbf{do}}};
\node [anchor=north west,inner sep=2pt,align=left] (line5) at ([yshift=-3pt]line4.south west) {\textrm{4: \hspace{1em} $h = \phi$}};
......@@ -1620,7 +1620,7 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
}
\node [anchor=south west,inner sep=2pt,align=left] (head1) at ([yshift=3pt]line1.north west) {输出: 找到的最佳译文};
\node [anchor=south west,inner sep=2pt,align=left] (head2) at ([yshift=3pt]head1.north west) {输入: 源语句子$s=s_1...s_m$};
\node [anchor=south west,inner sep=2pt,align=left] (head2) at ([yshift=3pt]head1.north west) {输入: 源语句子$\mathbf{s}=s_1...s_m$};
}
......@@ -1900,7 +1900,7 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
}
\visible<6->{
\node [anchor=north west] (glabel) at (hlabel.south west) {$g(s,t)$};
\node [anchor=north west] (glabel) at (hlabel.south west) {$g(\mathbf{s},\mathbf{t})$};
\node [anchor=west] (translabel) at (glabel.east) {翻译结果};
\draw [-] (glabel.north east) -- ([yshift=-1.9in]glabel.north east);
\draw [-] (glabel.south west) -- ([xshift=3.5in]glabel.south west);
......@@ -2206,7 +2206,7 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
\begin{frame}{机器翻译的统计建模}
\begin{itemize}
\item \textbf{一个人在做翻译时}:对于给定的源语言句子$s$,可以了翻译为一个(或者若干个)正确的译文$\hat{t}$
\item \textbf{一个人在做翻译时}:对于给定的源语言句子$\mathbf{s}$,可以翻译为一个(或者若干个)正确的译文$\hat{\mathbf{t}}$
\begin{itemize}
\item 也就是说除了正确的译文,其它的翻译都是不正确的
\end{itemize}
......@@ -2214,20 +2214,20 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
\begin{center}
\begin{tikzpicture}
\node [draw,red,fill=red!10,thick,anchor=center,circle,inner sep=3.5pt] (s) at (0,0) {\black{$s$}};
\node [draw,ublue,fill=blue!10,thick,anchor=center,circle,inner sep=2pt] (t) at ([xshift=1in]s.east) {\black{$\hat{t}$}};
\node [draw,red,fill=red!10,thick,anchor=center,circle,inner sep=3.5pt] (s) at (0,0) {\black{$\mathbf{s}$}};
\node [draw,ublue,fill=blue!10,thick,anchor=center,circle,inner sep=2pt] (t) at ([xshift=1in]s.east) {\black{$\hat{\mathbf{t}}$}};
\draw [->,thick,] (s.north east) .. controls +(north east:1em) and +(north west:1em).. (t.north west) node[pos=0.5,below] {\tiny{正确翻译}};
\end{tikzpicture}
\end{center}
\item<2-> \textbf{统计机器翻译的思想是}:对于$s$,所有可能的目标语词串$t$都是可能的译文。每一对($s$,$t$)都有一个概率值$\textrm{P}(t|s)$ 来描述$s$ 翻译为$t$的好与坏
\item<2-> \textbf{统计机器翻译的思想是}:对于$\mathbf{s}$,所有可能的目标语词串$\mathbf{t}$都是可能的译文。每一对($\mathbf{s}$,$\mathbf{t}$)都有一个概率值$\textrm{P}(\mathbf{t}|\mathbf{s})$ 来描述$\mathbf{s}$ 翻译为$\mathbf{t}$的好与坏
\begin{center}
\begin{tikzpicture}
\node [draw,red,fill=red!10,thick,anchor=center,circle,inner sep=3.5pt] (s) at (0,0) {\black{$s$}};
\node [draw,red,fill=red!10,thick,anchor=center,circle,inner sep=3.5pt] (s) at (0,0) {\black{$\mathbf{s}$}};
\node [draw,ublue,fill=blue!10,thick,anchor=center,circle,inner sep=2pt] (t1) at ([xshift=1in]s.east) {\black{$t_1$}};
\node [draw,ublue,fill=blue!10,thick,anchor=center,circle,inner sep=2pt] (t2) at ([xshift=3em,yshift=2em]t1.north east) {\black{$t_2$}};
\node [draw,ublue,fill=blue!10,thick,anchor=center,circle,inner sep=2pt] (t3) at ([xshift=1em,yshift=4em]t1.north east) {\black{$t_3$}};
......@@ -2237,10 +2237,10 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
\node [draw,dashed,ublue,fill=blue!10,thick,anchor=center,circle,minimum size=18pt] (t6) at ([xshift=3em]t2.east) {};
\node [draw,dashed,ublue,fill=blue!10,thick,anchor=center,circle,minimum size=18pt] (t7) at ([xshift=3em]t4.east) {};
\draw [->,thick,] (s.north east) .. controls +(north east:1em) and +(north west:1em).. (t1.north west) node[pos=0.5,below] {\tiny{P ($t_1|s$)=0.1}};
\draw [->,thick,] (s.60) .. controls +(50:4em) and +(west:1em).. (t2.west) node[pos=0.5,below] {\tiny{P($t_2|s$)=0.2}};
\draw [->,thick,] (s.north) .. controls +(70:4em) and +(west:1em).. (t3.west) node[pos=0.5,above,xshift=-1em] {\tiny{P($t_3|s$)=0.3}};
\draw [->,thick,] (s.south east) .. controls +(300:3em) and +(south west:1em).. (t4.south west) node[pos=0.5,below] {\tiny{P($t_4|s$)=0.1}};
\draw [->,thick,] (s.north east) .. controls +(north east:1em) and +(north west:1em).. (t1.north west) node[pos=0.5,below] {\tiny{P ($t_1|\mathbf{s}$)=0.1}};
\draw [->,thick,] (s.60) .. controls +(50:4em) and +(west:1em).. (t2.west) node[pos=0.5,below] {\tiny{P($t_2|\mathbf{s}$)=0.2}};
\draw [->,thick,] (s.north) .. controls +(70:4em) and +(west:1em).. (t3.west) node[pos=0.5,above,xshift=-1em] {\tiny{P($t_3|\mathbf{s}$)=0.3}};
\draw [->,thick,] (s.south east) .. controls +(300:3em) and +(south west:1em).. (t4.south west) node[pos=0.5,below] {\tiny{P($t_4|\mathbf{s}$)=0.1}};
\end{tikzpicture}
\end{center}
......@@ -2254,13 +2254,13 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
\begin{frame}{噪声信道模型}
\begin{itemize}
\item \textbf{噪声信道模型}:源语言句子$s$(信宿)是由目标语句子$t$(信源)经过一个有噪声的信道得到的。如果知道了$s$和信道的性质,我们可以通过$\textrm{P}(t|s)$得到可能的信源的概率。\\
\item \textbf{噪声信道模型}:源语言句子$\mathbf{s}$(信宿)是由目标语句子$\mathbf{t}$(信源)经过一个有噪声的信道得到的。如果知道了$\mathbf{s}$和信道的性质,我们可以通过$\textrm{P}(\mathbf{t}|\mathbf{s})$得到可能的信源的概率。\\
\begin{center}
\begin{tikzpicture}
\node [draw,red,fill=red!10,thick,anchor=center,circle,inner sep=3.5pt] (s) at (0,0) {\black{$s$}};
\node [draw,ublue,fill=blue!10,thick,anchor=center,circle,inner sep=3.3pt] (t) at ([xshift=1.5in]s.east) {\black{$t$}};
\node [draw,red,fill=red!10,thick,anchor=center,circle,inner sep=3.5pt] (s) at (0,0) {\black{$\mathbf{s}$}};
\node [draw,ublue,fill=blue!10,thick,anchor=center,circle,inner sep=3.3pt] (t) at ([xshift=1.5in]s.east) {\black{$\mathbf{t}$}};
\draw [<->,thick,] (s.east) -- (t.west) node [pos=0.5,draw,fill=white] {噪声信道};
\node [anchor=east] at (s.west) {\scriptsize{信宿}};
......@@ -2272,13 +2272,13 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
而通过上述过程找到最可能的信源的过程称为\alert{解码}
\begin{displaymath}
\hat{t} = \argmax_{t} \textrm{P}(t|s)
\hat{\mathbf{t}} = \argmax_{\mathbf{t}} \textrm{P}(\mathbf{t}|\mathbf{s})
\end{displaymath}
\item<2-> \textbf{贝叶斯变换}
\begin{eqnarray}
\textrm{P}(t|s) & = & \frac{\textrm{P}(s,t)}{\textrm{P}(s)} \nonumber \\
& = & \frac{\textrm{P}(s|t) \textrm{P}(t)}{\textrm{P}(s)} \nonumber
\textrm{P}(\mathbf{t}|\mathbf{s}) & = & \frac{\textrm{P}(\mathbf{s},\mathbf{t})}{\textrm{P}(\mathbf{s})} \nonumber \\
& = & \frac{\textrm{P}(\mathbf{s}|\mathbf{t}) \textrm{P}(\mathbf{t})}{\textrm{P}(\mathbf{s})} \nonumber
\end{eqnarray}
\end{itemize}
......@@ -2291,26 +2291,26 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
\begin{center}
\begin{tikzpicture}
\node [anchor=west] (p) at (0,0) {$\textrm{P}(t|s)$};
\node [anchor=west] (p) at (0,0) {$\textrm{P}(\mathbf{t}|\mathbf{s})$};
\node [anchor=west] (eqiv) at (p.east) {=};
\node [anchor=south west,inner sep=2pt] (transmodel) at ([yshift=-2pt]eqiv.north east) {$\textrm{P}(s|t)$};
\node [anchor=west,inner sep=2pt] (lmmodel) at ([xshift=2pt]transmodel.east) {$\textrm{P}(t)$};
\node [anchor=north west,inner sep=2pt] (sp) at ([yshift=4pt,xshift=1.5em]eqiv.south east) {$\textrm{P}(s)$};
\node [anchor=south west,inner sep=2pt] (transmodel) at ([yshift=-2pt]eqiv.north east) {$\textrm{P}(\mathbf{s}|\mathbf{t})$};
\node [anchor=west,inner sep=2pt] (lmmodel) at ([xshift=2pt]transmodel.east) {$\textrm{P}(\mathbf{t})$};
\node [anchor=north west,inner sep=2pt] (sp) at ([yshift=4pt,xshift=1.5em]eqiv.south east) {$\textrm{P}(\mathbf{s})$};
\visible<2->{
\node [anchor=south west,fill=red!20,inner sep=2pt] (transmodel) at ([yshift=-2pt]eqiv.north east) {$\textrm{P}(s|t)$};
\node [anchor=west,fill=blue!20,inner sep=2pt] (lmmodel) at ([xshift=2pt]transmodel.east) {$\textrm{P}(t)$};
\node [anchor=north west,fill=green!20,inner sep=2pt] (sp) at ([yshift=4pt,xshift=1.5em]eqiv.south east) {$\textrm{P}(s)$};
\node [anchor=south west,fill=red!20,inner sep=2pt] (transmodel) at ([yshift=-2pt]eqiv.north east) {$\textrm{P}(\mathbf{s}|\mathbf{t})$};
\node [anchor=west,fill=blue!20,inner sep=2pt] (lmmodel) at ([xshift=2pt]transmodel.east) {$\textrm{P}(\mathbf{t})$};
\node [anchor=north west,fill=green!20,inner sep=2pt] (sp) at ([yshift=4pt,xshift=1.5em]eqiv.south east) {$\textrm{P}(\mathbf{s})$};
}
\draw [-] ([yshift=-4pt]transmodel.south west) -- ([yshift=-4pt]lmmodel.south east);
\visible<2->{
\node [anchor=south east,fill=red!20,draw,align=left] (tmmark) at ([xshift=0.5in,yshift=0.4in]p.north) {\footnotesize{给定信源$t$,得到信宿$s$ }\\\footnotesize{的概率,称作\textbf{翻译模型}}};
\node [anchor=south east,fill=red!20,draw,align=left] (tmmark) at ([xshift=0.5in,yshift=0.4in]p.north) {\footnotesize{给定信源$\mathbf{t}$,得到信宿$\mathbf{s}$ }\\\footnotesize{的概率,称作\textbf{翻译模型}}};
\node [anchor=west,fill=blue!20,draw,align=left] (lmmark) at ([xshift=0.5in]tmmark.east) {\footnotesize{信源$t$出现的概率}\\\footnotesize{称作\textbf{语言模型}}};
\node [anchor=west,fill=blue!20,draw,align=left] (lmmark) at ([xshift=0.5in]tmmark.east) {\footnotesize{信源$\mathbf{t}$出现的概率}\\\footnotesize{称作\textbf{语言模型}}};
\node [anchor=west,fill=green!20,draw,align=left] (smark) at ([xshift=0.2in,yshift=-0.3in]lmmodel.east) {\footnotesize{信宿$s$出现的概率}\\\footnotesize{给定$s$$\textrm{P}(s)$\textbf{常量}}};
\node [anchor=west,fill=green!20,draw,align=left] (smark) at ([xshift=0.2in,yshift=-0.3in]lmmodel.east) {\footnotesize{信宿$\mathbf{s}$出现的概率}\\\footnotesize{给定$\mathbf{s}$$\textrm{P}(\mathbf{s})$\textbf{常量}}};
\draw [->,thick] (transmodel.north) .. controls +(north:1.5em) and + (south:1.5em) .. (tmmark.south);
\draw [->,thick] (lmmodel.north) .. controls +(north:1.5em) and + (south:1.3em) .. (lmmark.south);
......@@ -2326,11 +2326,11 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
\vspace{-0.5em}
\begin{eqnarray}
\hat{t} & = & \argmax_{t} \frac{\textrm{P}(s|t) \textrm{P}(t)}{\textrm{P}(s)} \nonumber \\
& = & \argmax_{t} \textrm{P}(s|t) \textrm{P}(t) \nonumber
\hat{\mathbf{t}} & = & \argmax_{\mathbf{t}} \frac{\textrm{P}(\mathbf{s}|\mathbf{t}) \textrm{P}(\mathbf{t})}{\textrm{P}(\mathbf{s})} \nonumber \\
& = & \argmax_{\mathbf{t}} \textrm{P}(\mathbf{s}|\mathbf{t}) \textrm{P}(\mathbf{t}) \nonumber
\end{eqnarray}
即,在所有可能的译文中找到使翻译模型$\textrm{P}(s|t)$和语言模型$\textrm{P}(t)$乘积最大的译文
即,在所有可能的译文中找到使翻译模型$\textrm{P}(\mathbf{s}|\mathbf{t})$和语言模型$\textrm{P}(\mathbf{t})$乘积最大的译文
\end{itemize}
\
......@@ -2344,28 +2344,28 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
\begin{frame}{基本问题}
\begin{displaymath}
\hat{t} = \argmax_{t} \textrm{P}(s|t) \textrm{P}(t)
\hat{\mathbf{t}} = \argmax_{\mathbf{t}} \textrm{P}(\mathbf{s}|\mathbf{t}) \textrm{P}(\mathbf{t})
\end{displaymath}
\begin{itemize}
\item \textbf{三个基本问题}
\begin{enumerate}
\item \textbf{建模}:如何描述计算$\textrm{P}(s|t)$$\textrm{P}(t)$的计算方式
\item \textbf{训练}:如何获得计算$\textrm{P}(s|t)$$\textrm{P}(t)$所需的参数
\item \textbf{建模}:如何描述计算$\textrm{P}(\mathbf{s}|\mathbf{t})$$\textrm{P}(\mathbf{t})$的计算方式
\item \textbf{训练}:如何获得计算$\textrm{P}(\mathbf{s}|\mathbf{t})$$\textrm{P}(\mathbf{t})$所需的参数
\item \textbf{解码}:如何完成搜索最优解的过程$\argmax$
\end{enumerate}
\item<2-> 回忆一下本章开始的实例,是不是有似曾相识的感觉?
\vspace{0.5em}
\begin{center}
\begin{tikzpicture}
\node [anchor=west] (e1) at (0,0) {$g(s,t)$};
\node [anchor=west] (e1) at (0,0) {$g(\mathbf{s},\mathbf{t})$};
\node [anchor=west] (e2) at (e1.east) {$=$};
\node [anchor=west,inner sep=2pt,fill=red!20] (e3) at (e2.east) {$\prod\nolimits_{(j,i) \in \hat{A}} \textrm{P}(s_j,t_i)$};
\node [anchor=west,inner sep=1pt] (e4) at (e3.east) {$\times$};
\node [anchor=west,inner sep=3pt,fill=blue!20] (e5) at (e4.east) {$\textrm{P}_{lm}(t)$};
\node [anchor=north west,inner sep=1pt] (n1) at ([xshift=2.5em,yshift=-1em]e1.south west) {$\textrm{P}(s|t)$};
\node [anchor=west,inner sep=3pt,fill=blue!20] (e5) at (e4.east) {$\textrm{P}_{lm}(\mathbf{t})$};
\node [anchor=north west,inner sep=1pt] (n1) at ([xshift=2.5em,yshift=-1em]e1.south west) {$\textrm{P}(\mathbf{s}|\mathbf{t})$};
\node [anchor=north] (n1part2) at ([yshift=0.3em]n1.south) {\scriptsize{\textbf{翻译模型}}};
\node [anchor=west,inner sep=1pt] (n2) at ([xshift=2em]n1.east) {$\textrm{P}(t)$};
\node [anchor=west,inner sep=1pt] (n2) at ([xshift=2em]n1.east) {$\textrm{P}(\mathbf{t})$};
\node [anchor=north] (n2part2) at ([yshift=0.3em]n2.south) {\scriptsize{\textbf{语言模型}}};
\draw [->,thick] (e3.south) .. controls +(south:1em) and +(north:1em) .. (n1.north);
\draw [->,thick] (e5.south) .. controls +(south:1em) and +(70:1em) .. (n2.north);
......@@ -2389,13 +2389,13 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
\label{ibmmodelingstart}
\begin{itemize}
\item \textbf{$\textrm{P}(t)$和解码}在前面的内容中有介绍,下面重点求解$\textrm{P}(s|t)$,即:
\item \textbf{$\textrm{P}(\mathbf{t})$和解码}在前面的内容中有介绍,下面重点求解$\textrm{P}(\mathbf{s}|\mathbf{t})$,即:
\begin{itemize}
\item \textbf{翻译模型建模} - $\textrm{P}(s|t)$的计算方法
\item \textbf{翻译模型参数估计} - 计算$\textrm{P}(s|t)$所需的参数
\item \textbf{翻译模型建模} - $\textrm{P}(\mathbf{s}|\mathbf{t})$的计算方法
\item \textbf{翻译模型参数估计} - 计算$\textrm{P}(\mathbf{s}|\mathbf{t})$所需的参数
\end{itemize}
\vspace{0.5em}
\item<2-> \textbf{IBM模型的假设}$s=s_1...s_m$$t=t_1...t_n$之间有单词一级的对应,称作\alert{单词对齐}或者\alert{词对齐}。此外:
\item<2-> \textbf{IBM模型的假设}$\mathbf{s}=s_1...s_m$$\mathbf{t}=t_1...t_n$之间有单词一级的对应,称作\alert{单词对齐}或者\alert{词对齐}。此外:
\begin{itemize}
\item \textbf{约束}:一个源语言单词只能对应一个目标语单词
\vspace{0.5em}
......@@ -2462,10 +2462,10 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
%%%------------------------------------------------------------------------------------------------------------
%%% modeling P(s|t)
\begin{frame}{建模 - $\textrm{P}(s|t)$}
\begin{frame}{建模 - $\textrm{P}(\mathbf{s}|\mathbf{t})$}
\begin{itemize}
\item 给定$s$$t$,它们之间的\alert{词对齐}被记为$a=a_1...a_m$
\item 给定$\mathbf{s}$$\mathbf{t}$,它们之间的\alert{词对齐}被记为$\mathbf{a}=a_1...a_m$
\begin{itemize}
\item $a_j$表示第$j$个源语单词$s_j$对应的目标语单词的位置
\begin{center}
......@@ -2486,15 +2486,15 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
\end{tikzpicture}
\end{center}
\end{itemize}
\item<2-> \textbf{\alert{建模!!}}: $\textrm{P}(s|t)$被表示为所有可能的词对齐的生成概率\\
\item<2-> \textbf{\alert{建模!!}}: $\textrm{P}(\mathbf{s}|\mathbf{t})$被表示为所有可能的词对齐的生成概率\\
\vspace{-0.5em}
\begin{displaymath}
\textrm{P}(s|t) = \sum_{a} \textrm{P}(s,a|t)
\textrm{P}(\mathbf{s}|\mathbf{t}) = \sum_{\mathbf{a}} \textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})
\end{displaymath}\\
\vspace{-0.5em}
\visible<3->{
每一种$a$对应一个$\textrm{P}(s,a|t)$
每一种$\mathbf{a}$对应一个$\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})$
\vspace{-0.8em}
\begin{center}
......@@ -2639,7 +2639,7 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
\visible<4->{
\node [anchor=south east,inner sep=0pt] (p) at (t0.north west) {\small{{\color{ugreen} P(}}};
\node [anchor=south west,inner sep=0pt] (p2) at ([yshift=0.2em]t2.north east) {\small{{\color{ugreen} )}}};
\node [anchor=west] (eq) at (p2.east) {\normalsize{= \ P($s|t$)}};
\node [anchor=west] (eq) at (p2.east) {\normalsize{= \ P($\mathbf{s}|\mathbf{t}$)}};
}
}
\end{scope}
......@@ -2652,11 +2652,11 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
%%%------------------------------------------------------------------------------------------------------------
%%% modeling P(s,a|t)
\begin{frame}{建模 - $\textrm{P}(s,a|t)$}
\begin{frame}{建模 - $\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})$}
\begin{itemize}
\item \alert{\textbf{进一步建模!!}}:对于源语句子$s=s_1...s_m$($m$个词)、目标语译文$t=t_0...t_n$($n$个词)和词对齐$a=a_1...a_m$,按如下方式计算$\textrm{P}(s,a|t)$
\item \alert{\textbf{进一步建模!!}}:对于源语句子$\mathbf{s}=s_1...s_m$($m$个词)、目标语译文$\mathbf{t}=t_0...t_n$($n$个词)和词对齐$\mathbf{a}=a_1...a_m$,按如下方式计算$\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})$
\begin{itemize}
\item 符号定义:$s_x^y=s_x...s_y$, $a_x^y=a_x...a_y$
......@@ -2668,23 +2668,23 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
\begin{center}
\begin{tikzpicture}
\node [anchor=west] (eq1) at (0,0) {$\textrm{P}(s,a|t)=$};
\node [anchor=west,inner sep=1pt,minimum height=2.64em] (eq2) at (eq1.east) {$\textrm{P}(m|t)$};
\node [anchor=west] (eq1) at (0,0) {$\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})=$};
\node [anchor=west,inner sep=1pt,minimum height=2.64em] (eq2) at (eq1.east) {$\textrm{P}(m|\mathbf{t})$};
\node [anchor=west,inner sep=1pt,minimum height=2.64em] (eq3) at ([xshift=1pt]eq2.east) {$\prod\limits_{j=1}^{m}$};
\node [anchor=west,inner sep=1pt,minimum height=2.64em] (eq4) at ([xshift=1pt]eq3.east) {$\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,t)$};
\node [anchor=west,inner sep=1pt,minimum height=2.64em] (eq5) at ([xshift=1pt]eq4.east) {$\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,t)$};
\node [anchor=west,inner sep=1pt,minimum height=2.64em] (eq4) at ([xshift=1pt]eq3.east) {$\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,\mathbf{t})$};
\node [anchor=west,inner sep=1pt,minimum height=2.64em] (eq5) at ([xshift=1pt]eq4.east) {$\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,\mathbf{t})$};
\visible<2->{
\node [anchor=west,inner sep=1pt,minimum height=2.64em,fill=red!20] (eq2) at (eq1.east) {$\textrm{P}(m|t)$};
\node [anchor=west,inner sep=1pt,minimum height=2.64em,fill=red!20] (eq2) at (eq1.east) {$\textrm{P}(m|\mathbf{t})$};
}
\visible<3->{
\node [anchor=west,inner sep=1pt,minimum height=2.64em,fill=blue!20] (eq3) at ([xshift=1pt]eq2.east) {$\prod\limits_{j=1}^{m}$};
}
\visible<4->{
\node [anchor=west,inner sep=1pt,minimum height=2.64em,fill=green!20] (eq4) at ([xshift=1pt]eq3.east) {$\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,t)$};
\node [anchor=west,inner sep=1pt,minimum height=2.64em,fill=green!20] (eq4) at ([xshift=1pt]eq3.east) {$\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,\mathbf{t})$};
}
\visible<5->{
\node [anchor=west,inner sep=1pt,minimum height=2.64em,fill=purple!20] (eq5) at ([xshift=1pt]eq4.east) {$\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,t)$};
\node [anchor=west,inner sep=1pt,minimum height=2.64em,fill=purple!20] (eq5) at ([xshift=1pt]eq4.east) {$\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,\mathbf{t})$};
}
\visible<2->{
......@@ -2706,12 +2706,12 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
\vspace{-0.0em}
\begin{itemize}
\item<2-> \textbf{生成模型}:给定译文$t$生成源文$s$和对齐$a$
\item<2-> \textbf{生成模型}:给定译文$\mathbf{t}$生成源文$\mathbf{s}$和对齐$\mathbf{a}$
\begin{enumerate}
\item<2-> 根据译文$t$选择源文的长度$m$
\item<2-> 根据译文$\mathbf{t}$选择源文的长度$m$
\item<3-> 循环源文的每个位置$j$
\item<4-> 根据译文$t$、源文长度$m$、已经生成的源语单词$s_{1}^{j-1}$和对齐$a_{1}^{j-1}$,生成第$j$个位置的对齐结果$a_j$
\item<5-> 根据译文$t$、源文长度$m$、已经生成的源语单词$s_{1}^{j-1}$和对齐$a_{1}^{j}$,生成第$j$个位置的源语言单词$s_j$(注意:这时$a_j$已经生成了)
\item<4-> 根据译文$\mathbf{t}$、源文长度$m$、已经生成的源语单词$s_{1}^{j-1}$和对齐$a_{1}^{j-1}$,生成第$j$个位置的对齐结果$a_j$
\item<5-> 根据译文$\mathbf{t}$、源文长度$m$、已经生成的源语单词$s_{1}^{j-1}$和对齐$a_{1}^{j}$,生成第$j$个位置的源语言单词$s_j$(注意:这时$a_j$已经生成了)
\end{enumerate}
\end{itemize}
......@@ -2721,9 +2721,9 @@ $m$ & $n$ & $n^m \cdot m!$ \\ \hline
%%%------------------------------------------------------------------------------------------------------------
%%% generation of s and a given t - a running example
\begin{frame}{实例 - $\textrm{P}(s,a|t)$}
\begin{frame}{实例 - $\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})$}
$s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-1\}
$\mathbf{s}$ = 在 桌子 上 \ \ \ \ \ $\mathbf{t}$ = $t_0$ on the table \ \ \ \ \ $\mathbf{a}$ = \{1-0,2-3,3-1\}
\begin{center}
\begin{tikzpicture}
......@@ -2789,7 +2789,7 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
{\small
\begin{eqnarray}
\textrm{P}(s,a|t) & = & \textrm{P}(m|t) \prod\limits_{j=1}^{m} \textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,t) \textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,t) \nonumber \\
\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t}) & = & \textrm{P}(m|\mathbf{t}) \prod\limits_{j=1}^{m} \textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,\mathbf{t}) \textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,\mathbf{t}) \nonumber \\
& \visible<2->{=} & \visible<2->{\textrm{P}(m=3 \mid \textrm{'$t_0$ on the table'})} \visible<3->{\times} \nonumber \\
& & \visible<3->{\textrm{P}(a_1=0 \mid \phi,\phi,3,\textrm{'$t_0$ on the table'})} \visible<4->{\times} \nonumber \\
& & \visible<4->{\textrm{P}(f_1=\textrm{} \mid \textrm{\{1-0\}},\phi,3,\textrm{'$t_0$ on the table'})} \visible<5->{\times} \nonumber \\
......@@ -2813,14 +2813,14 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\vspace{-1.0em}
\begin{eqnarray}
\textrm{P}(s|t) & = & \sum_{a} \textrm{P}(s,a|t) \nonumber \\
\textrm{P}(s,a|t) & = & \textrm{P}(m|t) \prod\limits_{j=1}^{m} \textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,t) \textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,t) \nonumber
\textrm{P}(\mathbf{s}|\mathbf{t}) & = & \sum_{\mathbf{a}} \textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t}) \nonumber \\
\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t}) & = & \textrm{P}(m|\mathbf{t}) \prod\limits_{j=1}^{m} \textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,\mathbf{t}) \textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,\mathbf{t}) \nonumber
\end{eqnarray}
\item \textbf{两个严重问题}
\begin{enumerate}
\item 第一个公式:如何遍历所有的对齐$a$
\item 第二个公式:如何计算$\textrm{P}(m|t)$$\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,t)$$\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,t)$
\item 第一个公式:如何遍历所有的对齐$\mathbf{a}$
\item 第二个公式:如何计算$\textrm{P}(m|\mathbf{t})$$\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,\mathbf{t})$$\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,\mathbf{t})$
\end{enumerate}
\item<2-> Brown等人(1993)的解决方法:对问题进行化简
......@@ -2848,21 +2848,21 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\begin{enumerate}
\item 源语长度概率为常数$\epsilon$
\begin{displaymath}
\textrm{P}(m|t) \equiv \epsilon
\textrm{P}(m|\mathbf{t}) \equiv \epsilon
\end{displaymath}
\item 对齐概率$\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,t)$仅依赖于译文长度$l+1$(均匀分布)
\item 对齐概率$\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,\mathbf{t})$仅依赖于译文长度$l+1$(均匀分布)
\begin{displaymath}
\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,t) \equiv \frac{1}{l+1}
\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,\mathbf{t}) \equiv \frac{1}{l+1}
\end{displaymath}
\item 源语单词$s_j$生成概率$\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,t)$仅依赖与其对齐的译文单词$t_{a_j}$,即词汇翻译概率$f(s_j|t_{a_j})$ ($\sum_{s_j} f(s_j|t_{a_j}) = 1$)
\item 源语单词$s_j$生成概率$\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,\mathbf{t})$仅依赖与其对齐的译文单词$t_{a_j}$,即词汇翻译概率$f(s_j|t_{a_j})$ ($\sum_{s_j} f(s_j|t_{a_j}) = 1$)
\begin{displaymath}
\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,t) \equiv f(s_j|t_{a_j})
\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,\mathbf{t}) \equiv f(s_j|t_{a_j})
\end{displaymath}
\end{enumerate}
\item<2-> \textbf{核心思想是}把复杂参数化简为简单参数
\begin{itemize}
\item 比如:$\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,t) \equiv \frac{1}{l-1}$把参数空间$(a_{1}^{j},s_{1}^{j-1},m,t)$化简为$l$
\item 比如:$\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,\mathbf{t}) \equiv \frac{1}{l+1}$把参数空间$(a_{1}^{j},s_{1}^{j-1},m,\mathbf{t})$化简为$l$
\item \alert{优点}: 模型大大化简;\alert{缺点}:化简导致模型不准确
\end{itemize}
......@@ -2882,12 +2882,12 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\begin{center}
\begin{tikzpicture}
\node [anchor=west] (eq1) at (0,0) {$\textrm{P}(s,a|t)$};
\node [anchor=west] (eq1) at (0,0) {$\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})$};
\node [anchor=west] (eq1part2) at (eq1.east) {$=$};
\node [anchor=west,inner sep=1pt] (eq2) at (eq1part2.east) {$\textrm{P}(m|t)$};
\node [anchor=west,inner sep=1pt] (eq3) at ([xshift=1pt]eq2.east) {$\prod\limits_{j=1}^{m}$};
\node [anchor=west,inner sep=1pt] (eq4) at ([xshift=1pt]eq3.east) {$\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,t)$};
\node [anchor=west,inner sep=1pt] (eq5) at ([xshift=1pt]eq4.east) {$\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,t)$};
\node [anchor=west,inner sep=1pt] (eq2) at (eq1part2.east) {$\textrm{P}(m|\mathbf{t})$};
\node [anchor=west,inner sep=1pt] (eq3) at ([xshift=0pt]eq2.east) {$\prod\limits_{j=1}^{m}$};
\node [anchor=west,inner sep=1pt] (eq4) at ([xshift=0pt]eq3.east) {$\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,\mathbf{t})$};
\node [anchor=west,inner sep=1pt] (eq5) at ([xshift=0pt]eq4.east) {$\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,\mathbf{t})$};
\node [anchor=east,rotate=90] (yes1) at (eq2.south) {$\equiv$};
\node [anchor=east,rotate=90] (yes2) at (eq4.south) {$\equiv$};
......@@ -2906,13 +2906,13 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\end{tikzpicture}
\end{center}
\item<2-> \textbf{将上式代入$\textrm{P}(s|t)=\sum_a \textrm{P}(s,a|t)$}
\item<2-> \textbf{将上式代入$\textrm{P}(\mathbf{s}|\mathbf{t})=\sum_{\mathbf{a}} \textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})$}
\vspace{-1.0em}
\begin{eqnarray}
\textrm{P}(s|t) & = & \sum\limits_{a} \textrm{P}(s,a|t) \nonumber \\
& = & \sum\limits_{a} \frac{\epsilon}{(l+1)^{m}} \prod\limits_{j=1}^{m} f(s_j|t_{a_j}) \nonumber
\textrm{P}(\mathbf{s}|\mathbf{t}) & = & \sum\limits_{\mathbf{a}} \textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t}) \nonumber \\
& = & \sum\limits_{\mathbf{a}} \frac{\epsilon}{(l+1)^{m}} \prod\limits_{j=1}^{m} f(s_j|t_{a_j}) \nonumber
\end{eqnarray}
\end{itemize}
......@@ -2925,14 +2925,14 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\begin{itemize}
\item $\textrm{P}(s|t) = \sum\limits_{a} \frac{\epsilon}{(l+1)^{m}} \prod\limits_{j=1}^{m} f(s_j|t_{a_j})$中需要对遍历所有的对齐,即$\sum_{a}$。这个过程可以被重新表示为
\item $\textrm{P}(\mathbf{s}|\mathbf{t}) = \sum\limits_{\mathbf{a}} \frac{\epsilon}{(l+1)^{m}} \prod\limits_{j=1}^{m} f(s_j|t_{a_j})$中需要遍历所有的对齐,即$\sum_{\mathbf{a}}$。这个过程可以被重新表示为
\vspace{-0.5em}
\begin{center}
\begin{tikzpicture}
\node [anchor=west] (eq1) at (0,0) {$\textrm{P}(s|t)$};
\node [anchor=west] (eq1) at (0,0) {$\textrm{P}(\mathbf{s}|\mathbf{t})$};
\node [anchor=west] (eq2) at (eq1.east) {$=$};
\node [anchor=west,inner sep=2pt] (eq3) at (eq2.east) {$\sum\limits_{a_1=0}^{l}$};
\node [anchor=west,inner sep=0] (eq4) at (eq3.east) {...};
......@@ -2968,7 +2968,7 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\end{center}
\begin{enumerate}
\item<2-> 遍历所有的的对齐$a$$a$\{$a_1$,...,$a_m$\}组成,每个$a_j \in \{a_1,...,a_m\}$从第译文开始位置(0)循环到截止位置($l$)
\item<2-> 遍历所有的对齐$\mathbf{a}$。$\mathbf{a}$由\{$a_1$,...,$a_m$\}组成,每个$a_j \in \{a_1,...,a_m\}$从译文开始位置(0)循环到截止位置($l$)
\vspace{0.5em}
\begin{center}
......@@ -3003,7 +3003,7 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\end{center}
\vspace{0.5em}
\item<5-> 对于每个$a$累加对齐概率$\textrm{P}(s,a|t)$
\item<5-> 对于每个$\mathbf{a}$累加对齐概率$\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})$
\end{enumerate}
\end{itemize}
......@@ -3026,28 +3026,28 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\begin{scope}
\node [anchor=west] (s1) at (0,0) {$s$ = 在\ \ 桌子\ \ };
\node [anchor=west] (t1) at ([yshift=-2em]s1.west) {$t$ = on\ \ the\ \ table};
\node [anchor=west] (s1) at (0,0) {$\mathbf{s}$ = 在\ \ 桌子\ \ };
\node [anchor=west] (t1) at ([yshift=-2em]s1.west) {$\mathbf{t}$ = on\ \ the\ \ table};
\draw [->,double,thick,ublue] ([yshift=0.2em]s1.south) -- ([yshift=-0.8em]s1.south);
\end{scope}
\begin{scope}[xshift=1.5in]
\node [anchor=west] (s2) at (0,0) {$s$ = 在\ \ 桌子\ \ };
\node [anchor=west] (t2) at ([yshift=-2em]s2.west) {$t'$ = table \ on\ \ the};
\node [anchor=west] (s2) at (0,0) {$\mathbf{s}$ = 在\ \ 桌子\ \ };
\node [anchor=west] (t2) at ([yshift=-2em]s2.west) {$\mathbf{t}'$ = table \ on\ \ the};
\draw [->,double,thick,ublue] ([yshift=0.2em]s2.south) -- ([yshift=-0.8em]s2.south);
\end{scope}
\node [anchor=north] (score11) at ([yshift=-2.0em]s1.south) {$\textrm{P}(s|t)$};
\node [anchor=north] (score12) at ([yshift=-2.0em]s2.south) {$\textrm{P}(s|t')$};
\node [anchor=north] (score11) at ([yshift=-2.0em]s1.south) {$\textrm{P}(\mathbf{s}|\mathbf{t})$};
\node [anchor=north] (score12) at ([yshift=-2.0em]s2.south) {$\textrm{P}(\mathbf{s}|\mathbf{t}')$};
\node [anchor=west] (comp1) at ([xshift=2.3em]score11.east) {\large{$\mathbf{=}$}};
\node [anchor=east] (label1) at ([xshift=-1em,yshift=0.1em]score11.west) {\textbf{IBM模型1:}};
\visible<2->{
\node [anchor=north] (score21) at ([yshift=0.2em]score11.south) {$\textrm{P}(s|t)$};
\node [anchor=north] (score22) at ([yshift=0.2em]score12.south) {$\textrm{P}(s|t')$};
\node [anchor=north] (score21) at ([yshift=0.2em]score11.south) {$\textrm{P}(\mathbf{s}|\mathbf{t})$};
\node [anchor=north] (score22) at ([yshift=0.2em]score12.south) {$\textrm{P}(\mathbf{s}|\mathbf{t}')$};
\node [anchor=west] (comp2) at ([xshift=2.3em]score21.east) {\large{$\mathbf{>}$}};
\node [anchor=east] (label2) at ([xshift=-1em,yshift=0.1em]score21.west) {\textbf{理想:}};
}
......@@ -3064,7 +3064,7 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,t) \equiv a(a_j|j,m,l)
\end{displaymath}
其它假设与IBM模型1相同,即$\textrm{P}(m|t) \equiv \epsilon$$\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,t) \equiv f(s_j|t_{a_j})$
其它假设与IBM模型1相同,即$\textrm{P}(m|\mathbf{t}) \equiv \epsilon$$\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,\mathbf{t}) \equiv f(s_j|t_{a_j})$
\end{itemize}
......@@ -3083,12 +3083,12 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\begin{center}
\begin{tikzpicture}
\node [anchor=west] (eq1) at (0,0) {$\textrm{P}(s,a|t)$};
\node [anchor=west] (eq1) at (0,0) {$\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})$};
\node [anchor=west] (eq1part2) at (eq1.east) {$=$};
\node [anchor=west,inner sep=1pt] (eq2) at (eq1part2.east) {$\textrm{P}(m|t)$};
\node [anchor=west,inner sep=1pt] (eq3) at ([xshift=1pt]eq2.east) {$\prod\limits_{j=1}^{m}$};
\node [anchor=west,inner sep=1pt] (eq4) at ([xshift=1pt]eq3.east) {$\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,t)$};
\node [anchor=west,inner sep=1pt] (eq5) at ([xshift=1pt]eq4.east) {$\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,t)$};
\node [anchor=west,inner sep=1pt] (eq2) at (eq1part2.east) {$\textrm{P}(m|\mathbf{t})$};
\node [anchor=west,inner sep=1pt] (eq3) at ([xshift=0pt]eq2.east) {$\prod\limits_{j=1}^{m}$};
\node [anchor=west,inner sep=1pt] (eq4) at ([xshift=0pt]eq3.east) {$\textrm{P}(a_j|a_{1}^{j-1},s_{1}^{j-1},m,\mathbf{t})$};
\node [anchor=west,inner sep=1pt] (eq5) at ([xshift=0pt]eq4.east) {$\textrm{P}(s_j|a_{1}^{j},s_{1}^{j-1},m,\mathbf{t})$};
\node [anchor=east,rotate=90] (yes1) at (eq2.south) {$\equiv$};
\node [anchor=east,rotate=90] (yes2) at (eq4.south) {$\equiv$};
......@@ -3104,14 +3104,14 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\end{tikzpicture}
\end{center}
\item<2-> \textbf{将上式代入$\textrm{P}(s|t)=\sum_a \textrm{P}(s,a|t)$}
\item<2-> \textbf{将上式代入$\textrm{P}(\mathbf{s}|\mathbf{t})=\sum_{\mathbf{a}} \textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})$}
\vspace{-0.5em}
\begin{center}
\begin{tikzpicture}
\node [anchor=west] (eq1) at (0,0) {$\textrm{P}(s|t)$};
\node [anchor=west] (eq1) at (0,0) {$\textrm{P}(\mathbf{s}|\mathbf{t})$};
\node [anchor=west] (eq2) at (eq1.east) {$=$};
\node [anchor=west,inner sep=2pt] (eq3) at (eq2.east) {$\sum\limits_{a_1=0}^{l}$};
\node [anchor=west,inner sep=0] (eq4) at ([xshift=-0.2em]eq3.east) {...};
......@@ -3142,8 +3142,8 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\vspace{-0.5em}
\begin{enumerate}
\item 遍历所有的的对齐$a$
\item 对于每个$a$累加对齐概率$\textrm{P}(s,a|t)$,即计算$\epsilon \prod\limits_{j=1}^{m} a(a_j|j,m,l) f(s_j|t_{a_j})$
\item 遍历所有的对齐$\mathbf{a}$
\item 对于每个$\mathbf{a}$累加对齐概率$\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})$,即计算$\epsilon \prod\limits_{j=1}^{m} a(a_j|j,m,l) f(s_j|t_{a_j})$
\end{enumerate}
\end{itemize}
......@@ -3158,8 +3158,8 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\vspace{-1.0em}
\begin{eqnarray}
\textrm{\textbf{IBM模型1}}: \textrm{P}(s|t) & = & \frac{\epsilon}{(l+1)^{m}} \sum\limits_{a_1=0}^{l} ... \sum\limits_{a_m=0}^{l} \prod\limits_{j=1}^{m} f(s_j|t_{a_j}) \nonumber \\
\textrm{\textbf{IBM模型2}}: \textrm{P}(s|t) & = & \epsilon \sum\limits_{a_1=0}^{l} ... \sum\limits_{a_m=0}^{l} \prod\limits_{j=1}^{m} a(a_j|j,m,l) f(s_j|t_{a_j}) \nonumber
\textrm{\textbf{IBM模型1}}: \textrm{P}(\mathbf{s}|\mathbf{t}) & = & \frac{\epsilon}{(l+1)^{m}} \sum\limits_{a_1=0}^{l} ... \sum\limits_{a_m=0}^{l} \prod\limits_{j=1}^{m} f(s_j|t_{a_j}) \nonumber \\
\textrm{\textbf{IBM模型2}}: \textrm{P}(\mathbf{s}|\mathbf{t}) & = & \epsilon \sum\limits_{a_1=0}^{l} ... \sum\limits_{a_m=0}^{l} \prod\limits_{j=1}^{m} a(a_j|j,m,l) f(s_j|t_{a_j}) \nonumber
\end{eqnarray}
\begin{itemize}
......@@ -3227,17 +3227,17 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\vspace{-1.0em}
\begin{eqnarray}
\textrm{\textbf{IBM模型1}}: \textrm{P}(s|t) & = & \frac{\epsilon}{(l+1)^{m}} \sum\limits_{a_1=0}^{l} ... \sum\limits_{a_m=0}^{l} \prod\limits_{j=1}^{m} f(s_j|t_{a_j}) \nonumber \\
\textrm{\textbf{IBM模型2}}: \textrm{P}(s|t) & = & \epsilon \sum\limits_{a_1=0}^{l} ... \sum\limits_{a_m=0}^{l} \prod\limits_{j=1}^{m} a(a_j|j,m,l) f(s_j|t_{a_j}) \nonumber
\textrm{\textbf{IBM模型1}}: \textrm{P}(\mathbf{s}|\mathbf{t}) & = & \frac{\epsilon}{(l+1)^{m}} \sum\limits_{a_1=0}^{l} ... \sum\limits_{a_m=0}^{l} \prod\limits_{j=1}^{m} f(s_j|t_{a_j}) \nonumber \\
\textrm{\textbf{IBM模型2}}: \textrm{P}(\mathbf{s}|\mathbf{t}) & = & \epsilon \sum\limits_{a_1=0}^{l} ... \sum\limits_{a_m=0}^{l} \prod\limits_{j=1}^{m} a(a_j|j,m,l) f(s_j|t_{a_j}) \nonumber
\end{eqnarray}
\begin{spacing}{1.2}
\begin{itemize}
\item 对于翻译模型$\textrm{P}(s|t)$,再来回顾一下统计机器翻译的三个基本问题
\item 对于翻译模型$\textrm{P}(\mathbf{s}|\mathbf{t})$,再来回顾一下统计机器翻译的三个基本问题
\begin{enumerate}
\item \textbf{建模}:如何描述$\textrm{P}(s|t)$ \visible<2->{\alert{$\gets$ 已解!见上面两个公式}}
\item \textbf{解码}:给定模型参数$\epsilon$$a(a_j|j,m,l)$$f(s_j|t_{a_j})$,如何利用上面的公式计算$\textrm{P}(s|t)$(语言模型计算暂不讨论),并找到最佳译文$\hat{t}$ \visible<2->{\alert{$\gets$ 下面讨论}}
\item \textbf{建模}:如何描述$\textrm{P}(\mathbf{s}|\mathbf{t})$ \visible<2->{\alert{$\gets$ 已解!见上面两个公式}}
\item \textbf{解码}:给定模型参数$\epsilon$$a(a_j|j,m,l)$$f(s_j|t_{a_j})$,如何利用上面的公式计算$\textrm{P}(\mathbf{s}|\mathbf{t})$(语言模型计算暂不讨论),并找到最佳译文$\hat{\mathbf{t}}$ \visible<2->{\alert{$\gets$ 下面讨论}}
\item \textbf{训练}:如何从数据中自动学习模型参数$\epsilon$、$a(a_j|j,m,l)$和$f(s_j|t_{a_j})$ \visible<2->{\alert{$\gets$ 下面讨论}}
\end{enumerate}
......@@ -3281,7 +3281,7 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\begin{center}
\begin{tikzpicture}
\node [anchor=west] (model) at (0,0) {\footnotesize{$\textrm{P}(s|t) = \epsilon \sum\limits_{a_1=0}^{l} ... \sum\limits_{a_m=0}^{l} \prod\limits_{j=1}^{m} a(a_j|j,m,l) f(s_j|t_{a_j})$}};
\node [anchor=west] (model) at (0,0) {\footnotesize{$\textrm{P}(\mathbf{s}|\mathbf{t}) = \epsilon \sum\limits_{a_1=0}^{l} ... \sum\limits_{a_m=0}^{l} \prod\limits_{j=1}^{m} a(a_j|j,m,l) f(s_j|t_{a_j})$}};
\node [anchor=east] (modellabel) at ([yshift=0.1em]model.west) {\footnotesize{\textbf{问题的统计描述:}}};
\node [anchor=north west] (paras) at (model.south west) {\footnotesize{$\epsilon = ?;\ \ \forall a_j,j,m,l: a(a_j|j,m,l) = ?, f(s_j|t_{a_j}) = ?$}};
\node [anchor=east] (paraslabel) at ([yshift=0.1em]paras.west) {\footnotesize{\textbf{模型的参数:}}};
......@@ -3300,9 +3300,9 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\begin{center}
\begin{tikzpicture}
\node [anchor=west] (scoring) at (0,0) {\footnotesize{对任意的$s$$t$,(高效地)计算$\textrm{P}(s|t)$(同时计算$\textrm{P}(t)$)}};
\node [anchor=west] (scoring) at (0,0) {\footnotesize{对任意的$\mathbf{s}$$\mathbf{t}$,(高效地)计算$\textrm{P}(\mathbf{s}|\mathbf{t})$(同时计算$\textrm{P}(\mathbf{t})$)}};
\node [anchor=east] (scoringlabel) at ([yshift=0.1em]scoring.west) {\footnotesize{\textbf{模型得分计算:}}};
\node [anchor=north west] (search) at (scoring.south west) {\footnotesize{对所有可能的$t$,找到模型得分($\textrm{P}(s|t)\textrm{P}(t)$)最高}};
\node [anchor=north west] (search) at (scoring.south west) {\footnotesize{对所有可能的$\mathbf{t}$,找到模型得分($\textrm{P}(\mathbf{s}|\mathbf{t})\textrm{P}(\mathbf{t})$)最高}};
\node [anchor=north west] (searchpart2) at ([yshift=0.3em]search.south west) {\footnotesize{的译文输出}};
\node [anchor=east] (searchlabel) at ([yshift=0.1em]search.west) {\footnotesize{\textbf{搜索:}}};
......@@ -3317,7 +3317,7 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\begin{itemize}
\item \textbf{搜索(解码)问题}前面的实例已经描述了一种解法(见{\color{ublue} \hyperref[simpledecodingalgorithm]{\hspace{-0.2em}~\ref{simpledecodingalgorithm}}}):自左向右添加译文单词 + 剪枝技术。这里不再讨论,可以自行学习
\item \textbf{剩下的问题是}:对于任意的$s$$t$,如何\alert{高效地}计算$\textrm{P}(s|t)$
\item \textbf{剩下的问题是}:对于任意的$\mathbf{s}$$\mathbf{t}$,如何\alert{高效地}计算$\textrm{P}(\mathbf{s}|\mathbf{t})$
\end{itemize}
\end{itemize}
......@@ -3333,7 +3333,7 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\item $O((l+1)^m \cdot m)$ - IBM模型得分的直接计算几乎不可能!
\begin{displaymath}
\textrm{P}(s|t) = \frac{\epsilon}{(l+1)^{m}} \underbrace{\sum\limits_{a_1=0}^{l} ... \sum\limits_{a_m=0}^{l}}_{(l+1)^m\textrm{次循环}} \underbrace{\prod\limits_{j=1}^{m} f(s_j|t_{a_j})}_{m\textrm{次循环}}
\textrm{P}(\mathbf{s}|\mathbf{t}) = \frac{\epsilon}{(l+1)^{m}} \underbrace{\sum\limits_{a_1=0}^{l} ... \sum\limits_{a_m=0}^{l}}_{(l+1)^m\textrm{次循环}} \underbrace{\prod\limits_{j=1}^{m} f(s_j|t_{a_j})}_{m\textrm{次循环}}
\end{displaymath}
\item<2-> $O(l \cdot m)$ - 实际上我们可以做的更好
......@@ -3355,7 +3355,7 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
}
\visible<3->{
\node [anchor=west] (eq2) at ([xshift=5em,yshift=-4.5em]eq1.west) {$\textrm{P}(s|t) = \frac{\epsilon}{(l+1)^{m}} $};
\node [anchor=west] (eq2) at ([xshift=5em,yshift=-4.5em]eq1.west) {$\textrm{P}(\mathbf{s}|\mathbf{t}) = \frac{\epsilon}{(l+1)^{m}} $};
\node [anchor=west,inner sep=2pt] (eq2part2) at ([xshift=-0.3em]eq2.east) {$\prod\limits_{j=1}^{m} \sum\limits_{i=0}^{l} f(s_j|t_i)$};
\node [anchor=east] (eq2label) at ([xshift=-0em,yshift=0.2em]eq2.west) {\small{IBM模型1:}};
......@@ -3363,7 +3363,7 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
}
\visible<4->{
\node [anchor=west] (eq3) at ([xshift=5em,yshift=-7.5em]eq1.west) {$\textrm{P}(s|t) = \epsilon$};
\node [anchor=west] (eq3) at ([xshift=5em,yshift=-7.5em]eq1.west) {$\textrm{P}(\mathbf{s}|\mathbf{t}) = \epsilon$};
\node [anchor=west,inner sep=2pt] (eq3part2) at ([xshift=-0.3em]eq3.east) {$\prod\limits_{j=1}^{m} \sum\limits_{i=0}^{l} a(i|j,m,l) f(s_j|t_i)$};
\node [anchor=east] (eq3label) at ([xshift=-0em,yshift=0.2em]eq3.west) {\small{类似的,IBM模型2:}};
}
......@@ -3525,7 +3525,7 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\begin{itemize}
\item<2-> \textbf{IBM模型的训练:对于给定的句对$(s,t)$,最大化翻译概率$\textrm{P}(s|t)$}。这里用符号$\textrm{P}_{\theta}(s|t)$表示概率由参数$\theta$决定
\item<2-> \textbf{IBM模型的训练:对于给定的句对$(\mathbf{s},\mathbf{t})$,最大化翻译概率$\textrm{P}(\mathbf{s}|\mathbf{t})$}。这里用符号$\textrm{P}_{\theta}(\mathbf{s}|\mathbf{t})$表示概率由参数$\theta$决定
\begin{center}
\begin{tikzpicture}
......@@ -3534,11 +3534,11 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\node [anchor=west] (eq2) at ([yshift=-0.2em]eq1.east) {=};
\node [anchor=west,inner sep=2pt] (eq3) at ([yshift=-0.0em]eq2.east) {$\argmax$};
\node [anchor=north,inner sep=1pt] (eq3part2) at ([yshift=-0.2em]eq3.south) {\scriptsize{$\theta$}};
\node [anchor=west,inner sep=2pt] (eq4) at ([xshift=0.1em]eq3.east) {$\textrm{P}_{\theta}(s|t)$};
\node [anchor=west,inner sep=2pt] (eq4) at ([xshift=0.1em]eq3.east) {$\textrm{P}_{\theta}(\mathbf{s}|\mathbf{t})$};
\visible<3->{
\node [anchor=west,inner sep=2pt,fill=red!20,minimum height=1.35em] (eq3) at ([yshift=-0.0em]eq2.east) {$\argmax$};
\node [anchor=west,inner sep=2pt,fill=green!20] (eq4) at ([xshift=0.1em]eq3.east) {$\textrm{P}_{\theta}(s|t)$};
\node [anchor=west,inner sep=2pt,fill=green!20] (eq4) at ([xshift=0.1em]eq3.east) {$\textrm{P}_{\theta}(\mathbf{s}|\mathbf{t})$};
\node [anchor=north,draw,inner sep=3pt,fill=red!20] (eq3label) at ([yshift=-1.5em]eq3.south west) {\footnotesize{\textbf{求最优参数}}};
\node [anchor=north,draw,inner sep=3pt,fill=green!20] (eq4label) at ([yshift=-1.5em]eq4.south east) {\footnotesize{\textbf{目标函数}}};
......@@ -3561,23 +3561,23 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\begin{itemize}
\item $\textrm{P}(s|t)$可以被看做是$(s,t)$上的\alert{似然}函数($L(s,t;\theta)$)。所谓\alert{极大似然估计},就是要找到使$L(s,t;\theta)$达到最大的$\theta$
\item $\textrm{P}(\mathbf{s}|\mathbf{t})$可以被看做是$(\mathbf{s},\mathbf{t})$上的\alert{似然}函数($L(\mathbf{s},\mathbf{t};\theta)$)。所谓\alert{极大似然估计},就是要找到使$L(\mathbf{s},\mathbf{t};\theta)$达到最大的$\theta$
\vspace{-0.5em}
\begin{displaymath}
\{\hat{\theta}\} \subseteq \{\argmax_{\theta \in \Theta} L(s,t;\theta)\}
\{\hat{\theta}\} \subseteq \{\argmax_{\theta \in \Theta} L(\mathbf{s},\mathbf{t};\theta)\}
\end{displaymath}
\vspace{-0.3em}
$L(s,t;\theta)$表示$L(\cdot)$依赖模型参数$\theta$(注意分号),$\{\hat{\theta}\}$表示可能有多组结果,$\Theta$表示参数空间
$L(\mathbf{s},\mathbf{t};\theta)$表示$L(\cdot)$依赖模型参数$\theta$(注意分号),$\{\hat{\theta}\}$表示可能有多组结果,$\Theta$表示参数空间
\vspace{0.5em}
\item<2-> 先不用考虑上面的公式。我们还是回归到原始问题:如何找到一组$\theta$使$\textrm{P}_{\theta}(s|t)$达到最大?\\
\item<2-> 先不用考虑上面的公式。我们还是回归到原始问题:如何找到一组$\theta$使$\textrm{P}_{\theta}(\mathbf{s}|\mathbf{t})$达到最大?\\
\begin{itemize}
\item \textbf{求函数最大值问题}。比如,我们可以对$\textrm{P}_{\theta}(s|t)$求导,令导数为零,得到极值点
\item \textbf{求函数最大值问题}。比如,我们可以对$\textrm{P}_{\theta}(\mathbf{s}|\mathbf{t})$求导,令导数为零,得到极值点
\end{itemize}
\end{itemize}
......@@ -3641,7 +3641,7 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
%%%------------------------------------------------------------------------------------------------------------
%%% maximizing P(s|t)
\begin{frame}{最大化$\textrm{P}(s|t)$}
\begin{frame}{最大化$\textrm{P}(\mathbf{s}|\mathbf{t})$}
\begin{itemize}
......@@ -3710,7 +3710,7 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\begin{itemize}
\item \textbf{含有约束的优化问题}: 不好解\\
\textbf{目标:} $\max(\textrm{P}_{\theta}(s|t))$ + \textbf{约束:} $\forall t_y: \sum_{s_x} \textrm{P}(s_x|t_y)=1$
\textbf{目标:} $\max(\textrm{P}_{\theta}(\mathbf{s}|\mathbf{t}))$ + \textbf{约束:} $\forall t_y: \sum_{s_x} \textrm{P}(s_x|t_y)=1$
\vspace{0.3em}
\item<2-> \textbf{解决方法}: 含有约束优化 $\Rightarrow$ 不含约束优化\\
......@@ -3746,7 +3746,7 @@ $s$ = 在 桌子 上 \ \ \ \ \ $t$ = $t_0$ on the table \ \ \ \ \ $a$ = \{1-0,2-3,3-
\begin{center}
\begin{tabular}{c | c}
\makebox[0.35\textwidth][c]{\textbf{原始问题}} & \makebox[0.35\textwidth][c]{\textbf{转化后的问题}} \\ \hline
$\max (\textrm{P}(s|t))$ & $\max (L(f,\lambda))$ \\
$\max (\textrm{P}(\mathbf{s}|\mathbf{t}))$ & $\max (L(f,\lambda))$ \\
s.t. $\forall t_y: \sum_{s_x} f(s_x|t_y) =1 $ & \\
\end{tabular}
\end{center}
......@@ -4022,11 +4022,11 @@ f(s_u|t_v) = \frac{\lambda_{t_v}^{-1} \epsilon}{(l+1)^{m}} \cdot \frac{\sum\limi
}
\visible<2->{
\node [anchor=south west,inner sep=2pt] (label1) at (eq4.north west) {\textbf{\scriptsize{翻译概率$\textrm{P}(s|t)$}}};
\node [anchor=south west,inner sep=2pt] (label1) at (eq4.north west) {\textbf{\scriptsize{翻译概率$\textrm{P}(\mathbf{s}|\mathbf{t})$}}};
}
\visible<3->{
\node [anchor=south west,inner sep=2pt] (label2) at (eq5.north west) {\textbf{\scriptsize{配对的总次数}}};
\node [anchor=south west,inner sep=2pt] (label2part2) at ([yshift=-3pt]label2.north west) {\textbf{\scriptsize{$(s_u,t_v)$在句对$(s,t)$}}};
\node [anchor=south west,inner sep=2pt] (label2part2) at ([yshift=-3pt]label2.north west) {\textbf{\scriptsize{$(s_u,t_v)$在句对$(\mathbf{s},\mathbf{t})$}}};
}
\visible<4->{
\node [anchor=south west,inner sep=2pt] (label3) at (eq6.north west) {\textbf{\scriptsize{有的$t_i$的相对值}}};
......@@ -4035,7 +4035,7 @@ f(s_u|t_v) = \frac{\lambda_{t_v}^{-1} \epsilon}{(l+1)^{m}} \cdot \frac{\sum\limi
\visible<2->{
\node [anchor=east,rotate=90] (neweq1) at ([yshift=-0em]eq4.south) {=};
\node [anchor=north,inner sep=1pt] (neweq1full) at (neweq1.west) {\large{$\textrm{P}(s|t)$}};
\node [anchor=north,inner sep=1pt] (neweq1full) at (neweq1.west) {\large{$\textrm{P}(\mathbf{s}|\mathbf{t})$}};
}
\visible<5->{
......@@ -4123,16 +4123,16 @@ $x_3$ & 5 & 0.2 & 1.0 \\ \hline
\end{center}
\begin{itemize}
\item<5-> \textbf{定义}:在$\textrm{P}(s|t)$中,$t_v$翻译(连接)到$s_u$的期望频次为
\item<5-> \textbf{定义}:在$\textrm{P}(\mathbf{s}|\mathbf{t})$中,$t_v$翻译(连接)到$s_u$的期望频次为
\vspace{-0.5em}
\begin{displaymath}
c_{\mathbb{E}}(s_u|t_v;s,t) \equiv \sum\limits_{j=1}^{m} \delta(s_j,s_u) \sum\limits_{i=0}^{l} \delta(t_i,t_v) \cdot \frac{f(s_u|t_v)}{\sum_{i=0}^{l}f(s_u|t_i)}
c_{\mathbb{E}}(s_u|t_v;\mathbf{s},\mathbf{t}) \equiv \sum\limits_{j=1}^{m} \delta(s_j,s_u) \sum\limits_{i=0}^{l} \delta(t_i,t_v) \cdot \frac{f(s_u|t_v)}{\sum_{i=0}^{l}f(s_u|t_i)}
\end{displaymath}
\vspace{-0.8em}
\item<6-> \textbf{重写$f(s_u|t_v)$}!!!
\begin{center}
\begin{tikzpicture}
\node [anchor=west,draw,red,thick,inner sep=5pt] (p) at (0,0) {\black{$f(s_u|t_v) = \lambda_{t_v}^{-1} \cdot \textrm{P}(s|t) \cdot c_{\mathbb{E}}(s_u|t_v;s,t)$}};
\node [anchor=west,draw,red,thick,inner sep=5pt] (p) at (0,0) {\black{$f(s_u|t_v) = \lambda_{t_v}^{-1} \cdot \textrm{P}(\mathbf{s}|\mathbf{t}) \cdot c_{\mathbb{E}}(s_u|t_v;\mathbf{s},\mathbf{t})$}};
\end{tikzpicture}
\end{center}
\end{itemize}
......@@ -4144,13 +4144,13 @@ $x_3$ & 5 & 0.2 & 1.0 \\ \hline
\begin{frame}{通过期望频次计算$f(s_u|t_v)$}
\begin{itemize}
\item \textbf{一个小trick}: 令$\lambda_{t_v}^{'}=\frac{\lambda_{t_v}}{\textrm{P}(s|t)}$
\item \textbf{一个小trick}: 令$\lambda_{t_v}^{'}=\frac{\lambda_{t_v}}{\textrm{P}(\mathbf{s}|\mathbf{t})}$
\vspace{-1.0em}
\begin{eqnarray}
f(s_u|t_v) & = & \lambda_{t_v}^{-1} \cdot \textrm{P}(s|t) \cdot c_{\mathbb{E}}(s_u|t_v;s,t) \nonumber \\
& = & (\lambda_{t_v}^{'})^{-1} \cdot c_{\mathbb{E}}(s_u|t_v;s,t) \nonumber
f(s_u|t_v) & = & \lambda_{t_v}^{-1} \cdot \textrm{P}(\mathbf{s}|\mathbf{t}) \cdot c_{\mathbb{E}}(s_u|t_v;\mathbf{s},\mathbf{t}) \nonumber \\
& = & (\lambda_{t_v}^{'})^{-1} \cdot c_{\mathbb{E}}(s_u|t_v;\mathbf{s},\mathbf{t}) \nonumber
\end{eqnarray}
\item<2-> \textbf{$\lambda_{t_v}^{'}$究竟是什么?} - 回忆一下IBM模型对$f(\cdot|\cdot)$的约束
......@@ -4163,7 +4163,7 @@ f(s_u|t_v) & = & \lambda_{t_v}^{-1} \cdot \textrm{P}(s|t) \cdot c_{\mathbb{E}}(s
\vspace{-0.3em}
\begin{displaymath}
\lambda_{t_v}^{'}=\sum_{s_u} c_{\mathbb{E}}(s_u|t_v;s,t)
\lambda_{t_v}^{'}=\sum_{s_u} c_{\mathbb{E}}(s_u|t_v;\mathbf{s},\mathbf{t})
\end{displaymath}
\vspace{-0.6em}
......@@ -4173,8 +4173,8 @@ f(s_u|t_v) & = & \lambda_{t_v}^{-1} \cdot \textrm{P}(s|t) \cdot c_{\mathbb{E}}(s
\begin{tikzpicture}
\node [anchor=west] (eq1) at (0,0) {$f(s_u|t_v) =$};
\draw [-] (eq1.east) -- ([xshift=8em]eq1.east);
\node [anchor=south west] (eq2) at ([xshift=1em]eq1.east) {$c_{\mathbb{E}}(s_u|t_v;s,t)$};
\node [anchor=north west] (eq3) at (eq1.east) {$\sum_{s_u} c_{\mathbb{E}}(s_u|t_v;s,t)$};
\node [anchor=south west] (eq2) at ([xshift=1em]eq1.east) {$c_{\mathbb{E}}(s_u|t_v;\mathbf{s},\mathbf{t})$};
\node [anchor=north west] (eq3) at (eq1.east) {$\sum_{s_u} c_{\mathbb{E}}(s_u|t_v;\mathbf{s},\mathbf{t})$};
\begin{pgfonlayer}{background}
\node[rectangle,draw,red,thick,inner sep=0] [fit = (eq1) (eq2) (eq3)] {};
......@@ -4190,9 +4190,9 @@ f(s_u|t_v) & = & \lambda_{t_v}^{-1} \cdot \textrm{P}(s|t) \cdot c_{\mathbb{E}}(s
%%% scale it up to the full corpus
\begin{frame}{在整个数据集上计算}
\begin{itemize}
\item \textbf{更真实的情况}:我们拥有一系列互译的句对(称作\alert{平行语料}),记为$\{(s^{[1]},t^{[1]}),(s^{[2]},t^{[2]}),...,(s^{[N]},t^{[N]})\}$。对于这$N$个训练用句对,定义$f(s_u|t_v)$的期望频次为
\item \textbf{更真实的情况}:我们拥有一系列互译的句对(称作\alert{平行语料}),记为$\{(\mathbf{s}^{[1]},\mathbf{t}^{[1]}),(\mathbf{s}^{[2]},\mathbf{t}^{[2]}),...,(\mathbf{s}^{[N]},\mathbf{t}^{[N]})\}$。对于这$N$个训练用句对,定义$f(s_u|t_v)$的期望频次为
\begin{displaymath}
c_{\mathbb{E}}(s_u|t_v) = \sum_{i=1}^{N} c_{\mathbb{E}}(s_u|t_v;s^{[i]},t^{[i]})
c_{\mathbb{E}}(s_u|t_v) = \sum_{i=1}^{N} c_{\mathbb{E}}(s_u|t_v;\mathbf{s}^{[i]},\mathbf{t}^{[i]})
\end{displaymath}
\item<2-> \textbf{于是}
\begin{center}
......@@ -4200,8 +4200,8 @@ f(s_u|t_v) & = & \lambda_{t_v}^{-1} \cdot \textrm{P}(s|t) \cdot c_{\mathbb{E}}(s
\node [anchor=west,inner sep=2pt] (eq1) at (0,0) {$f(s_u|t_v)$};
\node [anchor=west] (eq2) at (eq1.east) {$=$\ };
\draw [-] ([xshift=0.3em]eq2.east) -- ([xshift=11.6em]eq2.east);
\node [anchor=south west] (eq3) at ([xshift=1em]eq2.east) {$\sum_{i=1}^{N} c_{\mathbb{E}}(s_u|t_v;s^{[i]},t^{[i]})$};
\node [anchor=north west] (eq4) at (eq2.east) {$\sum_{s_u} \sum_{i=1}^{N} c_{\mathbb{E}}(s_u|t_v;s^{[i]},t^{[i]})$};
\node [anchor=south west] (eq3) at ([xshift=1em]eq2.east) {$\sum_{i=1}^{N} c_{\mathbb{E}}(s_u|t_v;\mathbf{s}^{[i]},\mathbf{t}^{[i]})$};
\node [anchor=north west] (eq4) at (eq2.east) {$\sum_{s_u} \sum_{i=1}^{N} c_{\mathbb{E}}(s_u|t_v;\mathbf{s}^{[i]},\mathbf{t}^{[i]})$};
\visible<4->{
\node [anchor=south] (label1) at ([yshift=-6em,xshift=3em]eq1.north west) {利用这个公式计算};
......@@ -4250,17 +4250,17 @@ f(s_u|t_v) & = & \lambda_{t_v}^{-1} \cdot \textrm{P}(s|t) \cdot c_{\mathbb{E}}(s
\label{ibmtraining}
\begin{beamerboxesrounded}[upper=uppercolblue,lower=lowercolblue,shadow=true]{IBM模型1的训练(EM算法)}
输入: 平行语料$\{(s^{[1]},t^{[1]}),...,(s^{[N]},t^{[N]})\}$\\
输入: 平行语料$\{(\mathbf{s}^{[1]},\mathbf{t}^{[1]}),...,(\mathbf{s}^{[N]},\mathbf{t}^{[N]})\}$\\
输出:参数$f(\cdot|\cdot)$的最优值\\
1: \textbf{Function} \textsc{TrainItWithEM}($\{(s^{[1]},t^{[1]}),...,(s^{[N]},t^{[N]})\}$) \\
1: \textbf{Function} \textsc{TrainItWithEM}($\{(\mathbf{s}^{[1]},\mathbf{t}^{[1]}),...,(\mathbf{s}^{[N]},\mathbf{t}^{[N]})\}$) \\
2: \ \ Initialize $f(\cdot|\cdot)$ \hspace{5em} $\rhd$ 比如给$f(\cdot|\cdot)$一个均匀分布\\
3: \ \ Loop until $f(\cdot|\cdot)$ converges\\
4: \ \ \ \ \textbf{foreach} $k = 1$ to $N$ \textbf{do}\\
5: \ \ \ \ \ \ \ \footnotesize{$c_{\mathbb{E}}(s_u|t_v;s^{[k]},t^{[k]}) = \sum\limits_{j=1}^{|s^{[k]}|} \delta(s_j,s_u) \sum\limits_{i=0}^{|t^{[k]}|} \delta(t_i,t_v) \cdot \frac{f(s_u|t_v)}{\sum_{i=0}^{l}f(s_u|t_i)}$}\normalsize{}\\
6: \ \ \ \ \textbf{foreach} $t_v$ appears at least one of $\{t^{[1]},...,t^{[N]}\}$ \textbf{do}\\
7: \ \ \ \ \ \ \ $\lambda_{t_v}^{'} = \sum_{s_u} \sum_{k=1}^{N} c_{\mathbb{E}}(s_u|t_v;s^{[k]},t^{[k]})$\\
8: \ \ \ \ \ \ \ \textbf{foreach} $s_u$ appears at least one of $\{s^{[1]},...,s^{[N]}\}$ \textbf{do}\\
9: \ \ \ \ \ \ \ \ \ $f(s_u|t_v) = \sum_{k=1}^{N} c_{\mathbb{E}}(s_u|t_v;s^{[k]},t^{[k]}) \cdot (\lambda_{t_v}^{'})^{-1}$\\
5: \ \ \ \ \ \ \ \footnotesize{$c_{\mathbb{E}}(s_u|t_v;\mathbf{s}^{[k]},\mathbf{t}^{[k]}) = \sum\limits_{j=1}^{|\mathbf{s}^{[k]}|} \delta(s_j,s_u) \sum\limits_{i=0}^{|\mathbf{t}^{[k]}|} \delta(t_i,t_v) \cdot \frac{f(s_u|t_v)}{\sum_{i=0}^{|\mathbf{t}^{[k]}|}f(s_u|t_i)}$}\normalsize{}\\
6: \ \ \ \ \textbf{foreach} $t_v$ that appears in at least one of $\{\mathbf{t}^{[1]},...,\mathbf{t}^{[N]}\}$ \textbf{do}\\
7: \ \ \ \ \ \ \ $\lambda_{t_v}^{'} = \sum_{s_u} \sum_{k=1}^{N} c_{\mathbb{E}}(s_u|t_v;\mathbf{s}^{[k]},\mathbf{t}^{[k]})$\\
8: \ \ \ \ \ \ \ \textbf{foreach} $s_u$ that appears in at least one of $\{\mathbf{s}^{[1]},...,\mathbf{s}^{[N]}\}$ \textbf{do}\\
9: \ \ \ \ \ \ \ \ \ $f(s_u|t_v) = \sum_{k=1}^{N} c_{\mathbb{E}}(s_u|t_v;\mathbf{s}^{[k]},\mathbf{t}^{[k]}) \cdot (\lambda_{t_v}^{'})^{-1}$\\
10: \ \textbf{return} $f(\cdot|\cdot)$
\end{beamerboxesrounded}
\vspace{-0.3em}
......@@ -4280,15 +4280,15 @@ f(s_u|t_v) & = & \lambda_{t_v}^{-1} \cdot \textrm{P}(s|t) \cdot c_{\mathbb{E}}(s
\end{itemize}
\end{itemize}
\begin{enumerate}
\item \textbf{E-Step} (对于句对$(s,t)$$m=|s|,l=|t|$)
\item \textbf{E-Step} (对于句对$(\mathbf{s},\mathbf{t})$$m=|\mathbf{s}|,l=|\mathbf{t}|$)
\begin{eqnarray}
c_{\mathbb{E}}(s_u|t_v;s,t) & = & \sum_{j=1}^{m} \sum_{i=0}^{l} \frac{f(s_u|t_v)a(i|j,m,l)\delta(s_j,s_u)\delta(t_i,t_v)}{\sum_{k=0}^{l} f(s_u|t_k)a(k|j,m,l)} \nonumber \\
c_{\mathbb{E}}(i|j,m,l;s,t) & = & \frac{f(s_j|t_i)a(i|j,m,l)}{\sum_{k=0}^{l} f(s_j|t_k)a(k|j,m,l)} \nonumber
c_{\mathbb{E}}(s_u|t_v;\mathbf{s},\mathbf{t}) & = & \sum_{j=1}^{m} \sum_{i=0}^{l} \frac{f(s_u|t_v)a(i|j,m,l)\delta(s_j,s_u)\delta(t_i,t_v)}{\sum_{k=0}^{l} f(s_u|t_k)a(k|j,m,l)} \nonumber \\
c_{\mathbb{E}}(i|j,m,l;\mathbf{s},\mathbf{t}) & = & \frac{f(s_j|t_i)a(i|j,m,l)}{\sum_{k=0}^{l} f(s_j|t_k)a(k|j,m,l)} \nonumber
\end{eqnarray}
\item \textbf{M-Step}
\begin{eqnarray}
f(s_u|t_v) & = & \frac{\sum_{k=0}^{K} c_{\mathbb{E}}(s_u|t_v;s^{[k]},t^{[k]})}{\sum_{s_u} \sum_{k=0}^{K} c_{\mathbb{E}}(s_u|t_v;s^{[k]},t^{[k]})} \nonumber \\
a(i|j,m,l) & = & \frac{\sum_{k=0}^{K} c_{\mathbb{E}}(i|j;s^{[k]},t^{[k]})}{\sum_{i} \sum_{k=0}^{K} c_{\mathbb{E}}(i|j;s^{[k]},t^{[k]})} \nonumber
f(s_u|t_v) & = & \frac{\sum_{k=1}^{N} c_{\mathbb{E}}(s_u|t_v;\mathbf{s}^{[k]},\mathbf{t}^{[k]})}{\sum_{s_u} \sum_{k=1}^{N} c_{\mathbb{E}}(s_u|t_v;\mathbf{s}^{[k]},\mathbf{t}^{[k]})} \nonumber \\
a(i|j,m,l) & = & \frac{\sum_{k=1}^{N} c_{\mathbb{E}}(i|j,m,l;\mathbf{s}^{[k]},\mathbf{t}^{[k]})}{\sum_{i} \sum_{k=1}^{N} c_{\mathbb{E}}(i|j,m,l;\mathbf{s}^{[k]},\mathbf{t}^{[k]})} \nonumber
\end{eqnarray}
\end{enumerate}
\end{frame}
......
......@@ -3706,8 +3706,8 @@ d = r_1 \circ r_2 \circ r_3 \circ r_4
\subsection{基于chart的解码}
%%%------------------------------------------------------------------------------------------------------------
%%% CYK解码
\begin{frame}{CYK解码}
%%% CKY解码
\begin{frame}{CKY解码}
% 看NiuTrans Manual
\begin{itemize}
\item 基于层次短语的翻译解码与基于短语的模型类似,都是要找到使$\textrm{score}(d)$达到最大的翻译推导$d$
......@@ -3717,8 +3717,8 @@ d = r_1 \circ r_2 \circ r_3 \circ r_4
\end{displaymath}
\vspace{-0.8em}
\begin{itemize}
\item 由于翻译推导由SCFG构成,使用CYK算法进行解码
\item CYK算法解码是一个用来判定任意给定的字符串 是否属于一个上下文无关文法的算法,具体流程如下
\item 由于翻译推导由SCFG构成,使用CKY算法进行解码
\item CKY算法是一个用来判定任意给定的字符串是否属于一个上下文无关文法的算法,具体流程如下
\end{itemize}
\vspace{0.5em}
\begin{center}
......@@ -3740,16 +3740,16 @@ d = r_1 \circ r_2 \circ r_3 \circ r_4
\end{tikzpicture}
\end{center}
\vspace{0.3em}
%\item 由于对文法中的非终结符进行了限制,可以直接使用CYK算法进行解码,无需转换成乔姆斯基范式
%\item 由于对文法中的非终结符进行了限制,可以直接使用CKY算法进行解码,无需转换成乔姆斯基范式
\end{itemize}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% CYK解码
\begin{frame}{CYK算法}
%%% CKY解码
\begin{frame}{CKY算法}
% 看NiuTrans Manual
\begin{itemize}
\item CYK算法通过遍历不同\alert{span}来判断字符串是否符合文法
\item CKY算法通过遍历不同\alert{span}来判断字符串是否符合文法
\begin{itemize}
\item 输入:源语串\textbf{s =} $s_1 ... s_J$,以及CNF文法$G$
\item 输出:判断字符串是否符合$G$
......@@ -3762,7 +3762,7 @@ d = r_1 \circ r_2 \circ r_3 \circ r_4
\tikzstyle{srcnode} = [anchor=south west]
\begin{scope}[scale=0.85]
\node[srcnode] (c1) at (0,0) {\small{\textbf{Function} CYK-Algorithm($\textbf{s},G$)}};
\node[srcnode] (c1) at (0,0) {\small{\textbf{Function} CKY-Algorithm($\textbf{s},G$)}};
\node[srcnode,anchor=north west] (c21) at ([xshift=1.5em,yshift=0.4em]c1.south west) {\small{\textbf{for} $j=0$ to $J - 1$}};
\node[srcnode,anchor=north west] (c22) at ([xshift=1.5em,yshift=0.4em]c21.south west) {\small{$span[j,j+1]$.Add($A \to a \in G$)}};
\node[srcnode,anchor=north west] (c3) at ([xshift=-1.5em,yshift=0.4em]c22.south west) {\small{\textbf{for} $l$ = 1 to $J$}};
......@@ -3810,11 +3810,11 @@ d = r_1 \circ r_2 \circ r_3 \circ r_4
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% CYK解码
\begin{frame}{CYK算法}
%%% CKY解码
\begin{frame}{CKY算法}
% 看NiuTrans Manual
\begin{itemize}
\item 我们来看一个CYK算法的具体例子,给定一个上下无关文法以及一个单词\alert{aabbc},来判断该单词是否属于此文法,解析流程如下
\item 我们来看一个CKY算法的具体例子,给定一个上下文无关文法以及一个单词\alert{aabbc},来判断该单词是否属于此文法,解析流程如下
\vspace{-0.3em}
\begin{center}
\begin{tikzpicture}
......@@ -3946,11 +3946,11 @@ d = r_1 \circ r_2 \circ r_3 \circ r_4
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% CYK解码
\begin{frame}{CYK解码(续)}
%%% CKY解码
\begin{frame}{CKY解码(续)}
% 看NiuTrans Manual
\begin{itemize}
\item 实际上,在层次短语解码的时候,不能直接使用CYK算法,需要先转化为乔姆斯基范式,才能进行解码
\item 实际上,在层次短语解码的时候,不能直接使用CKY算法,需要先转化为乔姆斯基范式,才能进行解码
\begin{itemize}
\item<2-> 对于每个源语句子,使用短语规则表初始化它的span
\item<3-> 自底向上对span中的每个子span进行重新组合(正、反向)
......@@ -4166,7 +4166,7 @@ d = r_1 \circ r_2 \circ r_3 \circ r_4
% 实验结果
\begin{itemize}
\item 从实验结果中可以看出,基于层次短语的翻译模型性能要优于基于短语的翻译模型
\item 选择使用层次短语信息实际上增加了模型的复杂度,但是可以通过借鉴基于短语的翻译模型模型以及CYK解码和立方剪枝等技术来解决
\item 选择使用层次短语信息实际上增加了模型的复杂度,但是可以通过借鉴基于短语的翻译模型以及CKY解码和立方剪枝等技术来解决
\item 可以考虑加入更多句法信息来进一步提升模型性能
\end{itemize}
%\vspace{-1em}
......@@ -6785,7 +6785,7 @@ NP-BAR(NN$_1$ NP-BAR$_2$) $\to$ NN$_1$ NP-BAR$_2$
搜索空间 & 与输入的源语句法树 & 所有推导$D$ \\
& 兼容的推导$D_{\textrm{tree}}$ & \\ \hline
适用模型 & 树到串、树到树 & 所有句法模型 \\ \hline
解码算法 & chart解码 & CYK + 规则二叉化 \\ \hline
解码算法 & chart解码 & CKY + 规则二叉化 \\ \hline
速度 && 一般较慢
\end{tabular}
......@@ -7358,7 +7358,7 @@ NP-BAR(NN$_1$ NP-BAR$_2$) $\to$ NN$_1$ NP-BAR$_2$
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% 基于串的解码 - CYK + 规则二叉化
%%% 基于串的解码 - CKY + 规则二叉化
\begin{frame}{基于串的解码 - CKY + 规则二叉化}
\begin{itemize}
......
......@@ -5031,6 +5031,10 @@ GPT-2 (Transformer) & Radford et al. & 2019 & 35.7
\node [anchor=west,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e2) at ([xshift=1em]e1.east) {\scriptsize{$\textbf{e}_2$}};
\node [anchor=west,inner sep=4pt] (sep5) at ([xshift=1em]e2.east) {\scriptsize{...}};
\node [anchor=west,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e3) at ([xshift=1em]sep5.east) {\scriptsize{$\textbf{e}_m$}};
\node [anchor=south] (word1) at ([yshift=-1.5em]e1.south) {\footnotesize {Once}};
\node [anchor=south] (word2) at ([yshift=-1.6em]e2.south) {\footnotesize {upon}};
\node [anchor=south] (wordseq) at ([yshift=-1.5em]sep5.south) {\footnotesize{...}};
\node [anchor=south] (word3) at ([yshift=-1.5em]e3.south) {\footnotesize {island}};
\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t1) at ([xshift=-2em,yshift=1em]Lstm5.north) {\scriptsize{$\textbf{h}_1$}};
\node [anchor=west,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t2) at ([xshift=1em]t1.east) {\scriptsize{$\textbf{h}_2$}};
......@@ -5130,6 +5134,12 @@ GPT-2 (Transformer) & Radford et al. & 2019 & 35.7
\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e4) at ([yshift=-1em]Trm3.south) {\scriptsize{$\textbf{e}_4$}};
\node [anchor=north,inner sep=4pt] (sep5) at ([yshift=-1em]sep.south) {\scriptsize{...}};
\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e5) at ([yshift=-1em]Trm4.south) {\scriptsize{$\textbf{e}_m$}};
\node [anchor=south] (word1) at ([yshift=-1.5em]e1.south) {\footnotesize {Once}};
\node [anchor=south] (word2) at ([yshift=-1.6em]e2.south) {\footnotesize {upon}};
\node [anchor=south] (word3) at ([yshift=-1.5em]e3.south) {\footnotesize {a}};
\node [anchor=south] (word4) at ([yshift=-1.5em]e4.south) {\footnotesize {time}};
\node [anchor=south] (wordseq) at ([yshift=-2.0em]sep5.south) {\footnotesize{...}};
\node [anchor=south] (word5) at ([yshift=-1.5em]e5.south) {\footnotesize {island}};
\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t1) at ([yshift=1em]Trm5.north) {\scriptsize{$\textbf{h}_1$}};
\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t2) at ([yshift=1em]Trm6.north) {\scriptsize{$\textbf{h}_2$}};
......@@ -5214,6 +5224,12 @@ GPT-2 (Transformer) & Radford et al. & 2019 & 35.7
\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e4) at ([yshift=-1em]Trm3.south) {\scriptsize{$\textbf{e}_4$}};
\node [anchor=north,inner sep=4pt] (sep5) at ([yshift=-1em]sep.south) {\scriptsize{...}};
\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e5) at ([yshift=-1em]Trm4.south) {\scriptsize{$\textbf{e}_m$}};
\node [anchor=south] (word1) at ([yshift=-1.5em]e1.south) {\footnotesize {Once}};
\node [anchor=south] (word2) at ([yshift=-1.7em]e2.south) {\footnotesize {[MASK]}};
\node [anchor=south] (word3) at ([yshift=-1.5em]e3.south) {\footnotesize {a}};
\node [anchor=south] (word4) at ([yshift=-1.5em]e4.south) {\footnotesize {time}};
\node [anchor=south] (wordseq) at ([yshift=-2.0em]sep5.south) {\footnotesize{...}};
\node [anchor=south] (word5) at ([yshift=-1.5em]e5.south) {\footnotesize {island}};
\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t1) at ([yshift=1em]Trm5.north) {\scriptsize{$\textbf{h}_1$}};
\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t2) at ([yshift=1em]Trm6.north) {\scriptsize{$\textbf{h}_2$}};
......
......@@ -520,12 +520,12 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\node[rnnnode,fill=blue!30!white,right=\base of rnn3] (rnn4) {};
\node[rnnnode,fill=green!30!white,below=\base of rnn4] (emb4) {};
\node[wordnode,below=0pt of emb4] (word4) {EOS};
\node[wordnode,below=0pt of emb4] (word4) {$\langle$eos$\rangle$};
\draw[-latex'] (emb4.north) to (rnn4.south);
\draw[-latex'] (rnn3.east) to (rnn4.west);
}
\visible<4->{
\draw[decoration={mirror,brace},decorate] (word1.south west) to node [auto,anchor=north,align=center] {编码器} ([yshift=-0.2em]word4.south east);
\draw[decoration={mirror,brace},decorate] ([yshift=-0.2em]word1.south west) to node [auto,anchor=north,align=center] {编码器} ([yshift=-0.2em]word4.south east);
}
\visible<5->{
\node[rnnnode,fill=purple] (repr) at (rnn4) {};
......@@ -535,7 +535,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\visible<6->{
\node[rnnnode,fill=blue!30!white,right=\base of rnn4] (rnn5) {};
\node[rnnnode,fill=green!30!white,below=\base of rnn5] (emb5) {};
\node[wordnode,below=0pt of emb5] (word5) {SOS};
\node[wordnode,below=0pt of emb5] (word5) {$\langle$sos$\rangle$};
\draw[-latex'] (emb5.north) to (rnn5.south);
\draw[-latex'] (rnn4.east) to (rnn5.west);
\node[rnnnode,fill=red!30!white,above=\base of rnn5] (softmax1) {};
......@@ -578,7 +578,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\node[wordnode,anchor=base] (word8) at (\XCoord,\YCoord) {fine};
\ExtractX{$(emb8)$}
\ExtractY{$(out1.base)$}
\node[wordnode,anchor=base] (out4) at (\XCoord,\YCoord) {EOS};
\node[wordnode,anchor=base] (out4) at (\XCoord,\YCoord) {$\langle$eos$\rangle$};
\draw[-latex'] (emb8.north) to (rnn8.south);
\draw[-latex'] (rnn7.east) to (rnn8.west);
\draw[-latex'] (rnn8.north) to (softmax4.south);
......@@ -720,7 +720,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\node [anchor=north,rnnnode,fill=blue!30!white] (e2) at ([yshift=-1em]node12.south) {\tiny{}};
\node [anchor=north,rnnnode,fill=blue!30!white] (e3) at ([yshift=-1em]node13.south) {\tiny{}};
\node [anchor=north,rnnnode,fill=blue!30!white] (e4) at ([yshift=-1em]node14.south) {\tiny{}};
\node [anchor=north,inner sep=2pt] (w1) at ([yshift=-1em]e1.south) {\tiny{$<$s$>$}};
\node [anchor=north,inner sep=2pt] (w1) at ([yshift=-1em]e1.south) {\tiny{$\langle$sos$\rangle$}};
\node [anchor=north,inner sep=2pt] (w2) at ([yshift=-1em]e2.south) {\tiny{}};
\node [anchor=north,inner sep=2pt] (w3) at ([yshift=-1em]e3.south) {\tiny{我们}};
\node [anchor=north,inner sep=2pt] (w4) at ([yshift=-1em]e4.south) {\tiny{开始}};
......@@ -1072,10 +1072,10 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\draw[-latex'] (enc3.north) .. controls +(north:0.3\base) and +(east:\base) .. (bridge) .. controls +(west:2.7\base) and +(west:0.3\base) .. (dec1.west);
\visible<2->{
\node [anchor=east] (line1) at ([xshift=-3em,yshift=0.5em]softmax1.west) {\scriptsize{基于RNN的隐层状态$\textbf{s}_i$}};
\node [anchor=east] (line1) at ([xshift=-3em,yshift=0.5em]softmax1.west) {\scriptsize{基于RNN的隐层状态$\textbf{s}_j$}};
\node [anchor=north west] (line2) at ([yshift=0.3em]line1.south west) {\scriptsize{预测目标词的概率}};
\node [anchor=north west] (line3) at ([yshift=0.3em]line2.south west) {\scriptsize{通常,用Softmax函数}};
\node [anchor=north west] (line4) at ([yshift=0.3em]line3.south west) {\scriptsize{实现 $\textrm{P}(y_i|...)$}};
\node [anchor=north west] (line4) at ([yshift=0.3em]line3.south west) {\scriptsize{实现 $\textrm{P}(y_j|...)$}};
}
\visible<3->{
......@@ -1833,7 +1833,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\node[rnnnode,minimum height=0.5\base,fill=red!30!white,anchor=south] (softmax\x) at ([yshift=0.5\base]dec\x.north) {};
% Decoder input words
\node[wordnode,below=0pt of demb1] (decwordin) {EOS};
\node[wordnode,below=0pt of demb1] (decwordin) {$\langle$sos$\rangle$};
\ExtractX{$(demb2.south)$}
\ExtractY{$(decwordin.base)$}
\node[wordnode,anchor=base] () at (\XCoord,\YCoord) {Do};
......@@ -1890,7 +1890,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\node[wordnode,anchor=base] () at (\XCoord,\YCoord) {Station};
\ExtractX{$(softmax10.north)$}
\ExtractY{$(decwordout.base)$}
\node[wordnode,anchor=base] () at (\XCoord,\YCoord) {EOS};
\node[wordnode,anchor=base] () at (\XCoord,\YCoord) {$\langle$eos$\rangle$};
% Connections
\draw[-latex'] (init.east) to (enc1.west);
......@@ -1971,7 +1971,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\node[wordnode,below=0pt of eemb7] () {怎么};
\node[wordnode,below=0pt of eemb8] () {};
\node[wordnode,below=0pt of eemb9] () {};
\node[wordnode,below=0pt of eemb10] () {EOS};
\node[wordnode,below=0pt of eemb10] () {$\langle$eos$\rangle$};
% RNN Decoder
\foreach \x in {1,2,...,10}
......@@ -2041,7 +2041,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\node[wordnode,anchor=base] () at (\XCoord,\YCoord) {Station};
\ExtractX{$(softmax10.north)$}
\ExtractY{$(decwordout.base)$}
\node[wordnode,anchor=base] () at (\XCoord,\YCoord) {EOS};
\node[wordnode,anchor=base] () at (\XCoord,\YCoord) {$\langle$eos$\rangle$};
% Connections
\draw[-latex'] (init1.east) to (enc11.west);
......@@ -2187,7 +2187,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\begin{itemize}
\item 在注意力机制中,每个目标语单词的生成会使用一个动态的源语表示,而非一个统一的固定表示
\begin{itemize}
\item 这里$\textbf{C}_i$表示第$i$个目标语单词所使用的源语表示
\item 这里$\textbf{C}_j$表示第$j$个目标语单词所使用的源语表示
\end{itemize}
\end{itemize}
......@@ -2286,15 +2286,15 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
%%%------------------------------------------------------------------------------------------------------------
%%% C_i的定义
\begin{frame}{上下文向量$\textbf{C}_i$}
\begin{frame}{上下文向量$\textbf{C}_j$}
\begin{itemize}
\item 对于目标语位置$i$$\textbf{C}_i$是目标语$i$使用的上下文向量
\item 对于目标语位置$j$$\textbf{C}_j$是目标语$j$使用的上下文向量
\begin{itemize}
\item $\textbf{h}_j$表示编码器第$j$个位置的隐层状态
\item $\textbf{s}_i$表示解码器第$i$个位置的隐层状态
\item<2-> $\alpha_{i,j}$表示注意力权重,表示目标语第$i$个位置与源语第$j$个位置之间的相关性大小
\item<2-> $a(\cdot)$表示注意力函数,计算$\textbf{s}_{i-1}$$\textbf{h}_j$之间的相关性
\item<3-> $\textbf{C}_i$是所有源语编码表示$\{\textbf{h}_j\}$的加权求和,权重为$\{\alpha_{i,j}\}$
\item $\textbf{h}_i$表示编码器第$i$个位置的隐层状态
\item $\textbf{s}_j$表示解码器第$j$个位置的隐层状态
\item<2-> $\alpha_{i,j}$表示注意力权重,表示目标语第$j$个位置与源语第$i$个位置之间的相关性大小
\item<2-> $a(\cdot)$表示注意力函数,计算$\textbf{s}_{j-1}$$\textbf{h}_i$之间的相关性
\item<3-> $\textbf{C}_j$是所有源语编码表示$\{\textbf{h}_i\}$的加权求和,权重为$\{\alpha_{i,j}\}$
\end{itemize}
\end{itemize}
......@@ -2306,23 +2306,23 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h1) at (0,0) {\scriptsize{$\textbf{h}_1$}};
\node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h2) at ([xshift=1em]h1.east) {\scriptsize{$\textbf{h}_2$}};
\node [anchor=west,inner sep=0pt,minimum width=3em] (h3) at ([xshift=0.5em]h2.east) {\scriptsize{...}};
\node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h4) at ([xshift=0.5em]h3.east) {\scriptsize{$\textbf{h}_n$}};
\node [anchor=west,draw,fill=red!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (h4) at ([xshift=0.5em]h3.east) {\scriptsize{$\textbf{h}_m$}};
\node [anchor=south,circle,minimum size=1.0em,draw,ublue,thick] (sum) at ([yshift=2em]h2.north east) {};
\draw [thick,-,ublue] (sum.north) -- (sum.south);
\draw [thick,-,ublue] (sum.west) -- (sum.east);
\node [anchor=south,draw,fill=green!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (th1) at ([yshift=2em,xshift=-1em]sum.north west) {\scriptsize{$\textbf{s}_{i-1}$}};
\node [anchor=west,draw,fill=green!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (th2) at ([xshift=2em]th1.east) {\scriptsize{$\textbf{s}_{i}$}};
\node [anchor=south,draw,fill=green!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (th1) at ([yshift=2em,xshift=-1em]sum.north west) {\scriptsize{$\textbf{s}_{j-1}$}};
\node [anchor=west,draw,fill=green!20!white,inner sep=3pt,minimum width=2em,minimum height=1.2em] (th2) at ([xshift=2em]th1.east) {\scriptsize{$\textbf{s}_{j}$}};
\draw [->] (h1.north) .. controls +(north:0.8) and +(west:1) .. (sum.190) node [pos=0.3,left] {\tiny{$\alpha_{i,1}$}};
\draw [->] (h2.north) .. controls +(north:0.6) and +(220:0.2) .. (sum.220) node [pos=0.2,right] {\tiny{$\alpha_{i,2}$}};
\draw [->] (h4.north) .. controls +(north:0.8) and +(east:1) .. (sum.-10) node [pos=0.1,left] (alphan) {\tiny{$\alpha_{i,n}$}};
\draw [->] (h1.north) .. controls +(north:0.8) and +(west:1) .. (sum.190) node [pos=0.3,left] {\tiny{$\alpha_{1,j}$}};
\draw [->] (h2.north) .. controls +(north:0.6) and +(220:0.2) .. (sum.220) node [pos=0.2,right] {\tiny{$\alpha_{2,j}$}};
\draw [->] (h4.north) .. controls +(north:0.8) and +(east:1) .. (sum.-10) node [pos=0.1,left] (alphan) {\tiny{$\alpha_{m,j}$}};
\draw [->] ([xshift=-1.5em]th1.west) -- ([xshift=-0.1em]th1.west);
\draw [->] ([xshift=0.1em]th1.east) -- ([xshift=-0.1em]th2.west);
\draw [->] ([xshift=0.1em]th2.east) -- ([xshift=1.5em]th2.east);
\draw [->] (sum.north) .. controls +(north:0.8) and +(west:0.2) .. ([yshift=-0.4em,xshift=-0.1em]th2.west) node [pos=0.2,right] (ci) {\scriptsize{$\textbf{C}_{i}$}};
\draw [->] (sum.north) .. controls +(north:0.8) and +(west:0.2) .. ([yshift=-0.4em,xshift=-0.1em]th2.west) node [pos=0.2,right] (ci) {\scriptsize{$\textbf{C}_{j}$}};
\node [anchor=south,inner sep=1pt] (output) at ([yshift=0.8em]th2.north) {\tiny{输出层}};
\draw [->] ([yshift=0.1em]th2.north) -- ([yshift=-0.1em]output.south);
......@@ -2334,11 +2334,11 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\node [anchor=north] (enc42) at ([yshift=0.5em]enc4.south) {\tiny{(位置$4$)}};
\visible<2->{
\node [anchor=west] (math1) at ([xshift=5em,yshift=1em]th2.east) {$\textbf{C}_i = \sum_{j} \alpha_{i,j} \textbf{h}_j \ \ $};
\node [anchor=west] (math1) at ([xshift=5em,yshift=1em]th2.east) {$\textbf{C}_j = \sum_{i} \alpha_{i,j} \textbf{h}_i \ \ $};
}
\visible<3->{
\node [anchor=north west] (math2) at ([yshift=-2em]math1.south west) {$\alpha_{i,j} = \frac{\exp(\beta_{i,j})}{\sum_{j'} \exp(\beta_{i,j'})}$};
\node [anchor=north west] (math3) at ([yshift=-0em]math2.south west) {$\beta_{i,j} = a(\textbf{s}_{i-1}, \textbf{h}_j)$};
\node [anchor=north west] (math2) at ([yshift=-2em]math1.south west) {$\alpha_{i,j} = \frac{\exp(\beta_{i,j})}{\sum_{i'} \exp(\beta_{i',j})}$};
\node [anchor=north west] (math3) at ([yshift=-0em]math2.south west) {$\beta_{i,j} = a(\textbf{s}_{j-1}, \textbf{h}_i)$};
}
\begin{pgfonlayer}{background}
......@@ -2418,7 +2418,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\node[srcnode] (src3) at ([xshift=0.5\hnode]src2.south west) {\scriptsize{learned}};
\node[srcnode] (src4) at ([xshift=0.5\hnode]src3.south west) {\scriptsize{nothing}};
\node[srcnode] (src5) at ([xshift=0.5\hnode]src4.south west) {\scriptsize{?}};
\node[srcnode] (src6) at ([xshift=0.5\hnode]src5.south west) {\scriptsize{EOS}};
\node[srcnode] (src6) at ([xshift=0.5\hnode]src5.south west) {\scriptsize{$\langle$eos$\rangle$}};
% target
\node[tgtnode] (tgt1) at (-6.0*0.5*\hnode,-1.05*\hnode+7.5*0.5*\hnode) {\scriptsize{}};
......@@ -2428,7 +2428,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\node[tgtnode] (tgt5) at ([yshift=-0.5\hnode]tgt4.north east) {\scriptsize{}};
\node[tgtnode] (tgt6) at ([yshift=-0.5\hnode]tgt5.north east) {\scriptsize{}};
\node[tgtnode] (tgt7) at ([yshift=-0.5\hnode]tgt6.north east) {\scriptsize{?}};
\node[tgtnode] (tgt8) at ([yshift=-0.5\hnode]tgt7.north east) {\scriptsize{EOS}};
\node[tgtnode] (tgt8) at ([yshift=-0.5\hnode]tgt7.north east) {\scriptsize{$\langle$eos$\rangle$}};
\end{scope}
......@@ -2464,7 +2464,7 @@ NLP问题的隐含结构假设 & 无隐含结构假设,端到端学习 \\
\visible<3->{
% coverage score formula node
\node [anchor=north west] (formula) at ([xshift=-0.3\hnode,yshift=-1.5\hnode]attn11.south) {\small{不同$\textbf{C}_i$所对应的源语言词的权重是不同的}};
\node [anchor=north west] (formula) at ([xshift=-0.3\hnode,yshift=-1.5\hnode]attn11.south) {\small{不同$\textbf{C}_j$所对应的源语言词的权重是不同的}};
\node [anchor=north west] (example) at (formula.south west) {\footnotesize{$\textbf{C}_2=0.4 \times \textbf{h}(\textrm{``你''}) + 0.4 \times \textbf{h}(\textrm{``什么''}) +$}};
\node [anchor=north west] (example2) at ([yshift=0.4em]example.south west) {\footnotesize{$\ \ \ \ \ \ \ \ 0 \times \textbf{h}(\textrm{``都''}) + 0.1 \times \textbf{h}(\textrm{``没''}) + ...$}};
}
......@@ -2526,7 +2526,7 @@ $\textrm{``you''} = \argmax_{y_2} \textrm{P}(y_2|\textbf{s}_1, y_1)$ & $\textrm{
\item 再来看一下注意力权重的定义。这个过程实际上是对$a(\cdot,\cdot)$做指数归一化:\\
\vspace{-0.3em}
\begin{displaymath}
\alpha_{i,j} = \frac{\exp(a(\textbf{s}_{i-1}, \textbf{h}_j))}{\sum_{j'} \exp(a(\textbf{s}_{i-1}, \textbf{h}_{j'}))}
\alpha_{i,j} = \frac{\exp(a(\textbf{s}_{j-1}, \textbf{h}_i))}{\sum_{i'} \exp(a(\textbf{s}_{j-1}, \textbf{h}_{i'}))}
\end{displaymath}
\item<2-> 注意力函数$a(\textbf{s},\textbf{h})$的目的是捕捉$\textbf{s}$$\textbf{h}$之间的\alert{相似性},这也可以被看作是目标语表示和源语言表示的一种``统一化'',即把源语言和目标语表示在同一个语义空间,进而语义相近的内容有更大的相似性。\visible<3->{定义$a(\textbf{s},\textbf{h})$的方式:}
......@@ -2572,7 +2572,7 @@ $\textrm{``you''} = \argmax_{y_2} \textrm{P}(y_2|\textbf{s}_1, y_1)$ & $\textrm{
ymin=-0.5,ymax=5.5,
xmin=-0.5,xmax=2.5,
ytick={0,1,...,5},
yticklabels={The,New,York,Times,comments,EOS},
yticklabels={The,New,York,Times,comments,$\langle$eos$\rangle$},
yticklabel style={font=\scriptsize},
xtick={0,1,2},
xticklabels={纽约时报,发表,评论},
......@@ -2593,7 +2593,7 @@ $\textrm{``you''} = \argmax_{y_2} \textrm{P}(y_2|\textbf{s}_1, y_1)$ & $\textrm{
ymin=-0.5,ymax=5.5,
xmin=-0.5,xmax=3.5,
ytick={0,1,...,5},
yticklabels={I,came,to,this,world,EOS},
yticklabels={I,came,to,this,world,$\langle$eos$\rangle$},
yticklabel style={font=\scriptsize},
xtick={0,1,2,3},
xticklabels={我,来到,这个,世界},
......@@ -2715,7 +2715,7 @@ $\textrm{``you''} = \argmax_{y_2} \textrm{P}(y_2|\textbf{s}_1, y_1)$ & $\textrm{
%%% 如何进一步理解注意力机制 - 回到机器翻译任务
\begin{frame}{重新解释注意力机制(续)}
\begin{itemize}
\item 回到机器翻译,如果把目标语状态$\textbf{s}_{i-1}$看做query,而把源语言所有位置的最上层RNN表示$\textbf{h}_{j}$看做{\color{ugreen} \textbf{key}}{\color{red} \textbf{value}}
\item 回到机器翻译,如果把目标语状态$\textbf{s}_{j-1}$看做query,而把源语言所有位置的最上层RNN表示$\textbf{h}_{i}$看做{\color{ugreen} \textbf{key}}{\color{red} \textbf{value}}
\end{itemize}
\vspace{-1.5em}
......@@ -3084,7 +3084,7 @@ $\textrm{``you''} = \argmax_{y_2} \textrm{P}(y_2|\textbf{s}_1, y_1)$ & $\textrm{
% step 6
\visible<6->{
\node[rnnnode] (rnn34) at ([xshift=2\base]rnn33) {};
\node[wordnode,anchor=south] (o4) at ([yshift=\base]rnn34.north) {EOS};
\node[wordnode,anchor=south] (o4) at ([yshift=\base]rnn34.north) {$\langle$eos$\rangle$};
\draw[-latex'] (rnn33) to (rnn34);
\draw[-latex'] (rnn24) to (rnn34);
\draw[-latex'] (rnn34) to (o4);
......@@ -3136,7 +3136,7 @@ $\textrm{``you''} = \argmax_{y_2} \textrm{P}(y_2|\textbf{s}_1, y_1)$ & $\textrm{
\hat{\textbf{y}} = \argmax_{\textbf{y}} \log\textrm{P}(\textbf{y}|\textbf{x}) = \argmax_{\textbf{y}} \sum_{j=1}^{n} \log\textrm{P}(y_j|\textbf{y}_{<j}, \textbf{x})
\end{displaymath}
\item<2-> 由于$y_i$的生成需要依赖$y_{i-1}$,因此无法同时生成$\{y_1,...,y_n\}$。常用的方法是自左向右逐个单词生成
\item<2-> 由于$y_j$的生成需要依赖$y_{j-1}$,因此无法同时生成$\{y_1,...,y_n\}$。常用的方法是自左向右逐个单词生成
\end{itemize}
......@@ -3156,7 +3156,7 @@ $\textrm{``you''} = \argmax_{y_2} \textrm{P}(y_2|\textbf{s}_1, y_1)$ & $\textrm{
\node [rnnnode,anchor=west,fill=green!20] (e3) at ([xshift=1em]e2.east) {\tiny{$e_x()$}};
\node [anchor=north,inner sep=2pt] (w1) at ([yshift=-0.6em]e1.south) {\tiny{}};
\node [anchor=north,inner sep=2pt] (w2) at ([yshift=-0.8em]e2.south) {\tiny{...}};
\node [anchor=north,inner sep=2pt] (w3) at ([yshift=-0.6em]e3.south) {\tiny{EOS}};
\node [anchor=north,inner sep=2pt] (w3) at ([yshift=-0.6em]e3.south) {\tiny{$\langle$eos$\rangle$}};
\draw [->] (w1.north) -- ([yshift=-0.1em]e1.south);
\draw [->] (w3.north) -- ([yshift=-0.1em]e3.south);
......@@ -3202,7 +3202,7 @@ $\textrm{``you''} = \argmax_{y_2} \textrm{P}(y_2|\textbf{s}_1, y_1)$ & $\textrm{
\node [anchor=west,inner sep=2pt] (o5) at ([xshift=0.3em]o4.east) {\tiny{...}};
}
\visible<4->{
\node [anchor=north,inner sep=2pt] (wt1) at ([yshift=-0.6em]t1.south) {\tiny{EOS}};
\node [anchor=north,inner sep=2pt] (wt1) at ([yshift=-0.6em]t1.south) {\tiny{$\langle$sos$\rangle$}};
}
\visible<7->{
\node [anchor=north,inner sep=2pt] (wt2) at ([yshift=-0.6em]t2.south) {\tiny{Have}};
......@@ -3355,7 +3355,7 @@ $\textrm{``you''} = \argmax_{y_2} \textrm{P}(y_2|\textbf{s}_1, y_1)$ & $\textrm{
\node [anchor=west,inner sep=2pt] (o4) at ([xshift=0.3em]o3.east) {\tiny{...}};
}
\node [wnode,anchor=north] (wt1) at ([yshift=-0.8em]t1.south) {\tiny{EOS}};
\node [wnode,anchor=north] (wt1) at ([yshift=-0.8em]t1.south) {\tiny{$\langle$sos$\rangle$}};
\visible<6->{
\node [wnode,anchor=north] (wt2) at ([yshift=-0.8em]t2.south) {\tiny{Have}};
......@@ -3546,7 +3546,7 @@ $\textrm{``you''} = \argmax_{y_2} \textrm{P}(y_2|\textbf{s}_1, y_1)$ & $\textrm{
% words
\node[wnode,below=0pt of encemb1] (encword1) {};
\node[wnode,below=0pt of encemb2] (encword2) {什么};
\node[wnode,below=0pt of encemb4] (encword4) {EOS};
\node[wnode,below=0pt of encemb4] (encword4) {$\langle$eos$\rangle$};
% connections
\draw[-latex'] (enc11) to (enc12);
......@@ -3645,7 +3645,7 @@ $\textrm{``you''} = \argmax_{y_2} \textrm{P}(y_2|\textbf{s}_1, y_1)$ & $\textrm{
\node[rnnnode,fill=blue!20,above=\base of dec54] (softmax4) {};
% words
\node[wnode,below=0pt of decemb1] (decinword1) {SOS};
\node[wnode,below=0pt of decemb1] (decinword1) {$\langle$sos$\rangle$};
\node[wnode,below=0pt of decemb2] (decinword2) {Have};
\node[wnode,below=0pt of decemb4] (decinword4) {?};
......@@ -3655,7 +3655,7 @@ $\textrm{``you''} = \argmax_{y_2} \textrm{P}(y_2|\textbf{s}_1, y_1)$ & $\textrm{
\node[wnode,anchor=base] (decoutword2) at (\XCoord,\YCoord) {you};
\ExtractX{$(softmax4.north)$}
\ExtractY{$(decoutword1.base)$}
\node[wnode,anchor=base] (decoutword4) at (\XCoord,\YCoord) {EOS};
\node[wnode,anchor=base] (decoutword4) at (\XCoord,\YCoord) {$\langle$eos$\rangle$};
% connections
\draw[-latex'] (dec11) to (dec12);
......@@ -3810,7 +3810,7 @@ $\textrm{``you''} = \argmax_{y_2} \textrm{P}(y_2|\textbf{s}_1, y_1)$ & $\textrm{
\node [anchor=north,rnnnode,fill=blue!30!white] (e2) at ([yshift=-2em]node12.south) {\tiny{}};
\node [anchor=north,rnnnode,fill=blue!30!white] (e3) at ([yshift=-2em]node13.south) {\tiny{}};
\node [anchor=north,rnnnode,fill=blue!30!white] (e4) at ([yshift=-2em]node14.south) {\tiny{}};
\node [anchor=north,inner sep=2pt] (w1) at ([yshift=-1em]e1.south) {\tiny{$<$s$>$}};
\node [anchor=north,inner sep=2pt] (w1) at ([yshift=-1em]e1.south) {\tiny{$\langle$sos$\rangle$}};
\node [anchor=north,inner sep=2pt] (w2) at ([yshift=-1em]e2.south) {\tiny{}};
\node [anchor=north,inner sep=2pt] (w3) at ([yshift=-1em]e3.south) {\tiny{我们}};
\node [anchor=north,inner sep=2pt] (w4) at ([yshift=-1em]e4.south) {\tiny{开始}};
......@@ -4100,9 +4100,9 @@ $\textrm{``you''} = \argmax_{y_2} \textrm{P}(y_2|\textbf{s}_1, y_1)$ & $\textrm{
\node [outputnode,anchor=south] (o1) at ([yshift=1em]res5.north) {\tiny{$\textbf{Output layer}$}};
\node [inputnode,anchor=north west] (input2) at ([yshift=-1em]sa2.south west) {\tiny{$\textbf{Embedding}$}};
\node [posnode,anchor=north east] (pos2) at ([yshift=-1em]sa2.south east) {\tiny{$\textbf{Position}$}};
\node [anchor=north] (outputs) at ([yshift=-3em]sa2.south) {\tiny{$\textbf{解码器输入: $<$SOS$>$ I am fine}$}};
\node [anchor=north] (outputs) at ([yshift=-3em]sa2.south) {\tiny{$\textbf{解码器输入: $<$sos$>$ I am fine}$}};
\node [anchor=east] (decoder) at ([xshift=-1em,yshift=-1.5em]o1.west) {\scriptsize{\textbf{解码器}}};
\node [anchor=north] (decoutputs) at ([yshift=1.5em]o1.north) {\tiny{$\textbf{解码器输出: I am fine $<$EOS$>$ }$}};
\node [anchor=north] (decoutputs) at ([yshift=1.5em]o1.north) {\tiny{$\textbf{解码器输出: I am fine $<$eos$>$ }$}};
\draw [->] (sa2.north) -- (res3.south);
\draw [->] (res3.north) -- (ed1.south);
......@@ -4127,6 +4127,9 @@ $\textrm{``you''} = \argmax_{y_2} \textrm{P}(y_2|\textbf{s}_1, y_1)$ & $\textrm{
\node [rectangle,inner sep=0.7em,rounded corners=1pt,very thick,dotted,draw=ugreen!70] [fit = (sa1) (res1) (ffn1) (res2)] (box0) {};
\node [rectangle,inner sep=0.7em,rounded corners=1pt,very thick,dotted,draw=red!60] [fit = (sa2) (res3) (res5)] (box1) {};
\node [ugreen,font=\scriptsize] (count) at ([xshift=-1.5em,yshift=-1em]encoder.south) {$6\times$};
\node [red,font=\scriptsize] (count) at ([xshift=10.8em,yshift=0em]decoder.south) {$\times 6$};
\end{scope}
\end{tikzpicture}
\end{center}
......@@ -4180,9 +4183,9 @@ $\textrm{``you''} = \argmax_{y_2} \textrm{P}(y_2|\textbf{s}_1, y_1)$ & $\textrm{
\node [outputnode,anchor=south] (o1) at ([yshift=1em]res5.north) {\tiny{$\textbf{Output layer}$}};
\node [inputnode,anchor=north west] (input2) at ([yshift=-1em]sa2.south west) {\tiny{$\textbf{Embedding}$}};
\node [posnode,anchor=north east] (pos2) at ([yshift=-1em]sa2.south east) {\tiny{$\textbf{Position}$}};
\node [anchor=north] (outputs) at ([yshift=-3em]sa2.south) {\tiny{$\textbf{解码器输入: $<$SOS$>$ I am fine}$}};
\node [anchor=north] (outputs) at ([yshift=-3em]sa2.south) {\tiny{$\textbf{解码器输入: $<$sos$>$ I am fine}$}};
\node [anchor=east] (decoder) at ([xshift=-1em,yshift=-1.5em]o1.west) {\scriptsize{\textbf{解码器}}};
\node [anchor=north] (decoutputs) at ([yshift=1.5em]o1.north) {\tiny{$\textbf{解码器输出: I am fine $<$EOS$>$ }$}};
\node [anchor=north] (decoutputs) at ([yshift=1.5em]o1.north) {\tiny{$\textbf{解码器输出: I am fine $<$eos$>$ }$}};
\draw [->] (sa2.north) -- (res3.south);
\draw [->] (res3.north) -- (ed1.south);
......@@ -4414,9 +4417,9 @@ PE_{(pos,2i+1)} = cos(pos/10000^{2i/d_{model}})
\node [outputnode,anchor=south] (o1) at ([yshift=1em]res5.north) {\tiny{$\textbf{Output layer}$}};
\node [inputnode,anchor=north west] (input2) at ([yshift=-1em]sa2.south west) {\tiny{$\textbf{Embedding}$}};
\node [posnode,anchor=north east] (pos2) at ([yshift=-1em]sa2.south east) {\tiny{$\textbf{Position}$}};
\node [anchor=north] (outputs) at ([yshift=-3em]sa2.south) {\tiny{$\textbf{解码器输入: $<$SOS$>$ I am fine}$}};
\node [anchor=north] (outputs) at ([yshift=-3em]sa2.south) {\tiny{$\textbf{解码器输入: $<$sos$>$ I am fine}$}};
\node [anchor=east] (decoder) at ([xshift=-1em,yshift=-1.5em]o1.west) {\scriptsize{\textbf{解码器}}};
\node [anchor=north] (decoutputs) at ([yshift=1.5em]o1.north) {\tiny{$\textbf{解码器输出: I am fine $<$EOS$>$ }$}};
\node [anchor=north] (decoutputs) at ([yshift=1.5em]o1.north) {\tiny{$\textbf{解码器输出: I am fine $<$eos$>$ }$}};
\draw [->] (sa2.north) -- (res3.south);
\draw [->] (res3.north) -- (ed1.south);
......@@ -4591,7 +4594,7 @@ PE_{(pos,2i+1)} = cos(pos/10000^{2i/d_{model}})
\node[srcnode] (src3) at ([xshift=0.5\hnode]src2.south west) {\scriptsize{learned}};
\node[srcnode] (src4) at ([xshift=0.5\hnode]src3.south west) {\scriptsize{nothing}};
\node[srcnode] (src5) at ([xshift=0.5\hnode]src4.south west) {\scriptsize{?}};
\node[srcnode] (src6) at ([xshift=0.5\hnode]src5.south west) {\scriptsize{EOS}};
\node[srcnode] (src6) at ([xshift=0.5\hnode]src5.south west) {\scriptsize{$\langle$eos$\rangle$}};
% target
\node[tgtnode] (tgt1) at (-6.0*0.5*\hnode,-1.05*\hnode+5.5*0.5*\hnode) {\scriptsize{Have}};
......@@ -4599,7 +4602,7 @@ PE_{(pos,2i+1)} = cos(pos/10000^{2i/d_{model}})
\node[tgtnode] (tgt3) at ([yshift=-0.5\hnode]tgt2.north east) {\scriptsize{learned}};
\node[tgtnode] (tgt4) at ([yshift=-0.5\hnode]tgt3.north east) {\scriptsize{nothing}};
\node[tgtnode] (tgt5) at ([yshift=-0.5\hnode]tgt4.north east) {\scriptsize{?}};
\node[tgtnode] (tgt6) at ([yshift=-0.5\hnode]tgt5.north east) {\scriptsize{EOS}};
\node[tgtnode] (tgt6) at ([yshift=-0.5\hnode]tgt5.north east) {\scriptsize{$\langle$eos$\rangle$}};
\node [rounded corners=0.3em,fill=yellow!30] (qk) at ([xshift=2.5em,yshift=5em]a55.north) {\large{$\frac{QK^{T}}{\sqrt{d_k}}$}};
\node [rounded corners=0.3em,anchor=west] (add) at ([xshift=0.1em]qk.east) {\large{+}};
......@@ -4630,7 +4633,7 @@ PE_{(pos,2i+1)} = cos(pos/10000^{2i/d_{model}})
\node[srcnode] (src3) at ([xshift=0.5\hnode]src2.south west) {\scriptsize{learned}};
\node[srcnode] (src4) at ([xshift=0.5\hnode]src3.south west) {\scriptsize{nothing}};
\node[srcnode] (src5) at ([xshift=0.5\hnode]src4.south west) {\scriptsize{?}};
\node[srcnode] (src6) at ([xshift=0.5\hnode]src5.south west) {\scriptsize{EOS}};
\node[srcnode] (src6) at ([xshift=0.5\hnode]src5.south west) {\scriptsize{$\langle$eos$\rangle$}};
% target
\node[tgtnode] (tgt1) at (5.4*0.5*\hnode,-1.05*\hnode+5.5*0.5*\hnode) {\scriptsize{Have}};
......@@ -4638,7 +4641,7 @@ PE_{(pos,2i+1)} = cos(pos/10000^{2i/d_{model}})
\node[tgtnode] (tgt3) at ([yshift=-0.5\hnode]tgt2.north east) {\scriptsize{learned}};
\node[tgtnode] (tgt4) at ([yshift=-0.5\hnode]tgt3.north east) {\scriptsize{nothing}};
\node[tgtnode] (tgt5) at ([yshift=-0.5\hnode]tgt4.north east) {\scriptsize{?}};
\node[tgtnode] (tgt6) at ([yshift=-0.5\hnode]tgt5.north east) {\scriptsize{EOS}};
\node[tgtnode] (tgt6) at ([yshift=-0.5\hnode]tgt5.north east) {\scriptsize{$\langle$eos$\rangle$}};
\node [rounded corners=0.3em,anchor=west,fill=green!30] (softmax) at ([xshift=-6em]left.east) {\large{Softmax}};
......@@ -4800,9 +4803,9 @@ PE_{(pos,2i+1)} = cos(pos/10000^{2i/d_{model}})
\node [outputnode,anchor=south] (o1) at ([yshift=1em]res5.north) {\tiny{$\textbf{Output layer}$}};
\node [inputnode,anchor=north west] (input2) at ([yshift=-1em]sa2.south west) {\tiny{$\textbf{Embedding}$}};
\node [posnode,anchor=north east] (pos2) at ([yshift=-1em]sa2.south east) {\tiny{$\textbf{Position}$}};
\node [anchor=north] (outputs) at ([yshift=-3em]sa2.south) {\tiny{$\textbf{解码器输入: $<$SOS$>$ I am fine}$}};
\node [anchor=north] (outputs) at ([yshift=-3em]sa2.south) {\tiny{$\textbf{解码器输入: $<$sos$>$ I am fine}$}};
\node [anchor=east] (decoder) at ([xshift=-1em,yshift=-1.5em]o1.west) {\scriptsize{\textbf{解码器}}};
\node [anchor=north] (decoutputs) at ([yshift=1.5em]o1.north) {\tiny{$\textbf{解码器输出: I am fine $<$EOS$>$ }$}};
\node [anchor=north] (decoutputs) at ([yshift=1.5em]o1.north) {\tiny{$\textbf{解码器输出: I am fine $<$eos$>$ }$}};
\draw [->] (sa2.north) -- (res3.south);
\draw [->] (res3.north) -- (ed1.south);
......@@ -5030,9 +5033,9 @@ x_{l+1} = x_l+\mathcal{F}(x_l)
\node [outputnode,anchor=south] (o1) at ([yshift=1em]res5.north) {\tiny{$\textbf{Output layer}$}};
\node [inputnode,anchor=north west] (input2) at ([yshift=-1em]sa2.south west) {\tiny{$\textbf{Embedding}$}};
\node [posnode,anchor=north east] (pos2) at ([yshift=-1em]sa2.south east) {\tiny{$\textbf{Position}$}};
\node [anchor=north] (outputs) at ([yshift=-3em]sa2.south) {\tiny{$\textbf{解码器输入: $<$SOS$>$ I am fine}$}};
\node [anchor=north] (outputs) at ([yshift=-3em]sa2.south) {\tiny{$\textbf{解码器输入: $<$sos$>$ I am fine}$}};
\node [anchor=east] (decoder) at ([xshift=-1em,yshift=-1.5em]o1.west) {\scriptsize{\textbf{解码器}}};
\node [anchor=north] (decoutputs) at ([yshift=1.5em]o1.north) {\tiny{$\textbf{解码器输出: I am fine $<$EOS$>$ }$}};
\node [anchor=north] (decoutputs) at ([yshift=1.5em]o1.north) {\tiny{$\textbf{解码器输出: I am fine $<$eos$>$ }$}};
\draw [->] (sa2.north) -- (res3.south);
\draw [->] (res3.north) -- (ed1.south);
......@@ -5170,7 +5173,7 @@ x_{l+1} = x_l+\mathcal{F}(x_l)
\node [rnnnode,anchor=west,fill=green!20] (e3) at ([xshift=1em]e2.east) {\tiny{$e_x()$}};
\node [anchor=north,inner sep=2pt] (w1) at ([yshift=-0.6em]e1.south) {\tiny{}};
\node [anchor=north,inner sep=2pt] (w2) at ([yshift=-0.6em]e2.south) {\tiny{}};
\node [anchor=north,inner sep=2pt] (w3) at ([yshift=-0.6em]e3.south) {\tiny{EOS}};
\node [anchor=north,inner sep=2pt] (w3) at ([yshift=-0.6em]e3.south) {\tiny{$\langle$eos$\rangle$}};
\node [anchor=south] (dot1) at ([xshift=0.4em,yshift=-0.7em]h1.south) {\tiny{...}};
\node [anchor=south] (dot2) at ([xshift=-0.4em,yshift=-0.7em]h3.south) {\tiny{...}};
......@@ -5212,7 +5215,7 @@ x_{l+1} = x_l+\mathcal{F}(x_l)
\node [anchor=south,fill=black!5!white,minimum height=1.1em,minimum width=13em,inner sep=2pt,rounded corners=1pt,draw] (loss) at ([xshift=1.8em,yshift=1em]o2.north) {\scriptsize{\textbf{Cross Entropy Loss}}};
}
\visible<3->{
\node [anchor=north,inner sep=2pt] (wt1) at ([yshift=-0.6em]t1.south) {\tiny{EOS}};
\node [anchor=north,inner sep=2pt] (wt1) at ([yshift=-0.6em]t1.south) {\tiny{$\langle$sos$\rangle$}};
\node [anchor=north,inner sep=2pt] (wt2) at ([yshift=-0.6em]t2.south) {\tiny{How}};
\node [anchor=north,inner sep=2pt] (wt3) at ([yshift=-0.8em]t3.south) {\tiny{are}};
\node [anchor=north,inner sep=2pt] (wt4) at ([yshift=-0.8em]t4.south) {\tiny{you}};
......@@ -5413,7 +5416,7 @@ x_{l+1} = x_l+\mathcal{F}(x_l)
\node [rnnnode,anchor=west,fill=green!20] (e3) at ([xshift=1em]e2.east) {\tiny{$e_x()$}};
\node [anchor=north,inner sep=2pt] (w1) at ([yshift=-0.6em]e1.south) {\tiny{}};
\node [anchor=north,inner sep=2pt] (w2) at ([yshift=-0.6em]e2.south) {\tiny{}};
\node [anchor=north,inner sep=2pt] (w3) at ([yshift=-0.6em]e3.south) {\tiny{EOS}};
\node [anchor=north,inner sep=2pt] (w3) at ([yshift=-0.6em]e3.south) {\tiny{$\langle$eos$\rangle$}};
%\node [anchor=south] (dot1) at ([xshift=0.4em,yshift=-0.7em]h1.south) {\tiny{...}};
%\node [anchor=south] (dot2) at ([xshift=-0.4em,yshift=-0.7em]h3.south) {\tiny{...}};
......@@ -5473,7 +5476,7 @@ x_{l+1} = x_l+\mathcal{F}(x_l)
%\node [anchor=west,inner sep=2pt] (o5) at ([xshift=0.3em]o4.east) {\tiny{...}};
}
\visible<4->{
\node [anchor=north,inner sep=2pt] (wt1) at ([yshift=-0.6em]t1.south) {\tiny{EOS}};
\node [anchor=north,inner sep=2pt] (wt1) at ([yshift=-0.6em]t1.south) {\tiny{$\langle$sos$\rangle$}};
}
\visible<6->{
\node [anchor=north,inner sep=2pt] (wt2) at ([yshift=-0.6em]t2.south) {\tiny{How}};
......@@ -5497,7 +5500,7 @@ x_{l+1} = x_l+\mathcal{F}(x_l)
\visible<8->{
\node [anchor=center,inner sep=2pt] (wo3) at ([yshift=1.2em]o3.north) {\tiny{you}};
\node [anchor=south,inner sep=2pt] (wos3) at (wo3.north) {\tiny{\textbf{[step 3]}}};
\node [anchor=center,inner sep=2pt] (wo4) at ([yshift=1.2em]o4.north) {\tiny{EOS}};
\node [anchor=center,inner sep=2pt] (wo4) at ([yshift=1.2em]o4.north) {\tiny{$\langle$eos$\rangle$}};
\node [anchor=south,inner sep=2pt] (wos4) at (wo4.north) {\tiny{\textbf{[step 4]}}};
}
......@@ -5606,7 +5609,7 @@ x_{l+1} = x_l+\mathcal{F}(x_l)
\node [anchor=north,rnnnode,fill=blue!30!white] (e2) at ([yshift=-2em]node12.south) {\tiny{}};
\node [anchor=north,rnnnode,fill=blue!30!white] (e3) at ([yshift=-2em]node13.south) {\tiny{}};
\node [anchor=north,rnnnode,fill=blue!30!white] (e4) at ([yshift=-2em]node14.south) {\tiny{}};
\node [anchor=north,inner sep=2pt] (w1) at ([yshift=-1em]e1.south) {\tiny{$<$s$>$}};
\node [anchor=north,inner sep=2pt] (w1) at ([yshift=-1em]e1.south) {\tiny{$\langle$sos$\rangle$}};
\node [anchor=north,inner sep=2pt] (w2) at ([yshift=-1em]e2.south) {\tiny{}};
\node [anchor=north,inner sep=2pt] (w3) at ([yshift=-1em]e3.south) {\tiny{我们}};
\node [anchor=north,inner sep=2pt] (w4) at ([yshift=-1em]e4.south) {\tiny{开始}};
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论