new pages of hiero rules

3748bd52 · xiaotong · abfdaf38 · 3748bd52 · 3748bd52
Commit 3748bd52 authored Dec 16, 2019 by xiaotong
--- a/Section04-Phrasal-and-Syntactic-Models/section04-test.tex
+++ b/Section04-Phrasal-and-Syntactic-Models/section04-test.tex
@@ -147,7 +147,7 @@
    \end{center}
    
 \vspace{0.5em}
-\item 简单使用短语和$n$-gram语言模型无法处理长距离的调序
+\item<2-> 简单使用短语和$n$-gram语言模型无法处理长距离的调序
    \begin{itemize}
    \item 引入独立的调序模型，比如简单的基于距离的调序
    \item 当然，也可以设计更加复杂的调序模型
@@ -159,17 +159,68 @@
 %%%  短语系统的问题 - 一个实例
 \begin{frame}{基于短语的方法的不足 - 一个实例}
 \begin{itemize}
-\item 一个短语翻译不成功的例子
+\item 一个短语翻译不成功的例子(Chiang, 2005)
 \end{itemize}

+\vspace{-1.5em}
 \begin{center}
 \begin{tikzpicture}
 \begin{scope}
 \node [anchor=east] (shead) at (0,0) {源语:};
+\node [anchor=west] (swords) at (shead.east) {澳洲\ \ 是\ \ 与\ \ 北韩\ \ 有\ \ 邦交\ \ 的\ \ 少数\ \ 国家\ \ 之一};
+\node [anchor=north east] (thead) at ([yshift=-0.8em]shead.south east) {短语系统:};
+\node [anchor=west] (twords) at (thead.east) {Australia is diplomatic relations with North Korea};
+\node [anchor=north west] (twords2) at ([yshift=-0.2em]twords.south west) {is one of the few countries};
+\node [anchor=north east] (rhead) at ([yshift=-2.2em]thead.south east) {参考译文:};
+\node [anchor=west] (rwords) at (rhead.east) {Australia is one of the few countries that have};
+\node [anchor=north west] (rwords2) at ([yshift=-0.2em]rwords.south west) {diplomatic relations with North Korea};
+
+\begin{pgfonlayer}{background}
+\visible<2->{
+\draw[fill=red!20,draw=white] ([xshift=-5.6em]twords.north) rectangle ([xshift=11em]twords.south);
+\draw[fill=blue!20,draw=white] ([xshift=-4.8em]twords2.north) rectangle ([xshift=6.3em]twords2.south);
+\node [anchor=south east,inner sep=1pt,fill=black] (l1) at ([xshift=11em]twords.south) {\tiny{{\color{white} 1}}};
+\node [anchor=south east,inner sep=1pt,fill=black] (l2) at ([xshift=6.3em]twords2.south) {\tiny{{\color{white} 2}}};
+}
+\end{pgfonlayer}
+    
 \end{scope}
 \end{tikzpicture}
 \end{center}

+\begin{itemize}
+\item<2-> 从短语系统翻译结果可以看出
+	\begin{itemize}
+	\item diplomatic relations with North Korea能够进行正确调序
+	\item one of the few countries能够进行正确调序
+	\item \textbf{但是}，两个短语（\tikz{\node[fill=black,inner sep=2pt] {\tiny{{\color{white} 1}}};} 和 \tikz{\node[fill=black,inner sep=2pt] {\tiny{{\color{white} 2}}};}）没有正确调序 - 怎么办？
+	\end{itemize}
+\end{itemize}
+
+\end{frame}
+
+%%%------------------------------------------------------------------------------------------------------------
+%%%  引入层次短语规则
+\begin{frame}{引入新的翻译单元}
+\begin{itemize}
+\item 显然，通过由连续单词构成的短语拼装出理想的译文需要比较复杂的机制。但是，语言是有``结构''的，我们可以用一种新的方式描述翻译：
+\begin{displaymath}
+\langle\textrm{与}\ X_1\ \textrm{有}\ X_2,\ \ \textrm{have}\ X_2\ \textrm{with}\ X_1\rangle
+\end{displaymath}
+
+这里$X_1$和$X_2$表示两个变量，它们可以被其它连续词串替换。这样，上面这种源语言和目标语言的对应就构成了一种翻译规则或模版，相当于把``$\textrm{与}\ X_1\ \textrm{有}\ X_2$''翻译为``$\textrm{have}\ X_2\ \textrm{with}\ X_1$''，调序信息就隐含在变量的编号里
+
+\vspace{0.5em}
+\item<2-> 类似地，可以写出很多这样的翻译规则
+
+\vspace{-1.5em}
+\begin{eqnarray}
+\langle X_1\ \textrm{是}\ X_2, & & X_1\ \textrm{is}\ X_2\rangle \nonumber \\
+\langle X_1\ \textrm{之一},& & \textrm{one\ \ of\ \ }X_1\rangle \nonumber \\
+\langle X_1\ \textrm{的}\ X_2,& & X_2\ \textrm{that\ \ have\ \ }X_1\rangle \nonumber
+\end{eqnarray}
+
+\end{itemize}
 \end{frame}

 %%%------------------------------------------------------------------------------------------------------------

--- a/Section04-Phrasal-and-Syntactic-Models/section04.tex
+++ b/Section04-Phrasal-and-Syntactic-Models/section04.tex
@@ -1364,7 +1364,7 @@ $d$是一个$(\textbf{s},\textbf{t})$上基于短语的翻译推导，$\textrm{P
 \end{frame}

 %%%------------------------------------------------------------------------------------------------------------
-%%%  短语系统的问题
+%%%  短语系统的问题 - 数据稀疏和无法处理长距离依赖
 \begin{frame}{基于短语的方法的不足}
 \begin{itemize}
 \item 短语可以很好的捕捉词语之间的局部搭配和调序，但是长距离依赖需要更长的短语
@@ -1372,25 +1372,95 @@ $d$是一个$(\textbf{s},\textbf{t})$上基于短语的翻译推导，$\textrm{P
    \item 实践中发现使用超过长度3的短语作用不大
    \item 短语非常稀疏，包含多个词的短语大多非常低频
    \end{itemize}
-    
+
+    \vspace{0.5em}
+    \begin{center}
    \begin{tabular}{l | r}
-    短语(中文) & 训练数据中频次 \hline \\
+    短语(中文) & 训练数据中出现频次 \\ \hline
    包含 & 3341 \\
    包含 多个 & 213 \\
+    包含 多个 词 & 12 \\
+    包含 多个 词 的 & 8 \\
+    包含 多个 词 的 短语 & 0 \\
+    包含 多个 词 的 短语 大多 & 0
    \end{tabular}
+    \end{center}
+    
+\vspace{0.5em}
+\item<2-> 简单使用短语和$n$-gram语言模型无法处理长距离的调序
+    \begin{itemize}
+    \item 引入独立的调序模型，比如简单的基于距离的调序
+    \item 当然，也可以设计更加复杂的调序模型
+    \end{itemize}
 \end{itemize}
 \end{frame}

 %%%------------------------------------------------------------------------------------------------------------
-%%%  可能的解决方法
-\begin{frame}{需要更好的建模语言的层次结构}
-% David Chiang ACL2005的论文
+%%%  短语系统的问题 - 一个实例
+\begin{frame}{基于短语的方法的不足 - 一个实例}
+\begin{itemize}
+\item 一个短语翻译不成功的例子(Chiang, 2005)
+\end{itemize}
+
+\vspace{-1.5em}
+\begin{center}
+\begin{tikzpicture}
+\begin{scope}
+\node [anchor=east] (shead) at (0,0) {源语:};
+\node [anchor=west] (swords) at (shead.east) {澳洲\ \ 是\ \ 与\ \ 北韩\ \ 有\ \ 邦交\ \ 的\ \ 少数\ \ 国家\ \ 之一};
+\node [anchor=north east] (thead) at ([yshift=-0.8em]shead.south east) {短语系统:};
+\node [anchor=west] (twords) at (thead.east) {Australia is diplomatic relations with North Korea};
+\node [anchor=north west] (twords2) at ([yshift=-0.2em]twords.south west) {is one of the few countries};
+\node [anchor=north east] (rhead) at ([yshift=-2.2em]thead.south east) {参考译文:};
+\node [anchor=west] (rwords) at (rhead.east) {Australia is one of the few countries that have};
+\node [anchor=north west] (rwords2) at ([yshift=-0.2em]rwords.south west) {diplomatic relations with North Korea};
+
+\begin{pgfonlayer}{background}
+\visible<2->{
+\draw[fill=red!20,draw=white] ([xshift=-5.6em]twords.north) rectangle ([xshift=11em]twords.south);
+\draw[fill=blue!20,draw=white] ([xshift=-4.8em]twords2.north) rectangle ([xshift=6.3em]twords2.south);
+\node [anchor=south east,inner sep=1pt,fill=black] (l1) at ([xshift=11em]twords.south) {\tiny{{\color{white} 1}}};
+\node [anchor=south east,inner sep=1pt,fill=black] (l2) at ([xshift=6.3em]twords2.south) {\tiny{{\color{white} 2}}};
+}
+\end{pgfonlayer}
+    
+\end{scope}
+\end{tikzpicture}
+\end{center}
+
+\begin{itemize}
+\item<2-> 从短语系统翻译结果可以看出
+	\begin{itemize}
+	\item diplomatic relations with North Korea能够进行正确调序
+	\item one of the few countries能够进行正确调序
+	\item \textbf{但是}，两个短语（\tikz{\node[fill=black,inner sep=2pt] {\tiny{{\color{white} 1}}};} 和 \tikz{\node[fill=black,inner sep=2pt] {\tiny{{\color{white} 2}}};}）没有正确调序 - 怎么办？
+	\end{itemize}
+\end{itemize}
+
 \end{frame}

 %%%------------------------------------------------------------------------------------------------------------
-%%%  基于句法的翻译系统流程
-\begin{frame}{基于同步文法/句法的机器翻译流程}
-% Manual第42页
+%%%  引入层次短语规则
+\begin{frame}{引入新的翻译单元}
+\begin{itemize}
+\item 显然，通过由连续单词构成的短语拼装出理想的译文需要比较复杂的机制。但是，语言是有``结构''的，我们可以用一种新的方式描述翻译：
+\begin{displaymath}
+\langle\textrm{与}\ X_1\ \textrm{有}\ X_2,\ \ \textrm{have}\ X_2\ \textrm{with}\ X_1\rangle
+\end{displaymath}
+
+这里，$X_1$和$X_2$表示变量，源语和目标语相同的变量表示对应关系，变量可以被其它连续词串替换。这样，这种源语言和目标语言的对应构成了一种翻译规则或模版，相当于把``$\textrm{与}\ X_1\ \textrm{有}\ X_2$''翻译为``$\textrm{have}\ X_2\ \textrm{with}\ X_1$''，调序信息就隐含在变量的编号里
+
+\vspace{0.5em}
+\item<2-> 类似地，可以写出很多这样的翻译规则
+
+\vspace{-1.5em}
+\begin{eqnarray}
+\langle X_1\ \textrm{是}\ X_2, & & X_1\ \textrm{is}\ X_2\rangle \nonumber \\
+\langle X_1\ \textrm{之一},& & \textrm{one\ \ of\ \ }X_1\rangle \nonumber \\
+\langle X_1\ \textrm{的}\ X_2,& & X_2\ \textrm{that\ \ have\ \ }X_1\rangle \nonumber
+\end{eqnarray}
+
+\end{itemize}
 \end{frame}

 %%%------------------------------------------------------------------------------------------------------------
@@ -1409,6 +1479,12 @@ $d$是一个$(\textbf{s},\textbf{t})$上基于短语的翻译推导，$\textrm{P
 \end{frame}

 %%%------------------------------------------------------------------------------------------------------------
+%%%  基于句法的翻译系统流程
+\begin{frame}{基于同步文法/句法的机器翻译流程}
+% Manual第42页
+\end{frame}
+
+%%%------------------------------------------------------------------------------------------------------------
 \subsection{层次短语规则及翻译特征}

 %%%------------------------------------------------------------------------------------------------------------