Commit 6217af1c by 孟霞

fig-chapter7

parents 2b196a44 9f6e49e0
......@@ -77,8 +77,8 @@
\end{scope}
{
\draw[<->,dotted,thick,red] (example2.east)..controls +(east:0.6) and +(west:0.6)..(c1.west) node[pos=0.9,left,xshift=0.6em,yshift=0.4em] (simexample) {\color{red}{\tiny\sffamily\bfseries{相似实例}}};
\draw[<->,dotted,thick,red] ([xshift=-0.2em]example2part2.east)..controls +(east:0.5) and +(west:0.5)..(e1.west);
\draw[<->,dotted,thick,red] (example2.east)..controls +(east:1.3) and +(west:1.3)..(c1.west) node[pos=0.9,left,xshift=0.6em,yshift=0.4em] (simexample) {\color{red}{\tiny\sffamily\bfseries{相似实例}}};
\draw[<->,dotted,thick,red] ([xshift=-0.2em]example2part2.east)--([xshift=4.5em]example2part2.east);
}
{
......
......@@ -8,12 +8,12 @@
%----------------------------------------------------------------------------------------
\renewcommand\figurename{}%将figure改为图
\renewcommand\tablename{}%将table改为表
\chapterimage{chapter_head_1.pdf} % Chapter heading image
\chapterimage{fig-NEU-2.jpg} % Chapter heading image
\chapter{机器翻译简介}
\section{机器翻译的概念}\index{Chapter1.1}
\section{机器翻译的概念}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\parinterval 广义上来讲,``翻译''是指把一个事物转化为另一个事物的过程。这个概念多使用在对序列的转化上,比如计算机程序的编译、自然语言文字翻译、蛋白质生物合成等。在程序编译中,高级语言编写的程序经过一系列的处理后转化为可执行的目标程序,这是一种从高级程序语言到低级程序语言的``翻译''。在人类语言的翻译中,一种语言文字通过人脑转化为另一种语言表达,这是一种自然语言的``翻译''。在蛋白质合成的第一步,RNA分子序列转化为特定的氨基酸序列,这是一种生物学遗传信息的``翻译''。甚至说,给上联对出下联、给一幅图片写出图片的主题等都可以被看作是``翻译''的过程。
\parinterval 从广义上来讲,``翻译''是指把一个事物转化为另一个事物的过程。这个概念多使用在对序列的转化上,比如,计算机程序的编译、自然语言文字翻译、蛋白质生物合成等。在程序编译中,高级语言编写的程序经过一系列的处理后转化为可执行的目标程序,这是一种从高级程序语言到低级程序语言的``翻译''。在人类语言的翻译中,一种语言文字通过人脑转化为另一种语言表达,这是一种自然语言的``翻译''。在蛋白质合成的第一步,RNA分子序列转化为特定的氨基酸序列,这是一种生物学遗传信息的``翻译''。甚至说给上联对出下联、给一幅图片写出图片的主题等都可以被看作是``翻译''的过程。
\vspace{0.5em}
%----------------------------------------------
% 图1.1
......@@ -25,11 +25,13 @@
\end{figure}
%-------------------------------------------
\parinterval 这里更加关注人类语言之间的翻译问题,即自然语言的翻译。如图\ref{fig:zh_en-example}所示,通过计算机可以将一段中文文字自动转化为英文文字,其中中文被称为{\small\bfnew{源语言}}(Source Language),英文被称为{\small\bfnew{目标语言}}(Target Language)
\parinterval 这里更加关注人类语言之间的翻译问题,即自然语言的翻译。如图\ref{fig:zh_en-example}所示,通过计算机可以将一段中文文字自动转化为英文文字,中文被称为{\small\bfnew{源语言}}\index{源语言}(Source Language)\index{Source Language},英文被称为{\small\bfnew{目标语言}}\index{目标语言}(Target Language)\index{Target Language}
\parinterval 一直以来,自然语言文字的翻译往往是由人工完成。让计算机像人一样进行翻译似乎还是电影中的桥段,因为很难想象人类语言的多样性和复杂性可以用计算机语言进行描述。但是时至今日,人工智能技术的发展已经大大超越了人类传统的认知,用计算机进行自动翻译也不再是一种想象,它已经深入到人们生活的很多方面,并发挥着重要作用。而这种由计算机进行自动翻译的过程也被称作{\small\bfnew{机器翻译}}(Machine Translation)。类似的,自动翻译、智能翻译、多语言自动转换等概念也是指同样的事情。如果将今天的机器翻译和人工翻译进行对比,可以发现机器翻译系统所生成的译文还并不完美,甚至有时翻译质量非常差。但是其优点在于速度快并且成本低,更为重要的是机器翻译系统可以从大量数据中不断学习和进化。人工翻译尽管精度很高,但是费时费力。当需要翻译大量的文本且精度要求不那么高时,比如海量数据的浏览型任务,机器翻译的优势就体现了出来。对于人工作业无法完成的事情,使用机器翻译可能只需花几个小时甚至几分钟就能完成。这就类似于拿着锄头耕地种庄稼和使用现代化机器作业之间的区别
\parinterval 一直以来,文字的翻译往往是由人工完成。让计算机像人一样进行翻译似乎还是电影中的桥段,因为人们很难想象语言的多样性和复杂性可以用计算机语言进行描述。但是时至今日,人工智能技术的发展已经大大超越了人类传统的认知,用计算机进行自动翻译也不再是一种梦想,它已经深入到人们生活的很多方面,并发挥着重要作用。而这种由计算机进行自动翻译的过程也被称作{\small\bfnew{机器翻译}}\index{机器翻译}(Machine Translation)\index{Machine Translation}。类似地,自动翻译、智能翻译、多语言自动转换等概念也是指同样的事情。如果将今天的机器翻译和人工翻译进行对比,可以发现机器翻译系统所生成的译文还并不完美,甚至有时翻译质量非常差,但是它的生成速度快且成本低廉,更为重要的是机器翻译系统可以从大量数据中不断学习和进化
\parinterval 实现机器翻译往往需要多个学科知识的融合,如数学、语言学、计算机科学、心理学等等。而最终呈现给使用者的是一套软件系统\ \dash\ 即机器翻译系统。通俗来讲,机器翻译系统就是一个可以在计算机上运行的软件工具,与人们使用的其他软件一样。只不过机器翻译系统是由``不可见的程序''组成,虽然这个系统非常复杂,但是呈现出来的展示形式却很简单,比如输入是待翻译的句子或文本,输出是译文句子或文本。
\parinterval 人工翻译尽管精度很高,但是费时费力。当需要翻译大量的文本且精度要求不那么高时,比如海量数据的浏览型任务,机器翻译的优势就体现出来。对于人工作业无法完成的事情,使用机器翻译可能只需花费几个小时甚至几分钟就能完成。这就类似于拿着锄头耕地种庄稼和使用现代化机器作业之间的区别。
\parinterval 实现机器翻译往往需要多个学科知识的融合,如数学、语言学、计算机科学、心理学等等。而最终呈现给使用者的是一套软件系统\ \dash\ 机器翻译系统。通俗来讲,机器翻译系统就是一个可以在计算机上运行的软件工具,与人们使用的其他软件一样,只不过机器翻译系统是由``不可见的程序''组成。虽然这个系统非常复杂,但是呈现出来的展示形式却很简单,比如输入是待翻译的句子或文本,输出是译文句子或文本。
%----------------------------------------------
% 图1.2
......@@ -41,39 +43,40 @@
\end{figure}
%-------------------------------------------
\parinterval 机器翻译的想法可以追溯到电子计算机产生之前,发展过程中也经历了多个范式的变迁,现代机器翻译系统大多是基于数据驱动的方法\ \dash\ 从数据中自动学习翻译知识,并运用这些知识对新的文本进行翻译。如图\ref{fig:Required-parts-of-MT}所示,机器翻译系统通常由两部分组成:
\parinterval 用机器进行翻译的想法可以追溯到电子计算机产生之前,发展过程中也经历了多个范式的变迁,现代机器翻译系统大多是基于数据驱动的方法\ \dash\ 从数据中自动学习翻译知识,并运用这些知识对新的文本进行翻译。如图\ref{fig:Required-parts-of-MT}所示,机器翻译系统通常由两部分组成:
\vspace{0.5em}
\begin{itemize}
\item {\small\bfnew{资源}}:如果把机器翻译系统比作一辆汽车,资源好比是可以使汽车运行的``汽油'',它包括很多内容,如翻译规则、双(单)语数据、知识库等等翻译知识,且这些``知识''都是计算机可读的。值得一提的是,如果没有翻译资源的支持,任何的机器翻译系统都无法运行起来。
\vspace{0.5em}
\item {\small\bfnew{系统}}:机器翻译算法的程序实现被称作系统,也就是机器翻译研究人员开发的软件。无论是翻译规则、翻译模板还是统计模型中的参数都需要通过机器翻译系统进行读取和使用。
\end{itemize}
\vspace{0.5em}
\parinterval 构建一个强大的机器翻译系统需要``资源''和``系统''两方面共同作用。在资源方面,随着语料库语言学的发展,已经有大量高质量的双语和单语数据(称为语料)被整理并且电子化,研发机器翻译系统所需要的语料基础已经具备。特别是像英语、汉语等世界主流语种,相关语料资源已经非常丰富,这也大大加速了相关研究的进展。当然,对于一些稀缺资源语种或者特殊的领域,语料仍然很少,但是这些并不影响机器翻译领域整体的发展速度。在现有语料库的基础上,很多研究者可以把精力集中在``系统''上。但是,机器翻译并非易事,有以下几方面挑战:
\parinterval 构建一个强大的机器翻译系统需要``资源''和``系统''两方面共同作用。在资源方面,随着语料库语言学的发展,已经有大量高质量的双语和单语数据(称为语料)被整理并且电子化,研发机器翻译系统所需要的语料基础已经具备。特别是像英语、汉语等世界主流语种,相关语料资源已经非常丰富,这也大大加速了相关研究的进展。当然,对于一些稀缺资源语种或者特殊的领域,语料库仍然匮乏,但是这些并不影响机器翻译领域整体的发展速度。在现有语料库的基础上,很多研究者可以把精力集中在``系统''上。但是,机器翻译并非易事,有以下几方面挑战:
\vspace{0.5em}
\begin{itemize}
\item {\small\bfnew{自然语言翻译问题的复杂性极高}}。语言是人类进化的最高成就之一,自然语言具有高度的概括性、灵活性、多样性,这些都很难用几个简单的模型和算法进行描述,因此翻译问题的数学建模和计算机程序实现难度很大。虽然近几年Alpha Go等人工智能系统在围棋等领域取得了令人瞩目的成绩,但是相比翻译来说,围棋等棋类任务仍然``简单'',比如,对于一个句子,其潜在的译文几乎是不可穷尽的,即使同一句话不同人的理解也不尽相同,甚至在翻译一个句子、一个单词的时候,要考虑整个篇章的上下文语境,这些问题都不是传统棋类任务所具备的。
\item {\small\bfnew{自然语言翻译问题的复杂性极高}}。语言是人类进化的最高成就之一,自然语言具有高度的概括性、灵活性、多样性,这些都很难用几个简单的模型和算法进行描述。因此,翻译问题的数学建模和计算机程序实现难度很大。虽然近几年Alpha Go等人工智能系统在围棋等领域取得了令人瞩目的成绩,但是,相比翻译来说,围棋等棋类任务仍然``简单'',比如,对于一个句子,其潜在的译文几乎是不可穷尽的,即使同一句话不同人的理解也不尽相同,甚至在翻译一个句子、一个单词的时候,要考虑整个篇章的上下文语境,这些问题都不是传统棋类任务所具备的。
\vspace{0.5em}
\item {\small\bfnew{计算机的``理解''与人类的``理解''存在鸿沟}}。人类一直希望把自己进行翻译所使用的知识描述出来,并用计算机程序进行实现,包括早期基于规则的机器翻译方法都源自这个思想。但是经过实践发现,人和计算机在``理解''自然语言上存在着明显差异。首先,人类的语言能力是经过长时间多种外部环境因素共同刺激形成的,这种能力很难直接准确表达。也就是说人类的语言知识本身就很难描述,更不用说让计算机来理解;其次,人和机器翻译系统理解语言的目标不一样。人理解和使用语言是为了进行生活和工作,目标非常复杂,而机器翻译系统更多的是为了对某些数学上定义的目标函数进行优化。也就是说,机器翻译系统关注的是翻译这个单一目标,而并不是像人一样进行复杂的活动;此外,人和计算机的运行方式有着本质区别。人类语言能力的生物学机理与机器翻译系统所使用的计算模型本质上是不同的,机器翻译系统使用的是其自身能够理解的``知识'',比如,统计学上的词语表示。这种知识并不需要人来理解,当然从系统开发的角度,计算机也并不需要理解人是如何思考的。
\item {\small\bfnew{计算机的``理解''与人类的``理解''存在鸿沟}}。人类一直希望把自己翻译时所使用的知识描述出来,并用计算机程序进行实现,例如早期基于规则的机器翻译方法就源自这个思想。但是,经过实践发现,人和计算机在``理解''自然语言上存在着明显差异。首先,人类的语言能力是经过长时间多种外部环境因素共同作用形成的,这种能力很难直接准确表达。人类的语言知识本身就很难描述,更不用说让计算机来理解;其次,人和机器翻译系统理解语言的目的不一样。人理解和使用语言是为了进行生活和工作,而机器翻译系统更多的是为了对某些数学上定义的目标函数进行优化。也就是说,机器翻译系统关注的是翻译这个单一目标,而并不是像人一样进行复杂的活动;此外,人和计算机的运行方式有着本质区别。人类语言能力的生物学机理与机器翻译系统所使用的计算模型本质上是不同的,机器翻译系统使用的是其自身能够理解的``知识'',比如,统计学上的词语表示。这种``知识''并不需要人来理解,当然从系统开发的角度,计算机也并不需要理解人是如何思考的。
\vspace{0.5em}
\item {\small\bfnew{单一的方法无法解决多样的翻译问题}}。首先,语种的多样性会导致任意两种语言之间的翻译实际上都是不同的翻译任务。比如,世界上存在的语言不下几千种,如果任意两种语言进行互译就有上百万种翻译需求。虽然已经有研究者尝试用同一个框架甚至同一个翻译系统进行全语种的翻译,但是这类系统离真正可用还有很远的距离;此外,不同的领域,不同的应用场景对翻译也有不同的需求。比如,文学作品的翻译和新闻的翻译就有不同、口译和笔译也有不同,类似的情况不胜枚举。机器翻译需要适用多样的需求,这些又进一步增加了计算机建模的难度;还有,对于机器翻译来说,充足的高质量数据是必要的,但是不同语种、不同领域、不同应用场景所拥有的数据量有明显差异,甚至很多语种几乎没有可用的数据,这时开发机器翻译系统的难度可想而知。注意,现在的机器翻译还无法像人类一样在学习少量样例的情况下进行举一反三,因此数据稀缺情况下的机器翻译也给研究者带来了很大的挑战。
\item {\small\bfnew{单一的方法无法解决多样的翻译问题}}。首先,语种的多样性会导致任意两种语言之间的翻译实际上都是不同的翻译任务。比如,世界上存在的语言多达几千种,如果选择任意两种语言进行互译就产生上百万种翻译方向。虽然已经有研究者尝试用同一个框架甚至同一个翻译系统进行全语种的翻译,但是这类系统离真正可用还有很远的距离;其次,不同的领域,不同的应用场景对翻译也有不同的需求。比如,文学作品的翻译和新闻的翻译就有不同、口译和笔译也有不同,类似的情况不胜枚举。机器翻译要适用于多样的需求,这些又进一步增加了计算机建模的难度;再次,对于机器翻译来说,充足的高质量数据是必要的,但是不同语种、不同领域、不同应用场景所拥有的数据量有明显差异,甚至很多语种几乎没有可用的数据,这时开发机器翻译系统的难度可想而知。值得注意的是,现在的机器翻译还无法像人类一样在学习少量样例的情况下进行举一反三,因此数据稀缺情况下的机器翻译也给研究者带来了很大的挑战。
\end{itemize}
\vspace{0.5em}
\parinterval 显然,实现机器翻译并不简单,甚至有人把机器翻译看作是实现人工智能的终极目标。幸运的是,今天的机器翻译无论从技术方法上还是从应用上都有了巨大的飞跃,很多问题在不断被求解。如果有机会看到过十年之前机器翻译的结果,再对比今天的结果,一定会发现翻译品质已经今非昔比,很多译文已经非常准确且流畅。从当今机器翻译的前沿技术看,近三十年机器翻译的进步更多的得益于基于数据驱动方法和统计建模方法的使用。特别是,近些年深度学习等基于表示学习的端到端方法使得机器翻译的水平达到了新的高度。因此,本书将会对当代基于统计建模和深度学习方法的机器翻译模型、方法和系统实现进行全面介绍和分析,希望这些内容可以对相关内容的学习和科研工作提供参考。
\parinterval 显然,实现机器翻译并不简单,甚至有人把机器翻译看作是实现人工智能的终极目标。幸运的是,今天的机器翻译无论从技术方法上还是从应用上都有了巨大的飞跃,很多问题在不断被求解。如果你看到过十年前机器翻译的结果,再对比今天的结果,一定会感叹翻译质量的今非昔比,很多译文已经非常准确且流畅。从当今机器翻译的前沿技术看,近三十年机器翻译的进步更多的得益于基于数据驱动方法和统计建模方法的使用。特别是近些年深度学习等基于表示学习的端到端方法使得机器翻译的水平达到了新高度。因此,本书将会对基于统计建模和深度学习方法的机器翻译模型、方法和系统实现进行全面介绍和分析,希望这些内容可以对相关内容的学习和科研工作提供参考。
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{机器翻译简史}\index{Chapter1.2}
\sectionnewpage
\section{机器翻译简史}
\parinterval 虽然翻译这个概念在人类历史中已经存在了上千年,机器翻译至今只有七十余年的历史。纵观机器翻译的发展,历程曲折又耐人寻味,可以说了解机器翻译的历史对我们深入理解相关技术方法会有很好的启发,甚至对我们了解整个自然语言处理领域的发展也有启示作用。
\parinterval 虽然翻译这个概念在人类历史中已经存在了上千年,但机器翻译发展至今只有七十余年的历史。纵观机器翻译的发展,历程曲折又耐人寻味,可以说了解机器翻译的历史对我们深入理解相关技术方法会有很好的启发,甚至对我们了解整个自然语言处理领域的发展也有启示作用。
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{人工翻译}
\parinterval 人类形成语言文字的过程中逐渐形成了翻译的概念。一个著名的标志性证据是罗塞塔石碑(Rosetta Stone),如图\ref{fig:rosetta-stone}所示。这个石碑制作于公元前196年,据说是能够考证出来的最久远的记载平行文字的历史遗迹。石碑由上至下共刻有同一段埃及国王诏书的三种语言版本,最上面是古埃及象形文,中间是埃及草书,再下面是古希腊文。可以明显看出这个石碑上中下雕刻的文字的纹理是不同的。尽管用不同的语言文字描述同一件事在今天看来很常见,但是这在生产力低下的两千年前是很罕见的。很多人认为罗塞塔石碑是标志翻译或人工翻译的一个起点。目前这个石碑保存于大英博物馆,并成为了该馆最具代表性的镇馆之宝之一。
\parinterval 人类形成语言文字的过程中逐渐形成了翻译的概念。一个著名的标志性证据是罗塞塔石碑(Rosetta Stone),如图\ref{fig:rosetta-stone}所示。这个石碑制作于公元前196年,据说是可供考证的最久远的记载平行文字的历史遗迹。石碑由上至下刻有同一段埃及国王诏书的三种语言版本,最上面是古埃及象形文,中间是埃及草书,最下面是古希腊文。可以明显看出石碑上中下雕刻的文字的纹理是不同的。尽管用不同的语言文字描述同一件事在今天看来很常见,但是这在生产力低下的两千年前是很罕见的。很多人认为罗塞塔石碑是标志翻译或人工翻译的一个起点。目前罗塞塔石碑保存于大英博物馆,并成为该馆最具代表性的镇馆之宝之一。
%----------------------------------------------
% 图1.3
......@@ -85,14 +88,14 @@
\end{figure}
%------------------------------------------
\parinterval 随后,更多的翻译工作是在文化和知识传播中开展。其中一个典型代表是宗教文献的翻译。在人类的历史长河中,宗教是人类意识形态的一个重要载体。为了宣传教义,产生了大量的宗教文献。在西方,一项最早被记录的翻译活动是将旧约圣经(希伯来文及埃兰文)翻译为希腊文版本。并且迄今为止人类历史上翻译版本最多的书就是圣经。在中国唐代,一个著名的人物是玄奘,他不仅是佛学家、旅行家,还是翻译家。玄奘西行求法归来后把全部的心血和智慧奉献给了译经事业,在助手们的帮助下,共翻译佛教经论74部,1335卷,每卷万字左右,合计1335万字,占去整个唐代译经总数的一半以上,是翻译史上的杰出典范。
\parinterval 随后,更多的翻译工作在文化和知识传播中开展。其中一个典型代表是宗教文献的翻译。在人类的历史长河中,宗教是人类意识形态的一个重要载体。为了宣传教义,产生了大量的宗教文献。在西方,一项最早被记录的翻译活动是将旧约圣经(希伯来文及埃兰文)翻译为希腊文版本。并且迄今为止人类历史上翻译版本最多的书就是圣经。在中国唐代,有一位世界性的重量级文化人物\ \dash \ 玄奘,他不仅是佛学家、旅行家,还是翻译家。玄奘西行求法归来后把全部的心血和智慧奉献给了译经事业,在助手们的帮助下,共翻译佛教经论74部,1335卷,每卷万字左右,合计1335万字,占去整个唐代译经总数的一半以上,树立了我国古代翻译思想的光辉典范。
\parinterval 翻译在人类历史长河中起到了重要的作用。一方面,语言文字、文化和地理位置的差异性,使得翻译成为一个重要的需求;另一方面,翻译也加速了不同文明的融会贯通,促进了世界的发展。今天,翻译已经成为重要的行业之一,包括各个高校也都设立了翻译及相关专业,相关人才不断涌现。据《2019年中国语言服务行业发展报告》统计:全球语言服务产值预计将首次接近500亿美元;中国涉及语言服务的在营企业360,000余家,语言服务为主营业务的在营企业近万家,总产值超过300亿元,年增长3\%以上;全国开设外语类专业的高校数量多达上千所,其中设立有翻译硕士(MTI)和翻译本科(BTI)专业的院校分别有250余所和280余所,MTI累计招生数达6万余人\cite{赵军峰2019深化改革}。当然,面对着巨大的需求,如何使用技术手段提高人工翻译效率,比如:机器辅助翻译,也是人工翻译和机器翻译领域需要共同探索的方向。
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{机器翻译的萌芽}
\parinterval 人工翻译已经存在了上千年,而机器翻译又起源于什么时候呢?机器翻译的兴起与发展可以说充满了跌宕起伏,整个发展史可以分为萌芽期、受挫期、快速成长期和爆发期。
\parinterval 人工翻译已经存在了上千年,而机器翻译又起源于什么时候呢?机器翻译的兴起与发展可以说充满了跌宕起伏,整个发展史可以分为萌芽期、受挫期、快速成长期和爆发期四个阶段
%----------------------------------------------
% 图1.4
......@@ -104,12 +107,20 @@
\end{figure}
%-------------------------------------------
\parinterval 世界上第一台通用电子数字计算机在1946年研制成功(图\ref{fig:eniac})。但在上世纪30年代使用计算模型进行自动翻译的思想就开始萌芽,当时法国科学家G.B. Artsouni提出了用机器来进行翻译的想法。在那个时代,第二次世界大战使得数学和密码学相当发达。由于战争的需要,消息传递变得更为隐秘,对文字进行加密和解密成为重要的军事需求。因此有人提出是否能用密码学的技术或方法解决人类语言的翻译,比如把汉语看成英语的一个加密文本,汉语翻译成英语就类似于解密的过程。当然这只是最初的想法。第一次提出机器翻译这个概念是在1949年,当时W. Weaver撰写了一篇名为《翻译》的备忘录,正式开创了机器翻译(Machine Translation)的概念,这个概念一直沿用至今。当然,在那个年代进行机器翻译研究的很多条件还不具备,包括使用加密解密技术进行自动翻译的很多尝试很快也被验证是不可行的。不过这些早期的探索为后来机器翻译的发展提供了思想的火种。
\parinterval 世界上第一台通用电子数字计算机在1946年研制成功(图\ref{fig:eniac})。但在上世纪30年代使用计算模型进行自动翻译的思想就开始萌芽,当时法国科学家G.B. Artsouni提出了用机器来进行翻译的想法。
\parinterval 第二次世界大战使得数学和密码学相当发达,由于战争的需要,在那个时代消息传递变得更为隐秘,对文字进行加密和解密成为重要的军事需求。因此,有人提出是否能用密码学的技术或方法解决人类语言的翻译,比如把汉语看成英语的一个加密文本,汉语翻译成英语就类似于解密的过程。当然这只是最初的想法。第一次提出机器翻译这个概念是在1949年,当时W. Weaver撰写了一篇名为《翻译》的备忘录,正式开创了机器翻译(Machine Translation)的概念,这个概念一直沿用至今。当然,在那个年代进行机器翻译研究还有很多条件不具备,包括使用加密解密技术进行自动翻译的很多尝试很快也被验证是不可行的。不过,这些早期的探索为后来机器翻译的发展提供了思想的火种。
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{机器翻译的受挫}
\parinterval 随着电子计算机的发展,研究者开始尝试使用计算机来进行自动的翻译。但是事情并不总是一帆风顺,怀疑论者对机器翻译一直存有质疑,并很容易找出一些机器翻译无法解决的问题。自然地,人们也期望能够客观地评估一下机器翻译的可行性。当时美国基金资助组织委任自动语言处理咨询委员会承担了这项任务。经过近两年的调查与分析,该委员会于1966年11月公布了一个题为《语言与机器》的报告(图\ref{fig:report}),简称ALPAC报告。该报告全面否定了机器翻译的可行性,为机器翻译的研究泼了一盆冷水。随后美国政府终止了对机器翻译研究的支持,这导致整个产业界和学术界对机器翻译都开始回避。大家觉得机器翻译像伪科学,无论是发表论文还是申请项目都很难得到支持。没有了政府的支持,企业也无法进行大规模投入,机器翻译的研究就此受挫。从历史上看,包括机器翻译在内很多人工智能领域在那个年代并不受``待见'',其主要原因在于当时的技术水平还比较低,而大家又对机器翻译等技术的期望过高。最后发现,当时的机器翻译水平无法满足实际需要,因此转而排斥它。但是,也正是这一盆冷水,让人们可以更加冷静地思考机器翻译的发展方向,为后来的爆发蓄力。
\parinterval 随着电子计算机的发展,研究者开始尝试使用计算机来进行自动的翻译。但是事情并不总是一帆风顺,怀疑论者对机器翻译一直存有质疑,并很容易找出一些机器翻译无法解决的问题。自然地,人们也期望能够客观地评估一下机器翻译的可行性。当时美国基金资助组织委任自动语言处理咨询委员会承担了这项任务。
\parinterval 经过近两年的调查与分析,该委员会于1966年11月公布了一个题为《语言与机器》的报告(图\ref{fig:report}),简称ALPAC报告。该报告全面否定了机器翻译的可行性,为机器翻译的研究泼了一盆冷水。
\parinterval 随后美国政府终止了对机器翻译研究的支持,这导致整个产业界和学术界对机器翻译都开始回避。大家觉得机器翻译像伪科学,无论是发表论文还是申请项目都很难得到支持。没有了政府的支持,企业也无法进行大规模投入,机器翻译的研究就此受挫。
\parinterval 从历史上看,包括机器翻译在内很多人工智能领域在那个年代并不受``待见'',其主要原因在于当时的技术水平还比较低,而大家又对机器翻译等技术的期望过高。最后发现,当时的机器翻译水平无法满足实际需要,因此转而排斥它。但是,也正是这一盆冷水,让人们可以更加冷静地思考机器翻译的发展方向,为后来的爆发蓄力。
%----------------------------------------------
% 图1.5
......@@ -124,28 +135,28 @@
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{机器翻译的快速成长}
\parinterval 事物发展都是螺旋式上升的,机器翻译也是一样。上世纪70年代中后期,特别是80年代到90年代初,国家之间往来日益密切,而不同语言之间形成的交流障碍愈发严重,传统的人工作业方式已经远远不能满足需求。与此同时,语料库语言学的发展也为机器翻译提供了新的思路。其中,随着传统纸质文字资料不断电子化,计算机可读的语料越来越多。这使得人们可以用计算机对语言规律进行统计分析。另一方面,随着可用数据越来越多,用数学模型描述这些数据中的规律并进行推理逐渐成为可能。这也衍生出一类数学建模方法\ \dash\ {\small\bfnew{数据驱动}}(Data-Driven)的方法。这类方法也成为了随后出现的统计机器翻译的基础。
\parinterval 事物发展都是螺旋式上升的,机器翻译也是一样。上世纪70年代中后期,特别是80年代到90年代初,国家之间往来日益密切,而不同语言之间形成的交流障碍愈发严重,传统的人工作业方式已经远远不能满足需求。与此同时,语料库语言学的发展也为机器翻译提供了新的思路。其中,随着传统纸质文字资料不断电子化,计算机可读的语料越来越多,这使得人们可以用计算机对语言规律进行统计分析。另一方面,随着可用数据越来越多,用数学模型描述这些数据中的规律并进行推理逐渐成为可能。这也衍生出一类数学建模方法\ \dash\ {\small\bfnew{数据驱动}}\index{数据驱动}(Data-Driven)\index{Data-Driven}的方法。这类方法也成为了随后出现的统计机器翻译的基础。
传统的机器翻译方法,都需要人来书写规则,虽然对少部分句子具有较高的翻译精度,但这类方法对翻译现象的覆盖度有限,而且对规则或者模板中的噪声非常敏感,系统健壮性差。而基于数据驱动的方法不依赖于人写的规则,机器翻译的建模、训练和推断都可以自动地从数据中学习。这使得整个机器翻译的范式发生了翻天覆地的变化,比如基于实例的方法和统计机器翻译就是在此期间兴起的。此外,这样的方法使得机器翻译系统的开发代价大大地降低。从上世纪90年代到本世纪初,统计机器翻译发展迅猛,很快成为了当时机器翻译研究与应用的代表性方法。一个标志性的事件是谷歌推出了一个在线的免费自动翻译服务,也就是大家熟知的谷歌翻译。这使得机器翻译这种``高大上''的技术快速进入人们的生活,而不再是束之高阁的科研想法。也正是机器翻译不断走向实用,机器翻译的应用也越来越多,这反过来进一步促进了机器翻译的研究进程。比如,在2005-2015年间,统计机器翻译这个主题几乎统治了ACL等自然语言处理相关方向顶级会议的论文,可见其在当时的影响力。
传统的机器翻译方法,都需要人来书写规则,虽然对少部分句子具有较高的翻译精度,但这类方法对翻译现象的覆盖度有限,而且对规则或者模板中的噪声非常敏感,系统健壮性差。而基于数据驱动的方法不依赖于人写的规则,机器翻译的建模、训练和推断都可以自动地从数据中学习。这使得整个机器翻译的范式发生了翻天覆地的变化,比如,基于实例的方法和统计机器翻译就是在此期间兴起的。此外,这样的方法使得机器翻译系统的开发代价大大地降低。从上世纪90年代到本世纪初,统计机器翻译发展迅猛,很快成为了当时机器翻译研究与应用的代表性方法。一个标志性的事件是谷歌推出了一个在线的免费自动翻译服务,也就是大家熟知的谷歌翻译。这使得机器翻译这种``高大上''的技术快速进入人们的生活,而不再是束之高阁的科研想法。随着机器翻译不断走向实用,机器翻译的应用也越来越多,这反过来进一步促进了机器翻译的研究进程。比如,在2005-2015年间,统计机器翻译这个主题几乎统治了ACL等自然语言处理相关方向顶级会议的论文,可见其在当时的影响力。
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{机器翻译的爆发}
\parinterval 2005年以后迎来了统计机器翻译发展的十年黄金时期,各种基于统计的机器翻译模型层出不穷,经典的基于短语的模型和基于句法的模型也先后被提出。但是在2013年以后,机器学习的进步带来了机器翻译技术的进一步提升。特别是基于神经网络的深度学习方法在机器视觉、语音识别中被成功应用,带来性能的飞跃式提升。很快,相关模型和方法也被用于机器翻译。对于机器翻译来说,深度学习的成功也是一种必然,原因如下:
\begin{itemize}
\item 第一,端到端学习不依赖于过多的先验假设。在统计机器翻译时代,模型设计或多或少会对翻译的过程进行假设,称为隐藏结构假设。比如基于短语的模型假设:源语言和目标语言都会被切分成短语序列,这些短语之间存在某种对齐关系。这种假设既有优点也有缺点,一方面,假设可以有助于模型融入人类的先验知识,包括短语本身也借鉴了语言学相关的概念;另一方面,假设越多模型受到的限制也越多。如果假设是正确的,模型可以很好描述问题。但如果假设错误,那么模型就可能产生偏差。深度学习不依赖于先验知识,也不需要手工设计特征,模型直接从输入和输出的映射上进行学习(端到端学习),这样也在一定程度上避免了隐藏结构假设造成的偏差。
\item 第二,神经网络的连续空间模型有更强的表示能力。机器翻译中的一个基本问题是:如何表示一个句子?统计机器翻译把句子的生成过程看作是短语或者规则的推导,这本质上是一个离散空间上的符号系统。深度学习把传统的基于离散化的表示变成了连续空间的表示。比如,用实数空间的分布式表示代替了离散化的词语表示,而整个句子可以被描述为一个实数向量。这使得翻译问题可以在连续空间上描述,进而可以大大缓解传统离散空间模型维度灾难等问题。更重要的是,连续空间模型可以用梯度下降等方法进行优化,具有很好的数学性质并且易于实现。
\item 第二,神经网络的连续空间模型有更强的表示能力。机器翻译中的一个基本问题是:如何表示一个句子?统计机器翻译把句子的生成过程看作是短语或者规则的推导,这本质上是一个离散空间上的符号系统。深度学习把传统的基于离散化的表示变成了连续空间的表示。比如,用实数空间的分布式表示代替了离散化的词语表示,而整个句子可以被描述为一个实数向量。这使得翻译问题可以在连续空间上描述,进而大大缓解了传统离散空间模型维度灾难等问题。更重要的是,连续空间模型可以用梯度下降等方法进行优化,具有很好的数学性质并且易于实现。
\item 第三,深度网络学习算法的发展和GPU(Graphics Processing Unit)等并行计算设备为训练神经网络提供了可能。早期的基于神经网络的方法一直没有在机器翻译甚至自然语言处理领域得到大规模应用,其中一个重要的原因是这类方法需要大量的浮点运算,但是以前计算机的计算能力无法达到这个要求。随着GPU等并行计算设备的进步,训练大规模神经网络也变为了可能。现在已经可以在几亿、几十亿,甚至上百亿句对上训练机器翻译系统,系统研发的周期越来越短,进展日新月异。
\item 第三,深度网络学习算法的发展和GPU(Graphics Processing Unit)等并行计算设备为训练神经网络提供了可能。早期的基于神经网络的方法一直没有在机器翻译甚至自然语言处理领域得到大规模应用,其中一个重要的原因是这类方法需要大量的浮点运算,而且以前计算机的计算能力无法达到这个要求。随着GPU等并行计算设备的进步,训练大规模神经网络也变为了可能。现在已经可以在几亿、几十亿,甚至上百亿句对上训练机器翻译系统,系统研发的周期越来越短,进展日新月异。
\end{itemize}
\parinterval 今天,神经机器翻译已经成为新的范式,大有全面替代统计机器翻译之势。比如,从世界上著名的机器翻译比赛WMT和CCMT中就可以看出这个趋势。如图\ref{fig:wmt}所示,其中左图是WMT19全球机器翻译比赛的参赛队伍的截图,图中的队伍基本全是使用深度学习完成机器翻译的建模的。而在WMT19各个项目夺冠系统中(\ref{fig:wmt}右图),神经机器翻译也几乎一统天下。
\parinterval 今天,神经机器翻译已经成为新的范式,大有全面替代统计机器翻译之势。比如,从世界上著名的机器翻译比赛WMT和CCMT中就可以看出这个趋势。如图\ref{fig:wmt}所示,其中左图是WMT19全球机器翻译比赛的参赛队伍的截图,这些参赛队伍基本上都在使用深度学习完成机器翻译的建模。而在WMT19各个项目夺冠系统中(\ref{fig:wmt}右图),神经机器翻译也几乎一统天下。
\parinterval 值得一提的是,近些年神经机器翻译的快速发展也得益于产业界的关注。各大互联网企业和机器翻译技术研发机构都对神经机器翻译的模型和实践方法做出了很大贡献。比如,谷歌、微软、百度、搜狗、腾讯、阿里、有道、小牛翻译等企业凭借自身人才和基础设施方面的优势,先后推出了以神经机器翻译为内核的产品及服务,相关技术方法已经在大规模应用中得到验证,大大推动了机器翻译的产业化进程,而且这种趋势在不断加强,机器翻译的前景也更加宽广。
%----------------------------------------------
% 图1.6
\begin{figure}[htp]
......@@ -158,11 +169,12 @@
\end{figure}
%-------------------------------------------
\section{机器翻译现状}\index{Chapter1.3}
\sectionnewpage
\section{机器翻译现状}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\parinterval 机器翻译技术发展到今天已经经过无数次迭代,技术范式也经过若干次更替,近些年机器翻译的应用也如雨后春笋般涌现。但是大家都很好奇今天的机器翻译的质量究竟如何呢?乐观地说,在受限条件下,机器翻译的译文结果还是非常不错的,甚至可以接近人工翻译的结果。然而在开放式翻译任务中,机器翻译的结果却并不理想。更严格来说,机器翻译的质量远没有达到人们所期望的完美的程度。对于有些人提到的``机器翻译代替人工翻译''也并不是事实。比如,在高精度同声传译任务中,机器翻译仍需要更多打磨;再比如,针对于小说的翻译,机器翻译还无法做到与人工翻译媲美;甚至有人尝试用机器翻译系统翻译中国古代诗词,这里更多的是娱乐的味道。但是毫无疑问的是,机器翻译可以帮助人类,甚至有朝一日可以代替一些低端的人工翻译工作。
\parinterval 机器翻译技术发展到今天已经过无数次迭代,技术范式也经过若干次更替,近些年机器翻译的应用也如雨后春笋般涌现。但是大家都很好奇今天的机器翻译的质量究竟如何呢?乐观地说,在受限条件下,机器翻译的译文结果还是非常不错的,甚至可以接近人工翻译的结果。然而,在开放式翻译任务中,机器翻译的结果却并不理想。更严格来说,机器翻译的质量远没有达到人们所期望的完美的程度。对于有些人提到的``机器翻译代替人工翻译''也并不是事实。比如,在高精度同声传译任务中,机器翻译仍需要更多打磨;再比如,针对于小说的翻译,机器翻译还无法做到与人工翻译媲美;甚至有人尝试用机器翻译系统翻译中国古代诗词,这里更多的是娱乐的味道。但是毫无疑问的是,机器翻译可以帮助人类,甚至有朝一日可以代替一些低端的人工翻译工作。
\parinterval\ref{fig:results-zh-to-en news-field-translation}展示了机器翻译和人工翻译质量的一个对比结果。在汉语到英语的新闻翻译任务中,如果对译文进行人工评价,五分制机器翻译的译文得3.9 分,人的译文得4.7分(人的翻译也不是完美的)。可见,在这个任务中机器翻译表现不错,但是与人还有一定差距。如果换一种方式评价,把人的译文作为参考答案,用机器翻译的译文与其进行自动比对,会发现百分制机器翻译的得分只有47分。当然,这个结果并不是说机器翻译的译文质量很差,它更多的是表明机器翻译系统可以生成一些与人工翻译不同的译文,机器翻译也具有一定的创造性。这也类似于,很多围棋选手都想向AlphaGo学习,因为智能围棋系统也可以走出一些人类从未走过的妙招。
\parinterval\ref{fig:results-zh-to-en news-field-translation}展示了机器翻译和人工翻译质量的一个对比结果。在汉语到英语的新闻翻译任务中,如果对译文进行人工评价,五分制机器翻译的译文得3.9 分,人工译文得4.7分(人的翻译也不是完美的)。可见,在这个任务中机器翻译表现不错,但是与人还有一定差距。如果换一种方式评价,把人的译文作为参考答案,用机器翻译的译文与其进行比对,会发现机器翻译的得分只有47分(百分制)。当然,这个结果并不是说机器翻译的译文质量很差,它更多的是表明机器翻译系统可以生成一些与人工翻译不同的译文,机器翻译也具有一定的创造性。这也类似于,很多围棋选手都想向AlphaGo学习,因为智能围棋系统也可以走出一些人类从未走过的妙招。
%----------------------------------------------
% 图1.7
......@@ -175,7 +187,7 @@
\end{figure}
%-------------------------------------------
\parinterval\ref{fig:comparison-mt-ht}展示了一个真实的汉语到英语翻译实例。对比发现,机器翻译与人工翻译还是存在差距的,特别是在翻译一些具有感情色彩的词语时,机器翻译的译文缺一些味道。那么,机器翻译一点用都没有吗?显然不是。实际上,如果考虑翻译速度与翻译代价,机器翻译的价值是无可比拟的。还是同一个例子,如果人工翻译需要30分钟甚至更长时间,而机器翻译仅仅需要两秒,换种情况思考,如果有100万篇这样的文档,其人工翻译的成本根本无法想象,消耗的时间更是难以计算,而计算机集群仅仅需要一天,而且只有电力的消耗。
\parinterval\ref{fig:comparison-mt-ht}展示了一个真实的汉语到英语翻译实例。对比发现,机器翻译与人工翻译还是存在差距的,特别是在翻译一些具有感情色彩的词语时,机器翻译的译文缺一些味道。那么,机器翻译一点用都没有吗?显然不是。实际上,如果考虑翻译速度与翻译代价,机器翻译的价值是无可比拟的。还是同一个例子,翻译一篇短文如果人工翻译需要30分钟甚至更长时间,那么机器翻译仅仅需要两秒,换种情况思考,如果有100万篇这样的文档,其人工翻译的成本根本无法想象,消耗的时间更是难以计算,而计算机集群仅仅需要一天,而且只有电力的消耗。
%\\ \\ \\ \\ \\ \\ \\ \\ \\ \\ \\ \\ \\ \\ \\ \\ \\ \\ \\ \\ \\
%----------------------------------------------
% 图1.9
......@@ -198,20 +210,23 @@
\end{figure}
%-------------------------------------------
\section{机器翻译方法}\index{Chapter1.4}
\sectionnewpage
\section{机器翻译方法}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\parinterval 为了对机器翻译技术有一个整体的认识,这里对一些主要的机器翻译框架进行简要介绍。
\subsection{基于规则的机器翻译}\index{Chapter1.4.1}
\subsection{基于规则的机器翻译}
\parinterval 早期的机器翻译研究都是以基于规则的方法为主,特别是在上世纪70年代,以基于规则方法为代表的专家系统是人工智能中最具代表性的研究领域。它的主要思想是以词典和人工书写的规则库作为翻译知识,用一系列规则的组合完成翻译。
\parinterval\ref{fig:Example-RBMT}展示了一个使用规则进行翻译的实例。这里,利用一个简单的汉译英规则库完成对句子``我对你感到满意''的翻译。当翻译``我''时,从规则库中找到规则1,该规则表示遇到单词``我''就翻译为``I'';类似,也可以从规则库中找到规则4,该规则表示翻译调序,即将单词``you''放到``be satisfied with''后面。可以看到,这些规则的使用和进行翻译时所使用的思想非常类似,可以说基于规则方法实际上在试图描述人类进行翻译的思维过程。
\parinterval\ref{fig:Example-RBMT}展示了一个使用规则进行翻译的实例。这里,利用一个简单的汉译英规则库完成对句子``我对你感到满意''的翻译。当翻译``我''时,从规则库中找到规则1,该规则表示遇到单词``我''就翻译为``I'';类似,也可以从规则库中找到规则4,该规则表示翻译调序,即将单词``you''放到``be satisfied with''后面。可以看到,这些规则的使用和进行翻译时所使用的思想非常类似,可以说基于规则方法实际上在试图描述人类进行翻译的思维过程。
\parinterval 但是,基于规则的机器翻译也存在问题。首先,书写规则需要消耗大量人力,规则库的维护代价极高;其次,规则很难涵盖所有的语言现象;再有,自然语言存在大量的歧义现象,规则之间也会存在冲突,这也导致规则数量不可能无限制增长。
\subsection{基于实例的机器翻译}\index{Chapter1.4.2}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\parinterval 基于规则的方法更多的被使用在受限翻译场景中,比如受限词汇集的翻译。针对基于规则的方法存在的问题,基于实例的机器翻译于上世纪80年代中期被提出\cite{nagao1984framework}。该方法的基本思想是在双语句库中找到与待翻译句子相似的实例,之后对实例的译文进行必要修改,如替换、增加、删除等一系列操作,从而得到最终译文。这个过程可以类比人类学习并运用语言的过程:人会先学习一些翻译实例或者模板,当遇到新的句子时,会用以前的实例和模板作对比,之后得到新的句子的翻译结果。这也是一种举一反三的思想。
\subsection{基于实例的机器翻译}
\parinterval 基于规则的方法更多地被使用在受限翻译场景中,比如受限词汇集的翻译。针对基于规则的方法存在的问题,基于实例的机器翻译于上世纪80年代中期被提出\cite{nagao1984framework}。该方法的基本思想是在双语句库中找到与待翻译句子相似的实例,之后对实例的译文进行修改,如替换、增加、删除等一系列操作,从而得到最终译文。这个过程可以类比人类学习并运用语言的过程:人会先学习一些翻译实例或者模板,当遇到新的句子时,会用以前的实例和模板作对比,之后得到新的句子的翻译结果。这也是一种举一反三的思想。
%----------------------------------------------
% 图1.10
\begin{figure}[htp]
......@@ -224,11 +239,12 @@
\parinterval\ref{fig:zh-sentences-into-en-sentences}展示了一个基于实例的机器翻译过程。它利用简单的翻译实例库与翻译词典完成对句子``我对你感到满意''的翻译。首先,使用待翻译句子的源语言端在翻译实例库中进行比较,根据相似度大小找到相似的实例``我对他感到高兴''。然后,标记实例中不匹配的部分,即``你''和``他'',``满意''和``高兴''。再查询翻译词典得到词``你''和``满意''所对应的翻译结果``you''和``satisfied'',用这两个词分别替换实例中的``him''和``happy'',从而得到最终译文。
\parinterval 当然,基于实例的机器翻译也并不完美。首先,这种方法对翻译实例的精确度要求非常高,一个实例的错误可能会导致一个句型都无法翻译正确;其次,实例维护较为困难,实例库的构建通常需要单词级对齐的标注,而保证词对齐的质量是非常困难的工作,这也大大增加了实例库维护的难度;再次,尽管可以通过实例或者模板进行翻译,但是其覆盖度仍然有限。在实际应用中,很多句子无法找到可以匹配的实例或者模板。
\subsection{统计机器翻译}\index{Chapter1.4.3}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\parinterval 统计机器翻译兴起于上世纪90年代\cite{brown1990statistical,koehn2003statistical},它利用统计模型从单/双语语料中自动学习翻译知识。具体来说,可以使用单语语料学习语言模型,使用双语平行语料学习翻译模型,并使用这些统计模型完成对翻译过程的建模。整个过程不需要人工编写规则,也不需要从实例中构建翻译模板。无论是词、短语,甚至句法结构,统计机器翻译系统都可以自动学习,人更多的是参与定义翻译所需的特征和基本翻译单元的形式。而翻译知识都保存在模型的参数中。
\subsection{统计机器翻译}
\parinterval 统计机器翻译兴起于上世纪90年代\cite{brown1990statistical,koehn2003statistical},它利用统计模型从单/双语语料中自动学习翻译知识。具体来说,可以使用单语语料学习语言模型,使用双语平行语料学习翻译模型,并使用这些统计模型完成对翻译过程的建模。整个过程不需要人工编写规则,也不需要从实例中构建翻译模板。无论是词还是短语,甚至是句法结构,统计机器翻译系统都可以自动学习。人更多的是参与定义翻译所需的特征和基本翻译单元的形式,而翻译知识都保存在模型的参数中。
%----------------------------------------------
% 图1.11
\begin{figure}[htp]
......@@ -241,11 +257,12 @@
\parinterval\ref{fig:Example-SMT}展示了一个统计机器翻译系统运行的简单实例。整个系统需要两个模型:翻译模型和语言模型。其中,翻译模型从双语平行语料中学习翻译知识,得到短语表,其中包含各种词汇的翻译及其概率,这样可以度量源语言和目标语言片段之间互为翻译的可能性大小;语言模型从单语语料中学习目标语的词序列生成规律,来衡量目标语言译文的流畅性。最后,将这两种模型联合使用,翻译引擎来搜索尽可能多的翻译结果,并计算不同翻译结果的可能性大小,最后将概率最大的译文作为最终结果输出。这个过程并没有显性使用人工翻译规则和模板,译文的生成仅仅依赖翻译模型和语言模型中的统计参数。
\parinterval 由于没有对翻译过程进行过多的限制,统计机器翻译有很灵活的译文生成方式,因此系统可以处理更加多样的句子。但是这种方法也带来了一些问题:首先,虽然并不需要人工定义翻译规则或模板,统计机器翻译系统仍然需要人工定义翻译特征。提升翻译品质往往需要大量的特征工程,导致人工特征设计的好坏对系统往往产生决定性影响;其次,统计机器翻译的模块较多,系统研发比较复杂;还有,随着训练数据增多,统计机器翻译的模型(比如短语翻译表)会明显增加,这也在一定程度上妨碍了系统在存储资源受限情况下使用。
\parinterval 由于没有对翻译过程进行过多的限制,统计机器翻译有很灵活的译文生成方式,因此系统可以处理更加多样的句子。但是这种方法也带来了一些问题:首先,虽然并不需要人工定义翻译规则或模板,统计机器翻译系统仍然需要人工定义翻译特征。提升翻译品质往往需要大量的特征工程,导致人工特征设计的好坏会对系统产生决定性影响;其次,统计机器翻译的模块较多,系统研发比较复杂;再次,随着训练数据增多,统计机器翻译的模型(比如短语翻译表)会明显增大,在系统存储资源受限的情况下,妨碍系统的正常使用。
\subsection{神经机器翻译}\index{Chapter1.4.4}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\parinterval 随着机器学习技术的发展,基于深度学习的神经机器翻译逐渐开始兴起。自2014年开始,它在短短几年内已经在大部分任务上取得了明显的优势\cite{NIPS2014_5346,bahdanau2014neural}。在神经机器翻译中,词串被表示成实数向量,即分布式向量表示。这样,翻译过程并不是在离散化的单词和短语上进行,而是在实数向量空间上计算,因此它对词序列表示的方式产生了本质的改变。通常,机器翻译可以被看作一个序列到另一个序列的转化。在神经机器翻译中,序列到序列的转化过程可以由{\small\bfnew{编码器-解码器}}(encoder-decoder)框架实现。其中,编码器把源语言序列进行编码,并提取源语言中的信息进行分布式表示,之后解码器再把这种信息转换为另一种语言的表达。
\subsection{神经机器翻译}
\parinterval 随着机器学习技术的发展,基于深度学习的神经机器翻译逐渐开始兴起。自2014年开始,它在短短几年内已经在大部分任务上取得了明显的优势\cite{NIPS2014_5346,bahdanau2014neural}。在神经机器翻译中,词串被表示成实数向量,即分布式向量表示。这样,翻译过程并不是在离散化的单词和短语上进行,而是在实数向量空间上计算,因此它对词序列表示的方式产生了本质的改变。通常,机器翻译可以被看作一个序列到另一个序列的转化。在神经机器翻译中,序列到序列的转化过程可以由{\small\bfnew{编码器-解码器}}\index{编码器-解码器}(encoder-decoder)\index{encoder-decoder}框架实现。其中,编码器把源语言序列进行编码,并提取源语言中的信息进行分布式表示,之后解码器再把这种信息转换为另一种语言的表达。
%----------------------------------------------
% 图1.12
......@@ -259,17 +276,17 @@
\parinterval\ref{fig:Example-NMT}展示了一个神经机器翻译的实例。首先,通过编码器,源语言序列``我对你感到满意''经过多层神经网络编码生成一个向量表示,即图中的向量(0.2,-1,6,5,0.7,-2)。再将该向量作为输入送到解码器中,解码器把这个向量解码成目标语言序列。注意,目标语言序列的生成是逐词进行的(虽然图中展示的是解码器生成整个序列,但是在具体实现时是逐个单词生成目标语译文),产生某个词的时候依赖之前生成的目标语言的历史信息,直到产生句子结束符为止。
\parinterval 相比统计机器翻译,神经机器翻译的优势体现在其不需要特征工程,所有信息由神经网络自动从原始输入中提取。而且,相比离散化的表示,词和句子的分布式连续空间表示可以为建模提供更为丰富的信息,同时可以使用相对成熟的基于梯度的方法优化模型。此外,神经网络的存储需求较小,天然适合小设备上的应用。但是,神经机器翻译也存在问题。首先,虽然脱离了特征工程,神经网络的结构需要人工设计,即使设计好结构,系统的调优、超参数的设置等等仍然依赖大量的实验;其次,神经机器翻译现在缺乏可解释性,其过程和人的认知差异很大,通过人的先验知识干预的程度差;再有,神经机器翻译对数据的依赖很大,数据规模、质量对性能都有很大影响,特别是在数据稀缺的情况下,充分训练神经网络具有挑战。
\parinterval 相比统计机器翻译,神经机器翻译的优势体现在其不需要特征工程,所有信息由神经网络自动从原始输入中提取。而且,相比离散化的表示,词和句子的分布式连续空间表示可以为建模提供更为丰富的信息,同时可以使用相对成熟的基于梯度的方法优化模型。此外,神经网络的存储需求较小,天然适合小设备上的应用。但是,神经机器翻译也存在问题。首先,虽然脱离了特征工程,神经网络的结构需要人工设计,即使设计好结构,系统的调优、超参数的设置等仍然依赖大量的实验;其次,神经机器翻译现在缺乏可解释性,其过程和人的认知差异很大,通过人的先验知识干预的程度差;再次,神经机器翻译对数据的依赖很大,数据规模、质量对性能都有很大影响,特别是在数据稀缺的情况下,充分训练神经网络具有挑战。
\subsection{对比分析}\index{Chapter1.4.5}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{对比分析}
\parinterval 不同机器翻译方法有不同的特点。表\ref{tab:comparison-of-different-MT}对比了这些方法,不难看出:
\vspace{0.5em}
\begin{itemize}
\vspace{0.5em}
\item 规则系统需要人工书写规则并维护,人工代价较高。统计和神经网络方法仅需要设计特征或者神经网络结构,对(语言相关的)人工依赖较少
\item 规则系统需要人工书写规则并维护,人工代价较高。统计和神经网络方法仅需要设计特征或者神经网络结构,对人工依赖较少(语言相关的)
\vspace{0.5em}
\item 基于实例、统计和神经网络的方法都需要依赖语料库(数据),其中统计和神经网络方法具有一定的抗噪能力,因此也更适合大规模数据情况下的机器翻译系统研发。
\vspace{0.5em}
......@@ -301,16 +318,19 @@
}\end{table}
%-------------------------------------------
\parinterval 从现在机器翻译的研究和应用情况来看,基于统计建模的方法(统计机器翻译和神经机器翻译)是主流。这主要是由于它们的系统研发周期短,通过搜集一定量的数据即可实现快速原型。而且,随着互联网等信息的不断开放,低成本的数据获取可以让系统更快得以实现。特别是,最近神经机器翻译凭借其高质量的译文,受到研究人员和开发者的广泛青睐。当然,不同方法之间的融合也是有价值的方向,也有很多有趣的探索,比如无指导机器翻译中还是会同时使用统计机器翻译和神经机器翻译方法,这也是一种典型的融合多种方法的思路。
\parinterval 从现在机器翻译的研究和应用情况来看,基于统计建模的方法(统计机器翻译和神经机器翻译)是主流。这主要是由于它们的系统研发周期短,通过搜集一定量的数据即可实现快速原型。而且,随着互联网等信息的不断开放,低成本的数据获取可以让系统更快得以实现。特别是最近神经机器翻译凭借其高质量的译文,受到研究人员和开发者的广泛青睐。当然,对不同方法进行融合也是有价值的研究方向,也有很多有趣的探索,比如无指导机器翻译中还是会同时使用统计机器翻译和神经机器翻译方法,这也是一种典型的融合多种方法的思路。
\section{翻译质量评价}\index{Chapter1.5}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\sectionnewpage
\section{翻译质量评价}
\parinterval 机器翻译质量的评价对于机器翻译的发展具有至关重要的意义。首先,评价的结果可以用于指导研究人员不断改进机器翻译结果,并找到最具潜力的技术发展方向。同时,一个权威的翻译质量评价指标可以帮助用户更有效地使用机器翻译的结果。
\parinterval 一般来说,机器翻译的翻译{\small\bfnew{质量评价}}(Quality Evaluation)是指在参考答案或者评价标准已知的情况下对译文进行打分。这类方法可以被称作有参考答案的评价,包括人工打分、BLEU 等自动评价方法都是典型的有参考答案评价。相对的,{\small\bfnew{无参考答案的评价}}(Quality Estimation)是指在没有人工评价和参考答案的情况下,对译文质量进行评估。这类方法可以被看作是对机器翻译译文进行质量`` 预测'',这样用户可以选择性的使用机器翻译结果。这里主要讨论有参考答案的评价,因为这类方法是机器翻译系统研发所使用的主要评价方法。
\parinterval 一般来说,机器翻译的翻译{\small\bfnew{质量评价}}\index{质量评价}(Quality Evaluation)\index{Quality Evaluation}是指在参考答案或者评价标准已知的情况下对译文进行打分。这类方法可以被称作有参考答案的评价,包括人工打分、BLEU 等自动评价方法都是典型的有参考答案评价。相对的,{\small\bfnew{无参考答案的评价}}\index{无参考答案的评价}(Quality Estimation)\index{Quality Estimation}是指在没有人工评价和参考答案的情况下,对译文质量进行评估。这类方法可以被看作是对机器翻译译文进行质量`` 预测'',这样用户可以选择性的使用机器翻译结果。这里主要讨论有参考答案的评价,因为这类方法是机器翻译系统研发所使用的主要评价方法。
\subsection{人工评价}\index{Chapter1.5.1}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{人工评价}
\parinterval 顾名思义,人工评价是指专家根据翻译结果好坏对译文进行评价。一般会根据句子的准确度和流利度对其进行打分,这样能够准确评定出句子是否准确翻译出原文的意思以及句子是否通顺。在对一个句子进行评定时,一般由多个专家匿名打分后进行综合评定。人工评价是最能准确反映句子翻译质量的评价方式,但是其缺点也十分明显:需要耗费人力物力,而且评价的周期长,不能及时得到有效的反馈。因此在实际系统开发中,纯人工评价不会过于频繁的被使用,它往往和自动评价一起配合,帮助系统研发人员准确的了解当前系统的状态。
\parinterval 人工评价的策略非常多。考虑不同的因素,往往会使用不同的评价方案,比如:
......@@ -333,15 +353,18 @@
\end{itemize}
\parinterval 简而言之,研究者可以根据实际情况选择不同的人工评价方案,人工评价也没有统一的标准。WMT和CCMT机器翻译评测都有配套的人工评价方案,可以作为业界的参考标准。
\subsection{自动评价}\index{Chapter1.5.2}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{自动评价}
\parinterval 由于人工评价费时费力,同时具有一定的主观性,甚至同一篇文章不同人在不同时刻的理解都会不同,因此自动评价也是机器翻译系统研发人员所青睐的方法。自动评价的方式虽然不如人工评价准确,但是具有速度快、成本低、一致性高的优点。而且随着评价技术的不断发展,自动评价方式已经具有了比较好的指导性,可以帮助使用者快速了解当前机器翻译译文的质量。在机器翻译领域,自动评价已经成为了一个重要的分支,提出的自动评价方法不下几十种。这里无法对这些方法一一列举,为了便于后续章节的描述,这里仅对具有代表性的一些方法进行简要介绍。
\subsubsection{BLEU}\index{Chapter1.5.2.1}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\parinterval 目前使用最广泛的自动评价指标是BLEU。BLEU是Bilingual Evaluation Understudy的缩写,最早由IBM在2002年提出\cite{papineni2002bleu}。它通过采用$n$-gram匹配的方式评定机器翻译结果和参考译文之间的相似度,即机器翻译的结果越接近人工参考译文就认定它的质量越高。$n$-gram是指$n$个连续单词组成的单元,称为{\small\bfnew{$n$元语法单元}}。$n$越大表示评价时考虑的匹配片段越大。
\subsubsection{BLEU}
\parinterval BLEU的计算首先考虑待评价译文中$n$-gram在参考答案中的匹配率,称为{\small\bfnew{$n$-gram准确率}}$n$-gram Precision)。其计算方法如下:
\parinterval 目前使用最广泛的自动评价指标是BLEU。BLEU是Bilingual Evaluation Understudy的缩写,最早由IBM在2002年提出\cite{papineni2002bleu}。它通过采用$n$-gram匹配的方式评定机器翻译结果和参考译文之间的相似度,即机器翻译的结果越接近人工参考译文就认定它的质量越高。$n$-gram是指$n$个连续单词组成的单元,称为{\small\bfnew{$n$元语法单元}}\index{$n$元语法单元}。$n$越大表示评价时考虑的匹配片段越大。
\parinterval BLEU的计算首先考虑待评价译文中$n$-gram在参考答案中的匹配率,称为{\small\bfnew{$n$-gram准确率}}\index{$n$-gram准确率}$n$-gram Precision)\index{$n$-gram Precision}。其计算方法如下:
\begin{eqnarray}
\textrm{P}_n=\frac{\textrm{Count}_\textrm{hit}}{\textrm{Count}_{\textrm{output}}}
\label{eq:matching-rate}
\end{eqnarray}
......@@ -363,7 +386,7 @@ Candidate:the the the the
\label{eq:weighted-average}
\end{eqnarray}
\parinterval 但是,该方法更倾向于对短句子打出更高的分数。一个极端的例子是译文只有很少的几个词,但是都命中答案,准确率很高,可显然不是好的译文。因此,BLEU引入{\small\bfnew{短句惩罚因子}}(Brevity Penalty, BP)的概念,对短句进行惩罚,
\parinterval 但是,该方法更倾向于对短句子打出更高的分数。一个极端的例子是译文只有很少的几个词,但是都命中答案,准确率很高,可显然不是好的译文。因此,BLEU引入{\small\bfnew{短句惩罚因子}}\index{短句惩罚因子}(Brevity Penalty\index{Brevity Penalty}, BP)的概念,对短句进行惩罚,
\begin{eqnarray}
\textrm{BP}=
\begin{cases}
1 & c > r \\
e^{(1-\frac{r}{c})} & c \le r
\end{cases}
\end{eqnarray}
......@@ -383,8 +406,9 @@
\parinterval 从机器翻译的发展来看,BLEU的意义在于它给系统研发人员提供了一种简单、高效、可重复的自动评价手段,在研发机器翻译系统时可以不需要依赖人工评价。同时,BLEU也有很多创新之处,包括引入$n$-gram的匹配,截断计数和短句惩罚等等,包括NIST等很多评价指标都是受到BLEU的启发。当然, BLEU也并不完美,甚至经常被人诟病。比如,它需要依赖参考译文,而且评价结果有时与人工评价不一致,同时BLEU评价只是单纯的从匹配度的角度思考翻译质量的好坏,并没有真正考虑句子的语义是否翻译正确。但是,毫无疑问,BLEU仍然是机器翻译中最常用的评价方法。在没有找到更好的替代方案之前,BLEU还是机器翻译研究所使用的标准评价指标。
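\parinterval 结合上述$n$-gram准确率与短句惩罚因子的定义(式中$c$和$r$通常分别指机器译文与参考译文的长度),下面给出一个计算BLEU的示意性Python实现。这只是一个便于理解的简化草图,假设输入为已切分的单词列表、只有单一参考译文,并使用均匀的$n$-gram权重:
\begin{verbatim}
import math
from collections import Counter

def ngrams(words, n):
    # 枚举句子中所有的n元语法单元及其出现次数
    return Counter(tuple(words[i:i + n]) for i in range(len(words) - n + 1))

def bleu(candidate, reference, N=4):
    log_sum = 0.0
    for n in range(1, N + 1):
        cand, ref = ngrams(candidate, n), ngrams(reference, n)
        # 截断计数:每个n-gram的命中次数不超过它在参考译文中的出现次数
        hit = sum(min(cnt, ref[g]) for g, cnt in cand.items())
        total = max(sum(cand.values()), 1)
        # 各阶n-gram准确率取对数后加权平均,这里使用均匀权重1/N
        log_sum += (1.0 / N) * math.log(max(hit, 1e-9) / total)
    c, r = len(candidate), len(reference)
    bp = 1.0 if c > r else math.exp(1 - r / max(c, 1))  # 短句惩罚因子BP
    return bp * math.exp(log_sum)

print(bleu('the cat is on the mat'.split(),
           'there is a cat on the mat'.split()))
\end{verbatim}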
\subsubsection{TER}\index{Chapter1.5.2.2}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{TER}
\parinterval TER是Translation Edit Rate的缩写,是一种基于距离的评价方法,用来评定机器翻译结果的译后编辑的工作量\cite{snover2006study}。这里,距离被定义为将一个序列转换成另一个序列所需要的最少编辑操作次数。操作次数越多,距离越大,序列之间的相似性越低;相反距离越小,表示一个句子越容易改写成另一个句子,序列之间的相似性越高。TER使用的编辑操作包括:增加、删除、替换和移位,其中增加、删除、替换操作计算得到的距离被称为编辑距离,并根据错误率的形式给出评分:
\begin{eqnarray}
\textrm{score}=\frac{\textrm{edit}(c,r)}{l}
\end{eqnarray}
......@@ -403,8 +427,9 @@ Candidate:cat is standing in the ground
\parinterval 与BLEU不同,基于距离的评价方法是一种典型的``错误率''的度量,类似的思想也广泛应用于语音识别等领域。在机器翻译中,除了TER外,还有WER, PER等十分相似的方法,只是在``错误''的定义上略有不同。需要注意的是,很多时候,研究者并不会单独使用BLEU或者TER,而是将两种方法融合,比如,使用BLEU与TER相减后的值作为评价指标(BLEU和TER之间是减号)。
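\parinterval 作为参考,下面给出一个按上述定义计算编辑距离与TER得分的示意性Python草图。注意,为简化起见这里只实现了增加、删除、替换三种操作,未包含TER中的移位操作,也没有做任何效率优化:
\begin{verbatim}
def edit_distance(c, r):
    # 动态规划计算两个单词序列之间的最小编辑距离
    m, n = len(c), len(r)
    d = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m + 1):
        d[i][0] = i
    for j in range(n + 1):
        d[0][j] = j
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            cost = 0 if c[i - 1] == r[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1,         # 删除
                          d[i][j - 1] + 1,         # 增加
                          d[i - 1][j - 1] + cost)  # 替换
    return d[m][n]

def ter(candidate, reference):
    # score = edit(c,r) / l,其中l为参考译文的长度
    return edit_distance(candidate, reference) / len(reference)

print(ter('cat is standing in the ground'.split(),
          'the cat is standing on the ground'.split()))
\end{verbatim}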
\subsubsection{基于检测点的评价}\index{Chapter1.5.2.3}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{基于检测点的评价}
\parinterval BLEU、TER等评价指标可以对译文的整体质量进行评估,但是缺乏对具体问题的细致评价。很多时候,研究人员需要知道系统是否能够处理特定的问题,而不是得到一个笼统的评价结果。基于检测点的方法正是基于此想法\cite{shiwen1993automatic}。基于检测点的评价的优点在于,在对机器翻译系统给出一个总体评价的同时,还能针对系统在各个具体问题上的翻译能力进行评估,方便比较不同翻译模型的性能。这种方法也被多次用于机器翻译比赛的质量评测。
\parinterval 基于检测点的评价是根据事先定义好的语言学检测点对译文的相应部分进行打分。如下是几个英中翻译中的检测点实例:
......@@ -439,8 +464,10 @@ His house is on the south bank of the river.
\parinterval 基于检测点的评价方法的意义在于,它并不是简单给出一个分数,而是帮助系统研发人员定位问题。因此这类方法更多的使用在对机器翻译的结果进行分析上,是对BLEU等整体评价指标的一种很好的补充。
\section{机器翻译应用}\index{Chapter1.6}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\sectionnewpage
\section{机器翻译应用}
\parinterval 机器翻译有着十分广泛的应用,下面看一下机器翻译在生活中的具体应用形式:
\parinterval (一)网页翻译
......@@ -493,17 +520,20 @@ His house is on the south bank of the river.
\parinterval 翻译结果后编辑是指在机器翻译的结果之上,通过少量的人工编辑来进一步完善机器译文。在传统的人工翻译过程中,翻译人员完全依靠人工的方式进行翻译,这虽然保证了翻译质量,但是时间成本高。相对应的,机器翻译具有速度快和成本低的优势。在一些领域,目前的机器翻译质量已经可以很大程度上减小翻译人员的工作量,翻译人员可以在机器翻译的辅助下,花费相对较小的代价来完成翻译。
\section{开源项目与评测}\index{Chapter1.7}
\sectionnewpage
\section{开源项目与评测}
\parinterval 从实践的角度,机器翻译的发展主要可以归功于两方面的推动作用:开源系统和评测。开源系统通过代码共享的方式使得最新的研究成果可以快速传播,同时实验结果可以复现。而评测比赛,使得各个研究组织的成果可以进行科学的对比,共同推动机器翻译的发展与进步。此外,开源项目也促进了不同团队之间的协作,让研究人员在同一个平台上集中力量攻关。
\subsection{开源机器翻译系统}\index{Chapter1.7.1}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{开源机器翻译系统}
下面列举一些优秀的开源机器翻译系统:
\subsubsection{统计机器翻译开源系统}\index{Chapter1.7.1.1}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{统计机器翻译开源系统}
\vspace{0.5em}
\begin{itemize}
\item NiuTrans.SMT:NiuTrans\cite{Tong2012NiuTrans}是由东北大学自然语言处理实验室自主研发的统计机器翻译系统,该系统可支持基于短语的模型、基于层次短语的模型以及基于句法的模型。由于使用C++ 语言开发,所以该系统运行时间快,所占存储空间少。系统中内嵌有$n$-gram语言模型,故无需使用其他的系统即可完成语言建模。网址:\url{http://opensource.niutrans.com/smt/index.html}
......@@ -530,8 +560,9 @@ His house is on the south bank of the river.
\end{itemize}
\vspace{0.5em}
\subsubsection{神经机器翻译开源系统}\index{Chapter1.7.1.2}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{神经机器翻译开源系统}
\vspace{0.5em}
\begin{itemize}
\item GroundHog:GroundHog\cite{bahdanau2014neural}基于Theano\cite{al2016theano}框架,是由蒙特利尔大学LISA 实验室使用Python语言编写的一个框架,旨在提供灵活而高效的方式来实现复杂的循环神经网络模型。它提供了包括LSTM在内的多种模型。Bahdanau等人在此框架上又编写了GroundHog神经机器翻译系统。该系统也被用作很多论文的基线系统。网址:\url{https://github.com/lisa-groundhog/GroundHog}
......@@ -564,7 +595,7 @@ His house is on the south bank of the river.
\end{itemize}
\vspace{0.5em}
\subsection{常用数据集及公开评测任务}\index{Chapter1.7.2}
\subsection{常用数据集及公开评测任务}
\parinterval 机器翻译相关评测主要有两种组织形式,一种是由政府及国家相关机构组织,权威性强。如由美国国家标准技术研究所组织的NIST评测、日本国家科学咨询系统中心主办的NACSIS Test Collections for IR(NTCIR)PatentMT、日本科学振兴机构(Japan Science and Technology Agency,简称JST)等组织联合举办的Workshop on Asian Translation(WAT)以及国内由中文信息学会主办的全国机器翻译大会(China Conference on Machine Translation,简称CCMT);另一种是由相关学术机构组织,具有领域针对性的特点,如倾向新闻领域的Workshop on Statistical Machine Translation(WMT)以及面向口语的International Workshop on Spoken Language Translation(IWSLT)。下面将针对上述评测进行简要介绍。
......@@ -589,7 +620,8 @@ His house is on the south bank of the river.
\parinterval 从机器翻译发展的角度看,这些评测任务给相关研究提供了基准数据集,使得不同的系统都可以在同一个环境下进行比较和分析,进而建立了机器翻译研究所需的实验基础。此外,公开评测也使得研究者可以第一时间了解机器翻译研究的最新成果,比如,有多篇ACL会议最佳论文的灵感就来自当年参加机器翻译评测任务的系统。
\section{推荐学习资源}\index{Chapter1.8}
\sectionnewpage
\section{推荐学习资源}
\parinterval 首先,推荐一本书《Statistical Machine Translation》\cite{koehn2009statistical},其作者是机器翻译领域著名学者Philipp Koehn教授。该书是机器翻译领域内的经典之作,介绍了统计机器翻译技术的进展。该书从语言学和概率学两个方面介绍了统计机器翻译的构成要素,然后介绍了统计机器翻译的主要模型:基于词、基于短语和基于树的模型,以及机器翻译评价、语言建模、判别式训练等方法。此外,作者在该书的最新版本中增加了神经机器翻译的章节,方便研究人员全面了解机器翻译的最新发展趋势(\cite{DBLP:journals/corr/abs-1709-07809})。
......
......@@ -6,7 +6,7 @@
\begin{tikzpicture}
\begin{scope}[scale=1.0,xshift=0.9in,yshift=-0.87in,level distance=20pt,sibling distance=-1pt,grow'=up]
\begin{scope}[scale=1.0,level distance=30pt,sibling distance=15pt,grow'=up]
{
\Tree[.\node(sn0){IP};
[.\node(sn1){NP};
......
......@@ -8,7 +8,7 @@
\renewcommand\figurename{}%将figure改为图
\renewcommand\tablename{}%将table改为表
%\renewcommand\arraystretch{1.5}%将表格高度调整为1.5倍
\chapterimage{chapter_head_1.pdf} % Chapter heading image
\chapterimage{fig-NEU-3.jpg} % Chapter heading image
\chapter{词法、语法及统计建模基础}
......@@ -21,7 +21,7 @@
\parinterval 语言建模是机器翻译中最常用的一种技术,它主要用于句子的生成和流畅度评价。本章会以传统统计语言模型为例,对语言建模的相关概念进行介绍。但是,这里并不深入探讨语言模型技术,在后面的章节中还会有单独的内容对神经网络语言模型等前沿技术进行讨论。
%--问题概述-----------------------------------------
\section{问题概述 }\index{Chapter2.1}
\section{问题概述 }
\parinterval 很多时候机器翻译系统被看作是孤立的``黑盒''系统(图 \ref {fig:2.1-1} (a))。可以将一段文本作为输入送入机器翻译系统,之后得到翻译好的译文输出。但是真实的机器翻译系统要复杂得多。因为系统看到的输入和输出实际上只是一些符号串,这些符号并没有任何其他意义,因此需要进一步对这些符号串进行处理才能更好地使用它们,比如,需要定义翻译中最基本的单元是什么?符号串是否还有结构信息?如何用数学工具刻画这些基本单元和结构?
......@@ -50,15 +50,15 @@
\vspace{0.5em}
\begin{itemize}
\item {\small\bfnew{分词}}(Segmentation):这个过程会把词串进行切分,切割成最小的单元。因为只有知道了什么是待处理字符串的最小单元,机器翻译系统才能对其进行表示、分析和生成。
\item {\small\bfnew{分词}}\index{分词}(Segmentation)\index{Segmentation}:这个过程会把词串进行切分,切割成最小的单元。因为只有知道了什么是待处理字符串的最小单元,机器翻译系统才能对其进行表示、分析和生成。
\vspace{0.5em}
\item {\small\bfnew{句法分析}}(Parsing):这个过程会对分词的结果进行进一步分析,得到句子的句法结构,这种结构是对句子的进一步抽象。比如,NP+VP就可以表示由名词短语(NP)和动词短语(VP)构成的主谓结构。利用这些信息,机器翻译可以更加准确的对语言的结构进行分析和生成。
\item {\small\bfnew{句法分析}}\index{句法分析}(Parsing)\index{Parsing}:这个过程会对分词的结果进行进一步分析,得到句子的句法结构,这种结构是对句子的进一步抽象。比如,NP+VP就可以表示由名词短语(NP)和动词短语(VP)构成的主谓结构。利用这些信息,机器翻译可以更加准确的对语言的结构进行分析和生成。
\end{itemize}
\vspace{0.5em}
\parinterval 类似的,机器翻译输出的结果也可以包含同样的信息。甚至系统输出英文译文之后,还有一个额外的步骤来把部分英文单词的大小写恢复出来,比如,上例中句首单词Cats的首字母要大写。
\parinterval 一般来说,在送入机器翻译系统前需要对文字序列进行处理和加工,这个过程被称为{\small\sffamily\bfseries{预处理}}(Pre-processing)。同理,在机器翻译模型输出译文后的处理操作被称作{\small\sffamily\bfseries{后处理}}(Post-processing)。这两个过程对机器翻译性能影响很大,比如,在神经机器翻译里,不同的分词策略可能会造成翻译性能的天差地别。
\parinterval 一般来说,在送入机器翻译系统前需要对文字序列进行处理和加工,这个过程被称为{\small\sffamily\bfseries{预处理}}\index{预处理}(Pre-processing)\index{Pre-processing}。同理,在机器翻译模型输出译文后的处理操作被称作{\small\sffamily\bfseries{后处理}}\index{后处理}(Post-processing)\index{Post-processing}。这两个过程对机器翻译性能影响很大,比如,在神经机器翻译里,不同的分词策略可能会造成翻译性能的天差地别。
\parinterval 值得注意的是,有些观点认为,不论是分词还是句法分析,对于机器翻译来说并不要求符合人的认知和语言学约束。换句话说,机器翻译所使用的``单词''和``结构''本身并不是为了符合人类的解释,它们更直接的目的是为了进行翻译。从系统开发的角度,有时候即使进行一些与人类的语言习惯有差别的处理,仍然会带来性能的提升,比如在神经机器翻译中,在传统分词的基础上进一步使用双字节编码(Byte Pair Encoding,BPE)子词切分会使得机器翻译性能大幅提高。当然,自然语言处理中语言学信息的使用一直是学界关注的焦点。甚至关于语言学结构对机器翻译是否有作用这个问题也有争论。但是不能否认的是,无论是语言学的知识,还是计算机自己学习到的知识,对机器翻译都是有价值的。在后续章节会看到,这两种类型的知识对机器翻译帮助很大 \footnote[1]{笔者并不认同语言学结构对机器翻译的帮助有限,相反机器翻译需要更多的人类先验知识的指导。当然,这个问题不是这里讨论的重点。}
......@@ -67,21 +67,22 @@
\parinterval 本章将会对上述问题及求解问题的方法进行介绍。首先,会用一个例子给出统计建模的基本思路,之后会应用这种方法进行中文分词、语言建模和句法分析。
\vspace{-1em}
%--概率论基础-----------------------------------------
\section{概率论基础}\index{Chapter2.2}
\sectionnewpage
\section{概率论基础}
\parinterval 为了便于后续内容的介绍,首先对本书中使用的概率和统计学概念进行说明。
%--随机变量和概率---------------------
\subsection{随机变量和概率}\index{Chapter2.2.1}
\parinterval 在自然界中,很多{\small\bfnew{事件}}(Event)是否会发生是不确定的。例如,明天会下雨、掷一枚硬币是正面朝上、扔一个骰子的点数是5$\cdots\cdots$这类事件可能会发生也可能不会发生。通过大量的重复试验,能发现其具有某种规律性的事件叫做{\small\sffamily\bfseries{随机事件}}
\subsection{随机变量和概率}
\parinterval 在自然界中,很多{\small\bfnew{事件}}\index{事件}(Event)\index{Event}是否会发生是不确定的。例如,明天会下雨、掷一枚硬币是正面朝上、扔一个骰子的点数是5$\cdots\cdots$这类事件可能会发生也可能不会发生。通过大量的重复试验,能发现其具有某种规律性的事件叫做{\small\sffamily\bfseries{随机事件}}\index{随机事件}
\parinterval {\small\sffamily\bfseries{随机变量}}(Random Variable)是对随机事件发生可能状态的描述,是随机事件的数量表征。设$\Omega = \{ \omega \}$为一个随机试验的样本空间,$X=X(\omega)$就是定义在样本空间$\Omega$上的单值实数函数,即$X=X(\omega)$为随机变量,记为$X$。随机变量是一种能随机选取数值的变量,常用大写的英文字母或希腊字母表示,其取值通常用小写字母来表示。例如,用$A$ 表示一个随机变量,用$a$表示变量$A$的一个取值。根据随机变量可以选取的值,可以将其划分为离散变量和连续变量。
\parinterval {\small\sffamily\bfseries{随机变量}}\index{随机变量}(Random Variable)\index{Random Variable}是对随机事件发生可能状态的描述,是随机事件的数量表征。设$\Omega = \{ \omega \}$为一个随机试验的样本空间,$X=X(\omega)$就是定义在样本空间$\Omega$上的单值实数函数,即$X=X(\omega)$为随机变量,记为$X$。随机变量是一种能随机选取数值的变量,常用大写的英文字母或希腊字母表示,其取值通常用小写字母来表示。例如,用$A$ 表示一个随机变量,用$a$表示变量$A$的一个取值。根据随机变量可以选取的值,可以将其划分为离散变量和连续变量。
\parinterval 离散变量是指在其取值区间内可以被一一列举,总数有限并且可计算的数值变量。例如,用随机变量$X$代表某次投骰子出现的点数,点数只可能取1$\sim$6这6个整数,$X$就是一个离散变量。
\parinterval 连续变量是在其取值区间内连续取值,无法被一一列举,具有无限个取值的变量。例如,图书馆的开馆时间是8:30-22:00,用$X$代表某人进入图书馆的时间,时间的取值范围是[8:30,22:00]这个时间区间,$X$就是一个连续变量。
\parinterval {\small\bfnew{概率}}(Probability)是度量随机事件呈现其每个可能状态的可能性的数值,本质上它是一个测度函数\cite{mao-prob-book-2011}\cite{kolmogorov2018foundations}。概率的大小表征了随机事件在一次试验中发生的可能性大小。用$\textrm{P}(\cdot )$表示一个随机事件的可能性,即事件发生的概率。比如$\textrm{P}(\textrm{太阳从东方升起})$表示``太阳从东方升起的可能性'',同理,$\textrm{P}(A=B)$ 表示的就是``$A=B$'' 这件事的可能性。
\parinterval {\small\bfnew{概率}}\index{概率}(Probability)\index{Probability}是度量随机事件呈现其每个可能状态的可能性的数值,本质上它是一个测度函数\cite{mao-prob-book-2011}\cite{kolmogorov2018foundations}。概率的大小表征了随机事件在一次试验中发生的可能性大小。用$\textrm{P}(\cdot )$表示一个随机事件的可能性,即事件发生的概率。比如$\textrm{P}(\textrm{太阳从东方升起})$表示``太阳从东方升起的可能性'',同理,$\textrm{P}(A=B)$ 表示的就是``$A=B$'' 这件事的可能性。
\parinterval 在实际问题中,往往需要得到随机变量的概率值。但是,真实的概率值可能是无法准确知道的,这时就需要对概率进行{\small\sffamily\bfseries{估计}},得到的结果是概率的{\small\sffamily\bfseries{估计值}}(Estimate)。在概率论中,一个很简单的方法是利用相对频度作为概率的估计值。如果$\{x_1,x_2,\dots,x_n \}$是一个试验的样本空间,在相同情况下重复试验$N$次,观察到样本$x_i (1\leq{i}\leq{n})$的次数为$n (x_i )$,那么$x_i$在这$N$次试验中的相对频率是$\frac{n(x_i )}{N}$。当$N$越来越大时,相对频率也就越来越接近真实概率$\textrm{P}(x_i)$,即$\lim_{N \to \infty}\frac{n(x_i )}{N}=\textrm{P}(x_i)$。 实际上,很多概率模型都等同于相对频度估计,比如,对于一个服从多项式分布的变量的极大似然估计就可以用相对频度估计实现。
\parinterval 在实际问题中,往往需要得到随机变量的概率值。但是,真实的概率值可能是无法准确知道的,这时就需要对概率进行{\small\sffamily\bfseries{估计}}\index{估计},得到的结果是概率的{\small\sffamily\bfseries{估计值}}\index{估计值}(Estimate)\index{Estimate}。在概率论中,一个很简单的方法是利用相对频度作为概率的估计值。如果$\{x_1,x_2,\dots,x_n \}$是一个试验的样本空间,在相同情况下重复试验$N$次,观察到样本$x_i (1\leq{i}\leq{n})$的次数为$n (x_i )$,那么$x_i$在这$N$次试验中的相对频率是$\frac{n(x_i )}{N}$。当$N$越来越大时,相对频率也就越来越接近真实概率$\textrm{P}(x_i)$,即$\lim_{N \to \infty}\frac{n(x_i )}{N}=\textrm{P}(x_i)$。 实际上,很多概率模型都等同于相对频度估计,比如,对于一个服从多项式分布的变量的极大似然估计就可以用相对频度估计实现。
\parinterval 概率函数是用函数形式给出离散变量每个取值发生的概率,其实就是将变量的概率分布转化为数学表达形式。如果把$A$看做一个离散变量,$a$看做变量$A$的一个取值,那么$\textrm{P}(A)$被称作变量$A$的概率函数,$\textrm{P}(A=a)$被称作$A = a$的概率值,简记为$\textrm{P}(a)$。例如,在相同条件下掷一个骰子50次,用$A$表示投骰子出现的点数这个离散变量,$a_i$表示点数的取值,$\textrm{P}_i$表示$A=a_i$的概率值。下表为$A$的概率分布,给出了$A$的所有取值及其概率。
%表1--------------------------------------------------------------------
......@@ -99,7 +100,7 @@
\parinterval 除此之外,概率函数$\textrm{P}(\cdot)$还具有非负性、归一性等特点,非负性是指,所有的概率函数$\textrm{P}(\cdot)$都必须是大于等于0的数值,概率函数中不可能出现负数:$\forall{x},\textrm{P}{(x)}\geq{0}$。归一性,又称规范性,简单的说就是所有可能发生的事件的概率总和为1,即$\sum_{x}\textrm{P}{(x)}={1}$
\parinterval 对于离散变量$A$$\textrm{P}(A=a)$是个确定的值,可以表示事件$A=a$的可能性大小;而对于连续变量,求在某个定点处的概率是无意义的,只能求其落在某个取值区间内的概率。因此,用{\small\sffamily\bfseries{概率分布函数$F(x)$}}{\small\sffamily\bfseries{概率密度函数}}$f(x)$来统一描述随机变量取值的分布情况。概率分布函数$F(x)$表示取值小于某个值的概率,是概率的累加(或积分)形式。假设$A$是一个随机变量,$a$是任意实数,将函数$F(a)=\textrm{P}\{A\leq a\}$$-\infty<a<\infty $定义为$A$的分布函数。通过分布函数,可以清晰地表示任何随机变量的概率。
\parinterval 对于离散变量$A$$\textrm{P}(A=a)$是个确定的值,可以表示事件$A=a$的可能性大小;而对于连续变量,求在某个定点处的概率是无意义的,只能求其落在某个取值区间内的概率。因此,用{\small\sffamily\bfseries{概率分布函数}}\index{概率分布函数}$F(x)${\small\sffamily\bfseries{概率密度函数}}\index{概率密度函数}$f(x)$来统一描述随机变量取值的分布情况。概率分布函数$F(x)$表示取值小于某个值的概率,是概率的累加(或积分)形式。假设$A$是一个随机变量,$a$是任意实数,将函数$F(a)=\textrm{P}\{A\leq a\}$$-\infty<a<\infty $定义为$A$的分布函数。通过分布函数,可以清晰地表示任何随机变量的概率。
\parinterval 概率密度函数反映了变量在某个区间内的概率变化快慢,概率密度函数的值是概率的变化率,该连续变量的概率也就是对概率密度函数求积分得到的结果。设$f(x) \geq 0$是连续变量$X$的概率密度函数,$X$的分布函数就可以用如下公式定义:
......@@ -118,9 +119,8 @@ F(X)=\int_{-\infty}^x f(x)dx
\label{fig:2.2-1}
\end{figure}
%-------------------------------------------
\subsection{联合概率、条件概率和边缘概率}\index{Chapter2.2.2}
\parinterval {\small\sffamily\bfseries{联合概率}}(Joint Probability)是指多个事件同时发生,每个随机变量满足各自条件的概率,表示为$\textrm{P}(AB)${\small\sffamily\bfseries{条件概率}}(Conditional Probability)是指$A$$B$为任意的两个事件,在事件$A$已出现的前提下,事件$B$出现的概率,使用$\textrm{P}(B \mid A)$表示。通常来说,$\textrm{P}(B \mid A) \neq \textrm{P}(B)$
\subsection{联合概率、条件概率和边缘概率}
\parinterval {\small\sffamily\bfseries{联合概率}}\index{联合概率}(Joint Probability)\index{Joint Probability}是指多个事件同时发生,每个随机变量满足各自条件的概率,表示为$\textrm{P}(AB)${\small\sffamily\bfseries{条件概率}}\index{条件概率}(Conditional Probability)\index{Conditional Probability}是指$A$$B$为任意的两个事件,在事件$A$已出现的前提下,事件$B$出现的概率,使用$\textrm{P}(B \mid A)$表示。通常来说,$\textrm{P}(B \mid A) \neq \textrm{P}(B)$
\parinterval 贝叶斯法则是条件概率计算时的重要依据,条件概率可以表示为
......@@ -133,7 +133,7 @@ F(X)=\int_{-\infty}^x f(x)dx
\end{eqnarray}
%----------------------------------------------
\parinterval {\small\sffamily\bfseries{边缘概率}}(Marginal Probability)是和联合概率对应的,它指的是$\textrm{P}(X=a)$或$\textrm{P}(Y=b)$,即仅与单个随机变量有关的概率称为边缘概率。对于离散随机变量$X$和$Y$,如果知道$\textrm{P}(X,Y)$,则边缘概率$\textrm{P}(X)$可以通过求和的方式得到。对于$\forall x \in X $,有
\parinterval {\small\sffamily\bfseries{边缘概率}}\index{边缘概率}(Marginal Probability)\index{Marginal Probability}是和联合概率对应的,它指的是$\textrm{P}(X=a)$或$\textrm{P}(Y=b)$,即仅与单个随机变量有关的概率称为边缘概率。对于离散随机变量$X$和$Y$,如果知道$\textrm{P}(X,Y)$,则边缘概率$\textrm{P}(X)$可以通过求和的方式得到。对于$\forall x \in X $,有
\begin{eqnarray}
\textrm{P}(X=x)=\sum_{y} \textrm{P}(X=x,Y=y)
\label{eq:2.2-2}
......@@ -167,7 +167,7 @@ F(X)=\int_{-\infty}^x f(x)dx
\subsection{链式法则}\index{Chapter2.2.3}
\subsection{链式法则}
\parinterval 条件概率公式$\textrm{P}(a \mid b)=\textrm{P}(ab)/\textrm{P}(b)$反映了事件$b$发生的条件下事件$a$发生的概率。如果将其推广到三个事件$a$、$b$、$c$,为了计算$\textrm{P}(a,b,c)$,我们可以运用两次$\textrm{P}(a \mid b)=\textrm{P}(ab)/\textrm{P}(b)$,计算过程如下:
\begin{eqnarray}
\textrm{P}(a,b,c) & = & \textrm{P}(a \mid b,c)\textrm{P}(b,c) \nonumber \\
& = & \textrm{P}(a \mid b,c)\textrm{P}(b \mid c)\textrm{P}(c)
\end{eqnarray}
......@@ -213,9 +213,9 @@ F(X)=\int_{-\infty}^x f(x)dx
%---------------------------------------------
\subsection{贝叶斯法则}\index{Chapter2.2.4}
\subsection{贝叶斯法则}
\parinterval 首先介绍一下全概率公式:{\small\bfnew{全概率公式}}(Law of Total Probability)是概率论中重要的公式,它可以将一个复杂事件发生的概率分解成不同情况的小事件发生概率的和。这里先介绍一个概念——划分。集合$S$的一个划分$\{B_1,...,B_n\}$是指它们满足$\bigcup_{i=1}^n B_i=S \textrm{和}B_iB_j=\varnothing , i,j=1,...,n,i\neq j$。设$\{B_1,...,B_n\}$是$S$的一个划分,则事件$A$的全概率公式可以被描述为:
\parinterval 首先介绍一下全概率公式:{\small\bfnew{全概率公式}}\index{全概率公式}(Law of Total Probability)\index{Law of Total Probability}是概率论中重要的公式,它可以将一个复杂事件发生的概率分解成不同情况的小事件发生概率的和。这里先介绍一个概念——划分。集合$S$的一个划分$\{B_1,...,B_n\}$是指它们满足$\bigcup_{i=1}^n B_i=S \textrm{和}B_iB_j=\varnothing , i,j=1,...,n,i\neq j$。设$\{B_1,...,B_n\}$是$S$的一个划分,则事件$A$的全概率公式可以被描述为:
%---------------------------------------------
\begin{eqnarray}
\textrm{P}(A)=\sum_{i=1}^n \textrm{P}(A \mid B_i)\textrm{P}(B_i)
\end{eqnarray}
......@@ -242,7 +242,7 @@
%--------------------------------------------
\parinterval {\small\sffamily\bfseries{贝叶斯法则}}(Bayes' rule)是概率论中的一个经典公式,通常用于已知$\textrm{P}(A \mid B)$$\textrm{P}(B \mid A)$。可以表述为:设$\{B_1,...,B_n\}$$S$的一个划分,$A$为事件,则对于$i=1,...,n$,有如下公式
\parinterval {\small\sffamily\bfseries{贝叶斯法则}}\index{贝叶斯法则}(Bayes' rule)\index{Bayes' rule}是概率论中的一个经典公式,通常用于已知$\textrm{P}(A \mid B)$$\textrm{P}(B \mid A)$。可以表述为:设$\{B_1,...,B_n\}$$S$的一个划分,$A$为事件,则对于$i=1,...,n$,有如下公式
%--------------------------------------------
\begin{eqnarray}
\textrm{P}(B_i \mid A) & = & \frac {\textrm{P}(A B_i)} { \textrm{P}(A) } \nonumber \\
& = & \frac {\textrm{P}(A \mid B_i)\textrm{P}(B_i)} {\sum_{j=1}^n \textrm{P}(A \mid B_j)\textrm{P}(B_j)}
\end{eqnarray}
......@@ -262,11 +262,11 @@ F(X)=\int_{-\infty}^x f(x)dx
\parinterval 贝叶斯公式常用于根据已知的结果来推断使之发生的各因素的可能性。
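\parinterval 为了更直观地理解全概率公式和贝叶斯法则的计算过程,这里给出一个简单的数值示例(示例中的事件划分和概率值均为虚构的假设,仅用于演示计算步骤):
\begin{verbatim}
# 假设:句子来自两个领域之一,B1为新闻、B2为口语,构成一个划分
p_B = {'B1': 0.6, 'B2': 0.4}           # 先验概率P(B_i)
p_A_given_B = {'B1': 0.2, 'B2': 0.05}  # 似然P(A|B_i),A为句中出现某个词

# 全概率公式:P(A) = sum_i P(A|B_i) * P(B_i)
p_A = sum(p_A_given_B[b] * p_B[b] for b in p_B)

# 贝叶斯法则:P(B1|A) = P(A|B1) * P(B1) / P(A)
p_B1_given_A = p_A_given_B['B1'] * p_B['B1'] / p_A
print(p_A, p_B1_given_A)  # 0.14 0.857...
\end{verbatim}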
\subsection{KL距离和熵}\index{Chapter2.2.5}
\subsection{KL距离和熵}
\subsubsection{信息熵}\index{Chapter2.2.5.1}
\subsubsection{信息熵}
\parinterval {\small\sffamily\bfseries{}}(Entropy)是热力学中的一个概念,同时也是对系统无序性的一种度量标准。在自然语言处理领域也会使用到信息熵这一概念,比如描述文字的信息量大小。一条信息的信息量可以被看作是这条信息的不确定性。如果需要确认一件非常不确定甚至于一无所知的事情,那么需要理解大量的相关信息才能进行确认;同样的,如果对某件事已经非常确定,那么就不需要太多的信息就可以把它搞清楚。如下就是两个例子,
\parinterval {\small\sffamily\bfseries{}}\index{}(Entropy)\index{Entropy}是热力学中的一个概念,同时也是对系统无序性的一种度量标准。在自然语言处理领域也会使用到信息熵这一概念,比如描述文字的信息量大小。一条信息的信息量可以被看作是这条信息的不确定性。如果需要确认一件非常不确定甚至于一无所知的事情,那么需要理解大量的相关信息才能进行确认;同样的,如果对某件事已经非常确定,那么就不需要太多的信息就可以把它搞清楚。如下就是两个例子,
\begin{example}
确定性和不确定性的事件
......@@ -277,7 +277,7 @@ F(X)=\int_{-\infty}^x f(x)dx
\label{e.g:2.2-1}
\end{example}
\parinterval 在这两句话中,``太阳从东方升起''是一件确定性事件(在地球上),几乎不需要查阅更多信息就可以确认,因此这件事的信息熵相对较低;而``明天天气多云''这件事,需要关注天气预报,才能大概率确定这件事,它的不确定性很高,因而它的信息熵也就相对较高。因此,信息熵也是对事件不确定性的度量。进一步,可以定义{\small\bfnew{自信息}}(Self-information)来度量单个事件的信息量,一个事件$X$的自信息的表达式为:
\parinterval 在这两句话中,``太阳从东方升起''是一件确定性事件(在地球上),几乎不需要查阅更多信息就可以确认,因此这件事的信息熵相对较低;而``明天天气多云''这件事,需要关注天气预报,才能大概率确定这件事,它的不确定性很高,因而它的信息熵也就相对较高。因此,信息熵也是对事件不确定性的度量。进一步,可以定义{\small\bfnew{自信息}}\index{自信息}(Self-information)\index{Self-information}来度量单个事件的信息量,一个事件$X$的自信息的表达式为:
\begin{eqnarray}
\textrm{I}(x)=-\log\textrm{P}(x)
\label{eq:2.2-17}
\end{eqnarray}
......@@ -304,9 +304,9 @@ F(X)=\int_{-\infty}^x f(x)dx
\parinterval 一个分布的信息熵也就是从该分布中得到的一个事件的期望信息量。比如,$a$、$b$、$c$、$d$四支球队,四支队伍夺冠的概率分别是$P_1$、$P_2$、$P_3$、$P_4$,某个人对比赛不感兴趣但是又想知道哪支球队夺冠,通过使用二分法2次就可以确定哪支球队夺冠了。但假设这四支球队中$c$的实力可以碾压其他球队,那么猜1次就可以确定。所以对于前面这种情况,哪支球队夺冠的信息量较高,信息熵也相对较高;对于后面这种情况,因为结果是容易猜到的,信息量和信息熵也就相对较低。因此可以得知:分布越尖锐熵越低;分布越均匀熵越高。
\subsubsection{KL距离}\index{Chapter2.2.5.2}
\subsubsection{KL距离}
\parinterval 如果同一个随机变量$X$上有两个独立的概率分布P$(x)$和Q$(x)$,那么可以使用KL距离(``Kullback-Leibler''散度)来衡量这两个分布的不同,这种度量就是{\small\bfnew{相对熵}}(Relative Entropy)。其公式如下:
\parinterval 如果同一个随机变量$X$上有两个独立的概率分布P$(x)$和Q$(x)$,那么可以使用KL距离(``Kullback-Leibler''散度)来衡量这两个分布的不同,这种度量就是{\small\bfnew{相对熵}}\index{相对熵}(Relative Entropy)\index{Relative Entropy}。其公式如下:
\begin{eqnarray}
\textrm{D}_{\textrm{KL}}(\textrm{P}\parallel \textrm{Q}) & = & \sum_{x \in \textrm{X}} [ \textrm{P}(x)\log \frac{\textrm{P}(x) }{ \textrm{Q}(x) } ] \nonumber \\
& = & \sum_{x \in \textrm{X}} [ \textrm{P}(x)(\log \textrm{P}(x) - \log \textrm{Q}(x)) ]
\end{eqnarray}
......@@ -324,9 +324,9 @@ F(X)=\int_{-\infty}^x f(x)dx
\end{itemize}
\vspace{0.5em}
\subsubsection{交叉熵}\index{Chapter2.2.5.3}
\subsubsection{交叉熵}
\parinterval {\small\bfnew{交叉熵}}(Cross-entropy)是一个与KL距离密切相关的概念,它的公式是:
\parinterval {\small\bfnew{交叉熵}}\index{交叉熵}(Cross-entropy)\index{Cross-entropy}是一个与KL距离密切相关的概念,它的公式是:
\begin{eqnarray}
\textrm{H}(\textrm{P},\textrm{Q})=-\sum_{x \in \textrm{X}} [\textrm{P}(x) \log \textrm{Q}(x) ]
\label{eq:2.2-20}
\end{eqnarray}
......@@ -335,9 +335,10 @@ F(X)=\int_{-\infty}^x f(x)dx
\parinterval 结合相对熵公式可知,交叉熵是KL距离公式中的右半部分。因此,求关于Q的交叉熵的最小值等价于求KL距离的最小值。从实践的角度来说,交叉熵与KL距离的目的相同:都是用来描述两个分布的差异,由于交叉熵计算上更加直观方便,因此在机器翻译中被广泛应用。
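\parinterval 基于上述定义,信息熵、KL距离和交叉熵都可以按公式直接实现。下面是一个示意性的Python草图,假设P和Q是定义在同一离散事件集合上的概率分布,且Q在P取值非零处也非零:
\begin{verbatim}
import math

def entropy(p):
    # 信息熵:H(P) = -sum_x P(x) * log P(x)
    return -sum(px * math.log(px) for px in p.values() if px > 0)

def kl_divergence(p, q):
    # KL距离(相对熵):D_KL(P||Q) = sum_x P(x) * log(P(x)/Q(x))
    return sum(px * math.log(px / q[x]) for x, px in p.items() if px > 0)

def cross_entropy(p, q):
    # 交叉熵:H(P,Q) = -sum_x P(x) * log Q(x) = H(P) + D_KL(P||Q)
    return -sum(px * math.log(q[x]) for x, px in p.items() if px > 0)

P = {'a': 0.5, 'b': 0.25, 'c': 0.25}
Q = {'a': 0.25, 'b': 0.5, 'c': 0.25}
print(entropy(P), kl_divergence(P, Q), cross_entropy(P, Q))
\end{verbatim}
\noindent 运行这段代码可以验证:交叉熵的数值恰好等于信息熵与KL距离之和。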
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{中文分词}\index{Chapter2.3}
\sectionnewpage
\section{中文分词}
\parinterval 对于机器翻译系统而言,输入的是已经切分好的单词序列,而不是原始的字符串(图\ref{fig:2.3-1})。比如,对于一个中文句子,单词之间是没有间隔的,因此需要把一个个的单词切分出来,这样机器翻译系统可以区分不同的翻译单元。甚至,可以对语言学上的单词进行进一步切分,得到词片段序列(比如:中国人$\to$中国/人)。可以把上述过程看作是一种{\small\sffamily\bfseries{分词}}(Segmentation)过程,即:将一个输入的自然语言字符串切割成单元序列(token序列),每个单元都对应可以处理的最小单位。
\parinterval 对于机器翻译系统而言,输入的是已经切分好的单词序列,而不是原始的字符串(图\ref{fig:2.3-1})。比如,对于一个中文句子,单词之间是没有间隔的,因此需要把一个个的单词切分出来,这样机器翻译系统可以区分不同的翻译单元。甚至,可以对语言学上的单词进行进一步切分,得到词片段序列(比如:中国人$\to$中国/人)。可以把上述过程看作是一种{\small\sffamily\bfseries{分词}}\index{分词}(Segmentation)\index{Segmentation}过程,即:将一个输入的自然语言字符串切割成单元序列(token序列),每个单元都对应可以处理的最小单位。
%----------------------------------------------
% 图2.7
......@@ -349,7 +350,7 @@ F(X)=\int_{-\infty}^x f(x)dx
\end{figure}
%-------------------------------------------
%\vspace{-0.5em}
\parinterval 分词得到的单元序列可以是语言学上的词序列,也可以是根据其他方式定义的基本处理单元。在本章中,可以把分词得到的一个个单元称为{\small\bfnew{单词}}(Word),或{\small\bfnew{}},尽管这些单元可以不是语言学上的完整单词。而这个过程也被称作{\small\bfnew{词法分析}}(Lexical Analysis)。除了汉语,词法分析在日语、泰语等单词之间无明确分割符的语言中有着广泛的应用,芬兰语、维吾尔语等一些形态学十分丰富的语言,也需要使用词法分析来解决复杂的词尾、词缀变化等形态学变化。
\parinterval 分词得到的单元序列可以是语言学上的词序列,也可以是根据其他方式定义的基本处理单元。在本章中,可以把分词得到的一个个单元称为{\small\bfnew{单词}}\index{单词}(Word)\index{Word},或{\small\bfnew{}}\index{},尽管这些单元可以不是语言学上的完整单词。而这个过程也被称作{\small\bfnew{词法分析}}\index{词法分析}(Lexical Analysis)\index{Lexical Analysis}。除了汉语,词法分析在日语、泰语等单词之间无明确分割符的语言中有着广泛的应用,芬兰语、维吾尔语等一些形态学十分丰富的语言,也需要使用词法分析来解决复杂的词尾、词缀变化等形态学变化。
\parinterval 在机器翻译中,分词系统的好坏往往会决定译文的质量。分词的目的是定义系统处理的基本单元,那么什么叫做``词''呢?关于词的定义有很多,比如:
......@@ -386,8 +387,7 @@ F(X)=\int_{-\infty}^x f(x)dx
\parinterval 词法分析的重要性在自然语言处理领域已经有共识。如果切分的颗粒度很大,获得的单词的歧义也很小,比如``中华人民共和国''整体作为一个单词不存在歧义,而如果单独的一个单词``国'',可能会代表``中国''、``美国''等不同的国家,存在歧义。但是随着切分颗粒度的增大,特定单词出现的频度也随之降低,低频词容易和噪音混淆,系统很难进行学习。因此,处理这些问题并开发适合翻译任务的分词系统是机器翻译的第一步。
\subsection{基于词典的分词方法}\index{Chapter2.3.1}
\subsection{基于词典的分词方法}
\parinterval 然而,计算机并不能像人类一样在概念上理解``词'',因此需要使用其他方式让计算机可以进行分词。一个最简单的方法就是给定一个词典,在这个词典中出现的汉字组合就是所定义的``词''。也就是,通过一个词典定义一个标准,符合这个标准定义的字符串都是合法的``词''。
\parinterval 在使用基于词典的分词方法时,只需预先加载词典到计算机中,扫描输入句子,查询每个词串是否出现在词典中。如图\ref{fig:2.3-2} 所示,有一个包含六个词的词典,给定输入句子`` 确实现在物价很高''后,分词系统自左至右遍历输入句子的每个字,发现词串``确实''在词典中出现,说明``确实''是一个``词'',进行分词操作并在切分该``词''之后重复这个过程。
......@@ -417,19 +417,19 @@ F(X)=\int_{-\infty}^x f(x)dx
\parinterval 基于词典的分词方法是典型的基于规则的方法,完全依赖于人工给定的词典。在遇到歧义时,需要人工定义消除歧义的规则,比如,可以自左向右扫描每次匹配最长的单词,这是一种简单的启发式的消歧策略。图\ref{fig:2.3-2}中的例子实际上就是使用这种策略得到的分词结果。但是,启发式的消歧方法对人工的依赖程度很高,而且启发式规则也不能处理所有的情况。所以说简单的基于词典的方法还不能很好地解决分词问题。
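\parinterval 为了更具体地说明这种自左向右匹配最长单词的启发式策略,下面给出一个正向最大匹配的Python草图(其中的词典和句子均为示例,max\_len表示词典中最长词的长度,这里假设为4):
\begin{verbatim}
def forward_max_match(sentence, vocab, max_len=4):
    # 自左向右扫描,每个位置贪心地匹配词典中最长的词
    words, i = [], 0
    while i < len(sentence):
        for j in range(min(len(sentence), i + max_len), i, -1):
            if sentence[i:j] in vocab or j == i + 1:
                # 匹配到词典中的词;若完全无法匹配则退化为单字切分
                words.append(sentence[i:j])
                i = j
                break
    return words

vocab = {'确实', '现在', '物价', '很', '高'}
print(forward_max_match('确实现在物价很高', vocab))
# 输出:['确实', '现在', '物价', '很', '高']
\end{verbatim}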
\subsection{基于统计的分词方法}\label{sec2:statistical-seg}\index{Chapter2.3.2}
\subsection{基于统计的分词方法}\label{sec2:statistical-seg}
\parinterval 既然基于词典的方法有很多问题,那么就需要一种更为有效的方法。在上文中提到,想要搭建一个分词系统,需要让计算机知道什么是``词'',那么可不可以给出已经切分好的分词数据,让计算机在这些数据中学习到规律呢?答案是肯定的,利用``数据''来让计算机明白``词''的定义,让计算机直接在数据中学到知识,这就是常说的数据驱动的方法。这个过程也是一个典型的基于统计建模的学习过程。
\subsubsection{统计模型的学习与推断}\index{Chapter2.3.2.1}
\subsubsection{统计模型的学习与推断}
\parinterval 在分词任务中,数据驱动主要指用已经分词切分好的数据``喂''给系统,这个数据也被称作{\small\bfnew{标注数据}}(Annotated Data)。在获得标注数据后,系统自动学习一个统计模型来描述分词的过程,而这个模型会把分词的`` 知识''作为参数保存在模型中。当送入一个新的需要分词的句子时,可以利用学习到的模型对所有可能的分词结果进行预测,并进行概率化的描述,最终选择概率最大的结果作为输出。这个方法就是基于统计的分词方法。具体来说,可以分为两个步骤:
\parinterval 在分词任务中,数据驱动主要指用已经分词切分好的数据``喂''给系统,这个数据也被称作{\small\bfnew{标注数据}}\index{标注数据}(Annotated Data)\index{Annotated Data}。在获得标注数据后,系统自动学习一个统计模型来描述分词的过程,而这个模型会把分词的`` 知识''作为参数保存在模型中。当送入一个新的需要分词的句子时,可以利用学习到的模型对所有可能的分词结果进行预测,并进行概率化的描述,最终选择概率最大的结果作为输出。这个方法就是基于统计的分词方法。具体来说,可以分为两个步骤:
\vspace{0.5em}
\begin{itemize}
\item {\small\bfnew{训练}}(Training)。利用标注数据,对统计模型的参数进行学习。
\item {\small\bfnew{训练}}\index{训练}(Training)\index{Training}。利用标注数据,对统计模型的参数进行学习。
\vspace{0.5em}
\item {\small\bfnew{推断}}(Inference)。利用学习到的模型和参数,对新的句子进行切分。
\item {\small\bfnew{推断}}\index{推断}(Inference)\index{Inference}。利用学习到的模型和参数,对新的句子进行切分。
\end{itemize}
\vspace{0.5em}
......@@ -447,7 +447,7 @@ F(X)=\int_{-\infty}^x f(x)dx
\parinterval\ref{fig:2.3-4} 给出了一个基于统计建模的汉语分词实例。左侧是标注数据,其中每个句子是已经经过人工标注的分词结果(单词用斜杠分开)。之后,建立一个统计模型,记为$\textrm{P}(\cdot)$。模型通过在标注数据上的学习来对问题进行描述,即学习$\textrm{P}(\cdot)$。最后,对于新的未分词的句子,使用模型$\textrm{P}(\cdot)$对每个可能的切分方式进行概率估计,之后选择概率最高的切分结果输出。
\vspace{-0.5em}
\subsubsection{掷骰子游戏}\index{Chapter2.3.2.2}
\subsubsection{掷骰子游戏}
\parinterval 上述过程的核心在于从数据中学习一种对分词现象的统计描述,即学习函数$\textrm{P}(\cdot)$。如何让计算机利用分词好的数据学习到分词的知识呢?可以先看一个有趣的实例,用生活中比较常见的掷骰子来说,掷一个骰子,玩家猜一个数字,猜中就算赢,按照一般的常识,随便选一个数字,获胜的概率是一样的,即所有选择的获胜概率仅是$1/6$。因此这个游戏玩家很难获胜,除非运气很好。假设进行一次游戏,玩家随便选了一个数字,比如是1,投掷30次骰子,得到命中率$7/30 > 1/6$,还不错。
\vspace{-0.5em}
......@@ -482,7 +482,7 @@ F(X)=\int_{-\infty}^x f(x)dx
\label{eq:2.3-2}
\end{eqnarray}
\noindent 这里,$\theta_1 \sim \theta_5$可以被看作是模型的参数,因此这个模型的自由度是5。对于这样的模型,参数确定了,模型也就确定了。但是,新的问题来了,在定义骰子每个面的概率后,如何求出具体的值呢?一种常用的方法是,从大量实例中学习模型参数,这个方法也是常说的{\small\bfnew{参数估计}}(Parameter Estimation)。可以将这个不均匀的骰子先实验性的掷很多次,这可以被看作是独立同分布的若干次采样,比如$X$ 次,发现``1'' 出现$X_1$ 次,``2'' 出现$X_2$ 次,以此类推,得到了各个面出现的次数。假设掷骰子中每个面出现的概率符合多项式分布,通过简单的概率论知识可以知道每个面出现概率的极大似然估计为:
\noindent 这里,$\theta_1 \sim \theta_5$可以被看作是模型的参数,因此这个模型的自由度是5。对于这样的模型,参数确定了,模型也就确定了。但是,新的问题来了,在定义骰子每个面的概率后,如何求出具体的值呢?一种常用的方法是,从大量实例中学习模型参数,这个方法也是常说的{\small\bfnew{参数估计}}\index{参数估计}(Parameter Estimation)\index{Parameter Estimation}。可以将这个不均匀的骰子先实验性的掷很多次,这可以被看作是独立同分布的若干次采样,比如$X$ 次,发现``1'' 出现$X_1$ 次,``2'' 出现$X_2$ 次,以此类推,得到了各个面出现的次数。假设掷骰子中每个面出现的概率符合多项式分布,通过简单的概率论知识可以知道每个面出现概率的极大似然估计为:
\begin{eqnarray}
\textrm{P(``i'')}=\frac {X_i}{X}
\label{eq:2.3-3}
\end{eqnarray}
......@@ -512,9 +512,9 @@ F(X)=\int_{-\infty}^x f(x)dx
\end{figure}
%-------------------------------------------
\parinterval 通过上面这个掷骰子的游戏,可以得到一个道理:{\small\sffamily\bfseries{上帝是不公平的}}。因为在``公平''的世界中,没有任何一个模型可以学到有价值的事情。从机器学习的角度来看,所谓的``不公平''实际上是客观事物中蕴含的一种{\small\sffamily\bfseries{偏置}}(Bias),也就是很多事情天然就对某些情况有倾向。而图像处理、自然语言处理等问题中绝大多数都存在着偏置。比如,我们翻译一个英文单词的时候,它最可能的翻译结果往往就是那几个词。设计统计模型的目的正是要学习这种偏置,之后利用这种偏置对新的问题做出足够好的决策。
\parinterval 通过上面这个掷骰子的游戏,可以得到一个道理:{\small\sffamily\bfseries{上帝是不公平的}}。因为在``公平''的世界中,没有任何一个模型可以学到有价值的事情。从机器学习的角度来看,所谓的``不公平''实际上是客观事物中蕴含的一种{\small\sffamily\bfseries{偏置}}\index{偏置}(Bias)\index{Bias},也就是很多事情天然就对某些情况有倾向。而图像处理、自然语言处理等问题中绝大多数都存在着偏置。比如,我们翻译一个英文单词的时候,它最可能的翻译结果往往就是那几个词。设计统计模型的目的正是要学习这种偏置,之后利用这种偏置对新的问题做出足够好的决策。
\subsubsection{全概率分词方法}\index{Chapter2.3.2.3}
\subsubsection{全概率分词方法}
\parinterval 回到分词的问题上。与掷骰子游戏类似,分词系统的统计学原理也可以这么理解:假设有已经人工分词好的句子,其中每个单词的出现就好比掷一个巨大的骰子,与前面的例子中有所不同的是:
......@@ -612,14 +612,15 @@ F(X)=\int_{-\infty}^x f(x)dx
\parinterval 当然,真正的分词系统还需要解决很多其他问题,比如使用动态规划等方法高效搜索最优解以及如何处理未见过的词等等,由于本节的重点是介绍中文分词的基础方法和统计建模思想,因此无法覆盖所有中文分词的技术内容,有兴趣的读者可以参考\ref{sec2:summary}节的相关文献做进一步深入研究。
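\parinterval 作为本节内容的一个补充,下面的Python草图展示了全概率分词的核心思想:枚举句子的所有切分方式,用1-gram概率的乘积为每种切分打分,并选取概率最大的结果。示例中的概率值是虚构的,穷举搜索也只适用于很短的句子,实际系统会使用动态规划等高效算法:
\begin{verbatim}
import math

# 假设的1-gram概率(实际中由标注数据的相对频度估计得到)
p_word = {'确实': 0.001, '现在': 0.002, '物价': 0.0005,
          '很': 0.01, '高': 0.008}

def seg_score(words):
    # 切分的得分:log P(w_1...w_m) = sum_i log P(w_i),用对数避免下溢
    return sum(math.log(p_word.get(w, 1e-10)) for w in words)

def best_segmentation(sentence):
    # 递归枚举所有切分方式,返回得分最高的一种
    if not sentence:
        return [], 0.0
    best, best_score = None, -float('inf')
    for k in range(1, len(sentence) + 1):
        rest, score = best_segmentation(sentence[k:])
        score += seg_score([sentence[:k]])
        if score > best_score:
            best, best_score = [sentence[:k]] + rest, score
    return best, best_score

print(best_segmentation('确实现在物价很高')[0])
\end{verbatim}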
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{$n$-gram语言模型 }\index{Chapter2.4}
\sectionnewpage
\section{$n$-gram语言模型 }
\parinterval 在基于统计的汉语分词模型中,我们通过``大题小做''的技巧,利用独立性假设把整个句子的单词切分概率转化为每个单个词出现概率的乘积。这里,每个单词也被称作1-gram(或uni-gram),而1-gram概率的乘积实际上也是在度量词序列出现的可能性(记为$\textrm{P}(w_1 w_2...w_m)$)。这种计算整个单词序列概率$\textrm{P}(w_1 w_2...w_m)$的方法被称为统计语言模型。1-gram语言模型是最简单的一种语言模型,它没有考虑任何的上下文。很自然的一个问题是:能否考虑上下文信息构建更强大的语言模型,进而得到更准确的分词结果。下面将进一步介绍更加通用的$n$-gram语言模型,它在机器翻译及其他自然语言处理任务中有更加广泛的应用。
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{建模}\index{Chapter2.4.1}
\subsection{建模}
\parinterval {\small\sffamily\bfseries{语言模型}}(Language Model)的目的是描述文字序列出现的规律。这个对问题建模的过程被称作{\small\sffamily\bfseries{语言建模}}(Language Modeling)。如果使用统计建模的方式,语言模型可以被定义为计算$\textrm{P}(w_1 w_2...w_m)$的问题,也就是计算整个词序列$w_1 w_2...w_m$出现的可能性大小。具体定义如下,
\parinterval {\small\sffamily\bfseries{语言模型}}\index{语言模型}(Language Model)\index{Language Model}的目的是描述文字序列出现的规律。这个对问题建模的过程被称作{\small\sffamily\bfseries{语言建模}}\index{语言建模}(Language Modeling)\index{Language Modeling}。如果使用统计建模的方式,语言模型可以被定义为计算$\textrm{P}(w_1 w_2...w_m)$的问题,也就是计算整个词序列$w_1 w_2...w_m$出现的可能性大小。具体定义如下,
%----------------------------------------------
% 定义3.1
......@@ -667,7 +668,7 @@ F(X)=\int_{-\infty}^x f(x)dx
\vspace{0.3em}
\begin{itemize}
\item {\small\bfnew{极大似然估计}}。直接利用词序列在训练数据中出现的频度计算出$\textrm{P}(w_m|w_{m-n+1}$\\$... w_{m-1})$
\item {\small\bfnew{极大似然估计}}\index{极大似然估计}。直接利用词序列在训练数据中出现的频度计算出$\textrm{P}(w_m|w_{m-n+1}$\\$... w_{m-1})$,如下式所示(本列表之后还给出了一段对应的示意代码)
\begin{eqnarray}
\textrm{P}(w_m|w_{m-n+1}...w_{m-1})=\frac{\textrm{count}(w_{m-n+1}...w_m)}{\textrm{count}(w_{m-n+1}...w_{m-1})}
\label{eq:2.4-3}
......@@ -676,7 +677,7 @@ F(X)=\int_{-\infty}^x f(x)dx
其中,$\textrm{count}(\cdot)$是在训练数据中统计频次的函数。
\vspace{0.3em}
\item {\small\bfnew{人工神经网络方法}}。构建一个人工神经网络估计$\textrm{P}(w_m|w_{m-n+1} ... w_{m-1})$的值,比如,可以构建一个前馈神经网络来对$n$-gram进行建模。
\item {\small\bfnew{人工神经网络方法}}\index{人工神经网络方法}。构建一个人工神经网络估计$\textrm{P}(w_m|w_{m-n+1} ... w_{m-1})$的值,比如,可以构建一个前馈神经网络来对$n$-gram进行建模。
\end{itemize}
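\parinterval 下面用一段示意性的Python代码展示2-gram概率的极大似然估计(假设性实现,语料为举例而设;真实系统还需要配合后文介绍的平滑等技术):
\begin{verbatim}
# 示意:2-gram 概率的极大似然估计
from collections import Counter

corpus = [["确实", "现在", "数据", "很", "多"],
          ["现在", "数据", "很", "重要"]]       # 假设的已分词语料

bigram = Counter()
history = Counter()
for sent in corpus:
    for i in range(len(sent) - 1):
        bigram[(sent[i], sent[i + 1])] += 1    # count(w_{m-1} w_m)
        history[sent[i]] += 1                  # count(w_{m-1})

def p(w_prev, w):
    return bigram[(w_prev, w)] / history[w_prev]

print(p("现在", "数据"))                        # 输出 1.0
\end{verbatim}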
\vspace{0.3em}
......@@ -694,7 +695,7 @@ F(X)=\int_{-\infty}^x f(x)dx
\parinterval$n$-gram语言模型为代表的统计语言模型的应用非常广泛。除了分词,在文本生成、信息检索、摘要等自然语言处理任务中,语言模型都有举足轻重的地位。包括近些年非常受关注的预训练模型,本质上也是统计语言模型。这些技术都会在后续章节进行介绍。值得注意的是,统计语言模型为解决自然语言处理问题提供了一个非常好的建模思路,即:把整个序列生成的问题转化为逐个生成单词的问题。很快我们就会看到,这种建模方式会被广泛的用于机器翻译建模,在统计机器翻译和神经机器翻译中都会有明显的体现。
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{未登录词和平滑算法}\label{sec2:smoothing}\index{Chapter2.4.2}
\subsection{未登录词和平滑算法}\label{sec2:smoothing}
\parinterval 在式\ref{eq:2.4-4}所示的例子中,如果语料中从没有``确实''和``现在''两个词连续出现的情况,那么使用2-gram计算切分``确实/现在/数据/很/多''的概率时,会出现如下情况
\begin{eqnarray}
......@@ -704,7 +705,7 @@ F(X)=\int_{-\infty}^x f(x)dx
\label{eq:2.4-5}
\end{eqnarray}
\parinterval 显然,这个结果是不能接受的。因为即使语料中没有 ``确实''和``现在''两个词连续出现,但是这种搭配也是客观存在的。这时简单的用极大似然估计得到概率却是0,导致整个切分结果的概率为0。 更常见的问题是那些根本没有出现在词表中的词,称为{\small\sffamily\bfseries{未登录词}}(Out-of-Vocabulary Word,OOV Word),比如一些生僻词,可能模型训练阶段从来没有看到过,这时模型仍然会给出0 概率。图\ref{fig:2.4-1}展示了一个真实语料库中词语出现频度的分布,可以看到绝大多数词都是低频词。
\parinterval 显然,这个结果是不能接受的。即使语料中没有``确实''和``现在''两个词连续出现的情况,这种搭配也是客观存在的,这时简单地用极大似然估计得到的概率却是0,导致整个切分结果的概率为0。更常见的问题是那些根本没有出现在词表中的词,称为{\small\sffamily\bfseries{未登录词}}\index{未登录词}(Out-of-Vocabulary Word,OOV Word)\index{Out-of-Vocabulary Word,OOV Word},比如一些生僻词,模型在训练阶段可能从来没有看到过,这时模型仍然会给出0概率。图\ref{fig:2.4-1}展示了一个真实语料库中词语出现频度的分布,可以看到绝大多数词都是低频词。
%----------------------------------------------
% 图2.18
......@@ -721,9 +722,9 @@ F(X)=\int_{-\infty}^x f(x)dx
\parinterval 语言模型使用的平滑算法有很多。在本节中,主要介绍三种平滑方法:加法平滑法、古德-图灵估计法和Kneser-Ney平滑。这些方法也可以被应用到其他任务的概率平滑操作中。
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{加法平滑方法}\index{Chapter2.4.2.1}
\subsubsection{加法平滑方法}
\parinterval {\small\bfnew{加法平滑}}(Additive Smoothing)是一种简单的平滑技术。本小节首先介绍这一方法,希望通过它了解平滑算法的思想。通常情况下,系统研发者会利用采集到的语料库来模拟真实的全部语料库。当然,没有一个语料库能覆盖所有的语言现象。常见的一个问题是,使用的语料无法涵盖所有的词汇。因此,直接依据这样语料所获得的统计信息来获取语言模型就会产生偏差。假设依据某语料$C$ (从未出现`` 确实 现在''二元语法),评估一个已经分好词的句子$S$ =``确实/现在/物价/很/高''的概率。当计算``确实/现在''的概率时,$\textrm{P}(S) = 0$。显然这个结果是不合理的。
\parinterval {\small\bfnew{加法平滑}}\index{加法平滑}(Additive Smoothing)\index{Additive Smoothing}是一种简单的平滑技术。本小节首先介绍这一方法,希望通过它了解平滑算法的思想。通常情况下,系统研发者会利用采集到的语料库来模拟真实的全部语料库。当然,没有一个语料库能覆盖所有的语言现象。常见的一个问题是,使用的语料无法涵盖所有的词汇。因此,直接依据这样的语料所获得的统计信息来构建语言模型就会产生偏差。假设依据某语料$C$(其中从未出现``确实\ 现在''这个二元语法),评估一个已经分好词的句子$S$ =``确实/现在/物价/很/高''的概率。当计算``确实/现在''的概率时,$\textrm{P}(S) = 0$。显然这个结果是不合理的。
\parinterval 加法平滑方法假设每个$n$-gram出现的次数比实际统计次数多$\theta$次,$0 \le \theta\le 1$。这样,计算概率的时候分子部分不会为0。重新计算$\textrm{P}(\textrm{现在}|\textrm{确实})$,可以得到:
......@@ -748,9 +749,9 @@ F(X)=\int_{-\infty}^x f(x)dx
%-------------------------------------------
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{古德-图灵估计法}\index{Chapter2.4.2.2}
\subsubsection{古德-图灵估计法}
\parinterval {\small\bfnew{古德-图灵估计法}}(Good-Turing Estimate)是图灵(Alan Turing)和他的助手古德(I.J.Good)开发的,作为他们在二战期间破解德国密码机Enigma所使用的方法的一部分,在1953 年古德将其发表,这一方法也是很多平滑算法的核心,其基本思路是:把非零的$n$元语法单元的概率降低匀给一些低概率$n$元语法单元,以减小最大似然估计与真实概率之间的偏离\cite{good1953population}\cite{gale1995good}
\parinterval {\small\bfnew{古德-图灵估计法}}\index{古德-图灵估计法}(Good-Turing Estimate)\index{Good-Turing Estimate}是图灵(Alan Turing)和他的助手古德(I.J.Good)开发的,最初是他们在二战期间破解德国密码机Enigma所使用方法的一部分,1953年由古德发表。这一方法也是很多平滑算法的核心,其基本思路是:把非零的$n$元语法单元的概率调低,将省出的概率量匀给一些零概率或低概率的$n$元语法单元,以减小最大似然估计与真实概率之间的偏离\cite{good1953population}\cite{gale1995good}。
\parinterval 假定在语料库中出现$r$次的$n$-gram有$n_r$个,特别地,出现0次的$n$-gram(即未登录词及词串)的个数为$n_0$。语料库中全部词语的个数为$N$,显然
\begin{eqnarray}
......@@ -814,7 +815,7 @@ N & = & \sum_{r=0}^{\infty}{r^{*}n_r} \nonumber \\
\parinterval$r$很大的时候经常会出现$n_{r+1}=0$的情况,而且这时$n_r$也会有噪音存在。通常,简单的古德-图灵方法可能无法很好的处理这种复杂的情况,不过古德-图灵方法仍然是其他一些平滑方法的基础。
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{Kneser-Ney平滑方法}\index{Chapter2.4.2.3}
\subsubsection{Kneser-Ney平滑方法}
\parinterval Kneser-Ney平滑方法是由R.Kneser和H.Ney于1995年提出的用于计算$n$元语法概率分布的方法\cite{kneser1995improved}\cite{chen1999empirical},并被广泛认为是最有效的平滑方法。这种平滑方法改进了绝对折扣平滑(Absolute Discounting)中与高阶分布相结合的低阶分布的计算方法,使不同阶分布得到充分的利用。这种算法也综合利用了其他多种平滑算法的思想。
......@@ -889,14 +890,14 @@ c_{\textrm{KN}}(\cdot) = \left\{\begin{array}{ll}
\parinterval Kneser-Ney平滑是很多语言模型工具的基础\cite{wang-etal-2018-niutrans}\cite{heafield-2011-kenlm}\cite{stolcke2002srilm}。还有很多以此为基础衍生出来的算法,感兴趣的读者可以通过参考文献自行了解\cite{parsing2009speech}\cite{ney1994structuring}\cite{chen1999empirical}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{句法分析(短语结构分析)}\index{Chapter2.5}
\sectionnewpage
\section{句法分析(短语结构分析)}
\parinterval 通过前面两节的内容,已经了解什么叫做``词''、如何对分词问题进行统计建模。同时也了解了如何对词序列的生成进行概率描述。无论是分词还是语言模型都是句子浅层词串信息的一种表示。对于一个自然语言句子来说,它更深层次的结构信息可以通过句法信息来描述,而句法信息也是机器翻译和自然语言处理其他任务中常用的知识之一。
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{句子的句法树表示}\index{Chapter2.5.1}
\subsection{句子的句法树表示}
\parinterval {\small\sffamily\bfseries{句法}}(Syntax)是研究句子的每个组成部分和它们之间的组合方式。一般来说,句法和语言是相关的,比如,英文是主谓宾结构,而日语是主宾谓结构。因此不同的语言也会有不同的句法描述方式。自然语言处理领域最常用的两种句法分析形式是{\small\sffamily\bfseries{短语结构分析}}(Phrase Structure Parsing)和{\small\sffamily\bfseries{依存分析}}(Dependency Parsing)。图\ref{fig:2.5-1}展示了这两种的句法表示形式的实例。其中,左侧是短语结构树。它描述的是短语的结构功能,比如``吃''是动词(记为VV),``鱼''是名词(记为NN),``吃\ 鱼''组成动词短语,这个短语再与``喜欢''这一动词组成新的动词短语。短语结构树的每个子树都是一个句法功能单元,比如,子树VP(VV(吃) NN(鱼))就表示了``吃\ 鱼''这个动词短语的结构,其中子树根节点VP是句法功能标记。短语结构树利用嵌套的方式描述了语言学的功能。短语结构树中,每个词都有词性(或词类),不同的词或者短语可以组成名动结构、动宾结构等语言学短语结构。短语结构分析一般也被称为{\small\bfnew{成分分析}}(Constituency Parsing)或{\small\bfnew{完全分析}}(Full Parsing)
\parinterval {\small\sffamily\bfseries{句法}}\index{句法}(Syntax)\index{Syntax}研究的是句子的每个组成部分以及它们之间的组合方式。一般来说,句法和语言是相关的,比如,英文是主谓宾结构,而日语是主宾谓结构。因此不同的语言也会有不同的句法描述方式。自然语言处理领域最常用的两种句法分析形式是{\small\sffamily\bfseries{短语结构分析}}\index{短语结构分析}(Phrase Structure Parsing)\index{Phrase Structure Parsing}和{\small\sffamily\bfseries{依存分析}}\index{依存分析}(Dependency Parsing)\index{Dependency Parsing}。图\ref{fig:2.5-1}展示了这两种句法表示形式的实例。其中,左侧是短语结构树。它描述的是短语的结构功能,比如``吃''是动词(记为VV),``鱼''是名词(记为NN),``吃\ 鱼''组成动词短语,这个短语再与``喜欢''这一动词组成新的动词短语。短语结构树的每个子树都是一个句法功能单元,比如,子树VP(VV(吃) NN(鱼))就表示了``吃\ 鱼''这个动词短语的结构,其中子树根节点VP是句法功能标记。短语结构树利用嵌套的方式描述了语言学的功能。短语结构树中,每个词都有词性(或词类),不同的词或者短语可以组成名动结构、动宾结构等语言学短语结构。短语结构分析一般也被称为{\small\bfnew{成分分析}}\index{成分分析}(Constituency Parsing)\index{Constituency Parsing}或{\small\bfnew{完全分析}}\index{完全分析}(Full Parsing)\index{Full Parsing}。
%----------------------------------------------
% 图2.5.1.1
......@@ -910,7 +911,7 @@ c_{\textrm{KN}}(\cdot) = \left\{\begin{array}{ll}
\parinterval\ref{fig:2.5-1}右侧展示的是另一种句法结构,被称作依存句法树。依存句法树表示了句子中单词和单词之间的依存关系。比如,从这个例子可以了解,``猫''依赖``喜欢'',``吃''依赖``喜欢'',``鱼''依赖``吃''。
\parinterval 短语结构树和依存句法树的结构和功能有很大不同。短语结构树的叶子节点是单词,中间节点是词性或者短语句法标记。在短语结构分析中,通常把单词称作{\small\bfnew{终结符}}(Terminal),把词性称为{\small\bfnew{预终结符}}(Pre-terminal),而把其他句法标记称为{\small\bfnew{非终结符}}(Non-terminal)。依存句法树没有预终结符和非终结符,所有的节点都是句子里的单词,通过不同节点间的连线表示句子中各个单词之间的依存关系。每个依存关系实际上都是有方向的,头和尾分别指向``接受''和``发出''依存关系的词。依存关系也可以进行分类,图\ref{fig:2.5-1}中我们对每个依存关系的类型都进行了标记,这也被称作是有标记的依存分析。如果不生成这些标记,这样的句法分析被称作无标记的依存分析。
\parinterval 短语结构树和依存句法树的结构和功能有很大不同。短语结构树的叶子节点是单词,中间节点是词性或者短语句法标记。在短语结构分析中,通常把单词称作{\small\bfnew{终结符}}\index{终结符}(Terminal)\index{Terminal},把词性称为{\small\bfnew{预终结符}}\index{预终结符}(Pre-terminal)\index{Pre-terminal},而把其他句法标记称为{\small\bfnew{非终结符}}\index{非终结符}(Non-terminal)\index{Non-terminal}。依存句法树没有预终结符和非终结符,所有的节点都是句子里的单词,通过不同节点间的连线表示句子中各个单词之间的依存关系。每个依存关系实际上都是有方向的,头和尾分别指向``接受''和``发出''依存关系的词。依存关系也可以进行分类,图\ref{fig:2.5-1}中我们对每个依存关系的类型都进行了标记,这也被称作是有标记的依存分析。如果不生成这些标记,这样的句法分析被称作无标记的依存分析。
\parinterval 虽然短语结构树和依存树的句法表现形式有很大不同,但是它们在某些条件下能相互转化。比如,可以使用启发性规则将短语结构树自动转化为依存树。从应用的角度,依存分析由于形式更加简单,而且直接建模词语之间的依赖,因此在自然语言处理领域中受到很多关注。在机器翻译中,无论是哪种句法树结构,都已经被证明会对机器翻译系统产生帮助。特别是短语结构树,在机器翻译中的应用历史更长,研究更为深入,因此本节将会以短语结构分析为例介绍句法分析的相关概念。
......@@ -940,11 +941,11 @@ c_{\textrm{KN}}(\cdot) = \left\{\begin{array}{ll}
\parinterval 以上三点是实现一个句法分析器的要素。本节的后半部分会对相关的概念和技术方法进行介绍。
\vspace{-0.5em}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{上下文无关文法}\index{Chapter2.5.2}
\subsection{上下文无关文法}
\parinterval 句法树是对句子的一种抽象。这种树形结构表达了一种对句子结构的归纳过程,比如,从树的叶子开始,把每一个树节点看作一次抽象,最终形成一个根节点。那这个过程如何用计算机来实现呢?这就需要使用到形式文法。
\parinterval 形式文法是分析自然语言的一种重要工具。根据乔姆斯基的定义\cite{chomsky2002syntactic},形式文法分为四种类型:无限制文法(0型文法)、上下文相关文法(1型文法)、上下文无关文法(2型文法)和正规文法(3型文法)。不同类型的文法有不同的应用,比如,正规文法可以用来描述有限状态自动机,因此也会被使用在语言模型等系统中。对于短语结构分析问题,常用的是{\small\bfnew{上下文无关文法}}(Context-Free Grammar)。上下文无关文法的具体形式如下:
\parinterval 形式文法是分析自然语言的一种重要工具。根据乔姆斯基的定义\cite{chomsky2002syntactic},形式文法分为四种类型:无限制文法(0型文法)、上下文相关文法(1型文法)、上下文无关文法(2型文法)和正规文法(3型文法)。不同类型的文法有不同的应用,比如,正规文法可以用来描述有限状态自动机,因此也会被使用在语言模型等系统中。对于短语结构分析问题,常用的是{\small\bfnew{上下文无关文法}}\index{上下文无关文法}(Context-Free Grammar)\index{Context-Free Grammar}。上下文无关文法的具体形式如下:
%-------------------------------------------
\begin{definition} 上下文无关文法
......@@ -991,7 +992,7 @@ S=\{\textrm{IP}\} \nonumber
\parinterval 上面这个文法蕴含了不同``层次''的句法信息。比如,规则$r_1$、$r_2$、$r_3$和$r_4$表达了词性对单词的抽象;规则$r_6$、$r_7$和$r_8$表达了短语结构的抽象,其中,规则$r_8$描述了汉语中名词短语(主语)+动词短语(谓语)的结构。在实际应用中,像$r_8$这样的规则可以覆盖很大的片段(试想一下一个包含50个词的主谓结构的句子,可以使用$r_8$进行描述)。
\parinterval 上下文无关文法的规则是一种{\small\sffamily\bfseries{产生式规则}}(Production Rule),形如$\alpha \to \beta $,它表示把规则左端的非终结符$\alpha$替换为规则右端的符号序列$\beta$。 通常,$\alpha$被称作规则的左部(Left-hand Side),$\beta$被称作规则的右部(Right-hand Side)。使用右部$\beta$替换左部$\alpha$的过程也被称作规则的使用,而这个过程的逆过程称为规约。规则的使用可以如下定义:
\parinterval 上下文无关文法的规则是一种{\small\sffamily\bfseries{产生式规则}}\index{产生式规则}(Production Rule)\index{Production Rule},形如$\alpha \to \beta $,它表示把规则左端的非终结符$\alpha$替换为规则右端的符号序列$\beta$。 通常,$\alpha$被称作规则的左部(Left-hand Side),$\beta$被称作规则的右部(Right-hand Side)。使用右部$\beta$替换左部$\alpha$的过程也被称作规则的使用,而这个过程的逆过程称为归约。规则的使用可以如下定义:
\vspace{0.5em}
%-------------------------------------------
......@@ -1003,7 +1004,7 @@ S=\{\textrm{IP}\} \nonumber
\end{center}
\end{definition}
\parinterval 给定起始非终结符,可以不断地使用规则,最终生成一个终结符串,这个过程也被称为{\small\bfnew{推导}}(Derivation)。形式化的定义为:
\parinterval 给定起始非终结符,可以不断地使用规则,最终生成一个终结符串,这个过程也被称为{\small\bfnew{推导}}\index{推导}(Derivation)\index{Derivation}。形式化的定义为:
\vspace{0.5em}
%-------------------------------------------
......@@ -1042,9 +1043,9 @@ s_0 \overset{r_1}{\Rightarrow} s_1 \overset{r_2}{\Rightarrow} s_2 \overset{r_3}{
\end{figure}
%-------------------------------------------
\parinterval 通常,可以把推导简记为$d=r_1 \circ r_2 \circ ... \circ r_n$,其中$ \circ $表示规则的组合。显然,$d$也对应了树形结构,也就是句法分析结果。从这个角度看,推导就是描述句法分析树的一种方式。此外,规则的推导也把规则的使用过程与生成的字符串对应起来。一个推导所生成的字符串,也被称作文法所产生的一个{\small\bfnew{句子}}(Sentence)。而一个文法所能生成的所有句子是这个文法所对应的{\small\bfnew{语言}}(Language)
\parinterval 通常,可以把推导简记为$d=r_1 \circ r_2 \circ ... \circ r_n$,其中$ \circ $表示规则的组合。显然,$d$也对应了树形结构,也就是句法分析结果。从这个角度看,推导就是描述句法分析树的一种方式。此外,规则的推导也把规则的使用过程与生成的字符串对应起来。一个推导所生成的字符串,也被称作文法所产生的一个{\small\bfnew{句子}}\index{句子}(Sentence)\index{Sentence}。而一个文法所能生成的所有句子是这个文法所对应的{\small\bfnew{语言}}\index{语言}(Language)\index{Language}
\parinterval 但是,句子和规则的推导并不是一一对应的。同一个句子,往往有很多推导的方式,这种现象被称为{\small\bfnew{歧义}}(Ambiguity)。甚至同一棵句法树,也可以对应不同的推导。图\ref{fig:2.5-5} 给出同一棵句法树所对应的两种不同的规则推导。
\parinterval 但是,句子和规则的推导并不是一一对应的。同一个句子,往往有很多推导的方式,这种现象被称为{\small\bfnew{歧义}}\index{歧义}(Ambiguity)\index{Ambiguity}。甚至同一棵句法树,也可以对应不同的推导。图\ref{fig:2.5-5} 给出同一棵句法树所对应的两种不同的规则推导。
%-------------------------------------------
%图2.5.2.4
......@@ -1057,7 +1058,7 @@ s_0 \overset{r_1}{\Rightarrow} s_1 \overset{r_2}{\Rightarrow} s_2 \overset{r_3}{
\end{figure}
%-------------------------------------------
\parinterval 显然,规则顺序的不同会导致句法树的推导这一确定的过程变得不确定。因此,需要进行{\small\bfnew{消歧}}(Disambiguation)。这里,可以使用启发式方法:要求规则使用都服从最左优先原则,这样得到的推导被称为{\small\bfnew{最左优先推导}}(Left-most Derivation)。图\ref{fig:2.5-5}中的推导1 就是符合最左优先原则的推导。
\parinterval 显然,规则顺序的不同会导致句法树的推导这一确定的过程变得不确定。因此,需要进行{\small\bfnew{消歧}}\index{消歧}(Disambiguation)\index{Disambiguation}。这里,可以使用启发式方法:要求规则使用都服从最左优先原则,这样得到的推导被称为{\small\bfnew{最左优先推导}}\index{最左优先推导}(Left-most Derivation)\index{Left-most Derivation}。图\ref{fig:2.5-5}中的推导1 就是符合最左优先原则的推导。
\parinterval 这样,对于一个上下文无关文法,每一棵句法树都有唯一的最左推导与之对应。于是,句法分析可以被描述为:对于一个句子找到能够生成它的最佳推导,这个推导所对应的句法树就是这个句子的句法分析结果。
......@@ -1085,9 +1086,9 @@ s_0 \overset{r_1}{\Rightarrow} s_1 \overset{r_2}{\Rightarrow} s_2 \overset{r_3}{
%-------------------------------------------
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{规则和推导的概率}\index{Chapter2.5.3}
\subsection{规则和推导的概率}
\parinterval 对句法树进行概率化,首先要对使用的规则进行概率化。为了达到这个目的,可以使用{\small\bfnew{概率上下文无关文法}}(Probabilistic Context-Free Grammar),它是上下文无关文法的一种扩展。
\parinterval 对句法树进行概率化,首先要对使用的规则进行概率化。为了达到这个目的,可以使用{\small\bfnew{概率上下文无关文法}}\index{概率上下文无关文法}(Probabilistic Context-Free Grammar)\index{Probabilistic Context-Free Grammar},它是上下文无关文法的一种扩展。
\vspace{0.5em}
%-------------------------------------------
\begin{definition} 概率上下文无关文法
......@@ -1131,7 +1132,7 @@ r_6: & & \textrm{VP} \to \textrm{VV}\ \textrm{NN} \nonumber
\parinterval 这也对应了词串``吃\ 鱼''的生成过程。首先,从起始非终结符VP开始,使用规则$r_6$生成两个非终结符VV和NN;进一步,分别使用规则$r_3$$r_4$从VV和NN进一步生成单词``吃''和``鱼''。整个过程的概率等于三条规则概率的乘积。
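\parinterval 这个乘积式的计算过程可以用一小段示意性的Python代码表示(假设性实现,其中的规则概率是为举例而设的数值):
\begin{verbatim}
# 示意:概率上下文无关文法中,推导概率 = 各条规则概率的乘积
rule_prob = {
    ("VP", ("VV", "NN")): 0.2,      # r6: VP -> VV NN(假设值)
    ("VV", ("吃",)):      0.3,      # r3: VV -> 吃(假设值)
    ("NN", ("鱼",)):      0.1,      # r4: NN -> 鱼(假设值)
}

def derivation_prob(rules):
    p = 1.0
    for r in rules:
        p *= rule_prob[r]
    return p

d = [("VP", ("VV", "NN")), ("VV", ("吃",)), ("NN", ("鱼",))]
print(derivation_prob(d))           # 约为 0.006
\end{verbatim}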
\parinterval 新的问题又来了,如何得到规则的概率呢?这里仍然可以从数据中学习文法规则的概率。假设有人工标注的数据,它包括很多人工标注句法树的句法,称之为{\small\bfnew{树库}}(Treebank)。然后,对于规则$\textrm{r}:\alpha \to \beta$可以使用极大似然估计:
\parinterval 新的问题又来了,如何得到规则的概率呢?这里仍然可以从数据中学习文法规则的概率。假设有人工标注的数据,其中包含了大量人工标注的句法树,这样的数据集被称为{\small\bfnew{树库}}\index{树库}(Treebank)\index{Treebank}。然后,对于规则$\textrm{r}:\alpha \to \beta$可以使用极大似然估计:
\begin{eqnarray}
\textrm{P}(r) = \frac{\text{规则$r$在树库中出现的次数}}{\alpha \text{在树库中出现的次数}}
......@@ -1170,7 +1171,8 @@ r_6: & & \textrm{VP} \to \textrm{VV}\ \textrm{NN} \nonumber
%-------------------------------------------
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{小结及深入阅读} \label{sec2:summary}\index{Chapter2.6}
\sectionnewpage
\section{小结及深入阅读} \label{sec2:summary}
\parinterval 本章重点介绍了如何对自然语言处理问题进行统计建模,并从数据中自动学习统计模型的参数,最终使用学习到的模型对新的问题进行处理。之后,本章将这种思想应用到三个自然语言处理任务中,包括:中文分词、语言建模、句法分析,它们也和机器翻译有着紧密的联系。通过系统化的建模,可以发现:经过适当的假设和化简,统计模型可以很好地描述复杂的自然语言处理问题。相关概念和方法也会在后续章节的内容中被广泛使用。
......@@ -1178,7 +1180,7 @@ r_6: & & \textrm{VP} \to \textrm{VV}\ \textrm{NN} \nonumber
\begin{adjustwidth}{1em}{}
\begin{itemize}
\item 在建模方面,本章介绍的三个任务均采用的是基于人工先验知识进行模型设计的思路。也就是,问题所表达的现象被``一步一步''生成出来。这是一种典型的生成式建模思想,它把要解决的问题看作一些观测结果的隐含变量(比如,句子是观测结果,分词结果是隐含在背后的变量),之后通过对隐含变量生成观测结果的过程进行建模,以达到对问题进行数学描述的目的。这类模型一般需要依赖一些独立性假设,假设的合理性对最终的性能有较大影响。相对于{\small\sffamily\bfseries{生成模型}}(Generative Model),另一类方法{\small\sffamily\bfseries{判别模型}}(Discriminative Model),它直接描述了从隐含变量生成观测结果的过程,这样对问题的建模更加直接,同时这类模型可以更加灵活的引入不同的特征。判别模型在自然语言处理中也有广泛应用\cite{shannon1948mathematical}\cite{ng2002discriminative}。 在本书的第四章也会使用到判别式模型。
\item 在建模方面,本章介绍的三个任务均采用的是基于人工先验知识进行模型设计的思路。也就是,问题所表达的现象被``一步一步''生成出来。这是一种典型的生成式建模思想,它把要解决的问题看作一些观测结果的隐含变量(比如,句子是观测结果,分词结果是隐含在背后的变量),之后通过对隐含变量生成观测结果的过程进行建模,以达到对问题进行数学描述的目的。这类模型一般需要依赖一些独立性假设,假设的合理性对最终的性能有较大影响。相对于{\small\sffamily\bfseries{生成模型}}\index{生成模型}(Generative Model)\index{Generative Model},另一类方法是{\small\sffamily\bfseries{判别模型}}\index{判别模型}(Discriminative Model)\index{Discriminative Model},它直接描述了从观测结果预测隐含变量的过程,这样对问题的建模更加直接,同时这类模型可以更加灵活地引入不同的特征。判别模型在自然语言处理中也有广泛应用\cite{shannon1948mathematical}\cite{ng2002discriminative}。 在本书的第四章也会使用到判别式模型。
\item 从现在自然语言处理的前沿看,基于端到端学习的深度学习方法在很多任务中都取得了领先的性能。但是,本章并没有涉及深度学习及相关方法,这是由于笔者认为:对问题的建模是自然语言处理的基础,对问题的本质刻画并不会因为方法的改变而改变。因此,本章的内容没有过多地陷入到更加复杂的模型和算法设计中,相反,我们希望关注对基本问题的理解和描述。不过,一些前沿方法仍可以作为参考,包括:基于条件随机场和双向长短时记忆模型的序列标注模型\cite{lafferty2001conditional}\cite{huang2015bidirectional}\cite{ma2016end}、神经语言模型\cite{bengio2003neural}\cite{mikolov2010recurrent}、神经句法分析模型\cite{chen2014fast}\cite{zhu2015long}。
......
......@@ -8,18 +8,18 @@
\renewcommand\tablename{}%将figure改为图
\definecolor{ublue}{rgb}{0.152,0.250,0.545}
\definecolor{ugreen}{rgb}{0,0.5,0}
\chapterimage{chapter_head_1} % Chapter heading image
\chapterimage{fig-NEU-4.jpg} % Chapter heading image
%公式1.7之后往后串一个
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\chapter{基于词的机器翻译模型}
\parinterval 使用概率化的方法对翻译问题进行建模是机器翻译发展中的重要里程碑。这种思想也影响了当今的统计机器翻译和神经机器翻译方法。虽然技术不断发展,传统的统计模型已经不再``新鲜'',但它对于今天机器翻译的研究仍然有着重要的启示作用。在了解前沿、展望未来的同时,我们更要冷静的思考前人给我们带来了什么。基于此,本章将介绍统计机器翻译的开山之作\ \dash \ IBM模型,它提出了使用统计模型进行翻译的思想,并在建模中引入了单词对齐这一重要概念。IBM模型由Peter E. Brown等人于上世纪九十年代初提出\cite{brown1993mathematics}。客观的说,这项工作的视野和对问题的理解,已经超过当时很多人所能看到的东西,其衍生出来的一系列方法和新的问题还被后人花费将近10年的时间来进行研究与讨论。时至今日,IBM模型中的一些思想仍然影响着很多研究工作。
\parinterval 使用概率化的方法对翻译问题进行建模是机器翻译发展中的重要里程碑。这种思想也影响了当今的统计机器翻译和神经机器翻译方法。虽然技术不断发展,传统的统计模型已经不再``新鲜'',但它对于今天机器翻译的研究仍然有着重要的启示作用。在了解前沿、展望未来的同时,我们更要冷静地思考前人给我们带来了什么。基于此,本章将介绍统计机器翻译的开山之作\ \dash \ IBM模型,它提出了使用统计模型进行翻译的思想,并在建模中引入了单词对齐这一重要概念。IBM模型由Peter E. Brown等人于上世纪九十年代初提出\cite{Peter1993The}。客观地说,这项工作的视野和对问题的理解,已经超过当时很多人所能看到的东西,其衍生出来的一系列方法和新的问题还被后人花费将近10年的时间来进行研究与讨论。时至今日,IBM模型中的一些思想仍然影响着很多研究工作。
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{什么是基于词的翻译模型}\index{Chapter3.1}%Index的作用,目前不清晰
\section{什么是基于词的翻译模型}
\parinterval 在机器翻译中,我们希望得到一个源语言到目标语言的翻译。对于人类来说这个问题很简单,但是让计算机做这样的工作却很困难,因为我们需要把翻译``描述''成计算机可以计算的形式。这里面临的第一个问题是:如何对翻译进行建模?从计算机的角度来看,这就需要把自然语言的翻译问题转换为计算机可计算的问题。
\parinterval 那么,基于单词的统计机器翻译模型又是如何描述翻译问题的呢?Peter E. Brown等人提出了一个观点\cite{brown1993mathematics}:在一个句子时,可以把其中的每个单词翻译成对应的目标语言单词,然后调整这些目标语言单词的顺序,最后得到整个句子的翻译结果,而这个过程可以用统计模型来描述。尽管在人看来使用两个语言单词之间的对应进行翻译是很自然的事,但是对于计算机来说可是向前迈出了一大步。
\parinterval 那么,基于单词的统计机器翻译模型又是如何描述翻译问题的呢?Peter E. Brown等人提出了一个观点\cite{Peter1993The}:在翻译一个句子时,可以把其中的每个单词翻译成对应的目标语言单词,然后调整这些目标语言单词的顺序,最后得到整个句子的翻译结果,而这个过程可以用统计模型来描述。尽管在人看来使用两个语言单词之间的对应进行翻译是很自然的事,但是对于计算机来说可是向前迈出了一大步。
\parinterval 先来看一个例子。图 \ref{fig:3-1}展示了一个汉语翻译到英语的例子。首先,可以把源语句的单词``我''、``对''、``你''、``感到''和``满意''分别翻译为``I''、``with''、``you''、``am''\ 和``satisfied'',然后调整单词的顺序,比如,``am''放在译文的第2个位置,``you''应该放在最后的位置等等,最后得到译文``I am satisfied with you''。
......@@ -34,7 +34,7 @@
\end{figure}
%-------------------------------------------
\parinterval 上面的例子反映了人在做翻译时所使用的一些知识:首先,两种语言单词的顺序可能不一致,而且译文需要符合目标语的习惯,这也就是常说翻译的{\small\sffamily\bfseries{流畅度}}问题(Fluency);其次,源语言单词需要准确的被翻译出来\footnote{当然,对于一些意译的情况或者虚词并不需要翻译。},也就是常说的翻译的{\small\sffamily\bfseries{准确性}}(Accuracy)和{\small\sffamily\bfseries{充分性}}问题(Adequacy)。为了达到以上目的,传统观点认为翻译过程需要包含三个步骤(图 \ref{fig:3-2}
\parinterval 上面的例子反映了人在做翻译时所使用的一些知识:首先,两种语言单词的顺序可能不一致,而且译文需要符合目标语的习惯,这也就是常说的翻译的{\small\sffamily\bfseries{流畅度}}\index{流畅度}问题(Fluency)\index{Fluency};其次,源语言单词需要准确地被翻译出来\footnote{当然,对于一些意译的情况或者虚词并不需要翻译。},也就是常说的翻译的{\small\sffamily\bfseries{准确性}}问题\index{准确性}(Accuracy)\index{Accuracy}和{\small\sffamily\bfseries{充分性}}\index{充分性}问题(Adequacy)\index{Adequacy}。为了达到以上目的,传统观点认为翻译过程需要包含三个步骤(图 \ref{fig:3-2}):
\begin{itemize}
\item {\small\sffamily\bfseries{分析:}}将源语言句子切分或者表示为能够处理的最小单元。在基于词的翻译模型中,最小的处理单元就是单词,因此在这里也可以简单地将分析理解为分词\footnote{在后续章节中会看到,分析也包括对句子深层次结构的生成,但是这里为了突出基于单词的概念,因此把问题简化为最简单的情况。}
......@@ -55,14 +55,15 @@
%---------------------------
\parinterval 对于今天的自然语言处理研究,``分析、转换和生成''依然是一个非常深刻的观点。包括机器翻译在内的很多自然语言处理问题都可以用这个过程来解释。比如,对于现在比较前沿的神经机器翻译方法,从大的框架来说,依然在做分析(编码器)、转换(编码-解码注意力)和生成(解码器),只不过这些过程隐含在神经网络的设计中。当然,这里并不会对``分析、转换和生成''的架构展开过多的讨论,随着后面技术内容讨论的深入,这个观念会有进一步体现。
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{构建一个简单的机器翻译系统}\index{Chapter3.2}%Index的作用,目前不清晰
\sectionnewpage
\section{构建一个简单的机器翻译系统}
\label{sec:simple-mt-example}
\parinterval 本节首先对比人工翻译和机器翻译过程的异同点,从中归纳出构建机器翻译系统的两个主要步骤:训练和解码。之后,会从学习翻译知识和运用翻译知识两个方面描述如何构建一个简单的机器翻译系统。
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{如何进行翻译?}\index{Chapter3.2.1}
\subsection{如何进行翻译?}
\subsubsection*{人工翻译流程}\index{Chapter3.2.1.1}
\subsubsection*{人工翻译流程}
\parinterval 当人翻译一个句子时,首先会快速地分析出句子的(单词)构成,然后根据以往的知识,得到每个词可能的翻译,最后利用对目标语的理解拼出来一个译文。尽管这个过程并不是严格来自心理学或者脑科学的相关结论,但至少可以帮助我们理解人在翻译时的思考方式。
%----------------------------------------------
......@@ -79,7 +80,7 @@
\begin{itemize}
\vspace{0.5em}
\item {\small\bfnew{翻译知识的学习}}:对于输入的源语言句子,首先需要知道每个单词可能的翻译有什么,这些翻译被称为{\small\sffamily\bfseries{翻译候选}}(Translation Candidate)。比如,汉语单词``对''可能的译文有``to''、``with''和``for''等。对于人来说,可以通过阅读、背诵、做题或者老师教等途径获得翻译知识,这些知识就包含了源语言与目标语言单词之间的对应关系。通常,也把这个过程称之为学习过程。
\item {\small\bfnew{翻译知识的学习}}:对于输入的源语言句子,首先需要知道每个单词可能的翻译有什么,这些翻译被称为{\small\sffamily\bfseries{翻译候选}}\index{翻译候选}(Translation Candidate)\index{Translation Candidate}。比如,汉语单词``对''可能的译文有``to''、``with''和``for''等。对于人来说,可以通过阅读、背诵、做题或者老师教等途径获得翻译知识,这些知识就包含了源语言与目标语言单词之间的对应关系。通常,也把这个过程称之为学习过程。
\vspace{0.5em}
\item {\small\bfnew{运用知识生成译文}}:当翻译一个从未见过的句子时,可以运用学习到的翻译知识,得到新的句子中每个单词的译文,并处理常见的单词搭配、主谓一致等问题,比如,我们知道``satisfied''后面常常使用介词``with''构成搭配,基于这些知识可以快速生成译文。
......@@ -89,7 +90,7 @@
当然,每个人进行翻译时所使用的方法和技巧都不相同,所谓人工翻译也没有固定的流程。但是,可以确定的是,人在进行翻译时也需要``学习''和``运用''翻译知识。对翻译知识``学习''和``运用''的好与坏,直接决定了人工翻译结果的质量。
%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{机器翻译流程}\index{Chapter3.2.1.2}
\subsubsection{机器翻译流程}
\parinterval 人进行翻译的过程比较容易理解,那计算机是如何完成翻译的呢?虽然人工智能这个概念显得很神奇,但是计算机远没有人那么智能,有时甚至还很``笨''。一方面,它没有能力像人一样,在教室里和老师一起学习语言知识;另一方面,即使能列举出每个单词的候选译文,但是还是不知道这些译文怎么拼装成句的,甚至不知道哪些译文是对的。为了更加直观地理解机器在翻译时要解决的挑战,可以将问题归纳如下:
......@@ -114,11 +115,11 @@
\parinterval 对于第二个问题,尽管机器能够找到很多译文选择路径,但它并不知道哪些路径是好的。说得再直白一些,简单的枚举路径实际上就是一个体力活,没有什么智能。因此计算机还需要再聪明一些,运用它能够``掌握''的知识判断翻译结果的好与坏。这一步是最具挑战的,当然也有很多思路。在统计机器翻译中,这个问题被定义为:设计一种统计模型,它可以给每个译文一个可能性,而这个可能性越高表明译文越接近人工翻译。如图\ref{fig:3-4}所示,每个单词翻译候选的右侧黑色框里的数字就是单词的翻译概率,使用这些单词的翻译概率,可以得到整句译文的概率(用符号P表示)。这样,就用概率化的模型描述了每个翻译候选的可能性。基于这些翻译候选的可能性,机器翻译系统可以对所有的翻译路径进行打分,比如,图\ref{fig:3-4}中第一条路径的分数为0.042,第二条是0.006,以此类推。最后,系统可以选择分数最高的路径作为源语言句子的最终译文。
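\parinterval 这种``枚举路径并按概率打分''的过程可以用一段示意性的Python代码来体会(假设性实现,翻译候选及概率均为举例而设;真实系统会使用剪枝等手段避免穷举):
\begin{verbatim}
# 示意:枚举所有翻译路径,用单词翻译概率的乘积打分,取最优路径
import itertools

candidates = [
    [("I", 0.5), ("me", 0.4)],             # "我"的候选(假设值)
    [("am", 0.4), ("felt", 0.2)],          # "感到"的候选(假设值)
    [("satisfied", 0.6), ("happy", 0.3)],  # "满意"的候选(假设值)
]

best_score, best_path = 0.0, None
for path in itertools.product(*candidates):
    score = 1.0
    for word, prob in path:
        score *= prob
    if score > best_score:
        best_score, best_path = score, path

print(best_path, best_score)
\end{verbatim}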
\vspace{-0.5em}
\subsubsection{人工翻译 vs. 机器翻译}\index{Chapter3.2.1.3}
\parinterval 人在翻译时的决策是非常确定并且快速的,但计算机处理这个问题时却充满了概率化的思想。当然它们也有类似的地方。首先,计算机使用统计模型的目的是把翻译知识变得可计算,并把这些``知识''储存在模型参数中,这个模型和人类大脑的作用是类似的\footnote{这里并不是要把统计模型等同于生物学或者认知科学上的人脑,这里是指它们处理翻译问题时发挥的作用类似。};其次,计算机对统计模型进行训练相当于人类对知识的学习,二者都可以被看作是理解、加工知识的过程;再有,计算机使用学习到的模型对新句子进行翻译的过程相当于人运用知识的过程。在统计机器翻译中,模型学习的过程被称为{\small\sffamily\bfseries{训练}}(Training),目的是从双语平行数据中自动学习翻译``知识'';而使用模型处理新句子的过程被称为{\small\sffamily\bfseries{解码}}(Decoding)或{\small\sffamily\bfseries{推断}}(Inference)。图\ref{fig:3-4}的右侧标注在翻译过程中训练和解码的作用。最终,统计机器翻译的核心由三部分构成\ \dash \ 建模、训练和解码。本章后续内容会围绕这三个问题展开讨论。
\subsubsection{人工翻译 vs. 机器翻译}
\parinterval 人在翻译时的决策是非常确定并且快速的,但计算机处理这个问题时却充满了概率化的思想。当然它们也有类似的地方。首先,计算机使用统计模型的目的是把翻译知识变得可计算,并把这些``知识''储存在模型参数中,这个模型和人类大脑的作用是类似的\footnote{这里并不是要把统计模型等同于生物学或者认知科学上的人脑,这里是指它们处理翻译问题时发挥的作用类似。};其次,计算机对统计模型进行训练相当于人类对知识的学习,二者都可以被看作是理解、加工知识的过程;再有,计算机使用学习到的模型对新句子进行翻译的过程相当于人运用知识的过程。在统计机器翻译中,模型学习的过程被称为{\small\sffamily\bfseries{训练}}\index{训练}(Training)\index{Training},目的是从双语平行数据中自动学习翻译``知识'';而使用模型处理新句子的过程被称为{\small\sffamily\bfseries{解码}}\index{解码}(Decoding)\index{Decoding}或{\small\sffamily\bfseries{推断}}\index{推断}(Inference)\index{Inference}。图\ref{fig:3-4}的右侧标注了在翻译过程中训练和解码的作用。最终,统计机器翻译的核心由三部分构成\ \dash \ 建模、训练和解码。本章后续内容会围绕这三个问题展开讨论。
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{基本框架}\index{Chapter3.2.2}
\subsection{基本框架}
\parinterval 为了对统计机器翻译有一个直观的认识,下面将介绍如何构建一个非常简单的基于单词的统计机器翻译系统,其中涉及到的很多思想来自IBM模型。这里,仍然使用数据驱动的统计建模方法。图\ref{fig:3-5}展示了系统的主要流程,包括两个步骤:
......@@ -140,9 +141,9 @@
\parinterval 接下来,本节将介绍统计机器翻译模型训练和解码的方法。在模型学习中,会分两小节进行描述\ \dash \ 单词级翻译和句子级翻译。实现单词级翻译是实现句子级翻译的基础。换言之,句子级翻译的统计模型是建立在单词翻译之上的。在解码中,本节将介绍一个高效的搜索算法,其中也使用到了剪枝和启发式搜索的思想。
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{单词翻译概率}\index{Chapter3.2.3}\label{chapter3.2.3}
\subsection{单词翻译概率}\label{chapter3.2.3}
\subsubsection{什么是单词翻译概率?}\index{Chapter3.2.3.1}
\subsubsection{什么是单词翻译概率?}
\parinterval 单词翻译概率描述的是一个源语言单词与目标语言译文构成正确翻译的可能性,这个概率越高表明单词翻译越可靠。使用单词翻译概率,可以帮助机器翻译系统解决翻译时的``择词''问题,即选择什么样的目标语译文是合适的。当人在翻译某个单词时,可以利用积累的知识,快速得到它的高质量候选译文。以汉译英为例,当翻译``我''这个单词时,可能直接会想到用``I''、``me''或``I’m''作为它的译文,而几乎不会选择``you''、``satisfied''等含义相差太远的译文。这是为什么呢?如果从统计学的角度来看,无论是何种语料,包括教材、新闻、小说等,绝大部分情况下``我''都翻译成了``I''、``me''等,几乎不会看到我被翻译成``you''或``satisfied''的情况。可以说``我''翻译成``I''、``me''等属于高频事件,而翻译成``you''、``satisfied''等属于低频或小概率事件。因此人在翻译时也是选择在统计意义上概率更大的译文,这也间接反映出统计模型可以在一定程度上描述人的翻译习惯和模式。
......@@ -165,7 +166,7 @@
\end{table}
%---------------------------
\vspace{-0.5em}
\subsubsection{如何从一个双语平行数据中学习?}\index{Chapter3.2.3.2}
\subsubsection{如何从一个双语平行数据中学习?}
\parinterval 假设有一定数量的双语对照的平行数据,是否可以从中自动获得两种语言单词之间的翻译概率呢?回忆一下第二章中的掷骰子游戏,其中使用了相对频度估计方法来自动获得骰子不同面出现概率的估计值。其中,重复投掷骰子很多次,然后统计``1''到``6''各面出现的次数,再除以投掷的总次数,最后得到它们出现的概率的极大似然估计。这里,可以使用类似的方式计算单词翻译概率。但是,现在有的是句子一级对齐的数据,并不知道两种语言之间单词的对应关系。也就是,要从句子级对齐的平行数据中学习单词之间对齐的概率。这里,需要使用稍微``复杂''一些的模型来描述这个问题。
......@@ -205,7 +206,7 @@
\noindent 注意,由于``look''没有出现在数据中,因此$\textrm{P}(\text{``机器''},\text{``look''}; \mathbf{s},\mathbf{t})=0$。这时,可以使用第二章介绍的平滑算法赋予它一个非零的值,以保证在后续的步骤中整个翻译模型不会出现零概率的情况。
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsubsection{如何从大量的双语平行数据中学习?}\index{Chapter3.2.3.3}
\subsubsection{如何从大量的双语平行数据中学习?}
\parinterval 如果有更多的句子,上面的方法同样适用。假设有$N$个互译句对$(\mathbf{s}^{[1]},\mathbf{t}^{[1]})$,...,\\$(\mathbf{s}^{[N]},\mathbf{t}^{[N]})$。仍然可以使用基于相对频度的方法估计翻译概率$\textrm{P}(x,y)$,具体方法如下:
......@@ -242,7 +243,7 @@
\parinterval 公式\ref{eqC3.6-new}所展示的计算过程很简单,分子是两个句对中``翻译''和``translation''共现次数的累计,分母是两个句对的源语言单词和目标语言单词的组合数的累加。显然,这个方法也很容易推广到处理更多句子的情况。
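\parinterval 这个计算过程可以用如下示意性的Python代码表示(假设性实现,句对为举例而设):
\begin{verbatim}
# 示意:从多个双语句对中用相对频度估计单词翻译概率
def cooccur(x, y, s, t):
    # x 与 y 在句对 (s, t) 中的共现次数
    return s.count(x) * t.count(y)

pairs = [(["机器", "翻译"], ["machine", "translation"]),
         (["翻译", "工具"], ["translation", "tool"])]   # 假设的句对

def p(x, y):
    num = sum(cooccur(x, y, s, t) for s, t in pairs)
    den = sum(cooccur(xx, yy, s, t)
              for s, t in pairs for xx in s for yy in t)
    return num / den

print(p("翻译", "translation"))     # 输出 0.25
\end{verbatim}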
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{句子级翻译模型}\index{Chapter3.2.4}
\subsection{句子级翻译模型}
\label{sec:sentence-level-translation}
\parinterval 下面继续回答如何获取句子级翻译概率的问题。如图\ref{fig:3-6}所示,条件概率$\textrm{P}(\mathbf{t}|\mathbf{s})$表示给出源语言句子$\mathbf{s}$的情况下译文为$\mathbf{t}$的概率。这也是整个句子级翻译模型的核心,一方面需要从数据中学习这个模型的参数,另一方面,对于新输入的句子,需要使用这个模型得到最佳的译文。下面介绍句子级翻译的建模方法。
......@@ -256,7 +257,7 @@
\end{figure}
%---------------------------
\subsubsection{基础模型}\index{Chapter3.2.4.1}
\subsubsection{基础模型}
\parinterval 计算句子级翻译概率并不简单。因为自然语言非常灵活,任何数据无法覆盖足够多的句子,因此,无法像公式\ref{eqC3.5-new}一样直接用简单计数的方式对句子的翻译概率进行估计。这里,采用一个退而求其次的方法:找到一个函数$g(\mathbf{s},\mathbf{t})\ge 0$来模拟翻译概率对译文可能性进行估计。可以定义一个新的函数$g(\mathbf{s},\mathbf{t})$,令其满足:给定$\mathbf{s}$,翻译结果$\mathbf{t}$出现的可能性越大,$g(\mathbf{s},\mathbf{t})$的值越大;$\mathbf{t}$出现的可能性越小,$g(\mathbf{s},\mathbf{t})$的值越小。换句话说,$g(\mathbf{s},\mathbf{t})$的单调性和翻译概率$\textrm{P}(\mathbf{t}|\mathbf{s})$呈正相关。如果存在这样的函数$g(\mathbf{s},\mathbf{t}
)$,可以利用$g(\mathbf{s},\mathbf{t})$近似表示$\textrm{P}(\mathbf{t}|\mathbf{s})$,如下:
......@@ -280,7 +281,7 @@
\parinterval 回到设计$g(\mathbf{s},\mathbf{t})$的问题上。这里,采用``大题小作''的方法,这个技巧在第二章已经进行了充分的介绍。具体来说,直接建模句子之间的对应比较困难,但可以利用单词之间的对应来描述句子之间的对应关系。这就用到了上一小节所介绍的单词翻译概率。
\parinterval 首先引入一个非常重要的概念\ \dash \ {\small\sffamily\bfseries{词对齐}}(Word Alignment),它是统计机器翻译中最核心的概念之一。词对齐描述了平行句对中单词之间的对应关系,它体现了一种观点:本质上句子之间的对应是由单词之间的对应表示的。当然,这个观点在神经机器翻译或者其他模型中可能会有不同的理解,但是翻译句子的过程中考虑词级的对应关系是符合我们对语言的认知的。图\ref{fig:3-7} 展示了一个句对$\mathbf{s}$$\mathbf{t}$,单词的右下标数字表示了该词在句中的位置,而虚线表示的是句子$\mathbf{s}$$\mathbf{t}$中的词对齐关系。比如,``满意''的右下标数字5表示在句子$\mathbf{s}$中处于第5个位置,``satisfied''的右下标数字3表示在句子$\mathbf{t}$中处于第3个位置,``满意''和``satisfied''之间的虚线表示两个单词之间是对齐的。为方便描述,用二元组$(j,i)$ 来描述词对齐,它表示源语言句子的第$j$个单词对应目标语言句子的第$i$个单词,即单词$s_j$$t_i$对应。通常,也会把$(j,i)$称作一条{\small\sffamily\bfseries{词对齐连接}}。图\ref{fig:3-7} 中共有5 条虚线,表示有5组单词之间的词对齐连接。可以把这些词对齐连接构成的集合作为词对齐的一种表示,记为$\mathbf{a}$,即$A={\{(1,1),(2,4),(3,5),(4,2)(5,3)}\}$
\parinterval 首先引入一个非常重要的概念\ \dash \ {\small\sffamily\bfseries{词对齐}}\index{词对齐}(Word Alignment)\index{Word Alignment},它是统计机器翻译中最核心的概念之一。词对齐描述了平行句对中单词之间的对应关系,它体现了一种观点:本质上句子之间的对应是由单词之间的对应表示的。当然,这个观点在神经机器翻译或者其他模型中可能会有不同的理解,但是翻译句子的过程中考虑词级的对应关系是符合我们对语言的认知的。图\ref{fig:3-7} 展示了一个句对$\mathbf{s}$和$\mathbf{t}$,单词的右下标数字表示了该词在句中的位置,而虚线表示的是句子$\mathbf{s}$和$\mathbf{t}$中的词对齐关系。比如,``满意''的右下标数字5表示在句子$\mathbf{s}$中处于第5个位置,``satisfied''的右下标数字3表示在句子$\mathbf{t}$中处于第3个位置,``满意''和``satisfied''之间的虚线表示两个单词之间是对齐的。为方便描述,用二元组$(j,i)$ 来描述词对齐,它表示源语言句子的第$j$个单词对应目标语言句子的第$i$个单词,即单词$s_j$和$t_i$对应。通常,也会把$(j,i)$称作一条{\small\sffamily\bfseries{词对齐连接}}\index{词对齐连接}。图\ref{fig:3-7} 中共有5 条虚线,表示有5组单词之间的词对齐连接。可以把这些词对齐连接构成的集合作为词对齐的一种表示,记为$A$,即$A={\{(1,1),(2,4),(3,5),(4,2),(5,3)}\}$。
%----------------------------------------------
% 图3.11
\begin{figure}[htp]
......@@ -306,7 +307,7 @@ g(\mathbf{s},\mathbf{t}) = \prod_{(j,i)\in \widehat{A}}\textrm{P}(s_j,t_i)
\parinterval 显然,如果每个词对齐连接所对应的翻译概率变大,那么整个句子翻译的得分也会提高。也就是说,词对齐越准确,翻译模型的打分越高,$\mathbf{s}$$\mathbf{t}$之间存在翻译关系的可能性越大。
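\parinterval 按照这种定义,$g(\mathbf{s},\mathbf{t})$的计算可以写成如下示意性的Python代码(假设性实现,单词翻译概率为举例而设的数值):
\begin{verbatim}
# 示意:利用词对齐连接上的单词翻译概率乘积计算 g(s, t)
p_lex = {("我", "I"): 0.5, ("对", "with"): 0.4, ("你", "you"): 0.5,
         ("感到", "am"): 0.4, ("满意", "satisfied"): 0.6}  # 假设值

def g(s, t, alignment):
    score = 1.0
    for j, i in alignment:          # (j, i):源语第 j 词对齐到目标语第 i 词
        score *= p_lex[(s[j - 1], t[i - 1])]
    return score

s = ["我", "对", "你", "感到", "满意"]
t = ["I", "am", "satisfied", "with", "you"]
A = [(1, 1), (2, 4), (3, 5), (4, 2), (5, 3)]
print(g(s, t, A))
\end{verbatim}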
\subsubsection{生成流畅的译文}\index{Chapter3.2.4.2}
\subsubsection{生成流畅的译文}
\parinterval 公式\ref{eqC3.8-new}定义的$g(\mathbf{s},\mathbf{t})$存在的问题是没有考虑词序信息。这里用一个简单的例子说明这个问题。如图\ref{fig:3-8}所示,源语言句子``我 对 你 感到 满意''有两个翻译结果,第一个翻译结果是``I am satisfied with you'',第二个是``I with you am satisfied''。虽然这两个译文包含的目标语单词是一样的,但词序存在很大差异。比如,它们都选择了``satisfied''作为源语单词``满意''的译文,但是在第一个翻译结果中``satisfied''处于第3个位置,而第二个结果中处于最后的位置。显然第一个翻译结果更符合英文的表达习惯,翻译的质量更高。遗憾的是,对于有明显差异的两个译文,公式\ref{eqC3.8-new}计算得到的函数$g(\cdot)$的值却是一样的。
%----------------------------------------------
......@@ -349,10 +350,10 @@ g(\mathbf{s},\mathbf{t}) \equiv \prod_{j,i \in \widehat{A}}{\textrm{P}(s_j,t_i)}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{解码}\index{Chapter3.2.5}
\subsection{解码}
\label{sec:simple-decoding}
\parinterval {\small\sffamily\bfseries{解码}}(Decoding)是指在得到翻译模型后,对于新输入的句子生成最佳译文的过程。具体来说,当给定任意的源语言句子$\mathbf{s}$,解码系统要找到翻译概率最大的目标语译文$\hat{\mathbf{t}}$。这个过程可以被形式化描述为:
\parinterval {\small\sffamily\bfseries{解码}}\index{解码}(Decoding)\index{Decoding}是指在得到翻译模型后,对于新输入的句子生成最佳译文的过程。具体来说,当给定任意的源语言句子$\mathbf{s}$,解码系统要找到翻译概率最大的目标语译文$\hat{\mathbf{t}}$。这个过程可以被形式化描述为:
\begin{eqnarray}
\widehat{\mathbf{t}}=\argmax_{\mathbf{t}} \textrm{P}(\mathbf{t}|\mathbf{s})
\label{eqC3.12-new}
......@@ -420,10 +421,11 @@ g(\mathbf{s},\mathbf{t}) \equiv \prod_{j,i \in \widehat{A}}{\textrm{P}(s_j,t_i)}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\vspace{5.5em}%调整布局用
\section{基于词的翻译建模}\index{Chapter3.3}
\sectionnewpage
\section{基于词的翻译建模}
\parinterval\ref{sec:simple-mt-example}节中,我们实现了一个简单的基于词的统计机器翻译模型,内容涉及建模、训练和解码。但是,还有很多问题还没有进行深入讨论,比如,如何处理空翻译?如何对调序问题进行建模?如何用更严密的数学模型描述翻译过程?如何对更加复杂的统计模型进行训练?等等。针对以上问题,本节将系统的介绍IBM统计机器翻译模型。作为经典的机器翻译模型,对IBM模型的学习将帮助我们建立对自然语言处理问题的系统化建模思想,特别是对问题的数学描述方法将会成为理解本书后续内容的基础工具。
\subsection{噪声信道模型}\index{Chapter3.3.1}
\subsection{噪声信道模型}
\parinterval 首先,重新思考一下人类进行翻译的过程。对于给定的源语句$\mathbf{s}$,人不会像计算机一样尝试很多的可能,而是快速准确的翻译出一个或者少数几个正确的译文。在人看来,除了正确的译文外,其他的翻译都是不正确的,或者说除了少数的译文人甚至都不会考虑太多其他的可能性。但是,在统计机器翻译的世界里,没有译文是不可能的。换句话说,对于源语言句子$\mathbf{s}$,所有目标语词串$\mathbf{t}$都是可能的译文,只是可能性大小不同。即每对$(\mathbf{s},\mathbf{t})$都有一个概率值$\textrm{P}(\mathbf{t}|\mathbf{s})$来描述$\mathbf{s}$翻译为$\mathbf{t}$的好与坏(图\ref{fig:3-12})。
%----------------------------------------------
......@@ -436,7 +438,7 @@ g(\mathbf{s},\mathbf{t}) \equiv \prod_{j,i \in \widehat{A}}{\textrm{P}(s_j,t_i)}
\end{figure}
%---------------------------
\parinterval IBM模型也是建立在如上统计模型之上。具体来说,IBM模型的基础是{\small\sffamily\bfseries{噪声信道模型}}(Noise Channel Model),它是由Shannon在上世纪40年代末提出来的\cite{shannon1949communication},并于上世纪80年代应用在语言识别领域,后来又被Brown等人用于统计机器翻译中\cite{brown1990statistical}
\parinterval IBM模型也是建立在如上统计模型之上。具体来说,IBM模型的基础是{\small\sffamily\bfseries{噪声信道模型}}\index{噪声信道模型}(Noise Channel Model)\index{Noise Channel Model},它是由Shannon在上世纪40年代末提出来的\cite{shannon1949communication},并于上世纪80年代应用在语音识别领域,后来又被Brown等人用于统计机器翻译中\cite{brown1990statistical}。
\parinterval 在噪声信道模型中,源语言句子$\mathbf{s}$(信宿)被看作是由目标语言句子$\mathbf{t}$(信源)经过一个有噪声的信道得到的。如果知道了$\mathbf{s}$和信道的性质,可以通过$\textrm{P}(\mathbf{t}|\mathbf{s})$得到信源的信息,这个过程如图\ref{fig:3-13}所示。
......@@ -484,7 +486,7 @@ g(\mathbf{s},\mathbf{t}) \equiv \prod_{j,i \in \widehat{A}}{\textrm{P}(s_j,t_i)}
实际上,在机器翻译中引入语言模型是一个很深刻的概念。在IBM模型之后相当长的时间里,语言模型一直是机器翻译各个部件中最重要的部分。即使现在机器翻译模型已经更新换代,对译文连贯性的建模也是所有系统中需要包含的内容(即使是以隐含的方式体现)。
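\parinterval 噪声信道模型下的译文选择过程可以用一段极简的Python示意代码表达(假设性实现,其中翻译模型与语言模型的得分均为举例而设):
\begin{verbatim}
# 示意:按 P(s|t) * P(t) 对候选译文打分,语言模型偏好流畅的译文
candidates = ["I am satisfied with you", "I with you am satisfied"]
tm = {c: 0.01 for c in candidates}            # 翻译模型 P(s|t):假设相同
lm = {"I am satisfied with you": 1e-5,        # 语言模型 P(t):假设值
      "I with you am satisfied": 1e-7}

best = max(candidates, key=lambda t: tm[t] * lm[t])
print(best)                                   # 更流畅的译文胜出
\end{verbatim}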
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{统计机器翻译的三个基本问题}\index{Chapter3.3.2}
\subsection{统计机器翻译的三个基本问题}
\parinterval 公式\ref{eqC3.17-new}给出了统计机器翻译的数学描述。为了实现这个过程,面临着三个基本问题:
......@@ -511,13 +513,13 @@ g(\mathbf{s},\mathbf{t}) \equiv \prod_{j,i \in \widehat{A}}{\textrm{P}(s_j,t_i)}
\parinterval$g(\mathbf{s},\mathbf{t})$函数的建模很粗糙,因此下面将介绍IBM模型对问题更严谨的定义与建模。对于语言模型$\textrm{P}(\mathbf{t})$和解码过程在前面的内容中都有介绍,所以本章的后半部分会重点介绍如何定义翻译模型$\textrm{P}(\mathbf{s}|\mathbf{t})$以及如何训练模型参数。
\subsubsection{词对齐}\index{Chapter3.3.2.1}
\subsubsection{词对齐}
\parinterval IBM模型的一个基本的假设是词对齐假设。{\small\sffamily\bfseries{词对齐}}(Word Alignment)描述了源语言句子和目标语句子之间单词级别的对应。具体来说,给定源语句子$\mathbf{s}=s_1...s_m$和目标语译文$\mathbf{t}=t_1...t_l$,IBM模型假设词对齐具有如下两个性质。
\parinterval IBM模型的一个基本的假设是词对齐假设。{\small\sffamily\bfseries{词对齐}}\index{词对齐}(Word Alignment)\index{Word Alignment}描述了源语言句子和目标语句子之间单词级别的对应。具体来说,给定源语句子$\mathbf{s}=s_1...s_m$和目标语译文$\mathbf{t}=t_1...t_l$,IBM模型假设词对齐具有如下两个性质。
\begin{itemize}
\vspace{0.5em}
\item 一个源语言单词只能对应一个目标语单词。在图\ref{fig:3-15}表示的例子中,(a)和(c)都满足该条件,尽管(c)中的``谢谢''和``你''都对应``thanks'',但并不违背这个约束。而(b)不满足约束,因为``谢谢''同时对应到了两个目标语单词上。这个约束条件也导致这里的词对齐变成一种{\small\sffamily\bfseries{非对称的词对齐}}(Asymmetric Word Alignment),因为它只对源语言做了约束,但是目标语言没有。使用这样的约束的目的是为了减少建模的复杂度。在IBM模型之后的方法中也提出了双向词对齐,用于建模一个源语言单词对应到多个目标语单词的情况。
\item 一个源语言单词只能对应一个目标语单词。在图\ref{fig:3-15}表示的例子中,(a)和(c)都满足该条件,尽管(c)中的``谢谢''和``你''都对应``thanks'',但并不违背这个约束。而(b)不满足约束,因为``谢谢''同时对应到了两个目标语单词上。这个约束条件也导致这里的词对齐变成一种{\small\sffamily\bfseries{非对称的词对齐}}\index{非对称的词对齐}(Asymmetric Word Alignment)\index{Asymmetric Word Alignment},因为它只对源语言做了约束,但是目标语言没有。使用这样的约束的目的是为了减少建模的复杂度。在IBM模型之后的方法中也提出了双向词对齐,用于建模一个源语言单词对应到多个目标语单词的情况。
%----------------------------------------------
% 图3.21
\begin{figure}[htp]
......@@ -528,7 +530,7 @@ g(\mathbf{s},\mathbf{t}) \equiv \prod_{j,i \in \widehat{A}}{\textrm{P}(s_j,t_i)}
\end{figure}
%---------------------------
\vspace{0.5em}
\item 源语言单词可以翻译为空,这时它对应到一个虚拟或伪造的目标语单词$t_0$。在图\ref{fig:3-16}所示的例子中,``在''没有对应到``on the table''中的任意一个词,而是把它对应到$t_0$上。这样,所有的源语言单词都能找到一个目标语单词对应。这种设计也很好地引入了{\small\sffamily\bfseries{空对齐}}的思想,即源语言单词不对应任何真实存在的单词的情况。而这种空对齐的情况在翻译中是频繁出现的,比如虚词的翻译。
\item 源语言单词可以翻译为空,这时它对应到一个虚拟或伪造的目标语单词$t_0$。在图\ref{fig:3-16}所示的例子中,``在''没有对应到``on the table''中的任意一个词,而是把它对应到$t_0$上。这样,所有的源语言单词都能找到一个目标语单词对应。这种设计也很好地引入了{\small\sffamily\bfseries{空对齐}}\index{空对齐}的思想,即源语言单词不对应任何真实存在的单词的情况。而这种空对齐的情况在翻译中是频繁出现的,比如虚词的翻译。
%----------------------------------------------
% 图3.21
\begin{figure}[htp]
......@@ -542,7 +544,7 @@ g(\mathbf{s},\mathbf{t}) \equiv \prod_{j,i \in \widehat{A}}{\textrm{P}(s_j,t_i)}
\parinterval 通常,把词对齐记为$\mathbf{a}$,它由$a_1$$a_m$$m$个词对齐连接组成,即$\mathbf{a}=a_1...a_m$$a_j$表示第$j$个源语单词$s_j$对应的目标语单词的位置。在图\ref{fig:3-16}的例子中,词对齐关系可以记为$a_1=0, a_2=3, a_3=1$,即第1个源语单词``在''对应到目标语译文的第0个位置,第2个源语单词``桌子''对应到目标语译文的第3个位置,第3个源语单词``上''对应到目标语译文的第1个位置。
\subsubsection{基于词对齐的翻译模型}\index{Chapter3.3.2.2}
\subsubsection{基于词对齐的翻译模型}
\parinterval 直接准确估计$\textrm{P}(\mathbf{s}|\mathbf{t})$很难,训练数据只能覆盖整个样本空间非常小的一部分,绝大多数句子在训练数据中一次也没出现过。为了解决这个问题,IBM模型假设:句子之间的对应可以由单词之间的对应进行表示。于是,句子翻译的概率可以被转化为词对齐生成的概率:
......@@ -584,7 +586,7 @@ g(\mathbf{s},\mathbf{t}) \equiv \prod_{j,i \in \widehat{A}}{\textrm{P}(s_j,t_i)}
\parinterval 换句话说,当求$\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})$时,首先根据译文$\mathbf{t}$确定源语言句子$\mathbf{s}$的长度$m$;当知道源语言句子有多少个单词后,循环$m$次,依次生成第1个到第$m$个源语言单词;当生成第$j$个源语言单词时,要先确定它是由哪个目标语译文单词生成的,即确定生成的源语言单词对应的译文单词的位置;当知道了目标语译文单词的位置,就能确定第$j$个位置的源语言单词。
\parinterval 需要注意的是公式\ref{eqC3.19-new}定义的模型并没有做任何化简和假设,也就是说公式的左右两端是严格相等的。在后面的内容中会看到,这种将一个整体进行拆分的方法可以有助于分步骤化简并处理问题。
\subsubsection{基于词对齐的翻译实例}\index{Chapter3.3.2.3}
\subsubsection{基于词对齐的翻译实例}
\parinterval 用前面图\ref{fig:3-16}中的例子来对公式\ref{eqC3.19-new}进行说明。例子中,源语言句子``在\ \ 桌子\ \ 上''与目标语译文``on the table''之间的词对齐为$\mathbf{a}=\{\textrm{1-0, 2-3, 3-1}\}$。公式\ref{eqC3.19-new}的计算过程如下:
\begin{itemize}
......@@ -609,10 +611,11 @@ g(\mathbf{s},\mathbf{t}) \equiv \prod_{j,i \in \widehat{A}}{\textrm{P}(s_j,t_i)}
\end{eqnarray}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{IBM模型1-2}\index{Chapter3.4}
\sectionnewpage
\section{IBM模型1-2}
\parinterval 公式\ref{eqC3.18-new}和公式\ref{eqC3.19-new}把翻译问题定义为对译文和词对齐同时进行生成的问题。其中有两个问题:首先,公式\ref{eqC3.18-new}的右端($ \sum_{\mathbf{a}}\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})$)要求对所有的词对齐概率进行求和,但是词对齐的数量随句子长度呈指数增长,如何遍历所有的对齐$\mathbf{a}$?其次,公式\ref{eqC3.19-new}虽然对词对齐的问题进行了描述,但是模型中的很多参数仍然很复杂,如何计算$\textrm{P}(m|\mathbf{t})$、$\textrm{P}(a_j|a_1^{j-1},s_1^{j-1},m,\mathbf{t})$和$\textrm{P}(s_j|a_1^{j},s_1^{j-1},m,\mathbf{t})$?针对这些问题,Brown等人总共提出了5种解决方案,这也就是被后人所熟知的5个IBM翻译模型。第一个问题可以通过一定的数学或者工程技巧进行求解;第二个问题可以通过一些假设进行化简,依据化简的层次和复杂度不同,可以分为IBM模型1、IBM模型2、IBM模型3、IBM模型4以及IBM模型5。本节首先介绍较为简单的IBM模型1-2。
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{IBM模型1}\index{Chapter3.4.1}
\subsection{IBM模型1}
\parinterval IBM模型1对公式\ref{eqC3.19-new}中的三项进行了简化。具体方法如下:
\begin{itemize}
\vspace{0.5em}
......@@ -675,7 +678,7 @@ g(\mathbf{s},\mathbf{t}) \equiv \prod_{j,i \in \widehat{A}}{\textrm{P}(s_j,t_i)}
\parinterval 这样就得到了IBM模型1中句子翻译概率的计算式。可以看出IBM模型1的假设把翻译模型化简成了非常简单的形式。对于给定的$\mathbf{s}$,$\mathbf{a}$和$\mathbf{t}$,只要知道$\varepsilon$和$f(s_j |t_{a_j })$ 就可以计算出$\textrm{P}(\mathbf{s},\mathbf{a}| \mathbf{t})$,进而求出$\textrm{P}(\mathbf{s}| \mathbf{t})$。\\ \\ \\
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{IBM模型2}\index{Chapter3.4.2}
\subsection{IBM模型2}
\parinterval IBM模型1很好地化简了问题,但是由于使用了很强的假设,导致模型和实际情况有较大差异。其中一个比较严重的问题是假设词对齐的生成概率服从均匀分布。图\ref{fig:3-20}展示了一个简单的实例。尽管译文$\mathbf{t}$$\mathbf{t}'$的质量更好,但对于IBM模型1来说它们对应的翻译概率相同。这是因为当词对齐服从均匀分布时,模型会忽略目标语言单词的位置信息。因此当单词翻译相同但顺序不同时,翻译概率一样。同时,由于源语言单词是由错误位置的目标语单词生成的,不合理的对齐也会导致不合理的词汇翻译概率。
......@@ -706,7 +709,7 @@ g(\mathbf{s},\mathbf{t}) \equiv \prod_{j,i \in \widehat{A}}{\textrm{P}(s_j,t_i)}
\parinterval 类似于模型1,模型2的表达式\ref{eqC3.27-new}也能被拆分为两部分进行理解。第一部分:遍历所有的$\mathbf{a}$;第二部分:对于每个$\mathbf{a}$累加对齐概率$\textrm{P}(\mathbf{s},\mathbf{a}| \mathbf{t})$,即计算对齐概率$a(a_j|j,m,l)$和词汇翻译概率$f(s_j|t_{a_j})$对于所有源语言位置的乘积。
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{解码及计算优化}\index{Chapter3.4.3}
\subsection{解码及计算优化}
\parinterval 如果模型参数给定,可以使用IBM模型1-2对新的句子进行翻译。比如,可以使用\ref{sec:simple-decoding}节描述的解码方法搜索最优译文。在搜索过程中,只需要通过公式\ref{eqC3.25-new}\ref{eqC3.27-new}计算每个译文候选的IBM模型翻译概率。但是,公式\ref{eqC3.25-new}\ref{eqC3.27-new}的高计算复杂度导致这些模型很难直接使用。以IBM模型1为例,这里把公式\ref{eqC3.25-new}重写为:
\begin{eqnarray}
......@@ -742,11 +745,11 @@ g(\mathbf{s},\mathbf{t}) \equiv \prod_{j,i \in \widehat{A}}{\textrm{P}(s_j,t_i)}
公式\ref{eq:final-model1}\ref{eq:final-model2}是IBM模型1-2的最终表达式,在解码和训练中可以被直接使用。
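\parinterval 基于上述最终表达式,IBM模型1的句子翻译概率可以用如下示意性的Python代码计算(假设性实现,词汇翻译概率为举例而设;其中用``<null>''表示空标记$t_0$):
\begin{verbatim}
# 示意:IBM模型1 P(s|t) = eps/(l+1)^m * 连乘_j( 求和_i f(s_j|t_i) )
import math

def ibm1_prob(s, t, f, eps=1.0):
    t = ["<null>"] + t                  # 加入空标记 t_0
    l, m = len(t) - 1, len(s)
    prob = eps / math.pow(l + 1, m)
    for s_j in s:
        prob *= sum(f.get((s_j, t_i), 0.0) for t_i in t)
    return prob

f = {("在", "on"): 0.4, ("在", "<null>"): 0.2,
     ("桌子", "table"): 0.5, ("上", "on"): 0.3}   # 假设的词汇翻译概率
print(ibm1_prob(["在", "桌子", "上"], ["on", "the", "table"], f))
\end{verbatim}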
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{训练}\index{Chapter3.4.4}
\subsection{训练}
\parinterval 在完成了建模和解码的基础上,剩下的问题是如何得到模型的参数。这也是整个统计机器翻译里最重要的内容。下面将会对IBM模型1-2的参数估计方法进行介绍。
\subsubsection{目标函数}\index{Chapter3.4.4.1}
\subsubsection{目标函数}
\parinterval 统计机器翻译模型的训练是一个典型的优化问题。简单来说,训练是指在给定数据集(训练集)上调整参数使得目标函数的值达到最大(或最小),此时得到的参数被称为是该模型在该目标函数下的最优解(图\ref{fig:3-22})。
......@@ -778,9 +781,9 @@ g(\mathbf{s},\mathbf{t}) \equiv \prod_{j,i \in \widehat{A}}{\textrm{P}(s_j,t_i)}
\end{eqnarray}
\noindent 其中,$\textrm{max}(\cdot)$表示最大化,$\frac{\varepsilon}{(l+1)^m}\prod_{j=1}^{m}\sum_{i=0}^{l}{f({s_j|t_i})}$是目标函数,$f({s_j|t_i})$是模型的参数,$\sum_{s_x}{f(s_x|t_y)}=1$是优化的约束条件,以保证翻译概率满足归一化的要求。需要注意的是$\{f(s_x |t_y)\}$对应了很多参数,每个源语言单词和每个目标语单词的组合都对应一个参数$f(s_x |t_y)$
%%%%%%%%%%%%%%%%%%%%%%
\subsubsection {优化}\index{Chapter3.4.4.2}
\subsubsection {优化}
\parinterval 我们已经把IBM模型的参数训练问题定义为带约束的目标函数优化问题。由于目标函数是可微分函数,解决这类问题的一种常用手法是把带约束的优化问题转化为不带约束的优化问题。这里用到了{\small\sffamily\bfseries{拉格朗日乘数法}}(The Lagrange Multiplier Method),它的基本思想是把含有$n$个变量和$m$个约束条件的优化问题转化为含有$n+m$个变量的无约束优化问题。
\parinterval 我们已经把IBM模型的参数训练问题定义为带约束的目标函数优化问题。由于目标函数是可微分函数,解决这类问题的一种常用手法是把带约束的优化问题转化为不带约束的优化问题。这里用到了{\small\sffamily\bfseries{拉格朗日乘数法}}\index{拉格朗日乘数法}(The Lagrange Multiplier Method)\index{The Lagrange Multiplier Method},它的基本思想是把含有$n$个变量和$m$个约束条件的优化问题转化为含有$n+m$个变量的无约束优化问题。
\parinterval 这里的目标是$\max(\textrm{P}_{\theta}(\mathbf{s}|\mathbf{t}))$,约束条件是对于任意的目标语单词$t_y$\\$\sum_{s_x}{\textrm{P}(s_x|t_y)}=1$。根据拉格朗日乘数法,可以把上述优化问题重新定义最大化如下拉格朗日函数:
\begin{eqnarray}
......@@ -850,7 +853,7 @@ f(s_u|t_v) = \lambda_{t_v}^{-1} \frac{\varepsilon}{(l+1)^{m}} \prod\limits_{j=1}
\label{eqC3.40-new}
\end{eqnarray}
\noindent \hspace{2em} 可以看出,这不是一个计算$f(s_u|t_v)$的解析式,因为等式右端仍含有$f(s_u|t_v)$。不过它蕴含着一种非常经典的方法\ $\dash$\ {\small\sffamily\bfseries{期望最大化}}(Expectation Maximization)方法,简称EM方法(或算法)。使用EM方法可以利用上式迭代地计算$f(s_u|t_v)$,使其最终收敛到最优值。EM方法的思想是:用当前的参数,求似然函数的期望,之后最大化这个期望同时得到新的一组参数的值。对于IBM模型来说,其迭代过程就是反复使用公式\ref{eqC3.40-new},具体如图\ref{fig:3-24}所示。
\noindent \hspace{2em} 可以看出,这不是一个计算$f(s_u|t_v)$的解析式,因为等式右端仍含有$f(s_u|t_v)$。不过它蕴含着一种非常经典的方法\ $\dash$\ {\small\sffamily\bfseries{期望最大化}}\index{期望最大化}(Expectation Maximization)\index{Expectation Maximization}方法,简称EM方法(或算法)。使用EM方法可以利用上式迭代地计算$f(s_u|t_v)$,使其最终收敛到最优值。EM方法的思想是:用当前的参数,求似然函数的期望,之后最大化这个期望同时得到新的一组参数的值。对于IBM模型来说,其迭代过程就是反复使用公式\ref{eqC3.40-new},具体如图\ref{fig:3-24}所示。
%----------------------------------------------
% 图3.28
\begin{figure}[htp]
......@@ -873,7 +876,7 @@ f(s_u|t_v) = \lambda_{t_v}^{-1} \frac{\varepsilon}{(l+1)^{m}} \prod\limits_{j=1}
\end{figure}
%---------------------------
\noindent 其中,红色部分表示翻译概率P$(\mathbf{s}|\mathbf{t})$;蓝色部分表示$(s_u,t_v)$在句对$(\mathbf{s},\mathbf{t})$中配对的总次数,即``$t_v$翻译为$s_u$''在所有对齐中出现的次数;绿色部分表示$f(s_u|t_v)$对于所有的$t_i$的相对值,即``$t_v$翻译为$s_u$''在所有对齐中出现的相对概率;蓝色与绿色部分相乘表示``$t_v$翻译为$s_u$''这个事件出现次数的期望的估计,称之为{\small\sffamily\bfseries{期望频次}}(Expected Count)
\noindent 其中,红色部分表示翻译概率P$(\mathbf{s}|\mathbf{t})$;蓝色部分表示$(s_u,t_v)$在句对$(\mathbf{s},\mathbf{t})$中配对的总次数,即``$t_v$翻译为$s_u$''在所有对齐中出现的次数;绿色部分表示$f(s_u|t_v)$对于所有的$t_i$的相对值,即``$t_v$翻译为$s_u$''在所有对齐中出现的相对概率;蓝色与绿色部分相乘表示``$t_v$翻译为$s_u$''这个事件出现次数的期望的估计,称之为{\small\sffamily\bfseries{期望频次}}\index{期望频次}(Expected Count)\index{Expected Count}
\noindent \hspace{2em} 期望频次是事件在其分布下出现次数的期望。令$c_{\mathbb{E}}(X)$为事件$X$的期望频次,其计算公式为:
......@@ -973,15 +976,16 @@ a(i|j,m,l) &=\frac{\sum_{k=0}^{K}c_{\mathbb{E}}(i|j;\mathbf{s}^{[k]},\mathbf{t}^
\end{eqnarray}
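\parinterval 上述迭代过程可以用一段示意性的Python代码来概括(假设性的简化实现:为突出EM方法的核心步骤,这里省略了空标记$t_0$与$\varepsilon$,数据也为举例而设):
\begin{verbatim}
# 示意:IBM模型1的EM训练(E步累计期望频次,M步重新归一化)
from collections import defaultdict

pairs = [(["在", "桌子", "上"], ["on", "the", "table"]),
         (["桌子"], ["table"])]                # 假设的双语数据

src = {w for s, _ in pairs for w in s}
tgt = {w for _, t in pairs for w in t}
f = {(x, y): 1.0 / len(src) for x in src for y in tgt}  # 均匀初始化

for _ in range(10):
    count = defaultdict(float)                 # 期望频次 c_E(x|y)
    total = defaultdict(float)
    for s, t in pairs:
        for x in s:
            norm = sum(f[(x, y)] for y in t)
            for y in t:
                delta = f[(x, y)] / norm       # x 对齐到 y 的后验概率
                count[(x, y)] += delta
                total[y] += delta
    f = {(x, y): count[(x, y)] / total[y] for x in src for y in tgt}

print(round(f[("桌子", "table")], 3))          # 迭代后该概率明显增大
\end{verbatim}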
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{IBM模型3-5及隐马尔可夫模型}\index{Chapter3.5}
\sectionnewpage
\section{IBM模型3-5及隐马尔可夫模型}
\parinterval 本节在IBM模型1-2的基础上继续介绍IBM模型3-5,这些模型采用了更细致的建模方式来描述翻译问题,包括引入产出率、单词的抽象等重要方法。此外,本节也会介绍隐马尔可夫模型,它和IBM模型有一定联系,但是从另一个视角看待翻译问题。
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{基于产出率的翻译模型}\index{Chapter3.5.1}
\subsection{基于产出率的翻译模型}
\parinterval 从前面的介绍可知,IBM模型1和模型2把不同的源语言单词看作相互独立的单元来进行词对齐和翻译。换句话说,即使某个源语言短语中的两个单词都对齐到同一个目标语单词,它们之间也是相互独立的。这样模型1和模型2对于多个源语言单词对齐到同一个目标语单词的情况并不能很好地进行描述。
\parinterval 这里将会给出另一个翻译模型,能在一定程度上解决上面提到的问题。该模型把译文生成源文的过程分解为如下几个步骤:首先,确定每个目标语言单词生成源语言单词的个数,这里把它称为{\small\sffamily\bfseries{产出率}}{\small\sffamily\bfseries{繁衍率}}(Fertility);其次,决定译文中每个单词生成的源语言单词都是什么,即决定生成的第一个源语言单词是什么,生成的第二个源语言单词是什么,以此类推。这样每个目标语单词就对应了一个源语言单词列表;最后把各组源语言单词列表中的每个单词都放置到合适的位置上,完成目标语言译文到源语言句子的生成。
\parinterval 这里将会给出另一个翻译模型,能在一定程度上解决上面提到的问题。该模型把译文生成源文的过程分解为如下几个步骤:首先,确定每个目标语言单词生成源语言单词的个数,这里把它称为{\small\sffamily\bfseries{产出率}}\index{产出率}{\small\sffamily\bfseries{繁衍率}}\index{繁衍率}(Fertility)\index{Fertility};其次,决定译文中每个单词生成的源语言单词都是什么,即决定生成的第一个源语言单词是什么,生成的第二个源语言单词是什么,以此类推。这样每个目标语单词就对应了一个源语言单词列表;最后把各组源语言单词列表中的每个单词都放置到合适的位置上,完成目标语言译文到源语言句子的生成。
%----------------------------------------------
% 图3.5.1
......@@ -1029,15 +1033,15 @@ a(i|j,m,l) &=\frac{\sum_{k=0}^{K}c_{\mathbb{E}}(i|j;\mathbf{s}^{[k]},\mathbf{t}^
\begin{itemize}
\item 对每个$i\in[1,l]$的目标语单词的产出率建模({\color{red} 红色}),即$\varphi_i$的概率。它依赖于$\mathbf{t}$和区间$[1,i-1]$的目标语单词的产出率$\varphi_1^{i-1}$\footnote{这里约定,当$i=1$ 时,$\varphi_1^0$ 表示一个空}
\item $i=0$时的产出率建模({\color{ublue} 蓝色}),即空标记$t_0$的产出率的概率。它依赖于$\mathbf{t}$和区间$[1,i-1]$的目标语单词的产出率$\varphi_1^l$
\item 词汇翻译建模({\color{ugreen} 绿色}),目标语言单词$t_i$生成第$k$个源语言单词$\tau_{ik}$时的概率,依赖于$\mathbf{t}$、所有目标语言单词的产出率$\varphi_0^l$、区间$i\in[1,l]$的目标语言单词生成的源语言单词$\tau_1^{i-1}$和目标语单词$t_i$生成的前$k$个源语言单词$\tau_{i1}^{k-1}$
\item 对于每个$i\in[1,l]$的目标语言单词生成的源语言单词的{\small\bfnew{扭曲度}}(Distortion)建模({\color{yellow!70!black} 黄色}),即第$i$个译文单词生成的第$k$个源语言单词在源文中的位置$\pi_{ik}$ 的概率。其中$\pi_1^{i-1}$$\pi_{i1}^{k-1}$分别表示区间$[1,i-1]$的目标语言单词生成的源语言单词的扭曲度和第$i$译文单词生成的前$k$个源语言单词的扭曲度。
\item $i=0$时的扭曲度建模({\color{gray} 灰色}),即空标记$t_0$生成的源语言单词在源语言句子中位置的概率。
\item 对每个$i\in[1,l]$的目标语单词的产出率建模({\color{red!70} 红色}),即$\varphi_i$的概率。它依赖于$\mathbf{t}$和区间$[1,i-1]$的目标语单词的产出率$\varphi_1^{i-1}$\footnote{这里约定,当$i=1$ 时,$\varphi_1^0$ 表示一个空}
\item $i=0$时的产出率建模({\color{blue!70} 蓝色}),即空标记$t_0$的产出率的概率。它依赖于$\mathbf{t}$和区间$[1,i-1]$的目标语单词的产出率$\varphi_1^l$
\item 词汇翻译建模({\color{green!70} 绿色}),目标语言单词$t_i$生成第$k$个源语言单词$\tau_{ik}$时的概率,依赖于$\mathbf{t}$、所有目标语言单词的产出率$\varphi_0^l$、区间$i\in[1,l]$的目标语言单词生成的源语言单词$\tau_1^{i-1}$和目标语单词$t_i$生成的前$k$个源语言单词$\tau_{i1}^{k-1}$
\item 对于每个$i\in[1,l]$的目标语言单词生成的源语言单词的{\small\bfnew{扭曲度}}\index{扭曲度}(Distortion)\index{Distortion}建模({\color{yellow!70!black} 黄色}),即第$i$个译文单词生成的第$k$个源语言单词在源文中的位置$\pi_{ik}$ 的概率。其中$\pi_1^{i-1}$$\pi_{i1}^{k-1}$分别表示区间$[1,i-1]$的目标语言单词生成的源语言单词的扭曲度和第$i$译文单词生成的前$k$个源语言单词的扭曲度。
\item $i=0$时的扭曲度建模({\color{gray!70} 灰色}),即空标记$t_0$生成的源语言单词在源语言句子中位置的概率。
\end{itemize}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{IBM 模型3}\index{Chapter3.5.2}
\subsection{IBM 模型3}
\parinterval IBM模型3通过一些假设对图\ref{fig:3-31}所表示的基本模型进行了化简。具体来说,对于每个$i\in[1,l]$,假设$\textrm{P}(\varphi_i |\varphi_1^{i-1},\mathbf{t})$仅依赖于$\varphi_i$$t_i$$\textrm{P}(\pi_{ik}|\pi_{i1}^{k-1},\pi_1^{i-1},\tau_0^l,\varphi_0^l,\mathbf{t})$仅依赖于$\pi_{ik}$$i$$m$$l$。而对于所有的$i\in[0,l]$,假设$\textrm{P}(\tau_{ik}|\tau_{i1}^{k-1},\tau_1^{i-1},\phi_0^l,\mathbf{t})$仅依赖于$\tau_{ik}$$t_i$。形式化这些假设,可以得到:
\begin{eqnarray}
......@@ -1086,11 +1090,11 @@ p_0+p_1 & = & 1 \label{eqC3.62-new}
\end{eqnarray}
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{IBM 模型4}\index{Chapter3.5.3}
\subsection{IBM 模型4}
\parinterval IBM模型3仍然存在问题,比如,它不能很好地处理一个目标语言单词生成多个源语言单词的情况。这个问题在模型1和模型2中也存在。如果一个目标语言单词对应多个源语言单词,往往这些源语言单词构成短语或搭配。但是模型1-3把这些源语言单词看成独立的单元,而实际上它们是一个整体。这就造成了在模型1-3中这些源语言单词可能会``分散''开。为了解决这个问题,模型4对模型3进行了进一步修正。
\parinterval 为了更清楚的阐述,这里引入新的术语\ \dash \ {\small\bfnew{概念单元}}{\small\bfnew{概念}}(Concept)。词对齐可以被看作概念之间的对应。这里的概念是指具有独立语法或语义功能的一组单词。依照Brown等人的表示方法\cite{brown1993mathematics},可以把概念记为cept.。每个句子都可以被表示成一系列的cept.。这里要注意的是,源语言句子中的cept.数量不一定等于目标句子中的cept.数量。因为有些cept. 可以为空,因此可以把那些空对的单词看作空cept.。比如,在图\ref{fig:3-32}的实例中,``了''就对应一个空cept.。
\parinterval 为了更清楚的阐述,这里引入新的术语\ \dash \ {\small\bfnew{概念单元}}\index{概念单元}{\small\bfnew{概念}}\index{概念}(Concept)\index{Concept}。词对齐可以被看作概念之间的对应。这里的概念是指具有独立语法或语义功能的一组单词。依照Brown等人的表示方法\cite{Peter1993The},可以把概念记为cept.。每个句子都可以被表示成一系列的cept.。这里要注意的是,源语言句子中的cept.数量不一定等于目标句子中的cept.数量。因为有些cept. 可以为空,因此可以把那些空对的单词看作空cept.。比如,在图\ref{fig:3-32}的实例中,``了''就对应一个空cept.。
%----------------------------------------------
% 图3.6.1
\begin{figure}[htp]
......@@ -1126,9 +1130,9 @@ p_0+p_1 & = & 1 \label{eqC3.62-new}
\parinterval 实际上,上述过程就要先用$t_{[i]}$生成的第一个源语言单词代表整个$t_{[i]}$生成的单词列表,并把第一个源语言单词放置在合适的位置。然后,相对于前一个刚生成的源语言单词,把列表中的其他单词放置在合适的地方。这样就可以在一定程度上保证由同一个目标语言单词生成的源语言单词之间可以相互影响,达到了改进的目的。
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{ IBM 模型5}\index{Chapter3.5.4}
\subsection{ IBM 模型5}
\parinterval 模型3和模型4并不是``准确''的模型。这两个模型会把一部分概率分配给一些根本就不存在的句子。这个问题被称作IBM模型3和模型4的{\small\bfnew{缺陷}}(Deficiency)。说的具体一些,模型3和模型4 中并没有这样的约束:如果已经放置了某个源语言单词的位置不能再放置其他单词,也就是说句子的任何位置只能放置一个词,不能多也不能少。由于缺乏这个约束,模型3和模型4中在所有合法的词对齐上概率和不等于1。 这部分缺失的概率被分配到其他不合法的词对齐上。举例来说,如图\ref{fig:3-33}所示,``吃 早饭''和``Have breakfast''之间的合法词对齐用直线表示 。但是在模型3和模型4中, 在它们上的概率和为$0.9<1$。 损失掉的概率被分配到像5和6这样的对齐上了(红色)。虽然IBM模型并不支持一对多的对齐,但是模型3和模型4把概率分配给这些``不合法''的词对齐上,因此也就产生所谓的Deficiency问题。
\parinterval 模型3和模型4并不是``准确''的模型。这两个模型会把一部分概率分配给一些根本就不存在的句子。这个问题被称作IBM模型3和模型4的{\small\bfnew{缺陷}}\index{缺陷}(Deficiency)\index{Deficiency}。说得具体一些,模型3和模型4 中并没有这样的约束:已经放置了某个源语言单词的位置不能再放置其他单词,也就是说句子的任何位置只能放置一个词,不能多也不能少。由于缺乏这个约束,模型3和模型4中在所有合法的词对齐上概率和不等于1。 这部分缺失的概率被分配到其他不合法的词对齐上。举例来说,如图\ref{fig:3-33}所示,``吃 早饭''和``Have breakfast''之间的合法词对齐用直线表示 。但是在模型3和模型4中, 在它们上的概率和为$0.9<1$。 损失掉的概率被分配到像5和6这样的对齐上了(红色)。虽然IBM模型并不支持一对多的对齐,但是模型3和模型4把概率分配给这些``不合法''的词对齐上,因此也就产生所谓的Deficiency问题。
%----------------------------------------------
% 图3.5.4
......@@ -1161,7 +1165,7 @@ p_0+p_1 & = & 1 \label{eqC3.62-new}
\parinterval 实际上,模型5和模型4的思想基本一致,即,先确定$\tau_{[i]1}$的绝对位置,然后再确定$\tau_{[i]}$中剩余单词的相对位置。模型5消除了产生不存在的句子的可能性,不过模型5的复杂性也大大增加了。
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{隐马尔可夫模型}\index{Chapter3.5.5}
\subsection{隐马尔可夫模型}
\parinterval IBM模型可以得到双语句子间的词对齐,因此也有很多工作在这个模型的基础上对词对齐方法进行改进。其中一个比较有代表性的工作是基于隐马尔可夫模型的方法\cite{vogel1996hmm},它可以被看作是IBM 模型2的升级版本。
......@@ -1231,26 +1235,27 @@ p_0+p_1 & = & 1 \label{eqC3.62-new}
\noindent 其中,$\mu( \cdot )$是隐马尔可夫模型的参数,可以通过训练得到。
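\parinterval 这种基于跳转宽度的对齐概率计算可以用如下示意性的Python代码表示(假设性实现,其中$\mu(\cdot)$的具体形式仅为演示而假设):
\begin{verbatim}
# 示意:隐马尔可夫模型中 P(a_j = i | a_{j-1} = i', l) 的计算
def align_prob(i, i_prev, l, mu):
    # 只依赖跳转宽度 i - i_prev,并在所有可能的目标位置上归一化
    norm = sum(mu(k - i_prev) for k in range(1, l + 1))
    return mu(i - i_prev) / norm

mu = lambda d: 1.0 / (1 + abs(d - 1))   # 假设:偏好向右前进一个位置
print(align_prob(2, 1, 4, mu))          # 对齐位置从1跳到2的概率
\end{verbatim}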
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{解码和训练}\index{Chapter3.5.5}
\subsection{解码和训练}
\parinterval 和IBM模型1-2一样,IBM模型3-5和隐马尔可夫模型的解码可以直接使用\ref{sec:sentence-level-translation}\\节所描述的方法。基本思路是对译文自左向右生成,每次扩展一个源语言单词的翻译,即把源语言单词的译文放到已经生成的译文的右侧。每次扩展可以选择不同的源语言单词或者同一个源语言单词的不同翻译候选,这样就可以得到多个不同的扩展译文。在这个过程中,同时计算翻译模型和语言模型的得分,对每个得到译文候选打分。最终,保留一个或者多个译文。这个过程重复执行直至所有源语言单词被翻译完。
\parinterval 类似的,IBM模型3-5和隐马尔可夫模型也都可以使用期望最大化(EM)方法进行模型训练。相关数学推导可参考附录\ref{appendix-B}的内容。通常,可以使用这些模型获得双语句子间的词对齐结果,比如著名的GIZA++工具。这时,往往会使用多个模型,把简单的模型训练后的参数作为初始值送给后面更加复杂的模型。比如,先用IBM模型1训练,之后把参数送给IBM模型2,再训练,之后把参数送给隐马尔可夫模型等。值得注意的是,并不是所有的模型使用EM算法都能找到全局最优解。特别是IBM模型3-5的训练中使用一些剪枝和近似的方法,优化的真实目标函数会更加复杂。不过,IBM模型1是一个{\small\bfnew{凸函数}}(Convex function),因此理论上使用EM方法是能找到全局最优解的。更实际的好处是,IBM模型1训练的最终结果与参数的初始化过程无关。这也是为什么在使用IBM系列模型时,往往会使用IBM模型1作为起始模型的原因。
\parinterval 类似的,IBM模型3-5和隐马尔可夫模型也都可以使用期望最大化(EM)方法进行模型训练。相关数学推导可参考附录\ref{appendix-B}的内容。通常,可以使用这些模型获得双语句子间的词对齐结果,比如著名的GIZA++工具。这时,往往会使用多个模型,把简单的模型训练后的参数作为初始值送给后面更加复杂的模型。比如,先用IBM模型1训练,之后把参数送给IBM模型2,再训练,之后把参数送给隐马尔可夫模型等。值得注意的是,并不是所有的模型使用EM算法都能找到全局最优解。特别是IBM模型3-5的训练中使用一些剪枝和近似的方法,优化的真实目标函数会更加复杂。不过,IBM模型1是一个{\small\bfnew{凸函数}}\index{凸函数}(Convex function)\index{Convex function},因此理论上使用EM方法是能找到全局最优解的。更实际的好处是,IBM模型1训练的最终结果与参数的初始化过程无关。这也是为什么在使用IBM系列模型时,往往会使用IBM模型1作为起始模型的原因。
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{问题分析}\index{Chapter3.6}%Index的作用,目前不清晰
\sectionnewpage
\section{问题分析}
\parinterval IBM模型是一个时代的经典,但也留下了一些值得思考的问题。这一方面体现了科学技术发展需要一步步前行,而非简单的一蹴而就。另一方面也体现了机器翻译问题的困难程度。下面对IBM模型存在的问题进行分析,同时给出一些解决问题的思路,希望通过这些讨论可以使我们对机器翻译问题有更深层次的理解。
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{词对齐及对称化}\index{Chapter3.6.1}
\subsection{词对齐及对称化}
\parinterval IBM的五个模型都是基于一个词对齐的假设\ \dash \ 一个源语言单词最多只能对齐到一个目标语言单词。这个约束大大化简了IBM模型的建模。最初,Brown等人提出这个假设可能是因为在法英翻译中一对多的对齐情况并不多见,这个假设带来的问题也不是那么严重。但是,在像汉英翻译这样的任务中,一个汉语单词对应多个英语单词的翻译很常见,这时IBM模型的词对齐假设就表现出了明显的问题。比如在翻译``我\ \ \ \ 试一试 。''\ $\to$ \ ``I will have a try .''时,IBM模型根本不能把单词``试一试''对齐到三个单词``have a try'',因而可能无法得到正确的翻译结果。
\parinterval 本质上说,IBM模型词对齐的``不完整''问题是IBM模型本身的缺陷。解决这个问题有很多思路,第一种方法就是,反向训练后,合并源语言单词,然后再正向训练。这里用汉英翻译为例来解释这个方法。首先反向训练,就是把英语当作待翻译语言,而把汉语当作目标语言进行训练(参数估计)。这样可以得到一个词对齐结果(参数估计的中间结果)。在这个词对齐结果里面,一个汉语单词可对应多个英语单词。之后,扫描每个英语句子,如果有多个英语单词对应同一个汉语单词,就把这些英语单词合并成一个英语单词。处理完之后,再把汉语当作源语言而把英语当作目标语言进行训练。这样就可以把一个汉语单词对应到合并的英语单词上。虽然从模型上看,还是一个汉语单词对应一个英语``单词'',但实质上已经把这个汉语单词对应到多个英语单词上了。训练完之后,再利用这些参数进行翻译(解码)时,就能把一个中文单词翻译成多个英文单词了。但是反向训练后再训练也存在一些问题。首先,合并英语单词会使数据变得更稀疏,训练不充分。其次,由于IBM模型的词对齐结果并不是高精度的,利用它的词对齐结果来合并一些英文单词可能造成严重的错误,比如:把本来很独立的几个单词合在了一起。因此,此方法也并不完美。具体使用时还要考虑实际需要和问题的严重程度来决定是否使用这个方法。
\parinterval 另一种方法是双向对齐之后进行词对齐{\small\sffamily\bfseries{对称化}}(Symmetrization)。这个方法可以在IBM词对齐的基础上获得对称的词对齐结果。思路很简单,用正向(汉语为源语言,英语为目标语言)和反向(汉语为目标语言,英语为源语言)同时训练。这样可以得到两个词对齐结果。然后利用一些启发性方法用这两个词对齐生成对称的结果(比如,取``并集''、``交集''等),这样就可以得到包含一对多和多对多的词对齐结果。比如,在基于短语的统计机器翻译中已经很成功地使用了这种词对齐信息进行短语的获取。直到今天,对称化仍然是很多自然语言处理系统中的一个关键步骤。
\parinterval 另一种方法是双向对齐之后进行词对齐{\small\sffamily\bfseries{对称化}}\index{对称化}(Symmetrization)\index{Symmetrization}。这个方法可以在IBM词对齐的基础上获得对称的词对齐结果。思路很简单,用正向(汉语为源语言,英语为目标语言)和反向(汉语为目标语言,英语为源语言)同时训练。这样可以得到两个词对齐结果。然后利用一些启发性方法用这两个词对齐生成对称的结果(比如,取``并集''、``交集''等),这样就可以得到包含一对多和多对多的词对齐结果。比如,在基于短语的统计机器翻译中已经很成功地使用了这种词对齐信息进行短语的获取。直到今天,对称化仍然是很多自然语言处理系统中的一个关键步骤。
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{Deficiency}\index{Chapter3.6.2}
\subsection{Deficiency}
\parinterval Deficiency问题是指翻译模型会把一部分概率分配给一些根本不存在的源语言字符串。如果用$\textrm{P}(\textrm{well}|\mathbf{t})$表示$\textrm{P}(\mathbf{s}| \mathbf{t})$在所有的正确的(可以理解为语法上正确的)$\mathbf{s}$上的和,即
\begin{eqnarray}
......@@ -1271,21 +1276,21 @@ p_0+p_1 & = & 1 \label{eqC3.62-new}
\parinterval IBM的模型5已经解决了Technical Deficiency问题。不过模型5过于复杂。实际上Technical Deficiency问题是不是需要解决,这一点在本节随后的内容中还要进行讨论。Spiritually Deficiency的解决很困难,因为即使对于人来说也很难判断一个句子是不是``良好''的句子。当然可以考虑用语言模型来缓解这个问题,不过由于在翻译的时候源语言句子都是定义``良好''的句子,$\textrm{P}({\textrm{ill}|\mathbf{t}})$$\textrm{P}(\mathbf{s}| \mathbf{t})$的影响并不大。但用输入的源语言句子$\mathbf{s}$的``良好性''并不能解决Technical Deficiency,因为Technical Deficiency是模型的问题或者模型参数估计方法的问题。无论输入什么样的$\mathbf{s}$,模型3和模型4的Technical Deficiency问题都存在。
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\subsection{句子长度}\index{Chapter3.6.4}
\subsection{句子长度}
\parinterval 在IBM模型中,$\textrm{P}(\mathbf{t})\textrm{P}(\mathbf{s}| \mathbf{t})$会随着目标语言句子长度的增加而减少,因为这种生成模型由多个概率化的因素组成,一般乘积项越多结果的值越小。这也就是说,IBM模型会更倾向选择长度短一些的目标语言句子。显然这种对短句子的偏向性并不是我们所期望的。
\parinterval 这个问题在很多统计机器翻译系统中都存在,实际上也是一种{\small\bfnew{系统偏置}}(System Bias)的体现。为了消除这种偏置,可以通过在模型中增加一个短句子惩罚引子来抵消掉模型对短句子的倾向性。比如,可以定义一个惩罚引子,它的值随着长度的减少而增加。不过,简单引入这样的惩罚因子会导致模型并不符合一个严格的噪声信道模型。它对应一个判别式框架的翻译模型,这部分内容会在下一章进行介绍。
\parinterval 这个问题在很多统计机器翻译系统中都存在,实际上也是一种{\small\bfnew{系统偏置}}\index{系统偏置}(System Bias)\index{System Bias}的体现。为了消除这种偏置,可以通过在模型中增加一个短句子惩罚因子来抵消掉模型对短句子的倾向性。比如,可以定义一个惩罚因子,它的值随着长度的减少而增加。不过,简单引入这样的惩罚因子会导致模型并不符合一个严格的噪声信道模型。它对应一个判别式框架的翻译模型,这部分内容会在下一章进行介绍。
\subsection{其他问题}\index{Chapter3.6.5}
\subsection{其他问题}
\parinterval 模型5的意义是什么?模型5的提出是为了消除模型3和模型4的Deficiency问题。Deficiency问题的本质是,$\textrm{P}(\mathbf{s},\mathbf{a}| \mathbf{t})$在所有合理的对齐上概率和不为1。 但是,在统计机器翻译中更关心的是哪个对齐$\mathbf{a}$使$\textrm{P}(\mathbf{s},\mathbf{a}| \mathbf{t})$达到最大,即使$\textrm{P}(\mathbf{s},\mathbf{a}|\mathbf{t})$不符合概率分布的定义,也并不影响我们寻找理想的对齐$\mathbf{a}$。从工程的角度说,$\textrm{P}(\mathbf{s},\mathbf{a}| \mathbf{t})$不归一并不是一个十分严重的问题。遗憾的是,实际上到现在为止并没有太多工作对IBM模型3和模型4中的Deficiency问题进行过系统的实验和分析,因此对于这个问题到底有多严重并没有定论。当然用模型5是可以解决这个问题的。但是如果用一个非常复杂的模型去解决了一个并不产生严重后果的问题,那这个模型也就没有太大意义了(从实践的角度)。
\parinterval 概念(cept.)的意义是什么?经过前面的分析可知,IBM模型的词对齐模型使用了cept.这个概念。但是,在IBM模型中使用的cept.最多只能对应一个目标语言单词(模型并没有用到源语言cept. 的概念)。因此可以直接用单词代替cept.。这样,即使不引入cept.的概念,也并不影响IBM模型的建模。实际上,cept.的引入确实可以帮助我们从语法和语义的角度解释词对齐过程。不过,这个方法在IBM 模型中的效果究竟如何还没有定论。
\section{小结及深入阅读}\index{Chapter3.7}
\sectionnewpage
\section{小结及深入阅读}
\parinterval 本章对IBM系列模型进行了全面的介绍和讨论,从一个简单的基于单词的翻译模型开始,本章以建模、解码、训练多个维度对统计机器翻译进行了描述,期间也涉及了词对齐、优化等多个重要概念。IBM 模型共分为5个模型,对翻译问题的建模依次由浅入深,同时模型复杂度也依次增加。IBM模型作为入门统计机器翻译的``必经之路'',其思想对今天的机器翻译仍然产生着影响。虽然单独使用IBM模型进行机器翻译现在已经不多见,甚至很多从事神经机器翻译等前沿研究的人对IBM模型已经逐渐淡忘,但是不能否认IBM模型标志着一个时代的开始。从某种意义上,当使用公式$\hat{\mathbf{t}} = \argmax_{\mathbf{t}} \textrm{P}(\mathbf{t}|\mathbf{s})$描述机器翻译问题的时候,或多或少都在与IBM模型使用相似的思想。
......
......@@ -11,11 +11,12 @@
\node[anchor=east] (t0) at (-0.5em, -1.5) {$\textbf{t}$:};
\node[anchor=north] (l) at ([xshift=7em,yshift=-0.5em]t0.south) {\footnotesize{(a)\ }};
\end{scope}
\begin{scope}[xshift=14.5em,minimum height = 18pt]
\begin{scope}[xshift=15em,minimum height = 18pt]
\node[anchor=east] (s0) at (-0.5em, 0) {$\textbf{s}$:};
\node[anchor=west,fill=green!20] (s1) at (0, 0) {\footnotesize{桌子 上}};
......@@ -28,11 +29,12 @@
\path[<->, thick] (s2.south) edge (t1.north);
}
\node[anchor=north] (l) at ([xshift=7em,yshift=-0.5em]t0.south) {\footnotesize{(b)\ }};
\end{scope}
\begin{scope}[yshift=-6.0em,minimum height = 18pt]
\begin{scope}[yshift=-9.5em,minimum height = 18pt]
\node[anchor=east] (s0) at (-0.5em, 0) {$\textbf{s}$:};
\node[anchor=west,fill=green!20] (s1) at (0, 0) {\footnotesize{桌子 上}};
......@@ -48,13 +50,13 @@
\node[anchor=west,fill=blue!20] (t2) at ([xshift=1em]t1.east) {\footnotesize{an apple}};
\path[<->, thick] (s3.south) edge (t2.north);
}
\node[anchor=north] (l) at ([xshift=7em,yshift=-0.5em]t0.south) {\footnotesize{(c)\ }};
\end{scope}
\begin{scope}[xshift=14.5em,yshift=-6.0em,minimum height = 18pt]%[scale=0.5]
\begin{scope}[xshift=15em,yshift=-9.5em,minimum height = 18pt]%[scale=0.5]
\node[anchor=east] (s0) at (-0.5em, 0) {$\textbf{s}$:};
\node[anchor=west,fill=green!20] (s1) at (0, 0) {\footnotesize{桌子 上}};
......@@ -74,6 +76,6 @@
\node[anchor=west,fill=green!20] (t3) at ([xshift=1em]t2.east) {\footnotesize{on the table}};
\path[<->, thick] (s1.south) edge (t3.north);
}
\node[anchor=north] (l) at ([xshift=7em,yshift=-0.5em]t0.south) {\footnotesize{(d)\ }};
\end{scope}
\end{tikzpicture}
\ No newline at end of file
......@@ -7,7 +7,7 @@
\node[anchor=north] (q1) at (0,0) {\scriptsize\sffamily\bfseries{输入字符串:}};
\node[anchor=west] (q2) at ([xshift=0em,yshift=-2em]q1.west) {\footnotesize{进口$\quad$$\quad$出口$\quad$大幅度$\quad$下降$\quad$}};
\node[anchor=north,fill=blue!20,minimum height=1em,minimum width=1em] (f1) at ([xshift=-4.1em,yshift=-0.8em]q2.south) {};
%\node[anchor=north,fill=blue!20,minimum height=1em,minimum width=1em] (f1) at ([xshift=-4.1em,yshift=-0.8em]q2.south) {};
\node[anchor=north,fill=blue!20,minimum height=4em,minimum width=1em] (f1) at ([xshift=2.2em,yshift=-0.7em]q2.south) {};
......
......@@ -82,7 +82,7 @@
\node[anchor=west](b8) at ([xshift=0em,yshift=-1.5em]b7.west){{VP}};
\node[anchor=west](b9) at ([xshift=0em,yshift=-1.5em]b8.west){{N/A}};
\node[anchor=west](b10) at ([xshift=0em,yshift=-1.5em]b9.west){{VP}};
\node[anchor=west](b11) at ([xshift=0em,yshift=-1.5em]b10.west){{IP({\red root})}};
\node[anchor=west](b11) at ([xshift=0em,yshift=-1.5em]b10.west){{IP ({\red root})}};
\node[anchor=west](y2) at ([xshift=0.2em,yshift=-1.7em]y1.west){{}};
\node[anchor=west](y3) at ([xshift=0em,yshift=-1.5em]y2.west){{喜欢}};
......
......@@ -31,7 +31,7 @@
\node [anchor=south] (label) at ([yshift=0.3em]sw13.north) {\footnotesize{在跨度[{\blue 0},{\blue 13}]上匹配``NP 对 NP VP''}};
\node [anchor=north west,minimum size=1.2em,fill=red!20] (np) at ([yshift=-1.0em,xshift=0.3em]sw41.south west) {};
\node [anchor=west] (nplabel) at (np.east) {NP(第二个)};
\node [anchor=west] (nplabel) at (np.east) {NP(第二个)};
\node [anchor=west,minimum size=1.2em,fill=blue!20] (vp) at ([xshift=1.0em]nplabel.east) {};
\node [anchor=west] (vplabel) at (vp.east) {VP};
......
......@@ -10,7 +10,7 @@
\node [anchor=north,fill=red!20] (s2) at ([xshift=4em,yshift=0em]s1.north) {大幅度};
\node[anchor=north,fill=blue!20] (s3) at ([xshift=4.5em,yshift=0em]s2.north) {下降 了};
\node[anchor=west,fill=green!20] (t1) at ([xshift=0em,yshift=-4em]s1.west) {the imports have};
\node[anchor=west,fill=green!20] (t1) at ([xshift=0em,yshift=-4em]s1.west) {The imports have};
\node[anchor=north,fill=red!20] (t2) at ([xshift=8em,yshift=0em]t1.north) {drastically};
\node[anchor=north,fill=blue!20] (t3) at ([xshift=5.7em,yshift=0em]t2.north) {fallen};
......
......@@ -27,7 +27,7 @@
}
}
{
\node [anchor=north west,inner sep=2pt,fill=red!20,minimum height=2em,minimum width=3em] (h4) at ([yshift=-5.5em]h0.south west) {\small{null}};
\node [anchor=north west,inner sep=2pt,fill=red!20,minimum height=2em,minimum width=3em] (h4) at ([yshift=-7em]h0.south west) {\small{null}};
\node [anchor=west,inner sep=2pt,fill=red!20,minimum height=2em,minimum width=3em] (h5) at ([xshift=2.2em]h4.east) {\small{he}};
\node [anchor=west,inner sep=2pt,fill=red!20,minimum height=2em,minimum width=3em] (h6) at ([xshift=2.2em,yshift=3.5em]h4.east) {\small{it}};
\node [anchor=west,inner sep=2pt,fill=red!20,minimum height=2em,minimum width=3em] (h8) at ([xshift=2.2em]h6.east) {\small{is not}};
......@@ -54,6 +54,8 @@
}
}
\node[anchor=north] (l1) at ([xshift=6em,yshift=-1em]h0.south) {\scriptsize{(a)\ 原假设(译文相同时)}};
\node[anchor=north] (l2) at ([xshift=6em,yshift=-1em]h4.south) {\scriptsize{(b)\ 原假设(译文不同时)}};
%\node [anchor=west,inner sep=2pt,fill=red!20,minimum height=2em,minimum width=3em,opacity=0.7] (h1) at ([xshift=-1em,yshift=2em]h2.north) {原假设};
\end{scope}
......@@ -85,7 +87,7 @@
}
}
{
\node [anchor=north west,inner sep=2pt,fill=red!20,minimum height=2em,minimum width=3em] (h4) at ([yshift=-5.5em]h0.south west) {\small{null}};
\node [anchor=north west,inner sep=2pt,fill=red!20,minimum height=2em,minimum width=3em] (h4) at ([yshift=-7em]h0.south west) {\small{null}};
\node [anchor=west,inner sep=2pt,fill=red!20,minimum height=2em,minimum width=3em] (h5) at ([xshift=2.2em]h4.east) {\small{he}};
\node [anchor=west,inner sep=2pt,fill=red!20,minimum height=2em,minimum width=3em] (h6) at ([xshift=2.2em,yshift=3.5em]h4.east) {\small{it}};
\node [anchor=west,inner sep=2pt,fill=red!20,minimum height=2em,minimum width=3em] (h8) at ([xshift=2.2em]h6.east) {\small{is not}};
......@@ -130,6 +132,8 @@
\node [anchor=west] (l21) at ([xshift=0em, yshift=-1em]l2.west) {\footnotesize{较低假设}};
%\node [anchor=west,inner sep=2pt,fill=red!20,minimum height=2em,minimum width=3em,opacity=0.7] (h1) at ([xshift=-1em,yshift=2em]h2.north) {重组假设};
\node[anchor=north] (l1) at ([xshift=6em,yshift=-1em]h0.south) {\scriptsize{(c)\ 重组假设(译文相同时)}};
\node[anchor=north] (l2) at ([xshift=6em,yshift=-1em]h4.south) {\scriptsize{(d)\ 重组假设(译文不同时)}};
\end{scope}
......
......@@ -51,10 +51,10 @@
\end{pgfonlayer}
{
\node [anchor=south] (rule1label) at ([xshift=1em]rule1s.north west) {\footnotesize\sffamily\bfseries{\red{正确的规则}}};
\node [anchor=south] (rule1label) at ([xshift=1em]rule1s.north west) {{\footnotesize\red{正确的规则}}};
}
{
\node [anchor=north west,align=left] (rule2label) at (rule2s.north east) {\footnotesize{\sffamily\bfseries{\color{blue} 错误的规则}}\\\footnotesize{因为``satisfied''会}\\\footnotesize{对齐到规则外,}\\\footnotesize{也就是这条规则}\\\footnotesize{与词对齐不相容}};
\node [anchor=north west,align=left] (rule2label) at (rule2s.north east) {\footnotesize{{\color{blue} 错误的规则}}\\\footnotesize{因为``satisfied''会}\\\footnotesize{对齐到规则外,}\\\footnotesize{也就是这条规则}\\\footnotesize{与词对齐不相容}};
}
\end{scope}
......
......@@ -27,7 +27,7 @@
\node[anchor=north west] (input) at ([yshift=-6.5em]synhifst.south west) {\sffamily\bfseries{源语句法树:}};
\begin{scope}[scale = 0.9, grow'=up, sibling distance=5pt, level distance=23pt, xshift=3.49in, yshift=-2.8in]
\begin{scope}[scale = 0.9, grow'=up, sibling distance=5pt, level distance=30pt, xshift=3.49in, yshift=-3.1in]
\Tree[.\node(tn1){IP};
[.\node(tn2){NP}; \edge[roof]; \node[](seg1){中国$_1$ 明星$_2$ 艺术团$_3$}; ]
......
%------------------------------------------------------------------------------------------------------------
%%% 立方剪枝
\vspace{0.8em}
\begin{tikzpicture}
\tikzstyle{alignmentnode} = [rectangle,fill=blue!30,minimum size=0.4em,text=white,inner sep=0.1pt]
\tikzstyle{selectnode} = [rectangle,fill=green!20,minimum height=1.5em,minimum width=1.5em,inner sep=1.2pt]
\tikzstyle{srcnode} = [rotate=45,anchor=south west]
\begin{scope}[scale=0.85]
\node [anchor=west] (s1) at (0,0) {\footnotesize{$\textrm{X} \to <\textrm{}\ \textrm{X}_1,\ \textrm{from}\ \textrm{X}_1>$}};
\node [anchor=east] (s2) at ([yshift=-2em]s1.east) {\footnotesize{$\textrm{X} \to <\textrm{}\ \textrm{X}_1,\ \textrm{since}\ \textrm{X}_1>$}};
\node [anchor=east] (s3) at ([yshift=-2em]s2.east) {\footnotesize{$\textrm{X} \to <\textrm{}\ \textrm{X}_1,\ \textrm{from the}\ \textrm{X}_1>$}};
\node [anchor=east] (s4) at ([yshift=-2em]s3.east) {\footnotesize{$\textrm{X} \to <\textrm{}\ \textrm{X}_1,\ \textrm{through}\ \textrm{X}_1>$}};
\node [anchor=center,alignmentnode] (alig1) at ([xshift=-3.5em,yshift=8em]s1.north) {};
\node [anchor=center,alignmentnode] (alig11) at ([xshift=2.2em]alig1.center) {};
\node [anchor=center,alignmentnode] (alig12) at ([xshift=2.2em]alig11.center) {};
\node [anchor=center,alignmentnode] (alig13) at ([xshift=2.2em]alig12.center) {};
\node [anchor=center,alignmentnode] (alig2) at ([yshift=-2.2em]alig1.center) {};
\node [anchor=center,alignmentnode] (alig21) at ([xshift=2.2em]alig2.center) {};
\node [anchor=center,alignmentnode] (alig22) at ([xshift=2.2em]alig21.center) {};
\node [anchor=center,alignmentnode] (alig23) at ([xshift=2.2em]alig22.center) {};
\node [anchor=center,alignmentnode] (alig3) at ([yshift=-2.2em]alig2.center) {};
\node [anchor=center,alignmentnode] (alig31) at ([xshift=2.2em]alig3.center) {};
\node [anchor=center,alignmentnode] (alig32) at ([xshift=2.2em]alig31.center) {};
\node [anchor=center,alignmentnode] (alig33) at ([xshift=2.2em]alig32.center) {};
\node [anchor=center,alignmentnode] (alig4) at ([yshift=-2.2em]alig3.center) {};
\node [anchor=center,alignmentnode] (alig41) at ([xshift=2.2em]alig4.center) {};
\node [anchor=center,alignmentnode] (alig42) at ([xshift=2.2em]alig41.center) {};
\node [anchor=center,alignmentnode] (alig43) at ([xshift=2.2em]alig42.center) {};
\node[srcnode] (c1) at ([yshift=1em]alig1.north) {\footnotesize{plan}};
\node[srcnode] (c2) at ([yshift=1em]alig11.north) {\footnotesize{scheme}};
\node[srcnode] (c3) at ([yshift=1em]alig12.north) {\footnotesize{project}};
\node[srcnode] (c4) at ([yshift=1em]alig13.north) {\footnotesize{times}};
{
\node [anchor=center,selectnode] (c1) at (alig1.center) {\footnotesize{2.1}};
}
{
\node [anchor=center,selectnode,fill=red!20] (c2) at (alig11.center) {\footnotesize{5.1}};
\node [anchor=center,selectnode,fill=red!20] (c3) at (alig2.center) {\footnotesize{5.5}};
}
{
\node [anchor=center,selectnode] (c2) at (alig11.center) {\footnotesize{5.1}};
\node [anchor=center,selectnode] (c3) at (alig2.center) {\footnotesize{5.5}};
\node [anchor=center,selectnode,fill=red!20] (c5) at (alig21.center) {\footnotesize{8.5}};
\node [anchor=center,selectnode,fill=red!20] (c6) at (alig3.center) {\footnotesize{7.7}};
}
{
\node [anchor=center,selectnode] (c5) at (alig21.center) {\footnotesize{8.5}};
\node [anchor=center,selectnode] (c6) at (alig3.center) {\footnotesize{7.7}};
\node [anchor=center,selectnode,fill=red!20] (c7) at (alig22.center) {\footnotesize{4.2}};
\node [anchor=center,selectnode,fill=red!20] (c8) at (alig31.center) {\footnotesize{8.2}};
}
\draw [->,thick] ([xshift=-1.0em,yshift=1.0em]alig1.north west)--([xshift=-1.0em,yshift=-0.7em]alig4.south west);
\draw [->,thick] ([xshift=-1.0em,yshift=1.0em]alig1.north west)--([xshift=0.8em,yshift=1.0em]alig13.north east);
\end{scope}
\end{tikzpicture}
......@@ -39,6 +39,8 @@
\draw [->,thick] ([xshift=-1.0em,yshift=1.0em]alig1.north west)--([xshift=-1.0em,yshift=-0.7em]alig4.south west);
\draw [->,thick] ([xshift=-1.0em,yshift=1.0em]alig1.north west)--([xshift=0.8em,yshift=1.0em]alig13.north east);
\node[anchor=north] (l) at ([xshift=0em,yshift=-1.5em]alig4.south) {\scriptsize{(a)}};
\end{scope}
%图2
......@@ -84,10 +86,12 @@
\draw [->,thick] ([xshift=-1.0em,yshift=1.0em]alig1.north west)--([xshift=-1.0em,yshift=-0.7em]alig4.south west);
\draw [->,thick] ([xshift=-1.0em,yshift=1.0em]alig1.north west)--([xshift=0.8em,yshift=1.0em]alig13.north east);
\node[anchor=north] (l) at ([xshift=0em,yshift=-1.5em]alig4.south) {\scriptsize{(b)}};
\end{scope}
%图3
\begin{scope}[yshift=-12.0em,scale=0.85]
\begin{scope}[yshift=-13.0em,scale=0.85]
\node [anchor=west] (s1) at (0,0) {\scriptsize{$\textrm{X} \to <\textrm{}\ \textrm{X}_1,\ \textrm{from}\ \textrm{X}_1>$}};
\node [anchor=east] (s2) at ([yshift=-2em]s1.east) {\scriptsize{$\textrm{X} \to <\textrm{}\ \textrm{X}_1,\ \textrm{since}\ \textrm{X}_1>$}};
\node [anchor=east] (s3) at ([yshift=-2em]s2.east) {\scriptsize{$\textrm{X} \to <\textrm{}\ \textrm{X}_1,\ \textrm{from the}\ \textrm{X}_1>$}};
......@@ -137,11 +141,13 @@
\draw [->,thick] ([xshift=-1.0em,yshift=1.0em]alig1.north west)--([xshift=-1.0em,yshift=-0.7em]alig4.south west);
\draw [->,thick] ([xshift=-1.0em,yshift=1.0em]alig1.north west)--([xshift=0.8em,yshift=1.0em]alig13.north east);
\node[anchor=north] (l) at ([xshift=0em,yshift=-1.5em]alig4.south) {\scriptsize{(c)}};
\end{scope}
%图4
\begin{scope}[xshift=18.0em,yshift=-12.0em,scale=0.85]
\begin{scope}[xshift=18.0em,yshift=-13.0em,scale=0.85]
\node [anchor=west] (s1) at (0,0) {\scriptsize{$\textrm{X} \to <\textrm{}\ \textrm{X}_1,\ \textrm{from}\ \textrm{X}_1>$}};
\node [anchor=east] (s2) at ([yshift=-2em]s1.east) {\scriptsize{$\textrm{X} \to <\textrm{}\ \textrm{X}_1,\ \textrm{since}\ \textrm{X}_1>$}};
\node [anchor=east] (s3) at ([yshift=-2em]s2.east) {\scriptsize{$\textrm{X} \to <\textrm{}\ \textrm{X}_1,\ \textrm{from the}\ \textrm{X}_1>$}};
......@@ -197,6 +203,8 @@
\draw [->,thick] ([xshift=-1.0em,yshift=1.0em]alig1.north west)--([xshift=-1.0em,yshift=-0.7em]alig4.south west);
\draw [->,thick] ([xshift=-1.0em,yshift=1.0em]alig1.north west)--([xshift=0.8em,yshift=1.0em]alig13.north east);
\node[anchor=north] (l) at ([xshift=0em,yshift=-1.5em]alig4.south) {\scriptsize{(d)}};
\end{scope}
......
......@@ -67,11 +67,11 @@
\draw[-] (rules.south west)--([xshift=1.9in]rules.south west);
{
\node[anchor=north west] (p1) at ([yshift=-0.3em]phrase.south west) {天气真好 -- The weather is very good};
\node[anchor=north west] (p1) at ([yshift=-0.3em]phrase.south west) {天气\ \ 真好 -- The weather is very good};
}
{
\node[anchor=north west] (r1) at ([yshift=-0.3em]rules.south west) {$\mathrm{X_1}$真好 -- $\mathrm{X_1}$ is very good};
\node[anchor=north west] (r1) at ([yshift=-0.3em]rules.south west) {$\mathrm{X_1}$\ \ 真好 -- $\mathrm{X_1}$ is very good};
\node[anchor=east] (r2) at ([yshift=-2.65cm]p1.east) {};
}
......
......@@ -105,7 +105,7 @@
\end{flushright}
\begin{center}
\vspace{-1em}
(a) Sub-tree alignment matrixes for a sample sub-tree pair
(a)节点对齐矩阵(1-best vs. Matrix)
\end{center}
\begin{center}
......@@ -147,7 +147,7 @@
\begin{center}
\vspace{-2em}
(b) Rules extracted using 1-best alignment and alignment posterior.
(b) 抽取得到的树到树翻译规则
\end{center}
\end{center}
......@@ -45,8 +45,8 @@
\draw[-,thick] (s2.north west)--([yshift=0.3in]s2.north west);
\draw[->,densely dotted,thick] ([yshift=0.3in]s2.north west)--([xshift=-0.3in,yshift=0.3in]s2.north west);
\node[anchor=south] (ld1) at ([xshift=-0.5em,yshift=0.4em]n1.north) {\small{$dr$=-5}};
\node[anchor=south] (ld2) at ([xshift=6.5em,yshift=0.4em]n1.north) {\small{$dr$=+4}};
\node[anchor=south] (ld1) at ([xshift=-0.5em,yshift=0.4em]n1.north) {\small{$dr= -5$}};
\node[anchor=south] (ld2) at ([xshift=6.5em,yshift=0.4em]n1.north) {\small{$dr= +4$}};
\end{scope}
\end{tikzpicture}
......
......@@ -3,7 +3,7 @@
\begin{center}
\begin{tikzpicture}
{\scriptsize
{\footnotesize
\begin{scope}[sibling distance=4pt, level distance=25pt]
\Tree[.\node(n1){NP};
......@@ -22,7 +22,7 @@
\draw [-,dashed] (sw3.south) -- (tw3.north);
\draw [-,dashed] (sw4.south) -- (tw3.north);
\draw [->,very thick] ([xshift=1em]sw4.east) -- ([xshift=5em]sw4.east) node [pos=0.5,above] {\tiny{二叉化}};
\draw [->,very thick] ([xshift=1em]sw4.east) -- ([xshift=5em]sw4.east) node [pos=0.5,above] {\scriptsize{二叉化}};
\end{scope}
......
......@@ -14,7 +14,7 @@
\node [anchor=west] (tw1) at ([xshift=3.5em]sn3.east) {increases};
\node [anchor=west,fill=red!20] (tw2) at ([xshift=0.3em]tw1.east) {NN};
\draw[dotted,thick] ([yshift=-0.1em]sn3.south)..controls +(south:1.2) and +(south: 1.2)..([yshift=-0.1em]tw2.south);
\draw[dotted,thick,<->] ([yshift=-0.1em]sn3.south)..controls +(south:1.2) and +(south: 1.2)..([yshift=-0.1em]tw2.south);
\begin{pgfonlayer}{background}
\node [rectangle,inner sep=0em,fill=red!20] [fit = (sn3)] (nn1) {};
......
......@@ -153,6 +153,10 @@
%表1------------------------
%--5.2神经网络基础-----------------------------------------
\sectionnewpage
\section{神经网络基础}
\parinterval 神经网络是一种由大量的节点(或称神经元)之间相互连接构成的计算模型。那么什么是神经元?神经元之间又是如何连接的?神经网络的数学描述又是什么样的?这一节将围绕这些问题对神经网络的基础知识进行系统的介绍。
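\parinterval 为了更直观地理解神经元的计算方式,这里先给出一个极简的草图(假设性的Python/NumPy示例,变量名均为示例,并非书中配套代码):对输入做加权求和、加上偏置,再经过一个激活函数。

\begin{verbatim}
import numpy as np

# 一个神经元:加权求和 + 偏置,再经过Sigmoid激活
def neuron(x, w, b):
    s = np.dot(x, w) + b              # 线性部分:x.w + b
    return 1.0 / (1.0 + np.exp(-s))   # 激活函数:Sigmoid

x = np.array([0.0, 0.0, 1.0])         # 输入
w = np.array([1.0, 1.0, 1.0])         # 连接权重
print(neuron(x, w, b=-0.5))           # 输出约为0.62
\end{verbatim}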
......@@ -833,6 +837,10 @@ x_0\cdot w_0+x_1\cdot w_1+x_2\cdot w_2 & = & 0\cdot 1+0\cdot 1+1\cdot 1 \nonumbe
\parinterval 在本书后面的内容中还会看到,深层网络在机器翻译中可以带来明显的性能提升。
%--5.3神经网络的张量实现-----------------------------------------
\sectionnewpage
\section{神经网络的张量实现}
\parinterval 在神经网络内部,输入经过若干次变换,最终得到输出的结果。这个过程类似于一种逐层的数据``流动''。不禁会产生这样的疑问:在神经网络中,数据是以哪种形式``流动''的?如何去编程实现这种数据``流动''呢?
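\parinterval 作为一个直观的示意,下面用一段假设性的Python/NumPy代码模拟这种逐层的数据``流动''(仅为说明概念的草图,并非书中系统的实现):输入张量依次经过两层``线性变换+激活函数'',得到输出张量。

\begin{verbatim}
import numpy as np

np.random.seed(0)
x = np.random.rand(4, 3)                 # 输入:4个样本,每个3维

w1, b1 = np.random.rand(3, 5), np.random.rand(5)
w2, b2 = np.random.rand(5, 2), np.random.rand(2)

h = np.maximum(0, x @ w1 + b1)           # 第一层:线性变换 + ReLU
y = 1 / (1 + np.exp(-(h @ w2 + b2)))     # 第二层:线性变换 + Sigmoid
print(y.shape)                           # (4, 2):逐层“流动”后的输出
\end{verbatim}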
......@@ -986,7 +994,7 @@ f(x)=\begin{cases} 0 & x\le 0 \\x & x>0\end{cases}
\begin{figure}[htp]
\centering
\input{./Chapter5/Figures/fig-save}
\caption{1阶、2阶、3阶张量的物理存储}
\caption{1阶(a)、2阶(b)、3阶张量(c)的物理存储}
\label{fig:save}
\end{figure}
%-------------------------------------------
......@@ -1201,6 +1209,10 @@ y&=&{\rm{Sigmoid}}({\rm{Tanh}}(\mathbf x\cdot \mathbf w^1+\mathbf b^1)\cdot \mat
%-------------------------------------------
%--5.4神经网络的参数训练-----------------------------------------
\sectionnewpage
\section{神经网络的参数训练}
\parinterval 简单来说,神经网络可以被看作是由变量和函数组成的表达式,例如:$ \mathbf y=\mathbf x+\mathbf b $、$ \mathbf y={\rm{ReLU}}(\mathbf x\cdot \mathbf w+\mathbf b) $、$ \mathbf y={\rm{Sigmoid}}({\rm{ReLU}}(\mathbf x\cdot \mathbf w^1+\mathbf b^1)\cdot \mathbf w^2+\mathbf b^2) $等等,其中的$ \mathbf x $和$ \mathbf y $作为输入和输出变量,$ \mathbf w $、$ \mathbf b $等其他变量作为{\small\sffamily\bfseries{模型参数}}\index{模型参数}(Model Parameters)\index{Model Parameters}。确定了函数表达式和模型参数,也就确定了神经网络模型。通常,表达式的形式需要系统开发者设计,而模型参数的数量有时会非常巨大,因此需要自动学习,这个过程也被称为模型学习或{\small\bfnew{训练}}\index{训练}(Training)\index{Training}。为了实现这个目标,通常会准备一定量的带有标准答案的数据,称之为{\small\sffamily\bfseries{有标注数据}}\index{有标注数据}(Annotated Data/Labeled Data)\index{Annotated Data/Labeled Data}。这些数据会用于对模型参数的学习,这也对应了统计模型中的参数估计过程。在机器学习中,一般把这种使用有标注数据进行统计模型参数训练的过程称为{\small\sffamily\bfseries{有指导的训练}}\index{有指导的训练}或{\small\sffamily\bfseries{有监督的训练}}\index{有监督的训练}(Supervised Training)\index{Supervised Training}。在本章中,如果没有特殊说明,模型训练都是指有监督的训练。那么神经网络内部是怎样利用有标注数据对参数进行训练的呢?
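\parinterval 为了具体说明``利用有标注数据训练参数''这一过程,下面给出一个最小化平方损失的梯度下降草图(假设性的Python/NumPy示例,数据为人为构造的玩具数据):

\begin{verbatim}
import numpy as np

# 有标注数据:输入x与标准答案y(玩具数据,目标为 y = x1 + x2)
x = np.array([[0., 0.], [0., 1.], [1., 0.], [1., 1.]])
y = np.array([0., 1., 1., 2.])

w, b, eta = np.zeros(2), 0.0, 0.1        # 模型参数与学习率
for t in range(200):
    pred = x @ w + b                     # 前向计算
    grad = pred - y                      # 平方损失关于pred的梯度
    w -= eta * (x.T @ grad) / len(y)     # 沿梯度反方向更新参数
    b -= eta * grad.mean()
print(w, b)                              # w逼近[1, 1],b逼近0
\end{verbatim}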
......@@ -1917,6 +1929,10 @@ w_{t+1}&=&w_t-\frac{\eta}{\sqrt{z_t+\epsilon}} v_t
%-------------------------------------------
%--5.5神经语言模型-----------------------------------------
\sectionnewpage
\section{神经语言模型}\label{sec5:nlm}
\parinterval 神经网络给我们提供了一种工具,只要将问题的输入和输出定义好,就可以学习输入和输出之间的对应关系。显然,很多自然语言处理任务都可以用神经网络进行实现。比如,在机器翻译中,可以把输入的源语言句子和输出的目标语言句子用神经网络建模;在文本分类中,也可以把输入的文本内容和输出的类别标签用神经网络建模,等等。
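\parinterval 以基于前馈神经网络的$n$-gram语言模型为例,其前向计算过程可以用如下假设性的Python/NumPy草图来示意(词表大小、维度等均为随意设定的示例):查询上下文单词的词向量并拼接,经过线性变换和Softmax得到下一个词的概率分布。

\begin{verbatim}
import numpy as np

np.random.seed(1)
V, d = 5, 4                              # 词表大小与词向量维度
C = np.random.rand(V, d)                 # 词向量矩阵(查询表)
W = np.random.rand(2 * d, V)             # 输出层权重

context = [2, 3]                         # 上下文中两个词的编号
h = np.concatenate([C[i] for i in context])  # 拼接上下文词向量
s = h @ W                                # 每个候选词的得分
p = np.exp(s) / np.exp(s).sum()          # Softmax:下一个词的概率分布
print(p.argmax(), p.sum())               # 概率最大的词编号;概率和为1
\end{verbatim}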
......@@ -2283,6 +2299,10 @@ Jobs was the CEO of {\red{\underline{apple}}}.
%-------------------------------------------
%--5.6小结及深入阅读-----------------------------------------
\sectionnewpage
\section{小结及深入阅读}
\parinterval 神经网络为解决自然语言处理问题提供了全新的思路。而所谓深度学习也是建立在多层神经网络结构之上的一系列模型和方法。本章从神经网络的基本概念到其在语言建模中的应用进行了概述。由于篇幅所限,这里无法覆盖所有神经网络和深度学习的相关内容,感兴趣的读者可以进一步阅读《Neural Network Methods in Natural Language Processing》\cite{goldberg2017neural}和《Deep Learning》\cite{lecun2015deep}。此外,也有很多研究方向值得关注:
......
......@@ -7,11 +7,11 @@
\begin{tikzpicture}
\newlength{\mystep}
\newlength{\wseg}
\newlength{\hseg}
\newlength{\wnode}
\newlength{\hnode}
%\newlength{\mystep}
%\newlength{\wseg}
%\newlength{\hseg}
%\newlength{\wnode}
%\newlength{\hnode}
\setlength{\wseg}{1.5cm}
\setlength{\hseg}{1.0cm}
......
%%%------------------------------------------------------------------------------------------------------------
%%% 残差连接:后作方式与前作方式
\begin{center}
\begin{tikzpicture}
\begin{scope}[minimum height = 20pt]
\node [anchor=east] (x1) at (-0.5em, 0) {$x_l$};
\node [anchor=west,draw=green,fill=green!20,inner xsep=5pt] (F1) at ([xshift=2em]x1.east){$\textrm{F}$};
\node [anchor=west,circle,draw,minimum size=1em] (n1) at ([xshift=2em]F1.east) {};
\node [anchor=west,draw=green,fill=green!20,inner xsep=5pt] (ln1) at ([xshift=2em]n1.east){\textrm{LN}};
\node [anchor=west] (x2) at ([xshift=2em]ln1.east) {$x_{l+1}$};
\node [anchor=north] (x3) at ([yshift=-5em]x1.south) {$x_l$};
\node [anchor=west,draw=green,fill=green!20,inner xsep=5pt] (F2) at ([xshift=2em]x3.east){$\textrm{F}$};
\node [anchor=west,draw=green,fill=green!20,inner xsep=5pt] (ln2) at ([xshift=2em]F2.east){\textrm{LN}};
\node [anchor=west,circle,draw,minimum size=1em] (n2) at ([xshift=2em]ln2.east){};
\node [anchor=west] (x4) at ([xshift=2em]n2.east) {$x_{l+1}$};
\draw[->, line width=1pt] ([xshift=-0.1em]x1.east)--(F1.west);
\draw[->, line width=1pt] ([xshift=-0.1em]F1.east)--(n1.west);
\draw[->, line width=1pt] (n1.east)--node[above]{$y_l$}(ln1.west);
\draw[->, line width=1pt] ([xshift=-0.1em]ln1.east)--(x2.west);
\draw[->, line width=1pt] ([xshift=-0.1em]x3.east)--(F2.west);
\draw[->, line width=1pt] ([xshift=-0.1em]F2.east)--(ln2.west);
\draw[->, line width=1pt] ([xshift=0.1em]ln2.east)--node[above]{$y_l$}(n2.west);
\draw[->, line width=1pt] (n2.east)--(x4.west);
\draw[->, line width=1pt] (x1.north) -- ([yshift=1em]x1.north) -- ([yshift=1.4em]n1.north) -- (n1.north);
\draw[->, line width=1pt] (x3.north) -- ([yshift=1em]x3.north) -- ([yshift=1.4em]n2.north) -- (n2.north);
\draw[-] (n1.west)--(n1.east);
\draw[-] (n1.north)--(n1.south);
\draw[-] (n2.west)--(n2.east);
\draw[-] (n2.north)--(n2.south);
\node [anchor=south] (k1) at ([yshift=-0.1em]x1.north){};
\node [anchor=south] (k2) at ([yshift=-0.1em]x3.north){};
\begin{pgfonlayer}{background}
\node [rectangle,inner sep=0.3em,fill=orange!10] [fit = (x1) (F1) (n1) (ln1) (x2) (k1)] (box0) {};
\node [rectangle,inner sep=0.3em,fill=blue!10] [fit = (x3) (F2) (n2) (ln2) (x4) (k2)] (box1) {};
\end{pgfonlayer}
\node [anchor=north] (c1) at (box0.south){\small (a)后作方式的残差连接};
\node [anchor=north] (c2) at (box1.south){\small (b)前作方式的残差连接};
\end{scope}
\end{tikzpicture}
\end{center}
\ No newline at end of file
\begin{tikzpicture}
\begin{scope}
\node [anchor=north,rectangle, inner sep=0mm,minimum height=1.2em,minimum width=2em,rounded corners=5pt,thick] (n1) at (0, 0) {编码端};
\node [anchor=west,rectangle, inner sep=0mm,minimum height=1.2em,minimum width=0em,rounded corners=5pt,thick] (n2) at ([xshift=3.5em,yshift=-0.5em]n1.east) {$x_1$};
\node [anchor=west,rectangle,draw, inner sep=0mm,minimum height=1.2em,minimum width=3em,fill=orange!20,rounded corners=5pt,thick] (n3) at ([xshift=3.5em,yshift=0em]n2.east) {$x_2$};
\node [anchor=west,rectangle,draw, inner sep=0mm,minimum height=1.2em,minimum width=3em,fill=orange!20,rounded corners=5pt,thick] (n4) at ([xshift=3.5em,yshift=0em]n3.east) {$x_3$};
\node [anchor=west,rectangle, inner sep=0mm,minimum height=1.2em,minimum width=1em,rounded corners=5pt,thick] (n6) at ([xshift=1.5em,yshift=0em]n4.east) {$\ldots$};
\node [anchor=west,rectangle,draw, inner sep=0mm,minimum height=1.2em,minimum width=3em,fill=orange!20,rounded corners=5pt,thick] (n5) at ([xshift=3.5em,yshift=0em]n6.east) {$x_{l+1}$};
\node [anchor=west,rectangle,draw, inner sep=0mm,minimum height=1.2em,minimum width=3em,fill=orange!20,rounded corners=5pt,thick] (n7) at ([xshift=1.5em,yshift=0em]n5.east) {$x_{l+2}$};
\node [anchor=north,rectangle,draw, inner sep=0mm,minimum height=1.2em,minimum width=15em,fill=teal!17,rounded corners=5pt,thick] (n8) at ([xshift=0em,yshift=-3em]n4.south) {层正则化};
\node [anchor=north,rectangle,draw, inner sep=0mm,minimum height=1.2em,minimum width=15em,fill=purple!17,rounded corners=5pt,thick] (n9) at ([xshift=0em,yshift=-1em]n8.south) {$z_0\ \quad z_1\ \quad z_2\quad \ldots \quad\ z_l$};
\node [anchor=north,rectangle,draw, inner sep=0mm,minimum height=1.2em,minimum width=15em,fill=teal!17,rounded corners=5pt,thick] (n10) at ([xshift=0em,yshift=-2em]n9.south) {权重累加};
\node [anchor=west,rectangle, inner sep=0mm,minimum height=1.2em, rounded corners=5pt,thick] (n11) at ([xshift=0em,yshift=-4.5em]n1.west) {聚合网络};
\node [anchor=east,rectangle, inner sep=0mm,minimum height=1.2em,minimum width=9em,rounded corners=5pt,thick] (n12) at ([xshift=0em,yshift=-4.5em]n7.east) {};
\node [anchor=south,rectangle, inner sep=0mm,minimum height=1em,minimum width=1em,rounded corners=5pt,thick] (n13) at ([xshift=0em,yshift=1em]n8.north) {};
\begin{pgfonlayer}{background}
{
\node[rectangle,inner sep=2pt,fill=blue!7] [fit = (n1) (n7) (n13)] (bg1) {};
\node[rectangle,inner sep=2pt,fill=red!7] [fit = (n10) (n8) (n11) (n12)] (bg2) {};
}
\end{pgfonlayer}
\draw[->,thick] ([xshift=0.5em,yshift=-0em]n2.south)..controls +(south:2em) and +(north:2em)..([xshift=-0em,yshift=-0em]n8.north) ;
\draw[->,thick] ([xshift=-0em,yshift=-0em]n3.south)..controls +(south:2em) and +(north:2em)..([xshift=-0em,yshift=-0em]n8.north) ;
\draw[->,thick] ([xshift=-0em,yshift=-0em]n5.south)..controls +(south:2em) and +(north:2em)..([xshift=-0em,yshift=-0em]n8.north) ;
\draw [->,thick] ([xshift=0em,yshift=0em]n4.south) -- ([xshift=0em,yshift=0em]n8.north);
\draw [->,thick] ([xshift=0em,yshift=0em]n8.south) -- ([xshift=0em,yshift=0em]n9.north);
\draw[->,thick] ([xshift=-4.5em,yshift=-0em]n9.south)..controls +(south:0.8em) and +(north:0.8em)..([xshift=-0em,yshift=-0em]n10.north) ;
\draw[->,thick] ([xshift=-2em,yshift=-0em]n9.south)..controls +(south:0.8em) and +(north:0.8em)..([xshift=-0em,yshift=-0em]n10.north) ;
\draw[->,thick] ([xshift=0em,yshift=-0em]n9.south)..controls +(south:0.8em) and +(north:0.8em)..([xshift=-0em,yshift=-0em]n10.north) ;
\draw[->,thick] ([xshift=4.5em,yshift=-0em]n9.south)..controls +(south:0.8em) and +(north:0.8em)..([xshift=-0em,yshift=-0em]n10.north) ;
\draw[->,thick] ([xshift=0em,yshift=-0em]n10.east)..controls +(east:5em) and +(south:1.5em)..([xshift=-0em,yshift=-0em]n7.south) ;
\end{scope}
\end{tikzpicture}
\ No newline at end of file
%%%------------------------------------------------------------------------------------------------------------
%%% 子层之间的连接与跳过 - 一个实例
\begin{center}
\begin{tikzpicture}
\begin{scope}[scale=0.7]
\node [anchor=east,fill=red!50,draw,rounded corners=3pt] (s11) at (-0.5em, 0) {sublayer1};
\node [anchor=west,draw,circle,line width=1pt] (c11) at ([xshift=2em]s11.east) {};
\node [anchor=north,fill=red!10,draw,dashed,rounded corners=3pt] (s21) at ([yshift=-3em]s11.south) {sublayer1};
\node [anchor=west, draw,circle,dashed,line width=1pt] (c21) at ([xshift=2em]s21.east) {};
\node [anchor=west,fill=red!10,draw,dashed,rounded corners=3pt] (s22) at ([xshift=2em]c21.east) {sublayer2};
\node [anchor=west, draw,circle,dashed,line width=1pt] (c22) at ([xshift=2em]s22.east) {};
\node [anchor=north,fill=red!50,draw,rounded corners=3pt] (s31) at ([yshift=-3em]s21.south) {sublayer1};
\node [anchor=west,draw,circle,line width=1pt] (c31) at ([xshift=2em]s31.east) {};
\node [anchor=north,fill=red!10,draw,dashed,rounded corners=3pt] (s41) at ([yshift=-3em]s31.south) {sublayer1};
\node [anchor=east, draw,circle,line width=1pt] (c44) at ([xshift=-2em]s41.west) {};
\node [anchor=west, draw,circle,dashed,line width=1pt] (c41) at ([xshift=2em]s41.east) {};
\node [anchor=west,fill=red!10,draw,dashed,rounded corners=3pt] (s42) at ([xshift=2em]c41.east) {sublayer2};
\node [anchor=west, draw,circle,dashed,line width=1pt] (c42) at ([xshift=2em]s42.east) {};
\node [anchor=west,fill=red!50,draw,rounded corners=3pt] (s43) at ([xshift=2em]c42.east) {sublayer3};
\node [anchor=west, draw,circle,line width=1pt] (c43) at ([xshift=2em]s43.east) {};
\draw[-,rounded corners,line width=1pt] (c44.east) -- ([xshift=0.8em]c44.east) -- ([xshift=-1.2em,yshift=2.2em]s11.west) -- ([xshift=2.7em,yshift=2.2em]s11.east) -- (c11.north);
\draw[-,rounded corners,line width=1pt] (c44.east) -- ([xshift=0.8em]c44.east) -- ([xshift=-1.2em]s11.west) -- (s11.west);
\draw[-,rounded corners,line width=1pt] (s11.east) -- (c11.west);
\draw[-,rounded corners,line width=1pt] (c11.east) -- ([xshift=11.3em]c11.east) -- (c22.north);
\draw[-,rounded corners,line width=1pt,dashed] (c44.east) -- ([xshift=0.8em]c44.east) -- ([xshift=-1.2em,yshift=2.2em]s21.west) -- ([xshift=2.7em,yshift=2.2em]s21.east) -- (c21.north);
\draw[-,rounded corners,line width=1pt,dashed] (c44.east) -- ([xshift=0.8em]c44.east) -- ([xshift=-1.2em]s21.west) -- (s21.west);
\draw[-,rounded corners,line width=1pt,dashed] (s21.east) -- (c21.west);
\draw[-,rounded corners,line width=1pt,dashed] (c21.east) -- (s22.west);
\draw[-,rounded corners,line width=1pt,dashed] (s22.east) -- (c22.west);
\draw[-,rounded corners,line width=1pt] (c22.east) -- ([xshift=11.3em]c22.east) -- (c43.north);
\draw[-,rounded corners,line width=1pt] (c44.east) -- ([xshift=0.8em]c44.east) -- ([xshift=-1.2em,yshift=2.2em]s31.west) -- ([xshift=2.7em,yshift=2.2em]s31.east) -- (c31.north);
\draw[-,rounded corners,line width=1pt] (c44.east) -- ([xshift=0.8em]c44.east) -- ([xshift=-1.2em]s31.west) -- (s31.west);
\draw[-,rounded corners,line width=1pt] (s31.east) -- (c31.west);
\draw[-,rounded corners,line width=1pt] (c31.east) -- ([xshift=11.3em]c31.east) -- (c42.north);
\draw[-,rounded corners,line width=1pt,dashed] (c44.east) -- ([xshift=0.8em]c44.east) -- ([xshift=-1.2em,yshift=2.2em]s41.west) -- ([xshift=2.7em,yshift=2.2em]s41.east) -- (c41.north);
\draw[-,rounded corners,line width=1pt,dashed] (c44.east) -- (s41.west);
\draw[-,rounded corners,line width=1pt,dashed] (s41.east) -- (c41.west);
\draw[-,rounded corners,line width=1pt,dashed] (c41.east) -- (s42.west);
\draw[-,rounded corners,line width=1pt,dashed] (s42.east) -- (c42.west);
\draw[-,rounded corners,line width=1pt] (c42.east) -- (s43.west);
\draw[-,rounded corners,line width=1pt] (s43.east) -- (c43.west);
\draw[->,rounded corners,line width=1pt] (c43.east) -- ([xshift=2em]c43.east);
\end{scope}
\end{tikzpicture}
\end{center}
\begin{tikzpicture}
\begin{scope}
\tikzstyle{word} = [font=\scriptsize]
\tikzstyle{model} = [rectangle,draw,minimum height=3em,minimum width=6em,rounded corners=4pt,fill=red!15!white]
\node [anchor=center] (ate) at (0,0) {};
\node [model,minimum width=10.5em] (decoder) at ([xshift=6em]ate.east) {Decoder};
\node [word] (w1) at ([yshift=-2em,xshift=1em]decoder.south) {$x_3$};
\node [word] (w2) at ([xshift=-1em]w1.west) {\#};
\node [word] (w3) at ([xshift=-1em]w2.west) {\#};
\node [word] (w4) at ([xshift=-1em]w3.west) {\#};
\node [word] (w5) at ([xshift=1em]w1.east) {$x_4$};
\node [word] (w6) at ([xshift=1em]w5.east) {\#};
\node [word] (w7) at ([yshift=2em,xshift=1em]decoder.north) {$x_4$};
\node [word] (w8) at ([yshift=0em,xshift=-1em]w7.west) {$x_3$};
\node [word] (w9) at ([yshift=0em,xshift=1em]w7.east) {$x_5$};
\draw [->] (w1.north) -- ([yshift=1.4em]w1.north);
\draw [->] (w2.north) -- ([yshift=1.3em]w2.north);
\draw [->] (w3.north) -- ([yshift=1.3em]w3.north);
\draw [->] (w4.north) -- ([yshift=1.3em]w4.north);
\draw [->] (w5.north) -- ([yshift=1.4em]w5.north);
\draw [->] (w6.north) -- ([yshift=1.3em]w6.north);
\draw [->] ([yshift=-1.4em]w7.south) -- (w7.south);
\draw [->] ([yshift=-1.4em]w8.south) -- (w8.south);
\draw [->] ([yshift=-1.4em]w9.south) -- (w9.south);
%encoder
\node [model,minimum width=10.5em] (encoder) at ([xshift=-6em]ate.west) {Encoder};
\node [word] (we1) at ([yshift=-2em,xshift=1em]encoder.south) {\#};
\node [word] (we2) at ([xshift=-1em]we1.west) {\#};
\node [word] (we3) at ([xshift=-1em]we2.west) {$x_2$};
\node [word] (we4) at ([xshift=-1em]we3.west) {$x_3$};
\node [word] (we5) at ([xshift=1em]we1.east) {\#};
\node [word] (we6) at ([xshift=1em]we5.east) {$x_6$};
\draw [->] (we1.north) -- ([yshift=1.3em]we1.north);
\draw [->] (we2.north) -- ([yshift=1.3em]we2.north);
\draw [->] (we3.north) -- ([yshift=1.4em]we3.north);
\draw [->] (we4.north) -- ([yshift=1.4em]we4.north);
\draw [->] (we5.north) -- ([yshift=1.3em]we5.north);
\draw [->] (we6.north) -- ([yshift=1.4em]we6.north);
\draw [->,very thick] ([xshift=0.5em]encoder)--([xshift=-0.5em]decoder);
\end{scope}
\end{tikzpicture}
\ No newline at end of file
\begin{tabular}{l l l}
\begin{tikzpicture}
\draw[->, thick] (0,0) to (3,0);
\draw[->, thick] (0,-0) to (0,2);
\node (a) at (1*0.3,6*0.2) {};
\node (b) at (2*0.3,4*0.2) {};
\node (c) at (3*0.3,3*0.2) {};
\node (d) at (4*0.3,3*0.2) {};
\node (e) at (6*0.3,4*0.2) {};
\node (f) at (7*0.3,6*0.2) {};
\node (g) at (8*0.3,8.4*0.2) {};
\node (h) at (9*0.3,9.7*0.2) {};
\fill [black] (a) circle(1pt);
\fill [black] (b) circle(1pt);
\fill [black] (c) circle(1pt);
\fill [black] (d) circle(1pt);
\fill [black] (e) circle(1pt);
\fill [black] (f) circle(1pt);
\fill [black] (g) circle(1pt);
\fill [black] (h) circle(1pt);
% y=0.73x + 2.54
\draw [thick,red] (-1*0.3,1.81*0.2) to (10*0.3,9.84*0.2);
\node [font=\footnotesize] at (1.5,-0.5) {欠拟合};
\end{tikzpicture}
&\begin{tikzpicture}
\draw[->, thick] (0,0) to (3,0);
\draw[->, thick] (0,-0) to (0,2);
\node (a) at (1*0.3,6*0.2) {};
\node (b) at (2*0.3,4*0.2) {};
\node (c) at (3*0.3,3*0.2) {};
\node (d) at (4*0.3,3*0.2) {};
\node (e) at (6*0.3,4*0.2) {};
\node (f) at (7*0.3,6*0.2) {};
\node (g) at (8*0.3,8.4*0.2) {};
\node (h) at (9*0.3,9.7*0.2) {};
\fill [black] (a) circle(1pt);
\fill [black] (b) circle(1pt);
\fill [black] (c) circle(1pt);
\fill [black] (d) circle(1pt);
\fill [black] (e) circle(1pt);
\fill [black] (f) circle(1pt);
\fill [black] (g) circle(1pt);
\fill [black] (h) circle(1pt);
\draw [thick,red] (0.5*0.3,6.15*0.2) to [bend right] (5*0.3,3*0.2) ;
\draw [thick,red] (5*0.3,3*0.2) to [bend right] (8.5*0.3,10*0.2) ;
\node [font=\footnotesize] at (1.5,-0.5) {拟合合适};
\end{tikzpicture}
&\begin{tikzpicture}
\draw[->, thick] (0,0) to (3,0);
\draw[->, thick] (0,-0) to (0,2);
\node (a) at (1*0.3,6*0.2) {};
\node (b) at (2*0.3,4*0.2) {};
\node (c) at (3*0.3,3*0.2) {};
\node (d) at (4*0.3,3*0.2) {};
\node (e) at (6*0.3,4*0.2) {};
\node (f) at (7*0.3,6*0.2) {};
\node (g) at (8.4*0.3,8.4*0.2) {};
\node (h) at (9.4*0.3,9.7*0.2) {};
\fill [black] (a) circle(1pt);
\fill [black] (b) circle(1pt);
\fill [black] (c) circle(1pt);
\fill [black] (d) circle(1pt);
\fill [black] (e) circle(1pt);
\fill [black] (f) circle(1pt);
\fill [black] (g) circle(1pt);
\fill [black] (h) circle(1pt);
%0-a
\draw [thick,red] (0.2*0.3,4*0.2) to [bend left] (1*0.3,6*0.2) ;
% a-b
\draw [thick,red] (1*0.3,6*0.2) to [bend left] (2*0.3,3*0.2) ;
% b-c
\draw [thick,red] (2*0.3,3*0.2) to [bend right] (3*0.3,2.5*0.2) ;
% c-d
\draw [thick,red] (3*0.3,2.5*0.2) to [bend left] (3.5*0.3,4*0.2) ;
\draw [thick,red] (3.5*0.3,4*0.2) to [bend left] (4.3*0.3,2*0.2) ;
\draw [thick,red] (4.3*0.3,2*0.2) to [bend right] (5*0.3,1.5*0.2) ;
% d-e
\draw [thick,red] (5*0.3,1.5*0.2) to [bend right] (6.2*0.3,7*0.2) ;
\draw [thick,red] (6.2*0.3,7*0.2) to [bend right] (6.5*0.3,7*0.2) ;
% e-f
\draw [thick,red] (6.5*0.3,7*0.2) to [bend left] (7*0.3,5*0.2) ;
\draw [thick,red] (7*0.3,5*0.2) to [bend right] (7.5*0.3,4*0.2) ;
\draw [thick,red] (7.5*0.3,4*0.2) to [bend right] (8*0.3,4*0.2) ;
%
% f-g
\draw [thick,red] (8*0.3,4*0.2) to [bend right] (8*0.3,10*0.2) ;
\draw [thick,red] (8*0.3,10*0.2) to [bend left] (8.7*0.3,10*0.2) ;
% g-h
\draw [thick,red] (8.7*0.3,10*0.2) to [bend left] (9.7*0.3,9.4*0.2) ;
\node [font=\footnotesize] at (1.5,-0.5) {过拟合};
\end{tikzpicture} \\
\end{tabular}
\ No newline at end of file
\begin{tikzpicture}
\begin{scope}
\node [anchor=center] (node1) at (-2.9,1) {\small{训练:}};
\node [anchor=center] (node11) at (-2.5,1) {};
\node [anchor=center] (node12) at (-1.7,1) {};
\node [anchor=center] (node2) at (-2.9,0.5) {\small{推理:}};
\node [anchor=center] (node21) at (-2.5,0.5) {};
\node [anchor=center] (node22) at (-1.7,0.5) {};
\node [anchor=west,draw=black,minimum width=5.6em,minimum height=2.2em,fill=blue!20,rounded corners=2pt] (node1-1) at (0,0) {\footnotesize{双语数据}};
\node [anchor=south,draw=black,minimum width=4.5em,minimum height=2.2em,fill=blue!20,rounded corners=2pt] (node1-2) at ([yshift=-5em]node1-1.south) {\footnotesize{目标语伪数据}};
\node [anchor=west,draw=black,minimum width=4.5em,minimum height=2.2em,fill=red!20,rounded corners=2pt] (node2-1) at ([xshift=-8.8em,yshift=-2.5em]node1-1.west) {\footnotesize{反向NMT系统}};
\node [anchor=west,draw=black,minimum width=4.5em,minimum height=2.2em,fill=red!20,rounded corners=2pt] (node3-1) at ([xshift=3em,yshift=-2.5em]node1-1.east) {\footnotesize{前向NMT系统}};
\draw [-stealth](node1-1.west)--([xshift=3em]node2-1.north);
\draw [-stealth](node1-1.east)--([xshift=-3em]node3-1.north);
\draw [-stealth](node1-2.east)--([xshift=-3em]node3-1.south);
\draw [-stealth](node11.east)--(node12.west);
\draw [-stealth,dashed](node21.east)--(node22.west);
\draw [-stealth,dashed]([xshift=3em]node2-1.south)--(node1-2.west);
\end{scope}
\end{tikzpicture}
\ No newline at end of file
\begin{tikzpicture}
\tikzstyle{node} = [minimum height=1.0em,draw=teal,fill=teal!10]
\tikzstyle{legend} = [minimum height=1.0em,minimum width=1.0em,draw]
\tikzstyle{node2} = [minimum width=1.0em,minimum height=4.1em,draw=blue,fill=blue!10]
\node[node,minimum width=2.8em] (node1) at (0,0) {};
\node[node,minimum width=4.0em,anchor=north west] (node2) at (node1.south west) {};
\node[node,minimum width=3.2em,anchor=north west] (node3) at (node2.south west) {};
\node[node,minimum width=3.0em,anchor=north west] (node4) at (node3.south west) {};
\node[node2,anchor = north west] (grad1) at ([xshift=1.2em]node1.north east) {};
\node[node,minimum width=3.7em,anchor=north west] (node5) at (grad1.north east) {};
\node[node,minimum width=2.8em,anchor=north west] (node6) at (node5.south west) {};
\node[node,minimum width=3.2em,anchor=north west] (node7) at (node6.south west) {};
\node[node,minimum width=4.0em,anchor=north west] (node8) at (node7.south west) {};
\node[font=\scriptsize,anchor=east] (line1) at (node1.west) {gpu1};
\node[font=\scriptsize,anchor=east] (line2) at (node2.west) {gpu2};
\node[font=\scriptsize,anchor=east] (line3) at (node3.west) {gpu3};
\node[font=\scriptsize,anchor=east] (line4) at (node4.west) {gpu4};
\node[node2,anchor = north west] (grad2) at ([xshift=0.3em]node5.north east) {};
\draw[->] (-1.4em,-3.62em) -- (9.5em,-3.62em);
\node[node,minimum width=2.8em] (node9) at (15em,0) {};
\node[node,minimum width=4.0em,anchor=north west] (node10) at (node9.south west) {};
\node[node,minimum width=3.2em,anchor=north west] (node11) at (node10.south west) {};
\node[node,minimum width=3.0em,anchor=north west] (node12) at (node11.south west) {};
\node[node,minimum width=3.7em,anchor=north west] (node13) at (node9.north east) {};
\node[node,minimum width=2.8em,anchor=north west] (node14) at (node10.north east) {};
\node[node,minimum width=3.2em,anchor=north west] (node15) at (node11.north east) {};
\node[node,minimum width=4.0em,anchor=north west] (node16) at (node12.north east) {};
\node[node2,anchor = north west] (grad3) at ([xshift=0.5em]node13.north east) {};
\node[font=\scriptsize,anchor=east] (line1) at (node9.west) {gpu1};
\node[font=\scriptsize,anchor=east] (line2) at (node10.west) {gpu2};
\node[font=\scriptsize,anchor=east] (line3) at (node11.west) {gpu3};
\node[font=\scriptsize,anchor=east] (line4) at (node12.west) {gpu4};
\draw[->] (13.6em,-3.62em) -- (22.2em,-3.62em);
\begin{pgfonlayer}{background}
\node [rectangle,inner sep=-0.0em,draw] [fit = (node1) (node2) (node3) (node4)] (box1) {};
\node [rectangle,inner sep=-0.0em,draw] [fit = (node5) (node6) (node7) (node8)] (box2) {};
\node [rectangle,inner sep=-0.0em,draw] [fit = (node9) (node13) (node12) (node16)] (box3) {};
\end{pgfonlayer}
\node[font=\scriptsize,anchor=north] (legend1) at ([xshift=3em]node4.south) {一步一更新};
\node[font=\scriptsize,anchor=north] (legend2) at ([xshift=2.5em]node12.south) {累积两步更新};
\node[font=\scriptsize,anchor=north] (time1) at (grad2.south) {time};
\node[font=\scriptsize,anchor=north] (time2) at (grad3.south) {time};
\node[legend] (legend3) at (2em,2em) {};
\node[font=\scriptsize,anchor=west] (idle) at (legend3.east) {:空闲};
\node[legend,anchor=west,draw=teal,fill=teal!10] (legend4) at ([xshift = 2em]idle.east) {};
\node[font=\scriptsize,anchor=west] (FB) at (legend4.east) {:前向/反向};
\node[legend,anchor=west,draw=blue,fill=blue!10] (legend5) at ([xshift = 2em]FB.east) {};
\node[font=\scriptsize,anchor=west] (grad_sync) at (legend5.east) {:梯度更新};
\end{tikzpicture}
\ No newline at end of file
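上图所示的``累积两步更新''可以用如下假设性的Python/NumPy草图来示意(目标函数与数据均为玩具示例):在若干个小批次上只计算并累积梯度,累积到预设步数后才统一更新一次参数。

\begin{verbatim}
import numpy as np

# 梯度累积示意:损失为 0.5*||w - y_b||^2,其梯度为 (w - y_b)
y = np.array([1.0, 2.0, 3.0])
batches = [y + 0.1, y - 0.1, y + 0.2, y - 0.2]   # 四个小批次
w, acc = np.zeros(3), np.zeros(3)
eta, steps = 0.1, 2                      # 学习率;每累积2步更新一次
for i, yb in enumerate(batches, 1):
    acc += (w - yb)                      # 前向/反向:只累积梯度,不更新
    if i % steps == 0:
        w -= eta * acc / steps           # 统一更新,等价于更大的batch
        acc[:] = 0
print(w)
\end{verbatim}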
\begin{tikzpicture}
\tikzstyle{snode} = [draw,inner sep=1pt,minimum width=3em,minimum height=0.5em,rounded corners=1pt,fill=green!30!white]
\tikzstyle{pnode} = [draw,inner sep=1pt,minimum width=1em,minimum height=0.5em,rounded corners=1pt]
\node [anchor=west,snode] (s1) at (0,0) {\tiny{}};
\node [anchor=north west,snode,minimum width=6.3em] (s2) at ([yshift=-0.3em]s1.south west) {\tiny{}};
\node [anchor=north west,snode,minimum width=2em] (s3) at ([yshift=-0.3em]s2.south west) {\tiny{}};
\node [anchor=north west,snode,minimum width=5.5em] (s4) at ([yshift=-0.3em]s3.south west) {\tiny{}};
\node [anchor=north west,snode,minimum width=5.8em] (s5) at ([yshift=-0.3em]s4.south west) {\tiny{}};
\node [anchor=north west,snode,minimum width=3em] (s6) at ([yshift=-0.3em]s5.south west) {\tiny{}};
\node [anchor=east] (label1) at ([xshift=-0.8em,yshift=0.6em]s1.west) {{句子:}};
\node [anchor=west,pnode,minimum width=3em] (p1) at ([xshift=0.3em]s1.east) {\tiny{}};
\node [anchor=west,pnode,minimum width=4em] (p3) at ([xshift=0.3em]s3.east) {\tiny{}};
\node [anchor=west,pnode,minimum width=0.5em] (p4) at ([xshift=0.3em]s4.east) {\tiny{}};
\node [anchor=west,pnode,minimum width=0.2em] (p5) at ([xshift=0.3em]s5.east) {\tiny{}};
\node [anchor=west,pnode,minimum width=3em] (p6) at ([xshift=0.3em]s6.east) {\tiny{}};
\node [rectangle,inner sep=0.5em,rounded corners=2pt,very thick,dotted,draw=ugreen!80] [fit = (s1) (s6) (p1) (p6)] (box0) {};
\node[rectangle,inner sep=0.5em,rounded corners=1pt,draw,fill=blue!15] (model) at ([xshift=4em]box0.east){{模型}};
% big batch
\node [anchor=west,snode] (sbi1) at ([xshift=3em,yshift=6em]model.east) {\tiny{}};
\node [anchor=north west,snode,minimum width=6.3em] (sbi2) at ([yshift=-0.3em]sbi1.south west) {\tiny{}};
\node [anchor=north west,snode,minimum width=2em] (sbi3) at ([yshift=-0.3em]sbi2.south west) {\tiny{}};
\node [anchor=north west,snode,minimum width=5.5em] (sbi4) at ([yshift=-0.3em]sbi3.south west) {\tiny{}};
\node [anchor=north west,snode,minimum width=5.8em] (sbi5) at ([yshift=-0.3em]sbi4.south west) {\tiny{}};
\node [anchor=north west,snode,minimum width=3em] (sbi6) at ([yshift=-0.3em]sbi5.south west) {\tiny{}};
\node [anchor=east] (label1) at ([xshift=-0.8em,yshift=-1em]sbi1.west) {{大batch}};
\node [anchor=west,pnode,minimum width=3em] (pbi1) at ([xshift=0.3em]sbi1.east) {\tiny{}};
\node [anchor=west,pnode,minimum width=4em] (pbi3) at ([xshift=0.3em]sbi3.east) {\tiny{}};
\node [anchor=west,pnode,minimum width=0.5em] (pbi4) at ([xshift=0.3em]sbi4.east) {\tiny{}};
\node [anchor=west,pnode,minimum width=0.2em] (pbi5) at ([xshift=0.3em]sbi5.east) {\tiny{}};
\node [anchor=west,pnode,minimum width=3em] (pbi6) at ([xshift=0.3em]sbi6.east) {\tiny{}};
\node [rectangle,inner sep=0.5em,rounded corners=2pt,very thick,dotted,draw=ugreen!80] [fit = (sbi1) (sbi6) (pbi1) (pbi6)] (box1) {};
% small batch
\node [anchor=west,snode,minimum width=5.5em] (sma1) at ([xshift=3em,yshift=-3em]model.east) {\tiny{}};
\node [anchor=north west,snode,minimum width=5.8em] (sma2) at ([yshift=-0.3em]sma1.south west) {\tiny{}};
\node [anchor=north west,snode,minimum width=6.3em] (sma3) at ([yshift=-0.3em]sma2.south west) {\tiny{}};
\node [anchor=east] (label1) at ([xshift=-0.8em,yshift=-2em]sma1.west) {{小batch}};
\node [anchor=west,pnode,minimum width=0.5em] (pma1) at ([xshift=0.3em]sma1.east) {\tiny{}};
\node [anchor=west,pnode,minimum width=0.2em] (pma2) at ([xshift=0.3em]sma2.east) {\tiny{}};
\node [rectangle,inner sep=0.5em,rounded corners=2pt,very thick,dotted,draw=ugreen!80] [fit = (sma1) (sma3) (pma1) (pma2)] (box2) {};
% small batch
\node [anchor=west,snode,minimum width=2em] (sma4) at ([xshift=4em,yshift=0em]sma1.east) {\tiny{}};
\node [anchor=north west,snode,minimum width=3em] (sma5) at ([yshift=-0.3em]sma4.south west) {\tiny{}};
\node [anchor=north west,snode,minimum width=3em] (sma6) at ([yshift=-0.3em]sma5.south west) {\tiny{}};
\node [anchor=west,pnode,minimum width=0.7em] (pma4) at ([xshift=0.3em]sma4.east) {\tiny{}};
\node [rectangle,inner sep=0.5em,rounded corners=2pt,very thick,dotted,draw=ugreen!80] [fit = (sma4) (sma6) (pma4)] (box3) {};
\draw [->,very thick] (box0.east) -- (model.west);
\draw [->,thick] (model.east) .. controls +(east:0.5) and +(west:0.5) .. ([xshift=-1em]box1.west);
\draw [->,thick] (model.east) .. controls +(east:0.5) and +(west:0.5) .. ([xshift=-1em]box2.west);
\draw [->,very thick] (box2.east) -- (box3.west);
%%%%%
\node [] (t10) at ([yshift=1.5em]box1.north) {$t_1$};
\node [] (t11) at ([yshift=1.5em]box2.north) {$t_1$};
\node [] (t2) at ([yshift=1.5em]box3.north) {$t_2$};
\draw [very thick,decorate,decoration={brace}] ([xshift=0em,yshift=0.3em]box1.north west) to node [midway,name=final] {} ([xshift=0em,yshift=0.3em]box1.north east);
\draw [very thick,decorate,decoration={brace}] ([xshift=0em,yshift=0.3em]box2.north west) to node [midway,name=final] {} ([xshift=0em,yshift=0.3em]box2.north east);
\draw [very thick,decorate,decoration={brace}] ([xshift=0em,yshift=0.3em]box3.north west) to node [midway,name=final] {} ([xshift=0em,yshift=0.3em]box3.north east);
\node [] (m1) at ([xshift=1.5em]box1.east) {$m_1$};
\node [] (m2) at ([xshift=1.5em]box3.east) {$m_2$};
\draw [very thick,decorate,decoration={brace}] ([xshift=3pt]box1.north east) to node [midway,name=final] {} ([xshift=3pt]box1.south east);
\draw [very thick,decorate,decoration={brace}] ([xshift=3pt]box3.north east) to node [midway,name=final] {} ([xshift=3pt]box3.south east);
\node [rectangle,inner sep=0.5em,rounded corners=2pt,draw,fill=red!5,font=\scriptsize] at ([yshift=-2em,xshift=10em]sbi1.east) {
\begin{tabular}{l}
$m$: 显存 \\
$t$: 时间 \\
$m_1>m_2$ \\
$t_1>t_2$
\end{tabular}
};
\end{tikzpicture}
\ No newline at end of file
\begin{tikzpicture}
\tikzstyle{node} =[font=\scriptsize]
\tikzstyle{sentence} =[font=\scriptsize,fill=blue!5!white]
\node[sentence] (node1) at (0,0) {[`low', `lower', `newest', `widest']};
\node[sentence,anchor = north] (node2) at ([yshift = -1em]node1.south) {[`l o w $<$e$>$':5, `l o w e r $<$e$>$':2, `n e w e s t $<$e$>$':6, `w i d e s t $<$e$>$':3]};
\node[sentence,anchor = north] (node3) at ([yshift = -1.5em]node2.south) {[`l o w $<$e$>$':5, `l o w e r $<$e$>$':2, `n e w {\red es} t $<$e$>$':6, `w i d {\red es} t $<$e$>$':3]};
\node[sentence,anchor = north] (node4) at ([yshift = -1em]node3.south) {[`l o w $<$e$>$':5, `l o w e r $<$e$>$':2, `n e w {\red est} $<$e$>$':6, `w i d {\red est} $<$e$>$':3]};
\node[sentence,anchor = north] (node5) at ([yshift = -1em]node4.south) {[`l o w $<$e$>$':5, `l o w e r $<$e$>$':2, `n e w {\red est$<$e$>$}':6, `w i d {\red est$<$e$>$}':3]};
\node[sentence,anchor = north] (node6) at ([yshift = -1em]node5.south) {$\cdots$};
\node[node,anchor = north] (node7) at ([yshift = -1.6em]node6.south) {直到达到预设的子词词表大小或下一个最高频的字节对出现频率为1。};
\draw[->,line width=.03cm] ([yshift=0em]node1.south) -- ([yshift=0em]node2.north);
\draw[->,line width=.03cm] ([yshift=0em]node3.south) -- ([yshift=0em]node4.north);
\draw[->,line width=.03cm] ([yshift=0em]node4.south) -- ([yshift=0em]node5.north);
\draw[->,line width=.03cm] ([yshift=0em]node5.south) -- ([yshift=0em]node6.north);
\node[node,anchor = west] (node8) at ([xshift = 2em,yshift = 2em]node7.east) {对于词表外的词lowest};
\node[node,anchor = north west] (node9) at ([yshift = 0.3em]node8.south west) {可以被分割为low est};
\node[node,font=\scriptsize,anchor = north,fill=ugreen!5,drop shadow] (dict) at ([xshift = 8em,yshift = -5em]node6.south){\begin{tabular}{llllll}
\multirow{3}{*}{子词词表:} & `es' & `est' & `est$<$e$>$' & `lo' & `low' \\
& `ne' & `new'&`newest$<$e$>$' & `low$<$e$>$'& `wi'\\
& `wid' & `widest$<$e$>$' & `lowe' & `lower'& `lower$<$e$>$'
\end{tabular}};
\node[node,anchor=west] (line1) at ([xshift = 8em]node1.south east) {按字符拆分,并添加};
\node[node,anchor=north west] (line2) at ([yshift=0.3em]line1.south west) {终结符$<$e$>$,统计词频。};
\node[node,anchor=north west] (line3) at ([yshift=-4em]line2.south west) {统计每一个连续字节对};
\node[node,anchor=north west] (line4) at ([yshift=0.3em]line3.south west) {的出现频率,选择最高};
\node[node,anchor=north west] (line5) at ([yshift=0.3em]line4.south west) {频者合并成新的子词};
\begin{pgfonlayer}{background}
%\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=red!10,drop shadow,draw=red] [fit = (line1) (line2) (line3) (line4)] (box1) {};
\node [rectangle,inner sep=0.2em,rounded corners=1pt,very thick,dotted,draw=purple] [fit = (node1) (node2)] (box1) {};
\node [rectangle,inner sep=0.2em,rounded corners=1pt,very thick,dotted,draw=teal] [fit = (node3) (node4) (node5) (node6)] (box2) {};
\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=purple!5,drop shadow] [fit = (line1) (line2)] (box3) {};
\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=ugreen!5,drop shadow] [fit = (line3) (line4) (line5)] (box4) {};
\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=purple!5,drop shadow] [fit = (node7)] (box5) {};
\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=blue!5,drop shadow] [fit = (node8) (node9)] (box6) {};
\end{pgfonlayer}
\draw[->,line width=.03cm] ([yshift=0em]box2.south) -- ([yshift=0.2em]node7.north);
\draw[->,line width=.03cm] ([yshift=0em]box1.south) -- ([yshift=0em]box2.north);
\draw [->,dotted,very thick,purple] (box3.west) -- ([xshift=-1.5em]box3.west);
\draw [->,dotted,very thick,teal] (box4.west) -- ([xshift=-1.7em]box4.west);
\draw [->,dotted,very thick] ([xshift=6em]dict.north) .. controls +(north:1) and +(south:1) .. (box6.south);
\end{tikzpicture}
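% A minimal Python sketch of the BPE loop drawn above: split words into
% characters plus the <e> terminator, count adjacent symbol pairs, and
% repeatedly merge the most frequent pair until a preset vocabulary size is
% reached or the best pair occurs only once. Names are illustrative; real
% implementations also anchor the merge on token boundaries.
%
%   import collections
%
%   def pair_stats(vocab):
%       # Frequency of each adjacent symbol pair, weighted by word count.
%       pairs = collections.Counter()
%       for word, freq in vocab.items():
%           symbols = word.split()
%           for a, b in zip(symbols, symbols[1:]):
%               pairs[(a, b)] += freq
%       return pairs
%
%   def merge(pair, vocab):
%       # Fuse the chosen pair into a single symbol in every word
%       # (naive string replace; adequate for this toy vocabulary).
%       old, new = ' '.join(pair), ''.join(pair)
%       return {w.replace(old, new): f for w, f in vocab.items()}
%
%   vocab = {'l o w <e>': 5, 'l o w e r <e>': 2,
%            'n e w e s t <e>': 6, 'w i d e s t <e>': 3}
%   for _ in range(10):                        # preset number of merges
%       best, freq = pair_stats(vocab).most_common(1)[0]
%       if freq == 1:                          # stopping criterion from the figure
%           break
%       vocab = merge(best, vocab)             # first merges: 'es', 'est', 'est<e>', ...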
%\definecolor{dblue}{cmyk}{0.99998,1,0,0 }
\definecolor{dblue}{cmyk}{1,0,0.9,0}
\begin{tikzpicture}[decoration=brace]
\begin{scope}
\setlength{\wseg}{1.5cm}
\setlength{\hseg}{0.6cm}
\setlength{\wnode}{2cm}
\setlength{\hnode}{1.2cm}
\tikzstyle{layernode} = [rectangle,draw,thick,densely dotted,inner sep=3pt,rounded corners,minimum width=2.1\wnode,minimum height=2.7\hnode]
\tikzstyle{attnnode} = [rectangle,draw,inner sep=3pt,rounded corners, minimum width=2\wnode,minimum height=2.2\hnode]
\tikzstyle{thinnode} = [rectangle,inner sep=1pt,rounded corners=1pt,minimum size=0.3\hnode,font=\scriptsize]
\tikzstyle{fatnode} = [rectangle,inner sep=1pt,rounded corners=1pt,minimum height=0.3\hnode,minimum width=\wnode,font=\small]
% 0.3\wseg here can be used to determine the distance between two adjacent blocks
\coordinate (layer00) at (0,0);
\foreach \i / \j in {1/0,2/1,3/2,4/3,5/4}
\coordinate (layer0\i) at ([xshift=2.05\wnode+0.3\wseg]layer0\j);
\node[layernode,anchor=north] (layer11) at ([yshift=-\hseg]layer01.south) {};
\node[attnnode,anchor=south] (attn11) at ([yshift=0.1\hnode]layer11.south) {};
\node[anchor=north west,inner sep=4pt,font=\small] () at (attn11.north west) {Attention};
\node[anchor=south,inner sep=0pt] (out11) at ([yshift=0.3\hseg]attn11.north) {$\cdots$};
\node[thinnode,anchor=south west,thick,draw=dblue,text=black] (q11) at ([xshift=0.1\wseg,yshift=0.2\hseg]attn11.south west) {$Q^n$};
\node[thinnode,anchor=south,thick,draw=orange,text=black] (k11) at ([yshift=0.2\hseg]attn11.south) {$K^n$};
\node[thinnode,anchor=south east,thick,draw=purple,text=black] (v11) at ([xshift=-0.1\wseg,yshift=0.2\hseg]attn11.south east) {$V^n$};
\node[fatnode,anchor=south,thick,draw] (s11) at ([xshift=0.5\wseg,yshift=0.8\hseg]q11.north east) {$S^n\!=\!S(Q^n\!\cdot\!K^n)$};
\node[fatnode,anchor=south,thick,draw] (a11) at ([xshift=0.45\wseg,yshift=1.3\hseg+0.6\hnode]k11.north east) {$A^n\!=\!S^n\!\cdot\!V$};
\begin{scope}[fill=black!100]
\draw[-latex',thick,draw=black!100] (q11.north) .. controls +(north:0.5\hseg) and +(south:0.8\hseg) .. (s11.south);
\draw[-latex',thick,draw=black!100] (k11.north) .. controls +(north:0.5\hseg) and +(south:0.8\hseg) .. (s11.south);
\end{scope}
\begin{scope}[fill=black!100]
\draw[-latex',thick,draw=black!100] (s11.north) .. controls +(north:0.7\hseg) and +(south:0.8\hseg) ..(a11.south);
\draw[-latex',thick,draw=black!100] (v11.north) .. controls +(north:2.7\hseg) and +(south:0.9\hseg) .. (a11.south);
\end{scope}
\draw[-latex',thick] (a11.north).. controls +(north:0.3\hseg) and +(south:0.7\hseg) ..(out11.south);
\node[layernode,anchor=north] (layer12) at ([yshift=-\hseg]layer02.south) {};
\node[attnnode,anchor=south] (attn12) at ([yshift=0.1\hnode]layer12.south) {};
\node[anchor=north west,inner sep=4pt,font=\small] () at (attn12.north west) {Attention};
\node[anchor=south,inner sep=0pt] (out12) at ([yshift=0.3\hseg]attn12.north) {$\cdots$};
\node[thinnode,anchor=south west,thick,draw=dblue!40,text=black!40] (q12) at ([xshift=0.1\wseg,yshift=0.2\hseg]attn12.south west) {$Q^n$};
\node[thinnode,anchor=south,thick,draw=orange!40,text=black!40] (k12) at ([yshift=0.2\hseg]attn12.south) {$K^n$};
\node[thinnode,anchor=south east,thick,draw=purple,text=black] (v12) at ([xshift=-0.1\wseg,yshift=0.2\hseg]attn12.south east) {$V^n$};
\node[fatnode,anchor=south,thick,densely dashed,draw] (s12) at ([xshift=0.5\wseg,yshift=0.8\hseg]q12.north east) {$S^n\!=\!S^m$};
\node[fatnode,anchor=south,thick,draw] (a12) at ([xshift=0.45\wseg,yshift=1.3\hseg+0.6\hnode]k12.north east) {$A^n\!=\!S^n\!\cdot\!V$};
\begin{scope}[fill=black!40]
\draw[-latex',thick,draw=black!40] (q12.north) .. controls +(north:0.5\hseg) and +(south:0.8\hseg) .. (s12.south);
\draw[-latex',thick,draw=black!40] (k12.north) .. controls +(north:0.5\hseg) and +(south:0.8\hseg) .. (s12.south);
\end{scope}
\begin{scope}[fill=black!100]
\draw[-latex',thick,draw=black!100] (s12.north).. controls +(north:0.7\hseg) and +(south:0.8\hseg) .. (a12.south);
\draw[-latex',thick,draw=black!100] (v12.north).. controls +(north:2.7\hseg) and +(south:0.9\hseg) .. (a12.south);
\end{scope}
\draw[-latex',thick] (a12.north).. controls +(north:0.3\hseg) and +(south:0.7\hseg) ..(out12.south);
\node[layernode,anchor=north] (layer13) at ([yshift=-\hseg]layer03.south) {};
\node[attnnode,anchor=south] (attn13) at ([yshift=0.1\hnode]layer13.south) {};
\node[anchor=north west,inner sep=4pt,font=\small] () at (attn13.north west) {Attention};
\node[anchor=south,inner sep=0pt] (out13) at ([yshift=0.3\hseg]attn13.north) {$\cdots$};
\node[thinnode,anchor=south west,thick,draw=dblue!40,text=black!40] (q13) at ([xshift=0.1\wseg,yshift=0.2\hseg]attn13.south west) {$Q^n$};
\node[thinnode,anchor=south,thick,draw=orange!40,text=black!40] (k13) at ([yshift=0.2\hseg]attn13.south) {$K^n$};
\node[thinnode,anchor=south east,thick,draw=purple!40,text=black!40] (v13) at ([xshift=-0.1\wseg,yshift=0.2\hseg]attn13.south east) {$V^n$};
\node[fatnode,anchor=south,thick,draw=black!40,text=black!40] (s13) at ([xshift=0.5\wseg,yshift=0.8\hseg]q13.north east) {$S^n$};
\node[fatnode,anchor=south,thick,densely dashed,draw] (a13) at ([xshift=0.45\wseg,yshift=1.3\hseg+0.6\hnode]k13.north east) {$A^n\!=\!A^m$};
\begin{scope}[fill=black!40]
\draw[-latex',thick,draw=black!40] (q13.north) .. controls +(north:0.5\hseg) and +(south:0.8\hseg) .. (s13.south);
\draw[-latex',thick,draw=black!40] (k13.north) .. controls +(north:0.5\hseg) and +(south:0.8\hseg) .. (s13.south);
\end{scope}
\begin{scope}[fill=black!40]
\draw[-latex',thick,draw=black!40] (s13.north) .. controls +(north:0.7\hseg) and +(south:0.8\hseg) .. (a13.south);
\draw[-latex',thick,draw=black!40] (v13.north) .. controls +(north:2.7\hseg) and +(south:0.9\hseg) .. (a13.south);
\end{scope}
\draw[-latex',thick] (a13.north).. controls +(north:0.3\hseg) and +(south:0.7\hseg) ..(out13.south);
\foreach \i / \j / \k / \q / \s / \t / \v in
{2/1/1/100/100/100/100, 2/2/1/100/100/100/100, 2/3/1/100/100/100/100}
{
\node[layernode,anchor=north] (layer\i\j) at ([yshift=-0.8\hseg]layer\k\j.south) {};
\node[attnnode,anchor=south] (attn\i\j) at ([yshift=0.1\hnode]layer\i\j.south) {};
\node[anchor=north west,inner sep=4pt,font=\small] () at (attn\i\j.north west) {Attention};
\node[anchor=south,inner sep=0pt] (out\i\j) at ([yshift=0.3\hseg]attn\i\j.north) {$\cdots$};
\node[thinnode,anchor=south west,thick,draw=dblue!\q,text=black] (q\i\j) at ([xshift=0.1\wseg,yshift=0.2\hseg]attn\i\j.south west) {$Q^m$};
\node[thinnode,anchor=south,thick,draw=orange!\q,text=black] (k\i\j) at ([yshift=0.2\hseg]attn\i\j.south) {$K^m$};
\node[thinnode,anchor=south east,thick,draw=purple!\s,text=black] (v\i\j) at ([xshift=-0.1\wseg,yshift=0.2\hseg]attn\i\j.south east) {$V^m$};
\node[fatnode,anchor=south,thick,draw=black!\s] (s\i\j) at ([xshift=0.45\wseg,yshift=0.8\hseg]q\i\j.north east) {$S^m\!=\!S(Q^m\!\cdot\!K^m)$};
\node[fatnode,anchor=south,thick,draw=black!80] (a\i\j) at ([xshift=0.45\wseg,yshift=1.3\hseg+0.6\hnode]k\i\j.north east) {$A^m\!=\!S^m\!\cdot\!V$};
\begin{scope}[fill=black!\q]
\draw[-latex',thick,draw=black!\t] (q\i\j.north) .. controls +(north:0.5\hseg) and +(south:0.8\hseg) .. (s\i\j.south);
\draw[-latex',thick,draw=black!\t] (k\i\j.north) .. controls +(north:0.5\hseg) and +(south:0.8\hseg) .. (s\i\j.south);
\end{scope}
\begin{scope}[fill=black!\s]
\draw[-latex',thick,draw=black!\v] (s\i\j.north).. controls +(north:0.7\hseg) and +(south:0.8\hseg) ..(a\i\j.south);
\draw[-latex',thick,draw=black!\v] (v\i\j.north).. controls +(north:2.7\hseg) and +(south:0.9\hseg) ..(a\i\j.south);
\end{scope}
\draw[-latex',thick] (a\i\j.north).. controls +(north:0.3\hseg) and +(south:0.7\hseg) ..(out\i\j.south);
}
\draw[-latex',densely dashed,very thick] (s22.west) to [out=120,in=-120] (s12.west);
\draw[-latex',densely dashed,very thick] (a23.east) to [out=60,in=-60] (a13.east);
\foreach \i in {1,2,3}
{
\node[anchor=north west,inner sep=3pt,font=\tiny] () at ([yshift=-0.2em]layer1\i.north west) {Layer $n\!=\!m\!+\!i$};
\node[anchor=north west,inner sep=3pt,font=\tiny] () at ([yshift=-0.2em]layer2\i.north west) {Layer $m$};
\node[anchor=center,inner sep=1pt] (dot1\i) at ([yshift=0.5\hseg]layer1\i.north) {$\cdots$};
\draw[->,thick] (out1\i.north) -- ([yshift=0.1em]dot1\i.south);
\node[anchor=center,inner sep=1pt] (dot2\i) at ([yshift=-0.4\hseg]layer1\i.south) {$\cdots$};
\draw[->,thick] ([yshift=-0.15em]dot2\i.north) -- ([yshift=-0.3em]attn1\i.south);
\draw[->,thick] (out2\i.north) -- ([yshift=0.1em]dot2\i.south);
\node[anchor=center,inner sep=1pt] (dot3\i) at ([yshift=-0.4\hseg]layer2\i.south) {$\cdots$};
\draw[->,thick] ([yshift=-0.15em]dot3\i.north) -- ([yshift=-0.3em]attn2\i.south);
}
\node[anchor=north,align=left,inner sep=1pt,font=\footnotesize] () at (dot31.south) {(a) Standard Transformer Attention};
\node[anchor=north,align=left,inner sep=1pt,font=\footnotesize] () at (dot32.south) {(b) \textsc{San} Self-Attention};
\node[anchor=north,align=left,inner sep=1pt,font=\footnotesize] () at (dot33.south) {(c) \textsc{San} Encoder-Decoder Attention};
\end{scope}
\end{tikzpicture}
\begin{tikzpicture}
\begin{scope}\large
\node [anchor=north] (n1) at (0, 0) {Lowercase\ \ :};
\node [anchor=west] (n2) at ([xshift=1.1em,yshift=1.7em]n1.east) {What\ \ is\ \ the\ \ WTO\ ?};
\node [anchor=west] (n3) at ([xshift=0em,yshift=-1.7em]n1.west) {Truecase\ \ :};
\node [anchor=west] (n4) at ([xshift=0em,yshift=-1.7em]n2.west) {\,what\ \ is\ \ the\ \ \ wto\ \ \ ?};
\node [anchor=west] (n5) at ([xshift=0em,yshift=-1.7em]n4.west) {\,what\ \ is\ \ the\ \ WTO\ ?};
\end{scope}
\end{tikzpicture}
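% A minimal sketch of the contrast shown above: lowercasing discards case
% information, while a truecaser learns each word's most frequent surface form
% from data and restores it. Sentence-initial tokens are skipped during
% training because their capitalization is positional; the training sentences
% and function names here are illustrative assumptions.
%
%   import collections
%
%   def train_truecaser(sentences):
%       counts = collections.defaultdict(collections.Counter)
%       for sent in sentences:
%           for tok in sent.split()[1:]:       # skip sentence-initial token
%               counts[tok.lower()][tok] += 1
%       return {w: c.most_common(1)[0][0] for w, c in counts.items()}
%
%   model = train_truecaser(['What is the WTO ?', 'He joined the WTO in 2001 .'])
%   print(' '.join(model.get(t, t) for t in 'what is the wto ?'.split()))
%   # -> what is the WTO ?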
\begin{tikzpicture}
\def\neuronsep{1.5}
\def\nodespace{1}
\def\picturespace{0.8}
\tikzstyle{neuronnode} = [minimum size=1.8em,circle,draw,very thick,ublue,inner sep=0pt, fill=white,align=center]
%standard
\node [neuronnode] (neuron_b) at (0,0) {\scriptsize{$b_{i}^{l}$}};
\node [neuronnode] (neuron_y3) at (0,-1*\neuronsep) {\scriptsize{$x_{3}^{l}$}};
\node [neuronnode] (neuron_y2) at (0,-2*\neuronsep) {\scriptsize{$x_{2}^{l}$}};
\node [neuronnode] (neuron_y1) at (0,-3*\neuronsep) {\scriptsize{$x_{1}^{l}$}};
\node [neuronnode] (neuron_z) at (1.2 * \nodespace,-1.5 * \neuronsep) {\scriptsize{$z_{i}^{l+1}$}};
\node [neuronnode] (neuron_y') at (2.4 * \nodespace,-1.5 * \neuronsep) {\scriptsize{$x_{i}^{l+1}$}};
\node [anchor=north,ublue] (standard) at ([yshift=-4em]neuron_z.south) {\scriptsize{standard}};
\node [ublue] (standard) at ([xshift=-1em]neuron_z.west) {\scriptsize{$\mathbf{w}_{i}^{l}$}};
\node [ublue] (standard) at ([xshift=0.6em,yshift=0.3em]neuron_z.east) {\scriptsize{$f$}};
\draw [->,line width=0.3mm] (neuron_b.east) -- (neuron_z.west);
\draw [->,line width=0.3mm] (neuron_y3.east) -- (neuron_z.west);
\draw [->,line width=0.3mm] (neuron_y2.east) -- (neuron_z.west);
\draw [->,line width=0.3mm] (neuron_y1.east) -- (neuron_z.west);
\draw [->,line width=0.3mm] (neuron_z.east) -- (neuron_y'.west);
%dropout
\node [neuronnode] (drop_neuron_b) at (5*\nodespace,0) {\scriptsize{$b_{i}^{l}$}};
\node [neuronnode] (drop_neuron_y3') at (5*\nodespace,-1*\neuronsep) {\scriptsize{$\tilde{x}_{3}^{l}$}};
\node [neuronnode] (drop_neuron_y2') at (5*\nodespace,-2*\neuronsep) {\scriptsize{$\tilde{x}_{2}^{l}$}};
\node [neuronnode] (drop_neuron_y1') at (5*\nodespace,-3*\neuronsep) {\scriptsize{$\tilde{x}_{1}^{l}$}};
\node [neuronnode] (drop_neuron_z) at (6.2 * \nodespace,-1.5 * \neuronsep) {\scriptsize{$z_{i}^{l+1}$}};
\node [neuronnode] (drop_neuron_y') at (7.4 * \nodespace,-1.5 * \neuronsep) {\scriptsize{$x_{i}^{l+1}$}};
\node [neuronnode] (drop_neuron_y3) at (3.8*\nodespace,-1*\neuronsep) {\scriptsize{$x_{3}^{l}$}};
\node [neuronnode] (drop_neuron_y2) at (3.8*\nodespace,-2*\neuronsep) {\scriptsize{$x_{2}^{l}$}};
\node [neuronnode] (drop_neuron_y1) at (3.8*\nodespace,-3*\neuronsep) {\scriptsize{$x_{1}^{l}$}};
\node [neuronnode] (drop_neuron_r3) at (4.4*\nodespace,-0.5*\neuronsep) {\scriptsize{$r_{3}^{l}$}};
\node [neuronnode] (drop_neuron_r2) at (4.4*\nodespace,-1.5*\neuronsep) {\scriptsize{$r_{2}^{l}$}};
\node [neuronnode] (drop_neuron_r1) at (4.4*\nodespace,-2.5*\neuronsep) {\scriptsize{$r_{1}^{l}$}};
\node [anchor=north,ublue] (standard) at ([yshift=-4em]drop_neuron_z.south) {\scriptsize{dropout}};
\node [ublue] (standard) at ([xshift=-1em]drop_neuron_z.west) {\scriptsize{$\mathbf{w}_{i}^{l}$}};
\node [ublue] (standard) at ([xshift=0.6em,yshift=0.3em]drop_neuron_z.east) {\scriptsize{$f$}};
%structure
\draw [->,line width=0.3mm] (drop_neuron_b.east) -- (drop_neuron_z.west);
\draw [->,line width=0.3mm] (drop_neuron_y3'.east) -- (drop_neuron_z.west);
\draw [->,line width=0.3mm] (drop_neuron_y2'.east) -- (drop_neuron_z.west);
\draw [->,line width=0.3mm] (drop_neuron_y1'.east) -- (drop_neuron_z.west);
\draw [->,line width=0.3mm] (drop_neuron_z.east) -- (drop_neuron_y'.west);
%r
\draw [->,line width=0.3mm] (drop_neuron_y3.east) -- (drop_neuron_y3'.west);
\draw [->,line width=0.3mm] (drop_neuron_y2.east) -- (drop_neuron_y2'.west);
\draw [->,line width=0.3mm] (drop_neuron_y1.east) -- (drop_neuron_y1'.west);
\draw [-,line width=0.3mm] (drop_neuron_r3.south) -- ([yshift=-1em]drop_neuron_r3.south);
\draw [-,line width=0.3mm] (drop_neuron_r2.south) -- ([yshift=-1em]drop_neuron_r2.south);
\draw [-,line width=0.3mm] (drop_neuron_r1.south) -- ([yshift=-1em]drop_neuron_r1.south);
%equ
\node [anchor=west,inner sep = 2pt] (line1) at (9*\nodespace,0) {未应用dropout:};
\node [anchor=north west,inner sep = 2pt] (line2) at (line1.south west) {$z_{i}^{l+1}=\mathbf{w}_{i}^{l} \mathbf{x}+b_{i}^{l}$};
\node [anchor=north west,inner sep = 2pt] (line3) at (line2.south west) {$x_{i}^{l+1}=f\left(z_{i}^{l+1}\right)$};
\node [anchor=north west,inner sep = 2pt] (line4) at (line3.south west) {应用dropout:};
\node [anchor=north west,inner sep = 2pt] (line5) at (line4.south west) {$r_{j}^{l} \sim \textrm{Bernoulli}(1-p)$};
\node [anchor=north west,inner sep = 2pt] (line6) at (line5.south west) {$\tilde{\mathbf{x}}=\mathbf{r} * \mathbf{x}$};
\node [anchor=north west,inner sep = 2pt] (line7) at (line6.south west) {$z_{i}^{l+1}=\mathbf{w}_{i}^{l} \tilde{\mathbf{x}}+b_{i}^{l}$};
\node [anchor=north west,inner sep = 2pt] (line8) at (line7.south west) {$x_{i}^{l+1}=f\left(z_{i}^{l+1}\right)$};
\end{tikzpicture}
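% The equations on the right of this figure, as a small numpy sketch.
% Assumptions for illustration only: p is the drop probability and f is ReLU;
% this is the original (non-inverted) dropout, which rescales the activations
% by 1-p at inference time instead of during training.
%
%   import numpy as np
%
%   rng = np.random.default_rng(0)
%
%   def layer_forward(x, w, b, p=0.3, train=True):
%       if train:
%           r = rng.binomial(1, 1.0 - p, size=x.shape)  # r ~ Bernoulli(1-p)
%           x = r * x                                   # x~ = r * x
%       else:
%           x = (1.0 - p) * x                           # match expected scale at test time
%       z = w @ x + b                                   # z_i^{l+1} = w_i^l x~ + b_i^l
%       return np.maximum(0.0, z)                       # x_i^{l+1} = f(z_i^{l+1})
%
%   x = rng.standard_normal(3)                          # x_1, x_2, x_3 in the figure
%   w = rng.standard_normal((4, 3))
%   b = rng.standard_normal(4)
%   print(layer_forward(x, w, b))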
\begin{tikzpicture}
\begin{scope}
{\small
\node [anchor=north,rectangle,draw, inner sep=0mm,minimum height=3em,minimum width=6em,rounded corners=5pt,thick,fill=blue!10!white] (n1) at (0, 0) {数据处理};
\node [anchor=west,rectangle,draw, inner sep=0mm,minimum height=3em,minimum width=6em,rounded corners=5pt,thick,fill=yellow!10!white] (n2) at ([xshift=3em,yshift=0em]n1.east) {训练};
\node [anchor=south,rectangle,draw, inner sep=0mm,minimum height=3em,minimum width=6em,rounded corners=5pt,thick,fill=red!10!white] (n3) at ([xshift=0em,yshift=2em]n2.north) {架构设计};
\node [anchor=north,rectangle,draw, inner sep=0mm,minimum height=3em,minimum width=6em,rounded corners=5pt,thick,fill=green!10!white] (n4) at ([xshift=0em,yshift=-2em]n2.south) {推断};
}
\draw [-,very thick] ([xshift=0em,yshift=0em]n1.south) -- ([xshift=0em,yshift=-3.25em]n1.south);
\draw [->,very thick] ([xshift=-5.5em,yshift=0em]n4.west) -- ([xshift=0em,yshift=0em]n4.west);
\draw [->,very thick] ([xshift=0em,yshift=0em]n1.east) -- ([xshift=0em,yshift=0em]n2.west);
\draw [->,very thick] ([xshift=0em,yshift=0em]n3.south) -- ([xshift=0em,yshift=0em]n2.north);
\draw [->,very thick] ([xshift=0em,yshift=0em]n2.south) -- ([xshift=0em,yshift=0em]n4.north);
{\footnotesize
\node [anchor=west] (n11) at ([xshift=-13em,yshift=2em]n1.west) {对训练和测试数据进行};
\node [anchor=west] (n12) at ([xshift=0em,yshift=-1.5em]n11.west) {处理,包括:数据清洗、};
\node [anchor=west] (n13) at ([xshift=0em,yshift=-1.5em]n12.west) {翻译单元切分、译文后};
\node [anchor=west] (n14) at ([xshift=0em,yshift=-1.5em]n13.west) {处理等};
\node [anchor=west] (n31) at ([xshift=2em,yshift=0em]n3.north east) {神经网络模型设计,包括};
\node [anchor=west] (n32) at ([xshift=0em,yshift=-1.5em]n31.west) {编码器、解码器、注意力};
\node [anchor=west] (n33) at ([xshift=0em,yshift=-1.5em]n32.west) {机制的设计};
\node [anchor=west] (n21) at ([xshift=0em,yshift=-2em]n33.south west) {在训练数据上优化模型参};
\node [anchor=west] (n22) at ([xshift=0em,yshift=-1.5em]n21.west) {数,包括训练的策略、损};
\node [anchor=west] (n23) at ([xshift=0em,yshift=-1.5em]n22.west) {失函数设计、超参数的调};
\node [anchor=west] (n24) at ([xshift=0em,yshift=-1.5em]n23.west) {};
\node [anchor=west] (n41) at ([xshift=0em,yshift=-2em]n24.south west) {使用训练好的模型在新的};
\node [anchor=west] (n42) at ([xshift=0em,yshift=-1.5em]n41.west) {数据上进行翻译,包括解};
\node [anchor=west] (n43) at ([xshift=0em,yshift=-1.5em]n42.west) {码策略的选择、压缩、优};
\node [anchor=west] (n44) at ([xshift=0em,yshift=-1.5em]n43.west) {化等};
}
\begin{pgfonlayer}{background}
\node [rectangle,inner sep=0.2em,rounded corners=1pt,thick,draw,fill=red!5!white] [fit = (n31) (n32) (n33)] (box1) {};
\node [rectangle,inner sep=0.2em,rounded corners=1pt,thick,draw,fill=yellow!5!white] [fit = (n21) (n22) (n23) (n24) ] (box2) {};
\node [rectangle,inner sep=0.2em,rounded corners=1pt,thick,draw,fill=green!5!white] [fit = (n41) (n42) (n43) (n44) ] (box3) {};
\node [rectangle,inner sep=0.2em,rounded corners=1pt,thick,draw,fill=blue!5!white] [fit = (n11) (n12) (n13) (n14) ] (box4) {};
\end{pgfonlayer}
\draw [->,dotted,very thick,red] (n3.east) -- ([xshift=1.4em]n3.east);
\draw [->,dotted,very thick] (n2.east) -- ([xshift=1.4em]n2.east);
\draw [->,dotted,very thick,ugreen] (n4.east) -- ([xshift=1.4em]n4.east);
\draw [->,dotted,very thick,blue] (n1.west) -- ([xshift=-1.4em]n1.west);
\end{scope}
\end{tikzpicture}
\begin{tikzpicture}
\node [rectangle,inner sep=2pt,font=\scriptsize] (center) at (0,0) {
\begin{tabular}{c}
Feedback signals during the loop:\\
$R(x,f,g)=s(x,x')$: BLEU of $x'$ given $x$ \\
$L(y)$ and $L(x')$: language model scores of $y$ and $x'$
\end{tabular}
};
\node [rectangle,inner sep=2pt,font=\scriptsize] (top) at ([yshift=3em,xshift=0em]center.north) {
\begin{tabular}{c}
En$\rightarrow$Ch translation \\
Primal Task $f:x\rightarrow y$
\end{tabular}
};
\node [rectangle,inner sep=2pt,font=\scriptsize] (left) at ([yshift=0em,xshift=-3em]center.west) {
\begin{tabular}{c}
English sentence $x$ \\
New English sentence \\
$x' = g(y)$
\end{tabular}
};
\node [rectangle,inner sep=2pt,font=\scriptsize] (right) at ([yshift=0em,xshift=3em]center.east) {
\begin{tabular}{c}
Chinese sentence \\
$y = f(x)$
\end{tabular}
};
\node [rectangle,inner sep=2pt,font=\scriptsize] (down) at ([yshift=-3em,xshift=0em]center.south) {
Dual Task $g:y\rightarrow x$
};
\node [rectangle,inner sep=2pt,draw,thick,fill=green!20] (agent1) at ([xshift=-1em]left.west) {Agent};
\node [rectangle,inner sep=2pt,draw,thick,fill=blue!20] (agent2) at ([xshift=1em]right.east) {Agent};
\draw [-,line width=0.8pt] (left.north) .. controls +(north:0.8) and +(west:0.8) .. (top.west);
\draw [->,line width=0.8pt] (top.east) .. controls +(east:0.8) and +(north:0.8) .. (right.north);
\draw [->,line width=0.8pt] (down.west) .. controls +(west:0.8) and +(south:0.8) .. (left.south);
\draw [-,line width=0.8pt] (right.south) .. controls +(south:0.8) and +(east:0.8) .. (down.east) ;
\end{tikzpicture}
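% A hedged sketch of the feedback signals listed in the loop above. The
% reconstruction score s(x, x') and the language-model scores are combined
% here with a simple weighted sum; the weighting, and all function arguments
% (f, g, bleu, lm_y, lm_x), are illustrative assumptions rather than the
% exact formulation.
%
%   def dual_feedback(x, f, g, bleu, lm_y, lm_x, alpha=0.5):
%       y = f(x)                       # primal task f: x -> y (En -> Ch)
%       x_prime = g(y)                 # dual task  g: y -> x (Ch -> En)
%       r = bleu(x_prime, x)           # R(x,f,g) = s(x, x'): reconstruction quality
%       return alpha * r + (1 - alpha) * 0.5 * (lm_y(y) + lm_x(x_prime))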
\begin{tikzpicture}
\node [rectangle,inner sep=2pt,font=\scriptsize] (center) at (0,0) {};
\node [rectangle,inner sep=2pt,font=\scriptsize] (top) at ([yshift=3em,xshift=0em]center.north) {
\begin{tabular}{c}
翻译模型 \\
$\textrm{P}(\mathbf t|\mathbf s)$
\end{tabular}
};
\node [rectangle,inner sep=2pt,font=\scriptsize] (left) at ([yshift=0em,xshift=-4em]center.west) {
\begin{tabular}{c}
今天天气真好。
\end{tabular}
};
\node [rectangle,inner sep=2pt,font=\scriptsize] (right) at ([yshift=0em,xshift=4em]center.east) {
\begin{tabular}{c}
The weather is \\so good today.
\end{tabular}
};
\node [rectangle,inner sep=2pt,font=\scriptsize] (down) at ([yshift=-3em,xshift=0em]center.south) {
\begin{tabular}{c}
翻译模型 \\
$\textrm{P}(\mathbf s|\mathbf t)$
\end{tabular}
};
\draw [->,line width=0.8pt] (left.north) .. controls +(north:0.5) and +(west:0.5) .. (top.west);
\draw [->,line width=0.8pt] (top.east) .. controls +(east:0.5) and +(north:0.5) .. (right.north);
\draw [->,line width=0.8pt] (down.west) .. controls +(west:0.5) and +(south:0.5) .. (left.south);
\draw [->,line width=0.8pt] (right.south) .. controls +(south:0.5) and +(east:0.5) .. (down.east) ;
\end{tikzpicture}
\begin{tikzpicture}
\setlength{\base}{1.2em}
\node[minimum width=8em,minimum height=18em,inner sep=1pt,rounded corners=10pt,draw,thick,font=\scriptsize,fill=white!50,drop shadow,align=center] (word1) at (0,0){
\begin{tabular}{l l}
\begin{tikzpicture}
\tikzstyle{node} = [minimum width=0em,minimum height=1em,inner sep=2pt,font=\scriptsize,anchor = west,rounded corners=0pt,outer sep=0pt]
\node [node,fill=green!15] (a1) at (0,0) {天气 \ \ \ \ };
\node [node] (a2) at (0,-\base*1) {\ \ 喜欢 \ \ 下雨 \ \ \ \ };
\node [node,fill=green!15] (a3) at (0,-\base*2) {\textless{}p\textgreater \ \ 显示 \ \ 所选 \ \ 项 \ \ 的 \ \ 内 \ \ \textless{}/p\textgreater{} };
\node [node,fill=blue!15] (a41) at (0,-\base*3) {桃树 \ \ \ \ 杏树 \ \ \ \ 梨树 \ \ , \ \ \ \ 不让 \ \ \ \ , \ \ };
\node [node,fill=blue!15] (a42) at (0,-\base*4) {\ \ 不让 \ \ \ \ , \ \ \ \ 开满 \ \ \ \ \ \ };
\node [node] (a5) at (0,-\base*5.5) {机器 \ \ 翻译 \ \ \ \ 人们 \ \ \ \ 生活 \ \ 带来了 \ \ 便利 \ \ };
\node [node] (a6) at (0,-\base*7) {这件 \ \ 事情 \ \ \ \ 成功率 \ \ \ \ 50 \ \ $\%$ \ \ };
\node [node,fill=green!15] (a7) at (0,-\base*8) {翻译 \ \ \ \ \ \ 特别 \ \ 感兴趣 \ \ };
\node [node] (a8) at (0,-\base*9) {他说 \ \ : \ \ `` 这个 \ \ 深深 \ \ 有趣 \ \ \ \ 想法 \ \ \ \ };
\node [node] (a81) at (0,-\base*10) {\ \ 心里 \ \ 。'' };
\node [node] (a9) at (0,-\base*11) {\ \ 喜欢 \ \ 下雨 \ \ \ \ };
\node [node,fill=yellow!15] (a10) at (0,-\base*12) {\ \ 喜欢 \ \ 下雨 \ \ \ \ };
\node [node] (a11) at (0,-\base*13) {花下 \ \ 成千成百 \ \ \ \ 蜜蜂 \ \ 嗡嗡 \ \ \ \ 闹着 \ \ };
\end{tikzpicture}
&
\begin{tikzpicture}
\tikzstyle{node} = [minimum width=0em,minimum height=1em,inner sep=2pt,font=\scriptsize,anchor = west,rounded corners=0pt,outer sep=0pt]
\node [node,fill=green!15] (a1) at (0,0) {The weather today is good , but ... };
\node [node] (a2) at (0,-\base*1) {I like rainy days .};
\node [node,fill=green!15] (a3) at (0,-\base*2) {\textless{}p\textgreater to show the selected side . \textless{}/p\textgreater{}};
\node [node,fill=blue!15] (a4) at (0,-\base*3.5) {Flowers bloom .};
\node [node] (a51) at (0,-\base*5) {Machine translation brings convenience to people's };
\node [node] (a52) at (0,-\base*6) {lives. };
\node [node] (a6) at (0,-\base*7) {The success rate for this matter is $\%$ . };
\node [node,fill=green!15] (a7) at (0,-\base*8) {I'm interested in translation . };
\node [node] (a8) at (0,-\base*9) {He said: `` This interesting idea is deeply };
\node [node] (a81) at (0,-\base*10) {imprinted in my heart . '' };
\node [node] (a9) at (0,-\base*11) {I like rainy days .};
\node [node,fill=yellow!15] (a10) at (0,-\base*12) {I like rainy days .};
\node [node] (a11) at (0,-\base*13) {Hundreds of bees hummed under the flowers . };
\end{tikzpicture}
\end{tabular}
};
\node[minimum width=8em,minimum height=10.5em,inner sep=2pt,rounded corners=10pt,draw,thick,font=\scriptsize,fill=white!50,drop shadow,align=center] (word2) at (0,-6.6){
\begin{tabular}{l l}
\begin{tikzpicture}
\tikzstyle{node} = [minimum width=0em,minimum height=1em,inner sep=2pt,font=\scriptsize,anchor = west,rounded corners=0pt,outer sep=0pt]
\node [node] (a1) at (0,0) {\ \ 喜欢 \ \ 下雨 \ \ \ \ };
\node [node] (a2) at (0,-\base*1.5) {机器 \ \ 翻译 \ \ \ \ 人们 \ \ \ \ 生活 \ \ 带来了 \ \ 便利 \ \ };
\node [node] (a3) at (0,-\base*3) {这件 \ \ 事情 \ \ \ \ 成功率 \ \ \ \ 50 \ \ $\%$ \ \ };
\node [node] (a4) at (0,-\base*4) {他说 \ \ : \ \ `` 这个 \ \ 深深 \ \ 有趣 \ \ \ \ 想法 \ \ \ \ };
\node [node] (a42) at (0,-\base*5) {\ \ 心里 \ \ 。'' };
\node [node] (a5) at (0,-\base*6) {\ \ 喜欢 \ \ 下雨 \ \ \ \ };
\node [node] (a6) at (0,-\base*7) {花下 \ \ 成千成百 \ \ \ \ 蜜蜂 \ \ 嗡嗡 \ \ \ \ 闹着 \ \ };
\end{tikzpicture}
&
\begin{tikzpicture}
\tikzstyle{node} = [minimum width=0em,minimum height=1em,inner sep=2pt,font=\scriptsize,anchor = west,rounded corners=0pt,outer sep=0pt]
\node [node] (a1) at (0,0) {I like rainy days .};
\node [node] (a2) at (0,-\base*1) {Machine translation brings convenience to people's };
\node [node] (a22) at (0,-\base*2) {lives. };
\node [node] (a3) at (0,-\base*3) {The success rate for this matter is $\%$ . };
\node [node] (a4) at (0,-\base*4) {He said: `` This interesting idea is deeply };
\node [node] (a42) at (0,-\base*5) {imprinted in my heart . '' };
\node [node] (a5) at (0,-\base*6) {I like rainy days .};
\node [node] (a6) at (0,-\base*7) {Hundreds of bees hummed under the flowers . };
\end{tikzpicture}
\end{tabular}
};
\draw[->,line width=.1cm,blue!40 ] ([yshift=-0.3\base]word1.south) -- ([yshift=0.3\base]word2.north);
\end{tikzpicture}
\begin{tikzpicture}[scale=1]
\tikzstyle{prob} = [rectangle,fill=blue!40,text=white,inner sep=0pt,font=\scriptsize];
\tikzstyle{word} = [inner sep=0pt,font=\small];
\begin{scope}[]
% Column 1
\node [prob,minimum size=0.1cm] (prob11) at (0,0) {};
\node [prob,minimum size=0.5cm,anchor=center] (prob21) at ([yshift=-0.5cm]prob11.center) {$.7$};
\node [prob,minimum size=0.1cm,anchor=center] (prob31) at ([yshift=-0.5cm]prob21.center) {};
\node [prob,minimum size=0.1cm,anchor=center] (prob41) at ([yshift=-0.5cm]prob31.center) {};
\node [prob,minimum size=0.3cm,anchor=center] (prob51) at ([yshift=-0.5cm]prob41.center) {$.2$};
\begin{pgfonlayer}{background}
\coordinate (bottomleft) at ([shift={(-0.25cm,-0.25cm)}]prob51.center);
\coordinate (topright) at ([shift={(0.25cm,0.25cm)}]prob11.center);
\node [draw,fit=(prob11) (prob21) (prob31) (prob41) (prob51) (topright) (bottomleft)] (prob1) {};
\end{pgfonlayer}
% Column 2
\node [prob,minimum size=0.1cm,anchor=center] (prob12) at ([xshift=1cm]prob11.center) {};
\node [prob,minimum size=0.1cm,anchor=center] (prob22) at ([yshift=-0.5cm]prob12.center) {};
\node [prob,minimum size=0.4cm,anchor=center] (prob32) at ([yshift=-0.5cm]prob22.center) {$.4$};
\node [prob,minimum size=0.3cm,anchor=center] (prob42) at ([yshift=-0.5cm]prob32.center) {$.3$};
\node [prob,minimum size=0.1cm,anchor=center] (prob52) at ([yshift=-0.5cm]prob42.center) {};
\begin{pgfonlayer}{background}
\coordinate (bottomleft) at ([shift={(-0.25cm,-0.25cm)}]prob52.center);
\coordinate (topright) at ([shift={(0.25cm,0.25cm)}]prob12.center);
\node [draw,fit=(prob12) (prob22) (prob32) (prob42) (prob52) (topright) (bottomleft)] (prob2) {};
\end{pgfonlayer}
% Column 3
\node [prob,minimum size=0.1cm,anchor=center] (prob13) at ([xshift=1cm]prob12.center) {};
\node [prob,minimum size=0.1cm,anchor=center] (prob23) at ([yshift=-0.5cm]prob13.center) {};
\node [prob,minimum size=0.1cm,anchor=center] (prob33) at ([yshift=-0.5cm]prob23.center) {};
\node [prob,minimum size=0.4cm,anchor=center] (prob43) at ([yshift=-0.5cm]prob33.center) {$.6$};
\node [prob,minimum size=0.1cm,anchor=center] (prob53) at ([yshift=-0.5cm]prob43.center) {};
\begin{pgfonlayer}{background}
\coordinate (bottomleft) at ([shift={(-0.25cm,-0.25cm)}]prob53.center);
\coordinate (topright) at ([shift={(0.25cm,0.25cm)}]prob13.center);
\node [draw,fit=(prob13) (prob23) (prob33) (prob43) (prob53) (topright) (bottomleft)] (prob3) {};
\end{pgfonlayer}
% Column 4
\node [prob,minimum size=0.5cm,anchor=center] (prob14) at ([xshift=1cm]prob13.center) {$.8$};
\node [prob,minimum size=0.1cm,anchor=center] (prob24) at ([yshift=-0.5cm]prob14.center) {};
\node [prob,minimum size=0.1cm,anchor=center] (prob34) at ([yshift=-0.5cm]prob24.center) {};
\node [prob,minimum size=0.1cm,anchor=center] (prob44) at ([yshift=-0.5cm]prob34.center) {};
\node [prob,minimum size=0.1cm,anchor=center] (prob54) at ([yshift=-0.5cm]prob44.center) {};
\begin{pgfonlayer}{background}
\coordinate (bottomleft) at ([shift={(-0.25cm,-0.25cm)}]prob54.center);
\coordinate (topright) at ([shift={(0.25cm,0.25cm)}]prob14.center);
\node [draw,fit=(prob14) (prob24) (prob34) (prob44) (prob54) (topright) (bottomleft)] (prob4) {};
\end{pgfonlayer}
% Label
\draw [decorate,decoration={brace}] ([yshift=0.1cm]prob1.north west) to node [midway,above,font=\small] {学习目标(Teacher输出)} ([yshift=0.1cm]prob4.north east);
% Vocab
\node [word,anchor=center] () at ([xshift=-0.9cm]prob11.center) {EOS};
\node [word,anchor=center] () at ([xshift=-0.9cm]prob21.center) {I};
\node [word,anchor=center] () at ([xshift=-0.9cm]prob31.center) {am};
\node [word,anchor=center] () at ([xshift=-0.9cm]prob41.center) {fine};
\node [word,anchor=center] () at ([xshift=-0.9cm]prob51.center) {good};
\draw [decorate,decoration={brace,mirror}] ([xshift=-1cm]prob1.north west) to node [midway,left,font=\small,align=center] {\\} ([xshift=-1cm]prob1.south west);
% Model
\coordinate (bottomleft) at ([yshift=-1cm]prob1.south west);
\coordinate (topright) at ([yshift=-0.5cm]prob4.south east);
\node [draw,rounded corners=3pt,fill=green!20,inner sep=0pt,fit=(bottomleft) (topright)] (model) {};
\node [word] () at (model.center) {Student};
\foreach \i in {1,2,...,4}
\draw [-latex,thick] (prob\i.south) to ([yshift=-0.5cm]prob\i.south);
% Input
\node [word,anchor=south] (input1) at ([yshift=-1.8cm]prob1.south) {EOS};
\node [word,anchor=south] (input2) at ([yshift=-1.8cm]prob2.south) {I};
\node [word,anchor=south] (input3) at ([yshift=-1.8cm]prob3.south) {am};
\node [word,anchor=south] (input4) at ([yshift=-1.8cm]prob4.south) {fine};
\foreach \i in {1,2,...,4}
\draw [-latex,thick] ([yshift=0.3cm]input\i.south) to ([yshift=0.8cm]input\i.south);
\node [word,anchor=south] (ns) at ([xshift=-1cm]input1.south) {输入:};
\node [word,anchor=north] () at ([xshift=2.1cm,yshift=-0.5cm]ns.south) {(a)\ Word-level};
\end{scope}
\begin{scope}[xshift=2.5in]
% Column 1
\node [prob,minimum size=0.1cm] (prob11) at (0,0) {};
\node [prob,minimum size=0.5cm,anchor=center] (prob21) at ([yshift=-0.5cm]prob11.center) {$1.$};
\node [prob,minimum size=0.1cm,anchor=center] (prob31) at ([yshift=-0.5cm]prob21.center) {};
\node [prob,minimum size=0.1cm,anchor=center] (prob41) at ([yshift=-0.5cm]prob31.center) {};
\node [prob,minimum size=0.1cm,anchor=center] (prob51) at ([yshift=-0.5cm]prob41.center) {};
\begin{pgfonlayer}{background}
\coordinate (bottomleft) at ([shift={(-0.25cm,-0.25cm)}]prob51.center);
\coordinate (topright) at ([shift={(0.25cm,0.25cm)}]prob11.center);
\node [draw,fit=(prob11) (prob21) (prob31) (prob41) (prob51) (topright) (bottomleft)] (prob1) {};
\end{pgfonlayer}
% Column 2
\node [prob,minimum size=0.1cm,anchor=center] (prob12) at ([xshift=1cm]prob11.center) {};
\node [prob,minimum size=0.1cm,anchor=center] (prob22) at ([yshift=-0.5cm]prob12.center) {};
\node [prob,minimum size=0.5cm,anchor=center] (prob32) at ([yshift=-0.5cm]prob22.center) {$1.$};
\node [prob,minimum size=0.1cm,anchor=center] (prob42) at ([yshift=-0.5cm]prob32.center) {};
\node [prob,minimum size=0.1cm,anchor=center] (prob52) at ([yshift=-0.5cm]prob42.center) {};
\begin{pgfonlayer}{background}
\coordinate (bottomleft) at ([shift={(-0.25cm,-0.25cm)}]prob52.center);
\coordinate (topright) at ([shift={(0.25cm,0.25cm)}]prob12.center);
\node [draw,fit=(prob12) (prob22) (prob32) (prob42) (prob52) (topright) (bottomleft)] (prob2) {};
\end{pgfonlayer}
% Column 3
\node [prob,minimum size=0.1cm,anchor=center] (prob13) at ([xshift=1cm]prob12.center) {};
\node [prob,minimum size=0.1cm,anchor=center] (prob23) at ([yshift=-0.5cm]prob13.center) {};
\node [prob,minimum size=0.1cm,anchor=center] (prob33) at ([yshift=-0.5cm]prob23.center) {};
\node [prob,minimum size=0.1cm,anchor=center] (prob43) at ([yshift=-0.5cm]prob33.center) {};
\node [prob,minimum size=0.5cm,anchor=center] (prob53) at ([yshift=-0.5cm]prob43.center) {$1.$};
\begin{pgfonlayer}{background}
\coordinate (bottomleft) at ([shift={(-0.25cm,-0.25cm)}]prob53.center);
\coordinate (topright) at ([shift={(0.25cm,0.25cm)}]prob13.center);
\node [draw,fit=(prob13) (prob23) (prob33) (prob43) (prob53) (topright) (bottomleft)] (prob3) {};
\end{pgfonlayer}
% Column 4
\node [prob,minimum size=0.5cm,anchor=center] (prob14) at ([xshift=1cm]prob13.center) {$1.$};
\node [prob,minimum size=0.1cm,anchor=center] (prob24) at ([yshift=-0.5cm]prob14.center) {};
\node [prob,minimum size=0.1cm,anchor=center] (prob34) at ([yshift=-0.5cm]prob24.center) {};
\node [prob,minimum size=0.1cm,anchor=center] (prob44) at ([yshift=-0.5cm]prob34.center) {};
\node [prob,minimum size=0.1cm,anchor=center] (prob54) at ([yshift=-0.5cm]prob44.center) {};
\begin{pgfonlayer}{background}
\coordinate (bottomleft) at ([shift={(-0.25cm,-0.25cm)}]prob54.center);
\coordinate (topright) at ([shift={(0.25cm,0.25cm)}]prob14.center);
\node [draw,fit=(prob14) (prob24) (prob34) (prob44) (prob54) (topright) (bottomleft)] (prob4) {};
\end{pgfonlayer}
% Label
\draw [decorate,decoration={brace}] ([yshift=0.1cm]prob1.north west) to node [midway,above,font=\small] {学习目标(Teacher输出)} ([yshift=0.1cm]prob4.north east);
% Vocab
\node [word,anchor=center] () at ([xshift=-0.9cm]prob11.center) {EOS};
\node [word,anchor=center] () at ([xshift=-0.9cm]prob21.center) {I};
\node [word,anchor=center] () at ([xshift=-0.9cm]prob31.center) {am};
\node [word,anchor=center] () at ([xshift=-0.9cm]prob41.center) {fine};
\node [word,anchor=center] () at ([xshift=-0.9cm]prob51.center) {good};
\draw [decorate,decoration={brace,mirror}] ([xshift=-1cm]prob1.north west) to node [midway,left,font=\small,align=center] {\\} ([xshift=-1cm]prob1.south west);
% Model
\coordinate (bottomleft) at ([yshift=-1cm]prob1.south west);
\coordinate (topright) at ([yshift=-0.5cm]prob4.south east);
\node [draw,rounded corners=3pt,fill=green!20,inner sep=0pt,fit=(bottomleft) (topright)] (model) {};
\node [word] () at (model.center) {Student};
\foreach \i in {1,2,...,4}
\draw [-latex,thick] (prob\i.south) to ([yshift=-0.5cm]prob\i.south);
% Input
\node [word,anchor=south] (input1) at ([yshift=-1.8cm]prob1.south) {EOS};
\node [word,anchor=south] (input2) at ([yshift=-1.8cm]prob2.south) {I};
\node [word,anchor=south] (input3) at ([yshift=-1.8cm]prob3.south) {am};
\node [word,anchor=center] (input4) at ([xshift=1cm]input3.center) {good};
\foreach \i in {1,2,3}
\draw [-latex,thick] ([yshift=0.3cm]input\i.south) to ([yshift=0.8cm]input\i.south);
\draw [-latex,thick] ([yshift=0.36cm]input4.south) to ([yshift=0.86cm]input4.south);
\node [word,anchor=south] (ns) at ([xshift=-1cm]input1.south) {输入:};
\node [word,anchor=north] () at ([xshift=2.1cm,yshift=-0.5cm]ns.south) {(b)\ Sequence-level };
\end{scope}
\end{tikzpicture}
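% The two panels above, written out as training losses. A numpy sketch under
% stated assumptions, not the exact training code: word-level distillation
% matches the teacher's full distribution at every position, while
% sequence-level distillation uses one-hot targets on a single teacher
% sequence (taken here as the per-position argmax for brevity; in practice
% the teacher's beam-search output, as panel (b) suggests).
%
%   import numpy as np
%
%   def word_level_kd(teacher_probs, student_log_probs):
%       # L = -sum_t sum_v P_teacher(v|t) * log P_student(v|t)
%       return -(teacher_probs * student_log_probs).sum(-1).mean()
%
%   def sequence_level_kd(teacher_ids, student_log_probs):
%       # One-hot targets on the teacher's output sequence.
%       pos = np.arange(len(teacher_ids))
%       return -student_log_probs[pos, teacher_ids].mean()
%
%   # Vocabulary {EOS, I, am, fine, good}; rows follow the columns of panel (a).
%   teacher = np.array([[.00, .70, .05, .05, .20],
%                       [.10, .10, .40, .30, .10],
%                       [.05, .05, .10, .60, .20],
%                       [.80, .05, .05, .05, .05]])
%   student = np.log(np.full((4, 5), 0.2))     # untrained student: uniform
%   print(word_level_kd(teacher, student),
%         sequence_level_kd(teacher.argmax(-1), student))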
\centering
\hspace*{\fill}
\subfigure[假设选择]
{
\begin{tikzpicture}[scale=0.5]
\tikzstyle{system} = [rectangle,very thick,minimum width=1cm,font=\tiny];
\tikzstyle{output} = [rectangle,very thick,rounded corners=3pt,minimum width=1cm,align=center,font=\tiny];
\begin{scope}
\node [system,draw=orange,text=orange] (model3) at (0,0) {Model $3$};
\node [system,draw=ugreen,text=ugreen,anchor=south] (model2) at ([yshift=0.3cm]model3.north) {Model $2$};
\node [system,draw=red,text=red,anchor=south] (model1) at ([yshift=0.3cm]model2.north) {Model $1$};
\node [output,draw=orange,text=orange,anchor=west] (output3) at ([xshift=0.5cm]model3.east) {Output $3$};
\node [output,draw=ugreen,text=ugreen,anchor=west] (output2) at ([xshift=0.5cm]model2.east) {Output $2$};
\node [output,draw=red,text=red,anchor=west] (output1) at ([xshift=0.5cm]model1.east) {Output $1$};
\begin{pgfonlayer}{background}
\node [draw,thick,dashed,rounded corners=3pt,inner sep=2pt,fit=(output1) (output2) (output3)] (output) {};
\end{pgfonlayer}
\node [output,draw=ublue,text=ublue,minimum width=1cm,right=1cm of output] (final) {Final\\Output};
\draw [->,very thick] (model1) to (output1);
\draw [->,very thick] (model2) to (output2);
\draw [->,very thick] (model3) to (output3);
\draw [->,very thick] (output) to node [above,pos=0.5,font=\tiny] {Selection} (final);
\end{scope}
\end{tikzpicture}
}
\hfill
\subfigure[预测融合]
{
\begin{tikzpicture}[scale=0.5]
\tikzstyle{system} = [rectangle,very thick,minimum width=1cm,font=\tiny];
\tikzstyle{output} = [rectangle,very thick,rounded corners=3pt,minimum width=1cm,align=center,font=\tiny];
\begin{scope}
\node [system,draw=orange,text=orange] (model3) at (0,0) {Model $3$};
\node [system,draw=ugreen,text=ugreen,anchor=south] (model2) at ([yshift=0.3cm]model3.north) {Model $2$};
\node [system,draw=red,text=red,anchor=south] (model1) at ([yshift=0.3cm]model2.north) {Model $1$};
\begin{pgfonlayer}{background}
\node [draw,thick,dashed,inner sep=2pt,fit=(model3) (model2) (model1)] (ensemble) {};
\end{pgfonlayer}
\node [system,draw=ugreen,text=ugreen,right=1cm of ensemble] (model) {Model};
\node [output,draw=ublue,text=ublue,minimum width=1cm,anchor=west] (final) at ([xshift=0.5cm]model.east) {Final\\Output};
\draw [->,very thick] (ensemble) to node [above,pos=0.5,font=\tiny] {Ensemble} (model);
\draw [->,very thick] (model) to (final);
\end{scope}
\end{tikzpicture}
}
\hspace*{\fill}
\\
\subfigure[译文重组]
{
\begin{tikzpicture}[scale=0.5]
\tikzstyle{system} = [rectangle,very thick,minimum width=1cm,font=\tiny];
\tikzstyle{output} = [rectangle,very thick,rounded corners=3pt,minimum width=1cm,align=center,font=\tiny];
\tikzstyle{dot} = [circle,fill=blue!40!white,minimum size=5pt,inner sep=0pt];
\begin{scope}
\node [system,draw=orange,text=orange] (model3) at (0,0) {Model $3$};
\node [system,draw=ugreen,text=ugreen,anchor=south] (model2) at ([yshift=0.3cm]model3.north) {Model $2$};
\node [system,draw=red,text=red,anchor=south] (model1) at ([yshift=0.3cm]model2.north) {Model $1$};
\node [output,draw=orange,text=orange,anchor=west] (output3) at ([xshift=0.5cm]model3.east) {Output $3$};
\node [output,draw=ugreen,text=ugreen,anchor=west] (output2) at ([xshift=0.5cm]model2.east) {Output $2$};
\node [output,draw=red,text=red,anchor=west] (output1) at ([xshift=0.5cm]model1.east) {Output $1$};
\draw [->,very thick] (model1) to (output1);
\draw [->,very thick] (model2) to (output2);
\draw [->,very thick] (model3) to (output3);
\begin{pgfonlayer}{background}
\node [draw,thick,dashed,rounded corners=3pt,inner sep=2pt,fit=(output1) (output2) (output3)] (output) {};
\end{pgfonlayer}
\node [dot,anchor=west] (lattice1) at ([shift={(1.5cm,0.5cm)}]output2.east) {};
\node [dot,anchor=west] (lattice2) at ([shift={(1cm,0)}]lattice1.east) {};
\node [dot,anchor=west] (lattice3) at ([shift={(1cm,0)}]lattice2.east) {};
\node [dot,anchor=west] (lattice4) at ([shift={(1.5cm,-0.5cm)}]output2.east) {};
\node [dot,anchor=west] (lattice5) at ([shift={(1cm,0)}]lattice4.east) {};
\draw [-latex,blue] (lattice1) to [out=30,in=150] (lattice2);
\draw [-latex,blue] (lattice2) to [out=30,in=150] (lattice3);
\draw [-latex,blue] (lattice4) to [out=15,in=-120] (lattice2);
\draw [-latex,blue] (lattice4) to [out=-30,in=-150] (lattice5);
\draw [-latex,blue] (lattice5) to [out=15,in=-120] (lattice3);
\draw [-latex,blue] (lattice5) to [out=-60,in=-90] (lattice3);
\begin{pgfonlayer}{background}
\node [draw=blue,fill=white,drop shadow,thick,rounded corners=3pt,inner sep=5pt,fit=(lattice1) (lattice2) (lattice3) (lattice4) (lattice5),label={[font=\tiny,label distance=0pt]90:Lattice}] (lattice) {};
\end{pgfonlayer}
\draw [->,very thick] (output) to (lattice);
\node [system,draw=purple,text=purple,anchor=west] (model) at ([xshift=5.3cm]output1.east) {Model};
\node [output,draw=ublue,text=ublue,minimum width=1cm,right=1.3cm of lattice] (final) {Final Output};
\draw [->,very thick] (model) |- (final);
\draw [->,very thick] (lattice) -- (final);
\end{scope}
\end{tikzpicture}
}
\begin{tikzpicture}
\tikzstyle{layer} = [rectangle,draw,rounded corners=3pt,minimum width=1cm,minimum height=0.5cm];
\tikzstyle{prob} = [minimum width=0.3cm,rectangle,fill=ugreen!20!white,inner sep=0pt];
\begin{scope}[local bounding box=STANDARD]
\node [] (input1) at (0,0) {$\cdots$};
\node [anchor=south,layer,fill=orange!15!white] (net1) at ([yshift=0.5cm]input1.north) {};
\node [anchor=south,layer,fill=orange!15!white] (out1) at ([yshift=0.5cm]net1.north) {};
\node [anchor=south,prob,minimum height=0.9cm] (prob5) at ([yshift=1.2cm]out1.north) {};
\node [anchor=south east,prob,minimum height=0.1cm] (prob4) at ([xshift=-1pt]prob5.south west) {};
\node [anchor=south east,prob,minimum height=0.2cm] (prob3) at ([xshift=-1pt]prob4.south west) {};
\node [anchor=south east,prob,minimum height=0.5cm] (prob2) at ([xshift=-1pt]prob3.south west) {};
\node [anchor=south east,prob,minimum height=0.4cm] (prob1) at ([xshift=-1pt]prob2.south west) {};
\node [anchor=south west,prob,minimum height=0.6cm] (prob6) at ([xshift=1pt]prob5.south east) {};
\node [anchor=south west,prob,minimum height=0.3cm] (prob7) at ([xshift=1pt]prob6.south east) {};
\node [anchor=south west,prob,minimum height=0.2cm] (prob8) at ([xshift=1pt]prob7.south east) {};
\node [anchor=south west,prob,minimum height=0.1cm] (prob9) at ([xshift=1pt]prob8.south east) {};
\path [fill=blue!20!white,draw=white] (out1.north west) -- (prob1.south west) -- (prob9.south east) -- (out1.north east) -- (out1.north west);
\draw [->] (input1) to (net1);
\draw [->] (net1) to (out1);
\node [font=\small] (label1) at ([yshift=0.6cm]out1.north) {Softmax};
\end{scope}
\begin{scope}[local bounding box=SELECTION]
\node [] (input2) at (4.5cm,0) {$\cdots$};
\node [anchor=south,layer,fill=orange!15!white] (net2) at ([yshift=0.5cm]input2.north) {};
\node [anchor=south,layer,fill=orange!15!white] (out2) at ([yshift=0.5cm]net2.north) {};
\node [anchor=south,prob,minimum height=0.9cm] (prob5) at ([yshift=1.2cm]out2.north) {};
\node [anchor=south east,prob,minimum height=0.1cm,opacity=0] (prob4) at ([xshift=-1pt]prob5.south west) {};
\node [text=red,anchor=south,inner sep=1pt] () at (prob4.south) {$\times$};
\node [anchor=south east,prob,minimum height=0.2cm,opacity=0] (prob3) at ([xshift=-1pt]prob4.south west) {};
\node [text=red,anchor=south,inner sep=1pt] () at (prob3.south) {$\times$};
\node [anchor=south east,prob,minimum height=0.5cm] (prob2) at ([xshift=-1pt]prob3.south west) {};
\node [anchor=south east,prob,minimum height=0.4cm] (prob1) at ([xshift=-1pt]prob2.south west) {};
\node [anchor=south west,prob,minimum height=0.6cm,opacity=0] (prob6) at ([xshift=1pt]prob5.south east) {};
\node [text=red,anchor=south,inner sep=1pt] () at (prob6.south) {$\times$};
\node [anchor=south west,prob,minimum height=0.3cm,opacity=0] (prob7) at ([xshift=1pt]prob6.south east) {};
\node [text=red,anchor=south,inner sep=1pt] () at (prob7.south) {$\times$};
\node [anchor=south west,prob,minimum height=0.2cm] (prob8) at ([xshift=1pt]prob7.south east) {};
\node [anchor=south west,prob,minimum height=0.1cm,opacity=0] (prob9) at ([xshift=1pt]prob8.south east) {};
\node [text=red,anchor=south,inner sep=1pt] (plabel9) at (prob9.south) {$\times$};
\path [fill=blue!20!white,draw=white] (out2.north west) -- (prob1.south west) -- (prob9.south east) -- (out2.north east) -- (out2.north west);
\draw [->] (input2) to (net2);
\draw [->] (net2) to (out2);
\node [font=\small] (label2) at ([yshift=0.6cm]out2.north) {Softmax};
\node [anchor=west,layer,fill=orange!15!white] (net3) at ([xshift=2cm]net2.east) {};
\node [anchor=north,font=\scriptsize] (input3) at ([yshift=-0.5cm]net3.south) {源语};
\node [anchor=south,layer,align=center,font=\scriptsize,fill=yellow!10!white] (out3) at ([yshift=0.9cm]net3.north) {Candidate\\List};
\draw [->] (input3) to (net3);
\draw [->] (net3) to (out3);
\draw [->] (out3) |- (plabel9.east);
\end{scope}
\node [anchor=north,font=\scriptsize] () at ([yshift=-0.2em]STANDARD.south) {(a) 标准方法};
\node [anchor=north,font=\scriptsize] () at ([xshift=-3em]SELECTION.south) {(b) 词汇选择};
\end{tikzpicture}
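% A small numpy sketch of the lexical selection in panel (b): logits outside
% the candidate list predicted from the source are masked out, so the softmax
% is effectively computed over the shortlist only. The function name and
% shapes are illustrative assumptions.
%
%   import numpy as np
%
%   def masked_softmax(logits, candidates):
%       masked = np.full_like(logits, -np.inf)
%       masked[candidates] = logits[candidates]
%       e = np.exp(masked - masked[candidates].max())   # exp(-inf) = 0 off the list
%       return e / e.sum()
%
%   logits = np.array([1.2, -0.3, 0.8, 2.0, 0.1])
%   print(masked_softmax(logits, candidates=np.array([0, 3])))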
\begin{tikzpicture}
\begin{scope}
\tikzstyle{word} = [font=\scriptsize]
\tikzstyle{model} = [rectangle,draw,minimum height=2.5em,minimum width=5em,rounded corners=4pt,fill=blue!15!white]
\node [model,minimum width=10.5em] (encoder0) at (0,0) {Encoder};
\node [word] (w1) at ([yshift=-2em,xshift=1em]encoder0.south) {\#};
\node [word] (w2) at ([xshift=-1em]w1.west) {\#};
\node [word] (w3) at ([xshift=-1em]w2.west) {$x_2$};
\node [word] (w4) at ([xshift=-1em]w3.west) {$x_1$};
\node [word] (w5) at ([xshift=1em]w1.east) {\#};
\node [word] (w6) at ([xshift=1em]w5.east) {$x_6$};
\node [word] (w7) at ([yshift=2em,xshift=1em]encoder0.north) {$x_4$};
\node [word] (w8) at ([yshift=0em,xshift=-1em]w7.west) {$x_3$};
\node [word] (w9) at ([yshift=0em,xshift=1em]w7.east) {$x_5$};
\draw [->] (w1.north) -- ([yshift=1.3em]w1.north);
\draw [->] (w2.north) -- ([yshift=1.3em]w2.north);
\draw [->] (w3.north) -- ([yshift=1.4em]w3.north);
\draw [->] (w4.north) -- ([yshift=1.4em]w4.north);
\draw [->] (w5.north) -- ([yshift=1.3em]w5.north);
\draw [->] (w6.north) -- ([yshift=1.4em]w6.north);
\draw [->] (w7.south) -- ([yshift=-1.4em]w7.south);
\draw [->] (w8.south) -- ([yshift=-1.4em]w8.south);
\draw [->] (w9.south) -- ([yshift=-1.4em]w9.south);
\node [model] (encoder1) at ([xshift=8em]encoder0.east) {Encoder};
\node [model,fill=red!15!white] (decoder) at ([xshift=5em]encoder1.east) {Decoder};
\node [] (sinput) at ([yshift=-3em]encoder1.south) {\footnotesize{源语输入}};
\node [] (tinput) at ([yshift=-3em]decoder.south) {\footnotesize{目标语输入}};
\node [] (output) at ([yshift=3em]decoder.north) {\footnotesize{目标语输出}};
\draw [->] (sinput) -- (encoder1);
\draw [->] (tinput) -- (decoder);
\draw [->] (decoder) -- (output);
\coordinate (do0) at ([yshift=1em]encoder1.north);
\coordinate (do1) at ([xshift=3.5em]do0.east);
\coordinate (do2) at ([yshift=-2.3em]do1.south);
\draw [-] (encoder1.north) -- (do0);
\draw [-] (do0) -- (do1);
\draw [-] (do1) -- (do2);
\draw [->] (do2) -- (decoder.west);
\begin{pgfonlayer}{background}
\node [rectangle,inner sep=1em,fill=black!5,rounded corners=4pt] [fit =(w4) (w6) (w9) (encoder0) ] (box) {};
\end{pgfonlayer}
\node [] (left) at ([yshift=-1.5em]box.south) {编码器使用单语数据预训练};
\node [] (right) at ([xshift=11em]left.east) {在翻译任务上进行微调};
\node[anchor=north] (arrow1) at (3.7,0.1){};
\draw[fill=yellow!20]([yshift=-0.3em]arrow1.north)--([xshift=-1em,yshift=0.5em]arrow1.north west)--([xshift=-1em,yshift=0.1em]arrow1.north west)--([xshift=-2.6em,yshift=0.1em]arrow1.north west)--([xshift=-2.6em,yshift=-0.1em]arrow1.south west)--([xshift=-1em,yshift=-0.1em]arrow1.south west)--([xshift=-1em,yshift=-0.5em]arrow1.south west)--([yshift=-0.3em]arrow1.north);
\end{scope}
\end{tikzpicture}
\begin{tikzpicture}
\newlength{\YShift}
\newlength{\XShift}
\setlength{\YShift}{0.8\base}
\setlength{\XShift}{0.8\base}
\tikzstyle{modelnode} = [rectangle,draw,rounded corners=2pt,inner sep=0pt,minimum height=4.2em,minimum width=2em,font=\small,anchor=north]
\coordinate (stu01) at (0,0);
\coordinate (stu02) at ([xshift=3em]stu01);
\coordinate (stu03) at ([xshift=3em]stu02);
\coordinate (stu04) at ([xshift=3em]stu03);
\coordinate (stu05) at ([xshift=3em]stu04);
\coordinate (tea01) at ([xshift=8em]stu05);
\coordinate (tea02) at ([xshift=3em]tea01);
% iterations
\foreach \curr / \prev in {1/0,2/1,3/2}
{
% models
\node[modelnode,fill=yellow!20] (stu\curr1) at ([yshift=-2em]stu\prev1.south) {\rotatebox{90}{Student $1$}};
\node[modelnode,fill=yellow!20] (stu\curr2) at ([yshift=-2em]stu\prev2.south) {\rotatebox{90}{Student $2$}};
\node[modelnode,fill=yellow!20] (stu\curr3) at ([yshift=-2em]stu\prev3.south) {\rotatebox{90}{Student $3$}};
\node[modelnode,fill=yellow!20] (stu\curr4) at ([yshift=-2em]stu\prev4.south) {\rotatebox{90}{Student $4$}};
\node[modelnode,fill=yellow!20] (stu\curr5) at ([yshift=-2em]stu\prev5.south) {\rotatebox{90}{Student $5$}};
\node[modelnode] (tea\curr1) at ([yshift=-2em]tea\prev1.south) {\rotatebox{90}{\color{red!60} Teacher $1$}};
\node[modelnode] (tea\curr2) at ([yshift=-2em]tea\prev2.south) {\rotatebox{90}{\color{blue!60} Teacher $2$}};
% ensemble labels
\draw[-latex'] ([xshift=2pt]stu\curr5.east) to node [auto] {\small Ensemble} ([xshift=-2pt]tea\curr1.west);
}
% iteration labels
\node[font=\small,anchor=east,purple!80] (iterate1) at ([xshift=-1em]stu21.west) {\rotatebox{90}{Iteration $1$}};
\node[font=\small,anchor=east,purple!80] (iterate2) at ([xshift=-1em]stu31.west) {\rotatebox{90}{Iteration $2$}};
% distillation labels
\node[font=\small,anchor=south west] (distill1) at ([yshift=0.2em]iterate1.north west) {Distillation};
\node[font=\small,anchor=south west] (distill2) at ([yshift=0.2em]iterate2.north west) {Distillation};
% student groups
\begin{pgfonlayer}{background}
\node[rectangle,draw,very thick,red!60,densely dotted,inner sep=2pt,rounded corners=2pt,fill=red!20] [fit = (stu21) (stu22) (stu23) ] (group21) {};
\node[rectangle,draw,very thick,blue!60,densely dotted,inner sep=2pt,rounded corners=2pt,fill=blue!20] [fit = (stu24) (stu25) ] (group22) {};
\node[rectangle,draw,very thick,blue!60,densely dotted,inner sep=2pt,rounded corners=2pt,fill=blue!20] [fit = (stu31) (stu32) ] (group31) {};
\node[rectangle,draw,very thick,red!60,densely dotted,inner sep=2pt,rounded corners=2pt,fill=red!20] [fit = (stu33) (stu34) (stu35) ] (group32) {};
\end{pgfonlayer}
% distillation
\draw[-latex',red!60,very thick] (tea11.south) .. controls +(south:1.5em) and +(north:2em) .. (group21.north);
\draw[-latex',blue!60,very thick] (tea12.south) .. controls +(south:2em) and +(north:1.5em) .. (group22.north);
\draw[-latex',red!60,very thick] (tea21.south) .. controls +(south:2em) and +(north:2.5em) .. (group32.north);
\draw[-latex',blue!60,very thick] (tea22.south) .. controls +(south:2em) and +(north:1.5em) .. (group31.north);
\end{tikzpicture}
\begin{tikzpicture}
\begin{scope}
\node [anchor=center] (node1) at (-2.3,1) {\small{训练:}};
\node [anchor=center] (node11) at (-2.0,1) {};
\node [anchor=center] (node12) at (-1.1,1) {};
\node [anchor=center] (node2) at (-2.3,0.5) {\small{推理:}};
\node [anchor=center] (node21) at (-2.0,0.5) {};
\node [anchor=center] (node22) at (-1.1,0.5) {};
\node [anchor=west,draw=black,minimum width=5.6em,minimum height=2.2em,fill=blue!20,rounded corners=2pt] (node1-1) at (0,0) {\footnotesize{双语数据}};
\node [anchor=south,draw=black,minimum width=4.5em,minimum height=2.2em,fill=blue!20,rounded corners=2pt] (node1-2) at ([yshift=-5em]node1-1.south) {\footnotesize{目标语伪数据}};
\node [anchor=west,draw=black,minimum width=4.5em,minimum height=2.2em,fill=red!20,rounded corners=2pt] (node2-1) at ([xshift=-7.3em,yshift=-2.5em]node1-1.west) {\footnotesize{前向NMT系统}};
\node [anchor=west,draw=black,minimum width=4.5em,minimum height=2.2em,fill=red!20,rounded corners=2pt] (node3-1) at ([xshift=1.5em,yshift=-2.5em]node1-1.east) {\footnotesize{反向NMT系统}};
\node [anchor=east,draw=black,minimum width=5.6em,minimum height=2.2em,fill=blue!20,rounded corners=2pt] (node4-1) at ([xshift=18em]node1-1) {\footnotesize{双语数据}};
\node [anchor=south,draw=black,minimum width=4.5em,minimum height=2.2em,fill=blue!20,rounded corners=2pt] (node4-2) at ([yshift=-5em]node4-1.south) {\footnotesize{目标语伪数据}};
\node [anchor=east,draw=black,minimum width=4.5em,minimum height=2.2em,fill=red!20,rounded corners=2pt] (node5-1) at ([xshift=15.5em]node3-1.east) {\footnotesize{前向NMT系统}};
\draw [-stealth](node1-1.west)--([xshift=3em]node2-1.north);
\draw [-stealth](node1-1.east)--([xshift=-3em]node3-1.north);
\draw [-stealth](node1-2.east)--([xshift=-3em]node3-1.south);
\draw [-stealth](node11.east)--(node12.west);
\draw [-stealth,dashed](node21.east)--(node22.west);
\draw [-stealth,dashed]([xshift=3em]node2-1.south)--(node1-2.west);
\draw [-stealth,dashed]([xshift=3em]node3-1.south)--(node4-2.west);
\draw [-stealth](node4-1.east)--([xshift=-3em]node5-1.north);
\draw [-stealth](node4-2.east)--([xshift=-3em]node5-1.south);
\end{scope}
\end{tikzpicture}
\begin{tikzpicture}
\node [anchor=north west] (part1) at (0,0) {\small{$\begin{bmatrix} \textrm{Have} \; 0.5 \\ \textrm{Has} \ \ \; 0.1 \\ \vdots \end{bmatrix}$}};
\node [anchor=north](p1) at ([yshift=-0.3em]part1.south) {$P_1$};
\node [anchor=west](part2) at ([xshift=0.5em]part1.east){\small{$\begin{bmatrix} \textrm{Have} \; 0.2 \\ \textrm{Has} \ \ \; 0.3 \\ \vdots \end{bmatrix}$}};
\node [anchor=north](p2) at ([yshift=-0.3em]part2.south) {$P_2$};
\node [anchor=west](part3) at ([xshift=0.5em]part2.east){\small{$\begin{bmatrix} \textrm{Have} \; 0.4 \\ \textrm{Has} \ \ \; 0.3 \\ \vdots \end{bmatrix}$}};
\node [anchor=north](p3) at ([yshift=-0.3em]part3.south) {$P_3$};
\node [anchor=west](part4) at ([xshift=0.5em]part3.east){\huge{$\Rightarrow$}};
\node [anchor=west](part5) at ([xshift=0.5em]part4.east){\small{$\begin{bmatrix} \textrm{Have} \; 0.37 \\ \textrm{Has} \ \ \; 0.23 \\ \vdots \end{bmatrix}$}};
\node [anchor=north](p5) at (part5.south) {$P=\sum_{i=1}^{3}{\frac{1}{3}P_{i}}$};
\end{tikzpicture}
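% The averaging in this figure, as a two-line numpy check. The third
% component of each vector is illustrative padding standing in for the elided
% rest of the vocabulary, so that each distribution sums to one.
%
%   import numpy as np
%
%   p1 = np.array([0.5, 0.1, 0.4])   # Model 1: P(Have), P(Has), rest
%   p2 = np.array([0.2, 0.3, 0.5])
%   p3 = np.array([0.4, 0.3, 0.3])
%   p = (p1 + p2 + p3) / 3           # P = sum_{i=1}^{3} (1/3) P_i
%   print(p[:2])                     # -> [0.3667 0.2333], the 0.37 / 0.23 above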
\begin{tikzpicture}
\tikzstyle{system} = [rectangle,very thick,minimum width=1.5cm,font=\scriptsize];
\tikzstyle{output} = [rectangle,very thick,rounded corners=3pt,minimum width=1.5cm,align=center,font=\scriptsize];
\begin{scope}[local bounding box=MULTIPLE]
\node [system,draw=orange,text=orange] (engine3) at (0,0) {Engine $n$};
\node [system,draw=ugreen,text=ugreen,anchor=south] (engine2) at ([yshift=0.6cm]engine3.north) {Engine $2$};
\node [system,draw=red,text=red,anchor=south] (engine1) at ([yshift=0.3cm]engine2.north) {Engine $1$};
\node [output,draw=orange,text=orange,anchor=west] (output3) at ([xshift=0.5cm]engine3.east) {Output $n$};
\node [output,draw=ugreen,text=ugreen,anchor=west] (output2) at ([xshift=0.5cm]engine2.east) {Output $2$};
\node [output,draw=red,text=red,anchor=west] (output1) at ([xshift=0.5cm]engine1.east) {Output $1$};
\draw [very thick,decorate,decoration={brace}] ([xshift=3pt]output1.north east) to node [midway,name=final] {} ([xshift=3pt]output3.south east);
\node [output,draw=ublue,text=ublue,minimum width=1cm,right=0pt of final,minimum height=2.5em] () {Final\\Output};
\draw [->,very thick] (engine1) to (output1);
\draw [->,very thick] (engine2) to (output2);
\draw [->,very thick] (engine3) to (output3);
\node [] () at ([yshift=0.4cm]output3.north) {$\vdots$};
\end{scope}
\begin{scope}[local bounding box=SINGLE]
\node [output,draw=ugreen,text=ugreen,anchor=west] (output3) at ([xshift=4cm]output3.east) {Output $n$};
\node [output,draw=ugreen,text=ugreen,anchor=west] (output2) at ([xshift=4cm]output2.east) {Output $2$};
\node [output,draw=ugreen,text=ugreen,anchor=west] (output1) at ([xshift=4cm]output1.east) {Output $1$};
\node [system,draw=ugreen,text=ugreen,anchor=east,align=center,inner sep=1.9pt] (engine) at ([xshift=-0.5cm]output2.west) {Single\\Engine};
\draw [very thick,decorate,decoration={brace}] ([xshift=3pt]output1.north east) to node [midway,name=final] {} ([xshift=3pt]output3.south east);
\node [output,draw=ublue,text=ublue,minimum width=1cm,right=0pt of final,minimum height=2.5em] () {Final\\Output};
\draw [->,very thick] (engine.east) to (output1.west);
\draw [->,very thick] (engine.east) to (output2.west);
\draw [->,very thick] (engine.east) to (output3.west);
\node [] () at ([yshift=0.4cm]output3.north) {$\vdots$};
\end{scope}
\node [align=center,anchor=north,font=\small] () at ([yshift=-0.3cm]MULTIPLE.south) {(a) combining outputs from\\multiple translation engines};
\node [align=center,anchor=north,font=\small] () at ([yshift=-0.3cm]SINGLE.south) {(b) combining outputs from a\\single translation engine};
\end{tikzpicture}
\begin{tikzpicture}
\setlength{\base}{1.2em}
\tikzstyle{node} = [rounded corners=1pt,minimum width=1.2em,minimum height=1.2em,draw,fill=green!30!white]
\tikzstyle{node2} = [rounded corners=1pt,minimum width=1.2em,minimum height=1.2em,draw,fill=blue!30!white]
\node[node] (enc1) at (0,0) {};
\node[node] (enc2) at ([xshift = \base]enc1.east) {};
\node[node] (enc3) at ([xshift = \base]enc2.east) {};
\node[node] (enc4) at ([xshift = \base]enc3.east) {};
\node[node] (enc5) at ([xshift = \base]enc4.east) {};
\node[node] (enc6) at ([xshift = \base]enc5.east) {};
\node[] (enc7) at ([xshift = \base]enc6.east) {...};
\node[node] (enc8) at ([xshift = \base]enc7.east) {};
\node[node] (enc9) at ([xshift = \base]enc8.east) {};
\node[node] (enc10) at ([xshift = \base]enc9.east) {};
\node[font=\scriptsize,rotate=270] (src) at ([xshift = -\base]enc1.west) {src};
\draw [->] ([xshift=-0.75em]enc1.west) -- (enc1.west);
\draw [decorate,decoration={brace}] ([yshift=0.3em]enc1.north west) to node [auto,anchor=south,font=\scriptsize] {Nx} ([yshift=0.3em]enc10.north east);
\draw [->] (enc1.east) -- (enc2.west);
\draw [->] (enc2.east) -- (enc3.west);
\draw [->] (enc3.east) -- (enc4.west);
\draw [->] (enc4.east) -- (enc5.west);
\draw [->] (enc5.east) -- (enc6.west);
\draw [->] (enc8.east) -- (enc9.west);
\draw [->] (enc9.east) -- (enc10.west);
\node[node2,anchor=north] (dec1) at ([yshift=-2em]enc1.south) {};
\node[node2,anchor=north] (dec2) at ([yshift=-2em]enc2.south) {};
\node[node2,anchor=north] (dec3) at ([yshift=-2em]enc3.south) {};
\node[node2,anchor=north] (dec4) at ([yshift=-2em]enc4.south) {};
\node[node2,anchor=north] (dec5) at ([yshift=-2em]enc5.south) {};
\node[node2,anchor=north] (dec6) at ([yshift=-2em]enc6.south) {};
\node[font=\scriptsize,rotate=270] (tgt) at ([xshift = -\base]dec1.west) {tgt};
\node[font=\scriptsize,rotate=270] (out) at ([xshift = \base]dec6.east) {out};
\draw [->] ([xshift=-0.75em]dec1.west) -- (dec1.west);
\draw [->] (dec6.east) -- ([xshift=0.75em]dec6.east);
\draw [decorate,decoration={brace,mirror}] ([yshift=-0.3em]dec1.south west) to node [auto,anchor=north,font=\scriptsize] {$6\times$} ([yshift=-0.3em]dec6.south east);
\draw [->] (dec1.east) -- (dec2.west);
\draw [->] (dec2.east) -- (dec3.west);
\draw [->] (dec3.east) -- (dec4.west);
\draw [->] (dec4.east) -- (dec5.west);
\draw [->] (dec5.east) -- (dec6.west);
\node[node] (enc_legend) at ([xshift = 2\base]enc10.east) {};
\node[node2,anchor=north] (dec_legend) at ([yshift = -\base]enc_legend.south) {};
\node[font=\scriptsize,anchor=west] (line1) at (enc_legend.east) {: encoder layer};
\node[font=\scriptsize,anchor=west] (line2) at (dec_legend.east) {: decoder layer};
%\node[node] (dec1) at ([xshift=4em]enc1.east) {Decoder};
%\node[node2] (enc2) at ([xshift=4em]dec1.east) {Encoder};
%\node[node] (dec2) at ([xshift=4em]enc2.east) {Decoder};
\coordinate (c1) at ([xshift=1em]enc10.east);
\coordinate (c2) at ([yshift=-1.6em]c1.south);
\draw [->,rounded corners] (enc10.east) -- (c1) -- (c2)--([yshift=1em]dec1.north) -- (dec1.north);
\draw [->,rounded corners] (enc10.east) -- (c1) -- (c2)--([yshift=1em]dec2.north) -- (dec2.north);
\draw [->,rounded corners] (enc10.east) -- (c1) -- (c2)--([yshift=1em]dec3.north) -- (dec3.north);
\draw [->,rounded corners] (enc10.east) -- (c1) -- (c2)--([yshift=1em]dec4.north) -- (dec4.north);
\draw [->,rounded corners] (enc10.east) -- (c1) -- (c2)--([yshift=1em]dec5.north) -- (dec5.north);
\draw [->,rounded corners] (enc10.east) -- (c1) -- (c2)--([yshift=1em]dec6.north) -- (dec6.north);
\end{tikzpicture}
\ No newline at end of file
\begin{tikzpicture}
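% Compares the model output distribution with a one-hot target and a
% label-smoothed target, together with the corresponding cross-entropy losses.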
\node[font=\scriptsize] (model) at (0,0) {Model Output:};
\node[anchor=north west,font=\scriptsize] (label_smooth) at ([yshift=-1.8em]model.south west) {Label Smoothing:};
\node[anchor=south west,font=\scriptsize] (one-hot) at ([yshift=2em]model.north west) {One-hot:};
\node [anchor=west,minimum width=1em,minimum height=0.2em,fill=ublue!80,inner sep=0pt] (model_label1) at ([xshift=2em,yshift=-0.5em]model.east) {};
\node [anchor=south,font=\scriptsize] (model_w1) at (model_label1.north) {$p_{1}$};
\node [anchor=south west,minimum width=1.2em,minimum height=0.1em,fill=ublue!80,inner sep=0pt] (model_label2) at (model_label1.south east) {};
\node [anchor=south,font=\scriptsize] (model_w2) at (model_label2.north) {$p_{2}$};
\node [anchor=south west,minimum width=1.2em,minimum height=0.7em,fill=ublue!80,inner sep=0pt] (model_label3) at (model_label2.south east) {};
\node [anchor=south,font=\scriptsize] (model_w3) at (model_label3.north) {{\color{red} $p_{3}$}};
\node [anchor=south west,minimum width=1.2em,minimum height=0.4em,fill=ublue!80,inner sep=0pt] (model_label4) at (model_label3.south east) {};
\node [anchor=south,font=\scriptsize] (model_w5) at (model_label4.north) {$p_{4}$};
\node [anchor=south west,minimum width=1.2em,minimum height=0.1em,fill=ublue!80,inner sep=0pt] (model_label5) at (model_label4.south east) {};
\node [anchor=south,font=\scriptsize] (model_w6) at (model_label5.north) {$p_{5}$};
\node [anchor=south west,minimum width=1.2em,minimum height=0.3em,fill=ublue!80,inner sep=0pt] (model_label6) at (model_label5.south east) {};
\node [anchor=south,font=\scriptsize] (model_w7) at (model_label6.north) {$p_{6}$};
\node [anchor=south west,minimum width=1.2em,minimum height=0.2em,fill=ublue!80,inner sep=0pt] (model_label7) at (model_label6.south east) {};
\node [anchor=south,font=\scriptsize] (model_w8) at (model_label7.north) {$p_{7}$};
%no label smooth
\node [anchor=west,minimum width=1.2em,minimum height=0em,inner sep=0pt,font=\scriptsize] (one_hot_label1) at ([xshift=2em,yshift=3em]model.east) {$0$};
\node [anchor=south west,minimum width=1.2em,minimum height=0em,inner sep=0pt,font=\scriptsize] (one_hot_label2) at (one_hot_label1.south east) {$0$};
\node [anchor=south west,minimum width=1.2em,minimum height=1.5em,fill=orange!50,inner sep=0pt] (one_hot_label3) at (one_hot_label2.south east) {};
\node [anchor=south,font=\scriptsize] (one_hot_w3) at (one_hot_label3.north) {{\color{red} $1$}};
\node [anchor=south west,minimum width=1.2em,minimum height=0em,inner sep=0pt,font=\scriptsize] (one_hot_label4) at (one_hot_label3.south east) {$0$};
\node [anchor=south west,minimum width=1.2em,minimum height=0.1em,inner sep=0pt,font=\scriptsize] (one_hot_label5) at (one_hot_label4.south east) {$0$};
\node [anchor=south west,minimum width=1.2em,minimum height=0.3em,inner sep=0pt,font=\scriptsize] (one_hot_label6) at (one_hot_label5.south east) {$0$};
\node [anchor=south west,minimum width=1em,minimum height=0.4em,inner sep=0pt,font=\scriptsize] (one_hot_label7) at (one_hot_label6.south east) {$0$};
%label smoothing
\node [anchor=west,minimum width=1em,minimum height=0.2em,fill=red!50,inner sep=0pt] (label1) at ([xshift=2em,yshift=-3.2em]model.east) {};
\node [anchor=south,font=\scriptsize] (w1) at (label1.north) {$0.1$};
\node [anchor=south west,minimum width=1.2em,minimum height=0.2em,fill=red!50,inner sep=0pt] (label2) at (label1.south east) {};
\node [anchor=south,font=\scriptsize] (w2) at (label2.north) {$0.1$};
\node [anchor=south west,minimum width=1.2em,minimum height=0.8em,fill=red!50,inner sep=0pt] (label3) at (label2.south east) {};
\node [anchor=south,font=\scriptsize] (w3) at (label3.north) {{\color{red} $0.4$}};
\node [anchor=south west,minimum width=1.2em,minimum height=0.2em,fill=red!50,inner sep=0pt] (label4) at (label3.south east) {};
\node [anchor=south,font=\scriptsize] (w5) at (label4.north) {$0.1$};
\node [anchor=south west,minimum width=1.2em,minimum height=0.2em,fill=red!50,inner sep=0pt] (label5) at (label4.south east) {};
\node [anchor=south,font=\scriptsize] (w6) at (label5.north) {$0.1$};
\node [anchor=south west,minimum width=1.2em,minimum height=0.2em,fill=red!50,inner sep=0pt] (label6) at (label5.south east) {};
\node [anchor=south,font=\scriptsize] (w7) at (label6.north) {$0.1$};
\node [anchor=south west,minimum width=1.2em,minimum height=0.2em,fill=red!50,inner sep=0pt] (label7) at (label6.south east) {};
\node [anchor=south,font=\scriptsize] (w8) at (label7.north) {$0.1$};
\node[font=\scriptsize] (line1) at ([xshift=9em,yshift=-1.5em]model_label7.east) {$loss =-0.3 \log p_{3}-\sum_{i=1}^{7} 0.1 \log p_{i}$};
\node[font=\scriptsize] (line2) at ([xshift=5.9em,yshift=3.5em]model_label7.east) {$loss =-\log p_{3}$};
\begin{pgfonlayer}{background}
\node [rectangle,inner sep=0.1em,rounded corners=1pt,very thick,dotted,draw=red] [fit = (one_hot_label1) (one_hot_w3) (one_hot_label7) (model_label1) (model_label7)] (box1) {};
\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=purple!10,drop shadow,draw=purple] [fit = (line2)] (box3) {};
\draw [->,dotted,very thick,red] ([yshift=-1em]box1.east) .. controls +(east:1) and +(west:1) .. (box3.west);
\node [rectangle,inner sep=0.1em,rounded corners=1pt,very thick,dotted,draw=ugreen] [fit = (label1) (label7) (model_label1) (model_label7) (model_w3)] (box2) {};
\node [rectangle,inner sep=0.2em,rounded corners=1pt,fill=green!10,drop shadow,draw=ugreen] [fit = (line1)] (box4) {};
\draw [->,dotted,very thick,ugreen] ([yshift=1em]box2.east) .. controls +(east:1) and +(west:1) .. (box4.west);
\end{pgfonlayer}
\end{tikzpicture}
\ No newline at end of file
\begin{tikzpicture}
\begin{scope}\small
\node [anchor=north] (n1) at (0, 0) {Start};
\draw [->,very thick] ([xshift=0em,yshift=1em]n1.north west)--([xshift=20em,yshift=1em]n1.north west);
\draw [->,very thick] ([xshift=0em,yshift=1em]n1.north west)--([xshift=0em,yshift=10em]n1.north west);
\node [anchor=west] (n2) at ([xshift=15em,yshift=0em]n1.east) {Mature};
\node [anchor=south] (n3) at ([xshift=-4em,yshift=8em]n1.north) {Translation quality};
\draw [-,very thick,draw=ublue] ([xshift=0.7em,yshift=3em]n1.north) .. controls +(north:7em) and +(south:0em) .. ([xshift=17em,yshift=9em]n1.north);
{\footnotesize
\node [anchor=south] (n4) at ([xshift=7em,yshift=5em]n1.north) {Rapid performance-climbing stage};
\node [anchor=west] (n5) at ([xshift=0em,yshift=-2em]n4.west) {Data plays a very visible role};
}
}
\draw [-,thick] ([xshift=2.3em,yshift=-2em]n4.east)--([xshift=2.3em,yshift=2em]n4.north east);
\end{scope}
\end{tikzpicture}
\ No newline at end of file
\begin{tikzpicture}
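% NMT inference as two interacting modules: a prediction module
% (RNN/Transformer) that produces the next-word distribution, and a
% search module that explores translation hypotheses.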
%left
\node [anchor=west,draw=black,very thick,minimum width=6em,minimum height=3.5em,fill=blue!15,align=center,text=black] (part1) at (0,0) {\scriptsize{Prediction module} \\ \tiny{(RNN/Transformer)}};
\node [anchor=south] (text) at ([xshift=0.5em,yshift=-3.5em]part1.south) {\scriptsize{Source sentence (encoded)}};
\node [anchor=east,draw=black,very thick,minimum width=6em,minimum height=3.5em,fill=blue!15,align=center,text=black] (part2) at ([xshift=10em]part1.east) {\scriptsize{Search module}};
\node [anchor=south] (text1) at ([xshift=0.5em,yshift=2.2em]part1.north) {\scriptsize{Target words generated so far}};
\node [anchor=south] (text2) at ([xshift=0.5em,yshift=2.2em]part2.north) {\scriptsize{Word distribution at the current position}};
\draw [->,draw=black, thick] ([yshift=2em]part1.north) -- ([yshift=0.1em]part1.north);
\draw [->,draw=black, thick] ([yshift=-2em]part1.south) -- ([yshift=-0.1em]part1.south);
\draw [->,draw=black, thick] ([yshift=0em]part2.north) -- ([yshift=2em]part2.north);
\draw [->,draw=black,very thick] ([yshift=-0.7em]part1.east) -- ([xshift=-0.05em,yshift=-0.7em]part2.west);
\draw [->,draw=black,very thick,dashed] ([yshift=0.7em]part2.west) -- ([xshift=0.05em,yshift=0.7em]part1.east);
\end{tikzpicture}
\begin{tikzpicture}
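% Data preparation pipeline: raw data -> tokenization -> symbol
% normalization -> data filtering -> training data.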
\tikzstyle{op} =[rounded corners=1pt,thick,minimum width=4.0em,minimum height=3.0em,draw,fill=red!5!white,font=\scriptsize]
\tikzstyle{data} = [cylinder,draw=black,thick,minimum height=3em,minimum width=3em,shape border rotate=0,cylinder uses custom fill, cylinder body fill=blue!10,cylinder end fill=blue!5,anchor = east,font=\scriptsize]
\node[op] (node1) at (0,0) {Tokenization};
\node[op,anchor = west] (node2) at ([xshift = 2.0em]node1.east) {Symbol normalization};
\node[op,anchor = west] (node3) at ([xshift = 2.0em]node2.east) {Data filtering};
\node [data,anchor = east] (data1) at ([xshift = -2.0em]node1.west){Raw data};
\node [data,anchor = west] (data2) at ([xshift = 2.0em]node3.east){Training data};
\draw[-stealth,line width=.05cm] ([xshift=0.25em]data1.east) -- ([xshift=-0.25em]node1.west);
\draw[-stealth,line width=.05cm] ([xshift=0.25em]node1.east) -- ([xshift=-0.25em]node2.west);
\draw[-stealth,line width=.05cm] ([xshift=0.25em]node2.east) -- ([xshift=-0.25em]node3.west);
\draw[-stealth,line width=.05cm] ([xshift=0.25em]node3.east) -- ([xshift=-0.25em]data2.west);
\end{tikzpicture}
\ No newline at end of file
\begin{tikzpicture}
\def\neuronsep{1}
\tikzstyle{neuronnode} = [minimum size=1.0em,circle,draw,thick,ublue,inner sep=1pt, fill=white,align=center]
%standard
\foreach \n in {1,...,4}{
\node [neuronnode] (neuron0\n) at (0,\n * \neuronsep) {};
}
\foreach \n in {1,...,4}{
\node [neuronnode] (neuron1\n) at (1.2\neuronsep ,\n * \neuronsep) {};
}
\foreach \n in {1,...,4}{
\node [neuronnode] (neuron2\n) at (2.4*\neuronsep ,\n * \neuronsep) {};
}
\node [neuronnode] (neuron3) at (3.6*\neuronsep ,2.5 * \neuronsep) {};
\foreach \n in {1,...,4}{
\foreach \m in {1,...,4}{
\draw [->] (neuron0\n.east) -- (neuron1\m.west);
}
}
\foreach \n in {1,...,4}{
\foreach \m in {1,...,4}{
\draw [->] (neuron1\n.east) -- (neuron2\m.west);
}
}
\foreach \n in {1,...,4}{
\draw [->] (neuron2\n.east) -- (neuron3.west);
}
%drop
%layer1
\foreach \n in {1,3,4}{
\node [neuronnode] (neuron4\n) at (5*\neuronsep,\n * \neuronsep) {};
}
\node [neuronnode,dashed] (neuron42) at (5*\neuronsep,2 * \neuronsep) {};
%layer1
\foreach \n in {1,2,4}{
\node [neuronnode] (neuron5\n) at (6.2*\neuronsep ,\n * \neuronsep) {};
}
\node [neuronnode,dashed] (neuron53) at (6.2*\neuronsep,3 * \neuronsep) {};
%layer3
\foreach \n in {1,4}{
\node [neuronnode] (neuron6\n) at (7.4*\neuronsep ,\n * \neuronsep) {};
}
\node [neuronnode,dashed] (neuron62) at (7.4*\neuronsep ,2 * \neuronsep) {};
\node [neuronnode,dashed] (neuron63) at (7.4*\neuronsep ,3 * \neuronsep) {};
%layer4
\node [neuronnode] (neuron7) at (8.6*\neuronsep ,2.5 * \neuronsep) {};
\foreach \n in {1,3,4}{
\foreach \m in {1,2,4}{
\draw [->] (neuron4\n.east) -- (neuron5\m.west);
}
}
\foreach \n in {1,2,4}{
\foreach \m in {1,4}{
\draw [->] (neuron5\n.east) -- (neuron6\m.west);
}
}
\foreach \n in {1,4}{
\draw [->] (neuron6\n.east) -- (neuron7.west);
}
\end{tikzpicture}
\ No newline at end of file
\begin{tikzpicture}
\tikzstyle{sublayernode} = [rectangle,draw,thick,inner sep=3pt,rounded corners=2pt,align=center,minimum height=1.5em,minimum width=1.5em,font=\scriptsize]
\tikzstyle{inputnode} = [rectangle,inner sep=3pt,align=center,font=\scriptsize]
%\tikzstyle{circlenode} = [circle,draw,thick,minimum size=0.3\base,font=\small,inner sep=0pt]
\tikzstyle{mnode} = [circle,thick,minimum size=0.7em,font=\small,inner sep=0pt,draw]
\node[anchor=south west,inputnode] (input) at (0,0) {$x_{i}^{l}$};
\node[anchor=west,sublayernode,fill=red!10] (ln) at ([xshift=1.2em]input.east) {LN};
\node[anchor=west,sublayernode,fill=green!10] (fn) at ([xshift=1.2em]ln.east) {F};
\node[anchor=west,mnode] (m) at ([xshift=2em]fn.east) {};
\node[] (res) at ([xshift=2.4em]fn.east) {+};
\node[anchor=west,sublayernode,fill=red!10] (ln1) at ([xshift=2em]m.east) {LN};
\node[anchor=west,sublayernode,fill=green!10] (fn1) at ([xshift=1.2em]ln1.east) {F};
\node[anchor=west,mnode] (m1) at ([xshift=2em]fn1.east) {};
\node[] (res1) at ([xshift=2.4em]fn1.east) {+};
\node[anchor=west,inputnode] (output) at ([xshift=1.2em]res1.east) {$x_{i}^{l+1}$};
\node[anchor=west,inputnode] (legend1) at (6em,-1em) {(a) Standard Transformer network};
%\coordinate (mend) at ([xshift=1em]m.west);
\draw[-latex',thick] (input)--(ln);
\draw[-latex',thick] (ln)--(fn);
\draw[-latex',thick] (fn)--(m);
%\draw[-,thick] (mend)--(res);
\coordinate (h) at ([xshift=-0.7em]ln.west);
\draw[-latex',thick,rounded corners] (h) -- ([yshift=1.35em]h.north) -- ([yshift=1em]m.north) -- (m.north);
%\coordinate (mend1) at ([xshift=1.0\hseg]m1.west);
\draw[-latex',thick] (m)--(ln1);
\draw[-latex',thick] (ln1)--(fn1);
\draw[-latex',thick] (fn1)--(m1);
%\draw[-,thick] (mend1)--(res1);
\draw[-latex',thick] (m1)--(output);
\coordinate (h1) at ([xshift=-0.7em]ln1.west);
\draw[-latex',thick,rounded corners] (h1) -- ([yshift=1.35em]h1.north) -- ([yshift=1em]m1.north) -- (m1.north);
%--------------------------------------------------------
\node[anchor=south west,inputnode] (input_2) at (0,-6em) {$x_{i}^{l}$};
\node[anchor=west,sublayernode,fill=red!10] (ln_2) at ([xshift=1.2em]input_2.east) {LN};
\node[anchor=west,sublayernode,fill=green!10] (fn_2) at ([xshift=1.2em]ln_2.east) {F};
\node[anchor=west,mnode] (m_2) at ([xshift=2em]fn_2.east) {};
\node[] (res_2) at ([xshift=2.4em]fn_2.east) {+};
\node[anchor=west,sublayernode,fill=red!10] (ln1_2) at ([xshift=2em]m_2.east) {LN};
\node[anchor=west,sublayernode,fill=green!10] (fn1_2) at ([xshift=1.2em]ln1_2.east) {F};
\node[anchor=west,mnode] (m1_2) at ([xshift=2em]fn1_2.east) {};
\node[] (res1_2) at ([xshift=2.4em]fn1_2.east) {+};
\node[anchor=west,inputnode] (output_2) at ([xshift=1.2em]res1_2.east) {$x_{i}^{l+1}$};
\node[anchor=west,inputnode] (legend2) at (2.5em,-7.5em) {(b) Transformer network with layer dropout};
\node[anchor=south west,inputnode,red,font=\tiny] (mlable) at ([xshift=-2.2em,yshift=-0.6em]m_2.south) {M=1};
\node[anchor=south west,inputnode,red,font=\tiny] (mlable1) at ([xshift=-2.2em,yshift=-0.6em]m1_2.south) {M=0};
\coordinate (start_1) at ([xshift=-1.3em]m_2.west);
\coordinate (end_1) at ([xshift=-0.5em]m_2.west);
%\node[red,font=\scriptsize] (dot1) at (start_1) {$\cdot$};
\draw[-latex',thick] (input_2)--(ln_2);
\draw[-latex',thick] (ln_2)--(fn_2);
\draw[-latex',thick] (fn_2)--(start_1);
\draw[-,thick,red] (start_1)--(end_1);
\draw[-,thick] (end_1)--(m_2);
%\draw[-,thick] (mend)--(res);
\coordinate (h_2) at ([xshift=-0.7em]ln_2.west);
\draw[-latex',thick,rounded corners] (h_2) -- ([yshift=1.35em]h_2.north) -- ([yshift=1em]m_2.north) -- (m_2.north);
%\coordinate (mend1) at ([xshift=1.0\hseg]m1.west);
\coordinate (start_2) at ([xshift=-1.3em]m1_2.west);
\coordinate (end_2) at ([xshift=-0.5em]m1_2.west);
\draw[-latex',thick] (m_2)--(ln1_2);
\draw[-latex',thick] (ln1_2)--(fn1_2);
\draw[-latex',thick] (fn1_2)--(start_2);
\draw[-,thick,red] (start_2)--([yshift=0.3em]end_2);
\draw[-,thick] (end_2)--(m1_2);
%\draw[-,thick] (mend1)--(res1);
\draw[-latex',thick] (m1_2)--(output_2);
\coordinate (h1_2) at ([xshift=-0.7em]ln1_2.west);
\draw[-latex',thick,rounded corners] (h1_2) -- ([yshift=1.35em]h1_2.north) -- ([yshift=1em]m1_2.north) -- (m1_2.north);
\end{tikzpicture}
\ No newline at end of file
\begin{tikzpicture}
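% Two ways of building batches from the same four sentences:
% random order (left) vs. sorted by length (right).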
\tikzstyle{node} = [minimum height=1.0em,draw=teal,fill=teal!10]
\node[node,minimum width=2.0em] (sent1) at (0,0) {};
\node[node,minimum width=5.0em,anchor=north west] (sent2) at (sent1.south west) {};
\node[node,minimum width=1.0em,anchor=north west] (sent3) at (sent2.south west) {};
\node[node,minimum width=3.0em,anchor=north west] (sent4) at (sent3.south west) {};
\node[node,minimum width=4.0em] (sent5) at (12em,0) {};
\node[node,minimum width=4.5em,anchor=north west] (sent6) at (sent5.south west) {};
\node[node,minimum width=4.5em,anchor=north west] (sent7) at (sent6.south west) {};
\node[node,minimum width=5em,anchor=north west] (sent8) at (sent7.south west) {};
\node[font=\scriptsize,anchor=east] (line1) at (sent1.west) {sent1};
\node[font=\scriptsize,anchor=east] (line2) at (sent2.west) {sent2};
\node[font=\scriptsize,anchor=east] (line3) at (sent3.west) {sent3};
\node[font=\scriptsize,anchor=east] (line4) at (sent4.west) {sent4};
\node[font=\scriptsize,anchor=east] (line5) at (sent5.west) {sent1};
\node[font=\scriptsize,anchor=east] (line6) at (sent6.west) {sent2};
\node[font=\scriptsize,anchor=east] (line7) at (sent7.west) {sent3};
\node[font=\scriptsize,anchor=east] (line8) at (sent8.west) {sent4};
\begin{pgfonlayer}{background}
\node [rectangle,inner sep=-0.0em,draw] [fit = (sent1) (sent2) (sent3) (sent4)] (box1) {};
\node [rectangle,inner sep=-0.0em,draw] [fit = (sent5) (sent6) (sent7) (sent8)] (box2) {};
\end{pgfonlayer}
\node[font=\scriptsize] (node1) at ([yshift=-3em]sent2.south) {Random generation};
\node[font=\scriptsize] (node2) at ([yshift=-1em]sent8.south) {Sorted generation};
\end{tikzpicture}
\ No newline at end of file
\begin{tikzpicture}
\begin{scope}
\node [anchor=center] (node1) at (-2.3,0) {\small{$x,y$: bilingual data}};
\node [anchor=center] (node2) at (-2.1,-0.5) {\small{$z$: monolingual data}};
\node [anchor=center] (node1-1) at (0,0) {\small{$y'$}};
\node [anchor=center] (node3-1) at ([xshift=5.5em,yshift=-0.1em]node1-1.east) {\small{$z'$}};
\node[anchor=south,draw,rounded corners,minimum height=1.5em,minimum width=4em,fill=blue!20](node1-2) at ([yshift=-3em]node1-1.south) {\small{softmax}};
\node[anchor=south,draw,rounded corners,minimum height=1.5em,minimum width=4em,fill=blue!20](node3-2) at ([yshift=-3em]node3-1.south) {\small{softmax}};
\node[anchor=south,draw,rounded corners,minimum height=2.2em,minimum width=4em,fill=red!20](node1-3) at ([yshift=-4.0em]node1-2.south) {\small{Decoder}};
\node[anchor=south,draw,rounded corners,minimum height=2.2em,minimum width=4em,fill=yellow!20](node3-3) at ([yshift=-4.0em]node3-2.south) {\small{LM}};
\node[anchor=south](node1-4) at ([xshift=-0.6em,yshift=-3em]node1-3.south) {\small{$y$}};
\node[anchor=south](node3-41) at ([xshift=-0.6em,yshift=-3em]node3-3.south) {\small{$y$}};
\node[anchor=south](node3-42) at ([xshift=0.6em,yshift=-2.9em]node3-3.south) {\small{$z$}};
\node[anchor=west](node2-2) at ([xshift=-4.9em]node1-4.west) {\small{$x$}};
\node[anchor=north,draw,rounded corners,minimum height=2.2em,minimum width=4em,fill=red!20](node2-1) at ([yshift=4em]node2-2.north) {\small{Encoder}};
\node [rectangle,rounded corners,draw=red,line width=0.2mm,densely dashed,inner sep=0.4em] [fit = (node3-2) (node3-3)] (inputshadow) {};
\draw [->](node1-4.north)--([xshift=-0.6em]node1-3.south);
\draw [->](node1-3.north)--(node1-2);
\draw [->](node1-2.north)--(node1-1);
\draw [->](node2-2.north)--(node2-1);
\draw[->](node2-1.east)--(node1-3.west);
\draw [->](node3-41.north)--([xshift=-0.6em]node3-3.south);
\draw [->](node3-42.north)--([xshift=0.6em]node3-3.south);
\draw [->]([xshift=0.6em]node3-3.north)--([xshift=0.6em]node3-2.south);
\draw [->](node3-2.north)--(node3-1);
\draw[->]([xshift=-0.6em]node3-3.north)--([xshift=-0.6em,yshift=0.6em]node3-3.north)--([xshift=-3em,yshift=0.6em]node3-3.north)--([xshift=-3em,yshift=-3em]node3-3.north)--([xshift=-5.6em,yshift=-3em]node3-3.north)--([xshift=0.6em]node1-3.south);
%\draw[->](node2-1.north)--([yshift=1em]node2-1.north)--([xshift=2.5em,yshift=1em]node2-1.north)--([xshift=2.5em,yshift=-0.4em]node2-1.north)--(node1-3.west);
\end{scope}
\end{tikzpicture}
\ No newline at end of file
\begin{tikzpicture}
\tikzstyle{op} =[rounded corners=1pt,thick,minimum width=4.0em,minimum height=3.0em,draw,fill=yellow!5!white,font=\scriptsize,drop shadow]
\node [op] {
\begin{tabular}{l}
\rule{0pt}{13pt} 这里 \ \ 来举 \ \ 几个 \ \ 例子 。\\
\rule{0pt}{13pt}\ \ 必需 \ \ \ \ 装扮成 \ \ 男人 \ \ \\
\rule{0pt}{13pt} 语言 \ \ 本身\ \ 不会 \ \ 发生 \ \ 那些 \ \ 我们 \ \ 跟不上 \ \ \ \ 变化 。 \\
\rule{0pt}{13pt}\ \ \ \ \ \ \ \ 缠 着 \ \ 一条 \ \ 运动衫 。 \\
\rule{0pt}{13pt}\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ 一段时间 内 \ \ 偿还 \ \ \ \ \ \ \\
\rule{0pt}{13pt} Tom \ \ \ \ 非常感谢 \ \ \ \ \ \ 感觉 \ \ 真的 \ \ \ \ 棒极了 \ \ \\
\rule{0pt}{13pt} 这是 \ \ 非常 \ \ 非常 \ \ 重要 \ \ \\
\rule{0pt}{13pt} ... \\
\rule{0pt}{13pt} So \ \ let \ \ me \ \ give \ \ you \ \ a \ \ few \ \ examples \ \ here . \\
\rule{0pt}{13pt} She \ \ had \ \ to \ \ impersonate \ \ a \ \ man . \\
\rule{0pt}{13pt} The \ \ language \ \ is \ \ not \ \ going \ \ to \ \ change \ \ so \ \ fast \ \ that \ \ we \ \ can ’t \ \ keep \ \ up \ \ . \\
\rule{0pt}{13pt} With \ \ a \ \ sweatshirt \ \ there \ \ tied \ \ around \ \ his \ \ waist \ \ . \\
\rule{0pt}{13pt} You \ \ give \ \ them \ \ more \ \ money \ \ ; \ \ they \ \ repay \ \ you \ \ that \ \ over \ \ a \ \ time \ \ . \\
\rule{0pt}{13pt} Tom \ \ , \ \ thank \ \ you \ \ so \ \ much \ \ . \ \ It ’s \ \ been \ \ really \ \ , \ \ really \ \ great \ \ . \\
\rule{0pt}{13pt} It ’s \ \ very \ \ important \ \ . \\
\rule{0pt}{13pt} ... \\
\end{tabular}
};
\end{tikzpicture}
\ No newline at end of file
\begin{tikzpicture}
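% Applying a learned BPE vocabulary: "highest" -> "high" + "est<e>",
% "mountain" -> "moun" + "tain<e>"; "this" cannot be segmented with
% the given vocabulary and becomes "<unk>".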
\node[rounded corners=3pt,minimum width=1.0em,minimum height=2.0em,font=\scriptsize,fill=green!5,drop shadow,thick,draw](top) at (0,0) {
\begin{tabular}{ll}
\multicolumn{2}{c}{BPE vocabulary:} \\
errrr$<$e$>$ & tain$<$e$>$ \\
moun & est$<$e$>$ \\
high & the$<$e$>$ \\
a$<$e$>$ &
\end{tabular}
};
\node[font=\scriptsize,anchor=west] (node1) at ([xshift=0.5em,yshift=1em]top.east) {Original sequence:};
\node[font=\scriptsize,anchor=west] (this) at (node1.east) {"this$<$e$>$" ,};
\node[font=\scriptsize,anchor=west] (highest) at (this.east) {"highest$<$e$>$",};
\node[font=\scriptsize,anchor=west] (mountain) at (highest.east) { "mountain$<$e$>$"};
\node[font=\scriptsize,anchor=west] (node2) at ([yshift=-1.5em]node1.south west) {BPE segmentation:};
\node[font=\scriptsize,anchor=west] (unk) at (node2.east) {"$<$unk$>$",};
\node[font=\scriptsize,anchor=west] (high) at (unk.east) {"high",};
\node[font=\scriptsize,anchor=west] (est) at (high.east) {"est$<$e$>$",};
\node[font=\scriptsize,anchor=west] (moun) at (est.east) {"moun",};
\node[font=\scriptsize,anchor=west] (tain) at (moun.east) {"tain$<$e$>$"};
%\draw[->,thick](node1.south) -- ([xshift=-1.0em]node2.north);
\draw[->,thick]([xshift=-0.2em]this.south) -- (unk);
\draw[->,thick](highest.south) -- (high);
\draw[->,thick](highest.south) -- (est);
\draw[->,thick](mountain.south) -- (moun);
\draw[->,thick](mountain.south) -- (tain);
\end{tikzpicture}
\ No newline at end of file
% set table width
\newcommand{\PreserveBackslash}[1]{\let\temp=\\#1\let\\=\temp}
\newcolumntype{C}[1]{>{\PreserveBackslash\centering}p{#1}}
% used for heatmap
\newcommand*{\MinNumber}{0}%
\newcommand*{\MaxNumber}{1}%
\newcommand{\ApplyGradient}[1]{%
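% linearly map the cell value #1 to a percentage between \MinNumber and
% \MaxNumber, then shade an empty \colorbox from myblack (0) to white (100);
% the numeric value itself is never printed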
\pgfmathsetmacro{\PercentColor}{100.0*(#1-\MinNumber)/(\MaxNumber-\MinNumber)}
\hspace{-0.33em}\colorbox{white!\PercentColor!myblack}{}
}
\newcolumntype{Q}{>{\collectcell\ApplyGradient}c<{\endcollectcell}}
\begin{center}
\renewcommand{\arraystretch}{0}
\setlength{\tabcolsep}{5mm}
\setlength{\fboxsep}{2.2mm} % box size
\begin{tabular}{C{.20\textwidth}C{.20\textwidth}C{.20\textwidth}C{.20\textwidth}}
\setlength{\tabcolsep}{0pt}
\subfigure [\footnotesize{Self-Attention}] {
\begin{tabular}{cc}
\setlength{\tabcolsep}{0pt}
~
&
\begin{tikzpicture}
\begin{scope}
\node [inner sep=1.5pt] (w1) at (0,0) {\small{$1$} };
\foreach \x/\y/\z in {2/1/$2$, 3/2/$3$, 4/3/$4$, 5/4/$5$, 6/5/$6$}
{
\node [inner sep=1.5pt,anchor=south west] (w\x) at ([xshift=1.15em]w\y.south west) {\small{\z} };
}
\end{scope}
\end{tikzpicture}
\\
\renewcommand\arraystretch{1}
\begin{tabular}{c}
\setlength{\tabcolsep}{0pt}
\small{$1\ \ $} \\
\small{$2\ $} \\
\small{$3\ $} \\
\small{$4\ $} \\
\small{$5\ $} \\
\small{$6\ $} \\
\end{tabular}
&
%\setlength{\tabcolsep}{0pt}
\begin{tabular}{*{6}{Q}}
0.0000 & 0.5429 & 0.5138 & 0.4650 & 0.5005 & 0.5531 \\
0.5429 & 0.0000 & 0.0606 & 0.0630 & 0.0703 & 0.0332 \\
0.5138 & 0.0606 & 0.0000 & 0.0671 & 0.0472 & 0.0296 \\
0.4650 & 0.0630 & 0.0671 & 0.0000 & 0.0176 & 0.0552 \\
0.5005 & 0.0703 & 0.0472 & 0.0176 & 0.0000 & 0.0389 \\
0.5531 & 0.0332 & 0.0296 & 0.0552 & 0.0389 & 0.0000 \\
\end{tabular}
\end{tabular}
}
&
\subfigure [\footnotesize{Enc-Dec Attention}] {
\setlength{\tabcolsep}{0pt}
\begin{tabular}{cc}
\setlength{\tabcolsep}{0pt}
~
&
\begin{tikzpicture}
\begin{scope}
\node [inner sep=1.5pt] (w1) at (0,0) {\small{$1$} };
\foreach \x/\y/\z in {2/1/$2$, 3/2/$3$, 4/3/$4$, 5/4/$5$, 6/5/$6$}
{
\node [inner sep=1.5pt,anchor=south west] (w\x) at ([xshift=1.15em]w\y.south west) {\small{\z} };
}
\end{scope}
\end{tikzpicture}
\\
\renewcommand\arraystretch{1}
\begin{tabular}{c}
\setlength{\tabcolsep}{0pt}
\small{$1\ \ $} \\
\small{$2\ $} \\
\small{$3\ $} \\
\small{$4\ $} \\
\small{$5\ $} \\
\small{$6\ $} \\
\end{tabular}
&
%\setlength{\tabcolsep}{0pt}
\begin{tabular}{*{6}{Q}}
0.0000 & 0.0175 & 0.2239 & 0.3933 & 0.7986 & 0.3603 \\
0.0175 & 0.0000 & 0.1442 & 0.3029 & 0.7295 & 0.3324 \\
0.2239 & 0.1442 & 0.0000 & 0.0971 & 0.6270 & 0.4163 \\
0.3933 & 0.3029 & 0.0971 & 0.0000 & 0.2385 & 0.2022 \\
0.7986 & 0.7295 & 0.6270 & 0.2385 & 0.0000 & 0.0658 \\
0.3603 & 0.3324 & 0.4163 & 0.2022 & 0.0658 & 0.0000 \\
\end{tabular}
\end{tabular}
}
\end{tabular}
\end{center}
\definecolor{ublue}{rgb}{0.152,0.250,0.545}
\begin{tikzpicture}
\begin{axis}
[
width=5cm, height=3.5cm,
xtick={15,17,19,21,23,25},
ytick={6.0,6.5,7.0},
xlabel={\scriptsize Epoch},
ylabel={},
ylabel style={},
x tick label style={},
y tick label style={},
tick align=inside,
legend style={anchor=north,xshift=1.7cm,yshift=1cm,legend columns =-1},
ymin=5.7,
ymax=7.3,
xmin=14.6,
xmax=25.4,
extra y ticks={6.0,6.5,7.0},
extra y tick labels={3.7,3.8,3.9},
extra y tick style={ticklabel pos=right}]
\addplot [sharp plot,very thick,red!60,mark=diamond*] coordinates{(15,6.75) (16,6.73) (17,6.70) (18,6.67) (19,6.64) (20,6.61) (21,6.59) (22,6.58) (23,6.57) (24,6.58) (25,6.59)};
\addplot [sharp plot,very thick,purple!60,mark=triangle*] coordinates{(15,6.70) (16,6.4) (17,6.20) (18,6.30) (19,6.20) (20,6.10) (21,6.15) (22,6.10) (23,6.15) (24,6.16) (25,6.17)};
\legend{\scriptsize {Training set},\scriptsize{Validation set}}
\end{axis}
\begin{axis}
[ xshift=6.6cm,
width=5cm, height=3.5cm,
xtick={15,17,19,21,23,25},
ytick={5.0,5.5,6.0},
xlabel={\scriptsize Epoch},
ylabel={},
ylabel style={},
x tick label style={},
y tick label style={},
tick align=inside,
ymin=4.7,
ymax=6.3,
xmin=14.6,
xmax=25.4,
extra y ticks={5.0,5.5,6.0},
extra y tick labels={3.5,3.6,3.7},
extra y tick style={ticklabel pos=right}]
\addplot [sharp plot,very thick,red!60,mark=diamond*] coordinates{(15,5.7) (16,5.65) (17,5.6) (18,5.55) (19,5.5) (20,5.45) (21,5.4) (22,5.38) (23,5.36) (24,5.34) (25,5.27)};
\addplot [sharp plot,very thick,purple!60,mark=triangle*] coordinates{(15,5.0) (16,4.9) (17,4.9) (18,5.05) (19,4.9) (20,5.0) (21,5.0) (22,5.1) (23,5.0) (24,5.15) (25,5.5)};
\end{axis}
\node [anchor=north,rotate=90] (n1) at (-1.3cm,1cm) {\scriptsize Training\ PPL};
\node [anchor=north,rotate=90] (n2) at (5.4cm,1cm) {\scriptsize Training\ PPL};
\node [anchor=north,rotate=90] (n3) at (4.2cm,1cm) {\scriptsize Validation\ PPL};
\node [anchor=north,rotate=90] (n4) at (10.7cm,1cm) {\scriptsize Validation\ PPL};
\end{tikzpicture}
%---------------------------------------------------------------------
\ No newline at end of file
\begin{center}
\centerline{Take English as an example:}
\vspace{0.5em}
\begin{tikzpicture}
\node[rounded corners=3pt,minimum width=10.0em,minimum height=2.0em,draw,thick,fill=green!5,font=\scriptsize,drop shadow,inner sep=0.5em] (left) at (0,0) {
\begin{tabular}{c}
Nouns\\
\rule{0pt}{12pt}cat, cats; watch, watches\\
\rule{0pt}{12pt}baby, babies; wife, wives\\
\end{tabular}
};
\node[rounded corners=3pt,minimum width=10.0em,minimum height=2.0em,draw,thick,fill=green!5,font=\scriptsize,drop shadow,inner sep=0.5em] (right) at ([xshift=8em]left.east) {
\begin{tabular}{c}
Verbs\\
\rule{0pt}{12pt}do, did, does, doing, done\\
\rule{0pt}{12pt}have, had, has, having\\
\end{tabular}
};
\end{tikzpicture}
\end{center}
\ No newline at end of file
\begin{tikzpicture}
\node[] (do) at (0,0) {{\red do}};
\node[anchor = west] (does) at ([xshift = 1em]do.east) {{\red do}es};
\node[anchor = west] (doing) at ([xshift = 0.7em]does.east) {{\red do}ing};
\node[anchor = north] (do_root) at ([yshift = -1em]does.south) {do};
\node[anchor = west] (new) at ([xshift = 2em]doing.east) {{\red new}};
\node[anchor = west] (newer) at ([xshift = 1em]new.east) {{\red new}er};
\node[anchor = west] (newest) at ([xshift = 0.7em]newer.east) {{\red new}est};
\node[anchor = north] (new_root) at ([yshift = -1em]newer.south) {new};
\draw [->] (do_root.north) .. controls +(north:0.4) and +(south:0.6) ..(do.south);
\draw [->] (do_root.north) -- (does.south);
\draw [->] (do_root.north) .. controls +(north:0.4) and +(south:0.6) ..(doing.south);
\draw [->] (new_root.north) .. controls +(north:0.4) and +(south:0.6) ..(new.south);
\draw [->] (new_root.north) -- (newer.south);
\draw [->] (new_root.north) .. controls +(north:0.4) and +(south:0.6) ..(newest.south);
\end{tikzpicture}
\ No newline at end of file
\begin{tikzpicture}
\begin{scope}
\node [anchor=north west] (pos1) at (0,0) {$\circ$};
\node [anchor= west] (pos2) at ([xshift=3.0em]pos1.east){$\circ$};
\node [anchor= west] (pos1-2) at ([xshift=1.0em,yshift=1.0em]pos1.east){I};
\draw[->,thick](pos1.east)--(pos2.west);
\node [anchor= west] (pos3) at ([xshift=3.0em]pos2.east){$\circ$};
\node [anchor= west] (pos2-2) at ([xshift=0.1em,yshift=1.0em]pos2.east){have};
\draw[->,thick](pos2.east)--(pos3.west);
\end{scope}
\begin{scope}[yshift=-4.0em]
\node [anchor=north west] (pos1) at (0,0) {$\circ$};
\node [anchor= west] (pos2) at ([xshift=3.0em]pos1.east){$\circ$};
\node [anchor= west] (pos1-2) at ([xshift=0.5em,yshift=1.0em]pos1.east){He};
\draw[->,thick](pos1.east)--(pos2.west);
\node [anchor= west] (pos3) at ([xshift=3.0em]pos2.east){$\circ$};
\node [anchor= west] (pos2-2) at ([xshift=0.1em,yshift=1.0em]pos2.east){has};
\draw[->,thick](pos2.east)--(pos3.west);
\node [anchor= west] (pos4) at ([xshift=5.0em]pos3.east){$\circ$};
\node [anchor= west] (pos5) at ([xshift=5.0em]pos4.east){$\circ$};
\node [anchor= west] (pos6) at ([xshift=5.0em]pos5.east){$\circ$};
\node [anchor= west] (word1) at ([xshift=2.0em,yshift=2.7em]pos4.east){I};
\node [anchor= west] (word2) at ([xshift=1.5em,yshift=-1.6em]pos4.east){He};
\node [anchor= west] (word3) at ([xshift=1.4em,yshift=-3em]pos4.east){She};
\node [anchor= west] (word4) at ([xshift=1.1em,yshift=2.8em]pos5.east){Have};
\node [anchor= west] (word5) at ([xshift=1.3em,yshift=-2.8em]pos5.east){Has};
\begin{pgfonlayer}{background}
{
% I
\draw [->,thick] (pos4.north) .. controls +(north:0.8) and +(north:0.8) .. (pos5.north);
% He
\draw [->,thick] (pos4.south) .. controls +(south:0.8) and +(south:0.8) .. (pos5.south);
% She
\draw [->,thick] (pos4.south) .. controls +(south:1.5) and +(south:1.5) .. (pos5.south);
% Have
\draw [->,thick] (pos5.north) .. controls +(north:0.8) and +(north:0.8) .. (pos6.north);
% Has
\draw [->,thick] (pos5.south) .. controls +(south:0.8) and +(south:0.8) .. (pos6.south);
}
\end{pgfonlayer}
\end{scope}
\begin{scope}[yshift=-8.0em]
\node [anchor=north west] (pos1) at (0,0) {$\circ$};
\node [anchor= west] (pos2) at ([xshift=3.0em]pos1.east){$\circ$};
\node [anchor= west] (pos1-2) at ([xshift=0.4em,yshift=1.0em]pos1.east){She};
\draw[->,thick](pos1.east)--(pos2.west);
\node [anchor= west] (pos3) at ([xshift=3.0em]pos2.east){$\circ$};
\node [anchor= west] (pos2-2) at ([xshift=0.1em,yshift=1.0em]pos2.east){has};
\draw[->,thick](pos2.east)--(pos3.west);
\end{scope}
\end{tikzpicture}
%---------------------------------------------------------------------
\begin{center}
\begin{tikzpicture}
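% Learning-rate schedules: Base48 decays smoothly (red), while SDT48 is
% periodically reset to the peak rate (blue; dashed verticals mark resets).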
\footnotesize{
\begin{axis}[
width=.40\textwidth,
height=.30\textwidth,
legend style={at={(0.60,0.08)}, anchor=south west},
xlabel={\footnotesize{Number of updates (10k)}},
ylabel={\footnotesize{Learning rate (\scriptsize{$10^{-3}$})}},
ylabel style={yshift=-1em},xlabel style={yshift=0.0em},
yticklabel style={/pgf/number format/precision=2,/pgf/number format/fixed zerofill},
ymin=0,ymax=2.2, ytick={0.5, 1, 1.5, 2},
xmin=0,xmax=5,xtick={1,2,3,4},
legend style={xshift=-8pt,yshift=-4pt, legend plot pos=right,font=\scriptsize,cells={anchor=west}}
]
\addplot[red,line width=1.25pt] coordinates {(0,0) (1.6,2) (1.8,1.888) (2,1.787) (2.5,1.606) (3,1.462) (3.5,1.3549) (4,1.266) (4.5,1.193) (5,1.131)};
\addlegendentry{\scriptsize Base48}
%\addplot[red,line width=1.25pt] coordinates {(0,0) (8000,0.002) (10000,0.00179) (12000,0.00163) (12950,0.001572)};
\addplot[blue,line width=1.25pt] coordinates {(0,0) (0.8,2) (0.9906,1.7983)};
%\addplot[red,line width=1.25pt] coordinates {(0,0) (8000,0.002) (9906,0.0017983)};
\addplot[blue,dashed,line width=1.25pt] coordinates {(0.9906,1.7983) (0.9906,2)};
\addplot[blue,line width=1.25pt] coordinates {(0.9906,2) (1.1906,1.79) (1.3906,1.63) (1.4856,1.572)};
\addplot[blue,dashed,line width=1.25pt] coordinates {(1.4856,1.572) (1.4856,2)};
\addplot[blue,line width=1.25pt] coordinates {(1.4856,2) (1.6856,1.79) (1.8856,1.63) (1.9806,1.572)};
\addplot[blue,dashed,line width=1.25pt] coordinates {(1.9806,1.572) (1.9806,2)};
\addplot[blue,line width=1.25pt] coordinates {(1.9806,2) (2.1806,1.79) (2.3806,1.63) (2.4756,1.572)};
\addplot[blue,dashed,line width=1.25pt] coordinates {(2.4756,1.572) (2.4756,2)};
\addplot[blue,line width=1.25pt] coordinates {(2.4756,2) (2.6756,1.79) (2.8756,1.63) (2.9706,1.572)};
\addplot[blue,dashed,line width=1.25pt] coordinates {(2.9706,1.572) (2.9706,2)};
\addplot[blue,line width=1.25pt] coordinates {(2.9706,2) (3.1706,1.79) (3.3706,1.63) (3.4656,1.572) (3.6706,1.4602) (3.7136,1.44)};
\addplot[blue,dashed,line width=1.25pt] coordinates {(3.7136,1.44) (3.7136,2)};
\addplot[blue,line width=1.25pt] coordinates {(3.7136,2) (3.9136,1.79) (4.1136,1.63) (4.2086,1.572) (4.4136,1.4602) (4.4566,1.44) (4.7000,1.3574) (5.0000,1.2531)};
\addlegendentry{\scriptsize SDT48}
\end{axis}
}
\end{tikzpicture}
\end{center}
\ No newline at end of file
%%%------------------------------------------------------------------------------------------------------------
\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=east,fill=orange!20,draw=orange,rounded corners=3pt,minimum height=1.4em,minimum width=1.4em] (s11) at (-0.5em, 0) {$\times h$};
\node [rectangle,anchor=west,fill=blue!20,draw=blue,rounded corners=3pt,minimum height=1.4em,minimum width=1.5em] (s12) at ([xshift=1.5em]s11.east) {};
\node [anchor=north,fill=orange!20,draw=orange,rounded corners=3pt,minimum height=1.4em,minimum width=1.4em] (s21) at ([yshift=-1.8em]s11.south) {$\times h$};
\node [anchor=west,fill=orange!20,draw=red,rounded corners=3pt,minimum height=1.4em,minimum width=1.4em,dashed] (s22) at ([xshift=1.5em]s21.east) {$\times h$};
\node [anchor=west,fill=blue!20,draw=blue,rounded corners=3pt,minimum height=1.4em,minimum width=1.5em] (s23) at ([xshift=1.5em]s22.east) {};
\node [anchor=north,fill=orange!20,draw=orange,rounded corners=3pt,minimum height=1.4em,minimum width=1.4em] (s31) at ([yshift=-1.8em]s21.south) {$\times h$};
\node [anchor=west,fill=orange!20,draw=orange,rounded corners=3pt,minimum height=1.4em,minimum width=1.4em] (s32) at ([xshift=1.5em]s31.east) {$\times h$};
\node [anchor=west,fill=orange!20,draw=red,rounded corners=3pt,minimum height=1.4em,minimum width=1.4em,dashed] (s33) at ([xshift=1.5em]s32.east) {$\times h$};
\node [anchor=west,fill=blue!20,draw=blue,rounded corners=3pt,minimum height=1.4em,minimum width=1.5em] (s34) at ([xshift=1.5em]s33.east) {};
\node [anchor=north,fill=orange!20,draw=orange,rounded corners=3pt,minimum height=1.4em,minimum width=1.4em] (s41) at ([yshift=-1.8em]s31.south) {$\times h$};
\node [anchor=west,fill=orange!20,draw=orange,rounded corners=3pt,minimum height=1.4em,minimum width=1.4em] (s42) at ([xshift=1.5em]s41.east) {$\times h$};
\node [anchor=west,fill=orange!20,draw=orange,rounded corners=3pt,minimum height=1.4em,minimum width=1.4em] (s43) at ([xshift=1.5em]s42.east) {$\times h$};
\node [anchor=west,fill=orange!20,draw=red,rounded corners=3pt,minimum height=1.4em,minimum width=1.4em,dashed] (s44) at ([xshift=1.5em]s43.east) {$\times h$};
\node [anchor=west,fill=blue!20,draw=blue,rounded corners=3pt,minimum height=1.4em,minimum width=1.5em] (s45) at ([xshift=1.5em]s44.east) {};
\node [anchor=east] (p1) at ([xshift=-2em]s11.west) {step 1};
\node [anchor=east] (p2) at ([xshift=-2em]s21.west) {step 2};
\node [anchor=east] (p3) at ([xshift=-2em]s31.west) {step 3};
\node [anchor=east] (p4) at ([xshift=-2em]s41.west) {step 4};
\node [anchor=south,fill=orange!20,draw=orange,rounded corners=3pt,minimum height=1.4em,minimum width=1.4em] (b1) at ([xshift=-0.2em,yshift=2em]p1.north) {};
\node [anchor=west] (b2) at (b1.east) {: encoder};
\node [anchor=west,fill=blue!20,draw=blue,rounded corners=3pt,minimum height=1.4em,minimum width=1.4em] (b3) at ([xshift=1em]b2.east) {};
\node [anchor=west] (b4) at (b3.east) {: decoder};
\node [anchor=west] (b5) at ([xshift=2.5em]b4.east) {: copy};
\draw[-latex,thick,red,dashed] ([xshift=0.2em]b4.east) -- (b5.west);
\draw [-latex, line width=0.8pt] ([xshift=-1.5em]s11.west) -- (s11.west);
\draw [-latex, line width=0.8pt] (s11.east) -- (s12.west);
\draw [-latex, line width=0.8pt] (s12.east) -- ([xshift=1.5em]s12.east);
\draw [-latex, line width=0.8pt] ([xshift=-1.5em]s21.west) -- (s21.west);
\draw [-latex, line width=0.8pt] (s21.east) -- (s22.west);
\draw [-latex, line width=0.8pt] (s22.east) -- (s23.west);
\draw [-latex, line width=0.8pt] (s23.east) -- ([xshift=1.5em]s23.east);
\draw [-latex, line width=0.8pt] ([xshift=-1.5em]s31.west) -- (s31.west);
\draw [-latex, line width=0.8pt] (s31.east) -- (s32.west);
\draw [-latex, line width=0.8pt] (s32.east) -- (s33.west);
\draw [-latex, line width=0.8pt] (s33.east) -- (s34.west);
\draw [-latex, line width=0.8pt] (s34.east) -- ([xshift=1.5em]s34.east);
\draw [-latex, line width=0.8pt] ([xshift=-1.5em]s41.west) -- (s41.west);
\draw [-latex, line width=0.8pt] (s41.east) -- (s42.west);
\draw [-latex, line width=0.8pt] (s42.east) -- (s43.west);
\draw [-latex, line width=0.8pt] (s43.east) -- (s44.west);
\draw [-latex, line width=0.8pt] (s44.east) -- (s45.west);
\draw [-latex, line width=0.8pt] (s45.east) -- ([xshift=1.5em]s45.east);
\draw[-latex,thick,red,dashed] (s11.south)..controls +(south:1em) and +(north:1.2em)..(s22.north);
\draw[-latex,thick,red,dashed] (s22.south)..controls +(south:1em) and +(north:1.2em)..(s33.north);
\draw[-latex,thick,red,dashed] (s33.south)..controls +(south:1em) and +(north:1.2em)..(s44.north);
\end{scope}
\end{tikzpicture}
\end{center}
%%%------------------------------------------------------------------------------------------------------------
\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=east,fill=orange!20,draw,rounded corners=3pt,minimum height=1.4em,minimum width=1.4em] (s11) at (-0.5em, 0) {};
\node [rectangle,anchor=west,fill=orange!20,draw,rounded corners=3pt,minimum height=1.4em,minimum width=1.4em] (s12) at ([xshift=2em]s11.east) {};
\node [anchor=west,fill=orange!20,draw,rounded corners=3pt,minimum height=1.4em,minimum width=1.4em] (s13) at ([xshift=2em]s12.east) {};
\node [anchor=west,fill=orange!20,draw,rounded corners=3pt,minimum height=1.4em,minimum width=1.4em] (s14) at ([xshift=2em]s13.east) {};
\node [anchor=north,fill=orange!20,draw,rounded corners=3pt,minimum height=1.4em,minimum width=1.4em] (s21) at ([yshift=-2.5em]s11.south) {};
\node [anchor=west,fill=orange!20,draw,rounded corners=3pt,minimum height=1.4em,minimum width=1.4em] (s22) at ([xshift=2em]s21.east) {};
\node [anchor=west,fill=orange!20,draw,rounded corners=3pt,minimum height=1.4em,minimum width=1.4em] (s23) at ([xshift=2em]s22.east) {};
\node [anchor=west,fill=orange!20,draw,rounded corners=3pt,minimum height=1.4em,minimum width=1.4em] (s24) at ([xshift=2em]s23.east) {};
\node [anchor=north,fill=orange!20,draw,rounded corners=3pt,minimum height=1.4em,minimum width=1.4em] (s31) at ([yshift=-2.5em]s21.south) {};
\node [anchor=west,fill=orange!20,draw,rounded corners=3pt,minimum height=1.4em,minimum width=1.4em] (s32) at ([xshift=2em]s31.east) {};
\node [anchor=west,fill=orange!20,draw,rounded corners=3pt,minimum height=1.4em,minimum width=1.4em] (s33) at ([xshift=2em]s32.east) {};
\node [anchor=west,fill=orange!20,draw,rounded corners=3pt,minimum height=1.4em,minimum width=1.4em] (s34) at ([xshift=2em]s33.east) {};
\node [anchor=north,fill=orange!20,draw,rounded corners=3pt,minimum height=1.4em,minimum width=1.4em] (s41) at ([yshift=-2.5em]s31.south) {};
\node [anchor=west,fill=orange!20,draw,rounded corners=3pt,minimum height=1.4em,minimum width=1.4em] (s42) at ([xshift=2em]s41.east) {};
\node [anchor=west,fill=orange!20,draw,rounded corners=3pt,minimum height=1.4em,minimum width=1.4em] (s43) at ([xshift=2em]s42.east) {};
\node [anchor=west,fill=orange!20,draw,rounded corners=3pt,minimum height=1.4em,minimum width=1.4em] (s44) at ([xshift=2em]s43.east) {};
\node [anchor=east] (p1) at ([xshift=-3.5em]s11.west) {$p=\infty$};
\node [anchor=east] (p2) at ([xshift=-4em]s21.west) {$p=1$};
\node [anchor=east] (p3) at ([xshift=-4em]s31.west) {$p=2$};
\node [anchor=east] (p4) at ([xshift=-4em]s41.west) {$p=4$};
\node [anchor=north] (p5) at ([yshift=-1em]p3.south) {$\cdots$};
\node [anchor=south,fill=orange!20,draw,rounded corners=3pt,minimum height=1.4em,minimum width=1.4em] (b1) at ([xshift=-0.6em,yshift=2em]p1.north) {};
\node [anchor=west] (b2) at (b1.east) {:Layer};
\node [anchor=west,draw=red,rounded corners=3pt,minimum height=1.4em,minimum width=1.4em,dashed,line width=0.8pt] (b3) at ([xshift=1em]b2.east) {};
\node [anchor=west] (b4) at (b3.east) {:Block};
\draw [-latex, line width=0.8pt] ([xshift=-2em]s11.west) -- (s11.west);
\draw [-latex, line width=0.8pt] (s11.east) -- (s12.west);
\draw [-latex, line width=0.8pt] (s12.east) -- (s13.west);
\draw [-latex, line width=0.8pt] (s13.east) -- (s14.west);
\draw [-latex, line width=0.8pt] (s14.east) -- ([xshift=2em]s14.east);
\draw [-latex, line width=0.8pt] ([xshift=-2em]s21.west) -- (s21.west);
\draw [-latex, line width=0.8pt] (s21.east) -- (s22.west);
\draw [-latex, line width=0.8pt] (s22.east) -- (s23.west);
\draw [-latex, line width=0.8pt] (s23.east) -- (s24.west);
\draw [-latex, line width=0.8pt] (s24.east) -- ([xshift=2em]s24.east);
\draw [-latex, line width=0.8pt] ([xshift=-2em]s31.west) -- (s31.west);
\draw [-latex, line width=0.8pt] (s31.east) -- (s32.west);
\draw [-latex, line width=0.8pt] (s32.east) -- (s33.west);
\draw [-latex, line width=0.8pt] (s33.east) -- (s34.west);
\draw [-latex, line width=0.8pt] (s34.east) -- ([xshift=2em]s34.east);
\draw [-latex, line width=0.8pt] ([xshift=-2em]s41.west) -- (s41.west);
\draw [-latex, line width=0.8pt] (s41.east) -- (s42.west);
\draw [-latex, line width=0.8pt] (s42.east) -- (s43.west);
\draw [-latex, line width=0.8pt] (s43.east) -- (s44.west);
\draw [-latex, line width=0.8pt] (s44.east) -- ([xshift=2em]s44.east);
\node [draw=red,rounded corners=3pt,minimum height=1.7em,minimum width=1.7em,dashed,line width=0.8pt] (x21) at (s21) {};
\node [draw=red,rounded corners=3pt,minimum height=1.7em,minimum width=1.7em,dashed,line width=0.8pt] (x22) at (s22) {};
\node [draw=red,rounded corners=3pt,minimum height=1.7em,minimum width=1.7em,dashed,line width=0.8pt] (x23) at (s23) {};
\node [draw=red,rounded corners=3pt,minimum height=1.7em,minimum width=1.7em,dashed,line width=0.8pt] (x24) at (s24) {};
\node [draw=red,rounded corners=3pt,minimum height=1.7em,minimum width=5.2em,dashed,line width=0.8pt] (x31) at ([xshift=1.75em]s31) {};
\node [draw=red,rounded corners=3pt,minimum height=1.7em,minimum width=5.2em,dashed,line width=0.8pt] (x32) at ([xshift=1.75em]s33) {};
\node [draw=red,rounded corners=3pt,minimum height=1.7em,minimum width=12.2em,dashed,line width=0.8pt] (x41) at ([xshift=1.75em]s42) {};
{
\draw [-latex, line width=0.8pt] ([xshift=-1em]s21.west).. controls +(58:0.6) and +(122:0.6) .. ([xshift=1em]s21.east);
\draw [-latex, line width=0.8pt] ([xshift=-1em]s22.west).. controls +(58:0.6) and +(122:0.6) .. ([xshift=1em]s22.east);
\draw [-latex, line width=0.8pt] ([xshift=-1em]s23.west).. controls +(58:0.6) and +(122:0.6) .. ([xshift=1em]s23.east);
\draw [-latex, line width=0.8pt] ([xshift=-1em]s24.west).. controls +(58:0.6) and +(122:0.6) .. ([xshift=1em]s24.east);
}
{
\draw [-latex, line width=0.8pt] ([xshift=-1em]s21.west).. controls +(65:0.8) and +(115:0.8) .. ([xshift=1em]s22.east);
\draw [-latex, line width=0.8pt] ([xshift=-1em]s22.west).. controls +(65:0.8) and +(115:0.8) .. ([xshift=1em]s23.east);
\draw [-latex, line width=0.8pt] ([xshift=-1em]s23.west).. controls +(65:0.8) and +(115:0.8) .. ([xshift=1em]s24.east);
\draw [-latex, line width=0.8pt] ([xshift=-1em]s31.west).. controls +(65:0.8) and +(115:0.8) .. ([xshift=1em]s32.east);
\draw [-latex, line width=0.8pt] ([xshift=-1em]s33.west).. controls +(65:0.8) and +(115:0.8) .. ([xshift=1em]s34.east);
}
{
\draw [-latex, line width=0.8pt] ([xshift=-1em]s21.west).. controls +(70:1.0) and +(110:1.0) .. ([xshift=1em]s23.east);
\draw [-latex, line width=0.8pt] ([xshift=-1em]s22.west).. controls +(70:1.0) and +(110:1.0) .. ([xshift=1em]s24.east);
}
{
\draw [-latex, line width=0.8pt] ([xshift=-1em]s21.west).. controls +(75:1.2) and +(105:1.2) .. ([xshift=1em]s24.east);
\draw [-latex, line width=0.8pt] ([xshift=-1em]s31.west).. controls +(75:1.2) and +(105:1.2) .. ([xshift=1em]s34.east);
\draw [-latex, line width=0.8pt] ([xshift=-1em]s41.west).. controls +(75:1.2) and +(105:1.2) .. ([xshift=1em]s44.east);
}
\end{scope}
\end{tikzpicture}
\end{center}
......@@ -8,13 +8,126 @@
%----------------------------------------------------------------------------------------
\renewcommand\figurename{}% replace the English "figure" label with 图
\renewcommand\tablename{}% replace the English "table" label with 表
\chapterimage{chapter_head_1} % Chapter heading image
\chapterimage{fig-NEU-1.jpg} % Chapter heading image
%------------------------------------------------
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% Appendix to Chapter 1
\begin{appendices}
\chapter{Appendix A}
\label{appendix-A}
\parinterval Data is indispensable for building machine translation systems. In particular, the performance of today's mainstream neural machine translation systems is often bounded by the scale and quality of the corpora they are trained on. Fortunately, with the development of corpus linguistics, corpus resources for the major languages have become quite abundant.
\parinterval To facilitate related research, we summarize several widely used benchmark datasets here; they have been adopted throughout the machine translation community, so a large body of previous work can be reproduced and compared against. We also collect some commonly used parallel corpora for readers to explore.
%%%%%%%%%%%%%%%%%%%%%
\section{Benchmark Datasets}
%----------------------------------------------
% Table 1.1-1
\begin{table}[htp]{
\footnotesize
\begin{center}
\caption{Benchmark datasets}
\label{tab:Reference-data-set}
\begin{tabular}{p{1.6cm} | p{1.2cm} p{1.6cm} p{2.6cm} p{3.9cm}}
{Task} & {Languages} &{Domain} &{Description} &{URL} \\
\hline
\rule{0pt}{15pt}WMT & En, Zh, De, Ru, etc. & news, medical, translation & English-centric multilingual machine translation datasets covering several tasks & \url{http://www.statmt.org/wmt19/} \\
\rule{0pt}{15pt}IWSLT & En, De, Fr, Cs, Zh, etc. & spoken-language translation & text translation data drawn from TED talks; relatively small & \url{https://wit3.fbk.eu/} \\
\rule{0pt}{15pt}NIST & Zh-En, etc. & news translation & test sets with four reference translations; high quality & \url{https://www.ldc.upenn.edu/collaborations/evaluations/nist} \\
\end{tabular}
\end{center}
}\end{table}
%-------------------------------------------
%----------------------------------------------
% Table 1.1-2
\begin{table}[htp]{
\footnotesize
\begin{center}
\begin{tabular}{p{1.6cm} | p{1.2cm} p{1.6cm} p{2.6cm} p{3.9cm}}
\rule{0pt}{15pt}{Task} & {Languages} &{Domain} &{Description} &{URL} \\
\hline
\rule{0pt}{15pt}TVsub & Zh-En & subtitle translation & dialogue data extracted from TV-series subtitles; for long-range context research & \url{https://github.com/longyuewangdcu/tvsub} \\
\rule{0pt}{15pt}Flickr30K & En-De & multimodal translation & 31,783 images, five caption sentences per image & \url{http://shannon.cs.illinois.edu/DenotationGraph/} \\
\rule{0pt}{15pt}Multi30K & En-De, En-Fr & multimodal translation & 31,014 images, five caption sentences per image & \url{http://www.statmt.org/wmt16/multimodal-task.html} \\
\rule{0pt}{15pt}IAPRTC-12 & En-De & multimodal translation & 20,000 images with corresponding annotations & \url{https://www.imageclef.org/photodata} \\
\rule{0pt}{15pt}IKEA & En-De, En-Fr & multimodal translation & 3,600 images with corresponding annotations & \url{https://github.com/sampalomad/IKEA-Dataset.git} \\
\end{tabular}
\end{center}
}\end{table}
%-------------------------------------------
%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Parallel Corpora}
\parinterval Training a neural machine translation system requires large amounts of bilingual data. Here we list some publicly available parallel corpora for easy access.
\vspace{0.5em}
\begin{itemize}
\item News Commentary Corpus: bilingual data covering 12 languages (including Chinese and English) and 64 language pairs, crawled from the political and economic commentary on the Project Syndicate website. URL:\url{http://www.casmacat.eu/corpus/news-commentary.html}
\vspace{0.5em}
\item CWMT Corpus: Chinese-English parallel corpora collected and shared by the China Workshop on Machine Translation (CWMT) community, covering domains such as news, movie subtitles, novels, and government documents. URL:\url{http://nlp.nju.edu.cn/cwmt-wmt/}
\vspace{0.5em}
\item Common Crawl corpus: bilingual data from four languages (Czech, German, Russian, and French) into English, crawled from web pages. URL:\url{http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz}
\vspace{0.5em}
\item Europarl Corpus: bilingual data from 20 European languages (Bulgarian, Czech, etc.) into English, drawn from the proceedings of the European Parliament. URL:\url{http://www.statmt.org/europarl/}
\vspace{0.5em}
\item ParaCrawl Corpus: bilingual corpora from 23 European languages into English, crawled from the web. URL:\url{https://www.paracrawl.eu/index.php}
\vspace{0.5em}
\item United Nations Parallel Corpus: bilingual data covering the six official languages of the United Nations (Arabic, English, Spanish, French, Russian, and Chinese) and 30 language pairs, drawn from official UN records and other conference documents in the public domain. URL:\url{https://conferences.unite.un.org/UNCorpus/}
\vspace{0.5em}
\item TED Corpus: the TED website publishes subtitles of its talks since 2007, together with translations into more than 100 languages. WIT3 collects and organizes these data for researchers, and also provides the test sets for the yearly IWSLT evaluation campaign. URL:\url{https://wit3.fbk.eu/}
\vspace{0.5em}
\item OpenSubtitles: collected by P. Lison and J. Tiedemann from the OpenSubtitles movie-subtitle website; it contains 62 languages and 1,782 language pairs, making it a relatively rich resource. URL:\url{http://opus.nlpl.eu/OpenSubtitles2018.php}
\vspace{0.5em}
\item Wikititles Corpus: bilingual data covering 14 languages (Gujarati, etc.) and 11 language pairs, extracted from Wikipedia article titles. URL:\url{http://data.statmt.org/wikititles/v1/}
\vspace{0.5em}
\item CzEng: a Czech-English parallel corpus drawn from European law, information technology, and fiction. URL:\url{http://ufal.mff.cuni.cz/czeng/czeng17}
\vspace{0.5em}
\item Yandex Corpus: a Russian-English parallel corpus crawled from web pages. URL:\url{https://translate.yandex.ru/corpus}
\vspace{0.5em}
\item Tilde MODEL Corpus: multilingual open data for European languages, consisting of several datasets drawn from economics, news, government, and tourism portals. URL:\url{https://tilde-model.s3-eu-west-1.amazonaws.com/Tilde_MODEL_Corpus.html}
\vspace{0.5em}
\item Setimes Corpus: bilingual data among nine Balkan languages (Croatian, Albanian, etc.) and 72 language pairs, drawn from news reports of the Southeast European Times. URL:\url{http://www.statmt.org/setimes/}
\vspace{0.5em}
\item TVsub: a Chinese-English dialogue corpus collected from TV-series subtitles, containing more than two million sentence pairs; it can be used to study dialogue and long-range contextual information. URL:\url{https://github.com/longyuewangdcu/tvsub}
\vspace{0.5em}
\item Recipe Corpus: a Japanese-English recipe corpus created by Cookpad, containing more than 100,000 sentence pairs. URL:\url{http://lotus.kuee.kyoto-u.ac.jp/WAT/recipe-corpus/}
\end{itemize}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{Related Tools}
\subsection{Data Preprocessing Tools}
\parinterval Data processing is an important step in building a neural machine translation system. Here we list some open-source tools for readers to use.
\vspace{0.5em}
\begin{itemize}
\item Moses: Moses provides many data preprocessing scripts and tools that are widely used by machine translation researchers, including symbol normalization, tokenization, case conversion, and length filtering. URL:\url{https://github.com/moses-smt/mosesdecoder/tree/master/scripts}
\vspace{0.5em}
\item Jieba: a widely used Chinese word segmentation tool (see the usage sketch after this list). URL:\url{https://github.com/fxsjy/jieba}
\vspace{0.5em}
\item Subword-nmt: a subword segmentation tool based on the BPE algorithm (also covered in the sketch below). URL:\url{https://github.com/rsennrich/subword-nmt}
\end{itemize}
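\parinterval As a quick illustration, the sketch below (our own example, not taken from the above toolkits' documentation) segments a raw Chinese sentence with Jieba and then lists the subword-nmt command-line calls that learn and apply a BPE model; the file names (train.zh, codes.bpe) and the merge count 10000 are placeholders.
{\small
\begin{verbatim}
# pip install jieba subword-nmt
import jieba

# Segment a raw Chinese sentence into space-separated words.
sentence = "机器翻译系统的性能往往受限于语料库规模。"
print(" ".join(jieba.cut(sentence)))

# BPE is learned and applied from the shell, e.g.:
#   subword-nmt learn-bpe -s 10000 < train.zh > codes.bpe
#   subword-nmt apply-bpe -c codes.bpe < train.zh > train.bpe.zh
\end{verbatim}
}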
\subsection{Evaluation Tools}
\parinterval Several automatic evaluation metrics are in common use in machine translation, including BLEU, TER, and METEOR. Here we list tools that implement them; a deliberately simplified BLEU sketch follows the list.
\vspace{0.5em}
\begin{itemize}
\item Moses: includes the widely used generic BLEU evaluation scripts. URL:\url{https://github.com/moses-smt/mosesdecoder/tree/master/scripts/generic}
\vspace{0.5em}
\item Tercom: the computation tool for the TER metric; only a Java version is available. URL:\url{http://www.cs.umd.edu/~snover/tercom/}
\vspace{0.5em}
\item Meteor: an implementation of the METEOR metric. URL:\url{https://www.cs.cmu.edu/~alavie/METEOR/}
\end{itemize}
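\parinterval To make concrete what such metrics compute, the following is a deliberately simplified sentence-level BLEU sketch: a single reference, uniform $n$-gram weights, and no smoothing. It is only an illustration; real evaluations should use the tools above, which also handle tokenization and multiple references.
{\small
\begin{verbatim}
import math
from collections import Counter

def ngrams(tokens, n):
    # multiset of n-grams of the token list
    return Counter(tuple(tokens[i:i + n])
                   for i in range(len(tokens) - n + 1))

def bleu(hypothesis, reference, max_n=4):
    # simplified BLEU: one reference, uniform weights, no smoothing
    hyp, ref = hypothesis.split(), reference.split()
    log_prec = 0.0
    for n in range(1, max_n + 1):
        hyp_ng, ref_ng = ngrams(hyp, n), ngrams(ref, n)
        # clipped n-gram matches
        clipped = sum(min(c, ref_ng[g]) for g, c in hyp_ng.items())
        if clipped == 0:
            return 0.0  # some n-gram order has no match
        log_prec += math.log(clipped / sum(hyp_ng.values()))
    bp = min(1.0, math.exp(1.0 - len(ref) / len(hyp)))  # brevity penalty
    return bp * math.exp(log_prec / max_n)

print(bleu("the cat sat on the mat",
           "the cat sat on the red mat"))  # ~0.67
\end{verbatim}
}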
\end{appendices}
......
% !Mode:: "TeX:UTF-8"
% !TEX encoding = UTF-8 Unicode
\newpage
%% Saved with string encoding Western (ASCII)
@article{Aho-Ullman:1969:JCSS,
author = {Aho, Alfred V. and Ullman, Jeffrey D.},
title = {Syntax directed translations and the pushdown assembler},
journal = {Journal of Computer and System Sciences},
pages = {37--57},
volume = {3},
year = {1969}
}
@article{alshawi-EtAl:2000:CL,
author = {Alshawi, Hiyan and Bangalore, Srinivas and Douglas, Shona},
title = {Learning dependency translation models as collections of finite state head transducers},
journal = {Computational Linguistics},
number = {1},
pages = {45--60},
volume = {26},
year = {2000}
}
@article{brown-EtAl:1993:CL,
author = {Brown, Peter F. and Pietra, Stephen A. Della and Pietra, Vincent J. Della and Mercer, Robert L.},
title = {The Mathematics of Statistical Machine Translation: Parameter Estimation},
journal = {Computational Linguistics},
number = {2},
pages = {263--311},
volume = {19},
year = {1993}
}
@article{bergert-EtAl:1996:CL,
author = {Berger, Adam L. and Pietra, Vincent J. Della and Pietra, Stephen A. Della},
title = {A maximum entropy approach to natural language processing},
journal = {Computational Linguistics},
number = {1},
pages = {39--71},
volume = {22},
year = {1996}
}
@InProceedings{cer-EtAl:2010:DEMO,
author = {Cer, Daniel and Galley, Michel and Jurafsky, Daniel and Manning, Christopher D.},
title = {Phrasal: A Statistical Machine Translation Toolkit for Exploring New Model Features},
booktitle = {Proceedings of the NAACL HLT 2010 Demonstration Session},
month = {June},
year = {2010},
address = {Los Angeles, California},
publisher = {Association for Computational Linguistics},
pages = {9--12},
url = {http://www.aclweb.org/anthology/N10-2003}
}
@InProceedings{chiang:2005:ACL,
author = {Chiang, David},
title = {A Hierarchical Phrase-Based Model for Statistical Machine Translation},
booktitle = {Proceedings of the 43rd Annual Meeting of the Association for Computational Linguistics (ACL'05)},
month = {June},
year = {2005},
address = {Ann Arbor, Michigan},
publisher = {Association for Computational Linguistics},
pages = {263--270},
url = {http://www.aclweb.org/anthology/P05-1033},
doi = {10.3115/1219840.1219873}
}
@InProceedings{chiang-kevin:2006:ACL,
author = {Chiang, David and Knight, Kevin},
title = {An introduction to synchronous grammars},
booktitle = {Proceedings of the 44th Annual Meeting of the Association for Computational Linguistics (ACL'06)},
year = {2006},
publisher = {Association for Computational Linguistics}
}
@article{chiang:2007:CL,
author = {David Chiang},
title = {Hierarchical Phrase-Based Translation},
journal = {Computational Linguistics},
number = {2},
pages = {201--228},
volume = {33},
year = {2007}
}
@InProceedings{chiang-EtAl:2008:EMNLP,
author = {Chiang, David and DeNeefe, Steve and Chan, Yee Seng and Ng, Hwee Tou},
title = {Decomposability of Translation Metrics for Improved Evaluation and Efficient Algorithms},
booktitle = {Proceedings of the 2008 Conference on Empirical Methods in Natural Language Processing},
month = {October},
year = {2008},
address = {Honolulu, Hawaii},
publisher = {Association for Computational Linguistics},
pages = {610--619},
url = {http://www.aclweb.org/anthology/D08-1064}
}
@InProceedings{denero-EtAl:2010:NAACLHLT,
author = {DeNero, John and Kumar, Shankar and Chelba, Ciprian and Och, Franz},
title = {Model Combination for Machine Translation},
booktitle = {Human Language Technologies: The 2010 Annual Conference of the North American Chapter of the Association for Computational Linguistics},
month = {June},
year = {2010},
address = {Los Angeles, California},
publisher = {Association for Computational Linguistics},
pages = {975--983},
url = {http://www.aclweb.org/anthology/N10-1141}
}
@InProceedings{dyer-EtAl:2010:Demos,
author = {Dyer, Chris and Lopez, Adam and Ganitkevitch, Juri and Weese, Jonathan and Ture, Ferhan and Blunsom, Phil and Setiawan, Hendra and Eidelman, Vladimir and Resnik, Philip},
title = {cdec: A Decoder, Alignment, and Learning Framework for Finite-State and Context-Free Translation Models},
booktitle = {Proceedings of the ACL 2010 System Demonstrations},
month = {July},
year = {2010},
address = {Uppsala, Sweden},
publisher = {Association for Computational Linguistics},
pages = {7--12},
url = {http://www.aclweb.org/anthology/P10-4002}
}
@InProceedings{eisner:2003:ACL-companion,
author = {Jason Eisner},
title = {Learning Non-Isomorphic Tree Mappings for Machine Translation},
booktitle = {The Companion Volume to the Proceedings of 41st Annual Meeting of the Association for Computational Linguistics},
month = {July},
year = {2003},
address = {Sapporo, Japan},
publisher = {Association for Computational Linguistics},
pages = {205--208},
url = {http://www.aclweb.org/anthology/P03-2039},
doi = {10.3115/1075178.1075217}
}
@InProceedings{galley-EtAl:2006:COLACL,
author = {Galley, Michel and Graehl, Jonathan and Knight, Kevin and Marcu, Daniel and DeNeefe, Steve and Wang, Wei and Thayer, Ignacio},
title = {Scalable Inference and Training of Context-Rich Syntactic Translation Models},
booktitle = {Proceedings of the 21st International Conference on Computational Linguistics and 44th Annual Meeting of the Association for Computational Linguistics},
month = {July},
year = {2006},
address = {Sydney, Australia},
publisher = {Association for Computational Linguistics},
pages = {961--968},
url = {http://www.aclweb.org/anthology/P06-1121},
doi = {10.3115/1220175.1220296}
}
@InProceedings{galley-manning:2008:EMNLP,
author = {Galley, Michel and Manning, Christopher D.},
title = {A Simple and Effective Hierarchical Phrase Reordering Model},
booktitle = {Proceedings of the 2008 Conference on Empirical Methods in Natural Language Processing},
month = {October},
year = {2008},
address = {Honolulu, Hawaii},
publisher = {Association for Computational Linguistics},
pages = {848--856},
url = {http://www.aclweb.org/anthology/D08-1089}
}
@InProceedings{huang-chiang:2005:IWPT,
author = {Huang, Liang and Chiang, David},
title = {Better k-best Parsing},
booktitle = {Proceedings of the Ninth International Workshop on Parsing Technology},
month = {October},
year = {2005},
address = {Vancouver, British Columbia},
publisher = {Association for Computational Linguistics},
pages = {53--64},
url = {http://www.aclweb.org/anthology/W/W05/W05-1506}
}
@inproceedings{koehn-EtAl:2003:NAACL,
author = {Koehn, Philipp and Och, Franz and Marcu, Daniel},
title = {Statistical Phrase-Based Translation},
booktitle = {Proceedings of the 2003 Human Language Technology Conference of the North American Chapter of the Association for Computational Linguistics},
month = {June},
year = {2003},
address = {Edmonton},
publisher = {The North American Chapter of the Association for Computational Linguistics},
pages = {48--54},
url = {http://aclweb.org/anthology-new/N/N03/N03-1017.pdf}
}
@inproceedings{koehn-EtAl:2007:PosterDemo,
author = {Koehn, Philipp and Hoang, Hieu and Birch, Alexandra and Callison-Burch, Chris and Federico, Marcello and Bertoldi, Nicola and Cowan, Brooke and Shen, Wade and Moran, Christine and Zens, Richard and Dyer, Chris and Bojar, Ondrej and Constantin, Alexandra and Herbst, Evan},
title = {Moses: Open Source Toolkit for Statistical Machine Translation},
booktitle = {Proceedings of the 45th Annual Meeting of the Association for Computational Linguistics Companion Volume Proceedings of the Demo and Poster Sessions},
month = {June},
year = {2007},
address = {Prague, Czech Republic},
publisher = {Association for Computational Linguistics},
pages = {177--180},
url = {http://www.aclweb.org/anthology/P07-2045}
}
@book{koehn:2010:smt,
author = {Koehn, Philipp},
title = {Statistical Machine Translation},
year = {2010},
publisher = {Cambridge University Press},
}
@InProceedings{li-EtAl:2009:WMT1,
author = {Li, Zhifei and Callison-Burch, Chris and Dyer, Chris and Khudanpur, Sanjeev and Schwartz, Lane and Thornton, Wren and Weese, Jonathan and Zaidan, Omar},
title = {{Joshua}: An Open Source Toolkit for Parsing-Based Machine Translation},
booktitle = {Proceedings of the Fourth Workshop on Statistical Machine Translation},
month = {March},
year = {2009},
address = {Athens, Greece},
publisher = {Association for Computational Linguistics},
pages = {135--139},
url = {http://www.aclweb.org/anthology/W09-0424}
}
@article{lopez:2008:ACMComputing,
author = {Lopez, Adam},
title = {Statistical Machine Translation},
journal = {ACM Computing Surveys},
number = {3},
pages = {1--49},
volume = {40},
year = {2008}
}
@InProceedings{mi-huang-liu:2008:ACLMain,
author = {Mi, Haitao and Huang, Liang and Liu, Qun},
title = {Forest-Based Translation},
booktitle = {Proceedings of ACL-08: HLT},
month = {June},
year = {2008},
address = {Columbus, Ohio},
publisher = {Association for Computational Linguistics},
pages = {192--199},
url = {http://www.aclweb.org/anthology/P/P08/P08-1023}
}
@InProceedings{marcu-wong:2002:EMNLP02,
author = {Marcu, Daniel and Wong, William},
title = {A Phrase-Based, Joint Probability Model for Statistical Machine Translation},
booktitle = {Proceedings of the 2002 Conference on Empirical Methods in Natural Language Processing},
month = {July},
year = {2002},
publisher = {Association for Computational Linguistics},
pages = {133--139},
url = {http://www.aclweb.org/anthology/W02-1018},
doi = {10.3115/1118693.1118711}
}
@InProceedings{och-ney:2002:ACL,
author = {Och, Franz and Ney, Hermann},
title = {Discriminative Training and Maximum Entropy Models for Statistical Machine Translation},
booktitle = {Proceedings of 40th Annual Meeting of the Association for Computational Linguistics},
month = {July},
year = {2002},
address = {Philadelphia, Pennsylvania, USA},
publisher = {Association for Computational Linguistics},
pages = {295--302},
url = {http://www.aclweb.org/anthology/P02-1038},
doi = {10.3115/1073083.1073133}
}
@InProceedings{och:2003:ACL,
author = {Och, Franz},
title = {Minimum Error Rate Training in Statistical Machine Translation},
booktitle = {Proceedings of the 41st Annual Meeting of the Association for Computational Linguistics},
month = {July},
year = {2003},
address = {Sapporo, Japan},
publisher = {Association for Computational Linguistics},
pages = {160--167},
url = {http://www.aclweb.org/anthology/P03-1021},
doi = {10.3115/1075096.1075117}
}
@InProceedings{papineni-EtAl:2002:ACL,
author = {Papineni, Kishore and Roukos, Salim and Ward, Todd and Zhu, Wei-Jing},
title = {Bleu: a Method for Automatic Evaluation of Machine Translation},
booktitle = {Proceedings of 40th Annual Meeting of the Association for Computational Linguistics},
month = {July},
year = {2002},
address = {Philadelphia, Pennsylvania, USA},
publisher = {Association for Computational Linguistics},
pages = {311--318},
url = {http://www.aclweb.org/anthology/P02-1040},
doi = {10.3115/1073083.1073135}
}
@inproceedings{tillman:2004:HLTNAACL,
author = {Tillman, Christoph},
title = {A Unigram Orientation Model for Statistical Machine Translation},
booktitle = {HLT-NAACL 2004: Short Papers},
editor = {Susan Dumais and Daniel Marcu and Salim Roukos},
year = {2004},
month = {May 2 - May 7},
address = {Boston, Massachusetts, USA},
publisher = {Association for Computational Linguistics},
pages = {101--104}
}
@InProceedings{vilar-EtAl:2010:WMT,
author = {Vilar, David and Stein, Daniel and Huck, Matthias and Ney, Hermann},
title = {Jane: Open Source Hierarchical Translation, Extended with Reordering and Lexicon Models},
booktitle = {Proceedings of the Joint Fifth Workshop on Statistical Machine Translation and MetricsMATR},
month = {July},
year = {2010},
address = {Uppsala, Sweden},
publisher = {Association for Computational Linguistics},
pages = {262--270},
url = {http://www.aclweb.org/anthology/W10-1738}
}
@article{wu:1997:CL,
author = {Wu, Dekai},
title = {Stochastic inversion transduction grammars and bilingual parsing of parallel corpora},
journal = {Computational Linguistics},
number = {3},
pages = {377--404},
volume = {23},
year = {1997}
}
@InProceedings{xiao-EtAl:2009:CWMT,
author = {Xiao, Tong and Chen, Rushan and Li, Tianning and Zhu, Muhua and Zhu, Jingbo and Wang, Huizhen and Ren, Feiliang},
title = {NEUTrans: a Phrase-Based SMT System for CWMT2009},
booktitle = {Proceedings of the 5th China Workshop on Machine Translation},
month = {Sep},
year = {2009},
address = {Nanjing, China},
publisher = {CWMT},
url = {http://www.icip.org.cn/cwmt2009/downloads/papers/6.pdf}
}
@InProceedings{xiao-EtAl:2011:CWMT,
author = {Xiao, Tong and Zhang, Hao and Li, Qiang and Lu, Qi and Zhu, Jingbo and Ren, Feiliang and Wang, Huizhen},
title = {The NiuTrans Machine Translation System for CWMT2011},
booktitle = {Proceedings of the 6th China Workshop on Machine Translation},
month = {August},
year = {2011},
address = {Xiamen, China},
publisher = {CWMT},
}
@InProceedings{xiao-EtAl:2011:NTCIR,
author = {Xiao, Tong and Li, Qiang and Lu, Qi and Zhang, Hao and Ding, Haibo and Yao, Shujie and Xu, Xiaoming and Fei, Xiaoxu and Zhu, Jingbo and Ren, Feiliang and Wang, Huizhen},
title = {The NiuTrans Machine Translation System for NTCIR-9 PatentMT},
booktitle = {Proceedings of the NTCIR-9 Workshop Meeting},
month = {Dec},
year = {2011},
address = {Tokyo, Japan},
publisher = {NTCIR},
pages = {593--599},
url = {http://research.nii.ac.jp/ntcir/workshop/OnlineProceedings9/NTCIR/04-NTCIR9-PATENTMT-XiaoT.pdf}
}
@InProceedings{xiong-liu-lin:2006:COLACL,
author = {Xiong, Deyi and Liu, Qun and Lin, Shouxun},
title = {Maximum Entropy Based Phrase Reordering Model for Statistical Machine Translation},
booktitle = {Proceedings of the 21st International Conference on Computational Linguistics and 44th Annual Meeting of the Association for Computational Linguistics},
month = {July},
year = {2006},
address = {Sydney, Australia},
publisher = {Association for Computational Linguistics},
pages = {521--528},
url = {http://www.aclweb.org/anthology/P06-1066},
doi = {10.3115/1220175.1220241}
}
@InProceedings{zhang-EtAl:2006:HLT-NAACL06-Main,
author = {Zhang, Hao and Huang, Liang and Gildea, Daniel and Knight, Kevin},
title = {Synchronous Binarization for Machine Translation},
booktitle = {Proceedings of the Human Language Technology Conference of the NAACL, Main Conference},
month = {June},
year = {2006},
address = {New York City, USA},
publisher = {Association for Computational Linguistics},
pages = {256--263},
url = {http://www.aclweb.org/anthology/N/N06/N06-1033}
}
@InProceedings{zollmann-venugopal:2006:WMT,
author = {Zollmann, Andreas and Venugopal, Ashish},
title = {Syntax Augmented Machine Translation via Chart Parsing},
booktitle = {Proceedings on the Workshop on Statistical Machine Translation},
month = {June},
year = {2006},
address = {New York City},
publisher = {Association for Computational Linguistics},
pages = {138--141},
url = {http://www.aclweb.org/anthology/W/W06/W06-3119}
}
%\bibliographystyle{plainnat}
%\bibliographystyle{FG-bibstyle}
%\addcontentsline{toc}{chapter}{Bibliography}
%----------------------------------------------------------------------------------------
% PART
%----------------------------------------------------------------------------------------
\part{Hahaha~~A Hearty Laugh}
%----------------------------------------------------------------------------------------
% CHAPTER 1
%----------------------------------------------------------------------------------------
\chapterimage{chapter_head_2.pdf} % Chapter heading image
\chapter{I Am Jiang Yingjun}
\section{My Surname Is Jiang, and I Am Handsome}\index{Paragraphs of Text}
\lipsum[1-7] % Dummy text
%------------------------------------------------
\section{Say That Again?}\index{Citation}
This statement requires citation \cite{article_key}; this one is more specific \cite[162]{book_key}.
%------------------------------------------------
\section{So What?}\index{Lists}
Lists are useful to present information in a concise and/or ordered way\footnote{Footnote example...}.
\subsection{I'm asking you: do you dare say that again?}\index{Lists!Numbered List}
\begin{enumerate}
\item The first item
\item The second item
\item The third item
\end{enumerate}
\subsection{So What?}\index{Lists!Bullet Points}
\begin{itemize}
\item The first item
\item The second item
\item The third item
\end{itemize}
\subsection{Hey, you're asking me? I'm asking you}\index{Lists!Descriptions and Definitions}
\begin{description}
\item[Name] Description
\item[Word] Definition
\item[Comment] Elaboration
\end{description}
\ No newline at end of file
%----------------------------------------------------------------------------------------
% CHAPTER 2
%----------------------------------------------------------------------------------------
\chapterimage{chapter_head_2.pdf} % Chapter heading image
\chapter{In-text Elements}
\section{Theorems}\index{Theorems}
This is an example of theorems.
\subsection{Several equations}\index{Theorems!Several Equations}
This is a theorem consisting of several equations.
\begin{theorem}[Name of the theorem]
In $E=\mathbb{R}^n$ all norms are equivalent. It has the properties:
\begin{align}
& \big| ||\mathbf{x}|| - ||\mathbf{y}|| \big|\leq || \mathbf{x}- \mathbf{y}||\\
& ||\sum_{i=1}^n\mathbf{x}_i||\leq \sum_{i=1}^n||\mathbf{x}_i||\quad\text{where $n$ is a finite integer}
\end{align}
\end{theorem}
\subsection{Single Line}\index{Theorems!Single Line}
This is a theorem consisting of just one line.
\begin{theorem}
A set $\mathcal{D}(G)$ is dense in $L^2(G)$, $|\cdot|_0$.
\end{theorem}
%------------------------------------------------
\section{Definitions}\index{Definitions}
This is an example of a definition. A definition could be mathematical or it could define a concept.
\begin{definition}[Definition name]
Given a vector space $E$, a norm on $E$ is a mapping, denoted $||\cdot||$, from $E$ to $\mathbb{R}^+=[0,+\infty[$, such that:
\begin{align}
& ||\mathbf{x}||=0\ \Rightarrow\ \mathbf{x}=\mathbf{0}\\
& ||\lambda \mathbf{x}||=|\lambda|\cdot ||\mathbf{x}||\\
& ||\mathbf{x}+\mathbf{y}||\leq ||\mathbf{x}||+||\mathbf{y}||
\end{align}
\end{definition}
%------------------------------------------------
\section{Notations}\index{Notations}
\begin{notation}
Given an open subset $G$ of $\mathbb{R}^n$, the set of functions $\varphi$ that are:
\begin{enumerate}
\item of bounded support in $G$;
\item infinitely differentiable;
\end{enumerate}
forms a vector space, denoted by $\mathcal{D}(G)$.
\end{notation}
%------------------------------------------------
\section{Remarks}\index{Remarks}
This is an example of a remark.
\begin{remark}
The concepts presented here are now in conventional employment in mathematics. Vector spaces are taken over the field $\mathbb{K}=\mathbb{R}$, however, established properties are easily extended to $\mathbb{K}=\mathbb{C}$.
\end{remark}
%------------------------------------------------
\section{Corollaries}\index{Corollaries}
This is an example of a corollary.
\begin{corollary}[Corollary name]
The concepts presented here are now in conventional employment in mathematics. Vector spaces are taken over the field $\mathbb{K}=\mathbb{R}$, however, established properties are easily extended to $\mathbb{K}=\mathbb{C}$.
\end{corollary}
%------------------------------------------------
\section{Propositions}\index{Propositions}
This is an example of propositions.
\subsection{Several equations}\index{Propositions!Several Equations}
\begin{proposition}[Proposition name]
It has the properties:
\begin{align}
& \big| ||\mathbf{x}|| - ||\mathbf{y}|| \big|\leq || \mathbf{x}- \mathbf{y}||\\
& ||\sum_{i=1}^n\mathbf{x}_i||\leq \sum_{i=1}^n||\mathbf{x}_i||\quad\text{where $n$ is a finite integer}
\end{align}
\end{proposition}
\subsection{Single Line}\index{Propositions!Single Line}
\begin{proposition}
Let $f,g\in L^2(G)$; if $\forall \varphi\in\mathcal{D}(G)$, $(f,\varphi)_0=(g,\varphi)_0$ then $f = g$.
\end{proposition}
%------------------------------------------------
\section{Examples}\index{Examples}
This is an example of examples.
\subsection{Equation and Text}\index{Examples!Equation and Text}
\begin{example}
Let $G=\{x\in\mathbb{R}^2:|x|<3\}$ and denoted by: $x^0=(1,1)$; consider the function:
\begin{equation}
f(x)=\left\{\begin{aligned} & \mathrm{e}^{|x|} & & \text{if $|x-x^0|\leq 1/2$}\\
& 0 & & \text{if $|x-x^0|> 1/2$}\end{aligned}\right.
\end{equation}
The function $f$ has bounded support; we can take $A=\{x\in\mathbb{R}^2:|x-x^0|\leq 1/2+\epsilon\}$ for all $\epsilon\in\intoo{0}{5/2-\sqrt{2}}$.
\end{example}
\subsection{Paragraph of Text}\index{Examples!Paragraph of Text}
\begin{example}[Example name]
\lipsum[2]
\end{example}
%------------------------------------------------
\section{Exercises}\index{Exercises}
This is an example of an exercise.
\begin{exercise}
This is a good place to ask a question to test learning progress or further cement ideas into students' minds.
\end{exercise}
%------------------------------------------------
\section{Problems}\index{Problems}
\begin{problem}
What is the average airspeed velocity of an unladen swallow?
\end{problem}
%------------------------------------------------
\section{Vocabulary}\index{Vocabulary}
Define a word to improve a student's vocabulary.
\begin{vocabulary}[Word]
Definition of word.
\end{vocabulary}
%----------------------------------------------------------------------------------------
% PART
%----------------------------------------------------------------------------------------
\part{Part Two}
\ No newline at end of file
%----------------------------------------------------------------------------------------
% CHAPTER 3
%----------------------------------------------------------------------------------------
\renewcommand\figurename{图}% use 图 as the figure caption label
\renewcommand\tablename{表}% use 表 as the table caption label
\chapterimage{chapter_head_1.pdf} % Chapter heading image
\chapter{Word-Based Translation Models}
\hspace{2em}Statistical machine translation and neural machine translation are the dominant paradigms today. Each has its own strengths and weaknesses, and neither is absolutely superior; from a research point of view, however, neural machine translation is generally closer to the frontier. This chapter introduces the pioneering work of statistical machine translation, the IBM models, which show how translation can be modeled in terms of words. The IBM models were proposed by Peter F. Brown et al. in 1993 and described in detail in the paper ``The Mathematics of Statistical Machine Translation: Parameter Estimation''. The vision of that paper and its definition of the problem went far beyond what most researchers could see at the time, and the methods derived from it, together with the new problems it raised, kept the community busy for nearly a decade.
\section{What Is a Word-Based Translation Model}\index{Chapter3.1}% the purpose of \index is still unclear
\hspace{2em}In machine translation we want to obtain a target-language translation of a source-language sentence, but the machine does not know how to translate. The first problem we face is therefore: how do we model translation? From a computational point of view, the goal of modeling is to turn an abstract problem into a computable one, so a core question of machine translation is how to turn translation into a computable model or process.
\noindent\hspace{2em}How, then, does a word-based statistical machine translation model describe translation? The IBM models take the following view: to translate a source sentence, one usually translates each source word into a corresponding target word and then adjusts the order of these words to obtain the final translation. Translating via word correspondences seems natural to humans, but not necessarily to machines.
\noindent\hspace{2em}Here is an example of translating via word correspondences. Figure \ref{figureC3.1} shows a Chinese-to-English case in which the source sentence is ``我 对 你 感到 满意''. First, the source words ``我'', ``对'', ``你'', ``感到'', and ``满意'' are translated into ``I'', ``with'', ``you'', ``am'', and ``satisfied'', respectively; then the word order is adjusted, e.g., ``am'' is placed in the second position and ``you'' at the end; finally, we obtain the translation ``I am satisfied with you''.
% a blank line starts a new paragraph; \noindent cancels the first-line indent; \hspace{} sets the indent width; 1em equals two English characters or one Chinese character
%----------------------------------------------
% 图3.1
\begin{figure}[htp]
\centering
\definecolor{ublue}{rgb}{0.152,0.250,0.545}
\begin{tikzpicture}
\begin{scope}
\node [anchor=west] (s1) at (0,0) {};
\node [anchor=west] (s2) at ([xshift=0.5em]s1.east) {};
\node [anchor=west] (s3) at ([xshift=0.5em]s2.east) {};
\node [anchor=west] (s4) at ([xshift=0.5em]s3.east) {感到};
\node [anchor=west] (s5) at ([xshift=0.5em]s4.east) {满意};
\end{scope}
\begin{scope}[yshift=-3em]
\node [anchor=west] (t1) at (0.35em,0) {I};
\node [anchor=west] (t2) at ([xshift=0.3em,yshift=-0.1em]t1.east) {am};
\node [anchor=west] (t3) at ([xshift=0.3em,yshift=0.1em]t2.east) {satisfied};
\node [anchor=west] (t4) at ([xshift=0.3em]t3.east) {with};
\node [anchor=west] (t5) at ([xshift=0.3em,yshift=-0.2em]t4.east) {you};
\end{scope}
{
\draw [-,thick,ublue,dashed] (s1.south) -- (t1.north);
\draw [-,thick,ublue,dashed] (s4.south) -- ([yshift=0.3em]t2.north);
\draw [-,thick,ublue,dashed] (s2.south) ..controls +(south:1em) and +(north:1em).. (t4.north);
\draw [-,thick,ublue,dashed] (s3.south) ..controls +(south:0.5em) and +(north:1.5em).. (t5.north);
\draw [-,thick,ublue,dashed] (s5.south) -- (t3.north);
}
\end{tikzpicture}
\caption{Example: translation based on word correspondences.}
\label{figureC3.1}
\end{figure}
%-------------------------------------------
\noindent\hspace{2em}Traditionally, word-based translation is viewed as the following three steps, illustrated in Figure \ref{figureC3.2}.
\noindent\hspace{2em}First, analysis: segmenting the source or target sentence into the smallest units that can be processed. In a word-based translation model the smallest unit is the word, so analysis can here simply be understood as word segmentation.
\noindent\hspace{2em}Second, transfer: translating every word of the source sentence into a target-language word, as illustrated in Figure \ref{figureC3.1}.
\noindent\hspace{2em}Third, generation: based on the transfer result, turning the target words into a fluent and grammatical sentence.
%----------------------------------------------
% 图3.2
\begin{figure}[htp]
\centering
\begin{tikzpicture}
\definecolor{ugreen}{rgb}{0,0.5,0}
\definecolor{ublue}{rgb}{0.152,0.250,0.545}
\node [anchor=west,draw,thick,minimum width=6.7em,minimum height=0.8em] (sent) at (0,0) {};
\node [anchor=west,draw,thick,circle,minimum size=0.3em,red] (s1) at ([yshift=-1.5em]sent.south west) {};
\node [anchor=west,draw,thick,circle,minimum size=0.3em,ugreen] (s2) at ([xshift=0.4em]s1.east) {};
\node [anchor=west,draw,thick,circle,minimum size=0.3em,orange] (s3) at ([xshift=0.4em]s2.east) {};
\node [anchor=west,draw,thick,circle,minimum size=0.3em,ublue] (s4) at ([xshift=0.4em]s3.east) {};
\node [anchor=west,draw,thick,circle,minimum size=0.3em,purple] (s5) at ([xshift=0.4em]s4.east) {};
{
\node [anchor=west,draw,thick,circle,minimum size=0.3em,red,fill=red] (t1) at ([yshift=-2.0em]s1.west) {};
\node [anchor=west,draw,thick,circle,minimum size=0.3em,ugreen,fill=ugreen] (t2) at ([xshift=0.4em]t1.east) {};
\node [anchor=west,draw,thick,circle,minimum size=0.3em,orange,fill=orange] (t3) at ([xshift=0.4em]t2.east) {};
\node [anchor=west,draw,thick,circle,minimum size=0.3em,ublue,fill=ublue] (t4) at ([xshift=0.4em]t3.east) {};
\node [anchor=west,draw,thick,circle,minimum size=0.3em,purple,fill=purple] (t5) at ([xshift=0.4em]t4.east) {};
}
{
\node [anchor=west,draw,thick,circle,minimum size=0.3em,red,fill=red] (ft1) at ([yshift=-2.0em]t1.west) {};
\node [anchor=west,draw,thick,circle,minimum size=0.3em,ublue,fill=ublue] (ft2) at ([xshift=0.4em]ft1.east) {};
\node [anchor=west,draw,thick,circle,minimum size=0.3em,purple,fill=purple] (ft3) at ([xshift=0.4em]ft2.east) {};
\node [anchor=west,draw,thick,circle,minimum size=0.3em,ugreen,fill=ugreen] (ft4) at ([xshift=0.4em]ft3.east) {};
\node [anchor=west,draw,thick,circle,minimum size=0.3em,orange,fill=orange] (ft5) at ([xshift=0.4em]ft4.east) {};
}
\draw [->,thick,double] ([yshift=-0.1em]sent.south) -- ([yshift=-0.8em]sent.south);
{
\draw [->,thick] ([yshift=-0.1em]s1.south) -- ([yshift=0.1em]t1.north);
\draw [->,thick] ([yshift=-0.1em]s2.south) -- ([yshift=0.1em]t2.north);
\draw [->,thick] ([yshift=-0.1em]s3.south) -- ([yshift=0.1em]t3.north);
\draw [->,thick] ([yshift=-0.1em]s4.south) -- ([yshift=0.1em]t4.north);
\draw [->,thick] ([yshift=-0.1em]s5.south) -- ([yshift=0.1em]t5.north);
}
{
\draw [->,thick] ([yshift=-0.1em]t1.south) -- ([yshift=0.1em]ft1.north);
\draw [->,thick] ([yshift=-0.1em]t2.south) -- ([yshift=0.1em]ft4.north);
\draw [->,thick] ([yshift=-0.1em]t3.south) -- ([yshift=0.1em]ft5.north);
\draw [->,thick] ([yshift=-0.1em]t4.south) -- ([yshift=0.1em]ft2.north);
\draw [->,thick] ([yshift=-0.1em]t5.south) -- ([yshift=0.1em]ft3.north);
}
{
\node [anchor=north west] (label1) at ([yshift=0.3em]sent.south east) {{\scriptsize \textbf{Analysis}}};
\node [anchor=north west] (label2) at ([yshift=-0.5em]label1.south west) {{\scriptsize \textbf{Transfer}}};
\node [anchor=north west] (label3) at ([yshift=-0.5em]label2.south west) {{\scriptsize \textbf{Generation}}};
}
\end{tikzpicture}
\caption{Example: the word-based translation process.}
\label{figureC3.2}
\end{figure}
%---------------------------
\noindent\hspace{2em}Even from today's perspective, analysis, transfer, and generation remain a profound idea. The same process underlies many tasks, such as syntactic parsing. Even neural machine translation, viewed at a high level, still performs analysis, transfer, and generation; some of the steps are simply implicit in the design of the neural network.
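\noindent\hspace{2em}To make the three steps concrete, here is a minimal sketch in Python; the toy dictionary and the fixed reordering are illustrative assumptions only, not the book's actual method:
\begin{verbatim}
# A toy analysis-transfer-generation pipeline (illustrative only).
TOY_DICT = {"我": "I", "对": "with", "你": "you",
            "感到": "am", "满意": "satisfied"}

def analyze(sentence):
    # Analysis: split the pre-segmented sentence into words.
    return sentence.split()

def transfer(words):
    # Transfer: translate each source word with the toy dictionary.
    return [TOY_DICT[w] for w in words]

def generate(words, order):
    # Generation: reorder the target words into a fluent sentence.
    return " ".join(words[i] for i in order)

src = "我 对 你 感到 满意"
print(generate(transfer(analyze(src)), order=[0, 3, 4, 1, 2]))
# -> I am satisfied with you
\end{verbatim}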
\section{Building a Simple Machine Translation System}\index{Chapter3.2}% the purpose of \index is still unclear
\noindent\hspace{2em}This section first compares the workflows of human and machine translation, from which we can distill the two main stages of building a machine translation system: training and decoding. It then describes how to build a simple machine translation system in terms of word translation probabilities, a sentence-level translation model, and decoding; the first two belong to the training stage.
\subsection{The Idea behind Machine Translation}\index{Chapter3.2.1}
\subsubsection{The Human Translation Workflow}\index{Chapter3.2.1.1}
\noindent\hspace{2em}When we translate a sentence, we first quickly analyze which words it consists of, then, drawing on prior knowledge, obtain candidate translations for each word, and finally piece together a translation based on our understanding of those words. This describes the basic process of human translation. It is not a rigorous account from psychology or neuroscience, but it at least helps us understand how humans translate.
%----------------------------------------------
% 图3.3
\begin{figure}[htp]
\centering
\definecolor{ublue}{rgb}{0.152,0.250,0.545}
\begin{tikzpicture}
\begin{scope}
\node [anchor=west] (s1) at (0,0) {\textbf{}};
\node [anchor=west] (s2) at ([xshift=2em]s1.east) {\textbf{}};
\node [anchor=west] (s3) at ([xshift=2em]s2.east) {\textbf{}};
\node [anchor=west] (s4) at ([xshift=2em]s3.east) {\textbf{表示}};
\node [anchor=west] (s5) at ([xshift=2em]s4.east) {\textbf{满意}};
\node [anchor=south west] (sentlabel) at ([yshift=-0.5em]s1.north west) {\scriptsize{\textbf{\color{red}{Sentence to be translated (already segmented):}}}};
{
\draw [->,very thick,ublue] (s1.south) -- ([yshift=-0.7em]s1.south);
\draw [->,very thick,ublue] (s2.south) -- ([yshift=-0.7em]s2.south);
\draw [->,very thick,ublue] (s3.south) -- ([yshift=-0.7em]s3.south);
\draw [->,very thick,ublue] (s4.south) -- ([yshift=-0.7em]s4.south);
\draw [->,very thick,ublue] (s5.south) -- ([yshift=-0.7em]s5.south);
{\small
\node [anchor=north,inner sep=2pt,fill=red!20,minimum height=1.5em,minimum width=2.5em] (t11) at ([yshift=-1em]s1.south) {I};
\node [anchor=north,inner sep=2pt,fill=red!20,minimum height=1.5em,minimum width=2.5em] (t12) at ([yshift=-0.2em]t11.south) {me};
\node [anchor=north,inner sep=2pt,fill=red!20,minimum height=1.5em,minimum width=2.5em] (t13) at ([yshift=-0.2em]t12.south) {I'm};
\node [anchor=north west,inner sep=1pt,fill=black] (tl11) at (t11.north west) {\tiny{{\color{white} \textbf{1}}}};
\node [anchor=north west,inner sep=1pt,fill=black] (tl12) at (t12.north west) {\tiny{{\color{white} \textbf{1}}}};
\node [anchor=north west,inner sep=1pt,fill=black] (tl13) at (t13.north west) {\tiny{{\color{white} \textbf{1}}}};
\node [anchor=north,inner sep=2pt,fill=green!20,minimum height=1.5em,minimum width=2.5em] (t21) at ([yshift=-1em]s2.south) {to};
\node [anchor=north,inner sep=2pt,fill=green!20,minimum height=1.5em,minimum width=2.5em] (t22) at ([yshift=-0.2em]t21.south) {with};
\node [anchor=north,inner sep=2pt,fill=green!20,minimum height=1.5em,minimum width=2.5em] (t23) at ([yshift=-0.2em]t22.south) {for};
\node [anchor=north west,inner sep=1pt,fill=black] (tl21) at (t21.north west) {\tiny{{\color{white} \textbf{2}}}};
\node [anchor=north west,inner sep=1pt,fill=black] (tl22) at (t22.north west) {\tiny{{\color{white} \textbf{2}}}};
\node [anchor=north west,inner sep=1pt,fill=black] (tl23) at (t23.north west) {\tiny{{\color{white} \textbf{2}}}};
\node [anchor=north,inner sep=2pt,fill=blue!20,minimum height=1.5em,minimum width=2.5em] (t31) at ([yshift=-1em]s3.south) {you};
\node [anchor=north west,inner sep=1pt,fill=black] (tl31) at (t31.north west) {\tiny{{\color{white} \textbf{3}}}};
\node [anchor=north,inner sep=2pt,fill=orange!20,minimum height=1.5em,minimum width=3em] (t41) at ([yshift=-1em]s4.south) {$\phi$};
\node [anchor=north,inner sep=2pt,fill=orange!20,minimum height=1.5em,minimum width=3em] (t42) at ([yshift=-0.2em]t41.south) {show};
\node [anchor=north west,inner sep=1pt,fill=black] (tl41) at (t41.north west) {\tiny{{\color{white} \textbf{4}}}};
\node [anchor=north west,inner sep=1pt,fill=black] (tl42) at (t42.north west) {\tiny{{\color{white} \textbf{4}}}};
\node [anchor=north,inner sep=2pt,fill=purple!20,minimum height=1.5em,minimum width=4.5em] (t51) at ([yshift=-1em]s5.south) {satisfy};
\node [anchor=north,inner sep=2pt,fill=purple!20,minimum height=1.5em,minimum width=4.5em] (t52) at ([yshift=-0.2em]t51.south) {satisfied};
\node [anchor=north,inner sep=2pt,fill=purple!20,minimum height=1.5em,minimum width=4.5em] (t53) at ([yshift=-0.2em]t52.south) {satisfies};
\node [anchor=north west,inner sep=1pt,fill=black] (tl51) at (t51.north west) {\tiny{{\color{white} \textbf{5}}}};
\node [anchor=north west,inner sep=1pt,fill=black] (tl52) at (t52.north west) {\tiny{{\color{white} \textbf{5}}}};
\node [anchor=north west,inner sep=1pt,fill=black] (tl53) at (t53.north west) {\tiny{{\color{white} \textbf{5}}}};
}
}
\end{scope}
\begin{scope}
{\small
{
\node [anchor=west,inner sep=2pt,fill=red!20,minimum height=1.5em,minimum width=2.5em] (ft11) at ([yshift=-1.2in]t11.west) {I'm};
\node [anchor=center,inner sep=2pt,fill=purple!20,minimum height=1.5em,minimum width=5em] (ft12) at ([xshift=5.0em]ft11.center) {satisfied};
\node [anchor=center,inner sep=2pt,fill=green!20,minimum height=1.5em,minimum width=2.5em] (ft13) at ([xshift=5.0em]ft12.center) {with};
\node [anchor=center,inner sep=2pt,fill=blue!20,minimum height=1.5em,minimum width=2.5em] (ft14) at ([xshift=4.0em]ft13.center) {you};
}
{
\node [anchor=north west,inner sep=1pt,fill=black] (ftl11) at (ft11.north west) {\tiny{{\color{white} \textbf{1}}}};
\node [anchor=north west,inner sep=1pt,fill=black] (ftl12) at (ft12.north west) {\tiny{{\color{white} \textbf{5}}}};
\node [anchor=north west,inner sep=1pt,fill=black] (ftl13) at (ft13.north west) {\tiny{{\color{white} \textbf{2}}}};
\node [anchor=north west,inner sep=1pt,fill=black] (ftl14) at (ft14.north west) {\tiny{{\color{white} \textbf{3}}}};
}
{
\draw [->,thick] ([yshift=-0.1em]t13.south) -- ([yshift=0.1em]ft11.north);
\draw [->,thick] ([yshift=0.1em]t22.south east) ..controls +(280:3em) and +(north:3em).. ([yshift=0.1em]ft13.north);
\draw [->,thick] ([yshift=-0.1em,xshift=0.2em]t31.south west) ..controls +(south:3em) and +(north:3em).. ([yshift=0.1em,xshift=0.2em]ft14.north west);
\draw [->,thick] ([yshift=0.1em]t52.south west) ..controls +(250:4em) and +(north:4em).. ([yshift=0.1em]ft12.north);
\node [anchor=east,inner sep=1pt] (nulltranslabel) at (t42.south west) {\scriptsize{\textbf{translates to null}}};
\draw [->,thick] ([yshift=0.1em]t41.south west) ..controls +(250:1em) and +(north:1em).. (nulltranslabel.north);
}
}
\end{scope}
\begin{scope}
{
\node [anchor=north west] (label1) at (ft11.south west) {\small{Choose the best word translations, adjust the order, and obtain a perfect result}};
}
{
\draw[decorate,thick,decoration={brace,amplitude=5pt,mirror}] ([yshift=8em,xshift=-0.5em]t13.south west) -- ([xshift=-0.5em]t13.south west) node [pos=0.5,left,xshift=-0.5em,yshift=0.5em] (label2) {\footnotesize{\textbf{Learned}}};
\node [anchor=north west] (label2part2) at ([yshift=0.3em]label2.south west) {\footnotesize{\textbf{word translations}}};
}
{
\draw[decorate,thick,decoration={brace,amplitude=5pt,mirror}] ([yshift=-0.2em,xshift=-0.5em]t13.south west) -- ([yshift=-5em,xshift=-0.5em]t13.south west) node [pos=0.5,left,xshift=-0.5em,yshift=0.5em] (label3) {\footnotesize{\textbf{Apply knowledge}}};
\node [anchor=north west] (label3part2) at ([yshift=0.3em]label3.south west) {\footnotesize{\textbf{to generate the translation}}};
}
\end{scope}
\end{tikzpicture}
\caption{Example: the human translation process.}
\label{figureC3.3}
\end{figure}
%---------------------------
\noindent\hspace{2em}Figure \ref{figureC3.3} illustrates the human translation process. The sentence to be translated is ``我 对 你 表示 满意'', and one possible translation is ``I'm satisfied with you''. The process can be summarized as the following two steps.
\noindent\hspace{2em}First, learning word translations, i.e., acquiring the candidate translations of every word in the sentence. For instance, possible translations of ``对'' include ``to'', ``with'', and ``for''. A person acquires such knowledge through reading, memorization, exercises, or instruction, and it is stored in the brain as correspondences between source-language and target-language words. We call this the learning process.
\noindent\hspace{2em}Second, applying knowledge to generate a translation. When translating a sentence never seen in a textbook, we may recall translation relations between words, common collocations, and grammatical knowledge such as subject-verb-object structure (for example, ``satisfied'' usually takes the preposition ``with'' when expressing satisfaction with a person), and use this knowledge to generate a translation quickly.
\subsubsection{The Machine Translation Workflow}\index{Chapter3.2.1.2}
\noindent\hspace{2em}How, then, does a machine translate? A machine may not be as intelligent as a human: on the one hand, no teacher hands it convenient rules that would let it produce a translation quickly; on the other hand, the machine is rather clumsy: even if it knows the translation of every word, it does not know how to assemble those translations into a sentence, let alone which assembled results are correct. To see the machine's predicament more clearly, we summarize the problems it must solve as follows.
\noindent\hspace{2em}Problem 1: how can word translations be assembled into a sentence?
\noindent\hspace{2em}Problem 2: if they can, how do we judge how good the assembled results are?
\noindent\hspace{2em}Regarding Problem 1: what machines do best is computation, and assembling a translation can be viewed as walking a path. The blue and red lines in Figure 3.4 are two such paths of translation choices; they differ in the candidates chosen for ``满意'' and ``对'': the blue path picks ``satisfy'' and ``to'', while the red one picks ``satisfied'' and ``with''. In other words, a translation is a path of translation choices; different translations correspond to different paths, and word order may differ as well. With its computational power, a machine can explore a great many such paths, as the sketch below illustrates.
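\noindent\hspace{2em}The following sketch (with toy candidate lists assumed purely for illustration; a real decoder would also permute word order and score each result) enumerates such paths by taking every combination of word-level candidates:
\begin{verbatim}
from itertools import product

# Toy candidate lists for each source word (illustrative only).
candidates = {
    "我":  ["I", "me", "I'm"],
    "对":  ["to", "with", "for"],
    "你":  ["you"],
    "感到": ["am", "feel"],
    "满意": ["satisfy", "satisfied"],
}
src = ["我", "对", "你", "感到", "满意"]

# Each element of the Cartesian product is one path of
# translation choices: 3*3*1*2*2 = 36 paths in total.
for path in product(*(candidates[w] for w in src)):
    print(" ".join(path))
\end{verbatim}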
%----------- formula example
\subsubsection{How Can We Learn from Bilingual Parallel Data?}\index{Chapter3.2.3.2}
\noindent\hspace{2em}In the dice game of the previous section, we rolled the dice many times, counted how often each face from ``1'' to ``6'' appeared, and divided by the total number of rolls to obtain approximate probabilities of the outcomes. We estimate word translation probabilities in the same spirit. Let $x$ be an arbitrary source-language word; any target-language word $y \in Y$ may be its translation. Given a mutually translated sentence pair $(s,t)$, we define $\textrm{P}(x \leftrightarrow y; s, t)$ as the probability that $x$ and $y$ are translations of each other within $(s,t)$, where $x$ is a word of sentence $s$ and $y$ a word of $t$. It is computed as in Equation \ref{eqC3.1}.
\begin{equation}
\textrm{P}(x \leftrightarrow y; s,t) \equiv \textrm{P}(x,y;s,t) = \frac{c(x,y;s,t)}{\sum_{x',y'} c(x',y';s,t)}
\label{eqC3.1}
\end{equation}
\noindent\hspace{2em}In the equation above, the numerator $c(x,y;s,t)$ is the number of times $x$ and $y$ co-occur in the sentence pair $(s,t)$ \cite{Aho-Ullman:1969:JCSS}, and the denominator $\sum_{x',y'} c(x',y';s,t)$ is the total number of co-occurrences of any $x'$ and $y'$ in $(s,t)$ \cite{book_key}. To make the computation of $c(x,y;s,t)$ clearer, the algorithm in Figure 3.7 spells it out, and a small sketch follows below.
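\noindent\hspace{2em}As a minimal sketch of Equation \ref{eqC3.1} for a single sentence pair (assuming, as the equation does, that every word of $s$ may pair with every word of $t$):
\begin{verbatim}
from collections import Counter

def cooccurrence_counts(s, t):
    # c(x,y;s,t): count every pairing of a word in s with a word in t.
    return Counter((x, y) for x in s for y in t)

def translation_prob(x, y, s, t):
    # Equation 3.1: c(x,y;s,t) normalized by the total count
    # over all word pairs (x', y') in (s, t).
    counts = cooccurrence_counts(s, t)
    return counts[(x, y)] / sum(counts.values())

s = "我 对 你 感到 满意".split()
t = "I am satisfied with you".split()
print(translation_prob("满意", "satisfied", s, t))  # 1/25 = 0.04
\end{verbatim}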
\noindent\hspace{2em}The following is a sample table, borrowed from a colleague's paper since no table has been used so far; see Table \ref{table-ner}.
%----------------------
\begin{table}[htp]
\small
\begin{center}
\begin{tabular}{l|l}
\hline \bf Model & \bf F1 \\
\hline
\textit{best published} \\
BiLSTM-CRF \cite{lample-etal-2016-neural} & 90.94 \\
BiLSTM-CRF+ELMo \cite{peters-etal-2018-deep} & 92.22 \\
BERT Base \cite{devlin2018bert} & 92.40 \\
BERT Large \cite{devlin2018bert} & 92.80 \\
BiLSTM-CRF+PCE \cite{Akbik2019PooledCE} & 93.18 \\
\hline
Random RNNs w/o pre-trained LM & 90.64 \\
DARTS w/o pre-trained LM & 91.05 \\
I-DARTS ($n=2$) w/o pre-trained LM & 90.96 \\
I-DARTS ($n=1$) w/o pre-trained LM & 91.23 \\
\hline
Random RNNs & 92.89 \\
DARTS & 93.13 \\
I-DARTS ($n=2$) & 93.14 \\
I-DARTS ($n=1$) & 93.47 \\
\hline
\end{tabular}
\end{center}
\caption{F1 scores on the CoNLL-2003 English NER test set.}
\label{table-ner}
\end{table}
%----------------------------------------------------------------------------------------
% INDEX
%----------------------------------------------------------------------------------------
\cleardoublepage % Make sure the index starts on an odd (right side) page
\phantomsection
\setlength{\columnsep}{0.75cm} % Space between the 2 columns of the index
\addcontentsline{toc}{chapter}{\textcolor{ocre}{Index}} % Add an Index heading to the table of contents
\printindex % Output the index
%----------------------------------------------------------------------------------------
\ No newline at end of file
%----------------------------------------------------------------------------------------
% TITLE PAGE
%----------------------------------------------------------------------------------------
\begingroup
\thispagestyle{empty} % Suppress headers and footers on the title page
\begin{tikzpicture}[remember picture,overlay]
\node[inner sep=0pt] (background) at (current page.center) {\includegraphics[width=\paperwidth]{background.pdf}};
\draw (current page.center) node [fill=ocre!30!white,fill opacity=0.6,text opacity=1,inner sep=1cm]{\Huge\centering\bfseries\sffamily\parbox[c][][t]{\paperwidth}{\centering The Search for a Title\\[15pt] % Book title
{\Large A Profound Subtitle}\\[20pt] % Subtitle
{\huge Dr. John Smith}}}; % Author name
\end{tikzpicture}
\vfill
\endgroup
%----------------------------------------------------------------------------------------
% COPYRIGHT PAGE
%----------------------------------------------------------------------------------------
\newpage
~\vfill
\thispagestyle{empty}
\noindent Copyright \copyright\ 2019 John Smith\\ % Copyright notice
\noindent \textsc{Published by Publisher}\\ % Publisher
\noindent \textsc{book-website.com}\\ % URL
\noindent Licensed under the Creative Commons Attribution-NonCommercial 3.0 Unported License (the ``License''). You may not use this file except in compliance with the License. You may obtain a copy of the License at \url{http://creativecommons.org/licenses/by-nc/3.0}. Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \textsc{``as is'' basis, without warranties or conditions of any kind}, either express or implied. See the License for the specific language governing permissions and limitations under the License.\\ % License information, replace this with your own license (if any)
\noindent \textit{First printing, March 2019} % Printing/edition date
%----------------------------------------------------------------------------------------
% TABLE OF CONTENTS
%----------------------------------------------------------------------------------------
%\usechapterimagefalse % If you don't want to include a chapter image, use this to toggle images off - it can be enabled later with \usechapterimagetrue
\chapterimage{chapter_head_1.pdf} % Table of contents heading image
\pagestyle{empty} % Disable headers and footers for the following pages
\tableofcontents % Print the table of contents itself
\cleardoublepage % Forces the first chapter to start on an odd page so it's on the right side of the book
\pagestyle{fancy} % Enable headers and footers again
\ No newline at end of file
......@@ -2212,6 +2212,131 @@ year ={2008},
//biburl = {https://dblp.org/rec/conf/acl/ZhuX11.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/tit/Viterbi67,
author = {Andrew J. Viterbi},
title = {Error bounds for convolutional codes and an asymptotically optimum
decoding algorithm},
journal = {{IEEE} Trans. Inf. Theory},
volume = {13},
number = {2},
pages = {260--269},
year = {1967},
url = {https://doi.org/10.1109/TIT.1967.1054010},
doi = {10.1109/TIT.1967.1054010},
timestamp = {Tue, 10 Mar 2020 10:46:11 +0100},
biburl = {https://dblp.org/rec/journals/tit/Viterbi67.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/acl/OchN02,
author = {Franz Josef Och and
Hermann Ney},
title = {Discriminative Training and Maximum Entropy Models for Statistical
Machine Translation},
booktitle = {Proceedings of the 40th Annual Meeting of the Association for Computational
Linguistics, July 6-12, 2002, Philadelphia, PA, {USA}},
pages = {295--302},
publisher = {{ACL}},
year = {2002},
url = {https://www.aclweb.org/anthology/P02-1038/},
timestamp = {Tue, 17 Sep 2019 13:40:53 +0200},
biburl = {https://dblp.org/rec/conf/acl/OchN02.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/coling/OchN04,
author = {Franz Josef Och and
Hermann Ney},
title = {The Alignment Template Approach to Statistical Machine Translation},
journal = {Computational Linguistics},
volume = {30},
number = {4},
pages = {417--449},
year = {2004},
url = {https://doi.org/10.1162/0891201042544884},
doi = {10.1162/0891201042544884},
timestamp = {Sun, 02 Jun 2019 20:56:04 +0200},
biburl = {https://dblp.org/rec/journals/coling/OchN04.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/naacl/KumarB05,
author = {Shankar Kumar and
William J. Byrne},
title = {Local Phrase Reordering Models for Statistical Machine Translation},
booktitle = {{HLT/EMNLP} 2005, Human Language Technology Conference and Conference
on Empirical Methods in Natural Language Processing, Proceedings of
the Conference, 6-8 October 2005, Vancouver, British Columbia, Canada},
pages = {161--168},
publisher = {The Association for Computational Linguistics},
year = {2005},
url = {https://www.aclweb.org/anthology/H05-1021/},
timestamp = {Mon, 16 Sep 2019 17:08:53 +0200},
biburl = {https://dblp.org/rec/conf/naacl/KumarB05.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/naacl/ZhangHGK06,
author = {Hao Zhang and
Liang Huang and
Daniel Gildea and
Kevin Knight},
editor = {Robert C. Moore and
Jeff A. Bilmes and
Jennifer Chu{-}Carroll and
Mark Sanderson},
title = {Synchronous Binarization for Machine Translation},
booktitle = {Human Language Technology Conference of the North American Chapter
of the Association of Computational Linguistics, Proceedings, June
4-9, 2006, New York, New York, {USA}},
publisher = {The Association for Computational Linguistics},
year = {2006},
url = {https://www.aclweb.org/anthology/N06-1033/},
timestamp = {Mon, 16 Sep 2019 17:08:53 +0200},
biburl = {https://dblp.org/rec/conf/naacl/ZhangHGK06.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{marcu2006practical,
title="Practical structured learning techniques for natural language processing",
author="Daniel {Marcu} and Harold Charles {Daume}",
journal="Ph.D. thesis, University of Southern California, Los Angeles, CA",
notes="Sourced from Microsoft Academic - https://academic.microsoft.com/paper/2112648537",
year="2006"
}
@article{denero2010phrase,
title="Phrase Alignment Models for Statistical Machine Translation",
author="John Sturdy {DeNero}",
journal="Ph.D. thesis, UC Berkeley",
notes="Sourced from Microsoft Academic - https://academic.microsoft.com/paper/170557285",
year="2010"
}
@article{xue2005building,
title={The Penn Chinese TreeBank: Phrase structure annotation of a large corpus},
author={Xue, Nianwen and Xia, Fei and Chiou, Fu-Dong and Palmer, Martha},
journal={Natural Language Engineering},
volume={11},
number={2},
pages={207--238},
year={2005}
}
@article{DBLP:journals/coling/MarcusSM94,
author = {Mitchell P. Marcus and
Beatrice Santorini and
Mary Ann Marcinkiewicz},
title = {Building a Large Annotated Corpus of English: The Penn Treebank},
journal = {Computational Linguistics},
volume = {19},
number = {2},
pages = {313--330},
year = {1993},
timestamp = {Tue, 06 Dec 2005 15:19:37 +0100},
biburl = {https://dblp.org/rec/journals/coling/MarcusSM94.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
%%%%% chapter 4------------------------------------------------------
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
......@@ -4035,5 +4160,2189 @@ pages ={157-166},
}
%%%%% chapter 6----------------------------------------------------------------
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%% chapter 7------------------------------------------------------
@article{姚树杰2011,
title={基于句对质量和覆盖度的统计机器翻译训练语料选取},
author={姚树杰 and 肖桐 and 朱靖波},
journal={中文信息学报},
volume={25},
number={2},
pages={72-78},
year={2011},
}
%%%%%%%%%%%%%%%
@misc{provilkov2019bpedropout,
title={BPE-Dropout: Simple and Effective Subword Regularization},
author={Ivan Provilkov and Dmitrii Emelianenko and Elena Voita},
year={2019},
//eprint={1910.13267},
//archivePrefix={arXiv},
//primaryClass={cs.CL}
}
%%%%%%%%%%%%%%%%%%%
@article{DBLP:journals/corr/SennrichHB15,
author = {Rico Sennrich and
Barry Haddow and
Alexandra Birch},
title = {Neural Machine Translation of Rare Words with Subword Units},
journal = {CoRR},
volume = {abs/1508.07909},
year = {2015},
//url = {http://arxiv.org/abs/1508.07909},
//archivePrefix = {arXiv},
//eprint = {1508.07909},
//timestamp = {Mon, 13 Aug 2018 16:47:17 +0200},
//biburl = {https://dblp.org/rec/journals/corr/SennrichHB15.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-1207-0580,
author = {Geoffrey E. Hinton and
Nitish Srivastava and
Alex Krizhevsky and
Ilya Sutskever and
Ruslan Salakhutdinov},
title = {Improving neural networks by preventing co-adaptation of feature detectors},
journal = {CoRR},
volume = {abs/1207.0580},
year = {2012},
//url = {http://arxiv.org/abs/1207.0580},
//archivePrefix = {arXiv},
//eprint = {1207.0580},
//timestamp = {Mon, 13 Aug 2018 16:46:10 +0200},
//biburl = {https://dblp.org/rec/journals/corr/abs-1207-0580.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{Hornic1989Multilayer,
title={Multilayer feedforward networks are universal approximators},
author={Hornik, Kurt and Stinchcombe, Maxwell and White, Halbert},
journal={Neural Networks},
volume={2},
number={5},
pages={359-366},
year={1989},
}
@article{DBLP:journals/corr/abs-1809-10853,
author = {Alexei Baevski and
Michael Auli},
title = {Adaptive Input Representations for Neural Language Modeling},
journal = {CoRR},
volume = {abs/1809.10853},
year = {2018},
//url = {http://arxiv.org/abs/1809.10853},
//archivePrefix = {arXiv},
//eprint = {1809.10853},
//timestamp = {Fri, 05 Oct 2018 11:34:52 +0200},
//biburl = {https://dblp.org/rec/journals/corr/abs-1809-10853.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{Stahlberg2019OnNS,
title={On NMT Search Errors and Model Errors: Cat Got Your Tongue?},
author={Felix Stahlberg and Bill Byrne},
booktitle={EMNLP/IJCNLP},
year={2019}
}
@article{DBLP:journals/corr/abs-1810-08398,
author = {Mingbo Ma and
Liang Huang and
Hao Xiong and
Kaibo Liu and
Chuanqiang Zhang and
Zhongjun He and
Hairong Liu and
Xing Li and
Haifeng Wang},
title = {{STACL:} Simultaneous Translation with Integrated Anticipation and
Controllable Latency},
journal = {CoRR},
volume = {abs/1810.08398},
year = {2018},
//url = {http://arxiv.org/abs/1810.08398},
//archivePrefix = {arXiv},
//eprint = {1810.08398},
//timestamp = {Thu, 01 Nov 2018 11:22:30 +0100},
//biburl = {https://dblp.org/rec/journals/corr/abs-1810-08398.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/StahlbergHSB17,
author = {Felix Stahlberg and
Eva Hasler and
Danielle Saunders and
Bill Byrne},
title = {{SGNMT} - {A} Flexible {NMT} Decoding Platform for Quick Prototyping
of New Models and Search Strategies},
journal = {CoRR},
volume = {abs/1707.06885},
year = {2017},
//url = {http://arxiv.org/abs/1707.06885},
//archivePrefix = {arXiv},
//eprint = {1707.06885},
//timestamp = {Mon, 13 Aug 2018 16:48:37 +0200},
//biburl = {https://dblp.org/rec/journals/corr/StahlbergHSB17.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/SennrichHB16,
author = {Rico Sennrich and
Barry Haddow and
Alexandra Birch},
title = {Edinburgh Neural Machine Translation Systems for {WMT} 16},
journal = {CoRR},
volume = {abs/1606.02891},
year = {2016},
//url = {http://arxiv.org/abs/1606.02891},
//archivePrefix = {arXiv},
//eprint = {1606.02891},
//timestamp = {Mon, 13 Aug 2018 16:46:23 +0200},
//biburl = {https://dblp.org/rec/journals/corr/SennrichHB16.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/wmt/LiLXLLLWZXWFCLL19,
author = {Bei Li and
Yinqiao Li and
Chen Xu and
Ye Lin and
Jiqiang Liu and
Hui Liu and
Ziyang Wang and
Yuhao Zhang and
Nuo Xu and
Zeyang Wang and
Kai Feng and
Hexuan Chen and
Tengbo Liu and
Yanyang Li and
Qiang Wang and
Tong Xiao and
Jingbo Zhu},
editor = {Ondrej Bojar and
Rajen Chatterjee and
Christian Federmann and
Mark Fishel and
Yvette Graham and
Barry Haddow and
Matthias Huck and
Antonio Jimeno{-}Yepes and
Philipp Koehn and
Andr{\'{e}} Martins and
Christof Monz and
Matteo Negri and
Aur{\'{e}}lie N{\'{e}}v{\'{e}}ol and
Mariana L. Neves and
Matt Post and
Marco Turchi and
Karin Verspoor},
title = {The NiuTrans Machine Translation Systems for {WMT19}},
booktitle = {Proceedings of the Fourth Conference on Machine Translation, {WMT}
2019, Florence, Italy, August 1-2, 2019 - Volume 2: Shared Task Papers,
Day 1},
pages = {257--266},
publisher = {Association for Computational Linguistics},
year = {2019},
//url = {https://doi.org/10.18653/v1/w19-5325},
//doi = {10.18653/v1/w19-5325},
//timestamp = {Tue, 28 Jan 2020 10:30:56 +0100},
//biburl = {https://dblp.org/rec/conf/wmt/LiLXLLLWZXWFCLL19.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/aaai/DabreF19,
author = {Raj Dabre and
Atsushi Fujita},
title = {Recurrent Stacking of Layers for Compact Neural Machine Translation
Models},
booktitle = {The Thirty-Third {AAAI} Conference on Artificial Intelligence, {AAAI}
2019, The Thirty-First Innovative Applications of Artificial Intelligence
Conference, {IAAI} 2019, The Ninth {AAAI} Symposium on Educational
Advances in Artificial Intelligence, {EAAI} 2019, Honolulu, Hawaii,
USA, January 27 - February 1, 2019},
pages = {6292--6299},
publisher = {{AAAI} Press},
year = {2019},
//url = {https://doi.org/10.1609/aaai.v33i01.33016292},
//doi = {10.1609/aaai.v33i01.33016292},
//timestamp = {Wed, 25 Sep 2019 11:05:09 +0200},
//biburl = {https://dblp.org/rec/conf/aaai/DabreF19.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-1712-05877,
author = {Benoit Jacob and
Skirmantas Kligys and
Bo Chen and
Menglong Zhu and
Matthew Tang and
Andrew G. Howard and
Hartwig Adam and
Dmitry Kalenichenko},
title = {Quantization and Training of Neural Networks for Efficient Integer-Arithmetic-Only
Inference},
journal = {CoRR},
volume = {abs/1712.05877},
year = {2017},
//url = {http://arxiv.org/abs/1712.05877},
//archivePrefix = {arXiv},
//eprint = {1712.05877},
//timestamp = {Mon, 13 Aug 2018 16:48:27 +0200},
//biburl = {https://dblp.org/rec/journals/corr/abs-1712-05877.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-1910-10485,
author = {Gabriele Prato and
Ella Charlaix and
Mehdi Rezagholizadeh},
title = {Fully Quantized Transformer for Improved Translation},
journal = {CoRR},
volume = {abs/1910.10485},
year = {2019},
//url = {http://arxiv.org/abs/1910.10485},
//archivePrefix = {arXiv},
//eprint = {1910.10485},
//timestamp = {Fri, 25 Oct 2019 14:59:26 +0200},
//biburl = {https://dblp.org/rec/journals/corr/abs-1910-10485.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-1801-05122,
author = {Xiangwen Zhang and
Jinsong Su and
Yue Qin and
Yang Liu and
Rongrong Ji and
Hongji Wang},
title = {Asynchronous Bidirectional Decoding for Neural Machine Translation},
journal = {CoRR},
volume = {abs/1801.05122},
year = {2018},
//url = {http://arxiv.org/abs/1801.05122},
//archivePrefix = {arXiv},
//eprint = {1801.05122},
//timestamp = {Mon, 15 Jul 2019 14:17:41 +0200},
//biburl = {https://dblp.org/rec/journals/corr/abs-1801-05122.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-1809-00069,
author = {Liang Huang and
Kai Zhao and
Mingbo Ma},
title = {When to Finish? Optimal Beam Search for Neural Text Generation (modulo
beam size)},
journal = {CoRR},
volume = {abs/1809.00069},
year = {2018},
//url = {http://arxiv.org/abs/1809.00069},
//archivePrefix = {arXiv},
//eprint = {1809.00069},
//timestamp = {Fri, 05 Oct 2018 11:34:52 +0200},
//biburl = {https://dblp.org/rec/journals/corr/abs-1809-00069.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/jcss/FreundS97,
author = {Yoav Freund and
Robert E. Schapire},
title = {A Decision-Theoretic Generalization of On-Line Learning and an Application
to Boosting},
journal = {J. Comput. Syst. Sci.},
volume = {55},
number = {1},
pages = {119--139},
year = {1997},
//url = {https://doi.org/10.1006/jcss.1997.1504},
//doi = {10.1006/jcss.1997.1504},
//timestamp = {Wed, 14 Nov 2018 10:33:59 +0100},
//biburl = {https://dblp.org/rec/journals/jcss/FreundS97.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/acl/XiaoZZW10,
author = {Tong Xiao and
Jingbo Zhu and
Muhua Zhu and
Huizhen Wang},
editor = {Jan Hajic and
Sandra Carberry and
Stephen Clark},
title = {Boosting-Based System Combination for Machine Translation},
booktitle = {{ACL} 2010, Proceedings of the 48th Annual Meeting of the Association
for Computational Linguistics, July 11-16, 2010, Uppsala, Sweden},
pages = {739--748},
publisher = {The Association for Computer Linguistics},
year = {2010},
//url = {https://www.aclweb.org/anthology/P10-1076/},
//timestamp = {Fri, 13 Sep 2019 13:00:43 +0200},
//biburl = {https://dblp.org/rec/conf/acl/XiaoZZW10.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icassp/SimBGSW07,
author = {Khe Chai Sim and
William J. Byrne and
Mark J. F. Gales and
Hichem Sahbi and
Philip C. Woodland},
title = {Consensus Network Decoding for Statistical Machine Translation System
Combination},
booktitle = {Proceedings of the {IEEE} International Conference on Acoustics, Speech,
and Signal Processing, {ICASSP} 2007, Honolulu, Hawaii, USA, April
15-20, 2007},
pages = {105--108},
publisher = {{IEEE}},
year = {2007},
//url = {https://doi.org/10.1109/ICASSP.2007.367174},
//doi = {10.1109/ICASSP.2007.367174},
//timestamp = {Wed, 16 Oct 2019 14:14:52 +0200},
//biburl = {https://dblp.org/rec/conf/icassp/SimBGSW07.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/acl/RostiMS07,
author = {Antti{-}Veikko I. Rosti and
Spyridon Matsoukas and
Richard M. Schwartz},
editor = {John A. Carroll and
Antal van den Bosch and
Annie Zaenen},
title = {Improved Word-Level System Combination for Machine Translation},
booktitle = {{ACL} 2007, Proceedings of the 45th Annual Meeting of the Association
for Computational Linguistics, June 23-30, 2007, Prague, Czech Republic},
publisher = {The Association for Computational Linguistics},
year = {2007},
//url = {https://www.aclweb.org/anthology/P07-1040/},
//timestamp = {Mon, 16 Sep 2019 13:46:41 +0200},
//biburl = {https://dblp.org/rec/conf/acl/RostiMS07.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/wmt/RostiZMS08,
author = {Antti{-}Veikko I. Rosti and
Bing Zhang and
Spyros Matsoukas and
Richard M. Schwartz},
editor = {Chris Callison{-}Burch and
Philipp Koehn and
Christof Monz and
Josh Schroeder and
Cameron S. Fordyce},
title = {Incremental Hypothesis Alignment for Building Confusion Networks with
Application to Machine Translation System Combination},
booktitle = {Proceedings of the Third Workshop on Statistical Machine Translation,
WMT@ACL 2008, Columbus, Ohio, USA, June 19, 2008},
pages = {183--186},
publisher = {Association for Computational Linguistics},
year = {2008},
//url = {https://www.aclweb.org/anthology/W08-0329/},
//timestamp = {Fri, 13 Sep 2019 13:08:46 +0200},
//biburl = {https://dblp.org/rec/conf/wmt/RostiZMS08.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/LiMJ16,
author = {Jiwei Li and
Will Monroe and
Dan Jurafsky},
title = {A Simple, Fast Diverse Decoding Algorithm for Neural Generation},
journal = {CoRR},
volume = {abs/1611.08562},
year = {2016},
//url = {http://arxiv.org/abs/1611.08562},
//archivePrefix = {arXiv},
//eprint = {1611.08562},
//timestamp = {Mon, 13 Aug 2018 16:48:46 +0200},
//biburl = {https://dblp.org/rec/journals/corr/LiMJ16.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/emnlp/TrombleKOM08,
author = {Roy Tromble and
Shankar Kumar and
Franz Josef Och and
Wolfgang Macherey},
title = {Lattice Minimum Bayes-Risk Decoding for Statistical Machine Translation},
booktitle = {2008 Conference on Empirical Methods in Natural Language Processing,
{EMNLP} 2008, Proceedings of the Conference, 25-27 October 2008, Honolulu,
Hawaii, USA, {A} meeting of SIGDAT, a Special Interest Group of the
{ACL}},
pages = {620--629},
publisher = {{ACL}},
year = {2008},
//url = {https://www.aclweb.org/anthology/D08-1065/},
//timestamp = {Fri, 13 Sep 2019 13:08:45 +0200},
//biburl = {https://dblp.org/rec/conf/emnlp/TrombleKOM08.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/aaai/SuTXJSL17,
author = {Jinsong Su and
Zhixing Tan and
Deyi Xiong and
Rongrong Ji and
Xiaodong Shi and
Yang Liu},
editor = {Satinder P. Singh and
Shaul Markovitch},
title = {Lattice-Based Recurrent Neural Network Encoders for Neural Machine
Translation},
booktitle = {Proceedings of the Thirty-First {AAAI} Conference on Artificial Intelligence,
February 4-9, 2017, San Francisco, California, {USA}},
pages = {3302--3308},
publisher = {{AAAI} Press},
year = {2017},
//url = {http://aaai.org/ocs/index.php/AAAI/AAAI17/paper/view/14320},
//timestamp = {Sun, 31 Mar 2019 12:09:37 +0200},
//biburl = {https://dblp.org/rec/conf/aaai/SuTXJSL17.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/acl/BirdL04,
author = {Steven Bird and
Edward Loper},
title = {{NLTK:} The Natural Language Toolkit},
booktitle = {Proceedings of the 42nd Annual Meeting of the Association for Computational
Linguistics, Barcelona, Spain, July 21-26, 2004 - Poster and Demonstration},
publisher = {{ACL}},
year = {2004},
//url = {https://www.aclweb.org/anthology/P04-3031/},
//timestamp = {Wed, 18 Sep 2019 12:15:54 +0200},
//biburl = {https://dblp.org/rec/conf/acl/BirdL04.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{wang-etal-2018-dynamic,
title = {Dynamic Sentence Sampling for Efficient Training of Neural Machine Translation},
author = {Wang, Rui and
Utiyama, Masao and
Sumita, Eiichiro},
booktitle = {Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
month = {7},
year = {2018},
//address = "Melbourne, Australia",
publisher = {Association for Computational Linguistics},
//url = "https://www.aclweb.org/anthology/P18-2048",
//doi = "10.18653/v1/P18-2048",
pages = {298--304},
//abstract = "Traditional Neural machine translation (NMT) involves a fixed training procedure where each sentence is sampled once during each epoch. In reality, some sentences are well-learned during the initial few epochs; however, using this approach, the well-learned sentences would continue to be trained along with those sentences that were not well learned for 10-30 epochs, which results in a wastage of time. Here, we propose an efficient method to dynamically sample the sentences in order to accelerate the NMT training. In this approach, a weight is assigned to each sentence based on the measured difference between the training costs of two iterations. Further, in each epoch, a certain percentage of sentences are dynamically sampled according to their weights. Empirical results based on the NIST Chinese-to-English and the WMT English-to-German tasks show that the proposed method can significantly accelerate the NMT training and improve the NMT performance.",
}
@inproceedings{garciamartinez:hal-01433161,
title = {{Factored Neural Machine Translation Architectures}},
author = {Garcia-Martinez, Mercedes and Barrault, Lo{\"i}c and Bougares, Fethi},
//url = {https://hal.archives-ouvertes.fr/hal-01433161},
booktitle = {{International Workshop on Spoken Language Translation (IWSLT'16)}},
//address = {Seattle, United States},
year = {2016},
//pdf = {https://hal.archives-ouvertes.fr/hal-01433161/file/FNMTiwslt2016.pdf},
//hal_id = {hal-01433161},
//hal_version = {v1},
}
@article{DBLP:journals/corr/JeanCMB14,
author = {S{\'{e}}bastien Jean and
Kyunghyun Cho and
Roland Memisevic and
Yoshua Bengio},
title = {On Using Very Large Target Vocabulary for Neural Machine Translation},
journal = {CoRR},
volume = {abs/1412.2007},
year = {2014},
//url = {http://arxiv.org/abs/1412.2007},
//archivePrefix = {arXiv},
//eprint = {1412.2007},
//timestamp = {Mon, 13 Aug 2018 16:46:10 +0200},
//biburl = {https://dblp.org/rec/journals/corr/JeanCMB14.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-1804-10959,
author = {Taku Kudo},
title = {Subword Regularization: Improving Neural Network Translation Models
with Multiple Subword Candidates},
journal = {CoRR},
volume = {abs/1804.10959},
year = {2018},
//url = {http://arxiv.org/abs/1804.10959},
//archivePrefix = {arXiv},
//eprint = {1804.10959},
//timestamp = {Mon, 13 Aug 2018 16:48:57 +0200},
//biburl = {https://dblp.org/rec/journals/corr/abs-1804-10959.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/ZagoruykoK16,
author = {Sergey Zagoruyko and
Nikos Komodakis},
title = {Wide Residual Networks},
journal = {CoRR},
volume = {abs/1605.07146},
year = {2016},
//url = {http://arxiv.org/abs/1605.07146},
//archivePrefix = {arXiv},
//eprint = {1605.07146},
//timestamp = {Mon, 13 Aug 2018 16:46:42 +0200},
//biburl = {https://dblp.org/rec/journals/corr/ZagoruykoK16.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/iet-bmt/Sepas-Moghaddam20,
author = {Alireza Sepas{-}Moghaddam and
Fernando Pereira and
Paulo Lobato Correia},
title = {Face recognition: a novel multi-level taxonomy based survey},
journal = {{IET} Biom.},
volume = {9},
number = {2},
pages = {58--67},
year = {2020},
//url = {https://doi.org/10.1049/iet-bmt.2019.0001},
//doi = {10.1049/iet-bmt.2019.0001},
//timestamp = {Wed, 01 Apr 2020 08:42:20 +0200},
//biburl = {https://dblp.org/rec/journals/iet-bmt/Sepas-Moghaddam20.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{ethayarajh-2019-contextual,
title = {How Contextual are Contextualized Word Representations? Comparing the Geometry of {BERT}, {ELM}o, and {GPT}-2 Embeddings},
author = {Ethayarajh, Kawin},
booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)},
month = nov,
year = {2019},
//address = "Hong Kong, China",
publisher = {Association for Computational Linguistics},
//url = "https://www.aclweb.org/anthology/D19-1006",
//doi = "10.18653/v1/D19-1006",
pages = {55--65},
//abstract = "Replacing static word embeddings with contextualized word representations has yielded significant improvements on many NLP tasks. However, just how contextual are the contextualized representations produced by models such as ELMo and BERT? Are there infinitely many context-specific representations for each word, or are words essentially assigned one of a finite number of word-sense representations? For one, we find that the contextualized representations of all words are not isotropic in any layer of the contextualizing model. While representations of the same word in different contexts still have a greater cosine similarity than those of two different words, this self-similarity is much lower in upper layers. This suggests that upper layers of contextualizing models produce more context-specific representations, much like how upper layers of LSTMs produce more task-specific representations. In all layers of ELMo, BERT, and GPT-2, on average, less than 5{\%} of the variance in a word{'}s contextualized representations can be explained by a static embedding for that word, providing some justification for the success of contextualized representations.",
}
@inproceedings{DBLP:conf/acl/JawaharSS19,
author = {Ganesh Jawahar and
Beno{\^{\i}}t Sagot and
Djam{\'{e}} Seddah},
editor = {Anna Korhonen and
David R. Traum and
Llu{\'{\i}}s M{\`{a}}rquez},
title = {What Does {BERT} Learn about the Structure of Language?},
booktitle = {Proceedings of the 57th Conference of the Association for Computational
Linguistics, {ACL} 2019, Florence, Italy, July 28- August 2, 2019,
Volume 1: Long Papers},
pages = {3651--3657},
publisher = {Association for Computational Linguistics},
year = {2019},
//url = {https://doi.org/10.18653/v1/p19-1356},
//doi = {10.18653/v1/p19-1356},
//timestamp = {Tue, 28 Jan 2020 10:28:06 +0100},
//biburl = {https://dblp.org/rec/conf/acl/JawaharSS19.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-1806-00187,
author = {Myle Ott and
Sergey Edunov and
David Grangier and
Michael Auli},
title = {Scaling Neural Machine Translation},
journal = {CoRR},
volume = {abs/1806.00187},
year = {2018},
//url = {http://arxiv.org/abs/1806.00187},
//archivePrefix = {arXiv},
//eprint = {1806.00187},
//timestamp = {Mon, 13 Aug 2018 16:47:40 +0200},
//biburl = {https://dblp.org/rec/journals/corr/abs-1806-00187.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/emnlp/DuanLXZ09,
author = {Nan Duan and
Mu Li and
Tong Xiao and
Ming Zhou},
title = {The Feature Subspace Method for {SMT} System Combination},
booktitle = {Proceedings of the 2009 Conference on Empirical Methods in Natural
Language Processing, {EMNLP} 2009, 6-7 August 2009, Singapore, {A}
meeting of SIGDAT, a Special Interest Group of the {ACL}},
pages = {1096--1104},
publisher = {{ACL}},
year = {2009},
//url = {https://www.aclweb.org/anthology/D09-1114/},
//timestamp = {Fri, 13 Sep 2019 13:08:45 +0200},
//biburl = {https://dblp.org/rec/conf/emnlp/DuanLXZ09.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-1910-10683,
author = {Colin Raffel and
Noam Shazeer and
Adam Roberts and
Katherine Lee and
Sharan Narang and
Michael Matena and
Yanqi Zhou and
Wei Li and
Peter J. Liu},
title = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text
Transformer},
journal = {CoRR},
volume = {abs/1910.10683},
year = {2019},
//url = {http://arxiv.org/abs/1910.10683},
//archivePrefix = {arXiv},
//eprint = {1910.10683},
//timestamp = {Fri, 25 Oct 2019 14:59:26 +0200},
//biburl = {https://dblp.org/rec/journals/corr/abs-1910-10683.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%% chapter 7.5.1----------------------------------------------------------------
@inproceedings{DBLP:conf/cvpr/YuYR18,
author = {Xin Yu and
Zhiding Yu and
Srikumar Ramalingam},
title = {Learning Strict Identity Mappings in Deep Residual Networks},
booktitle = {2018 {IEEE} Conference on Computer Vision and Pattern Recognition,
{CVPR} 2018, Salt Lake City, UT, USA, June 18-22, 2018},
pages = {4432--4440},
publisher = {{IEEE} Computer Society},
year = {2018},
//url = {http://openaccess.thecvf.com/content\_cvpr\_2018/html/Yu\_Learning\_Strict\_Identity\_CVPR\_2018\_paper.html},
//doi = {10.1109/CVPR.2018.00466},
//timestamp = {Wed, 16 Oct 2019 14:14:50 +0200},
//biburl = {https://dblp.org/rec/conf/cvpr/YuYR18.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/emnlp/BapnaCFCW18,
author = {Ankur Bapna and
Mia Xu Chen and
Orhan Firat and
Yuan Cao and
Yonghui Wu},
editor = {Ellen Riloff and
David Chiang and
Julia Hockenmaier and
Jun'ichi Tsujii},
title = {Training Deeper Neural Machine Translation Models with Transparent
Attention},
booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural
Language Processing, Brussels, Belgium, October 31 - November 4, 2018},
pages = {3028--3033},
publisher = {Association for Computational Linguistics},
year = {2018},
//url = {https://doi.org/10.18653/v1/d18-1338},
//doi = {10.18653/v1/d18-1338},
//timestamp = {Tue, 28 Jan 2020 10:28:48 +0100},
//biburl = {https://dblp.org/rec/conf/emnlp/BapnaCFCW18.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/emnlp/ZhangTS19,
author = {Biao Zhang and
Ivan Titov and
Rico Sennrich},
editor = {Kentaro Inui and
Jing Jiang and
Vincent Ng and
Xiaojun Wan},
title = {Improving Deep Transformer with Depth-Scaled Initialization and Merged
Attention},
booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural
Language Processing and the 9th International Joint Conference on
Natural Language Processing, {EMNLP-IJCNLP} 2019, Hong Kong, China,
November 3-7, 2019},
pages = {898--909},
publisher = {Association for Computational Linguistics},
year = {2019},
//url = {https://doi.org/10.18653/v1/D19-1083},
//doi = {10.18653/v1/D19-1083},
//timestamp = {Thu, 12 Dec 2019 13:23:43 +0100},
//biburl = {https://dblp.org/rec/conf/emnlp/ZhangTS19.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/eccv/HeZRS16,
author = {Kaiming He and
Xiangyu Zhang and
Shaoqing Ren and
Jian Sun},
editor = {Bastian Leibe and
Jiri Matas and
Nicu Sebe and
Max Welling},
title = {Identity Mappings in Deep Residual Networks},
booktitle = {Computer Vision - {ECCV} 2016 - 14th European Conference, Amsterdam,
The Netherlands, October 11-14, 2016, Proceedings, Part {IV}},
//series = {Lecture Notes in Computer Science},
volume = {9908},
pages = {630--645},
publisher = {Springer},
year = {2016},
//url = {https://doi.org/10.1007/978-3-319-46493-0\_38},
//doi = {10.1007/978-3-319-46493-0\_38},
//timestamp = {Wed, 25 Sep 2019 18:11:12 +0200},
//biburl = {https://dblp.org/rec/conf/eccv/HeZRS16.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/acl/WuWXTGQLL19,
author = {Lijun Wu and
Yiren Wang and
Yingce Xia and
Fei Tian and
Fei Gao and
Tao Qin and
Jianhuang Lai and
Tie{-}Yan Liu},
editor = {Anna Korhonen and
David R. Traum and
Llu{\'{\i}}s M{\`{a}}rquez},
title = {Depth Growing for Neural Machine Translation},
booktitle = {Proceedings of the 57th Conference of the Association for Computational
Linguistics, {ACL} 2019, Florence, Italy, July 28- August 2, 2019,
Volume 1: Long Papers},
pages = {5558--5563},
publisher = {Association for Computational Linguistics},
year = {2019},
//url = {https://doi.org/10.18653/v1/p19-1558},
//doi = {10.18653/v1/p19-1558},
//timestamp = {Tue, 28 Jan 2020 10:27:34 +0100},
//biburl = {https://dblp.org/rec/conf/acl/WuWXTGQLL19.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/HuangLW16a,
author = {Gao Huang and
Zhuang Liu and
Kilian Q. Weinberger},
title = {Densely Connected Convolutional Networks},
journal = {CoRR},
volume = {abs/1608.06993},
year = {2016},
//url = {http://arxiv.org/abs/1608.06993},
//archivePrefix = {arXiv},
//eprint = {1608.06993},
//timestamp = {Mon, 10 Sep 2018 15:49:32 +0200},
//biburl = {https://dblp.org/rec/journals/corr/HuangLW16a.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-1810-10181,
author = {Zi{-}Yi Dou and
Zhaopeng Tu and
Xing Wang and
Shuming Shi and
Tong Zhang},
title = {Exploiting Deep Representations for Neural Machine Translation},
journal = {CoRR},
volume = {abs/1810.10181},
year = {2018},
//url = {http://arxiv.org/abs/1810.10181},
//archivePrefix = {arXiv},
//eprint = {1810.10181},
//timestamp = {Tue, 15 Jan 2019 11:48:13 +0100},
//biburl = {https://dblp.org/rec/journals/corr/abs-1810-10181.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/GreffSS16,
author = {Klaus Greff and
Rupesh Kumar Srivastava and
J{\"{u}}rgen Schmidhuber},
title = {Highway and Residual Networks learn Unrolled Iterative Estimation},
journal = {CoRR},
volume = {abs/1612.07771},
year = {2016},
//url = {http://arxiv.org/abs/1612.07771},
//archivePrefix = {arXiv},
//eprint = {1612.07771},
//timestamp = {Mon, 13 Aug 2018 16:48:07 +0200},
//biburl = {https://dblp.org/rec/journals/corr/GreffSS16.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/XiaQCBYL17,
author = {Yingce Xia and
Tao Qin and
Wei Chen and
Jiang Bian and
Nenghai Yu and
Tie{-}Yan Liu},
title = {Dual Supervised Learning},
journal = {CoRR},
volume = {abs/1707.00415},
year = {2017},
//url = {http://arxiv.org/abs/1707.00415},
//archivePrefix = {arXiv},
//eprint = {1707.00415},
//timestamp = {Tue, 03 Sep 2019 16:31:11 +0200},
//biburl = {https://dblp.org/rec/journals/corr/XiaQCBYL17.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/nips/HeXQWYLM16,
author = {Di He and
Yingce Xia and
Tao Qin and
Liwei Wang and
Nenghai Yu and
Tie{-}Yan Liu and
Wei{-}Ying Ma},
editor = {Daniel D. Lee and
Masashi Sugiyama and
Ulrike von Luxburg and
Isabelle Guyon and
Roman Garnett},
title = {Dual Learning for Machine Translation},
booktitle = {Advances in Neural Information Processing Systems 29: Annual Conference
on Neural Information Processing Systems 2016, December 5-10, 2016,
Barcelona, Spain},
pages = {820--828},
year = {2016},
//url = {http://papers.nips.cc/paper/6469-dual-learning-for-machine-translation},
//timestamp = {Fri, 06 Mar 2020 17:00:15 +0100},
//biburl = {https://dblp.org/rec/conf/nips/HeXQWYLM16.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/nips/SuttonMSM99,
author = {Richard S. Sutton and
David A. McAllester and
Satinder P. Singh and
Yishay Mansour},
editor = {Sara A. Solla and
Todd K. Leen and
Klaus{-}Robert M{\"{u}}ller},
title = {Policy Gradient Methods for Reinforcement Learning with Function Approximation},
booktitle = {Advances in Neural Information Processing Systems 12, {[NIPS} Conference,
Denver, Colorado, USA, November 29 - December 4, 1999]},
pages = {1057--1063},
publisher = {The {MIT} Press},
year = {1999},
//url = {http://papers.nips.cc/paper/1713-policy-gradient-methods-for-reinforcement-learning-with-function-approximation},
//timestamp = {Fri, 06 Mar 2020 16:58:30 +0100},
//biburl = {https://dblp.org/rec/conf/nips/SuttonMSM99.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/GulcehreFXCBLBS15,
author = {{\c{C}}aglar G{\"{u}}l{\c{c}}ehre and
Orhan Firat and
Kelvin Xu and
Kyunghyun Cho and
Lo{\"{\i}}c Barrault and
Huei{-}Chi Lin and
Fethi Bougares and
Holger Schwenk and
Yoshua Bengio},
title = {On Using Monolingual Corpora in Neural Machine Translation},
journal = {CoRR},
volume = {abs/1503.03535},
year = {2015},
//url = {http://arxiv.org/abs/1503.03535},
//archivePrefix = {arXiv},
//eprint = {1503.03535},
//timestamp = {Mon, 13 Aug 2018 16:46:37 +0200},
//biburl = {https://dblp.org/rec/journals/corr/GulcehreFXCBLBS15.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/wmt/CurreyBH17,
author = {Anna Currey and
Antonio Valerio Miceli Barone and
Kenneth Heafield},
editor = {Ondrej Bojar and
Christian Buck and
Rajen Chatterjee and
Christian Federmann and
Yvette Graham and
Barry Haddow and
Matthias Huck and
Antonio Jimeno{-}Yepes and
Philipp Koehn and
Julia Kreutzer},
title = {Copied Monolingual Data Improves Low-Resource Neural Machine Translation},
booktitle = {Proceedings of the Second Conference on Machine Translation, {WMT}
2017, Copenhagen, Denmark, September 7-8, 2017},
pages = {148--156},
publisher = {Association for Computational Linguistics},
year = {2017},
//url = {https://doi.org/10.18653/v1/w17-4715},
//doi = {10.18653/v1/w17-4715},
//timestamp = {Tue, 28 Jan 2020 10:31:00 +0100},
//biburl = {https://dblp.org/rec/conf/wmt/CurreyBH17.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/acl/SennrichHB16,
author = {Rico Sennrich and
Barry Haddow and
Alexandra Birch},
title = {Improving Neural Machine Translation Models with Monolingual Data},
booktitle = {Proceedings of the 54th Annual Meeting of the Association for Computational
Linguistics, {ACL} 2016, August 7-12, 2016, Berlin, Germany, Volume
1: Long Papers},
publisher = {The Association for Computer Linguistics},
year = {2016},
//url = {https://doi.org/10.18653/v1/p16-1009},
//doi = {10.18653/v1/p16-1009},
//timestamp = {Tue, 28 Jan 2020 10:28:01 +0100},
//biburl = {https://dblp.org/rec/conf/acl/SennrichHB16.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/emnlp/EdunovOAG18,
author = {Sergey Edunov and
Myle Ott and
Michael Auli and
David Grangier},
editor = {Ellen Riloff and
David Chiang and
Julia Hockenmaier and
Jun'ichi Tsujii},
title = {Understanding Back-Translation at Scale},
booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural
Language Processing, Brussels, Belgium, October 31 - November 4, 2018},
pages = {489--500},
publisher = {Association for Computational Linguistics},
year = {2018},
//url = {https://doi.org/10.18653/v1/d18-1045},
//doi = {10.18653/v1/d18-1045},
//timestamp = {Tue, 28 Jan 2020 10:28:36 +0100},
//biburl = {https://dblp.org/rec/conf/emnlp/EdunovOAG18.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/emnlp/DomhanH17,
author = {Tobias Domhan and
Felix Hieber},
editor = {Martha Palmer and
Rebecca Hwa and
Sebastian Riedel},
title = {Using Target-side Monolingual Data for Neural Machine Translation
through Multi-task Learning},
booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural
Language Processing, {EMNLP} 2017, Copenhagen, Denmark, September
9-11, 2017},
pages = {1500--1505},
publisher = {Association for Computational Linguistics},
year = {2017},
//url = {https://doi.org/10.18653/v1/d17-1158},
//doi = {10.18653/v1/d17-1158},
//timestamp = {Tue, 28 Jan 2020 10:28:22 +0100},
//biburl = {https://dblp.org/rec/conf/emnlp/DomhanH17.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2002-11794,
author = {Zhuohan Li and
Eric Wallace and
Sheng Shen and
Kevin Lin and
Kurt Keutzer and
Dan Klein and
Joseph E. Gonzalez},
title = {Train Large, Then Compress: Rethinking Model Size for Efficient Training
and Inference of Transformers},
journal = {CoRR},
volume = {abs/2002.11794},
year = {2020},
//url = {https://arxiv.org/abs/2002.11794},
//archivePrefix = {arXiv},
//eprint = {2002.11794},
//timestamp = {Tue, 03 Mar 2020 14:32:13 +0100},
//biburl = {https://dblp.org/rec/journals/corr/abs-2002-11794.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/iclr/FrankleC19,
author = {Jonathan Frankle and
Michael Carbin},
title = {The Lottery Ticket Hypothesis: Finding Sparse, Trainable Neural Networks},
booktitle = {7th International Conference on Learning Representations, {ICLR} 2019,
New Orleans, LA, USA, May 6-9, 2019},
publisher = {OpenReview.net},
year = {2019},
//url = {https://openreview.net/forum?id=rJl-b3RcF7},
//timestamp = {Thu, 25 Jul 2019 13:03:15 +0200},
//biburl = {https://dblp.org/rec/conf/iclr/FrankleC19.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/emnlp/KimR16,
author = {Yoon Kim and
Alexander M. Rush},
editor = {Jian Su and
Xavier Carreras and
Kevin Duh},
title = {Sequence-Level Knowledge Distillation},
booktitle = {Proceedings of the 2016 Conference on Empirical Methods in Natural
Language Processing, {EMNLP} 2016, Austin, Texas, USA, November 1-4,
2016},
pages = {1317--1327},
publisher = {The Association for Computational Linguistics},
year = {2016},
//url = {https://doi.org/10.18653/v1/d16-1139},
//doi = {10.18653/v1/d16-1139},
//timestamp = {Tue, 28 Jan 2020 10:28:22 +0100},
//biburl = {https://dblp.org/rec/conf/emnlp/KimR16.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-1909-10351,
author = {Xiaoqi Jiao and
Yichun Yin and
Lifeng Shang and
Xin Jiang and
Xiao Chen and
Linlin Li and
Fang Wang and
Qun Liu},
title = {TinyBERT: Distilling {BERT} for Natural Language Understanding},
journal = {CoRR},
volume = {abs/1909.10351},
year = {2019},
//url = {http://arxiv.org/abs/1909.10351},
//archivePrefix = {arXiv},
//eprint = {1909.10351},
//timestamp = {Fri, 27 Sep 2019 13:04:21 +0200},
//biburl = {https://dblp.org/rec/journals/corr/abs-1909-10351.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-1901-09069,
author = {Felipe Almeida and
Geraldo Xex{\'{e}}o},
title = {Word Embeddings: {A} Survey},
journal = {CoRR},
volume = {abs/1901.09069},
year = {2019},
//url = {http://arxiv.org/abs/1901.09069},
//archivePrefix = {arXiv},
//eprint = {1901.09069},
//timestamp = {Sat, 02 Feb 2019 16:56:00 +0100},
//biburl = {https://dblp.org/rec/journals/corr/abs-1901-09069.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2002-06823,
author = {Jinhua Zhu and
Yingce Xia and
Lijun Wu and
Di He and
Tao Qin and
Wengang Zhou and
Houqiang Li and
Tie{-}Yan Liu},
title = {Incorporating {BERT} into Neural Machine Translation},
journal = {CoRR},
volume = {abs/2002.06823},
year = {2020},
//url = {https://arxiv.org/abs/2002.06823},
//archivePrefix = {arXiv},
//eprint = {2002.06823},
//timestamp = {Mon, 02 Mar 2020 16:46:06 +0100},
//biburl = {https://dblp.org/rec/journals/corr/abs-2002-06823.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/Ruder17a,
author = {Sebastian Ruder},
title = {An Overview of Multi-Task Learning in Deep Neural Networks},
journal = {CoRR},
volume = {abs/1706.05098},
year = {2017},
//url = {http://arxiv.org/abs/1706.05098},
//archivePrefix = {arXiv},
//eprint = {1706.05098},
//timestamp = {Mon, 13 Aug 2018 16:48:50 +0200},
//biburl = {https://dblp.org/rec/journals/corr/Ruder17a.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/iccv/ZhuPIE17,
author = {Jun{-}Yan Zhu and
Taesung Park and
Phillip Isola and
Alexei A. Efros},
title = {Unpaired Image-to-Image Translation Using Cycle-Consistent Adversarial
Networks},
booktitle = {{IEEE} International Conference on Computer Vision, {ICCV} 2017, Venice,
Italy, October 22-29, 2017},
pages = {2242--2251},
publisher = {{IEEE} Computer Society},
year = {2017},
//url = {https://doi.org/10.1109/ICCV.2017.244},
//doi = {10.1109/ICCV.2017.244},
//timestamp = {Wed, 16 Oct 2019 14:14:51 +0200},
//biburl = {https://dblp.org/rec/conf/iccv/ZhuPIE17.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{philipAlgorithmfordataCompression,
title={A New Algorithm for Data Compression},
author={Philip Gage},
journal={The C Users Journal},
volume={12},
number={2},
pages={23--38},
year={1994}
}
@inproceedings{bengioCurriculumlearning,
author = {Yoshua Bengio and
J{\'{e}}r{\^{o}}me Louradour and
Ronan Collobert and
Jason Weston},
title = {Curriculum learning},
booktitle={Proceedings of the 26th annual international conference on machine learning},
pages={41--48},
year={2009}
}
@inproceedings{Hubara2016BinarizedNN,
title={Binarized Neural Networks},
author={Itay Hubara and
Matthieu Courbariaux and
Daniel Soudry and
Ran El-Yaniv and
Yoshua Bengio},
booktitle={Advances in neural information processing systems},
pages={4107--4115},
year={2016}
}
@article{deeplearning,
title={Deep learning},
author={Yann LeCun and
Yoshua Bengio and
Geoffrey Hinton},
journal={Nature},
volume={521},
number={7553},
pages={436--444},
year={2015},
publisher={Nature Publishing Group}
}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%% chapter 7.6----------------------------------------------------------------
@inproceedings{DBLP:conf/acl/ArtetxeLA19,
author = {Mikel Artetxe and
Gorka Labaka and
Eneko Agirre},
//editor = {Anna Korhonen and
David R. Traum and
Llu{\'{\i}}s M{\`{a}}rquez},
title = {An Effective Approach to Unsupervised Machine Translation},
booktitle = {Proceedings of the 57th Conference of the Association for Computational
Linguistics, {ACL} 2019, Florence, Italy, July 28- August 2, 2019,
Volume 1: Long Papers},
pages = {194--203},
publisher = {Association for Computational Linguistics},
year = {2019},
//url = {https://doi.org/10.18653/v1/p19-1019},
//doi = {10.18653/v1/p19-1019},
//timestamp = {Tue, 28 Jan 2020 10:27:42 +0100},
//biburl = {https://dblp.org/rec/conf/acl/ArtetxeLA19.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2004-05516,
author = {Kelly Marchisio and
Kevin Duh and
Philipp Koehn},
title = {When Does Unsupervised Machine Translation Work?},
journal = {CoRR},
volume = {abs/2004.05516},
year = {2020},
//url = {https://arxiv.org/abs/2004.05516},
//archivePrefix = {arXiv},
//eprint = {2004.05516},
//timestamp = {Tue, 14 Apr 2020 16:40:34 +0200},
//biburl = {https://dblp.org/rec/journals/corr/abs-2004-05516.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/acl/HitschlerSR16,
author = {Julian Hitschler and
Shigehiko Schamoni and
Stefan Riezler},
title = {Multimodal Pivots for Image Caption Translation},
booktitle = {Proceedings of the 54th Annual Meeting of the Association for Computational
Linguistics, {ACL} 2016, August 7-12, 2016, Berlin, Germany, Volume
1: Long Papers},
publisher = {The Association for Computer Linguistics},
year = {2016},
//url = {https://doi.org/10.18653/v1/p16-1227},
//doi = {10.18653/v1/p16-1227},
//timestamp = {Tue, 28 Jan 2020 10:27:27 +0100},
//biburl = {https://dblp.org/rec/conf/acl/HitschlerSR16.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{elliott2015multilingual,
title={Multilingual Image Description with Neural Sequence Models},
author={Elliott, Desmond and Frank, Stella and Hasler, Eva},
journal={arXiv: Computation and Language},
year={2015}
}
@inproceedings{DBLP:conf/wmt/HuangLSOD16,
author = {Po{-}Yao Huang and
Frederick Liu and
Sz{-}Rung Shiang and
Jean Oh and
Chris Dyer},
title = {Attention-based Multimodal Neural Machine Translation},
booktitle = {Proceedings of the First Conference on Machine Translation, {WMT}
2016, colocated with {ACL} 2016, August 11-12, Berlin, Germany},
pages = {639--645},
publisher = {The Association for Computer Linguistics},
year = {2016},
//url = {https://doi.org/10.18653/v1/w16-2360},
//doi = {10.18653/v1/w16-2360},
//timestamp = {Tue, 28 Jan 2020 10:31:01 +0100},
//biburl = {https://dblp.org/rec/conf/wmt/HuangLSOD16.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/emnlp/CurreyH18,
author = {Anna Currey and
Kenneth Heafield},
//editor = {Ellen Riloff and
David Chiang and
Julia Hockenmaier and
Jun'ichi Tsujii},
title = {Multi-Source Syntactic Neural Machine Translation},
booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural
Language Processing, Brussels, Belgium, October 31 - November 4, 2018},
pages = {2961--2966},
publisher = {Association for Computational Linguistics},
year = {2018},
//url = {https://doi.org/10.18653/v1/d18-1327},
//doi = {10.18653/v1/d18-1327},
//timestamp = {Tue, 28 Jan 2020 10:28:16 +0100},
//biburl = {https://dblp.org/rec/conf/emnlp/CurreyH18.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/acl/SaundersSGB18,
author = {Danielle Saunders and
Felix Stahlberg and
Adri{\`{a}} de Gispert and
Bill Byrne},
//editor = {Iryna Gurevych and
Yusuke Miyao},
title = {Multi-representation ensembles and delayed {SGD} updates improve syntax-based
{NMT}},
booktitle = {Proceedings of the 56th Annual Meeting of the Association for Computational
Linguistics, {ACL} 2018, Melbourne, Australia, July 15-20, 2018, Volume
2: Short Papers},
pages = {319--325},
publisher = {Association for Computational Linguistics},
year = {2018},
//url = {https://www.aclweb.org/anthology/P18-2051/},
//doi = {10.18653/v1/P18-2051},
//timestamp = {Mon, 16 Sep 2019 13:46:41 +0200},
//biburl = {https://dblp.org/rec/conf/acl/SaundersSGB18.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/wmt/NadejdeRSDJKB17,
author = {Maria Nadejde and
Siva Reddy and
Rico Sennrich and
Tomasz Dwojak and
Marcin Junczys{-}Dowmunt and
Philipp Koehn and
Alexandra Birch},
//editor = {Ondrej Bojar and
Christian Buck and
Rajen Chatterjee and
Christian Federmann and
Yvette Graham and
Barry Haddow and
Matthias Huck and
Antonio Jimeno{-}Yepes and
Philipp Koehn and
Julia Kreutzer},
title = {Predicting Target Language {CCG} Supertags Improves Neural Machine
Translation},
booktitle = {Proceedings of the Second Conference on Machine Translation, {WMT}
2017, Copenhagen, Denmark, September 7-8, 2017},
pages = {68--79},
publisher = {Association for Computational Linguistics},
year = {2017},
//url = {https://doi.org/10.18653/v1/w17-4707},
//doi = {10.18653/v1/w17-4707},
//timestamp = {Tue, 28 Jan 2020 10:31:04 +0100},
//biburl = {https://dblp.org/rec/conf/wmt/NadejdeRSDJKB17.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/acl/SumitaUZTM18,
author = {Chunpeng Ma and
Akihiro Tamura and
Masao Utiyama and
Tiejun Zhao and
Eiichiro Sumita},
//editor = {Iryna Gurevych and
Yusuke Miyao},
title = {Forest-Based Neural Machine Translation},
booktitle = {Proceedings of the 56th Annual Meeting of the Association for Computational
Linguistics, {ACL} 2018, Melbourne, Australia, July 15-20, 2018, Volume
1: Long Papers},
pages = {1253--1263},
publisher = {Association for Computational Linguistics},
year = {2018},
//url = {https://www.aclweb.org/anthology/P18-1116/},
//doi = {10.18653/v1/P18-1116},
//timestamp = {Mon, 16 Sep 2019 13:46:41 +0200},
//biburl = {https://dblp.org/rec/conf/acl/SumitaUZTM18.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/coling/ZaremoodiH18,
author = {Poorya Zaremoodi and
Gholamreza Haffari},
//editor = {Emily M. Bender and
Leon Derczynski and
Pierre Isabelle},
title = {Incorporating Syntactic Uncertainty in Neural Machine Translation
with a Forest-to-Sequence Model},
booktitle = {Proceedings of the 27th International Conference on Computational
Linguistics, {COLING} 2018, Santa Fe, New Mexico, USA, August 20-26,
2018},
pages = {1421--1429},
publisher = {Association for Computational Linguistics},
year = {2018},
//url = {https://www.aclweb.org/anthology/C18-1120/},
//timestamp = {Mon, 16 Sep 2019 17:08:53 +0200},
//biburl = {https://dblp.org/rec/conf/coling/ZaremoodiH18.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/acl/TaiSM15,
author = {Kai Sheng Tai and
Richard Socher and
Christopher D. Manning},
title = {Improved Semantic Representations From Tree-Structured Long Short-Term
Memory Networks},
booktitle = {Proceedings of the 53rd Annual Meeting of the Association for Computational
Linguistics and the 7th International Joint Conference on Natural
Language Processing of the Asian Federation of Natural Language Processing,
{ACL} 2015, July 26-31, 2015, Beijing, China, Volume 1: Long Papers},
pages = {1556--1566},
publisher = {The Association for Computer Linguistics},
year = {2015},
//url = {https://doi.org/10.3115/v1/p15-1150},
//doi = {10.3115/v1/p15-1150},
//timestamp = {Tue, 28 Jan 2020 10:28:03 +0100},
//biburl = {https://dblp.org/rec/conf/acl/TaiSM15.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/iclr/ShenTSC19,
author = {Yikang Shen and
Shawn Tan and
Alessandro Sordoni and
Aaron C. Courville},
title = {Ordered Neurons: Integrating Tree Structures into Recurrent Neural
Networks},
booktitle = {7th International Conference on Learning Representations, {ICLR} 2019,
New Orleans, LA, USA, May 6-9, 2019},
publisher = {OpenReview.net},
year = {2019},
//url = {https://openreview.net/forum?id=B1l6qiR5F7},
//timestamp = {Thu, 25 Jul 2019 13:03:16 +0200},
//biburl = {https://dblp.org/rec/conf/iclr/ShenTSC19.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/acl/EriguchiHT16,
author = {Akiko Eriguchi and
Kazuma Hashimoto and
Yoshimasa Tsuruoka},
title = {Tree-to-Sequence Attentional Neural Machine Translation},
booktitle = {Proceedings of the 54th Annual Meeting of the Association for Computational
Linguistics, {ACL} 2016, August 7-12, 2016, Berlin, Germany, Volume
1: Long Papers},
publisher = {The Association for Computer Linguistics},
year = {2016},
//url = {https://doi.org/10.18653/v1/p16-1078},
//doi = {10.18653/v1/p16-1078},
//timestamp = {Tue, 28 Jan 2020 10:27:49 +0100},
//biburl = {https://dblp.org/rec/conf/acl/EriguchiHT16.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/acl/ChenHCC17,
author = {Huadong Chen and
Shujian Huang and
David Chiang and
Jiajun Chen},
//editor = {Regina Barzilay and
Min{-}Yen Kan},
title = {Improved Neural Machine Translation with a Syntax-Aware Encoder and
Decoder},
booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational
Linguistics, {ACL} 2017, Vancouver, Canada, July 30 - August 4, Volume
1: Long Papers},
pages = {1936--1945},
publisher = {Association for Computational Linguistics},
year = {2017},
//url = {https://doi.org/10.18653/v1/P17-1177},
//doi = {10.18653/v1/P17-1177},
//timestamp = {Tue, 20 Aug 2019 11:59:28 +0200},
//biburl = {https://dblp.org/rec/conf/acl/ChenHCC17.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/discomt/TiedemannS17,
author = {J{\"{o}}rg Tiedemann and
Yves Scherrer},
//editor = {Bonnie L. Webber and
Andrei Popescu{-}Belis and
J{\"{o}}rg Tiedemann},
title = {Neural Machine Translation with Extended Context},
booktitle = {Proceedings of the Third Workshop on Discourse in Machine Translation,
DiscoMT@EMNLP 2017, Copenhagen, Denmark, September 8, 2017},
pages = {82--92},
publisher = {Association for Computational Linguistics},
year = {2017},
//url = {https://doi.org/10.18653/v1/w17-4811},
//doi = {10.18653/v1/w17-4811},
//timestamp = {Fri, 27 Mar 2020 08:51:21 +0100},
//biburl = {https://dblp.org/rec/conf/discomt/TiedemannS17.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/JeanLFC17,
author = {S{\'{e}}bastien Jean and
Stanislas Lauly and
Orhan Firat and
Kyunghyun Cho},
title = {Does Neural Machine Translation Benefit from Larger Context?},
journal = {CoRR},
volume = {abs/1704.05135},
year = {2017},
//url = {http://arxiv.org/abs/1704.05135},
//archivePrefix = {arXiv},
//eprint = {1704.05135},
//timestamp = {Mon, 13 Aug 2018 16:47:43 +0200},
//biburl = {https://dblp.org/rec/journals/corr/JeanLFC17.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/emnlp/ZhangLSZXZL18,
author = {Jiacheng Zhang and
Huanbo Luan and
Maosong Sun and
Feifei Zhai and
Jingfang Xu and
Min Zhang and
Yang Liu},
//editor = {Ellen Riloff and
David Chiang and
Julia Hockenmaier and
Jun'ichi Tsujii},
title = {Improving the Transformer Translation Model with Document-Level Context},
booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural
Language Processing, Brussels, Belgium, October 31 - November 4, 2018},
pages = {533--542},
publisher = {Association for Computational Linguistics},
year = {2018},
//url = {https://doi.org/10.18653/v1/d18-1049},
//doi = {10.18653/v1/d18-1049},
//timestamp = {Tue, 28 Jan 2020 10:28:52 +0100},
//biburl = {https://dblp.org/rec/conf/emnlp/ZhangLSZXZL18.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{tan-etal-2019-hierarchical,
title = "Hierarchical Modeling of Global Context for Document-Level Neural Machine Translation",
author = "Tan, Xin and
Zhang, Longyin and
Xiong, Deyi and
Zhou, Guodong",
booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)",
month = nov,
year = "2019",
address = "Hong Kong, China",
publisher = "Association for Computational Linguistics",
//url = "https://www.aclweb.org/anthology/D19-1168",
//doi = "10.18653/v1/D19-1168",
pages = "1576--1585",
}
@inproceedings{DBLP:conf/naacl/MarufMH19,
author = {Sameen Maruf and
Andr{\'{e}} F. T. Martins and
Gholamreza Haffari},
//editor = {Jill Burstein and
Christy Doran and
Thamar Solorio},
title = {Selective Attention for Context-aware Neural Machine Translation},
booktitle = {Proceedings of the 2019 Conference of the North American Chapter of
the Association for Computational Linguistics: Human Language Technologies,
{NAACL-HLT} 2019, Minneapolis, MN, USA, June 2-7, 2019, Volume 1 (Long
and Short Papers)},
pages = {3092--3102},
publisher = {Association for Computational Linguistics},
year = {2019},
//url = {https://doi.org/10.18653/v1/n19-1313},
//doi = {10.18653/v1/n19-1313},
//timestamp = {Tue, 28 Jan 2020 10:30:01 +0100},
//biburl = {https://dblp.org/rec/conf/naacl/MarufMH19.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/coling/KuangXLZ18,
author = {Shaohui Kuang and
Deyi Xiong and
Weihua Luo and
Guodong Zhou},
//editor = {Emily M. Bender and
Leon Derczynski and
Pierre Isabelle},
title = {Modeling Coherence for Neural Machine Translation with Dynamic and
Topic Caches},
booktitle = {Proceedings of the 27th International Conference on Computational
Linguistics, {COLING} 2018, Santa Fe, New Mexico, USA, August 20-26,
2018},
pages = {596--606},
publisher = {Association for Computational Linguistics},
year = {2018},
//url = {https://www.aclweb.org/anthology/C18-1050/},
//timestamp = {Mon, 16 Sep 2019 17:08:53 +0200},
//biburl = {https://dblp.org/rec/conf/coling/KuangXLZ18.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/tacl/TuLSZ18,
author = {Zhaopeng Tu and
Yang Liu and
Shuming Shi and
Tong Zhang},
title = {Learning to Remember Translation History with a Continuous Cache},
journal = {Trans. Assoc. Comput. Linguistics},
volume = {6},
pages = {407--420},
year = {2018},
//url = {https://transacl.org/ojs/index.php/tacl/article/view/1247},
//timestamp = {Thu, 02 Apr 2020 08:34:55 +0200},
//biburl = {https://dblp.org/rec/journals/tacl/TuLSZ18.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/aaai/XiongH0W19,
author = {Hao Xiong and
Zhongjun He and
Hua Wu and
Haifeng Wang},
title = {Modeling Coherence for Discourse Neural Machine Translation},
booktitle = {The Thirty-Third {AAAI} Conference on Artificial Intelligence, {AAAI}
2019, The Thirty-First Innovative Applications of Artificial Intelligence
Conference, {IAAI} 2019, The Ninth {AAAI} Symposium on Educational
Advances in Artificial Intelligence, {EAAI} 2019, Honolulu, Hawaii,
USA, January 27 - February 1, 2019},
pages = {7338--7345},
publisher = {{AAAI} Press},
year = {2019},
//url = {https://doi.org/10.1609/aaai.v33i01.33017338},
//doi = {10.1609/aaai.v33i01.33017338},
//timestamp = {Wed, 25 Sep 2019 11:05:09 +0200},
//biburl = {https://dblp.org/rec/conf/aaai/XiongH0W19.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/acl/VoitaST19,
author = {Elena Voita and
Rico Sennrich and
Ivan Titov},
//editor = {Anna Korhonen and
David R. Traum and
Llu{\'{\i}}s M{\`{a}}rquez},
title = {When a Good Translation is Wrong in Context: Context-Aware Machine
Translation Improves on Deixis, Ellipsis, and Lexical Cohesion},
booktitle = {Proceedings of the 57th Conference of the Association for Computational
Linguistics, {ACL} 2019, Florence, Italy, July 28- August 2, 2019,
Volume 1: Long Papers},
pages = {1198--1212},
publisher = {Association for Computational Linguistics},
year = {2019},
//url = {https://doi.org/10.18653/v1/p19-1116},
//doi = {10.18653/v1/p19-1116},
//timestamp = {Tue, 28 Jan 2020 10:27:35 +0100},
//biburl = {https://dblp.org/rec/conf/acl/VoitaST19.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/emnlp/VoitaST19,
author = {Elena Voita and
Rico Sennrich and
Ivan Titov},
//editor = {Kentaro Inui and
Jing Jiang and
Vincent Ng and
Xiaojun Wan},
title = {Context-Aware Monolingual Repair for Neural Machine Translation},
booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural
Language Processing and the 9th International Joint Conference on
Natural Language Processing, {EMNLP-IJCNLP} 2019, Hong Kong, China,
November 3-7, 2019},
pages = {877--886},
publisher = {Association for Computational Linguistics},
year = {2019},
//url = {https://doi.org/10.18653/v1/D19-1081},
//doi = {10.18653/v1/D19-1081},
//timestamp = {Thu, 12 Dec 2019 13:23:45 +0100},
//biburl = {https://dblp.org/rec/conf/emnlp/VoitaST19.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-1910-00553,
author = {Lei Yu and
Laurent Sartran and
Wojciech Stokowiec and
Wang Ling and
Lingpeng Kong and
Phil Blunsom and
Chris Dyer},
title = {Putting Machine Translation in Context with the Noisy Channel Model},
journal = {CoRR},
volume = {abs/1910.00553},
year = {2019},
//url = {http://arxiv.org/abs/1910.00553},
//archivePrefix = {arXiv},
//eprint = {1910.00553},
//timestamp = {Fri, 04 Oct 2019 12:28:06 +0200},
//biburl = {https://dblp.org/rec/journals/corr/abs-1910-00553.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-2003-05259,
author = {Elman Mansimov and
G{\'{a}}bor Melis and
Lei Yu},
title = {Capturing document context inside sentence-level neural machine translation
models with self-training},
journal = {CoRR},
volume = {abs/2003.05259},
year = {2020},
//url = {https://arxiv.org/abs/2003.05259},
//archivePrefix = {arXiv},
//eprint = {2003.05259},
//timestamp = {Tue, 17 Mar 2020 14:18:27 +0100},
//biburl = {https://dblp.org/rec/journals/corr/abs-2003-05259.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icassp/Ney99,
author = {Hermann Ney},
title = {Speech translation: coupling of recognition and translation},
booktitle = {Proceedings of the 1999 {IEEE} International Conference on Acoustics,
Speech, and Signal Processing, {ICASSP} '99, Phoenix, Arizona, USA,
March 15-19, 1999},
pages = {517--520},
publisher = {{IEEE} Computer Society},
year = {1999},
//url = {https://doi.org/10.1109/ICASSP.1999.758176},
//doi = {10.1109/ICASSP.1999.758176},
//timestamp = {Wed, 16 Oct 2019 14:14:52 +0200},
//biburl = {https://dblp.org/rec/conf/icassp/Ney99.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/interspeech/MatusovKN05,
author = {Evgeny Matusov and
Stephan Kanthak and
Hermann Ney},
title = {On the integration of speech recognition and statistical machine translation},
booktitle = {{INTERSPEECH} 2005 - Eurospeech, 9th European Conference on Speech
Communication and Technology, Lisbon, Portugal, September 4-8, 2005},
pages = {3177--3180},
publisher = {{ISCA}},
year = {2005},
//url = {http://www.isca-speech.org/archive/interspeech\_2005/i05\_3177.html},
//timestamp = {Sun, 13 Mar 2011 19:32:29 +0100},
//biburl = {https://dblp.org/rec/conf/interspeech/MatusovKN05.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/naacl/DuongACBC16,
author = {Long Duong and
Antonios Anastasopoulos and
David Chiang and
Steven Bird and
Trevor Cohn},
//editor = {Kevin Knight and
Ani Nenkova and
Owen Rambow},
title = {An Attentional Model for Speech Translation Without Transcription},
booktitle = {{NAACL} {HLT} 2016, The 2016 Conference of the North American Chapter
of the Association for Computational Linguistics: Human Language Technologies,
San Diego California, USA, June 12-17, 2016},
pages = {949--959},
publisher = {The Association for Computational Linguistics},
year = {2016},
//url = {https://doi.org/10.18653/v1/n16-1109},
//doi = {10.18653/v1/n16-1109},
//timestamp = {Tue, 28 Jan 2020 10:29:52 +0100},
//biburl = {https://dblp.org/rec/conf/naacl/DuongACBC16.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/BerardPSB16,
author = {Alexandre Berard and
Olivier Pietquin and
Christophe Servan and
Laurent Besacier},
title = {Listen and Translate: {A} Proof of Concept for End-to-End Speech-to-Text
Translation},
journal = {CoRR},
volume = {abs/1612.01744},
year = {2016},
//url = {http://arxiv.org/abs/1612.01744},
//archivePrefix = {arXiv},
//eprint = {1612.01744},
//timestamp = {Mon, 13 Aug 2018 16:47:16 +0200},
//biburl = {https://dblp.org/rec/journals/corr/BerardPSB16.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/naacl/BansalKLLG19,
author = {Sameer Bansal and
Herman Kamper and
Karen Livescu and
Adam Lopez and
Sharon Goldwater},
//editor = {Jill Burstein and
Christy Doran and
Thamar Solorio},
title = {Pre-training on high-resource speech recognition improves low-resource
speech-to-text translation},
booktitle = {Proceedings of the 2019 Conference of the North American Chapter of
the Association for Computational Linguistics: Human Language Technologies,
{NAACL-HLT} 2019, Minneapolis, MN, USA, June 2-7, 2019, Volume 1 (Long
and Short Papers)},
pages = {58--68},
publisher = {Association for Computational Linguistics},
year = {2019},
//url = {https://doi.org/10.18653/v1/n19-1006},
//doi = {10.18653/v1/n19-1006},
//timestamp = {Tue, 28 Jan 2020 10:30:28 +0100},
//biburl = {https://dblp.org/rec/conf/naacl/BansalKLLG19.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icassp/BerardBKP18,
author = {Alexandre Berard and
Laurent Besacier and
Ali Can Kocabiyikoglu and
Olivier Pietquin},
title = {End-to-End Automatic Speech Translation of Audiobooks},
booktitle = {2018 {IEEE} International Conference on Acoustics, Speech and Signal
Processing, {ICASSP} 2018, Calgary, AB, Canada, April 15-20, 2018},
pages = {6224--6228},
publisher = {{IEEE}},
year = {2018},
//url = {https://doi.org/10.1109/ICASSP.2018.8461690},
//doi = {10.1109/ICASSP.2018.8461690},
//timestamp = {Wed, 16 Oct 2019 14:14:52 +0200},
//biburl = {https://dblp.org/rec/conf/icassp/BerardBKP18.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-1802-06003,
author = {Takatomo Kano and
Sakriani Sakti and
Satoshi Nakamura},
title = {Structured-based Curriculum Learning for End-to-end English-Japanese
Speech Translation},
journal = {CoRR},
volume = {abs/1802.06003},
year = {2018},
//url = {http://arxiv.org/abs/1802.06003},
//archivePrefix = {arXiv},
//eprint = {1802.06003},
//timestamp = {Mon, 13 Aug 2018 16:47:19 +0200},
//biburl = {https://dblp.org/rec/journals/corr/abs-1802-06003.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/tacl/SperberNNW19,
author = {Matthias Sperber and
Graham Neubig and
Jan Niehues and
Alex Waibel},
title = {Attention-Passing Models for Robust and Data-Efficient End-to-End
Speech Translation},
journal = {Trans. Assoc. Comput. Linguistics},
volume = {7},
pages = {313--325},
year = {2019},
//url = {https://transacl.org/ojs/index.php/tacl/article/view/1628},
//timestamp = {Thu, 02 Apr 2020 08:34:53 +0200},
//biburl = {https://dblp.org/rec/journals/tacl/SperberNNW19.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/interspeech/LiuXZHWWZ19,
author = {Yuchen Liu and
Hao Xiong and
Jiajun Zhang and
Zhongjun He and
Hua Wu and
Haifeng Wang and
Chengqing Zong},
//editor = {Gernot Kubin and
Zdravko Kacic},
title = {End-to-End Speech Translation with Knowledge Distillation},
booktitle = {Interspeech 2019, 20th Annual Conference of the International Speech
Communication Association, Graz, Austria, 15-19 September 2019},
pages = {1128--1132},
publisher = {{ISCA}},
year = {2019},
//url = {https://doi.org/10.21437/Interspeech.2019-2582},
//doi = {10.21437/Interspeech.2019-2582},
//timestamp = {Fri, 24 Apr 2020 11:55:21 +0200},
//biburl = {https://dblp.org/rec/conf/interspeech/LiuXZHWWZ19.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/icassp/JiaJMWCCALW19,
author = {Ye Jia and
Melvin Johnson and
Wolfgang Macherey and
Ron J. Weiss and
Yuan Cao and
Chung{-}Cheng Chiu and
Naveen Ari and
Stella Laurenzo and
Yonghui Wu},
title = {Leveraging Weakly Supervised Data to Improve End-to-end Speech-to-text
Translation},
booktitle = {{IEEE} International Conference on Acoustics, Speech and Signal Processing,
{ICASSP} 2019, Brighton, United Kingdom, May 12-17, 2019},
pages = {7180--7184},
publisher = {{IEEE}},
year = {2019},
//url = {https://doi.org/10.1109/ICASSP.2019.8683343},
//doi = {10.1109/ICASSP.2019.8683343},
//timestamp = {Wed, 16 Oct 2019 14:14:52 +0200},
//biburl = {https://dblp.org/rec/conf/icassp/JiaJMWCCALW19.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/tacl/JohnsonSLKWCTVW17,
author = {Melvin Johnson and
Mike Schuster and
Quoc V. Le and
Maxim Krikun and
Yonghui Wu and
Zhifeng Chen and
Nikhil Thorat and
Fernanda B. Vi{\'{e}}gas and
Martin Wattenberg and
Greg Corrado and
Macduff Hughes and
Jeffrey Dean},
title = {Google's Multilingual Neural Machine Translation System: Enabling
Zero-Shot Translation},
journal = {Trans. Assoc. Comput. Linguistics},
volume = {5},
pages = {339--351},
year = {2017},
//url = {https://transacl.org/ojs/index.php/tacl/article/view/1081},
//timestamp = {Thu, 02 Apr 2020 08:34:52 +0200},
//biburl = {https://dblp.org/rec/journals/tacl/JohnsonSLKWCTVW17.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/HaNW16,
author = {Thanh{-}Le Ha and
Jan Niehues and
Alexander H. Waibel},
title = {Toward Multilingual Neural Machine Translation with Universal Encoder
and Decoder},
journal = {CoRR},
volume = {abs/1611.04798},
year = {2016},
//url = {http://arxiv.org/abs/1611.04798},
//archivePrefix = {arXiv},
//eprint = {1611.04798},
//timestamp = {Mon, 13 Aug 2018 16:46:26 +0200},
//biburl = {https://dblp.org/rec/journals/corr/HaNW16.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{DBLP:journals/corr/abs-1711-07893,
author = {Thanh{-}Le Ha and
Jan Niehues and
Alexander H. Waibel},
title = {Effective Strategies in Zero-Shot Neural Machine Translation},
journal = {CoRR},
volume = {abs/1711.07893},
year = {2017},
//url = {http://arxiv.org/abs/1711.07893},
//archivePrefix = {arXiv},
//eprint = {1711.07893},
//timestamp = {Mon, 13 Aug 2018 16:46:07 +0200},
//biburl = {https://dblp.org/rec/journals/corr/abs-1711-07893.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/naacl/FiratCB16,
author = {Orhan Firat and
Kyunghyun Cho and
Yoshua Bengio},
//editor = {Kevin Knight and
Ani Nenkova and
Owen Rambow},
title = {Multi-Way, Multilingual Neural Machine Translation with a Shared Attention
Mechanism},
booktitle = {{NAACL} {HLT} 2016, The 2016 Conference of the North American Chapter
of the Association for Computational Linguistics: Human Language Technologies,
San Diego California, USA, June 12-17, 2016},
pages = {866--875},
publisher = {The Association for Computational Linguistics},
year = {2016},
//url = {https://doi.org/10.18653/v1/n16-1101},
//doi = {10.18653/v1/n16-1101},
//timestamp = {Tue, 28 Jan 2020 10:30:10 +0100},
//biburl = {https://dblp.org/rec/conf/naacl/FiratCB16.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{luong2015multi-task,
title={Multi-task Sequence to Sequence Learning},
author={Luong, Minh-Thang and Le, Quoc V. and Sutskever, Ilya and Vinyals, Oriol and Kaiser, Lukasz},
journal={arXiv: Learning},
year={2015}
}
@article{Och2001Statistical,
title={Statistical multi-source translation},
author={Och, Franz Josef and Ney, Hermann},
journal={MT Summit},
year={2001},
}
@article{DBLP:journals/jmlr/ElskenMH19,
author = {Thomas Elsken and
Jan Hendrik Metzen and
Frank Hutter},
title = {Neural Architecture Search: {A} Survey},
journal = {J. Mach. Learn. Res.},
volume = {20},
pages = {55:1--55:21},
year = {2019},
//url = {http://jmlr.org/papers/v20/18-598.html},
//timestamp = {Wed, 10 Jul 2019 15:28:24 +0200},
//biburl = {https://dblp.org/rec/journals/jmlr/ElskenMH19.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/iclr/ZophL17,
author = {Barret Zoph and
Quoc V. Le},
title = {Neural Architecture Search with Reinforcement Learning},
booktitle = {5th International Conference on Learning Representations, {ICLR} 2017,
Toulon, France, April 24-26, 2017, Conference Track Proceedings},
publisher = {OpenReview.net},
year = {2017},
//url = {https://openreview.net/forum?id=r1Ue8Hcxg},
//timestamp = {Thu, 04 Apr 2019 13:20:08 +0200},
//biburl = {https://dblp.org/rec/conf/iclr/ZophL17.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/emnlp/JiangHXZZ19,
author = {Yufan Jiang and
Chi Hu and
Tong Xiao and
Chunliang Zhang and
Jingbo Zhu},
//editor = {Kentaro Inui and
Jing Jiang and
Vincent Ng and
Xiaojun Wan},
title = {Improved Differentiable Architecture Search for Language Modeling
and Named Entity Recognition},
booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural
Language Processing and the 9th International Joint Conference on
Natural Language Processing, {EMNLP-IJCNLP} 2019, Hong Kong, China,
November 3-7, 2019},
pages = {3583--3588},
publisher = {Association for Computational Linguistics},
year = {2019},
//url = {https://doi.org/10.18653/v1/D19-1367},
//doi = {10.18653/v1/D19-1367},
//timestamp = {Thu, 12 Dec 2019 13:23:52 +0100},
//biburl = {https://dblp.org/rec/conf/emnlp/JiangHXZZ19.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{liyinqiaoESS,
author = {Yinqiao Li and
Chi Hu and
Yuhao Zhang and
Nuo Xu and
Yufan Jiang and
Tong Xiao and
Jingbo Zhu and
Tongran Liu and
Changliang Li},
title = {Learning Architectures from an Extended Search Space
for Language Modeling},
booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
publisher = {Association for Computational Linguistics},
year = {2020},
}
@inproceedings{Luo2018Neural,
title={Neural Architecture Optimization},
author={Luo, Renqian and Tian, Fei and Qin, Tao and Liu, Tie-Yan},
booktitle={Advances in neural information processing systems},
year={2018},
}
@inproceedings{DBLP:conf/aclnmt/KoehnK17,
author = {Philipp Koehn and
Rebecca Knowles},
//editor = {Thang Luong and
Alexandra Birch and
Graham Neubig and
Andrew M. Finch},
title = {Six Challenges for Neural Machine Translation},
booktitle = {Proceedings of the First Workshop on Neural Machine Translation, NMT@ACL
2017, Vancouver, Canada, August 4, 2017},
pages = {28--39},
publisher = {Association for Computational Linguistics},
year = {2017},
//url = {https://doi.org/10.18653/v1/w17-3204},
//doi = {10.18653/v1/w17-3204},
//timestamp = {Tue, 28 Jan 2020 10:28:06 +0100},
//biburl = {https://dblp.org/rec/conf/aclnmt/KoehnK17.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/aaai/HeHWW16,
author = {Wei He and
Zhongjun He and
Hua Wu and
Haifeng Wang},
//editor = {Dale Schuurmans and
Michael P. Wellman},
title = {Improved Neural Machine Translation with {SMT} Features},
booktitle = {Proceedings of the Thirtieth {AAAI} Conference on Artificial Intelligence,
February 12-17, 2016, Phoenix, Arizona, {USA}},
pages = {151--157},
publisher = {{AAAI} Press},
year = {2016},
//url = {http://www.aaai.org/ocs/index.php/AAAI/AAAI16/paper/view/12189},
//timestamp = {Thu, 17 Oct 2019 16:06:14 +0200},
//biburl = {https://dblp.org/rec/conf/aaai/HeHWW16.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/ijcnlp/KhayrallahKDPK17,
author = {Huda Khayrallah and
Gaurav Kumar and
Kevin Duh and
Matt Post and
Philipp Koehn},
//editor = {Greg Kondrak and
Taro Watanabe},
title = {Neural Lattice Search for Domain Adaptation in Machine Translation},
booktitle = {Proceedings of the Eighth International Joint Conference on Natural
Language Processing, {IJCNLP} 2017, Taipei, Taiwan, November 27 -
December 1, 2017, Volume 2: Short Papers},
pages = {20--25},
publisher = {Asian Federation of Natural Language Processing},
year = {2017},
//url = {https://www.aclweb.org/anthology/I17-2004/},
//timestamp = {Tue, 17 Sep 2019 17:11:58 +0200},
//biburl = {https://dblp.org/rec/conf/ijcnlp/KhayrallahKDPK17.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/acl/StahlbergHWB16,
author = {Felix Stahlberg and
Eva Hasler and
Aurelien Waite and
Bill Byrne},
title = {Syntactically Guided Neural Machine Translation},
booktitle = {Proceedings of the 54th Annual Meeting of the Association for Computational
Linguistics, {ACL} 2016, August 7-12, 2016, Berlin, Germany, Volume
2: Short Papers},
publisher = {The Association for Computer Linguistics},
year = {2016},
//url = {https://doi.org/10.18653/v1/p16-2049},
//doi = {10.18653/v1/p16-2049},
//timestamp = {Tue, 28 Jan 2020 10:27:31 +0100},
//biburl = {https://dblp.org/rec/conf/acl/StahlbergHWB16.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/aclwat/NeubigMN15,
author = {Graham Neubig and
Makoto Morishita and
Satoshi Nakamura},
//editor = {Toshiaki Nakazawa and
Hideya Mino and
Isao Goto and
Graham Neubig and
Sadao Kurohashi and
Eiichiro Sumita},
title = {Neural Reranking Improves Subjective Quality of Machine Translation:
{NAIST} at {WAT2015}},
booktitle = {Proceedings of the 2nd Workshop on Asian Translation, {WAT} 2015,
Kyoto, Japan, October 16, 2015},
pages = {35--41},
publisher = {Workshop on Asian Translation},
year = {2015},
//url = {https://www.aclweb.org/anthology/W15-5003/},
//timestamp = {Tue, 17 Sep 2019 17:11:58 +0200},
//biburl = {https://dblp.org/rec/conf/aclwat/NeubigMN15.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{DBLP:conf/naacl/GrundkiewiczJ18,
author = {Roman Grundkiewicz and
Marcin Junczys{-}Dowmunt},
//editor = {Marilyn A. Walker and
Heng Ji and
Amanda Stent},
title = {Near Human-Level Performance in Grammatical Error Correction with
Hybrid Machine Translation},
booktitle = {Proceedings of the 2018 Conference of the North American Chapter of
the Association for Computational Linguistics: Human Language Technologies,
NAACL-HLT, New Orleans, Louisiana, USA, June 1-6, 2018, Volume 2 (Short
Papers)},
pages = {284--290},
publisher = {Association for Computational Linguistics},
year = {2018},
//url = {https://doi.org/10.18653/v1/n18-2046},
//doi = {10.18653/v1/n18-2046},
//timestamp = {Tue, 28 Jan 2020 10:30:23 +0100},
//biburl = {https://dblp.org/rec/conf/naacl/GrundkiewiczJ18.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@inproceedings{niehues-etal-2016-pre,
title = "Pre-Translation for Neural Machine Translation",
author = "Niehues, Jan and
Cho, Eunah and
Ha, Thanh-Le and
Waibel, Alex",
booktitle = "Proceedings of {COLING} 2016, the 26th International Conference on Computational Linguistics: Technical Papers",
month = dec,
year = "2016",
address = "Osaka, Japan",
publisher = "The COLING 2016 Organizing Committee",
//url = "https://www.aclweb.org/anthology/C16-1172",
pages = "1828--1836",
}
@inproceedings{DBLP:conf/eacl/GispertBHS17,
author = {Felix Stahlberg and
Adri{\`{a}} de Gispert and
Eva Hasler and
Bill Byrne},
//editor = {Mirella Lapata and
Phil Blunsom and
Alexander Koller},
title = {Neural Machine Translation by Minimising the Bayes-risk with Respect
to Syntactic Translation Lattices},
booktitle = {Proceedings of the 15th Conference of the European Chapter of the
Association for Computational Linguistics, {EACL} 2017, Valencia,
Spain, April 3-7, 2017, Volume 2: Short Papers},
pages = {362--368},
publisher = {Association for Computational Linguistics},
year = {2017},
//url = {https://doi.org/10.18653/v1/e17-2058},
//doi = {10.18653/v1/e17-2058},
//timestamp = {Wed, 29 Jan 2020 15:40:22 +0100},
//biburl = {https://dblp.org/rec/conf/eacl/GispertBHS17.bib},
//bibsource = {dblp computer science bibliography, https://dblp.org}
}
@article{Domingo2017Segment,
title={Segment-based interactive-predictive machine translation},
author={Domingo, Miguel and Peris, {\'A}lvaro and Casacuberta, Francisco},
journal={Machine Translation},
volume={31},
number={4},
pages={163--185},
year={2017},
}
@article{Alvaro2017Interactive,
title={Interactive neural machine translation},
author={Peris, {\'A}lvaro and Domingo, Miguel and Casacuberta, Francisco},
journal={Computer Speech \& Language},
volume={45},
pages={201--220},
year={2017},
}
@inproceedings{Nepveu2004Adaptive,
title={Adaptive Language and Translation Models for Interactive Machine Translation},
author={Nepveu, Laurent and Lapalme, Guy and Langlais, Philippe and Foster, George F.},
booktitle={Conference on Empirical Methods in Natural Language Processing},
year={2004},
}
\ No newline at end of file
\indexentry{Chapter4.1|hyperpage}{7}
\indexentry{Chapter4.1.1|hyperpage}{8}
\indexentry{Chapter4.1.2|hyperpage}{10}
\indexentry{Chapter4.2|hyperpage}{12}
\indexentry{Chapter4.2.1|hyperpage}{12}
\indexentry{Chapter4.2.2|hyperpage}{15}
\indexentry{Chapter4.2.2.1|hyperpage}{15}
\indexentry{Chapter4.2.2.2|hyperpage}{16}
\indexentry{Chapter4.2.2.3|hyperpage}{17}
\indexentry{Chapter4.2.3|hyperpage}{18}
\indexentry{Chapter4.2.3.1|hyperpage}{19}
\indexentry{Chapter4.2.3.2|hyperpage}{20}
\indexentry{Chapter4.2.3.3|hyperpage}{21}
\indexentry{Chapter4.2.4|hyperpage}{22}
\indexentry{Chapter4.2.4.1|hyperpage}{22}
\indexentry{Chapter4.2.4.2|hyperpage}{23}
\indexentry{Chapter4.2.4.3|hyperpage}{25}
\indexentry{Chapter4.2.5|hyperpage}{25}
\indexentry{Chapter4.2.6|hyperpage}{26}
\indexentry{Chapter4.2.7|hyperpage}{29}
\indexentry{Chapter4.2.7.1|hyperpage}{30}
\indexentry{Chapter4.2.7.2|hyperpage}{30}
\indexentry{Chapter4.2.7.3|hyperpage}{31}
\indexentry{Chapter4.2.7.4|hyperpage}{33}
\indexentry{Chapter4.3|hyperpage}{34}
\indexentry{Chapter4.3.1|hyperpage}{36}
\indexentry{Chapter4.3.1.1|hyperpage}{37}
\indexentry{Chapter4.3.1.2|hyperpage}{38}
\indexentry{Chapter4.3.1.3|hyperpage}{39}
\indexentry{Chapter4.3.1.4|hyperpage}{40}
\indexentry{Chapter4.3.2|hyperpage}{40}
\indexentry{Chapter4.3.3|hyperpage}{41}
\indexentry{Chapter4.3.4|hyperpage}{42}
\indexentry{Chapter4.3.5|hyperpage}{46}
\indexentry{Chapter4.4|hyperpage}{49}
\indexentry{Chapter4.4.1|hyperpage}{51}
\indexentry{Chapter4.4.2|hyperpage}{51}
\indexentry{Chapter4.4.2.1|hyperpage}{53}
\indexentry{Chapter4.4.2.2|hyperpage}{55}
\indexentry{Chapter4.4.2.3|hyperpage}{57}
\indexentry{Chapter4.4.3|hyperpage}{58}
\indexentry{Chapter4.4.3.1|hyperpage}{58}
\indexentry{Chapter4.4.3.2|hyperpage}{61}
\indexentry{Chapter4.4.3.3|hyperpage}{62}
\indexentry{Chapter4.4.3.4|hyperpage}{63}
\indexentry{Chapter4.4.3.5|hyperpage}{64}
\indexentry{Chapter4.4.4|hyperpage}{66}
\indexentry{Chapter4.4.4.1|hyperpage}{66}
\indexentry{Chapter4.4.4.2|hyperpage}{67}
\indexentry{Chapter4.4.5|hyperpage}{68}
\indexentry{Chapter4.4.6|hyperpage}{69}
\indexentry{Chapter4.4.7|hyperpage}{73}
\indexentry{Chapter4.4.7.1|hyperpage}{73}
\indexentry{Chapter4.4.7.2|hyperpage}{74}
\indexentry{Chapter4.5|hyperpage}{76}
\indexentry{未登录词|hyperpage}{11}
\indexentry{Out of Vocabulary Word,OOV Word|hyperpage}{11}
\indexentry{子词切分|hyperpage}{11}
\indexentry{Sub-word Segmentation|hyperpage}{11}
\indexentry{标准化|hyperpage}{11}
\indexentry{Normalization|hyperpage}{11}
\indexentry{数据清洗|hyperpage}{11}
\indexentry{Data Cleaning|hyperpage}{11}
\indexentry{数据选择|hyperpage}{13}
\indexentry{Data Selection|hyperpage}{13}
\indexentry{数据过滤|hyperpage}{13}
\indexentry{Data Filtering|hyperpage}{13}
\indexentry{开放词表|hyperpage}{16}
\indexentry{Open-Vocabulary|hyperpage}{16}
\indexentry{子词|hyperpage}{17}
\indexentry{Sub-word|hyperpage}{17}
\indexentry{字节对编码|hyperpage}{17}
\indexentry{双字节编码|hyperpage}{17}
\indexentry{Byte Pair Encoding,BPE|hyperpage}{17}
\indexentry{正则化|hyperpage}{20}
\indexentry{Regularization|hyperpage}{20}
\indexentry{过拟合问题|hyperpage}{20}
\indexentry{Overfitting Problem|hyperpage}{20}
\indexentry{反问题|hyperpage}{20}
\indexentry{Inverse Problem|hyperpage}{20}
\indexentry{适定的|hyperpage}{20}
\indexentry{Well-posed|hyperpage}{20}
\indexentry{不适定问题|hyperpage}{20}
\indexentry{Ill-posed Problem|hyperpage}{20}
\indexentry{降噪|hyperpage}{21}
\indexentry{Denoising|hyperpage}{21}
\indexentry{泛化|hyperpage}{21}
\indexentry{Generalization|hyperpage}{21}
\indexentry{标签平滑|hyperpage}{23}
\indexentry{Label Smoothing|hyperpage}{23}
\indexentry{相互适应|hyperpage}{24}
\indexentry{Co-Adaptation|hyperpage}{24}
\indexentry{集成学习|hyperpage}{25}
\indexentry{Ensemble Learning|hyperpage}{25}
\indexentry{容量|hyperpage}{26}
\indexentry{Capacity|hyperpage}{26}
\indexentry{宽残差网络|hyperpage}{27}
\indexentry{Wide Residual Network|hyperpage}{27}
\indexentry{探测任务|hyperpage}{28}
\indexentry{Probing Task|hyperpage}{28}
\indexentry{表面信息|hyperpage}{28}
\indexentry{Surface Information|hyperpage}{28}
\indexentry{语法信息|hyperpage}{28}
\indexentry{Syntactic Information|hyperpage}{28}
\indexentry{语义信息|hyperpage}{28}
\indexentry{Semantic Information|hyperpage}{28}
\indexentry{词嵌入|hyperpage}{29}
\indexentry{Embedding|hyperpage}{29}
\indexentry{数据并行|hyperpage}{29}
\indexentry{Data Parallelism|hyperpage}{29}
\indexentry{模型并行|hyperpage}{29}
\indexentry{Model Parallelism|hyperpage}{29}
\indexentry{小批量训练|hyperpage}{29}
\indexentry{Mini-batch Training|hyperpage}{29}
\indexentry{课程学习|hyperpage}{31}
\indexentry{Curriculum Learning|hyperpage}{31}
\indexentry{推断|hyperpage}{32}
\indexentry{Inference|hyperpage}{32}
\indexentry{解码|hyperpage}{32}
\indexentry{Decoding|hyperpage}{32}
\indexentry{搜索错误|hyperpage}{32}
\indexentry{Search Error|hyperpage}{32}
\indexentry{模型错误|hyperpage}{32}
\indexentry{Modeling Error|hyperpage}{32}
\indexentry{重排序|hyperpage}{34}
\indexentry{Re-ranking|hyperpage}{34}
\indexentry{双向推断|hyperpage}{34}
\indexentry{Bidirectional Inference|hyperpage}{34}
\indexentry{批量推断|hyperpage}{38}
\indexentry{Batch Inference|hyperpage}{38}
\indexentry{批量处理|hyperpage}{38}
\indexentry{Batching|hyperpage}{38}
\indexentry{二值网络|hyperpage}{39}
\indexentry{Binarized Neural Networks|hyperpage}{39}
\indexentry{自回归翻译|hyperpage}{40}
\indexentry{Autoregressive Translation|hyperpage}{40}
\indexentry{非自回归翻译|hyperpage}{40}
\indexentry{Non-Autoregressive Translation|hyperpage}{40}
\indexentry{繁衍率|hyperpage}{40}
\indexentry{Fertility|hyperpage}{40}
\indexentry{偏置|hyperpage}{41}
\indexentry{Bias|hyperpage}{41}
\indexentry{退化|hyperpage}{42}
\indexentry{Degenerate|hyperpage}{42}
\indexentry{过翻译|hyperpage}{43}
\indexentry{Over Translation|hyperpage}{43}
\indexentry{欠翻译|hyperpage}{43}
\indexentry{Under Translation|hyperpage}{43}
\indexentry{充分性|hyperpage}{44}
\indexentry{Adequacy|hyperpage}{44}
\indexentry{系统融合|hyperpage}{44}
\indexentry{System Combination|hyperpage}{44}
\indexentry{假设选择|hyperpage}{45}
\indexentry{Hypothesis Selection|hyperpage}{45}
\indexentry{多样性|hyperpage}{45}
\indexentry{Diversity|hyperpage}{45}
\indexentry{重排序|hyperpage}{46}
\indexentry{Re-ranking|hyperpage}{46}
\indexentry{混淆网络|hyperpage}{47}
\indexentry{Confusion Network|hyperpage}{47}
\indexentry{动态线性层聚合方法|hyperpage}{51}
\indexentry{Dynamic Linear Combination of Layers,DLCL|hyperpage}{51}
\indexentry{相互适应|hyperpage}{55}
\indexentry{Co-adaptation|hyperpage}{55}
\indexentry{数据增强|hyperpage}{57}
\indexentry{Data Augmentation|hyperpage}{57}
\indexentry{回译|hyperpage}{57}
\indexentry{Back Translation|hyperpage}{57}
\indexentry{迭代式回译|hyperpage}{58}
\indexentry{Iterative Back Translation|hyperpage}{58}
\indexentry{前向翻译|hyperpage}{59}
\indexentry{Forward Translation|hyperpage}{59}
\indexentry{分布式表示|hyperpage}{59}
\indexentry{Distributed Representation|hyperpage}{59}
\indexentry{预训练|hyperpage}{59}
\indexentry{Pre-training|hyperpage}{59}
\indexentry{微调|hyperpage}{59}
\indexentry{Fine-tuning|hyperpage}{59}
\indexentry{多任务学习|hyperpage}{61}
\indexentry{Multitask Learning|hyperpage}{61}
\indexentry{知识精炼|hyperpage}{62}
\indexentry{Knowledge Distillation|hyperpage}{62}
\indexentry{模型压缩|hyperpage}{62}
\indexentry{Model Compression|hyperpage}{62}
\indexentry{学习难度|hyperpage}{62}
\indexentry{Learning Difficulty|hyperpage}{62}
\indexentry{教师模型|hyperpage}{63}
\indexentry{Teacher Model|hyperpage}{63}
\indexentry{学生模型|hyperpage}{63}
\indexentry{Student Model|hyperpage}{63}
\indexentry{基于单词的知识精炼|hyperpage}{63}
\indexentry{Word-level Knowledge Distillation|hyperpage}{63}
\indexentry{基于序列的知识精炼|hyperpage}{63}
\indexentry{Sequence-level Knowledge Distillation|hyperpage}{63}
\indexentry{中间层输出|hyperpage}{64}
\indexentry{Hint-based Knowledge Transfer|hyperpage}{64}
\indexentry{注意力分布|hyperpage}{64}
\indexentry{Attention To Attention Transfer|hyperpage}{65}
\indexentry{循环一致性|hyperpage}{67}
\indexentry{Cycle Consistency|hyperpage}{67}
\indexentry{翻译中回译|hyperpage}{68}
\indexentry{On-the-fly Back-translation|hyperpage}{68}
\indexentry{神经架构搜索|hyperpage}{71}
\indexentry{Neural Architecture Search|hyperpage}{71}
\boolfalse {citerequest}\boolfalse {citetracker}\boolfalse {pagetracker}\boolfalse {backtracker}\relax
\defcounter {refsection}{0}\relax
\select@language {english}
\defcounter {refsection}{0}\relax
\contentsline {part}{\@mypartnumtocformat {I}{机器翻译基础}}{13}{part.1}
\ttl@starttoc {default@1}
\defcounter {refsection}{0}\relax
\contentsline {chapter}{\numberline {1}机器翻译简介}{15}{chapter.1}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {1.1}机器翻译的概念}{15}{section.1.1}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {1.2}机器翻译简史}{18}{section.1.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.2.1}人工翻译}{18}{subsection.1.2.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.2.2}机器翻译的萌芽}{19}{subsection.1.2.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.2.3}机器翻译的受挫}{20}{subsection.1.2.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.2.4}机器翻译的快速成长}{21}{subsection.1.2.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.2.5}机器翻译的爆发}{22}{subsection.1.2.5}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {1.3}机器翻译现状}{23}{section.1.3}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {1.4}机器翻译方法}{24}{section.1.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.4.1}基于规则的机器翻译}{26}{subsection.1.4.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.4.2}基于实例的机器翻译}{26}{subsection.1.4.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.4.3}统计机器翻译}{27}{subsection.1.4.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.4.4}神经机器翻译}{28}{subsection.1.4.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.4.5}对比分析}{29}{subsection.1.4.5}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {1.5}翻译质量评价}{30}{section.1.5}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.5.1}人工评价}{30}{subsection.1.5.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.5.2}自动评价}{31}{subsection.1.5.2}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{BLEU}{31}{section*.15}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{TER}{33}{section*.16}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于检测点的评价}{33}{section*.17}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {1.6}机器翻译应用}{34}{section.1.6}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {1.7}开源项目与评测}{36}{section.1.7}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.7.1}开源机器翻译系统}{36}{subsection.1.7.1}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{统计机器翻译开源系统}{37}{section*.19}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{神经机器翻译开源系统}{38}{section*.20}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.7.2}常用数据集及公开评测任务}{40}{subsection.1.7.2}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {1.8}推荐学习资源}{42}{section.1.8}
\defcounter {refsection}{0}\relax
\contentsline {chapter}{\numberline {2}词法、语法及统计建模基础}{47}{chapter.2}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {2.1}问题概述 }{48}{section.2.1}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {2.2}概率论基础}{49}{section.2.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.2.1}随机变量和概率}{50}{subsection.2.2.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.2.2}联合概率、条件概率和边缘概率}{51}{subsection.2.2.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.2.3}链式法则}{52}{subsection.2.2.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.2.4}贝叶斯法则}{53}{subsection.2.2.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.2.5}KL距离和熵}{55}{subsection.2.2.5}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{信息熵}{55}{section*.27}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{KL距离}{56}{section*.29}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{交叉熵}{56}{section*.30}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {2.3}中文分词}{57}{section.2.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.3.1}基于词典的分词方法}{58}{subsection.2.3.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.3.2}基于统计的分词方法}{59}{subsection.2.3.2}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{统计模型的学习与推断}{59}{section*.34}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{掷骰子游戏}{60}{section*.36}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{全概率分词方法}{62}{section*.40}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {2.4}$n$-gram语言模型 }{64}{section.2.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.4.1}建模}{65}{subsection.2.4.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.4.2}未登录词和平滑算法}{67}{subsection.2.4.2}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{加法平滑方法}{68}{section*.46}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{古德-图灵估计法}{69}{section*.48}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{Kneser-Ney平滑方法}{70}{section*.50}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {2.5}句法分析(短语结构分析)}{72}{section.2.5}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.5.1}句子的句法树表示}{72}{subsection.2.5.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.5.2}上下文无关文法}{74}{subsection.2.5.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.5.3}规则和推导的概率}{78}{subsection.2.5.3}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {2.6}小结及深入阅读}{80}{section.2.6}
\defcounter {refsection}{0}\relax
\contentsline {part}{\@mypartnumtocformat {II}{统计机器翻译}}{83}{part.2}
\ttl@stoptoc {default@1}
\ttl@starttoc {default@2}
\defcounter {refsection}{0}\relax
\contentsline {chapter}{\numberline {3}基于词的机器翻译模型}{85}{chapter.3}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {3.1}什么是基于词的翻译模型}{85}{section.3.1}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {3.2}构建一个简单的机器翻译系统}{87}{section.3.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.2.1}如何进行翻译?}{87}{subsection.3.2.1}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{机器翻译流程}{88}{section*.63}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{人工翻译 vs. 机器翻译}{89}{section*.65}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.2.2}基本框架}{89}{subsection.3.2.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.2.3}单词翻译概率}{90}{subsection.3.2.3}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{什么是单词翻译概率?}{90}{section*.67}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{如何从一个双语平行数据中学习?}{90}{section*.69}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{如何从大量的双语平行数据中学习?}{92}{section*.70}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.2.4}句子级翻译模型}{93}{subsection.3.2.4}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基础模型}{93}{section*.72}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{生成流畅的译文}{95}{section*.74}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.2.5}解码}{97}{subsection.3.2.5}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {3.3}基于词的翻译建模}{100}{section.3.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.3.1}噪声信道模型}{100}{subsection.3.3.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.3.2}统计机器翻译的三个基本问题}{102}{subsection.3.3.2}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{词对齐}{103}{section*.83}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于词对齐的翻译模型}{103}{section*.86}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于词对齐的翻译实例}{105}{section*.88}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {3.4}IBM模型1-2}{106}{section.3.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.4.1}IBM模型1}{106}{subsection.3.4.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.4.2}IBM模型2}{108}{subsection.3.4.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.4.3}解码及计算优化}{109}{subsection.3.4.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.4.4}训练}{110}{subsection.3.4.4}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{目标函数}{110}{section*.93}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{优化}{111}{section*.95}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {3.5}IBM模型3-5及隐马尔可夫模型}{117}{section.3.5}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.5.1}基于产出率的翻译模型}{117}{subsection.3.5.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.5.2}IBM 模型3}{120}{subsection.3.5.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.5.3}IBM 模型4}{121}{subsection.3.5.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.5.4} IBM 模型5}{123}{subsection.3.5.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.5.5}隐马尔可夫模型}{124}{subsection.3.5.5}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{隐马尔可夫模型}{125}{section*.107}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{词对齐模型}{126}{section*.109}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.5.6}解码和训练}{127}{subsection.3.5.6}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {3.6}问题分析}{127}{section.3.6}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.6.1}词对齐及对称化}{127}{subsection.3.6.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.6.2}Deficiency}{128}{subsection.3.6.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.6.3}句子长度}{129}{subsection.3.6.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.6.4}其他问题}{130}{subsection.3.6.4}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {3.7}小结及深入阅读}{130}{section.3.7}
\defcounter {refsection}{0}\relax
\contentsline {chapter}{\numberline {4}基于短语和句法的机器翻译模型}{133}{chapter.4}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {4.1}翻译中的结构信息}{133}{section.4.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.1.1}更大粒度的翻译单元}{134}{subsection.4.1.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.1.2}句子的结构信息}{136}{subsection.4.1.2}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {4.2}基于短语的翻译模型}{138}{section.4.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.2.1}机器翻译中的短语}{138}{subsection.4.2.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.2.2}数学建模及判别式模型}{141}{subsection.4.2.2}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于翻译推导的建模}{141}{section*.121}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{对数线性模型}{142}{section*.122}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{搭建模型的基本流程}{143}{section*.123}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.2.3}短语抽取}{144}{subsection.4.2.3}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{与词对齐一致的短语}{145}{section*.126}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{获取词对齐}{146}{section*.130}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{度量双语短语质量}{147}{section*.132}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.2.4}调序}{148}{subsection.4.2.4}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于距离的调序}{148}{section*.136}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于方向的调序}{149}{section*.138}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于分类的调序}{151}{section*.141}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.2.5}特征}{151}{subsection.4.2.5}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.2.6}最小错误率训练}{152}{subsection.4.2.6}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.2.7}栈解码}{155}{subsection.4.2.7}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{翻译候选匹配}{156}{section*.146}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{翻译假设扩展}{156}{section*.148}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{剪枝}{157}{section*.150}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{解码中的栈结构}{159}{section*.152}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {4.3}基于层次短语的模型}{160}{section.4.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.3.1}同步上下文无关文法}{163}{subsection.4.3.1}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{文法定义}{163}{section*.157}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{推导}{164}{section*.158}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{胶水规则}{165}{section*.159}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{处理流程}{166}{section*.160}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.3.2}层次短语规则抽取}{166}{subsection.4.3.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.3.3}翻译模型及特征}{168}{subsection.4.3.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.3.4}CYK解码}{169}{subsection.4.3.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.3.5}立方剪枝}{172}{subsection.4.3.5}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {4.4}基于语言学句法的模型}{175}{section.4.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.4.1}基于句法的翻译模型分类}{177}{subsection.4.4.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.4.2}基于树结构的文法}{177}{subsection.4.4.2}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{树到树翻译规则}{179}{section*.176}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于树结构的翻译推导}{181}{section*.178}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{树到串翻译规则}{183}{section*.181}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.4.3}树到串翻译规则抽取}{184}{subsection.4.4.3}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{树的切割与最小规则}{185}{section*.183}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{空对齐处理}{188}{section*.189}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{组合规则}{189}{section*.191}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{SPMT规则}{190}{section*.193}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{句法树二叉化}{191}{section*.195}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.4.4}树到树翻译规则抽取}{192}{subsection.4.4.4}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于节点对齐的规则抽取}{193}{section*.199}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于对齐矩阵的规则抽取}{194}{section*.202}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.4.5}句法翻译模型的特征}{196}{subsection.4.4.5}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.4.6}基于超图的推导空间表示}{197}{subsection.4.4.6}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.4.7}基于树的解码 vs 基于串的解码}{199}{subsection.4.4.7}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于树的解码}{201}{section*.209}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于串的解码}{202}{section*.212}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {4.5}小结及深入阅读}{204}{section.4.5}
\defcounter {refsection}{0}\relax
\contentsline {part}{\@mypartnumtocformat {III}{神经机器翻译}}{207}{part.3}
\ttl@stoptoc {default@2}
\ttl@starttoc {default@3}
\defcounter {refsection}{0}\relax
\contentsline {chapter}{\numberline {5}人工神经网络和神经语言建模}{209}{chapter.5}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {5.1}深度学习与人工神经网络}{210}{section.5.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.1.1}发展简史}{210}{subsection.5.1.1}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{早期的人工神经网络和第一次寒冬}{210}{section*.214}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{神经网络的第二次高潮和第二次寒冬}{211}{section*.215}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{深度学习和神经网络方法的崛起}{212}{section*.216}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.1.2}为什么需要深度学习}{213}{subsection.5.1.2}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{端到端学习和表示学习}{213}{section*.218}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{深度学习的效果}{214}{section*.220}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {5.2}神经网络基础}{214}{section.5.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.2.1}线性代数基础}{214}{subsection.5.2.1}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{标量、向量和矩阵}{215}{section*.222}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{矩阵的转置}{216}{section*.223}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{矩阵加法和数乘}{216}{section*.224}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{矩阵乘法和矩阵点乘}{217}{section*.225}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{线性映射}{218}{section*.226}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{范数}{219}{section*.227}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.2.2}人工神经元和感知机}{220}{subsection.5.2.2}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{感知机\ \raisebox {0.5mm}{------}\ 最简单的人工神经元模型}{221}{section*.230}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{神经元内部权重}{222}{section*.233}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{神经元的输入\ \raisebox {0.5mm}{------}\ 离散 vs 连续}{223}{section*.235}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{神经元内部的参数学习}{223}{section*.237}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.2.3}多层神经网络}{224}{subsection.5.2.3}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{线性变换和激活函数}{224}{section*.239}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{单层神经网络$\rightarrow $多层神经网络}{226}{section*.246}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.2.4}函数拟合能力}{227}{subsection.5.2.4}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {5.3}神经网络的张量实现}{231}{section.5.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.3.1} 张量及其计算}{232}{subsection.5.3.1}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{张量}{232}{section*.256}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{张量的矩阵乘法}{234}{section*.259}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{张量的单元操作}{235}{section*.261}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.3.2}张量的物理存储形式}{236}{subsection.5.3.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.3.3}使用开源框架实现张量计算}{236}{subsection.5.3.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.3.4}前向传播与计算图}{238}{subsection.5.3.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.3.5}神经网络实例}{241}{subsection.5.3.5}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {5.4}神经网络的参数训练}{242}{section.5.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.4.1}损失函数}{243}{subsection.5.4.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.4.2}基于梯度的参数优化}{243}{subsection.5.4.2}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{梯度下降}{244}{section*.279}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{梯度获取}{246}{section*.281}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于梯度的方法的变种和改进}{249}{section*.285}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.4.3}参数更新的并行化策略}{252}{subsection.5.4.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.4.4}梯度消失、梯度爆炸和稳定性训练}{254}{subsection.5.4.4}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{易于优化的激活函数}{254}{section*.288}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{梯度裁剪}{255}{section*.292}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{稳定性训练}{256}{section*.293}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.4.5}过拟合}{257}{subsection.5.4.5}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.4.6}反向传播}{258}{subsection.5.4.6}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{输出层的反向传播}{259}{section*.296}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{隐藏层的反向传播}{261}{section*.300}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{程序实现}{262}{section*.303}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {5.5}神经语言模型}{264}{section.5.5}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.5.1}基于神经网络的语言建模}{264}{subsection.5.5.1}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于前馈神经网络的语言模型}{265}{section*.306}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于循环神经网络的语言模型}{267}{section*.309}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于自注意力机制的语言模型}{268}{section*.311}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{语言模型的评价}{269}{section*.313}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.5.2}单词表示模型}{270}{subsection.5.5.2}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{One-hot编码}{270}{section*.314}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{分布式表示}{270}{section*.316}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.5.3}句子表示模型及预训练}{272}{subsection.5.5.3}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{简单的上下文表示模型}{272}{section*.320}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{ELMO模型}{274}{section*.323}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{GPT模型}{274}{section*.325}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{BERT模型}{275}{section*.327}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{为什么要预训练?}{276}{section*.329}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {5.6}小结及深入阅读}{277}{section.5.6}
\defcounter {refsection}{0}\relax
\contentsline {chapter}{\numberline {6}神经机器翻译模型}{279}{chapter.6}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {6.1}神经机器翻译的发展简史}{279}{section.6.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.1.1}神经机器翻译的起源}{281}{subsection.6.1.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.1.2}神经机器翻译的品质 }{283}{subsection.6.1.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.1.3}神经机器翻译的优势 }{286}{subsection.6.1.3}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {6.2}编码器-解码器框架}{288}{section.6.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.2.1}框架结构}{288}{subsection.6.2.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.2.2}表示学习}{289}{subsection.6.2.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.2.3}简单的运行实例}{290}{subsection.6.2.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.2.4}机器翻译范式的对比}{291}{subsection.6.2.4}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {6.3}基于循环神经网络的翻译模型及注意力机制}{292}{section.6.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.3.1}建模}{292}{subsection.6.3.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.3.2}输入(词嵌入)及输出(Softmax)}{296}{subsection.6.3.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.3.3}循环神经网络结构}{300}{subsection.6.3.3}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{循环神经单元(RNN)}{300}{section*.351}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{长短时记忆网络(LSTM)}{300}{section*.352}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{门控循环单元(GRU)}{302}{section*.355}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{双向模型}{304}{section*.357}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{多层循环神经网络}{304}{section*.359}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.3.4}注意力机制}{305}{subsection.6.3.4}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{翻译中的注意力机制}{306}{section*.362}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{上下文向量的计算}{307}{section*.365}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{注意力机制的解读}{310}{section*.370}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.3.5}训练}{312}{subsection.6.3.5}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{损失函数}{312}{section*.373}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{参数初始化}{313}{section*.374}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{优化策略}{314}{section*.375}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{梯度裁剪}{314}{section*.377}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{学习率策略}{314}{section*.378}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{并行训练}{316}{section*.381}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.3.6}推断}{317}{subsection.6.3.6}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{贪婪搜索}{319}{section*.385}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{束搜索}{320}{section*.388}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{长度惩罚}{321}{section*.390}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.3.7}实例-GNMT}{322}{subsection.6.3.7}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {6.4}Transformer}{323}{section.6.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.4.1}自注意力模型}{325}{subsection.6.4.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.4.2}Transformer架构}{326}{subsection.6.4.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.4.3}位置编码}{328}{subsection.6.4.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.4.4}基于点乘的注意力机制}{331}{subsection.6.4.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.4.5}掩码操作}{333}{subsection.6.4.5}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.4.6}多头注意力}{334}{subsection.6.4.6}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.4.7}残差网络和层正则化}{335}{subsection.6.4.7}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.4.8}前馈全连接网络子层}{336}{subsection.6.4.8}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.4.9}训练}{337}{subsection.6.4.9}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.4.10}推断}{340}{subsection.6.4.10}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {6.5}序列到序列问题及应用}{340}{section.6.5}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.5.1}自动问答}{341}{subsection.6.5.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.5.2}自动文摘}{341}{subsection.6.5.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.5.3}文言文翻译}{342}{subsection.6.5.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.5.4}对联生成}{342}{subsection.6.5.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.5.5}古诗生成}{343}{subsection.6.5.5}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {6.6}小结及深入阅读}{344}{section.6.6}
\defcounter {refsection}{0}\relax
\contentsline {chapter}{\numberline {7}神经机器翻译实战 \ \raisebox {0.5mm}{------}\ 参加一次比赛}{347}{chapter.7}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {7.1}神经机器翻译并不简单}{347}{section.7.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.1.1}影响神经机器翻译性能的因素}{348}{subsection.7.1.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.1.2}搭建神经机器翻译系统的步骤 }{349}{subsection.7.1.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.1.3}架构选择 }{350}{subsection.7.1.3}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {7.2}数据处理}{350}{section.7.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.2.1}分词}{351}{subsection.7.2.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.2.2}标准化}{352}{subsection.7.2.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.2.3}数据清洗}{353}{subsection.7.2.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.2.4}子词切分}{355}{subsection.7.2.4}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{大词表和OOV问题}{356}{section*.428}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{子词}{356}{section*.430}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{双字节编码(BPE)}{357}{section*.432}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{其他方法}{360}{section*.435}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {7.3}建模与训练}{360}{section.7.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.3.1}正则化}{360}{subsection.7.3.1}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{L1/L2正则化}{362}{section*.437}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{标签平滑}{363}{section*.438}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{Dropout}{364}{section*.440}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{Layer Dropout}{365}{section*.443}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.3.2}增大模型容量}{366}{subsection.7.3.2}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{宽网络}{366}{section*.445}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{深网络}{367}{section*.447}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{增大输入层和输出层表示能力}{369}{section*.449}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{大模型的分布式计算}{369}{section*.450}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.3.3}大批量训练}{369}{subsection.7.3.3}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{为什么需要大批量训练}{370}{section*.451}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{如何构建批次}{371}{section*.454}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {7.4}推断}{372}{section.7.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.4.1}推断优化}{372}{subsection.7.4.1}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{推断系统的架构}{372}{section*.456}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{自左向右推断 vs 自右向左推断}{373}{section*.458}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{推断加速}{374}{section*.459}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.4.2}译文长度控制}{381}{subsection.7.4.2}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{长度惩罚因子}{382}{section*.465}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{译文长度范围约束}{383}{section*.467}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{覆盖度模型}{383}{section*.468}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.4.3}多模型集成}{384}{subsection.7.4.3}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{假设选择}{385}{section*.469}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{局部预测融合}{386}{section*.471}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{译文重组}{387}{section*.473}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {7.5}进阶技术}{388}{section.7.5}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.5.1}深层模型}{388}{subsection.7.5.1}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{Post-Norm vs Pre-Norm}{388}{section*.476}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{层聚合}{391}{section*.479}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{深层模型的训练加速}{392}{section*.481}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{渐进式训练}{392}{section*.482}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{分组稠密连接}{392}{section*.484}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{学习率重置策略}{393}{section*.486}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{深层模型的鲁棒性训练}{395}{section*.488}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.5.2}单语数据的使用}{396}{subsection.7.5.2}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{伪数据}{397}{section*.491}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{预训练}{399}{section*.494}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{联合训练}{401}{section*.497}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.5.3}知识精炼}{401}{subsection.7.5.3}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{什么是知识精炼}{402}{section*.499}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{知识精炼的基本方法}{403}{section*.500}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{机器翻译中的知识精炼}{404}{section*.502}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.5.4}双向训练}{406}{subsection.7.5.4}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{有监督对偶学习}{406}{section*.504}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{无监督对偶学习}{407}{section*.505}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{翻译中回译}{408}{section*.507}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {7.6}小结及深入阅读}{408}{section.7.6}
\defcounter {refsection}{0}\relax
\contentsline {part}{\@mypartnumtocformat {IV}{附录}}{413}{part.4}
\ttl@stoptoc {default@3}
\ttl@starttoc {default@4}
\defcounter {refsection}{0}\relax
\contentsline {chapter}{\numberline {A}附录A}{415}{Appendix.1.A}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {A.1}基准数据集}{415}{section.1.A.1}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {A.2}平行语料}{416}{section.1.A.2}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {A.3}相关工具}{417}{section.1.A.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {A.3.1}数据预处理工具}{417}{subsection.1.A.3.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {A.3.2}评价工具}{418}{subsection.1.A.3.2}
\defcounter {refsection}{0}\relax
\contentsline {chapter}{\numberline {B}附录B}{419}{Appendix.2.B}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {B.1}IBM模型3训练方法}{419}{section.2.B.1}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {B.2}IBM模型4训练方法}{421}{section.2.B.2}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {B.3}IBM模型5训练方法}{423}{section.2.B.3}
\contentsfinish
......@@ -53,15 +53,13 @@
\begingroup
\thispagestyle{empty} % Suppress headers and footers on the title page
%\begin{tikzpicture}[remember picture,overlay]
\begin{tikzpicture}[remember picture,overlay]
\node[inner sep=0pt] (background) at (current page.center) {\includegraphics[width=\paperwidth]{background.pdf}};
\draw (current page.center) node [fill=ocre!30!white,fill opacity=0.6,text opacity=1,inner sep=1cm]{\Huge\centering\bfseries\sffamily\parbox[c][][t]{\paperwidth}{\centering 机器翻译:统计建模与深度学习方法\\[15pt] % Book title
%{\Large 副标题是否需要}\\[20pt] % Subtitle
{\LARGE 肖桐\ \ 朱靖波}}}; % Author name
\node[inner sep=0pt] (background) at (current page.center) {\includegraphics[width=\paperwidth,height=\paperheight]{fig-cover.jpg}};
\end{tikzpicture}
\vfill
\endgroup
%----------------------------------------------------------------------------------------
% COPYRIGHT PAGE
......@@ -75,9 +73,9 @@
\noindent \textsc{东北大学自然语言处理实验室\ /\ 小牛翻译}\\ % Publisher
\noindent \textsc{\url{http://47.105.50.196/NiuTrans/Toy-MT-Introduction/tree/master/Book}}\\ % URL
\noindent \textsc{\url{https://github.com/NiuTrans/MTBook}}\\ % URL
\noindent {\red{Licensed under the Creative Commons Attribution-NonCommercial 3.0 Unported License (the ``License''). You may not use this file except in compliance with the License. You may obtain a copy of the License at \url{http://creativecommons.org/licenses/by-nc/3.0}. Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \textsc{``as is'' basis, without warranties or conditions of any kind}, either express or implied. See the License for the specific language governing permissions and limitations under the License.}}\\ % License information, replace this with your own license (if any)
\noindent {\red{Licensed under the Creative Commons Attribution-NonCommercial 4.0 International License (the ``License''). You may not use this file except in compliance with the License. You may obtain a copy of the License at \url{http://creativecommons.org/licenses/by-nc/4.0}. Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \textsc{``as is'' basis, without warranties or conditions of any kind}, either express or implied. See the License for the specific language governing permissions and limitations under the License.}}\\ % License information, replace this with your own license (if any)
\noindent \textit{First Edition, April 2020}
......@@ -92,14 +90,15 @@
{\large
\noindent {\color{red} 在此感谢所有为本书做出贡献的人} \\
\noindent 曹润柘、曾信、孟霞、单韦乔、姜雨帆、王子扬、刘辉、许诺、李北、刘继强、张哲旸、周书、周涛、张裕浩、李炎洋、林野、刘晓倩、牛蕊 \\
}
%----------------------------------------------------------------------------------------
% TABLE OF CONTENTS
%----------------------------------------------------------------------------------------
%\usechapterimagefalse % If you don't want to include a chapter image, use this to toggle images off - it can be enabled later with \usechapterimagetrue
\chapterimage{chapter_head_1.pdf} %目录标题的图案
\chapterimage{fig-NEU-1.jpg} %目录标题的图案
\pagestyle{empty} % Disable headers and footers for the following pages
\tableofcontents % 打印目录
\cleardoublepage %保证章节页在奇数页
......@@ -114,9 +113,10 @@
%\include{Chapter1/chapter1}
%\include{Chapter2/chapter2}
%\include{Chapter3/chapter3}
\include{Chapter4/chapter4}
%\include{Chapter4/chapter4}
%\include{Chapter5/chapter5}
%\include{Chapter6/chapter6}
\include{Chapter7/chapter7}
%\include{ChapterAppend/chapterappend}
......@@ -124,6 +124,7 @@
%----------------------------------------------------------------------------------------
% BIBLIOGRAPHY
%----------------------------------------------------------------------------------------
\chapterimage{fig-NEU-10.jpg} %目录标题的图案
\cleardoublepage % Make sure the index starts on an odd (right side) page
\printbibliography
......@@ -132,7 +133,7 @@
%----------------------------------------------------------------------------------------
% INDEX
%----------------------------------------------------------------------------------------
\chapterimage{fig-NEU-10.jpg} %目录标题的图案
\cleardoublepage % Make sure the index starts on an odd (right side) page
%\phantomsection
%\setlength{\columnsep}{0.75cm} % Space between the 2 columns of the index
......
......@@ -75,9 +75,9 @@
\noindent \textsc{东北大学自然语言处理实验室\ /\ 小牛翻译}\\ % Publisher
\noindent \textsc{\url{http://47.105.50.196/NiuTrans/Toy-MT-Introduction/tree/master/Book}}\\ % URL
\noindent \textsc{\url{https://github.com/NiuTrans/MTBook}}\\ % URL
\noindent {\red{Licensed under the Creative Commons Attribution-NonCommercial 3.0 Unported License (the ``License''). You may not use this file except in compliance with the License. You may obtain a copy of the License at \url{http://creativecommons.org/licenses/by-nc/3.0}. Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \textsc{``as is'' basis, without warranties or conditions of any kind}, either express or implied. See the License for the specific language governing permissions and limitations under the License.}}\\ % License information, replace this with your own license (if any)
\noindent {\red{Licensed under the Creative Commons Attribution-NonCommercial 4.0 International License (the ``License''). You may not use this file except in compliance with the License. You may obtain a copy of the License at \url{http://creativecommons.org/licenses/by-nc/4.0}. Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an \textsc{``as is'' basis, without warranties or conditions of any kind}, either express or implied. See the License for the specific language governing permissions and limitations under the License.}}\\ % License information, replace this with your own license (if any)
\noindent \textit{First Edition, April 2020}
......@@ -92,7 +92,7 @@
{\large
\noindent {\color{red} 在此感谢所有为本书做出贡献的人} \\
\noindent 曹润柘、曾信、孟霞、单韦乔、姜雨帆、王子扬、刘辉、许诺、李北、刘继强、张哲旸、周书、周涛、张裕浩、李炎洋、刘晓倩、牛蕊 \\
}
......
......@@ -2,19 +2,20 @@
% !TEX encoding = UTF-8 Unicode
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% This file was modified on top of
% The Legrand Orange Book
% Structural Definitions File
% Version 2.1 (26/09/2018)
%
% Original author:
% Mathias Legrand (legrand.mathias@gmail.com) with modifications by:
% Vel (vel@latextemplates.com)
%
% This file was downloaded from:
% http://www.LaTeXTemplates.com
% Current Version is maintained by
% Tong Xiao (xiaotong@mail.neu.edu.cn)
% Runzhe Cao (854581319@qq.com)
%
% License:
% CC BY-NC-SA 3.0 (http://creativecommons.org/licenses/by-nc-sa/3.0/)
% License of This File:
% CC BY-NC-SA 4.0 (http://creativecommons.org/licenses/by-nc-sa/4.0/)
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
......@@ -47,6 +48,7 @@
\geometry{
paper=b5paper, % Paper size, change to letterpaper for US letter size
%papersize={185mm,260mm}, % specify paper size by (width,height)
top=2cm, % Top margin
bottom=1.5cm, % Bottom margin
left=1.8cm, % Left margin
......@@ -114,7 +116,7 @@
{\addvspace{3pt}\sffamily\bfseries} % Spacing and font options for sections
{\contentslabel[\thecontentslabel]{1.25cm}} % Formatting of numbered sections of this type
{} % Formatting of numberless sections of this type
{ \titlerule*[.5pc]{.}\;\thecontentspage}%
{\titlerule*[.5pc]{.}\;\thecontentspage}%
%{\hfill\color{black}\thecontentspage} % Formatting of the filler to the right of the heading and the page number
% Subsection text styling
\titlecontents{subsection}
......@@ -372,15 +374,15 @@ innerbottommargin=5pt]{cBox}
\renewcommand{\section}{\@startsection{section}{1}{\z@}
{-4ex \@plus -1ex \@minus -.4ex}
{1ex \@plus.2ex }
{\normalfont\large\sffamily\bfseries}}
{\color{ublue}\normalfont\Large\sffamily\bfseries}}
\renewcommand{\subsection}{\@startsection {subsection}{2}{\z@}
{-3ex \@plus -0.1ex \@minus -.4ex}
{0.5ex \@plus.2ex }
{\normalfont\sffamily\bfseries}}
{\normalfont\large\sffamily\bfseries}}
\renewcommand{\subsubsection}{\@startsection {subsubsection}{3}{\z@}
{-3ex \@plus -0.1ex \@minus -.4ex}
{.4ex \@plus.2ex }
{\normalfont\small\sffamily\bfseries}}
{\normalfont\normalsize\sffamily\bfseries}}
\renewcommand\paragraph{\@startsection{paragraph}{4}{\z@}
{-2ex \@plus-.2ex \@minus .2ex}
{.1ex}
......@@ -537,6 +539,13 @@ addtohook={%
\fi
}
}
%----------------------------------------------------------------------------------------
% NEW PAGE FOR SUBSECTION
%----------------------------------------------------------------------------------------
%\newcommand{\sectionnewpage}{\newpage}
\newcommand{\sectionnewpage}{}
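% Usage sketch (illustrative addition, not part of the original sources):
% \sectionnewpage is a toggle hook. With the empty definition above it is a
% no-op; switching to the commented \newpage variant makes every marked
% section start on a fresh page. A chapter file would invoke it as:
%   \sectionnewpage
%   \section{数据处理}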
%----------------------------------------------------------------------------------------
% Chapter 3
%----------------------------------------------------------------------------------------
......@@ -547,7 +556,7 @@ addtohook={%
\usetikzlibrary{mindmap,backgrounds} % mind map
\usepackage{type1cm}%设置公式字体
\usepackage{caption}%设置图片标题字体大小
\captionsetup{font={small}}
\captionsetup{font={footnotesize}}
\usepackage{pstricks}
\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}
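% Usage sketch (illustrative addition): the starred form of
% \DeclareMathOperator places limits underneath the operator in display
% math, so a decision rule can be typeset as, e.g.,
%   \begin{equation}
%     \hat{y} = \argmax_{y} \textrm{P}(y \mid x)
%   \end{equation}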
......@@ -615,10 +624,29 @@ addtohook={%
\definecolor{xtgreen}{rgb}{0.914,0.945,0.902}
\definecolor{lightgray}{gray}{0.85}
%%%%%%%%%%%%appendix-------------------------------
\makeatletter
\def\UrlAlphabet{%
\do\a\do\b\do\c\do\d\do\e\do\f\do\g\do\h\do\i\do\j%
\do\k\do\l\do\m\do\n\do\o\do\p\do\q\do\r\do\s\do\t%
\do\u\do\v\do\w\do\x\do\y\do\z\do\A\do\B\do\C\do\D%
\do\E\do\F\do\G\do\H\do\I\do\J\do\K\do\L\do\M\do\N%
\do\O\do\P\do\Q\do\R\do\S\do\T\do\U\do\V\do\W\do\X%
\do\Y\do\Z}
\def\UrlDigits{\do\1\do\2\do\3\do\4\do\5\do\6\do\7\do\8\do\9\do\0}
\g@addto@macro{\UrlBreaks}{\UrlOrds}%特殊符号
\g@addto@macro{\UrlBreaks}{\UrlAlphabet}%26个字母表
\g@addto@macro{\UrlBreaks}{\UrlDigits}%10个阿拉伯数字
\makeatother
%上述设置的作用是URL自动换行
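% Usage sketch (illustrative addition): with the break points registered
% above, a long URL typeset with \url can wrap at any letter or digit
% instead of overflowing the line, e.g.
%   \url{https://github.com/NiuTrans/MTBook}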
%%%%%%%%%%%chapter 7---------------------------------------
%\definecolor{myblack}{rgb}{0.15,0.15,0.15}
\definecolor{myblack}{rgb}{0.2,0.2,0.2}
\newlength{\hseg}
\newlength{\wnode}
\newlength{\hnode}
\newlength{\wseg}
\usepackage{collcell}
\usepackage[mathscr]{euscript}