Commit 9dc1938f by 曹润柘

Merge branch 'caorunzhe' into 'master'

Caorunzhe

View merge request !15
parents ed8e172a b34f8ac1
%%%------------------------------------------------------------------------------------------------------------
%%% Interlingua-based translation vs. direct translation among multiple languages
\begin{center}
\begin{tikzpicture}
\begin{scope}
\tikzstyle{datanode} = [minimum width=5em,minimum height=1.7em,fill=red!20,rounded corners=0.3em];
\tikzstyle{modelnode} = [minimum width=5em,minimum height=1.7em,fill=blue!20,rounded corners=0.3em];
\tikzstyle{decodingnode} = [minimum width=5em,minimum height=1.7em,fill=green!20,rounded corners=0.3em];
\node [datanode,anchor=north west] (s1) at (0,0) {{ \small{Language 1}}};
\node [datanode,anchor=north] (s2) at ([yshift=-4.5em]s1.south) {{ \small{Language 3}}};
\node [datanode,anchor=west] (s3) at ([xshift=4.5em]s1.east) {{ \small{Language 2}}};
\node [datanode,anchor=north] (s4) at ([yshift=-4.5em]s3.south) {{ \small{Language 4}}};
\node [circle,anchor=north west,inner sep=2pt,fill=blue!20] (m1) at ([xshift=0.8em,yshift=-0.5em]s1.south east) {{ \small{Interlingua}}};
\draw [<->,very thick] (s1.south) -- (m1.west);
\draw [<->,very thick] (s2.north) -- (m1.west);
\draw [<->,very thick] (s3.south) -- (m1.east);
\draw [<->,very thick] (s4.north) -- (m1.east);
\end{scope}
\begin{scope}[xshift=16em]
\tikzstyle{datanode} = [minimum width=5em,minimum height=1.7em,fill=red!20,rounded corners=0.3em];
\tikzstyle{modelnode} = [minimum width=5em,minimum height=1.7em,fill=blue!20,rounded corners=0.3em];
\tikzstyle{decodingnode} = [minimum width=5em,minimum height=1.7em,fill=green!20,rounded corners=0.3em];
\node [datanode,anchor=north west] (s1) at (0,0) {{ \small{Language 1}}};
\node [datanode,anchor=north] (s2) at ([yshift=-4.5em]s1.south) {{ \small{Language 3}}};
\node [datanode,anchor=west] (s3) at ([xshift=4.5em]s1.east) {{ \small{Language 2}}};
\node [datanode,anchor=north] (s4) at ([yshift=-4.5em]s3.south) {{ \small{Language 4}}};
\draw [<->,very thick] (s1.south) -- (s2.north);
\draw [<->,very thick] (s1.east) -- (s3.west);
\draw [<->,very thick] (s3.south) -- (s4.north);
\draw [<->,very thick] (s2.east) -- (s4.west);
\draw [<->,very thick] (s1.south east) -- (s4.north west);
\draw [<->,very thick] (s2.north east) -- (s3.south west);
\end{scope}
\end{tikzpicture}
\end{center}
%------------------------------------------------------------------------------------------------------------
%%% Syntax tree (hierarchical phrases)
\begin{tikzpicture}
{\small
\begin{scope}[sibling distance=10pt, level distance = 20pt]
{\scriptsize
\Tree[.\node(n1){\textbf{zj}};
[.\node(n2){\textbf{dj}};
[.\node(n3){\textbf{np}}; \node(cw1){}; ]
[.\node(n4){\textbf{vp}};
[. \node(cw1){pp};
[. \node(cw2){}; ]
[. \node(cw3){np};
[. \node(cw4){mp};
[. \node(p1){}; ]
[. \node(p2){}; ]
]
[. \node(cw5){np}; \node(p3){}; ]
]
]
[. \node(cw6){vp};
[. \node(cw7){}; ]
[. \node(cw8){pp};
[. \node(cw9){}; ]
[. \node(cw10){sp};
[. \node(cw11){}; ]
[. \node(cw12){}; ]
]
]
]
]
]
[.\node(n5){\textbf{}}; ]
]
}
\end{scope}
}
\end{tikzpicture}
\node[anchor=north west] (original4-3) at ([yshift=0.3em]original4-2.south west) {\scriptsize{悲哀,让我跟你去吧!''``哦,……爱,我实在太悲哀了,想自己一个人呆一会儿!''悲哀答}};
\node[anchor=north west] (original4-4) at ([yshift=0.3em]original4-3.south west) {\scriptsize{道。快乐走近爱的身边,但是她太快乐了,竟然没有听见爱在叫她!}};
% Machine translation --------------
\node [anchor=north west] (mt4) at ([xshift=-3.5em,yshift=-0.3em]original4-4.south west) {\scriptsize{机器翻译:At this time, Richness {\color{red}\underline{passed by}} in a big ship. Love said, ``Rich, can you take me away?'' Richness}};
\node [anchor=north west] (mt4-1) at ([xshift=3.5em,yshift=0.4em]mt4.south west) {\scriptsize{replied, ``No, {\color{red}\underline{there are many treasures}} of gold and silver in my ship,and there is no place for you.''}};
\node [anchor=north west] (mt4-2) at ([xshift=0em,yshift=0.3em]mt4-1.south west) {\scriptsize{ Love saw vanity in a magnificent boat and said, ``Vanity, help me!'' ``I can't help you. You are {\color{red}\underline{soak}}}};
\node [anchor=north west] (mt4-3) at ([yshift=0.4em]mt4-2.south west) {\scriptsize{{\color{red}\underline{-ed to the skin}} and will damage my beautiful boat.'' When sorrow came, love asked him for help: ``}};
\node [anchor=north west] (mt4-4) at ([yshift=0.4em]mt4-3.south west) {\scriptsize{sorrow, let me go with you!'' ``Oh,...love, I am so sad that I want to be alone for a while!'' Sadly rep}};
\node [anchor=north west] (mt4-5) at ([yshift=0.4em]mt4-4.south west) {\scriptsize{-lied. Happiness {\color{red}\underline{approached}} love, but she was too happy to hear love calling her!}};
% Human translation ---------------
\node [anchor=north west] (ht4) at ([xshift=-3.5em,yshift=0.3em]mt4-5.south west) {\scriptsize{人工翻译:At that moment, WEALTH {\color{red}\underline{was passing by}} in a big boat. Love said,``WEALTH, can you take me}};
\node [anchor=north west] (ht4-1) at ([xshift=3.5em,yshift=0.4em]ht4.south west) {\scriptsize{with you?'' WEALTH answered, ``no, {\color{red}\underline{there is a lot}} of gold and silver in my boat. There is no place}};
\node [anchor=north west] (ht4-2) at ([yshift=0.4em]ht4-1.south west) {\scriptsize{for you.'' Love saw VANITY in a beautiful boat and said, ``VANITY, help me!'' ``I can't help you.}};
\node [anchor=north west] (ht4-3) at ([yshift=0.4em]ht4-2.south west) {\scriptsize{You are {\color{red}\underline{all wet,}} and will break my pretty boat.'' Then SADNESS came. Love asked for help,``SAD}};
\node [anchor=north west] (ht4-4) at ([yshift=0.4em]ht4-3.south west) {\scriptsize{-NESS, let me go with you!'' ``Oh,...LOVE, I am so sad that I want to be alone for a while!'' ``Repli}};
\node [anchor=north west] (ht4-5) at ([yshift=0.4em]ht4-4.south west) {\scriptsize{-ed SADNESS. JOY {\color{red}\underline{came close to }} love, but she was so happy that she did not hear him call her!}};
% Third paragraph --------------------------------
\node[anchor=north west] (original8) at ([xshift=-3.5em,yshift=-0.3em]ht4-5.south west) {\scriptsize{原\qquad 文:突然,一个声音传来:``过来,爱,我带你走。''这是位长者。爱大喜过望,竟忘了问他他}};
\node[anchor=north west] (original8-1) at ([xshift=3.5em,yshift=0.3em]original8.south west) {\scriptsize{的名字。登上陆地后,长者独自走开了。爱对长者感激不尽,问另一位长者知识:``帮我的}};
\node[anchor=north west] (original8-2) at ([yshift=0.3em]original8-1.south west) {\scriptsize{那个人是谁?''``他是时间。''知识老人回答。``时间?''爱问道,``他为什么要帮我?''知识老}};
\node[anchor=north west] (original8-3) at ([yshift=0.3em]original8-2.south west) {\scriptsize{人笑道:``因为只有时间才能理解爱有多么伟大。''}};
% Machine translation --------------
\node [anchor=north west] (mt8) at ([xshift=-3.5em,yshift=0.4em]original8-3.south west) {\scriptsize{机器翻译:Suddenly, a voice {\color{red}\underline{came:}} ``Come here, love, I'll take you away.'' This is an elder. Love was {\color{red}\underline{overjoy}}}};
\node [anchor=north west] (mt8-1) at ([xshift=3.5em,yshift=0.4em]mt8.south west) {\scriptsize{{\color{red}\underline{-ed and}} forgot to ask his name. After landing on land, the elder walked away alone.Love was very}};
\node [anchor=north west] (mt8-2) at ([yshift=0.4em]mt8-1.south west) {\scriptsize{grateful to the elder and asked another elder knowledge, {\color{red}\underline{``Who is the person who helped me?''}} ``He}};
\node [anchor=north west] (mt8-3) at ([yshift=0.4em]mt8-2.south west) {\scriptsize{is time.'' The {\color{red}\underline{old intellectual}} replied. ``Time?'' Love asked,``Why did he help me?'' The old intellec}};
\node [anchor=north west] (mt8-4) at ([yshift=0.4em]mt8-3.south west) {\scriptsize{-tual laughed, ``Because only time canunderstand how great love is.''}};
% Human translation ---------------
\node [anchor=north west] (ht8) at ([xshift=-3.5em,yshift=0.4em]mt8-4.south west) {\scriptsize{人工翻译:Suddenly, a voice {\color{red}\underline{said,}} ``come, LOVE, I'll take you.'' This is an elder. LOVE was {\color{red}\underline{so over that}} she}};
\node [anchor=north west] (ht8-1) at ([xshift=3.5em,yshift=0.4em]ht8.south west) {\scriptsize{forgot to ask his name. After landing on land, the elder walked away}};
\node [anchor=north west] (ht8-2) at ([yshift=0.4em]ht8-1.south west) {\scriptsize{alone.LOVE was so grateful to the elder that she asked KNOWLEDGE, another elder, {\color{red}\underline{``who help}}}};
\node [anchor=north west] (ht8-3) at ([yshift=0.4em]ht8-2.south west) {\scriptsize{{\color{red}\underline{-ed me?''}} ``He is TIME.'' The {\color{red}\underline{old man}} replied. ``TIME?'' LOVE asked. ``why did he help me?'' The}};
\node [anchor=north west] (ht8-4) at ([yshift=0.4em]ht8-3.south west) {\scriptsize{old man smiled and said, ``Because only time can understand how great love is.''}};
\begin{pgfonlayer}{background}
{
\node[rectangle,draw=ublue, inner sep=0mm] [fit =(original1)(ht1)(mt1)(original4-4)(ht4-2)(ht8-4)(ht8)] {};
}
\end{pgfonlayer}
%%% outline
%-------------------------------------------------------------------------
\begin{tikzpicture}
\node [pos=0.4,left,xshift=-0.4em,yshift=2.0em,opacity=0] (mt4) {\scriptsize{机器翻译:At this time, Richness {\color{red}\underline{passed by}} in a big ship. Love said, ``Rich, can you take me away?'' Richness}};
\node [anchor=north west] (ht4-3) at ([xshift=3.5em,yshift=1.3em]mt4.south west) {\scriptsize{You are {\color{red}\underline{all wet,}} and will break my pretty boat.'' Then SADNESS came. Love asked for help,``SAD}};
\node [anchor=north west] (ht4-4) at ([yshift=0.4em]ht4-3.south west) {\scriptsize{-NESS, let me go with you!'' ``Oh,...LOVE, I am so sad that I want to be alone for a while!'' ``Repli}};
\node [anchor=north west] (ht4-5) at ([yshift=0.4em]ht4-4.south west) {\scriptsize{-ed SADNESS. JOY {\color{red}\underline{came close to }} love, but she was so happy that she did not hear him call her!}};
\begin{pgfonlayer}{background}
{
\node[rectangle,draw=ublue, inner sep=0mm] [fit = (ht4-3)(original8)(ht8-4)] {};
}
\end{pgfonlayer}
%%%------------------------------------------------------------------------------------------------------------
%%% Architecture of transfer-based translation (the machine translation pyramid)
\begin{center}
\begin{tikzpicture}
\begin{scope}
\tikzstyle{node1} = [minimum width=7em,minimum height=1.7em,fill=red!20,rounded corners=0.3em];
\tikzstyle{node2} = [minimum width=7em,minimum height=2.5em,fill=blue!20,rounded corners=0.3em];
\tikzstyle{node3} = [minimum width=7em,minimum height=2.5em,fill=green!20,rounded corners=0.3em];
\tikzstyle{node4} = [minimum width=7em,minimum height=1.7em,fill=orange!20,rounded corners=0.3em];
\tikzstyle{node5} = [minimum width=4.5em,minimum height=1.7em,dashed];
\node [node1,anchor=south west] (s1) at (0,0) {{ \small{Source word string}}};
\node [node2,anchor=south] (s2) at ([xshift=2.5em,yshift=2em]s1.north) {};
\node [anchor=north,minimum height=1.2em] (s21) at (s2.north) {{ \small{Source-language}}};
\node [anchor=south,minimum height=1.2em] (s22) at (s2.south) {{ \small{syntactic structure}}};
\node [node3,anchor=south] (s3) at ([xshift=2.5em,yshift=2em]s2.north) {};
\node [anchor=north,minimum height=1.2em] (s31) at (s3.north) {{ \small{Source-language}}};
\node [anchor=south,minimum height=1.2em] (s32) at (s3.south) {{ \small{semantic structure}}};
\node [node4,anchor=south] (s4) at ([xshift=6em,yshift=2em]s3.north) {{ \small{Interlingua (knowledge representation)}}};
\node [node3,anchor=north] (t1) at ([xshift=6em,yshift=-2em]s4.south) {};
\node [anchor=north,minimum height=1.2em] (t11) at (t1.north) {{ \small{Target-language}}};
\node [anchor=south,minimum height=1.2em] (t12) at (t1.south) {{ \small{semantic structure}}};
\node [node2,anchor=north] (t2) at ([xshift=2.5em,yshift=-2em]t1.south) {};
\node [anchor=north,minimum height=1.2em] (t21) at (t2.north) {{ \small{Target-language}}};
\node [anchor=south,minimum height=1.2em] (t22) at (t2.south) {{ \small{syntactic structure}}};
\node [node1,anchor=north] (t3) at ([xshift=2.5em,yshift=-2em]t2.south) {{ \small{Target word string}}};
\node [anchor=north] (l1) at ([yshift=-2em]s4.south) {{ \small{Semantic transfer}}};
\node [anchor=north] (l2) at ([yshift=-3em]l1.south) {{ \small{Syntactic transfer}}};
\node [anchor=north] (l3) at ([yshift=-2.5em]l2.south) {{ \small{Lexical transfer}}};
\node [node5,anchor=north,draw] (st1) at ([xshift=-3em,yshift=-0.5em]l3.south) {{ \small{Analysis}}};
\node [node5,anchor=north,draw] (st2) at ([xshift=3em,yshift=-0.5em]l3.south) {{ \small{Generation}}};
\draw [->,thick] (s1.north) -- ([xshift=-0.6em]s2.south);
\draw [->,thick] ([xshift=0.6em]s2.north) -- (s3.south);
\draw [->,thick] ([xshift=1.2em]s3.north) -- ([xshift=-2.7em]s4.south);
\draw [->,thick] ([xshift=2.7em]s4.south) -- ([xshift=-1.2em]t1.north);
\draw [->,thick] (t1.south) -- ([xshift=-0.6em]t2.north);
\draw [->,thick] ([xshift=0.6em]t2.south) -- (t3.north);
\draw [->,thick] (s3.east) -- (t1.west);
\draw [->,thick] (s2.east) -- (t2.west);
\draw [->,thick] (s1.east) -- (t3.west);
\draw [-] ([xshift=0.1em]s4.south) -- ([xshift=0.1em,yshift=-14em]s4.south);
\end{scope}
\end{tikzpicture}
\end{center}
%%%------------------------------------------------------------------------------------------------------------
%%% Architecture of a transfer-based translation system
\begin{center}
\begin{tikzpicture}
\begin{scope}
\tikzstyle{datanode} = [minimum width=7em,minimum height=1.7em,fill=red!20,rounded corners=0.3em];
\tikzstyle{modelnode} = [minimum width=7em,minimum height=1.7em,fill=blue!20,rounded corners=0.3em];
\tikzstyle{decodingnode} = [minimum width=7em,minimum height=1.7em,fill=green!20,rounded corners=0.3em];
\node [datanode,anchor=north west,minimum height=1.7em,minimum width=8em] (s1) at (0,0) {{ \small{Source sentence}}};
\node [modelnode,anchor=north,minimum height=1.7em,minimum width=8em] (s2) at ([yshift=-1.5em]s1.south) {{ \small{Source morphological analysis}}};
\node [datanode,anchor=north,minimum height=1.7em,minimum width=8em] (s3) at ([yshift=-1.5em]s2.south) {{ \small{Source word string}}};
\node [modelnode,anchor=north,minimum height=1.7em,minimum width=8em] (s4) at ([yshift=-1.5em]s3.south) {{ \small{Source syntactic analysis}}};
\node [datanode,anchor=north,minimum height=1.7em,minimum width=8em] (s5) at ([yshift=-1.5em]s4.south) {{ \small{Source structure}}};
\node [datanode,anchor=west,minimum height=1.7em,minimum width=8em] (t1) at ([xshift=14em]s1.east) {{ \small{Target sentence}}};
\node [modelnode,anchor=north,minimum height=1.7em,minimum width=8em] (t2) at ([yshift=-1.5em]t1.south) {{ \small{Target morphological generation}}};
\node [datanode,anchor=north,minimum height=1.7em,minimum width=8em] (t3) at ([yshift=-1.5em]t2.south) {{ \small{Target word string}}};
\node [modelnode,anchor=north,minimum height=1.7em,minimum width=8em] (t4) at ([yshift=-1.5em]t3.south) {{ \small{Target syntactic generation}}};
\node [datanode,anchor=north,minimum height=1.7em,minimum width=8em] (t5) at ([yshift=-1.5em]t4.south) {{ \small{Target structure}}};
\node [decodingnode,anchor=west,minimum height=1.7em,minimum width=8em] (st1) at ([xshift=2.5em,yshift=0.85em]s5.east) {{ \small{Source-to-target lexical transfer}}};
\node [decodingnode,anchor=north,minimum height=1.7em,minimum width=8em] (st2) at ([yshift=0.05em]st1.south) {{ \small{Source-to-target structural transfer}}};
\draw [->,very thick] (s1.south) -- (s2.north);
\draw [->,very thick] (s2.south) -- (s3.north);
\draw [->,very thick] (s3.south) -- (s4.north);
\draw [->,very thick] (s4.south) -- (s5.north);
\draw [->,very thick] (s5.east) -- (st1.south west);
\draw [->,very thick] (st1.south east) -- (t5.west);
\draw [->,very thick] (t5.north) -- (t4.south);
\draw [->,very thick] (t4.north) -- (t3.south);
\draw [->,very thick] (t3.north) -- (t2.south);
\draw [->,very thick] (t2.north) -- (t1.south);
\end{scope}
\end{tikzpicture}
\end{center}
% !Mode:: "TeX:UTF-8"
% !TEX encoding = UTF-8 Unicode
%----------------------------------------------------------------------------------------
% 机器翻译:统计建模与深度学习方法
% Machine Translation: Statistical Modeling and Deep Learning Methods
%
% Copyright 2020
% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
%----------------------------------------------------------------------------------------
%----------------------------------------------------------------------------------------
% CONFIGURATIONS
%----------------------------------------------------------------------------------------
\part{Machine Translation Fundamentals}
\renewcommand\figurename{图}% set the figure caption label to 图
\renewcommand\tablename{表}% set the table caption label to 表
\chapterimage{fig-NEU-10.jpg} % Chapter heading image
%----------------------------------------------------------------------------------------
% CHAPTER 10
%----------------------------------------------------------------------------------------
\chapter{Models Based on Recurrent Neural Networks}
%----------------------------------------------------------------------------------------
% NEW SECTION
%----------------------------------------------------------------------------------------
\section{}
% !Mode:: "TeX:UTF-8"
% !TEX encoding = UTF-8 Unicode
%----------------------------------------------------------------------------------------
% 机器翻译:统计建模与深度学习方法
% Machine Translation: Statistical Modeling and Deep Learning Methods
%
% Copyright 2020
% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
%----------------------------------------------------------------------------------------
%----------------------------------------------------------------------------------------
% CONFIGURATIONS
%----------------------------------------------------------------------------------------
\part{Machine Translation Fundamentals}
\renewcommand\figurename{图}% set the figure caption label to 图
\renewcommand\tablename{表}% set the table caption label to 表
\chapterimage{fig-NEU-2.jpg} % Chapter heading image
%----------------------------------------------------------------------------------------
% CHAPTER 11
%----------------------------------------------------------------------------------------
\chapter{Models Based on Self-Attention}
%----------------------------------------------------------------------------------------
% NEW SECTION
%----------------------------------------------------------------------------------------
\section{}
% !Mode:: "TeX:UTF-8"
% !TEX encoding = UTF-8 Unicode
%----------------------------------------------------------------------------------------
% 机器翻译:统计建模与深度学习方法
% Machine Translation: Statistical Modeling and Deep Learning Methods
%
% Copyright 2020
% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
%----------------------------------------------------------------------------------------
%----------------------------------------------------------------------------------------
% CONFIGURATIONS
%----------------------------------------------------------------------------------------
\part{Machine Translation Fundamentals}
\renewcommand\figurename{图}% set the figure caption label to 图
\renewcommand\tablename{表}% set the table caption label to 表
\chapterimage{fig-NEU-3.jpg} % Chapter heading image
%----------------------------------------------------------------------------------------
% CHAPTER 12
%----------------------------------------------------------------------------------------
\chapter{Training Neural Machine Translation Models}
%----------------------------------------------------------------------------------------
% NEW SECTION
%----------------------------------------------------------------------------------------
\section{}
% !Mode:: "TeX:UTF-8"
% !TEX encoding = UTF-8 Unicode
%----------------------------------------------------------------------------------------
% 机器翻译:统计建模与深度学习方法
% Machine Translation: Statistical Modeling and Deep Learning Methods
%
% Copyright 2020
% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
%----------------------------------------------------------------------------------------
%----------------------------------------------------------------------------------------
% CONFIGURATIONS
%----------------------------------------------------------------------------------------
\part{Machine Translation Fundamentals}
\renewcommand\figurename{图}% set the figure caption label to 图
\renewcommand\tablename{表}% set the table caption label to 表
\chapterimage{fig-NEU-4.jpg} % Chapter heading image
%----------------------------------------------------------------------------------------
% CHAPTER 13
%----------------------------------------------------------------------------------------
\chapter{Inference in Neural Machine Translation Models}
%----------------------------------------------------------------------------------------
% NEW SECTION
%----------------------------------------------------------------------------------------
\section{}
% !Mode:: "TeX:UTF-8"
% !TEX encoding = UTF-8 Unicode
%----------------------------------------------------------------------------------------
% 机器翻译:统计建模与深度学习方法
% Machine Translation: Statistical Modeling and Deep Learning Methods
%
% Copyright 2020
% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
%----------------------------------------------------------------------------------------
%----------------------------------------------------------------------------------------
% CONFIGURATIONS
%----------------------------------------------------------------------------------------
\part{Machine Translation Fundamentals}
\renewcommand\figurename{图}% set the figure caption label to 图
\renewcommand\tablename{表}% set the table caption label to 表
\chapterimage{fig-NEU-5.jpg} % Chapter heading image
%----------------------------------------------------------------------------------------
% CHAPTER 14
%----------------------------------------------------------------------------------------
\chapter{Low-Resource Neural Machine Translation}
%----------------------------------------------------------------------------------------
% NEW SECTION
%----------------------------------------------------------------------------------------
\section{}
% !Mode:: "TeX:UTF-8"
% !TEX encoding = UTF-8 Unicode
%----------------------------------------------------------------------------------------
% 机器翻译:统计建模与深度学习方法
% Machine Translation: Statistical Modeling and Deep Learning Methods
%
% Copyright 2020
% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
%----------------------------------------------------------------------------------------
%----------------------------------------------------------------------------------------
% CONFIGURATIONS
%----------------------------------------------------------------------------------------
\part{Machine Translation Fundamentals}
\renewcommand\figurename{图}% set the figure caption label to 图
\renewcommand\tablename{表}% set the table caption label to 表
\chapterimage{fig-NEU-6.jpg} % Chapter heading image
%----------------------------------------------------------------------------------------
% CHAPTER 15
%----------------------------------------------------------------------------------------
\chapter{Network Architecture Design for Neural Machine Translation}
%----------------------------------------------------------------------------------------
% NEW SECTION
%----------------------------------------------------------------------------------------
\section{}
% !Mode:: "TeX:UTF-8"
% !TEX encoding = UTF-8 Unicode
%----------------------------------------------------------------------------------------
% 机器翻译:统计建模与深度学习方法
% Machine Translation: Statistical Modeling and Deep Learning Methods
%
% Copyright 2020
% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
%----------------------------------------------------------------------------------------
%----------------------------------------------------------------------------------------
% CONFIGURATIONS
%----------------------------------------------------------------------------------------
\part{Machine Translation Fundamentals}
\renewcommand\figurename{图}% set the figure caption label to 图
\renewcommand\tablename{表}% set the table caption label to 表
\chapterimage{fig-NEU-7.jpg} % Chapter heading image
%----------------------------------------------------------------------------------------
% CHAPTER 16
%----------------------------------------------------------------------------------------
\chapter{Methods Based on Multiple Information Sources}
%----------------------------------------------------------------------------------------
% NEW SECTION
%----------------------------------------------------------------------------------------
\section{}
% !Mode:: "TeX:UTF-8"
% !TEX encoding = UTF-8 Unicode
%----------------------------------------------------------------------------------------
% 机器翻译:统计建模与深度学习方法
% Machine Translation: Statistical Modeling and Deep Learning Methods
%
% Copyright 2020
% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
%----------------------------------------------------------------------------------------
%----------------------------------------------------------------------------------------
% CONFIGURATIONS
%----------------------------------------------------------------------------------------
\part{Machine Translation Fundamentals}
\renewcommand\figurename{图}% set the figure caption label to 图
\renewcommand\tablename{表}% set the table caption label to 表
\chapterimage{fig-NEU-8.jpg} % Chapter heading image
%----------------------------------------------------------------------------------------
% CHAPTER 17
%----------------------------------------------------------------------------------------
\chapter{Neural Machine Translation in Practice}
%----------------------------------------------------------------------------------------
% NEW SECTION
%----------------------------------------------------------------------------------------
\section{}
\definecolor{ublue}{rgb}{0.152,0.250,0.545}
\begin{tikzpicture}% drawing attributes such as xshift are generic; the leading key (e.g., xlabel) determines what is being set
\begin{axis}[
width=10cm, height=4.5cm,
symbolic x coords={unseen,do,want,what,am,people,look},% custom x coordinates
xtick=data,% one tick per data point
ytick={0,0.05,0.1,0.15,0.2,0.25},
xlabel={Low-probability words},
ylabel={Word probability},
legend pos=outer north east,% legend position
xlabel style={align=right,xshift=5.3cm,yshift=0.8cm,font=\footnotesize},
ylabel style={rotate=-90,yshift=2cm,xshift=1cm,font=\footnotesize},
y tick style={opacity=0},% hide y-axis tick marks
x tick style={opacity=0},% hide x-axis tick marks
x tick label style={anchor=base,font=\footnotesize,yshift=-0.5cm},
y tick label style={font=\footnotesize,/pgf/number format/.cd,fixed,precision=2},% y-axis precision, no scientific notation
y axis line style={opacity=0},% hide the y axis line
tick align=inside,% draw ticks inside the axes
ymajorgrids,% show horizontal grid lines
axis x line*=bottom,% show x-axis labels along the bottom
major grid style={dotted,draw=ublue},% grid line color
axis on top,% grid lines on the top layer
legend style={anchor=north west,font=\footnotesize},% legend format
ymin=0,
ymax=0.25]
\addplot [ybar,bar shift=-2mm,bar width=4mm,fill=blue!40,draw=blue!40,area legend] coordinates{(unseen,0) (do,0.05) (want,0.05) (what,0.05) (am,0.1) (people,0.15) (look,0.2)};% area legend: show colored rectangles in the legend
\addplot [ybar,bar shift=2mm,bar width=4mm,fill=red!40,draw=red!40,area legend] coordinates{(unseen,0.03) (do,0.062) (want,0.062) (what,0.062) (am,0.09) (people,0.122) (look,0.138)};
\legend{Unsmoothed,Smoothed}% legend (upper right)
\end{axis}
\end{tikzpicture}
%---------------------------------------------------------------------
%%% outline
%-------------------------------------------------------------------------
\begin{tikzpicture}
\begin{scope}[scale=0.5]
\draw [-,very thick] (-2,-1) cos(0,1.5) sin (2,4) cos(4,1.5) sin(6,-1) ;
\draw [-latex,thick] (-2.0,-1.2) -- (10,-1.2);% x-axis
\draw [-latex,thick] (-0.5,-2.0) -- (-0.5,6);% y-axis
\draw [-,dashed] (2,4) -- (2,-1.2); % vertical dashed line
\draw [-,dashed] (-0.5,4) -- (2,4); % horizontal dashed line
\node [anchor=north] at (-0.8,-1.2) {0};
\node [anchor=north] at (2,-1.2) {$\mu$};
\node [anchor=north] at (-1.4,4.75) {$\frac{1}{\sqrt{2\pi}\sigma}$};
\node [anchor=north] at (-1.2,6.2) {\scriptsize{$f(x)$}};
\node [anchor=north] at (9.6,-1.2) {\footnotesize{$x$}};
\end{scope}
\begin{scope}[scale=0.5,xshift=35.0em]
\draw [-,very thick] (-2,-1) cos(3,1.5) sin(8,4) ;
\draw [-latex,thick] (-2.0,-1.2) -- (10,-1.2);% x-axis
\draw [-latex,thick] (-0.5,-2.0) -- (-0.5,6.0);% y-axis
\draw [-,dashed] (3,1.5) -- (3,-1.2); % vertical dashed line
\draw [-,dashed] (-0.5,1.5) -- (3,1.5); % horizontal dashed line
\draw [-,dashed] (-0.5,4.2) -- (8.5,4.2); % horizontal dashed line
\node [anchor=north] at (-0.8,-1.2) {0};
\node [anchor=north] at (3,-1.2) {$\mu$};
\node [anchor=north] at (-1.1,2.0) {0.5};
\node [anchor=north] at (-0.8,4.7) {1};
\node [anchor=north] at (-1.2,6.2) {\scriptsize{$F(x)$}};
\node [anchor=north] at (9.6,-1.2) {\footnotesize{$x$}};
\end{scope}
\end{tikzpicture}
%---------------------------------------------------------------------
%%% outline
%-------------------------------------------------------------------------
\begin{tikzpicture}
\node [anchor=north west](num1) at (0,0) {\large{A}};
\node [anchor=north west](num2) at ([xshift=5.8em,yshift=1.44em]num1.south west) {\large{B}};
\node [anchor=north west](num3) at ([xshift=5.8em,yshift=1.44em]num2.south west) {\large{C}};
\node [anchor=north west](num4) at ([xshift=5.8em,yshift=1.44em]num3.south west) {\large{D}};
\node [anchor=north west](num5) at ([xshift=0.04em,yshift=-2.5em]num3.south west) {\large{E}};
\draw [<-,very thick,black] (num1.east)--(num2.west);
\draw [->,very thick,black] (num2.east)--(num3.west);
\draw [<-,very thick,black] (num3.east)--(num4.west);
\draw [->,very thick,black] (num3.south)--(num5.north);
\end{tikzpicture}
%%% outline
%-------------------------------------------------------------------------
\begin{tikzpicture}
\begin{scope}[scale=1.0]
{
\node [anchor=north west,minimum width=7em, minimum height=2.5em,fill=blue!30](num1) at (0,0) {$A$\quad \quad \quad \quad };
\node [anchor=west,minimum width=7em, minimum height=5em,fill=ugreen!30](num2) at ([xshift=-3em]num1.east) {\quad \quad $B$};
\node [anchor=west,minimum width=3em, minimum height=2.5em,fill=yellow!30](part1) at (num2.west) {$C$};
\draw [-,thick] (num1.north west) -- (num1.north east) -- (num1.south east) -- (num1.south west) -- (num1.north west);
\draw [-,very thick,dotted] (num2.north west) -- (num2.north east) -- (num2.south east) -- (num2.south west) -- (num2.north west);
}
\end{scope}
\end{tikzpicture}
%---------------------------------------------------------------------
\definecolor{ublue}{rgb}{0.152,0.250,0.545}
\begin{tikzpicture}
\begin{axis}[
width=12cm,
height=4cm,
xlabel={{$\textrm{P}(x)$}},
ylabel={{$\textrm{I}(x)$}},
ylabel style={yshift=-1.5em,font=\footnotesize},
xlabel style={yshift=0.3em,font=\footnotesize},
xtick={0,0.2,...,1.0},
ytick={0,8},
y tick style={opacity=0},
x tick style={opacity=0},
domain=0.01:1,
enlarge x limits=true,
enlarge y limits={upper},
legend style={draw=none},
xmin=0,
xmax=1,
ymin=0,
ymax=8,
xticklabel style={font=\small}, % tick label font size
yticklabel style={font=\small}
]
\addplot[draw=ublue,samples=100,thick] {-log2(x)};
\legend{\footnotesize{$\textrm{I}(x) = -\log \textrm{P}(x)$}}
\end{axis}
\end{tikzpicture}
%---------------------------------------------------------------------
\definecolor{ublue}{rgb}{0.152,0.250,0.545}
\definecolor{ugreen}{rgb}{0,0.5,0}
%%% outline
%-------------------------------------------------------------------------
\begin{tikzpicture}[scale=0.6]
\begin{scope}
{\footnotesize
\foreach \i in {1,...,5}{
\node [draw,thick,minimum size=10pt] at (\i,0) {1};
}
}
\node [anchor=west] at (33em,0) {$\textrm{P}(\text{1}) = 5/30$};
\end{scope}
\begin{scope}[yshift=-2.5em]
{\footnotesize
\foreach \i in {1,...,4}{
\node [draw,thick,minimum size=10pt] at (\i,0) {{\color{red} 2}};
}
}
\node [anchor=west] at (33em,0) {$\textrm{P}(\text{2}) = 4/30$};
\end{scope}
\begin{scope}[yshift=-5.0em]
{\footnotesize
\foreach \i in {1,...,6}{
\node [draw,thick,minimum size=10pt] at (\i,0) {{\color{ublue} 3}};
}
}
\node [anchor=west] at (33em,0) {$\textrm{P}(\text{3}) = 6/30$};
\end{scope}
\begin{scope}[yshift=-7.5em]
{\footnotesize
\foreach \i in {1,...,12}{
\node [draw,thick,minimum size=10pt] at (\i,0) {{\color{ugreen} 4}};
}
}
\node [anchor=west] at (33em,0) {$\textrm{P}(\text{4}) = 12/30$};
\end{scope}
\begin{scope}[yshift=-10.0em]
{\footnotesize
\foreach \i in {1,...,2}{
\node [draw,thick,minimum size=10pt] at (\i,0) {{\color{purple} 5}};
}
}
\node [anchor=west] at (33em,0) {$\textrm{P}(\text{5}) = 2/30$};
\end{scope}
\begin{scope}[yshift=-12.5em]
{\footnotesize
\foreach \i in {1,...,1}{
\node [draw,thick,minimum size=10pt] at (\i,0) {{\color{orange} 6}};
}
}
\node [anchor=west] at (33em,0) {$\textrm{P}(\text{6}) = 1/30$};
\end{scope}
\end{tikzpicture}
\definecolor{ublue}{rgb}{0.152,0.250,0.545}
\definecolor{ugreen}{rgb}{0,0.5,0}
%%% outline
%-------------------------------------------------------------------------
\begin{tikzpicture}
{
\begin{scope}
\node[anchor=west,draw,very thick,minimum size=20pt] (s1) at (0,0) {\Large{\textbf{2}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s2) at ([xshift=0.2cm]s1.east) {\Large{\textbf{3}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s3) at ([xshift=0.2cm]s2.east) {\Large{{\color{red}{1}}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s4) at ([xshift=0.2cm]s3.east) {\Large{\textbf{4}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s5) at ([xshift=0.2cm]s4.east) {\Large{\textbf{4}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s6) at ([xshift=0.2cm]s5.east) {\Large{{\color{red}{1}}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s7) at ([xshift=0.2cm]s6.east) {\Large{\textbf{5}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s8) at ([xshift=0.2cm]s7.east) {\Large{{\color{red}{1}}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s9) at ([xshift=0.2cm]s8.east) {\Large{\textbf{4}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s10) at ([xshift=0.2cm]s9.east) {\Large{\textbf{4}}};
\end{scope}
\begin{scope}[yshift=-1cm]
\node[anchor=west,draw,very thick,minimum size=20pt] (s1) at (0,0) {\Large{\textbf{5}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s2) at ([xshift=0.2cm]s1.east) {\Large{\textbf{6}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s3) at ([xshift=0.2cm]s2.east) {\Large{\textbf{4}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s4) at ([xshift=0.2cm]s3.east) {\Large{\textbf{4}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s5) at ([xshift=0.2cm]s4.east) {\Large{\textbf{3}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s6) at ([xshift=0.2cm]s5.east) {\Large{\textbf{2}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s7) at ([xshift=0.2cm]s6.east) {\Large{{\color{red}{1}}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s8) at ([xshift=0.2cm]s7.east) {\Large{\textbf{4}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s9) at ([xshift=0.2cm]s8.east) {\Large{\textbf{5}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s10) at ([xshift=0.2cm]s9.east) {\Large{{\color{red}{1}}}};
\end{scope}
\begin{scope}[yshift=-2cm]
\node[anchor=west,draw,very thick,minimum size=20pt] (s1) at (0,0) {\Large{\textbf{4}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s2) at ([xshift=0.2cm]s1.east) {\Large{\textbf{2}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s3) at ([xshift=0.2cm]s2.east) {\Large{\textbf{2}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s4) at ([xshift=0.2cm]s3.east) {\Large{\textbf{3}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s5) at ([xshift=0.2cm]s4.east) {\Large{\textbf{4}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s6) at ([xshift=0.2cm]s5.east) {\Large{{\color{red}{1}}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s7) at ([xshift=0.2cm]s6.east) {\Large{\textbf{5}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s8) at ([xshift=0.2cm]s7.east) {\Large{{\color{red}{1}}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s9) at ([xshift=0.2cm]s8.east) {\Large{\textbf{3}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s10) at ([xshift=0.2cm]s9.east) {\Large{\textbf{4}}};
\end{scope}
}
\end{tikzpicture}
\definecolor{ublue}{rgb}{0.152,0.250,0.545}
\definecolor{ugreen}{rgb}{0,0.5,0}
%%% outline
%-------------------------------------------------------------------------
\vspace{0.3em}
\begin{tikzpicture}
\begin{scope}
\node[anchor=west,draw,very thick,minimum size=20pt] (s1) at (0,0) {\Large{\textbf{{\color{ublue} 3}}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s2) at ([xshift=0.2cm]s1.east) {\Large{\textbf{{\color{ugreen} 4}}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s3) at ([xshift=0.2cm]s2.east) {\Large{\textbf{{\color{red} 2}}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s4) at ([xshift=0.2cm]s3.east) {\Large{\textbf{{\color{ublue} 3}}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s5) at ([xshift=0.2cm]s4.east) {\Large{\textbf{{\color{ugreen} 4}}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s6) at ([xshift=0.2cm]s5.east) {\Large{\textbf{{\color{purple} 5}}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s7) at ([xshift=0.2cm]s6.east) {\Large{\textbf{1}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s8) at ([xshift=0.2cm]s7.east) {\Large{\textbf{{\color{ugreen} 4}}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s9) at ([xshift=0.2cm]s8.east) {\Large{\textbf{{\color{ugreen} 4}}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s10) at ([xshift=0.2cm]s9.east) {\Large{\textbf{{\color{ublue} 3}}}};
\end{scope}
\begin{scope}[yshift=-1cm]
\node[anchor=west,draw,very thick,minimum size=20pt] (s1) at (0,0) {\Large{\textbf{{\color{red} 2}}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s2) at ([xshift=0.2cm]s1.east) {\Large{\textbf{1}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s3) at ([xshift=0.2cm]s2.east) {\Large{\textbf{{\color{ugreen} 4}}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s4) at ([xshift=0.2cm]s3.east) {\Large{\textbf{{\color{purple} 5}}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s5) at ([xshift=0.2cm]s4.east) {\Large{\textbf{{\color{ugreen} 4}}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s6) at ([xshift=0.2cm]s5.east) {\Large{\textbf{{\color{ugreen} 4}}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s7) at ([xshift=0.2cm]s6.east) {\Large{\textbf{{\color{ugreen} 4}}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s8) at ([xshift=0.2cm]s7.east) {\Large{\textbf{{\color{ublue} 3}}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s9) at ([xshift=0.2cm]s8.east) {\Large{\textbf{1}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s10) at ([xshift=0.2cm]s9.east) {\Large{\textbf{{\color{ugreen} 4}}}};
\end{scope}
\begin{scope}[yshift=-2cm]
\node[anchor=west,draw,very thick,minimum size=20pt] (s1) at (0,0) {\Large{\textbf{{\color{ugreen} 4}}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s2) at ([xshift=0.2cm]s1.east) {\Large{\textbf{{\color{ublue} 3}}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s3) at ([xshift=0.2cm]s2.east) {\Large{\textbf{{\color{red} 2}}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s4) at ([xshift=0.2cm]s3.east) {\Large{\textbf{{\color{orange} 6}}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s5) at ([xshift=0.2cm]s4.east) {\Large{\textbf{1}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s6) at ([xshift=0.2cm]s5.east) {\Large{\textbf{{\color{red} 2}}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s7) at ([xshift=0.2cm]s6.east) {\Large{\textbf{{\color{ublue} 3}}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s8) at ([xshift=0.2cm]s7.east) {\Large{\textbf{{\color{ugreen} 4}}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s9) at ([xshift=0.2cm]s8.east) {\Large{\textbf{{\color{ugreen} 4}}}};
\node[anchor=west,draw,very thick,minimum size=20pt] (s10) at ([xshift=0.2cm]s9.east) {\Large{\textbf{1}}};
\end{scope}
\end{tikzpicture}
%---------------------------------------------------------------------
\definecolor{ublue}{rgb}{0.152,0.250,0.545}
\begin{tikzpicture}
\begin{axis}[
width=11cm,
height=5.5cm,
xlabel={Words in a corpus},
ylabel={Total number of occurrences},
xlabel style={xshift=6.2cm,yshift=0.8cm,font=\footnotesize},
ylabel style={rotate=-90,yshift=2.8cm,xshift=1.2cm,font=\footnotesize},
xticklabel style={opacity=0},
ytick={0,10000000,20000000,30000000,40000000,50000000,60000000},
tick align=inside,
y tick style={opacity=0},
x tick style={opacity=0},
y axis line style={opacity=0},
ymajorgrids,
major grid style={dotted,draw=ublue},
ybar,
bar width=0.5,
axis x line*=bottom,
xmin=0,
xmax=105,
ymin=0,
ymax=60000000
]
\addplot[fill=blue!40,draw=blue!40] coordinates{(1,56000000)
(2,34000000)
(3,30000000)
(4,27000000)
(5,18000000)
(6,12000000)
(7,11000000)
(8,10000000)
(9,9000000)
(10,8600000)
(11,8200000)
(12,7800000)
(13,7400000)
(14,7000000)
(15,6600000)
(16,6200000)
(17,5800000)
(18,5400000)
(19,5000000)
(20,4800000)
(21,4600000)
(22,4400000)
(23,4200000)
(24,4000000)
(25,3800000)
(26,3600000)
(27,3400000)
(28,3200000)
(29,3000000)
(30,2950000)
(31,2900000)
(32,2850000)
(33,2800000)
(34,2750000)
(35,2700000)
(36,2650000)
(37,2600000)
(38,2550000)
(39,2500000)
(40,2450000)
(41,2400000)
(42,2350000)
(43,2300000)
(44,2250000)
(45,2200000)
(46,2150000)
(47,2100000)
(48,2050000)
(49,2000000)
(50,1970000)
(51,1940000)
(52,1910000)
(53,1880000)
(54,1850000)
(55,1820000)
(56,1790000)
(57,1760000)
(58,1730000)
(59,1700000)
(60,1670000)
(61,1640000)
(62,1610000)
(63,1580000)
(64,1550000)
(65,1520000)
(66,1490000)
(67,1460000)
(68,1430000)
(69,1400000)
(70,1370000)
(71,1340000)
(72,1310000)
(73,1280000)
(74,1250000)
(75,1220000)
(76,1190000)
(77,1160000)
(78,1130000)
(79,1100000)
(80,1070000)
(81,1040000)
(82,1010000)
(83,980000)
(84,950000)
(85,920000)
(86,890000)
(87,860000)
(88,830000)
(89,800000)
(90,770000)
(91,740000)
(92,710000)
(93,680000)
(94,650000)
(95,620000)
(96,590000)
(97,560000)
(98,530000)
(99,500000)
(100,470000)
(101,440000)
(102,410000)
(103,380000)
(104,350000)};
\end{axis}
\end{tikzpicture}
%---------------------------------------------------------------------
% !Mode:: "TeX:UTF-8"
% !TEX encoding = UTF-8 Unicode
%----------------------------------------------------------------------------------------
% 机器翻译:统计建模与深度学习方法
% Machine Translation: Statistical Modeling and Deep Learning Methods
%
% Copyright 2020
% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
%----------------------------------------------------------------------------------------
%----------------------------------------------------------------------------------------
% CONFIGURATIONS
%----------------------------------------------------------------------------------------
\renewcommand\figurename{图}% set the figure caption label to 图
\renewcommand\tablename{表}% set the table caption label to 表
%\renewcommand\arraystretch{1.5}% stretch table row height to 1.5 times the default
\chapterimage{../Figures/fig-NEU-3.jpg} % Chapter heading image
%----------------------------------------------------------------------------------------
% CHAPTER 2
%----------------------------------------------------------------------------------------
\chapter{Fundamentals of Statistical Modeling and $n$-gram Language Models}
\parinterval Machine translation is not an isolated system: it depends on many modules and draws on knowledge from many disciplines. Most current machine translation systems use statistical models to describe the translation problem, and they also rely on natural language processing tools to analyze text in different languages. Before the formal introduction to machine translation, this chapter therefore surveys the relevant background, including the basics of probability and statistical modeling, linguistic analysis, and language modeling.
\parinterval Probability theory and statistical modeling are the foundation of machine translation methods. This chapter briefly reviews the basic mathematical concepts involved in machine translation, to ensure that the mathematical tools used later are complete. The focus is on how natural language processing problems can be described through statistical modeling, a technique used in both statistical and neural machine translation.
\parinterval The part on linguistic analysis introduces the basic concepts of lexical and syntactic analysis, using Chinese as an example. Both are classic problems in natural language processing and are used frequently in machine translation. This chapter presents the definition of these two tasks and the general ideas for solving them.
\parinterval Language modeling is one of the most widely used techniques in machine translation; it is mainly used for sentence generation and for assessing fluency. This chapter introduces the relevant concepts through traditional statistical language models. It does not explore language modeling in depth; neural language models and other recent advances are discussed separately in later chapters.
%----------------------------------------------------------------------------------------
% NEW SECTION
%----------------------------------------------------------------------------------------
\sectionnewpage
\section{Mathematical Background}
\parinterval To prepare for the content that follows, this section first reviews the concepts of probability, statistics, and linear algebra used in this book. These concepts are widely applied in science and engineering, and both statistical and neural machine translation make heavy use of these mathematical tools. A brief introduction is given here to ease the understanding of the rest of the book.
%----------------------------------------------------------------------------------------
% NEW SUB-SECTION
%----------------------------------------------------------------------------------------
\subsection{Basics of Probability Theory}
%----------------------------------------------------------------------------------------
% NEW SUBSUB-SECTION
%----------------------------------------------------------------------------------------
\subsubsection{Random Variables and Probability}
\parinterval In the natural world, whether an {\small\bfnew{event}}\index{Event} occurs is often uncertain. For example: it will rain tomorrow, a tossed coin lands heads up, a rolled die shows a five, and so on; such events may or may not occur. An event that, over a large number of repeated trials, exhibits some statistical regularity is called a {\small\sffamily\bfseries{random event}}\index{Random Event}.
\parinterval A {\small\sffamily\bfseries{random variable}}\index{Random Variable} describes the possible states of a random event; it is a numerical representation of the event. Let $\Omega = \{ \omega \}$ be the sample space of a random experiment; a single-valued real function $X=X(\omega)$ defined on $\Omega$ is then a random variable, written $X$. A random variable is a variable that takes values at random; it is usually denoted by an upper-case Latin or Greek letter, and its values by the corresponding lower-case letter. For example, $A$ may denote a random variable and $a$ one of its values. According to the properties of the values a random variable can take, random variables are divided into discrete and continuous ones.
\parinterval A discrete variable is a numerical variable whose possible values can be enumerated one by one and are finite and countable. For example, if the random variable $X$ denotes the outcome of rolling a die, $X$ can only take the six integers 1 through 6, so $X$ is a discrete variable.
\parinterval A continuous variable takes values continuously over an interval; its values cannot be enumerated and are infinite in number. For example, if a library is open from 8:30 to 22:00 and $X$ denotes the time at which someone enters the library, then $X$ ranges over the interval [8:30, 22:00] and is a continuous variable.
\parinterval {\small\bfnew{Probability}}\index{Probability} is a numerical measure of how likely a random event is to exhibit each of its possible states; in essence, it is a measure function \cite{mao-prob-book-2011,kolmogorov2018foundations}. The magnitude of a probability characterizes how likely a random event is to occur in a single trial. We write $\textrm{P}(\cdot)$ for the probability of a random event. For instance, $\textrm{P}(\textrm{the sun rises in the east})$ denotes the likelihood that ``the sun rises in the east''; likewise, $\textrm{P}(A=B)$ denotes the likelihood of the event ``$A=B$''.
\parinterval In practice, one often needs the probability of a random variable taking a given value. The true probability may be impossible to know exactly, so it has to be {\small\sffamily\bfseries{estimated}}\index{Estimation}; the result is an {\small\sffamily\bfseries{estimate}}\index{Estimate} of the probability. A very simple method from probability theory is to use the relative frequency as the estimate. If $\{x_1,x_2,\dots,x_n \}$ is the sample space of an experiment that is repeated $N$ times under identical conditions, and the sample $x_i$ ($1\leq{i}\leq{n}$) is observed $n(x_i)$ times, then the relative frequency of $x_i$ in these $N$ trials is $\frac{n(x_i )}{N}$. As $N$ grows, the relative frequency gets closer and closer to the true probability $\textrm{P}(x_i)$, i.e., $\lim_{N \to \infty}\frac{n(x_i )}{N}=\textrm{P}(x_i)$. In fact, many probability models are equivalent to relative-frequency estimation; for example, the maximum likelihood estimate of a multinomially distributed variable can be computed as a relative frequency.
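\parinterval As a minimal sketch of relative-frequency estimation (illustrative only; Python with the standard library, and the die-rolling simulation is our own choice of example):
\begin{verbatim}
import random
from collections import Counter

def estimate_probabilities(num_trials):
    # Roll a fair six-sided die num_trials times and count the outcomes.
    counts = Counter(random.randint(1, 6) for _ in range(num_trials))
    # Relative frequency n(x_i)/N as an estimate of P(x_i).
    return {face: counts[face] / num_trials for face in range(1, 7)}

for n in (30, 300, 30000):
    # The estimates approach the true probability 1/6 ~ 0.167 as N grows.
    print(n, estimate_probabilities(n))
\end{verbatim}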
%--------------------------------------------------------------------
\begin{table}[htp]
\centering
\caption{Probability distribution of the discrete variable $A$}
\begin{tabular}{c|c c c c c c}
\rule{0pt}{15pt} $A$ & $a_1=1$ & $a_2=2$ & $a_3=3$ & $a_4=4$ & $a_5=5$ & $a_6=6$\\
\hline
\rule{0pt}{15pt} $\textrm{P}_i$ & $\textrm{P}_1=\frac{4}{25}$ & $\textrm{P}_2=\frac{3}{25}$ & $\textrm{P}_3=\frac{4}{25}$ & $\textrm{P}_4=\frac{6}{25}$ & $\textrm{P}_5=\frac{3}{25}$ & $\textrm{P}_6=\frac{5}{25}$ \\
\end{tabular}
\label{tab:2-1}
\end{table}
%--------------------------------------------------------------------
\parinterval A probability function gives, in functional form, the probability of each value of a discrete variable; in other words, it expresses the variable's probability distribution mathematically. If $A$ is a discrete variable and $a$ one of its values, then $\textrm{P}(A)$ is called the probability function of $A$, and $\textrm{P}(A=a)$, abbreviated $\textrm{P}(a)$, is the probability of $A=a$. For example, roll a die 50 times under identical conditions, let the discrete variable $A$ denote the outcome, $a_i$ a particular outcome, and $\textrm{P}_i$ the probability of $A=a_i$. Table \ref{tab:2-1} shows the probability distribution of $A$, listing all of its values and their probabilities.
\parinterval In addition, a probability function $\textrm{P}(\cdot)$ is non-negative and normalized. Non-negativity means that every value of $\textrm{P}(\cdot)$ is at least 0 and a negative value can never occur, i.e., $\forall{x},\textrm{P}{(x)}\geq{0}$. Normalization means that the probabilities of all possible events sum to 1, i.e., $\sum_{x}\textrm{P}{(x)}={1}$.
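\parinterval These two properties can be checked directly on the distribution in Table \ref{tab:2-1} (a small sketch in Python; the dictionary encoding of the table is ours):
\begin{verbatim}
from fractions import Fraction

# The probability function of A from Table 2.1: value -> probability.
P = {1: Fraction(4, 25), 2: Fraction(3, 25), 3: Fraction(4, 25),
     4: Fraction(6, 25), 5: Fraction(3, 25), 6: Fraction(5, 25)}

assert all(p >= 0 for p in P.values())   # non-negativity
assert sum(P.values()) == 1              # normalization
\end{verbatim}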
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter2/Figures/figure-probability-density-function&distribution-function}
\caption{A probability density function (left) and the corresponding distribution function (right)}
\label{fig:2-3}
\end{figure}
%-------------------------------------------
\parinterval For a discrete variable $A$, $\textrm{P}(A=a)$ is a definite value that expresses how likely the event $A=a$ is. For a continuous variable, the probability at a single point is meaningless; only the probability of falling within an interval can be computed. Therefore, the {\small\sffamily\bfseries{distribution function}}\index{Distribution Function} $F(x)$ and the {\small\sffamily\bfseries{probability density function}}\index{Probability Density Function} $f(x)$ are used to give a unified description of how the values of a random variable are distributed (see Figure \ref{fig:2-3}). The distribution function $F(x)$ is the probability of taking a value less than or equal to $x$; it is the cumulative (or integral) form of the probability. If $A$ is a random variable and $a$ any real number, the function $F(a)=\textrm{P}\{A\leq a\}$, $-\infty<a<\infty$, is defined as the distribution function of $A$. Through the distribution function, the probabilities of any random variable can be expressed clearly.
\parinterval The probability density function reflects how fast the probability changes over an interval: its value is the rate of change of the probability, and the probability for a continuous variable is obtained by integrating the density. Let $f(x) \geq 0$ be the probability density function of a continuous variable $X$; the distribution function of $X$ can then be defined as:
\begin{eqnarray}
F(x)=\int_{-\infty}^x f(t)dt
\label{eq:2-1}
\end{eqnarray}
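\parinterval As a concrete sketch (Python, standard library only; the choice of the standard normal distribution here is ours, purely for illustration), the density $f$ can be accumulated numerically to approximate $F$ and compared against the closed form based on the error function:
\begin{verbatim}
import math

def f(x):
    # Density of the standard normal distribution.
    return math.exp(-x * x / 2) / math.sqrt(2 * math.pi)

def F_numeric(x, lo=-10.0, step=1e-3):
    # F(x) = integral of f(t) dt from -infinity to x, via a Riemann sum.
    n = int((x - lo) / step)
    return sum(f(lo + i * step) for i in range(n)) * step

def F_closed(x):
    return 0.5 * (1 + math.erf(x / math.sqrt(2)))

print(F_numeric(0.0), F_closed(0.0))  # both close to 0.5
\end{verbatim}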
%----------------------------------------------------------------------------------------
% NEW SUBSUB-SECTION
%----------------------------------------------------------------------------------------
\subsubsection{Joint, Conditional, and Marginal Probability}
\parinterval {\small\sffamily\bfseries{Joint probability}}\index{Joint Probability} is the probability that several events occur together, with each random variable satisfying its own condition; it is written $\textrm{P}(AB)$ or $\textrm{P}(A\cap{B})$. {\small\sffamily\bfseries{Conditional probability}}\index{Conditional Probability} is, for two arbitrary events $A$ and $B$, the probability that $B$ occurs given that $A$ has occurred, written $\textrm{P}(B \mid A)$.
\parinterval Bayes' rule (see Section \ref{sec:2.2.3}) is an important basis for computing conditional probabilities; a conditional probability can be written as:
\begin{eqnarray}
\textrm{P}{(B|A)} & = & \frac{\textrm{P}(A\cap{B})}{\textrm{P}(A)} \nonumber \\
& = & \frac{\textrm{P}(B)\textrm{P}(A|B)}{\textrm{P}(A)}
\label{eq:2-2}
\end{eqnarray}
\parinterval {\small\sffamily\bfseries{Marginal probability}}\index{Marginal Probability} is the counterpart of joint probability: it refers to quantities such as $\textrm{P}(X=a)$ or $\textrm{P}(Y=b)$, i.e., probabilities involving only a single random variable. For discrete random variables $X$ and $Y$, if $\textrm{P}(X,Y)$ is known, the marginal probability $\textrm{P}(X)$ can be obtained by summation: for all $x \in X$,
\begin{eqnarray}
\textrm{P}(X=x)=\sum_{y} \textrm{P}(X=x,Y=y)
\label{eq:2-3}
\end{eqnarray}
\parinterval For continuous variables, the marginal probability $\textrm{P}(X)$ is obtained by integration, as follows:
\begin{eqnarray}
\textrm{P}(X=x)=\int \textrm{P}(x,y)dy
\label{eq:2-4}
\end{eqnarray}
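\parinterval A small sketch of marginalization over a discrete joint distribution (Python; the toy table below is invented for illustration):
\begin{verbatim}
# Toy joint distribution P(X, Y) with X in {0, 1} and Y in {0, 1, 2}.
P_joint = {(0, 0): 0.10, (0, 1): 0.20, (0, 2): 0.10,
           (1, 0): 0.15, (1, 1): 0.25, (1, 2): 0.20}

def marginal_X(x):
    # P(X = x) = sum over y of P(X = x, Y = y)
    return sum(p for (xi, y), p in P_joint.items() if xi == x)

print(marginal_X(0), marginal_X(1))  # 0.4 0.6
\end{verbatim}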
\parinterval To better distinguish conditional, marginal, and joint probabilities, here is an example in terms of areas. As shown in Figure \ref{fig:2-4}, rectangle $A$ represents all possible states of event $X$, rectangle $B$ represents all possible states of event $Y$, and region $C$ is the intersection of $A$ and $B$. Then:
\begin{itemize}
\vspace{0.5em}
\item Marginal probability: the area of rectangle $A$ or of rectangle $B$;
\vspace{0.5em}
\item Joint probability: the area of region $C$;
\vspace{0.5em}
\item Conditional probability: a joint probability divided by the corresponding marginal probability, e.g., $\textrm{P}(A \mid B)$ = area of $C$ / area of $B$.
\vspace{0.5em}
\end{itemize}
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter2/Figures/figure-schematic-edge-probability&joint-probability}
\caption{Graphical representation of the probabilities of events $A$, $B$, and $C$}
\label{fig:2-4}
\end{figure}
%-------------------------------------------
%----------------------------------------------------------------------------------------
% NEW SUBSUB-SECTION
%----------------------------------------------------------------------------------------
\subsubsection{The Chain Rule}
\parinterval The conditional probability formula $\textrm{P}(A \mid B)=\textrm{P}(AB)/\textrm{P}(B)$ expresses the probability that event $A$ occurs given that event $B$ has occurred. Extending this to three events $A$, $B$, and $C$, the joint probability $\textrm{P}(A,B,C)$ can be computed by applying $\textrm{P}(A \mid B)=\textrm{P}(AB)/\textrm{P}(B)$ twice:
\begin{eqnarray}
\textrm{P}(A,B,C) & = & \textrm{P}(A \mid B ,C)\textrm{P}(B,C) \nonumber \\
& = & \textrm{P}(A \mid B,C)\textrm{P}(B \mid C)\textrm{P}(C)
\label{eq:2-5}
\end{eqnarray}
\parinterval Generalizing to $n$ events gives the chain rule:
\begin{eqnarray}
\textrm{P}(x_1,x_2,...,x_n)=\textrm{P}(x_1) \prod_{i=2}^n \textrm{P}(x_i \mid x_1,x_2,...,x_{i-1})
\label{eq:2-6}
\end{eqnarray}
\parinterval The following example helps in understanding the chain rule. As shown in Figure \ref{fig:2-5}, $A$, $B$, $C$, $D$, and $E$ are five events, where $A$ depends only on $B$, $C$ depends only on $B$ and $D$, $E$ depends only on $C$, and $B$ and $D$ do not depend on any other event. Then $\textrm{P}(A,B,C,D,E)$ can be expanded as follows:
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter2/Figures/figure-schematic-chain-rule}
\setlength{\belowcaptionskip}{-1cm}
\caption{Relations among the events $A$, $B$, $C$, $D$, and $E$}
\label{fig:2-5}
\end{figure}
%-------------------------------------------
\begin{eqnarray}
& & \textrm{P}(A,B,C,D,E) \nonumber \\
&=&\textrm{P}(E \mid A,B,C,D) \cdot \textrm{P}(A,B,C,D) \nonumber \\
&=&\textrm{P}(E \mid A,B,C,D) \cdot \textrm{P}(A \mid B,C,D) \cdot \textrm{P}(B,C,D) \nonumber \\
&=&\textrm{P}(E \mid A,B,C,D) \cdot \textrm{P}(A \mid B,C,D) \cdot \textrm{P}(C \mid B,D) \cdot \textrm{P}(B,D) \nonumber \\
&=&\textrm{P}(E \mid A,B,C,D) \cdot \textrm{P}(A \mid B,C,D) \cdot \textrm{P}(C \mid B,D) \cdot \textrm{P}(B) \cdot \textrm{P}(D)
\label{eq:2-7}
\end{eqnarray}
\parinterval From Figure \ref{fig:2-5}, $E$ depends only on $C$, so $\textrm{P}(E \mid A,B,C,D)=\textrm{P}(E \mid C)$; $A$ depends only on $B$, so $\textrm{P}(A \mid B,C,D)=\textrm{P}(A \mid B)$; and, since neither $B$ nor $D$ depends on any other event, $\textrm{P}(B,D)=\textrm{P}(B)\textrm{P}(D)$, which was already used in the last step above. The expression therefore simplifies to:
\begin{eqnarray}
\textrm{P}(A,B,C,D,E)=\textrm{P}(E \mid C) \cdot \textrm{P}(A \mid B) \cdot \textrm{P}(C \mid B,D) \cdot \textrm{P}(B) \cdot \textrm{P}(D)
\label{eq:2-8}
\end{eqnarray}
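\parinterval This factorization is easy to verify numerically (a sketch in Python; the conditional probability tables below are invented, with binary outcomes standing in for $A,\ldots,E$). Building the joint probability from the factors and summing over all outcomes must give 1:
\begin{verbatim}
import itertools

# Invented tables: B and D are unconditioned, A depends on B,
# C depends on B and D, E depends on C. Entries are P(var = 1 | parents).
P_B, P_D = 0.6, 0.3
P_A_given_B = {0: 0.2, 1: 0.7}
P_C_given_BD = {(0, 0): 0.1, (0, 1): 0.4, (1, 0): 0.5, (1, 1): 0.9}
P_E_given_C = {0: 0.3, 1: 0.8}

def bern(p, v):
    # P(var = v) for a binary variable with P(var = 1) = p.
    return p if v == 1 else 1 - p

total = 0.0
for a, b, c, d, e in itertools.product((0, 1), repeat=5):
    # P(A,B,C,D,E) = P(E|C) P(A|B) P(C|B,D) P(B) P(D)
    total += (bern(P_E_given_C[c], e) * bern(P_A_given_B[b], a) *
              bern(P_C_given_BD[(b, d)], c) * bern(P_B, b) * bern(P_D, d))
print(total)  # 1.0 (up to floating-point rounding)
\end{verbatim}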
%----------------------------------------------------------------------------------------
% NEW SUBSUB-SECTION
%----------------------------------------------------------------------------------------
\subsubsection{Bayes' Rule}\label{sec:2.2.3}
\parinterval We first introduce the {\small\bfnew{law of total probability}}\index{Law of Total Probability}, an important formula in probability theory that decomposes the probability of a complex event into a sum over the probabilities of simpler events in different cases. This requires the notion of a partition: events $\{B_1,\ldots,B_n\}$ form a partition of a set $S$ if they satisfy $\bigcup_{i=1}^n B_i=S$ and $B_i \cap B_j=\varnothing$ for all $i,j=1,\ldots,n$, $i\neq j$. The total probability of an event $A$ can then be written as:
\begin{eqnarray}
\textrm{P}(A)=\sum_{k=1}^n \textrm{P}(A \mid B_k)\textrm{P}(B_k)
\label{eq:2-9}
\end{eqnarray}
\parinterval For example, Xiao Zhang has three routes from home to work, $a$, $b$, and $c$, which he chooses with probabilities 0.5, 0.3, and 0.2, respectively. Define:
\begin{itemize}
\vspace{0.5em}
\item $S_a$: Xiao Zhang takes route $a$ to work;
\vspace{0.5em}
\item $S_b$: Xiao Zhang takes route $b$ to work;
\vspace{0.5em}
\item $S_c$: Xiao Zhang takes route $c$ to work;
\vspace{0.5em}
\item $S$: Xiao Zhang goes to work.
\vspace{0.5em}
\end{itemize}
\parinterval Clearly, $S_a$, $S_b$, and $S_c$ form a partition of $S$. If the probabilities that the three routes are free of congestion are $\textrm{P}({S_{a}^{'}})=0.2$, $\textrm{P}({S_{b}^{'}})=0.4$, and $\textrm{P}({S_{c}^{'}})=0.7$, then the probability of the event $L$ that Xiao Zhang encounters no congestion on the way to work is:
\begin{eqnarray}
{\textrm{P}(L)} &=& {\textrm{P}( L| S_a )\textrm{P}(S_a )+\textrm{P}( L| S_b )\textrm{P}(S_b )+\textrm{P}( L| S_c )\textrm{P}(S_c )}\nonumber \\
& = &{\textrm{P}({S_{a}^{'}})\textrm{P}(S_a)+\textrm{P}({S_{b}^{'}})\textrm{P}(S_b)+\textrm{P}({S_{c}^{'}})\textrm{P}(S_c) }\nonumber \\
& = &{0.36}
\label{eq:2-10}
\end{eqnarray}
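\parinterval In code, the computation above is a one-line application of the law of total probability (a sketch; the route names follow the example):
\begin{verbatim}
routes = {"a": 0.5, "b": 0.3, "c": 0.2}   # P(S_r): probability of route r
no_jam = {"a": 0.2, "b": 0.4, "c": 0.7}   # P(L | S_r): route r is clear

# Law of total probability: P(L) = sum over r of P(L | S_r) P(S_r)
P_L = sum(no_jam[r] * routes[r] for r in routes)
print(P_L)  # 0.36
\end{verbatim}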
\parinterval {\small\sffamily\bfseries{Bayes' rule}}\index{Bayes' Rule} is a classic formula of probability theory, typically used to obtain $\textrm{P}(B \mid A)$ when $\textrm{P}(A \mid B)$ is known. It can be stated as follows: let $\{B_1,\ldots,B_n\}$ be a partition of $S$ and $A$ an event; then, for $i=1,\ldots,n$:
\begin{eqnarray}
\textrm{P}(B_i \mid A) & = & \frac {\textrm{P}(A B_i)} { \textrm{P}(A) } \nonumber \\
& = & \frac {\textrm{P}(A \mid B_i)\textrm{P}(B_i) } { \sum_{k=1}^n\textrm{P}(A \mid B_k)\textrm{P}(B_k) }
\label{eq:2-11}
\end{eqnarray}
\noindent where the denominator on the right-hand side is expanded using the law of total probability. From this formula, two further ways of writing Bayes' rule can be derived:
\begin{eqnarray}
\textrm{P}(B \mid A) & = & \frac { \textrm{P}(A \mid B)\textrm{P}(B) } {\textrm{P}(A)} \nonumber \\
& = & \frac { \textrm{P}(A \mid B)\textrm{P}(B) } {\textrm{P}(A \mid B)\textrm{P}(B)+\textrm{P}(A \mid \bar{B}) \textrm{P}(\bar{B})}
\label{eq:2-12}
\end{eqnarray}
\parinterval Bayes' rule is commonly used to infer, from an observed outcome, how likely each of its possible causes is.
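\parinterval Continuing the commuting example as a sketch (the posterior query is our own extension of the example): given that Xiao Zhang met no congestion, Bayes' rule gives the probability that route $a$ was the one taken:
\begin{verbatim}
routes = {"a": 0.5, "b": 0.3, "c": 0.2}   # P(S_r)
no_jam = {"a": 0.2, "b": 0.4, "c": 0.7}   # P(L | S_r)

P_L = sum(no_jam[r] * routes[r] for r in routes)  # total probability, 0.36

# Bayes' rule: P(S_a | L) = P(L | S_a) P(S_a) / P(L)
print(no_jam["a"] * routes["a"] / P_L)  # ~0.278
\end{verbatim}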
%----------------------------------------------------------------------------------------
% NEW SUBSUB-SECTION
%----------------------------------------------------------------------------------------
\subsubsection{KL Distance and Entropy}
%----------------------------------------------------------------------------------------
% NEW SUBSUBSUB-SECTION
%----------------------------------------------------------------------------------------
{\small\bfnew{Information Entropy}}
\parinterval {\small\bfnew{}}\index{}(Entropy)\index{Entropy}是热力学中的一个概念,同时也是对系统无序性的一种度量标准。在自然语言处理领域也会使用到信息熵这一概念,比如描述文字的信息量大小。一条信息的信息量可以被看作是这条信息的不确定性。如果需要确认一件非常不确定甚至于一无所知的事情,那么需要理解大量的相关信息才能进行确认;同样的,如果对某件事已经非常确定,那么就不需要太多的信息就可以把它搞清楚。如下就是两个例子,
\begin{example}
A certain event and an uncertain event
\qquad\qquad\quad ``The sun rises in the east''
\qquad\qquad\quad ``It will be cloudy tomorrow''
\label{eg:2-1}
\end{example}
\parinterval Of these two sentences, ``the sun rises in the east'' is (on Earth) a certain event that needs essentially no extra information to confirm, so its entropy is relatively low; whether ``it will be cloudy tomorrow'' can only be established with high probability by consulting the weather forecast, so its uncertainty, and hence its entropy, is relatively high. Entropy is thus also a measure of the uncertainty of events. Going further, the {\small\bfnew{self-information}}\index{Self-information} of an event $x$ is defined as:
\begin{eqnarray}
\textrm{I}(x)=-\log\textrm{P}(x)
\label{eq:2-13}
\end{eqnarray}
\noindent 其中,$\textrm{P}(x)$表示$x$发生的概率。自信息用来衡量单一事件发生时所包含的信息量,当底数为e时,单位为nat,其中1 nat是通过观察概率为$\frac{1}{e}$的事件而获得的信息量;当底数为2 时,单位为bit或shannon。$\textrm{I}(x)$和$\textrm{P}(x)$的函数关系如图\ref{fig:2-6} 所示。
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter2/Figures/figure-self-information-function}
\caption{自信息函数$\textrm{I}(x)$关于$\textrm{P}(x)$的曲线}
\label{fig:2-6}
\end{figure}
%-------------------------------------------
\parinterval 自信息处理的是变量单一取值的情况。若要量化整个概率分布中的不确定性或信息量,可以用信息熵,记为$\textrm{H}(X)$。其公式如下:
\begin{eqnarray}
\textrm{H}(X) & = & \sum_{x \in \textrm{X}}[ \textrm{P}(x) \textrm{I}(x)] \nonumber \\
& = & - \sum_{x \in \textrm{X} } [\textrm{P}(x)\log(\textrm{P}(x)) ]
\label{eq:2-14}
\end{eqnarray}
\parinterval 一个分布的信息熵也就是从该分布中得到的一个事件的期望信息量。比如,有$a$、$b$、$c$、$d$四支球队,它们夺冠的概率分别是$P_1$、$P_2$、$P_3$、$P_4$。如果四支球队实力相当,那么用二分法提问2次就能确定哪支球队夺冠;但如果这四支球队中$c$的实力可以碾压其他球队,那么猜1次就可以确定。对于前一种情况,哪支球队夺冠的不确定性大,信息量较高,信息熵也相对较高;对于后一种情况,因为结果是容易猜到的,信息量和信息熵也就相对较低。因此可以得知:分布越尖锐熵越低;分布越均匀熵越高。
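\parinterval 这个结论可以用一段示意性的Python代码来验证(其中的概率分布是为演示而假设的):均匀分布的熵达到最大,分布越尖锐熵越低。
\begin{verbatim}
# 信息熵计算的示意代码:均匀分布 vs. 尖锐分布(概率值为演示假设)
import math

def entropy(probs):
    # H(X) = -sum P(x) * log2 P(x),约定 0*log0 = 0
    return -sum(p * math.log2(p) for p in probs if p > 0)

print(entropy([0.25, 0.25, 0.25, 0.25]))  # 2.0:分布均匀,熵最高
print(entropy([0.7, 0.1, 0.1, 0.1]))      # 约1.357:分布尖锐,熵较低
\end{verbatim}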
%----------------------------------------------------------------------------------------
% NEW SUBSUBSUB-SECTION
%----------------------------------------------------------------------------------------
{\small\bfnew{KL距离}}
\parinterval 如果同一个随机变量$X$上有两个概率分布$\textrm{P}(x)$和$\textrm{Q}(x)$,那么可以使用KL距离(Kullback-Leibler散度)来衡量这两个分布的不同,这种度量就是{\small\bfnew{相对熵}}\index{相对熵}(Relative Entropy)\index{Relative Entropy}。其公式如下:
\begin{eqnarray}
\textrm{D}_{\textrm{KL}}(\textrm{P}\parallel \textrm{Q}) & = & \sum_{x \in \textrm{X}} [ \textrm{P}(x)\log \frac{\textrm{P}(x) }{ \textrm{Q}(x) } ] \nonumber \\
& = & \sum_{x \in \textrm{X} }[ \textrm{P}(x)(\log\textrm{P}(x)-\log \textrm{Q}(x))]
\label{eq:2-15}
\end{eqnarray}
\parinterval 相对熵的意义在于:在同一个事件空间里,若用基于概率分布$\textrm{Q}(x)$的编码来编码来自概率分布$\textrm{P}(x)$的事件,平均每个事件的信息量会比用$\textrm{P}(x)$自身编码时增加多少。它衡量的是同一个事件空间里两个概率分布的差异。KL距离有两条重要的性质:
\begin{itemize}
\vspace{0.5em}
\item 非负性,即$\textrm{D}_{\textrm{KL}} (\textrm{P} \parallel \textrm{Q}) \ge 0$,等号成立条件是$\textrm{P}$$\textrm{Q}$相等。
\vspace{0.5em}
\item 不对称性,即$\textrm{D}_{\textrm{KL}} (\textrm{P} \parallel \textrm{Q}) \neq \textrm{D}_{\textrm{KL}} (\textrm{Q} \parallel \textrm{P})$,所以KL距离并不是常用的欧氏空间中的距离。为了消除这种不对称性,有时也会使用$\textrm{D}_{\textrm{KL}} (\textrm{P} \parallel \textrm{Q})+\textrm{D}_{\textrm{KL}} (\textrm{Q} \parallel \textrm{P})$作为度量两个分布差异性的函数(下面的示意代码展示了这种不对称性)。
\vspace{0.5em}
\end{itemize}
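\parinterval 下面是一段示意性的Python代码(分布$\textrm{P}$、$\textrm{Q}$的取值为演示而假设),可以直观地看到KL距离的不对称性:
\begin{verbatim}
# KL距离不对称性的示意代码(分布P、Q为演示假设)
import math

def kl(p, q):
    # D_KL(P||Q) = sum P(x) * log(P(x)/Q(x))
    return sum(pi * math.log(pi / qi) for pi, qi in zip(p, q) if pi > 0)

P = [0.5, 0.4, 0.1]
Q = [0.3, 0.3, 0.4]
print(kl(P, Q), kl(Q, P))   # 约0.232 和 约0.315,两个值不相等
\end{verbatim}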
%----------------------------------------------------------------------------------------
% NEW SUBSUBSUB-SECTION
%----------------------------------------------------------------------------------------
{\small\bfnew{交叉熵}}
\parinterval {\small\bfnew{交叉熵}}\index{交叉熵}(Cross-entropy)\index{Cross-entropy}是一个与KL距离密切相关的概念,它的公式是:
\begin{eqnarray}
\textrm{H}(\textrm{P},\textrm{Q})=-\sum_{x \in \textrm{X}} [\textrm{P}(x) \log \textrm{Q}(x) ]
\label{eq:2-16}
\end{eqnarray}
\parinterval 结合相对熵公式可知,交叉熵对应KL距离展开式中$-\sum_{x \in \textrm{X}}\textrm{P}(x)\log \textrm{Q}(x)$这一项,即$\textrm{D}_{\textrm{KL}}(\textrm{P}\parallel\textrm{Q})=\textrm{H}(\textrm{P},\textrm{Q})-\textrm{H}(\textrm{P})$。因此,当概率分布$\textrm{P}(x)$固定时,求关于Q的交叉熵的最小值等价于求KL距离的最小值。从实践的角度来说,交叉熵与KL距离的目的相同:都是用来描述两个分布的差异。由于交叉熵计算上更加直观方便,因此在机器翻译中被广泛应用。
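\parinterval 交叉熵与KL距离的上述关系同样可以用一段示意性的Python代码进行数值验证(分布取值仅为演示假设):
\begin{verbatim}
# 交叉熵与KL距离关系的数值验证:H(P,Q) = H(P) + D_KL(P||Q)
import math

P = [0.5, 0.4, 0.1]
Q = [0.3, 0.3, 0.4]

cross_entropy = -sum(p * math.log(q) for p, q in zip(P, Q) if p > 0)
entropy_P     = -sum(p * math.log(p) for p in P if p > 0)
kl_PQ         =  sum(p * math.log(p / q) for p, q in zip(P, Q) if p > 0)

print(cross_entropy)       # 约1.175
print(entropy_P + kl_PQ)   # 与交叉熵结果相同
\end{verbatim}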
%----------------------------------------------------------------------------------------
% NEW SUB-SECTION
%----------------------------------------------------------------------------------------
\subsection{线性代数基础}
%----------------------------------------------------------------------------------------
% NEW SUBSUB-SECTION
%----------------------------------------------------------------------------------------
\subsubsection{标量、向量和矩阵}
\vspace{-0.5em}
\parinterval {\small\sffamily\bfseries{标量}}\index{标量}(Scalar)\index{Scalar}:标量亦称``无向量'',是一种只具有数值大小而没有方向的量,通俗地说,一个标量就是一个单独的数,这里特指实数\footnote{严格意义上,标量可以是复数等其他形式。这里为了方便讨论,仅以实数为对象。}。一般用小写斜体表示标量。比如,对于$ a=5 $$ a $就是一个标量。
\parinterval {\small\sffamily\bfseries{向量}}\index{向量}(Vector)\index{Vector}:向量是由一组实数组成的有序数组。与标量不同,向量既有大小也有方向。可以把向量看作空间中的点,每个元素是不同坐标轴上的坐标。公式\ref{eq:5-1}和公式\ref{eq:5-2}展示了一个行向量和一个列向量。本章默认使用行向量,如$ \mathbf a=(a_1, a_2, a_3) $$ \mathbf a $对应的列向量记为$ \mathbf a^{\rm T} $
\begin{eqnarray}
\mathbf a &=& \begin{pmatrix}
1 & 2 & 5 & 7
\end{pmatrix}\label{eq:5-1}\\ \nonumber \\
\mathbf a^{\textrm{T}} &=& \begin{pmatrix}
1 \\
2 \\
5 \\
7
\end{pmatrix}
\label{eq:5-2}
\end{eqnarray}
\parinterval {\small\sffamily\bfseries{矩阵}}\index{矩阵}(Matrix)\index{Matrix}:矩阵是一个按照长方阵列排列的实数集合,最早来自于方程组的系数及常数所构成的方阵。在计算机领域,通常将矩阵看作二维数组。我们用粗体的符号$ \mathbf a $表示一个矩阵,如果该矩阵有$ m $$ n $列,那么有$ \mathbf a\in R^{m\times n} $。这里,用不加粗的符号来表示矩阵中的元素,其中每个元素都被一个行索引和一个列索引所确定。例如,$ a_{ij} $表示第$ i $行、第$ j $列的矩阵元素。如下,公式\ref{eq:5-3}$ \mathbf a $定义了一个2行2列的矩阵。
\begin{eqnarray}
\mathbf a & = & \begin{pmatrix}
a_{11} & a_{12}\\
a_{21} & a_{22}
\end{pmatrix} \nonumber \\
& = & \begin{pmatrix}
1 & 2\\
3 & 4
\end{pmatrix}
\label{eq:5-3}
\end{eqnarray}
%----------------------------------------------------------------------------------------
% NEW SUBSUB-SECTION
%----------------------------------------------------------------------------------------
\subsubsection{矩阵的转置}
\parinterval {\small\sffamily\bfseries{转置}}\index{转置}(Transpose)\index{Transpose}是矩阵的重要操作之一。矩阵的转置可以看作是将矩阵以对角线为镜像进行翻转:假设$ \mathbf a $$ m $$ n $列的矩阵,第$ i $行、第$ j $ 列的元素是$ a_{ij} $,即:$ \mathbf a={(a_{ij})}_{m\times n} $,把$ m\times n $矩阵$ \mathbf a $的行换成同序数的列得到一个$ n\times m $矩阵,则得到$ \mathbf a $的转置矩阵,记为$ \mathbf a^{\rm T} $,其中$ a_{ji}^{\rm T}=a_{ij} $。例如:
\begin{eqnarray}
\mathbf a & = & \begin{pmatrix} 1 & 3 & 2 & 6\\5 & 4 & 8 & 2\end{pmatrix} \\ \nonumber \\
{\mathbf a}^{\rm T} & = &\begin{pmatrix} 1 & 5\\3 & 4\\2 & 8\\6 & 2\end{pmatrix}
\end{eqnarray}
\parinterval 向量可以看作只有一行(列)的矩阵。对应地,向量的转置可以看作是只有一列(行)的矩阵。标量可以看作是只有一个元素的矩阵。因此,标量的转置等于它本身,即$ a^{\rm T}=a $
%----------------------------------------------------------------------------------------
% NEW SUBSUB-SECTION
%----------------------------------------------------------------------------------------
\subsubsection{矩阵加法和数乘}
\parinterval 矩阵加法又被称作{\small\sffamily\bfseries{按元素加法}}\index{按元素加法}(Element-wise Addition)\index{Element-wise Addition}。它是指两个矩阵把其相对应元素加在一起的运算,通常的矩阵加法被定义在两个形状相同的矩阵上。两个$ m\times n $矩阵$ \mathbf a $$ \mathbf b $的和,标记为$ \mathbf a + \mathbf b $,它也是个$ m\times n $矩阵,其内的各元素为其相对应元素相加后的值。如果矩阵$ \mathbf c = \mathbf a + \mathbf b $,则$ c_{ij} = a_{ij} + b_{ij} $。公式\ref{eq:5-4}展示了矩阵之间进行加法的计算过程。
\begin{eqnarray}
\begin{pmatrix}
1 & 3\\
1 & 0\\
1 & 2
\end{pmatrix}\;\;+\;\;\begin{pmatrix}
0 & 0\\
7 & 5\\
2 & 1
\end{pmatrix}&=&\begin{pmatrix}
1+0 & 3+0\\
1+7 & 0+5\\
1+2 & 2+1
\end{pmatrix}\;\;=\;\;\begin{pmatrix}
1 & 3\\
8 & 5\\
3 & 3
\end{pmatrix}
\label{eq:5-4}
\end{eqnarray}
\parinterval 矩阵加法满足以下运算规律:
\begin{itemize}
\vspace{0.5em}
\item 交换律:$ \mathbf a+\mathbf b = \mathbf b +\mathbf a $
\vspace{0.5em}
\item 结合律:$ (\mathbf a+\mathbf b)+\mathbf c = \mathbf a+(\mathbf b+\mathbf c) $
\vspace{0.5em}
\item $ \mathbf a+\mathbf 0=\mathbf a $,其中$ \mathbf 0 $指的是零矩阵,即元素皆为0的矩阵。
\vspace{0.5em}
\item $ \mathbf a+(-\mathbf a)=\mathbf 0 $,其中$ -\mathbf a $是矩阵$ \mathbf a $的负矩阵,即将矩阵$ \mathbf a $的每个元素取负得到的矩阵。
\vspace{0.5em}
\end{itemize}
\parinterval 矩阵的{\small\bfnew{数乘}}\index{数乘}(Scalar Multiplication)\index{Scalar Multiplication}是指标量(实数)与矩阵的乘法运算,计算过程是将标量与矩阵的每个元素相乘,最终得到与原矩阵形状相同的矩阵。例如,矩阵$ \mathbf a={(a_{ij})}_{m\times n} $与标量$ k $进行数乘运算,其结果矩阵$ \mathbf b={(ka_{ij})}_{m\times n} $,即$ k{(a_{ij})}_{m\times n}={(ka_{ij})}_{m\times n} $。下面的式子展示了矩阵数乘的计算过程:
\begin{eqnarray}
\mathbf a & = &
\begin{pmatrix}
3 & 2 & 7\\
5 & 8 & 1
\end{pmatrix}
\\ \nonumber \\
2\mathbf a & = &
\begin{pmatrix}
6 & 4 & 14\\
10 & 16 & 2
\end{pmatrix}
\label{eq:5-5}
\end{eqnarray}
\parinterval 矩阵的数乘满足以下运算规律,其中$ k $$ l $是实数,$ \mathbf a $$ \mathbf b $是形状相同的矩阵:
\begin{itemize}
\vspace{0.5em}
\item 右分配律:$ k(\mathbf a+\mathbf b)=k\mathbf a+k\mathbf b $
\vspace{0.5em}
\item 左分配律:$ (k+l)\mathbf a=k\mathbf a+l\mathbf a $
\vspace{0.5em}
\item 结合律:$ (kl)\mathbf a=k(l\mathbf a) $
%\vspace{0.5em}
\end{itemize}
%----------------------------------------------------------------------------------------
% NEW SUBSUB-SECTION
%----------------------------------------------------------------------------------------
\subsubsection{矩阵乘法和矩阵点乘}
\parinterval 矩阵乘法是矩阵运算中最重要的操作之一,为了与矩阵点乘区分,通常也把矩阵乘法叫做矩阵叉乘。假设$ \mathbf a $$ m\times p $的矩阵,$ \mathbf b $$ p\times n $的矩阵,对$ \mathbf a $$ \mathbf b $作矩阵乘法的结果是一个$ m\times n $的矩阵$ \mathbf c $,其中矩阵$ \mathbf c $中第$ i $行、第$ j $列的元素可以表示为:
\begin{eqnarray}
{(\mathbf a\mathbf b)}_{ij} &=& \sum_{k=1}^p a_{ik}b_{kj}
\label{eq:5-6}
\end{eqnarray}
\parinterval 只有当第一个矩阵的列数与第二个矩阵的行数相等时,两个矩阵才可以作矩阵乘法。公式\ref{eq:5-7}展示了矩阵乘法的运算过程,若$\mathbf a=\begin{pmatrix}a_{11} & a_{12}& a_{13}\\a_{21} & a_{22} & a_{23}\end{pmatrix}$$\mathbf b=\begin{pmatrix}b_{11} & b_{12}\\b_{21} & b_{22}\\b_{31} & b_{32}\end{pmatrix} $,则有:
\vspace{-0.5em}
\begin{eqnarray}
\mathbf c & = & \mathbf a\mathbf b \nonumber \\
& = & \begin{pmatrix}
a_{11}b_{11}+a_{12}b_{21}+a_{13}b_{31} & a_{11}b_{12}+a_{12}b_{22}+a_{13}b_{32}\\
a_{21}b_{11}+a_{22}b_{21}+a_{23}b_{31} & a_{21}b_{12}+a_{22}b_{22}+a_{23}b_{32}
\end{pmatrix}
\label{eq:5-7}
\end{eqnarray}
\parinterval 矩阵乘法满足以下运算规律:
\begin{itemize}
\vspace{0.5em}
\item 结合律:若$ \mathbf a\in R^{m\times n} $$ \mathbf b\in R^{n\times p} $$ \mathbf c\in R^{p\times q} $,则$ (\mathbf {ab})\mathbf c=\mathbf a(\mathbf {bc}) $
\vspace{0.5em}
\item 左分配律:若$ \mathbf a\in R^{m\times n} $$ \mathbf b\in R^{m\times n} $$ \mathbf c\in R^{n\times p} $,则$ (\mathbf a+\mathbf b)\mathbf c=\mathbf {ac}+\mathbf {bc} $
\vspace{0.5em}
\item 右分配律:若$ \mathbf a\in R^{m\times n} $$ \mathbf b\in R^{n\times p} $$ \mathbf c\in R^{n\times p} $,则$ \mathbf a(\mathbf b+\mathbf c)=\mathbf {ab}+\mathbf {ac} $
\vspace{0.5em}
\end{itemize}
\begin{spacing}{1.4}
\parinterval 可以将线性方程组用矩阵乘法表示,如对于线性方程组$ \begin{cases} 5x_1+2x_2=y_1\\3x_1+x_2=y_2\end{cases} $,可以表示为$ \mathbf {ax}^{\rm T}=\mathbf y^{\rm T}$,其中$ \mathbf a = \begin{pmatrix} 5 & 2\\3 & 1\end{pmatrix} $$ \mathbf x^{\rm T} = \begin{pmatrix} x_1\\x_2\end{pmatrix} $$ \mathbf y^{\rm T} = \begin{pmatrix} y_1\\y_2\end{pmatrix} $
\end{spacing}
\parinterval 矩阵的点乘就是两个形状相同的矩阵各个对应元素相乘,矩阵点乘也被称为{\small\bfnew{按元素乘积}}\index{按元素乘积}(Element-wise Product)\index{Element-wise Product}或Hadamard乘积,记为$ \mathbf a \odot \mathbf b$。例如,对于
\begin{eqnarray}
\mathbf a &=&
\begin{pmatrix}
1 & 0\\
-1 & 3
\end{pmatrix}
\\ \nonumber \\
\mathbf b &=&
\begin{pmatrix}
3 & 1\\
2 & 1
\end{pmatrix}
\end{eqnarray}
\parinterval 矩阵点乘的计算如下:
\begin{eqnarray}
\mathbf c & = & \mathbf a\odot \mathbf b \nonumber \\
& = & \begin{pmatrix}
1\times 3 & 0\times1\\
-1\times2 & 3\times1
\end{pmatrix}
\label{eq:5-8}
\end{eqnarray}
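\parinterval 矩阵乘法与按元素乘积的区别可以用一段示意性的Python代码(基于NumPy,矩阵取值沿用上文例子)来直观对比:
\begin{verbatim}
# 矩阵乘法与按元素乘积(Hadamard乘积)对比的示意代码
import numpy as np

a = np.array([[1, 0], [-1, 3]])
b = np.array([[3, 1], [2, 1]])

print(np.matmul(a, b))   # 矩阵乘法:[[3 1] [3 2]]
print(a * b)             # 按元素乘积,对应上式结果:[[3 0] [-2 3]]
\end{verbatim}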
%----------------------------------------------------------------------------------------
% NEW SUBSUB-SECTION
%----------------------------------------------------------------------------------------
\subsubsection{线性映射}
\parinterval {\small\sffamily\bfseries{线性映射}}\index{线性映射}(Linear Mapping)\index{Linear Mapping}或{\small\sffamily\bfseries{线性变换}}\index{线性变换}(Linear Transformation)\index{Linear Transformation}是从一个向量空间$V$到另一个向量空间$W$的映射函数$f:V\rightarrow W$,且该映射函数保持加法运算和数量乘法运算,即对于空间$V$中任何两个向量$ \mathbf u $和$ \mathbf v $以及任何标量$ c $,有:
\begin{eqnarray}
f(\mathbf u+\mathbf v)&=&f(\mathbf u)+f(\mathbf v)\label{eq:5-9}\\
f(c\mathbf v)&=&cf(\mathbf v)
\label{eq:5-10}
\end{eqnarray}
\parinterval 利用矩阵$ \mathbf a\in R^{m\times n} $,可以实现两个有限维欧氏空间之间的映射函数$f:R^n\rightarrow R^m$。例如,对于$ n $维列向量$ \mathbf x ^{\rm T}$和$ m\times n $的矩阵$ \mathbf a $,用矩阵$ \mathbf a $左乘向量$ \mathbf x ^{\rm T}$,可将$ \mathbf x ^{\rm T}$映射为$ m $维列向量。对于
\begin{eqnarray}
\mathbf x^{\textrm{T}} & = & {\begin{pmatrix} x_1 & x_2 & \dots & x_n \end{pmatrix}}^{\rm T}
\label{eq:5-11}
\end{eqnarray}
\begin{eqnarray}
\mathbf a&=&
\begin{pmatrix}
a_{11} & a_{12} & \dots & a_{1n}\\
a_{21} & a_{22} & \dots & a_{2n}\\
\vdots & \vdots & \ddots & \vdots \\
a_{m1} & a_{m2} & \dots & a_{mn}
\end{pmatrix}
\label{eq:5-12}
\end{eqnarray}
\parinterval 可以得到:
\begin{eqnarray}
\mathbf y^{\textrm{T}}& = &\mathbf a\mathbf x^{\textrm{T}} \nonumber \\
& = &
\begin{pmatrix}
a_{11}x_{1}+a_{12}x_{2}+\dots+a_{1n}x_{n}\\
a_{21}x_{1}+a_{22}x_{2}+\dots+a_{2n}x_{n}\\
\vdots \\
a_{m1}x_{1}+a_{m2}x_{2}+\dots+a_{mn}x_{n}
\label{eq:5-13}\end{pmatrix}
\end{eqnarray}
\parinterval 上例中矩阵$ \mathbf a $定义了一个从$ R^n $到$ R^m $的线性映射:向量$ \mathbf x^{\textrm{T}}\in R^n $和$ \mathbf y^{\textrm{T}}\in R^m $分别为两个空间中的列向量,即大小为$ n\times 1 $和$ m\times 1 $ 的矩阵。
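\parinterval 矩阵所实现的映射确实满足线性映射的两条性质,这一点可以用一段示意性的Python代码(基于NumPy,矩阵与向量取值为演示假设)做数值验证:
\begin{verbatim}
# 线性映射性质的数值验证示意:f(u+v)=f(u)+f(v),f(cv)=cf(v)
import numpy as np

a = np.array([[5, 2], [3, 1]])   # 映射矩阵(数值为演示假设)
u = np.array([1.0, 2.0])
v = np.array([3.0, -1.0])

print(np.allclose(a @ (u + v), a @ u + a @ v))    # True,保持加法
print(np.allclose(a @ (2.5 * v), 2.5 * (a @ v)))  # True,保持数乘
\end{verbatim}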
%----------------------------------------------------------------------------------------
% NEW SUBSUB-SECTION
%----------------------------------------------------------------------------------------
\subsubsection{范数}
\parinterval 工程领域经常会使用被称为{\small\bfnew{范数}}\index{范数}(Norm)\index{Norm}的函数来衡量向量大小,范数为向量空间内除零向量外的所有向量赋予正的长度或大小。对于一个$n$维向量$ \mathbf x $,一个常见的范数函数为$ l_p $ 范数,通常表示为$ {\Vert{\mathbf x}\Vert}_p $ ,其中$p\ge 1$,是一个标量形式的参数。常用的$ p $的取值有$ 1 $、$ 2 $、$ \infty $等。范数的计算公式为:
\begin{eqnarray}
l_p(\mathbf x) & = & {\Vert{\mathbf x}\Vert}_p \nonumber \\
& = & {\left (\sum_{i=1}^{n}{{\vert x_{i}\vert}^p}\right )}^{\frac{1}{p}}
\label{eq:5-14}
\end{eqnarray}
\parinterval $ l_1 $范数为向量的各个元素的绝对值之和:
\begin{eqnarray}
{\Vert{\mathbf x}\Vert}_1&=&\sum_{i=1}^{n}{\vert x_{i}\vert}
\label{eq:5-15}
\end{eqnarray}
\parinterval $ l_2 $范数为向量的各个元素平方和的二分之一次方:
\begin{eqnarray}
{\Vert{\mathbf x}\Vert}_2&=&\sqrt{\sum_{i=1}^{n}{{x_{i}}^2}} \nonumber \\
&=&\sqrt{{\mathbf x}^{\rm T}\mathbf x}
\label{eq:5-16}
\end{eqnarray}
\parinterval $ l_2 $范数被称为{\small\bfnew{欧几里得范数}}\index{欧几里得范数}(Euclidean Norm)\index{Euclidean Norm}。从几何角度,向量也可以表示为从原点出发的一个带箭头的有向线段,其$ l_2 $范数为线段的长度,也常被称为向量的模。$ l_2 $ 范数在机器学习中非常常用,向量$ \mathbf x $$ l_2 $范数经常简化为$ \Vert{\mathbf x}\Vert $,可以简单地通过点积$ {\mathbf x}^{\rm T}\mathbf x $计算。
\parinterval $ l_{\infty} $范数为向量的各个元素的最大绝对值:
\begin{eqnarray}
{\Vert{\mathbf x}\Vert}_{\infty}&=&\max\{\vert x_1\vert,\vert x_2\vert,\dots,\vert x_n\vert\}
\label{eq:5-17}
\end{eqnarray}
\parinterval 广义上讲,范数是将向量映射到非负值的函数,其作用是衡量向量$ \mathbf x $到坐标原点的距离。更严格地说,范数并不局限于$ l_p $范数,任何一个同时满足下列性质的函数都可以作为范数:
\begin{itemize}
\vspace{0.5em}
\item$ f(x)=0 $,则$ x=0 $
\vspace{0.5em}
\item 三角不等式:$ f(x+y)\leqslant f(x)+f(y) $
\vspace{0.5em}
\item 任意实数$ \alpha $$ f(\alpha x)=\vert \alpha \vert f(x) $
\vspace{0.5em}
\end{itemize}
\parinterval 在深度学习中,有时候希望衡量矩阵的大小,这时可以考虑使用 {\small\bfnew{Frobenius 范数}}\index{Frobenius 范数}(Frobenius Norm)\index{Frobenius Norm}。计算方式为:
\begin{eqnarray}
{\Vert{\mathbf a}\Vert}_F&=&\sqrt{\sum_{i,j} a_{i,j}^2}
\label{eq:5-18}
\end{eqnarray}
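\parinterval 本节介绍的几种范数都可以直接用NumPy计算。下面是一段示意性的Python代码(向量与矩阵取值为演示假设):
\begin{verbatim}
# 向量范数与矩阵Frobenius范数计算的示意代码
import numpy as np

x = np.array([1.0, -2.0, 3.0])
print(np.linalg.norm(x, 1))        # l1范数:|1|+|-2|+|3| = 6.0
print(np.linalg.norm(x, 2))        # l2范数:sqrt(14),约3.742
print(np.linalg.norm(x, np.inf))   # l_inf范数:最大绝对值 3.0

a = np.array([[1.0, 2.0], [3.0, 4.0]])
print(np.linalg.norm(a, 'fro'))    # Frobenius范数:sqrt(30),约5.477
\end{verbatim}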
%----------------------------------------------------------------------------------------
% NEW SECTION
%----------------------------------------------------------------------------------------
\sectionnewpage
\section{掷骰子游戏}
\parinterval 上述过程的核心在于从数据中学习一种对分词现象的统计描述,即学习函数$\textrm{P}(\cdot)$。如何让计算机利用分好词的数据学习到分词的知识呢?可以先看一个有趣的实例(图\ref{fig:2-11}):用生活中比较常见的掷骰子来说,掷一个骰子,玩家猜一个数字,猜中就算赢。按照一般的常识,随便选哪个数字获胜的概率都是一样的,即获胜的概率仅为$1/6$,因此玩家很难获胜,除非运气很好。假设进行一次游戏,玩家随便选了一个数字,比如是1,投掷30次骰子,发现数字1命中了7次,命中率$7/30 > 1/6$,战绩还不错。
\vspace{-0.5em}
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter2/Figures/figure-the-dice-game}
%\setlength{\belowcaptionskip}{-0.5cm}
\caption{骰子结果}
\label{fig:2-11}
\end{figure}
%-------------------------------------------
\vspace{-0.5em}
\parinterval 似乎玩家的胜利只能来源于运气。不过,请注意,这里的假设``随便选一个数字''本身就是一个概率模型,它对骰子的六个面的出现做了均匀分布假设:
\begin{eqnarray}
\textrm{P(``1'')}=\textrm{P(``2'')}=...=\textrm{P(``5'')}=\textrm{P(``6'')}=1/6
\label{eq:2-17}
\end{eqnarray}
\vspace{-0.5em}
\parinterval 但是这个游戏没有人规定骰子是均匀的(有些被坑了的感觉)。如果骰子的六个面不均匀呢?我们可以用一种更加``聪明''的方式定义一种新的模型,即定义骰子的每一个面都以一定的概率出现,而不是相同的概率。描述如下:
\begin{eqnarray}
\textrm{P(``1'')} &=&\theta_1 \nonumber \\
\textrm{P(``2'')} &=&\theta_2 \nonumber \\
\textrm{P(``3'')} &=&\theta_3 \nonumber \\
\textrm{P(``4'')} &=&\theta_4 \nonumber \\
\textrm{P(``5'')} &=&\theta_5 \nonumber \\
\textrm{P(``6'')} &=&1-\sum_{1 \leq i \leq 5}\theta_i \qquad \lhd \textrm {归一性}
\label{eq:2-18}
\end{eqnarray}
\noindent 这里,$\theta_1 \sim \theta_5$可以被看作是模型的参数,因此这个模型的自由度是5。对于这样的模型,参数确定了,模型也就确定了。但是,新的问题来了,在定义骰子每个面的概率后,如何求出具体的值呢?一种常用的方法是,从大量实例中学习模型参数,这个方法也就是常说的{\small\bfnew{参数估计}}\index{参数估计}(Parameter Estimation)\index{Parameter Estimation}。可以先将这个不均匀的骰子实验性地掷很多次,这可以被看作是独立同分布的若干次采样。比如掷$X$ 次,发现``1'' 出现$X_1$ 次,``2'' 出现$X_2$ 次,以此类推,可以得到各个面出现的次数。假设掷骰子中每个面出现的概率服从多项分布,通过简单的概率论知识可以知道每个面出现概率的极大似然估计为:
\begin{eqnarray}
\textrm{P(``i'')}=\frac {X_i}{X}
\label{eq:2-19}
\end{eqnarray}
\parinterval$X$足够大的时,$\frac{X_i}{X}$可以无限逼近P(``$i$'')的真实值,因此可以通过大量的实验推算出掷骰子各个面的概率的准确估计值。回归到原始的问题,如果在正式开始游戏前,预先掷骰子30次,得到如图\ref{fig:2-12}的结果。
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter2/Figures/figure-the-dice-game2}
\caption{实验性投掷骰子的结果}
\label{fig:2-12}
\end{figure}
%-------------------------------------------
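\parinterval 上述``先做实验、再估计参数''的过程可以用一段示意性的Python代码来模拟(代码中骰子的真实分布是为演示而假设的),其中相对频率$X_i/X$即为各面概率的极大似然估计:
\begin{verbatim}
# 极大似然估计的模拟示意:从掷骰子数据中估计每个面的概率
import random
from collections import Counter

true_probs = [0.1, 0.1, 0.1, 0.4, 0.15, 0.15]   # 一个不均匀骰子(演示假设)
faces = [1, 2, 3, 4, 5, 6]
samples = random.choices(faces, weights=true_probs, k=10000)

counts = Counter(samples)
for f in faces:
    print(f, counts[f] / len(samples))   # P("f") = X_f / X,接近真实分布
\end{verbatim}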
\parinterval 于是,我们看到了一个有倾向性的模型(图 \ref{fig:2-13}):在这样的预先实验基础上,可以知道如果再次玩掷骰子游戏的话,选择数字``4''获胜的可能性是最大的。
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter2/Figures/figure-the-dice-game-model}
\caption{投骰子模型}
\label{fig:2-13}
\end{figure}
%-------------------------------------------
\parinterval 通过上面这个掷骰子的游戏,可以得到一个道理:{\small\sffamily\bfseries{上帝是不公平的}}。因为在``公平''的世界中,没有任何一个模型可以学到有价值的事情。从机器学习的角度来看,所谓的``不公平''实际上是客观事物中蕴含的一种{\small\sffamily\bfseries{偏置}}\index{偏置}(Bias)\index{Bias},也就是说,很多事情天然就对某些情况有倾向。图像处理、自然语言处理等问题中绝大多数都存在着偏置。比如,翻译一个英文单词的时候,它最可能的翻译结果往往就是那几个词。设计统计模型的目的正是要学习这种偏置,之后利用这种偏置对新的问题做出足够好的决策。
%----------------------------------------------------------------------------------------
% NEW SECTION
%----------------------------------------------------------------------------------------
\sectionnewpage
\section{$n$-gram语言模型 }
\parinterval 在基于统计的汉语分词模型中,我们通过``大题小做''的技巧,利用独立性假设把整个句子的切分概率转化为每个单词出现概率的乘积。这里,每个单词也被称作1-gram(或uni-gram),而1-gram概率的乘积实际上也是在度量词序列出现的可能性(记为$\textrm{P}(w_1 w_2...w_m)$)。这种计算整个单词序列概率$\textrm{P}(w_1 w_2...w_m)$的方法被称为统计语言模型。1-gram语言模型是最简单的一种语言模型,它没有考虑任何的上下文。很自然的一个问题是:能否考虑上下文信息构建更强大的语言模型,进而得到更准确的分词结果?下面将进一步介绍更加通用的$n$-gram语言模型,它在机器翻译及其他自然语言处理任务中有更加广泛的应用。
%----------------------------------------------------------------------------------------
% NEW SUB-SECTION
%----------------------------------------------------------------------------------------
\subsection{建模}
\parinterval {\small\sffamily\bfseries{语言模型}}\index{语言模型}(Language Model)\index{Language Model}的目的是描述文字序列出现的规律。这个对问题建模的过程被称作{\small\sffamily\bfseries{语言建模}}\index{语言建模}(Language Modeling)\index{Language Modeling}。如果使用统计建模的方式,语言模型可以被定义为计算$\textrm{P}(w_1 w_2...w_m)$的问题,也就是计算整个词序列$w_1 w_2...w_m$出现的可能性大小。具体定义如下,
%----------------------------------------------
% 定义3.1
\vspace{0.5em}
\begin{definition}[]
词汇表$V$上的语言模型是一个函数$\textrm{P}(w_1 w_2...w_m)$,它表示$V^+$上的一个概率分布。其中,对于任何词串$w_1 w_2...w_m\in{V^+}$,有$\textrm{P}(w_1 w_2...w_m)\geq{0}$。而且,对于所有的词串,函数满足归一化条件$\sum{_{w_1 w_2...w_m\in{V^+}}\textrm{P}(w_1 w_2...w_m)}=1$。
\end{definition}
%-------------------------------------------
\parinterval 直接求$\textrm{P}(w_1 w_2...w_m)$并不简单,因为如果把整个词串$w_1 w_2...w_m$作为一个变量,模型的参数量会非常大。$w_1 w_2...w_m$$|V|^m$种可能性,这里$|V|$表示词汇表大小。显然,当$m$ 增大时,模型的复杂度会急剧增加,甚至都无法进行存储和计算。既然把$w_1 w_2...w_m$作为一个变量不好处理,就可以考虑对这个序列的生成过程进行分解。使用链式法则,很容易得到
\begin{eqnarray}
\textrm{P}(w_1 w_2...w_m)=\textrm{P}(w_1)\textrm{P}(w_2|w_1)\textrm{P}(w_3|w_1 w_2)...\textrm{P}(w_m|w_1 w_2...w_{m-1})
\label{eq:2-22}
\end{eqnarray}
这样,$w_1 w_2...w_m$的生成可以被看作是逐个生成每个单词的过程,即首先生成$w_1$,然后根据$w_1$再生成$w_2$,然后根据$w_1 w_2$再生成$w_3$,以此类推,直到根据所有前$m-1$个词生成序列的最后一个单词$w_m$。这个模型把联合概率$\textrm{P}(w_1 w_2...w_m)$分解为多个条件概率的乘积,虽然对生成序列的过程进行了分解,但是模型的复杂度和以前是一样的,比如,$\textrm{P}(w_m|w_1 w_2...w_{m-1})$ 仍然不好计算。
\parinterval 换一个角度看,$\textrm{P}(w_m|w_1 w_2...w_{m-1})$体现了一种基于``历史''的单词生成模型,也就是把前面生成的所有单词作为``历史'',并参考这个``历史''生成当前单词。但是这个``历史''的长度和整个序列长度是相关的,也是一种长度变化的历史序列。为了化简问题,一种简单的想法是使用定长历史,比如,每次只考虑前面$n-1$个历史单词来生成当前单词,这就是$n$-gram语言模型。这个模型的数学描述如下:
\begin{eqnarray}
\textrm{P}(w_m|w_1 w_2...w_{m-1}) \approx \textrm{P}(w_m|w_{m-n+1}...w_{m-1})
\label{eq:2-23}
\end{eqnarray}
\parinterval 这样,整个序列$w_1 w_2...w_m$的生成概率可以被重新定义为:
\vspace{0.5em}
\begin{center}
{\footnotesize
\begin{tabular}{l|l|l l|l}
链式法则 & 1-gram & 2-gram & $...$ & $n$-gram\\
\hline
\rule{0pt}{10pt} $\textrm{P}(w_1 w_2...w_m)$ = & $\textrm{P}(w_1 w_2...w_m)$ = & $\textrm{P}(w_1 w_2...w_m)$ = & $...$ & $\textrm{P}(w_1 w_2...w_m)$ = \\
\rule{0pt}{10pt} $\textrm{P}(w_1)\times$ & $\textrm{P}(w_1)\times$ & $\textrm{P}(w_1)\times$ & $...$ & $\textrm{P}(w_1)\times$ \\
\rule{0pt}{10pt} $\textrm{P}(w_2|w_1)\times$ & $\textrm{P}(w_2)\times$ & $\textrm{P}(w_2|w_1)\times$ & $...$ & $\textrm{P}(w_2|w_1)\times$\\
\rule{0pt}{10pt} $\textrm{P}(w_3|w_1 w_2)\times$ & $\textrm{P}(w_3)\times$ & $\textrm{P}(w_3|w_2)\times$ & $...$ & $\textrm{P}(w_3|w_1 w_2)\times$ \\
\rule{0pt}{10pt} $\textrm{P}(w_4|w_1 w_2 w_3)\times$ & $\textrm{P}(w_4)\times$ & $\textrm{P}(w_4|w_3)\times$ & $...$ & $\textrm{P}(w_4|w_1 w_2 w_3)\times$ \\
\rule{0pt}{10pt} $...$ & $...$ & $...$ & $...$ & $...$ \\
\rule{0pt}{10pt} $\textrm{P}(w_m|w_1 ... w_{m-1})$ & $\textrm{P}(w_m)$ & $\textrm{P}(w_m|w_{m-1})$ & $...$ & $\textrm{P}(w_m|w_{m-n+1} ... w_{m-1})$
\end{tabular}
}
\end{center}
\vspace{-1.5em}
\parinterval 可以看到,1-gram语言模型只是$n$-gram语言模型的一种特殊形式。$n$-gram的优点在于,它所使用的历史信息是有限的,即$n-1$个单词。这种性质也反映了经典的马尔可夫链的思想\cite{liuke-markov-2004}\cite{resnick1992adventures},有时也被称作马尔可夫假设或者马尔可夫属性。因此$n$-gram也可以被看作是变长序列上的一种马尔可夫模型,比如,2-gram语言模型对应着1阶马尔可夫模型,3-gram语言模型对应着2阶马尔可夫模型,以此类推。
\parinterval 那么,如何计算$\textrm{P}(w_m|w_{m-n+1} ... w_{m-1})$呢?有很多种选择,比如:
\begin{itemize}
\vspace{0.5em}
\item {\small\bfnew{极大似然估计}}\index{极大似然估计}。直接利用词序列在训练数据中出现的频度计算出$\textrm{P}(w_m|w_{m-n+1}$\\$... w_{m-1})$
\begin{eqnarray}
\textrm{P}(w_m|w_{m-n+1}...w_{m-1})=\frac{\textrm{count}(w_{m-n+1}...w_m)}{\textrm{count}(w_{m-n+1}...w_{m-1})}
\label{eq:2-24}
\vspace{0.5em}
\end{eqnarray}
其中,$\textrm{count}(\cdot)$是在训练数据中统计频次的函数。
\vspace{0.5em}
\item {\small\bfnew{人工神经网络方法}}\index{人工神经网络方法}。构建一个人工神经网络估计$\textrm{P}(w_m|w_{m-n+1} ... w_{m-1})$的值,比如,可以构建一个前馈神经网络来对$n$-gram进行建模。
\end{itemize}
\vspace{0.5em}
\parinterval 极大似然估计方法和前面介绍的统计分词中的方法是一致的,它的核心是使用$n$-gram出现的频度进行参数估计,因此也是自然语言处理中一类经典的$n$-gram方法。基于人工神经网络的方法在近些年也非常受关注,它直接利用多层神经网络对问题的输入$(w_{m-n+1}...w_{m-1})$和输出$\textrm{P}(w_m|w_{m-n+1} ... w_{m-1})$进行建模,而模型的参数通过网络中神经元之间连接的权重进行体现。严格意义上来说,基于人工神经网络的方法并不算基于$n$-gram的方法,或者说它并没有显性记录$n$-gram的生成概率,也不依赖$n$-gram的频度进行参数估计。为了保证内容的连贯性,本章仍以传统$n$-gram语言模型为基础进行讨论,基于人工神经网络的方法将会在第五章和第六章进行详细介绍。
\parinterval $n$-gram语言模型的使用非常简单。可以像\ref{sec2:statistical-seg}节中一样,直接用它来对词序列出现的概率进行计算。比如,可以使用一个2-gram语言模型计算一个分词序列的概率:
\begin{eqnarray}
& &\textrm{P}_{\textrm{2-gram}}{(\textrm{``确实}/\textrm{现在}/\textrm{数据}/\textrm{很}/\textrm{多''})} \nonumber \\
&= & \textrm{P}(\textrm{``确实''}) \times\textrm{P}(\textrm{``现在''}|\textrm{``确实''})\times\textrm{P}(\textrm{``数据''}|\textrm{``现在''}) \times \nonumber \\
& & \textrm{P}(\textrm{``很''}|\textrm{``数据''})\times\textrm{P}(\textrm{``多''}|\textrm{``很''})
\label{eq:2-25}
\end{eqnarray}
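\parinterval 这种计算过程用程序实现非常直接。下面是一段示意性的Python代码(其中的计数值均为演示而假设,实际应从分好词的训练语料中统计得到):
\begin{verbatim}
# 用2-gram语言模型计算切分概率的示意代码(计数值为演示假设)
total = 1000                                  # 语料中的总词数
unigram_count = {'确实': 10, '现在': 8, '数据': 9, '很': 12, '多': 7}
bigram_count = {('确实', '现在'): 2, ('现在', '数据'): 3,
                ('数据', '很'): 4, ('很', '多'): 6}

def sent_prob(words):
    p = unigram_count[words[0]] / total       # P(w_1)
    for w1, w2 in zip(words[:-1], words[1:]):
        # P(w_2|w_1) = count(w_1 w_2) / count(w_1)
        p *= bigram_count.get((w1, w2), 0) / unigram_count[w1]
    return p

print(sent_prob(['确实', '现在', '数据', '很', '多']))
\end{verbatim}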
\parinterval$n$-gram语言模型为代表的统计语言模型的应用非常广泛。除了分词,在文本生成、信息检索、摘要等自然语言处理任务中,语言模型都有举足轻重的地位。包括近些年非常受关注的预训练模型,本质上也是统计语言模型。这些技术都会在后续章节进行介绍。值得注意的是,统计语言模型为解决自然语言处理问题提供了一个非常好的建模思路,即:把整个序列生成的问题转化为逐个生成单词的问题。很快我们就会看到,这种建模方式会被广泛地用于机器翻译建模,在统计机器翻译和神经机器翻译中都会有明显的体现。
%----------------------------------------------------------------------------------------
% NEW SUB-SECTION
%----------------------------------------------------------------------------------------
\subsection{未登录词和平滑算法}\label{sec2:smoothing}
\parinterval 在式\ref{eq:2-25}所示的例子中,如果语料中从没有``确实''和``现在''两个词连续出现的情况,那么使用2-gram计算切分``确实/现在/数据/很/多''的概率时,会出现如下情况
\begin{eqnarray}
\textrm{P}(\textrm{``现在''}|\textrm{``确实''}) & = & \frac{\textrm{count}(\textrm{``确实}\,\textrm{现在''})}{\textrm{count}(\textrm{``确实''})} \nonumber \\
& = & \frac{0}{\textrm{count}(\textrm{``确实''})} \nonumber \\
& = & 0
\label{eq:2-26}
\end{eqnarray}
\parinterval 显然,这个结果是不能接受的。因为即使语料中没有``确实''和``现在''两个词连续出现,这种搭配也是客观存在的。这时简单地用极大似然估计得到的概率却是0,导致整个切分结果的概率为0。更常见的问题是那些根本没有出现在词表中的词,称为{\small\sffamily\bfseries{未登录词}}\index{未登录词}(Out-of-Vocabulary Word,OOV Word)\index{Out-of-Vocabulary Word,OOV Word},比如一些生僻词,可能模型训练阶段从来没有看到过,这时模型仍然会给出0概率。图\ref{fig:2-18}展示了一个真实语料库中词语出现频度的分布,可以看到绝大多数词都是低频词。
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter2/Figures/figure-word-frequency-distribution}
\caption{词语频度的分布}
\label{fig:2-18}
\end{figure}
%---------------------------
\parinterval 为了解决未登录词引起的零概率问题,常用的做法是对模型进行平滑处理,也就是给可能出现的情况一个非零的概率,使得模型不会对整个序列给出零概率。平滑可以用``劫富济贫''这一思想理解:在保证所有情况的概率和为1的前提下,从高概率的部分匀出一部分概率分配给低概率(甚至零概率)的部分,从而达到平滑的目的。
\parinterval 语言模型使用的平滑算法有很多。在本节中,主要介绍三种平滑方法:加法平滑法、古德-图灵估计法和Kneser-Ney平滑。这些方法也可以被应用到其他任务的概率平滑操作中。
%----------------------------------------------------------------------------------------
% NEW SUBSUB-SECTION
%----------------------------------------------------------------------------------------
\subsubsection{加法平滑方法}
\parinterval {\small\bfnew{加法平滑}}\index{加法平滑}(Additive Smoothing)\index{Additive Smoothing}是一种简单的平滑技术。本小节首先介绍这一方法,希望通过它了解平滑算法的思想。通常情况下,系统研发者会利用采集到的语料库来模拟真实的全部语料库。当然,没有一个语料库能覆盖所有的语言现象。常见的一个问题是,使用的语料无法涵盖所有的词汇。因此,直接依据这样的语料所获得的统计信息来训练语言模型就会产生偏差。假设依据某语料$C$(其中从未出现``确实 现在''这个二元语法),评估一个已经分好词的句子$S$=``确实/现在/物价/很/高''的概率。当计算``确实/现在''的概率时,由于$\textrm{P}(\textrm{``现在''}|\textrm{``确实''})=0$,会得到$\textrm{P}(S) = 0$。显然这个结果是不合理的。
\parinterval 加法平滑方法假设每个$n$-gram出现的次数比实际统计次数多$\theta$次,$0 \le \theta\le 1$。这样,计算概率的时候分子部分不会为0。重新计算$\textrm{P}(\textrm{现在}|\textrm{确实})$,可以得到:
\begin{eqnarray}
\textrm{P}(\textrm{``现在''}|\textrm{``确实''}) & = & \frac{\theta + \textrm{count}(\textrm{``确实''/``现在''})}{\sum_{w \in V}(\theta + \textrm{count}(\textrm{``确实''/}w))} \nonumber \\
& = & \frac{\theta + \textrm{count}(\textrm{``确实''/``现在''})}{\theta{|V|} + \textrm{count}(\textrm{``确实''})}
\label{eq:2-27}
\end{eqnarray}
\noindent 其中,$V$表示所有词汇的词表,$|V|$为词表中单词的个数,$w$为词典中的一个词。有时候,加法平滑方法会将$\theta$取1,这时称之为加一平滑或拉普拉斯平滑。这种方法比较容易理解,也比较简单,因此往往被用于系统的快速原型开发中。
\parinterval 举一个例子。假设在一个英文文档中随机采样一些单词(词表大小$|V|=20$),各个单词出现的次数为:``look'': 4,``people'': 3,``am'': 2,``what'': 1,``want'': 1,``do'': 1。图\ref{fig:2-19} 给出了在平滑之前和平滑之后的概率分布。
%----------------------------------------------
\begin{figure}[htp]
\centering
\input{./Chapter2/Figures/figure-no-smoothing&smoothed-probability-distributions}
\caption{无平滑和有平滑的概率分布}
\label{fig:2-19}
\end{figure}
%-------------------------------------------
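\parinterval 下面的Python代码给出了这个例子的一个示意实现(词表大小与计数沿用上文的假设,$\theta$取1,即加一平滑):
\begin{verbatim}
# 加法平滑的示意代码,对应上文单词计数的例子(theta取1)
counts = {'look': 4, 'people': 3, 'am': 2, 'what': 1, 'want': 1, 'do': 1}
V = 20                          # 词表大小,其余14个词计数为0
theta = 1.0
total = sum(counts.values())    # 语料中单词总数:12

def smoothed_prob(word):
    # P(w) = (theta + count(w)) / (theta*|V| + total)
    return (theta + counts.get(word, 0)) / (theta * V + total)

print(smoothed_prob('look'))    # (1+4)/(20+12) = 0.15625
print(smoothed_prob('dog'))     # 未出现的词也获得非零概率:1/32 = 0.03125
\end{verbatim}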
%----------------------------------------------------------------------------------------
% NEW SUBSUB-SECTION
%----------------------------------------------------------------------------------------
\subsubsection{古德-图灵估计法}
\vspace{-0.5em}
\parinterval {\small\bfnew{古德-图灵估计法}}\index{古德-图灵估计法}(Good-Turing Estimate)\index{Good-Turing Estimate}是图灵(Alan Turing)和他的助手古德(I. J. Good)开发的,作为他们在二战期间破解德国密码机Enigma所使用方法的一部分,1953年由古德发表。这一方法也是很多平滑算法的核心,其基本思路是:把出现次数非零的$n$元语法单元的一部分概率匀给未出现的$n$元语法单元,以减小最大似然估计与真实概率之间的偏离\cite{good1953population}\cite{gale1995good}。
\parinterval 假定在语料库中出现$r$次的$n$-gram有$n_r$个,特别地,出现0次的$n$-gram(即未登录词及词串)有$n_0$个。语料库中$n$-gram的总数为$N$,显然:
\begin{eqnarray}
N = \sum_{r=1}^{\infty}{r\,n_r}
\label{eq:2-28}
\end{eqnarray}
\parinterval 这时,出现$r$次的$n$-gram的相对频率为$r/N$,也就是不做平滑处理时的概率估计。为了解决零概率问题,对于任何一个出现$r$次的$n$-gram,古德-图灵估计法利用出现$r+1$次的$n$-gram统计量重新假设它出现$r^*$次,这里
\begin{eqnarray}
r^* = (r + 1)\frac{n_{r + 1}}{n_r}
\label{eq:2-29}
\end{eqnarray}
\parinterval 基于这个公式,就可以估计所有出现0次的$n$-gram的总频次:当$r=0$时,$n_0 r^*=n_0 \cdot (0+1)\frac{n_1}{n_0}=n_1$。要把这个重新估计的统计数转化为概率,需要进行归一化处理:对于每个统计数为$r$的事件,其概率为:
\begin{eqnarray}
\textrm{P}_r=\frac{r^*}{N}
\label{eq:2-30}
\end{eqnarray}
\vspace{5em}
\noindent 其中
\begin{eqnarray}
N & = & \sum_{r=0}^{\infty}{r^{*}n_r} \nonumber \\
& = & \sum_{r=0}^{\infty}{(r + 1)n_{r + 1}} \nonumber \\
& = & \sum_{r=1}^{\infty}{r\,n_r}
\label{eq:2-31}
\end{eqnarray}
也就是说,$N$仍然为整个样本分布最初的计数总和。样本中所有事件的概率之和为:
\begin{eqnarray}
\textrm{P}(r>0) & = & \sum_{r>0}{\textrm{P}_r} \nonumber \\
& = & 1 - \frac{n_1}{N} \nonumber \\
& < & 1
\label{eq:2-32}
\end{eqnarray}
\noindent 其中$n_1/N$就是分配给所有出现为0次事件的概率。古德-图灵方法最终通过出现1次的$n$-gram估计了出现为0次的事件概率,达到了平滑的效果。
\parinterval 这里使用一个例子来说明这个方法是如何对事件出现的可能性进行平滑的。仍然考虑在加法平滑法中统计单词的例子,根据古德-图灵方法进行修正如表\ref{tab:2-21}所示。
%------------------------------------------------------
\begin{table}[htp]{
\begin{center}
\caption{单词出现频次及古德-图灵平滑结果}
{
\begin{tabular}{l|lll}
\rule{0pt}{10pt} $r$ & $n_r$ & $r^*$ & $\textrm{P}_r$\\ \hline
\rule{0pt}{10pt} 0 & 14 & 0.21 & 0.018 \\
\rule{0pt}{10pt} 1 & 3 & 0.67 & 0.056 \\
\rule{0pt}{10pt} 2 & 1 & 3 & 0.25 \\
\rule{0pt}{10pt} 3 & 1 & 4 & 0.333 \\
\rule{0pt}{10pt} 4 & 1 & - & - \\
\end{tabular}
\label{tab:2-21}
}
\end{center}
}\end{table}
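\parinterval 表中的数值可以用一段示意性的Python代码重现($n_r$的取值来自上文的单词计数例子):
\begin{verbatim}
# 古德-图灵估计的示意代码,重现上表中的 r* 与 P_r
# n[r]:出现 r 次的单词种类数(词表大小20,见上文例子)
n = {0: 14, 1: 3, 2: 1, 3: 1, 4: 1}
N = sum(r * nr for r, nr in n.items())    # N = 12

for r in range(4):
    r_star = (r + 1) * n[r + 1] / n[r]    # r* = (r+1) * n_{r+1} / n_r
    print(r, round(r_star, 2), round(r_star / N, 3))
# 依次输出表中的 (r, r*, P_r):(0,0.21,0.018) (1,0.67,0.056) ...
\end{verbatim}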
%------------------------------------------------------
\vspace{-1.5em}
\parinterval$r$很大的时候经常会出现$n_{r+1}=0$的情况,而且这时$n_r$也会有噪音存在。通常,简单的古德-图灵方法可能无法很好的处理这种复杂的情况,不过古德-图灵方法仍然是其他一些平滑方法的基础。
%----------------------------------------------------------------------------------------
% NEW SUBSUB-SECTION
%----------------------------------------------------------------------------------------
\subsubsection{Kneser-Ney平滑方法}
\parinterval Kneser-Ney平滑方法是由R. Kneser和H. Ney于1995年提出的用于计算$n$元语法概率分布的方法\cite{kneser1995improved}\cite{chen1999empirical},并被广泛认为是最有效的平滑方法之一。这种平滑方法改进了{\small\bfnew{绝对折扣}}(Absolute Discounting)平滑中低阶分布与高阶分布相结合的计算方法,使不同阶的分布得到充分的利用。这种算法也综合利用了其他多种平滑算法的思想。
\parinterval 首先介绍一下绝对折扣平滑算法,其公式如下所示:
\begin{eqnarray}
\textrm{P}_{\textrm{AbsDiscount}}(w_i | w_{i-1}) = \frac{c(w_{i-1},w_i )-d}{c(w_{i-1})} + \lambda(w_{i-1})\textrm{P}(w_i)
\label{eq:2-33}
\end{eqnarray}
\noindent 其中$d$表示折扣值,$\lambda(w_{i-1})$是一个归一化系数。可以看到第一项是经过折扣调整过的2-gram的概率值,第二项则相当于一个带权重$\lambda$的1-gram的插值项。然而这种插值模型极易受到原始1-gram模型的干扰。
\parinterval 假设我们使用2-gram和1-gram的插值模型预测下面句子中下划线处的词
\vspace{0.0em}
\begin{center}
I cannot see without my reading \underline{\ \ \ \ \ \ \ \ }
\end{center}
\vspace{0.0em}
\noindent 直觉上应该会猜测这个地方的词是``glasses'',但是在训练语料库中``Francisco''出现的频率非常高。如果在预测时仍然使用标准的1-gram模型,那么系统会以较高概率选择``Francisco''填入下划线处,这个结果明显是不合理的。当使用混合的插值模型时,如果``reading Francisco''这种二元语法并没有出现在语料中,就会导致1-gram对结果的影响变大,仍然会做出与标准1-gram模型相同的结果,犯下相同的错误。
\parinterval 观察语料中的2-gram发现,``Francisco''的前一个词仅可能是``San'',不会出现``reading''。这个分析提醒了我们,考虑前一个词的影响是有帮助的,比如仅在前一个词是``San''时,才给``Francisco''赋予一个较高的概率值。基于这种想法,改进原有的1-gram模型,创造一个新的1-gram模型$\textrm{P}_{\textrm{continuation}}$,简写为$\textrm{P}_{\textrm{cont}}$。这个模型可以通过考虑前一个词的影响评估当前词作为第二个词出现的可能性。
\parinterval 为了评估$\textrm{P}_{\textrm{cont}}$,统计使用当前词作为第二个词所出现二元语法的种类,二元语法种类越多,这个词作为第二个词出现的可能性越高,呈正比:
\begin{eqnarray}
\textrm{P}_{\textrm{cont}}(w_i) \varpropto |\{ w_{i-1}: c(w_{i-1} w_i )>0 \}|
\label{eq:2-34}
\end{eqnarray}
通过全部的二元语法的种类做归一化可得到评估的公式
\begin{eqnarray}
\textrm{P}_{\textrm{cont}}(w_i) = \frac{|\{ w_{i-1}:c(w_{i-1} w_i )>0 \}|}{|\{ (w_{j-1}, w_j):c(w_{j-1}w_j )>0 \}|}
\label{eq:2-35}
\end{eqnarray}
\parinterval 基于分母的变化还有另一种形式
\begin{eqnarray}
\textrm{P}_{\textrm{cont}}(w_i) = \frac{|\{ w_{i-1}:c(w_{i-1} w_i )>0 \}|}{\sum_{w_i^{\prime}}|\{ w_{i-1}^{\prime}:c(w_{i-1}^{\prime} w_i^{\prime} )>0 \}|}
\label{eq:2-36}
\end{eqnarray}
结合基础的绝对折扣计算公式,可以得到Kneser-Ney平滑方法的公式:
\begin{eqnarray}
\textrm{P}_{\textrm{KN}}(w_i|w_{i-1}) = \frac{\max(c(w_{i-1},w_i )-d,0)}{c(w_{i-1})}+ \lambda(w_{i-1})\textrm{P}_{\textrm{cont}}(w_i)
\label{eq:2-37}
\end{eqnarray}
\noindent 其中
\begin{eqnarray}
\lambda(w_{i-1}) = \frac{d}{c(w_{i-1})}|\{w:c(w_{i-1},w)>0\}|
\label{eq:2-38}
\end{eqnarray}
\noindent 这里$\max(\cdot)$保证了分子部分为不小于0的数,原始的1-gram分布被更新成$\textrm{P}_{\textrm{cont}}$概率分布,$\lambda$是归一化系数。
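\parinterval 下面给出2-gram情形下Kneser-Ney平滑的一个简化的Python示意实现(折扣值$d$与小语料均为演示而假设,实际系统的实现要复杂得多):
\begin{verbatim}
# Kneser-Ney平滑(2-gram)的简化示意代码,d与语料均为演示假设
from collections import Counter

corpus = [['我', '喜欢', '咖啡'], ['我', '喜欢', '茶'], ['他', '喜欢', '茶']]
bigrams = Counter((s[i], s[i+1]) for s in corpus for i in range(len(s)-1))
unigrams = Counter(w for s in corpus for w in s)
d = 0.75

def p_cont(w):
    # 以 w 作为第二个词的二元语法种类数 / 全部二元语法种类数
    return sum(1 for (_, w2) in bigrams if w2 == w) / len(bigrams)

def p_kn(w, prev):
    # lambda(prev) = d/c(prev) * |{w: c(prev,w)>0}|
    lam = d * sum(1 for (w1, _) in bigrams if w1 == prev) / unigrams[prev]
    return max(bigrams[(prev, w)] - d, 0) / unigrams[prev] + lam * p_cont(w)

print(p_kn('茶', '喜欢'))   # 约0.542
\end{verbatim}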
\parinterval 为了更具普适性,不仅局限为2-gram和1-gram的插值模型,利用递归的方式可以得到更通用的Kneser-Ney平滑公式
\begin{eqnarray}
\textrm{P}_{\textrm{KN}}(w_i|w_{i-n+1} ...w_{i-1}) & = & \frac{\max(c_{\textrm{KN}}(w_{i-n+1}...w_{i-1})-d,0)}{c_{\textrm{KN}}(w_{i-n+1}...w_{i-1})} + \nonumber \\
& & \lambda(w_{i-n+1}...w_{i-1})\textrm{P}_{\textrm{KN}}(w_i|w_{i-n+2}...w_{i-1})
\label{eq:2-39}
\end{eqnarray}
\begin{eqnarray}
\lambda(w_{i-1}) = \frac{d}{c_{\textrm{KN}}(w_{i-n+1}^{i-1})}|\{w:c_{\textrm{KN}}(w_{i-n+1}...w_{i-1}w)>0\}
\label{eq:2-40}
\end{eqnarray}
\begin{eqnarray}
c_{\textrm{KN}}(\cdot) = \left\{\begin{array}{ll}
\textrm{count}(\cdot) & \textrm{for\ highest\ order} \\
\textrm{catcount}(\cdot) & \textrm{for\ lower\ order}
\end{array}\right.
\label{eq:2-41}
\end{eqnarray}
\noindent 其中catcount$(\cdot)$表示的是基于某个单个词作为第$n$个词的$n$-gram的种类数目。
\parinterval Kneser-Ney平滑是很多语言模型工具的基础\cite{wang-etal-2018-niutrans}\cite{heafield-2011-kenlm}\cite{stolcke2002srilm}。还有很多以此为基础衍生出来的算法,感兴趣的读者可以通过参考文献自行了解\cite{parsing2009speech}\cite{ney1994structuring}\cite{chen1999empirical}
%----------------------------------------------------------------------------------------
% NEW SECTION
%----------------------------------------------------------------------------------------
\sectionnewpage
\section{搜索}
%----------------------------------------------------------------------------------------
% NEW SECTION
%----------------------------------------------------------------------------------------
\sectionnewpage
\section{小结及深入阅读} \label{sec2:summary}
\parinterval 本章重点介绍了如何对自然语言处理问题进行统计建模,并从数据中自动学习统计模型的参数,最终使用学习到的模型对新的问题进行处理。之后,本章将这种思想应用到三个自然语言处理任务中,包括:中文分词、语言建模、句法分析,它们也和机器翻译有着紧密的联系。通过系统化的建模可以发现:经过适当的假设和化简,统计模型可以很好地描述复杂的自然语言处理问题。相关概念和方法也会在后续章节的内容中被广泛使用。
\parinterval 由于本章重点介绍如何用统计的思想对自然语言处理任务进行建模,因此并没有对具体的问题展开深入讨论。有几方面内容,读者可以继续关注:
\begin{adjustwidth}{1em}{}
\begin{itemize}
\vspace{0.5em}
\item 在建模方面,本章介绍的三个任务均采用的是基于人工先验知识进行模型设计的思路。也就是,问题所表达的现象被``一步一步''生成出来。这是一种典型的生成式建模思想,它把要解决的问题看作一些观测结果的隐含变量(比如,句子是观测结果,分词结果是隐含在背后的变量),之后通过对隐含变量生成观测结果的过程进行建模,以达到对问题进行数学描述的目的。这类模型一般需要依赖一些独立性假设,假设的合理性对最终的性能有较大影响。相对于{\small\sffamily\bfseries{生成模型}}\index{生成模型}(Generative Model)\index{Generative Model},另一类方法是{\small\sffamily\bfseries{判别模型}}\index{判别模型}(Discriminative Model)\index{Discriminative Model},它直接对从观测结果推断隐含变量的过程进行建模,这样对问题的建模更加直接,同时这类模型可以更加灵活地引入不同的特征。判别模型在自然语言处理中也有广泛应用\cite{shannon1948mathematical}\cite{ng2002discriminative}。在本书的第四章也会使用到判别式模型。
\vspace{0.5em}
\item 从现在自然语言处理的前沿看,基于端到端学习的深度学习方法在很多任务中都取得了领先的性能。但是,本章并没有涉及深度学习及相关方法,这是由于笔者认为:对问题的建模是自然语言处理的基础,对问题的本质刻画并不会因为方法的改变而改变。因此,本章的内容没有太多地陷入到更加复杂的模型和算法设计中,相反,我们希望关注对基本问题的理解和描述。不过,一些前沿方法仍可以作为参考,包括:基于条件随机场和双向长短时记忆模型的序列标注模型\cite{lafferty2001conditional}\cite{huang2015bidirectional}\cite{ma2016end}、神经语言模型\cite{bengio2003neural}\cite{mikolov2010recurrent}、神经句法分析模型\cite{chen2014fast}\cite{zhu2015long}
\vspace{0.5em}
\item 此外,本章并没有对模型的推断方法进行深入介绍。比如,对于一个句子,如何有效地找到概率最大的分词结果?显然,简单枚举是不可行的。对于这类问题,比较简单的解决方法是使用动态规划\cite{huang2008advanced}。如果使用动态规划的条件不满足,可以考虑使用更加复杂的搜索策略,并配合一定的剪枝方法。实际上,无论是$n$-gram语言模型还是简单的上下文无关文法都有高效的推断方法。比如,$n$-gram语言模型可以被视为概率有限状态自动机,因此可以直接使用成熟的自动机工具。对于更复杂的句法分析问题,可以考虑使用移进-规约方法来解决推断问题\cite{aho1972theory}。
\vspace{0.5em}
\end{itemize}
\end{adjustwidth}
% !Mode:: "TeX:UTF-8"
% !TEX encoding = UTF-8 Unicode
%----------------------------------------------------------------------------------------
% 机器翻译:统计建模与深度学习方法
% Machine Translation: Statistical Modeling and Deep Learning Methods
%
% Copyright 2020
% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
%----------------------------------------------------------------------------------------
%----------------------------------------------------------------------------------------
% CONFIGURATIONS
%----------------------------------------------------------------------------------------
\part{机器翻译基础}
\renewcommand\figurename{}%将figure改为图
\renewcommand\tablename{}%将figure改为图
\chapterimage{fig-NEU-3.jpg} % Chapter heading image
%----------------------------------------------------------------------------------------
% CHAPTER 3
%----------------------------------------------------------------------------------------
\chapter{词法分析和语法分析}
%----------------------------------------------------------------------------------------
% NEW SECTION
%----------------------------------------------------------------------------------------
\section{}
% !Mode:: "TeX:UTF-8"
% !TEX encoding = UTF-8 Unicode
%----------------------------------------------------------------------------------------
% 机器翻译:统计建模与深度学习方法
% Machine Translation: Statistical Modeling and Deep Learning Methods
%
% Copyright 2020
% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
%----------------------------------------------------------------------------------------
%----------------------------------------------------------------------------------------
% CONFIGURATIONS
%----------------------------------------------------------------------------------------
\part{机器翻译基础}
\renewcommand\figurename{}%将figure改为图
\renewcommand\tablename{}%将figure改为图
\chapterimage{fig-NEU-4.jpg} % Chapter heading image
%----------------------------------------------------------------------------------------
% CHAPTER 4
%----------------------------------------------------------------------------------------
\chapter{翻译质量评价}
%----------------------------------------------------------------------------------------
% NEW SECTION
%----------------------------------------------------------------------------------------
\section{}
% !Mode:: "TeX:UTF-8"
% !TEX encoding = UTF-8 Unicode
%----------------------------------------------------------------------------------------
% 机器翻译:统计建模与深度学习方法
% Machine Translation: Statistical Modeling and Deep Learning Methods
%
% Copyright 2020
% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
%----------------------------------------------------------------------------------------
%----------------------------------------------------------------------------------------
% CONFIGURATIONS
%----------------------------------------------------------------------------------------
\part{机器翻译基础}
\renewcommand\figurename{}%将figure改为图
\renewcommand\tablename{}%将figure改为图
\chapterimage{fig-NEU-5.jpg} % Chapter heading image
%----------------------------------------------------------------------------------------
% CHAPTER 5
%----------------------------------------------------------------------------------------
\chapter{基于词的机器翻译建模}
%----------------------------------------------------------------------------------------
% NEW SECTION
%----------------------------------------------------------------------------------------
\section{}
% !Mode:: "TeX:UTF-8"
% !TEX encoding = UTF-8 Unicode
%----------------------------------------------------------------------------------------
% 机器翻译:统计建模与深度学习方法
% Machine Translation: Statistical Modeling and Deep Learning Methods
%
% Copyright 2020
% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
%----------------------------------------------------------------------------------------
%----------------------------------------------------------------------------------------
% CONFIGURATIONS
%----------------------------------------------------------------------------------------
\part{机器翻译基础}
\renewcommand\figurename{}%将figure改为图
\renewcommand\tablename{}%将figure改为图
\chapterimage{fig-NEU-6.jpg} % Chapter heading image
%----------------------------------------------------------------------------------------
% CHAPTER 6
%----------------------------------------------------------------------------------------
\chapter{基于繁衍率的模型}
%----------------------------------------------------------------------------------------
% NEW SECTION
%----------------------------------------------------------------------------------------
\section{}
% !Mode:: "TeX:UTF-8"
% !TEX encoding = UTF-8 Unicode
%----------------------------------------------------------------------------------------
% 机器翻译:统计建模与深度学习方法
% Machine Translation: Statistical Modeling and Deep Learning Methods
%
% Copyright 2020
% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
%----------------------------------------------------------------------------------------
%----------------------------------------------------------------------------------------
% CONFIGURATIONS
%----------------------------------------------------------------------------------------
\part{机器翻译基础}
\renewcommand\figurename{}%将figure改为图
\renewcommand\tablename{}%将figure改为图
\chapterimage{fig-NEU-7.jpg} % Chapter heading image
%----------------------------------------------------------------------------------------
% CHAPTER 7
%----------------------------------------------------------------------------------------
\chapter{基于短语的模型}
%----------------------------------------------------------------------------------------
% NEW SECTION
%----------------------------------------------------------------------------------------
\section{}
% !Mode:: "TeX:UTF-8"
% !TEX encoding = UTF-8 Unicode
%----------------------------------------------------------------------------------------
% 机器翻译:统计建模与深度学习方法
% Machine Translation: Statistical Modeling and Deep Learning Methods
%
% Copyright 2020
% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
%----------------------------------------------------------------------------------------
%----------------------------------------------------------------------------------------
% CONFIGURATIONS
%----------------------------------------------------------------------------------------
\part{机器翻译基础}
\renewcommand\figurename{}%将figure改为图
\renewcommand\tablename{}%将figure改为图
\chapterimage{fig-NEU-8.jpg} % Chapter heading image
%----------------------------------------------------------------------------------------
% CHAPTER 8
%----------------------------------------------------------------------------------------
\chapter{基于句法的模型}
%----------------------------------------------------------------------------------------
% NEW SECTION
%----------------------------------------------------------------------------------------
\section{}
% !Mode:: "TeX:UTF-8"
% !TEX encoding = UTF-8 Unicode
%----------------------------------------------------------------------------------------
% 机器翻译:统计建模与深度学习方法
% Machine Translation: Statistical Modeling and Deep Learning Methods
%
% Copyright 2020
% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
%----------------------------------------------------------------------------------------
%----------------------------------------------------------------------------------------
% CONFIGURATIONS
%----------------------------------------------------------------------------------------
\part{机器翻译基础}
\renewcommand\figurename{}%将figure改为图
\renewcommand\tablename{}%将figure改为图
\chapterimage{fig-NEU-9.jpg} % Chapter heading image
%----------------------------------------------------------------------------------------
% CHAPTER 9
%----------------------------------------------------------------------------------------
\chapter{人工神经网络和神经语言建模}
%----------------------------------------------------------------------------------------
% NEW SECTION
%----------------------------------------------------------------------------------------
\section{}
\include{Chapter5/chapter5}
\include{Chapter6/chapter6}
\include{Chapter7/chapter7}
\include{Chapter8/chapter8}
\include{Chapter9/chapter9}
\include{Chapter10/chapter10}
\include{Chapter11/chapter11}
\include{Chapter12/chapter12}
\include{Chapter13/chapter13}
\include{Chapter14/chapter14}
\include{Chapter15/chapter15}
\include{Chapter16/chapter16}
\include{Chapter17/chapter17}
\include{ChapterAppend/chapterappend}