Commit 7fce7e1a by 孟霞

合并分支 'master' 到 'mengxia'

Master

查看合并请求 !194
parents f7aa1b25 57dc6137
...@@ -2,6 +2,14 @@ ...@@ -2,6 +2,14 @@
% !TEX encoding = UTF-8 Unicode % !TEX encoding = UTF-8 Unicode
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
% 机器翻译:统计建模与深度学习方法
% Machine Translation: Statistical Modeling and Deep Learning Methods
%
% Copyright 2020
% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
%----------------------------------------------------------------------------------------
%----------------------------------------------------------------------------------------
% CONFIGURATIONS % CONFIGURATIONS
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
......
...@@ -2,6 +2,14 @@ ...@@ -2,6 +2,14 @@
% !TEX encoding = UTF-8 Unicode % !TEX encoding = UTF-8 Unicode
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
% 机器翻译:统计建模与深度学习方法
% Machine Translation: Statistical Modeling and Deep Learning Methods
%
% Copyright 2020
% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
%----------------------------------------------------------------------------------------
%----------------------------------------------------------------------------------------
% CONFIGURATIONS % CONFIGURATIONS
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
...@@ -939,9 +947,9 @@ I cannot see without my reading \underline{\ \ \ \ \ \ \ \ } ...@@ -939,9 +947,9 @@ I cannot see without my reading \underline{\ \ \ \ \ \ \ \ }
\end{eqnarray} \end{eqnarray}
\begin{eqnarray} \begin{eqnarray}
c_{\textrm{KN}}(\cdot) = \left\{\begin{array}{ll} c_{\textrm{KN}}(\cdot) = \left\{\begin{array}{ll}
\textrm{count}(\cdot) & \textrm{for\ highest\ order} \\ \textrm{count}(\cdot) & \textrm{for\ highest\ order} \\
\textrm{catcount}(\cdot) & \textrm{for\ lower\ order} \textrm{catcount}(\cdot) & \textrm{for\ lower\ order}
\end{array}\right. \end{array}\right.
\label{eq:2-41} \label{eq:2-41}
\end{eqnarray} \end{eqnarray}
\noindent 其中catcount$(\cdot)$表示的是基于某个单个词作为第$n$个词的$n$-gram的种类数目。 \noindent 其中catcount$(\cdot)$表示的是基于某个单个词作为第$n$个词的$n$-gram的种类数目。
......
...@@ -2,6 +2,14 @@ ...@@ -2,6 +2,14 @@
% !TEX encoding = UTF-8 Unicode % !TEX encoding = UTF-8 Unicode
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
% 机器翻译:统计建模与深度学习方法
% Machine Translation: Statistical Modeling and Deep Learning Methods
%
% Copyright 2020
% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
%----------------------------------------------------------------------------------------
%----------------------------------------------------------------------------------------
% CONFIGURATIONS % CONFIGURATIONS
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
\begin{scope}[scale=0.85] \begin{scope}[scale=0.85]
\node[srcnode] (c1) at (0,0) {\normalsize{\textbf{Function} CKY-Algorithm($\textbf{s},G$)}}; \node[srcnode] (c1) at (0,0) {\normalsize{\textbf{Function} CKY-Algorithm($\textbf{s},G$)}};
\node[srcnode,anchor=north west] (c21) at ([xshift=1.5em,yshift=0.4em]c1.south west) {\normalsize{\textbf{fore} $j=0$ to $ J - 1$}}; \node[srcnode,anchor=north west] (c21) at ([xshift=1.5em,yshift=0.4em]c1.south west) {\normalsize{\textbf{for} $j=0$ to $ J - 1$}};
\node[srcnode,anchor=north west] (c22) at ([xshift=1.5em,yshift=0.4em]c21.south west) {\normalsize{$span[j,j+1 ]$.Add($A \to a \in G$)}}; \node[srcnode,anchor=north west] (c22) at ([xshift=1.5em,yshift=0.4em]c21.south west) {\normalsize{$span[j,j+1 ]$.Add($A \to a \in G$)}};
\node[srcnode,anchor=north west] (c3) at ([xshift=-1.5em,yshift=0.4em]c22.south west) {\normalsize{\textbf{for} $l$ = 1 to $J$}}; \node[srcnode,anchor=north west] (c3) at ([xshift=-1.5em,yshift=0.4em]c22.south west) {\normalsize{\textbf{for} $l$ = 1 to $J$}};
\node[srcnode,anchor=west] (c31) at ([xshift=6em]c3.east) {\normalsize{// length of span}}; \node[srcnode,anchor=west] (c31) at ([xshift=6em]c3.east) {\normalsize{// length of span}};
......
...@@ -2,6 +2,14 @@ ...@@ -2,6 +2,14 @@
% !TEX encoding = UTF-8 Unicode % !TEX encoding = UTF-8 Unicode
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
% 机器翻译:统计建模与深度学习方法
% Machine Translation: Statistical Modeling and Deep Learning Methods
%
% Copyright 2020
% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
%----------------------------------------------------------------------------------------
%----------------------------------------------------------------------------------------
% CONFIGURATIONS configurations % CONFIGURATIONS configurations
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
\renewcommand\figurename{}%将figure改为图 \renewcommand\figurename{}%将figure改为图
...@@ -780,7 +788,7 @@ dr = start_i-end_{i-1}-1 ...@@ -780,7 +788,7 @@ dr = start_i-end_{i-1}-1
\subsubsection{翻译候选匹配} \subsubsection{翻译候选匹配}
\parinterval 在解码时,首先要知道每个源语言短语可能的译文都是什么。对于一个源语言短语,每个可能的译文也被称作{\small\bfnew{翻译候选}}\index{翻译候选}(Translation Candidate)\index{Translation Candidate}。实现翻译候选的匹配很简单。只需要遍历输入的源语言句子中所有可能的短语,之后在短语表中找到相应的翻译即可。比如,图\ref{fig:4-27}展示了句子``桌子\ \ \ 一个\ 苹果''的翻译候选匹配结果。可以看到,不同的短语会对应若干翻译候选。这些翻译候选会保存在所对应的跨度中。比如,``upon the table''是短语``桌子 上 有''的翻译候选,即对应源语言跨度[0,3]。\\ \\ \\ \parinterval 在解码时,首先要知道每个源语言短语可能的译文都是什么。对于一个源语言短语,每个可能的译文也被称作{\small\bfnew{翻译候选}}\index{翻译候选}(Translation Candidate)\index{Translation Candidate}。实现翻译候选的匹配很简单。只需要遍历输入的源语言句子中所有可能的短语,之后在短语表中找到相应的翻译即可。比如,图\ref{fig:4-27}展示了句子``桌子\ \ \ 一个\ 苹果''的翻译候选匹配结果。可以看到,不同的短语会对应若干翻译候选。这些翻译候选会保存在所对应的跨度中。比如,``upon the table''是短语``桌子 上 有''的翻译候选,即对应源语言跨度[0,3]。\\ \\ \\
%---------------------------------------------- %----------------------------------------------
\begin{figure}[htp] \begin{figure}[htp]
......
...@@ -8,7 +8,7 @@ ...@@ -8,7 +8,7 @@
\node [fill=orange!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {$\number\value{mycount1}$}; \node [fill=orange!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {$\number\value{mycount1}$};
\addtocounter{mycount1}{1}; \addtocounter{mycount1}{1};
} }
\node [anchor=south] (varlabel) at (0,0.6) {$\textbf{s}$}; \node [anchor=south] (varlabel) at (0,0.6) {$\mathbf{s}$};
\node [anchor=north] (labelc) at (0,-0.7) {\footnotesize{(a)}}; \node [anchor=north] (labelc) at (0,-0.7) {\footnotesize{(a)}};
\end{scope} \end{scope}
...@@ -20,7 +20,7 @@ ...@@ -20,7 +20,7 @@
\node [fill=green!20,inner sep=0pt,minimum height=0.48cm,minimum width=0.48cm] at (\x,\y) {$1$}; \node [fill=green!20,inner sep=0pt,minimum height=0.48cm,minimum width=0.48cm] at (\x,\y) {$1$};
\addtocounter{mycount1}{1}; \addtocounter{mycount1}{1};
} }
\node [anchor=south] (varlabel) at (0,0.1) {$\textbf{b}$}; \node [anchor=south] (varlabel) at (0,0.1) {$\mathbf{b}$};
\node [anchor=north] (labelc) at (0,-0.7) {\footnotesize{(b)}}; \node [anchor=north] (labelc) at (0,-0.7) {\footnotesize{(b)}};
\end{scope} \end{scope}
...@@ -34,7 +34,7 @@ ...@@ -34,7 +34,7 @@
\node [fill=orange!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {$\number\value{mycount1}$}; \node [fill=orange!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {$\number\value{mycount1}$};
\addtocounter{mycount1}{1}; \addtocounter{mycount1}{1};
} }
\node [anchor=south] (varlabel) at (0,0.6) {$\textbf{s}$}; \node [anchor=south] (varlabel) at (0,0.6) {$\mathbf{s}$};
\end{scope} \end{scope}
\begin{scope}[yshift=-1in,xshift=1.5in] \begin{scope}[yshift=-1in,xshift=1.5in]
\setcounter{mycount1}{1} \setcounter{mycount1}{1}
...@@ -49,8 +49,8 @@ ...@@ -49,8 +49,8 @@
\node [fill=purple!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {$1$}; \node [fill=purple!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {$1$};
\addtocounter{mycount1}{1}; \addtocounter{mycount1}{1};
} }
\node [anchor=center] (plabel) at (-4.5em,0) {\huge{$\textbf{+}$}}; \node [anchor=center] (plabel) at (-4.5em,0) {\huge{$\mathbf{+}$}};
\node [anchor=south] (varlabel) at (0,0.6) {$\textbf{b}$}; \node [anchor=south] (varlabel) at (0,0.6) {$\mathbf{b}$};
\node [anchor=north] (labelc) at (0,-0.7) {\footnotesize{(c)}}; \node [anchor=north] (labelc) at (0,-0.7) {\footnotesize{(c)}};
\end{scope} \end{scope}
\begin{scope}[yshift=-1in,xshift=3in] \begin{scope}[yshift=-1in,xshift=3in]
...@@ -61,8 +61,8 @@ ...@@ -61,8 +61,8 @@
\node [fill=orange!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {$\number\value{mycount1}$}; \node [fill=orange!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {$\number\value{mycount1}$};
\addtocounter{mycount1}{1}; \addtocounter{mycount1}{1};
} }
\node [anchor=center] (plabel) at (-4.5em,0) {\huge{$\textbf{=}$}}; \node [anchor=center] (plabel) at (-4.5em,0) {\huge{$\mathbf{=}$}};
\node [anchor=south] (varlabel) at (0,0.6) {$\textbf{s+b}$}; \node [anchor=south] (varlabel) at (0,0.6) {$\mathbf{s}+\mathbf{b}$};
\end{scope} \end{scope}
......
...@@ -59,8 +59,8 @@ ...@@ -59,8 +59,8 @@
\draw [thick,->] (layer3.north) -- (y.south); \draw [thick,->] (layer3.north) -- (y.south);
\node [anchor=west,align=left] (xshape) at (x.east) {\tiny{shape: 3*4*5}}; \node [anchor=west,align=left] (xshape) at (x.east) {\tiny{shape: 3*4*5}};
\node [anchor=west,align=left] (yshape) at (y.east) {\tiny{shape: 3*4*4}}; \node [anchor=west,align=left] (yshape) at (y.east) {\tiny{shape: 3*4*4}};
\node [anchor=south west,align=left,inner sep=2pt] (l1shape) at (layer1.north) {\tiny{shape: 3*4*3}}; \node [anchor=south west,align=left,inner sep=2pt] (l1shape) at ([xshift=0.3em]layer1.north) {\tiny{shape: 3*4*3}};
\node [anchor=south west,align=left,inner sep=2pt] (l2shape) at (layer2.north) {\tiny{shape: 3*4*6}}; \node [anchor=south west,align=left,inner sep=2pt] (l2shape) at ([xshift=0.3em]layer2.north) {\tiny{shape: 3*4*6}};
\end{tikzpicture} \end{tikzpicture}
\end{center} \end{center}
\end{tcolorbox} \end{tcolorbox}
......
%%%------------------------------------------------------------------------------------------------------------ %%%------------------------------------------------------------------------------------------------------------
\begin{tcolorbox}[enhanced,width=11cm,frame engine=empty,boxrule=0.1mm,size=title,colback=blue!10!white] \begin{tcolorbox}[enhanced,width=12cm,frame engine=empty,boxrule=0.1mm,size=title,colback=blue!10!white]
\begin{flushleft} \begin{flushleft}
{\scriptsize {\scriptsize
\begin{tabbing} \begin{tabbing}
\texttt{XTensor tensor;} \hspace{12em} \= // 声明张量tensor \\ \texttt{XTensor tensor;} \hspace{14em} \= // 声明张量tensor \\
\texttt{int sizes[6] = \{2,3,4,2,3,4\};} \> // 张量的形状为2*3*4*2*3*4 \\ \texttt{int sizes[6] = \{2,3,4,2,3,4\};} \> // 张量的形状为2*3*4*2*3*4 \\
\texttt{InitTensor(\&tensor, 6, sizes, X\_FLOAT);} \> // 定义形状为sizes的6阶张量 \texttt{InitTensor(\&tensor, 6, sizes, X\_FLOAT);} \> // 定义形状为sizes的6阶张量
\end{tabbing} \end{tabbing}
...@@ -12,11 +12,11 @@ ...@@ -12,11 +12,11 @@
\end{tcolorbox} \end{tcolorbox}
\hspace{0.1in} \scriptsize{(a) NiuTensor定义张量程序} \hspace{0.1in} \scriptsize{(a) NiuTensor定义张量程序}
\\ \\
\begin{tcolorbox}[enhanced,width=11cm,frame engine=empty,boxrule=0.1mm,size=title,colback=blue!10!white] \begin{tcolorbox}[enhanced,width=12cm,frame engine=empty,boxrule=0.1mm,size=title,colback=blue!10!white]
\begin{flushleft} \begin{flushleft}
{\scriptsize {\scriptsize
\begin{tabbing} \begin{tabbing}
\texttt{XTensor a, b, c;} \hspace{11.5em} \= // 声明张量tensor \\ \texttt{XTensor a, b, c;} \hspace{13.5em} \= // 声明张量tensor \\
\texttt{InitTensor1D(\&a, 10, X\_INT);} \> // 10维的整数型向量\\ \texttt{InitTensor1D(\&a, 10, X\_INT);} \> // 10维的整数型向量\\
\texttt{InitTensor1D(\&b, 10);} \> // 10维的向量,缺省类型(浮点)\\ \texttt{InitTensor1D(\&b, 10);} \> // 10维的向量,缺省类型(浮点)\\
\texttt{InitTensor4D(\&c, 10, 20, 30, 40);} \> // 10*20*30*40的4阶张量(浮点) \texttt{InitTensor4D(\&c, 10, 20, 30, 40);} \> // 10*20*30*40的4阶张量(浮点)
...@@ -26,11 +26,11 @@ ...@@ -26,11 +26,11 @@
\end{tcolorbox} \end{tcolorbox}
\hspace{0.1in} \scriptsize{(b) 定义张量的简便方式程序} \hspace{0.1in} \scriptsize{(b) 定义张量的简便方式程序}
\\ \\
\begin{tcolorbox}[enhanced,width=11cm,frame engine=empty,boxrule=0.1mm,size=title,colback=blue!10!white] \begin{tcolorbox}[enhanced,width=12cm,frame engine=empty,boxrule=0.1mm,size=title,colback=blue!10!white]
\begin{flushleft} \begin{flushleft}
{\scriptsize {\scriptsize
\begin{tabbing} \begin{tabbing}
\texttt{XTensor tensorGPU;} \hspace{10.5em} \= // 声明张量tensor \\ \texttt{XTensor tensorGPU;} \hspace{12.5em} \= // 声明张量tensor \\
\texttt{InitTensor2D(\&tensorGPU, 10, 20,} $\backslash$ \> // 在编号为0的GPU上定义张量 \\ \texttt{InitTensor2D(\&tensorGPU, 10, 20,} $\backslash$ \> // 在编号为0的GPU上定义张量 \\
\hspace{6.7em} \texttt{X\_FLOAT, 0);} \hspace{6.7em} \texttt{X\_FLOAT, 0);}
\end{tabbing} \end{tabbing}
......
%------------------------------------------------------------------------------------------------------------ %------------------------------------------------------------------------------------------------------------
\begin{tcolorbox}[enhanced,width=11cm,frame engine=empty,boxrule=0.1mm,size=title,colback=blue!10!white] \begin{tcolorbox}[enhanced,width=12cm,frame engine=empty,boxrule=0.1mm,size=title,colback=blue!10!white]
\begin{flushleft} \begin{flushleft}
{\scriptsize {\scriptsize
\begin{tabbing} \begin{tabbing}
\texttt{\#include "source/tensor/XTensor.h"} \hspace{4em} \= // 引用XTensor定义的头文件 \\ \texttt{\#include "source/tensor/XTensor.h"} \hspace{6em} \= // 引用XTensor定义的头文件 \\
\texttt{using namespace nts;} \> // 引用nts命名空间 \\ \texttt{using namespace nts;} \> // 引用nts命名空间 \\
\ \\ \ \\
......
%%%------------------------------------------------------------------------------------------------------------ %%%------------------------------------------------------------------------------------------------------------
\begin{tcolorbox}[enhanced,width=11cm,frame engine=empty,boxrule=0.1mm,size=title,colback=blue!10!white] \begin{tcolorbox}[enhanced,width=12cm,frame engine=empty,boxrule=0.1mm,size=title,colback=blue!10!white]
\begin{flushleft} \begin{flushleft}
{\scriptsize {\scriptsize
\begin{tabbing} \begin{tabbing}
\texttt{XTensor a, b, c, d, e;} \hspace{7em} \= // 声明张量tensor \\ \texttt{XTensor a, b, c, d, e;} \hspace{9em} \= // 声明张量tensor \\
\texttt{InitTensor3D(\&a, 2, 3, 4);} \> // a为2*3*4的3阶张量 \\ \texttt{InitTensor3D(\&a, 2, 3, 4);} \> // a为2*3*4的3阶张量 \\
\texttt{InitTensor3D(\&b, 2, 3, 4);} \> // b为2*3*4的3阶张量 \\ \texttt{InitTensor3D(\&b, 2, 3, 4);} \> // b为2*3*4的3阶张量 \\
\texttt{InitTensor3D(\&c, 2, 3, 4);} \> // c为2*3*4的3阶张量 \\ \texttt{InitTensor3D(\&c, 2, 3, 4);} \> // c为2*3*4的3阶张量 \\
...@@ -19,11 +19,11 @@ ...@@ -19,11 +19,11 @@
\end{tcolorbox} \end{tcolorbox}
\hspace{0.1in} \scriptsize{(a) 张量进行1阶运算} \hspace{0.1in} \scriptsize{(a) 张量进行1阶运算}
\\ \\
\begin{tcolorbox}[enhanced,width=11cm,frame engine=empty,boxrule=0.1mm,size=title,colback=blue!10!white] \begin{tcolorbox}[enhanced,width=12cm,frame engine=empty,boxrule=0.1mm,size=title,colback=blue!10!white]
\begin{flushleft} \begin{flushleft}
{\scriptsize {\scriptsize
\begin{tabbing} \begin{tabbing}
\texttt{XTensor a, b, c;} \hspace{10.0em} \= // 声明张量tensor \\ \texttt{XTensor a, b, c;} \hspace{12.0em} \= // 声明张量tensor \\
\texttt{InitTensor4D(\&a, 2, 2, 3, 4);} \> // a为2*2*3*4的4阶张量 \\ \texttt{InitTensor4D(\&a, 2, 2, 3, 4);} \> // a为2*2*3*4的4阶张量 \\
\texttt{InitTensor2D(\&b, 4, 5);} \> // b为4*5的矩阵 \\ \texttt{InitTensor2D(\&b, 4, 5);} \> // b为4*5的矩阵 \\
\texttt{a.SetDataRand();} \> // 随机初始化a \\ \texttt{a.SetDataRand();} \> // 随机初始化a \\
......
% !Mode:: "TeX:UTF-8" % !Mode:: "TeX:UTF-8"
% !TEX encoding = UTF-8 Unicode % !TEX encoding = UTF-8 Unicode
%----------------------------------------------------------------------------------------
% 机器翻译:统计建模与深度学习方法
% Machine Translation: Statistical Modeling and Deep Learning Methods
%
% Copyright 2020
% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
%----------------------------------------------------------------------------------------
\part{神经机器翻译} \part{神经机器翻译}
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
% CONFIGURATIONS % CONFIGURATIONS
...@@ -393,7 +401,7 @@ ...@@ -393,7 +401,7 @@
\subsubsection{线性映射} \subsubsection{线性映射}
\parinterval {\small\sffamily\bfseries{线性映射}}\index{线性映射}( Linear Mapping)\index{Linear Mapping}{\small\sffamily\bfseries{线性变换}}\index{线性变换}(Linear Transformation)\index{Linear Transformation}是从一个向量空间V到另一个向量空间W的映射函数$ f:v\rightarrow w$,且该映射函数保持加法运算和数量乘法运算,即对于空间V中任何两个向量$ \mathbf u $$ \mathbf v $以及任何标量$ c $,有: \parinterval {\small\sffamily\bfseries{线性映射}}\index{线性映射}( Linear Mapping)\index{Linear Mapping}{\small\sffamily\bfseries{线性变换}}\index{线性变换}(Linear Transformation)\index{Linear Transformation}是从一个向量空间V到另一个向量空间W的映射函数$ f:v\rightarrow w$,且该映射函数保持加法运算和数量乘法运算,即对于空间V中任何两个向量$ \mathbf u $ $ \mathbf v $以及任何标量$ c $,有:
\begin{eqnarray} \begin{eqnarray}
f(\mathbf u+\mathbf v)&=&f(\mathbf u)+f(\mathbf v)\label{eq:5-9}\\ f(\mathbf u+\mathbf v)&=&f(\mathbf u)+f(\mathbf v)\label{eq:5-9}\\
f(c\mathbf v)&=&cf(\mathbf v) f(c\mathbf v)&=&cf(\mathbf v)
...@@ -499,7 +507,7 @@ l_p(\mathbf x) & = & {\Vert{\mathbf x}\Vert}_p \nonumber \\ ...@@ -499,7 +507,7 @@ l_p(\mathbf x) & = & {\Vert{\mathbf x}\Vert}_p \nonumber \\
\end{figure} \end{figure}
%------------------------------------------- %-------------------------------------------
\parinterval 同样,人工神经元是人工神经网络的基本单元。在人们的想象中,人工神经元应该与生物神经元类似。但事实上,二者在形态上是有明显差别的。如图\ref{fig:5-4} 是一个典型的人工神经元,其本质是一个形似$ y=f(\mathbf x\cdot \mathbf w+b) $的函数。显而易见,一个神经元主要由$ \mathbf x $$ \mathbf w $$ b $$ f $四个部分构成。其中$ \mathbf x $是一个形如$ (x_0,x_1,\dots,x_n) $的实数向量,在一个神经元中担任``输入''的角色。$ \mathbf w $是一个权重矩阵,其中的每一个元素都对应着一个输入和一个输出,代表着``某输入对某输出的贡献程度'',通常也被理解为神经元连接的{\small\sffamily\bfseries{权重}}\index{权重}(weight)\index{weight}$ b $被称作偏置,是一个实数。$ f $被称作激活函数,其本质是一个非线性函数。可见,一个人工神经元的功能是将输入向量与权重矩阵右乘(做内积)后,加上偏置量,经过一个非线性激活函数得到一个标量结果。 \parinterval 同样,人工神经元是人工神经网络的基本单元。在人们的想象中,人工神经元应该与生物神经元类似。但事实上,二者在形态上是有明显差别的。如图\ref{fig:5-4} 是一个典型的人工神经元,其本质是一个形似$ y=f(\mathbf x\cdot \mathbf w+b) $的函数。显而易见,一个神经元主要由$ \mathbf x $$ \mathbf w $$ b $$ f $四个部分构成。其中$ \mathbf x $是一个形如$ (x_0,x_1,\dots,x_n) $ 的实数向量,在一个神经元中担任``输入''的角色。$ \mathbf w $是一个权重矩阵,其中的每一个元素都对应着一个输入和一个输出,代表着``某输入对某输出的贡献程度'',通常也被理解为神经元连接的{\small\sffamily\bfseries{权重}}\index{权重}(weight)\index{weight}$ b $被称作偏置,是一个实数。$ f $被称作激活函数,其本质是一个非线性函数。可见,一个人工神经元的功能是将输入向量与权重矩阵右乘(做内积)后,加上偏置量,经过一个非线性激活函数得到一个标量结果。
%---------------------------------------------- %----------------------------------------------
\begin{figure}[htp] \begin{figure}[htp]
...@@ -1249,7 +1257,7 @@ y&=&{\rm{Sigmoid}}({\rm{Tanh}}(\mathbf x\cdot \mathbf w^{[1]}+\mathbf b^{[1]})\c ...@@ -1249,7 +1257,7 @@ y&=&{\rm{Sigmoid}}({\rm{Tanh}}(\mathbf x\cdot \mathbf w^{[1]}+\mathbf b^{[1]})\c
\parinterval 下面用几个实例来说明搭建神经网络的过程。搭建神经网络的过程本质上就是定义前向计算的过程。首先构造一个单层神经网络。如图\ref{fig:5-39}(a)所示,简单的定义输入、权重和偏置后,定义激活函数为Sigmoid函数,输入$ \mathbf x $经过线性变换和激活函数,得到输出$ \mathbf y $ \parinterval 下面用几个实例来说明搭建神经网络的过程。搭建神经网络的过程本质上就是定义前向计算的过程。首先构造一个单层神经网络。如图\ref{fig:5-39}(a)所示,简单的定义输入、权重和偏置后,定义激活函数为Sigmoid函数,输入$ \mathbf x $经过线性变换和激活函数,得到输出$ \mathbf y $
\parinterval\ref{fig:5-39}(b)是一个构造三层神经网络的程序示例。在第一层中,$ \mathbf x $作为输入,$ \mathbf h1 $作为输出,其中$ \mathbf h1={\rm{Sigmoid}}(\mathbf x\cdot \mathbf w1+\mathbf b1) $。在第二层中,$ \mathbf h1 $作为输入,$ \mathbf h2 $作为输出,其中$ \mathbf h2={\rm{Tanh}}(\mathbf h1\cdot \mathbf w2) $。在第三层中,$ \mathbf h2 $作为输入,$ \mathbf y $作为输出,其中$ \mathbf y={\rm{ReLU}}(\mathbf h2\cdot \mathbf w3) $$ \mathbf y $也会作为整个神经网络的输出。 \parinterval\ref{fig:5-39}(b)是一个构造三层神经网络的程序示例。在第一层中,$ \mathbf x $作为输入,$ \mathbf h1 $作为输出,其中$ \mathbf h1={\rm{Sigmoid}}(\mathbf x\cdot \mathbf w1+\mathbf b1) $。在第二层中,$ \mathbf h1 $作为输入,$ \mathbf h2 $作为输出,其中$ \mathbf h2={\rm{Tanh}}(\mathbf h1\cdot \mathbf w2) $。在第三层中,$ \mathbf h2 $作为输入,$ \mathbf y $ 作为输出,其中$ \mathbf y={\rm{ReLU}}(\mathbf h2\cdot \mathbf w3) $$ \mathbf y $也会作为整个神经网络的输出。
%---------------------------------------------- %----------------------------------------------
\begin{figure}[htp] \begin{figure}[htp]
...@@ -1380,7 +1388,7 @@ y&=&{\rm{Sigmoid}}({\rm{Tanh}}(\mathbf x\cdot \mathbf w^{[1]}+\mathbf b^{[1]})\c ...@@ -1380,7 +1388,7 @@ y&=&{\rm{Sigmoid}}({\rm{Tanh}}(\mathbf x\cdot \mathbf w^{[1]}+\mathbf b^{[1]})\c
\parinterval 从优化的角度看,梯度下降是一种典型的 {\small\bfnew{基于梯度的方法}}\index{基于梯度的方法}(Gradient-based Method)\index{Gradient-based Method},属于基于一阶导数的方法。其他类似的方法还有牛顿法、共轭方向法、拟牛顿法等。在具体实现时,公式\ref{eq:5-29}可以有以下不同的形式。\\ \parinterval 从优化的角度看,梯度下降是一种典型的 {\small\bfnew{基于梯度的方法}}\index{基于梯度的方法}(Gradient-based Method)\index{Gradient-based Method},属于基于一阶导数的方法。其他类似的方法还有牛顿法、共轭方向法、拟牛顿法等。在具体实现时,公式\ref{eq:5-29}可以有以下不同的形式。\\
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
% %
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
\vspace{0.5em} \vspace{0.5em}
...@@ -1398,7 +1406,7 @@ J(\mathbf w)&=&\frac{1}{n}\sum_{i=1}^{n}{L(\mathbf x_i,\mathbf {\widetilde y}_i; ...@@ -1398,7 +1406,7 @@ J(\mathbf w)&=&\frac{1}{n}\sum_{i=1}^{n}{L(\mathbf x_i,\mathbf {\widetilde y}_i;
\parinterval 不过,这种方法的缺点也十分明显,因为要在全部训练数据上最小化损失,每一次参数更新都需要计算在所有样本上的损失。在使用海量数据进行训练的情况下,这种计算是非常消耗时间的。当训练数据规模很大时,很少使用这种方法。 \parinterval 不过,这种方法的缺点也十分明显,因为要在全部训练数据上最小化损失,每一次参数更新都需要计算在所有样本上的损失。在使用海量数据进行训练的情况下,这种计算是非常消耗时间的。当训练数据规模很大时,很少使用这种方法。
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
% %
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
\vspace{0.5em} \vspace{0.5em}
...@@ -1416,7 +1424,7 @@ J(\mathbf w)&=&L(\mathbf x_i,\mathbf {\widetilde y}_i;\mathbf w) ...@@ -1416,7 +1424,7 @@ J(\mathbf w)&=&L(\mathbf x_i,\mathbf {\widetilde y}_i;\mathbf w)
\parinterval 因为随机梯度下降算法每次优化的只是某一个样本上的损失,所以它的问题也非常明显:单个样本上的损失无法代表在全部样本上的损失,因此参数更新的效率低,方法收敛速度极慢。即使在目标函数为强凸函数的情况下,SGD仍旧无法做到线性收敛。 \parinterval 因为随机梯度下降算法每次优化的只是某一个样本上的损失,所以它的问题也非常明显:单个样本上的损失无法代表在全部样本上的损失,因此参数更新的效率低,方法收敛速度极慢。即使在目标函数为强凸函数的情况下,SGD仍旧无法做到线性收敛。
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
% %
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
\vspace{0.5em} \vspace{0.5em}
...@@ -1440,7 +1448,7 @@ J(\mathbf w)&=&\frac{1}{m}\sum_{i=j}^{j+m-1}{L(\mathbf x_i,\mathbf {\widetilde y ...@@ -1440,7 +1448,7 @@ J(\mathbf w)&=&\frac{1}{m}\sum_{i=j}^{j+m-1}{L(\mathbf x_i,\mathbf {\widetilde y
\parinterval 梯度下降算法的一个核心是要得到目标函数相对于参数的梯度。下面将介绍三种常见的求梯度方法:数值微分、符号微分和自动微分,深度学习实现过程中多是采用自动微分方法计算梯度\cite{baydin2017automatic} \parinterval 梯度下降算法的一个核心是要得到目标函数相对于参数的梯度。下面将介绍三种常见的求梯度方法:数值微分、符号微分和自动微分,深度学习实现过程中多是采用自动微分方法计算梯度\cite{baydin2017automatic}
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
% %
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
\vspace{0.5em} \vspace{0.5em}
...@@ -1464,7 +1472,7 @@ J(\mathbf w)&=&\frac{1}{m}\sum_{i=j}^{j+m-1}{L(\mathbf x_i,\mathbf {\widetilde y ...@@ -1464,7 +1472,7 @@ J(\mathbf w)&=&\frac{1}{m}\sum_{i=j}^{j+m-1}{L(\mathbf x_i,\mathbf {\widetilde y
\parinterval 尽管数值微分不适用于大模型中的梯度求解,但是由于其非常简单,因此经常被用于检验其他梯度计算方法的正确性。比如在实现反向传播的时候(详见\ref{sec:5.4.6}节),可以检验求导是否正确(Gradient Check),这个过程就是利用数值微分实现的。\\ \\ \parinterval 尽管数值微分不适用于大模型中的梯度求解,但是由于其非常简单,因此经常被用于检验其他梯度计算方法的正确性。比如在实现反向传播的时候(详见\ref{sec:5.4.6}节),可以检验求导是否正确(Gradient Check),这个过程就是利用数值微分实现的。\\ \\
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
% %
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
\noindent {\small\sffamily\bfseries{ 符号微分\index{符号微分}(Symbolic Differentiation)\index{Symbolic Differentiation}}} \noindent {\small\sffamily\bfseries{ 符号微分\index{符号微分}(Symbolic Differentiation)\index{Symbolic Differentiation}}}
...@@ -1499,7 +1507,7 @@ $+2x^2+x+1)$ & \ \ $(x^4+2x^3+2x^2+x+1)$ & $+6x+1$ \\ ...@@ -1499,7 +1507,7 @@ $+2x^2+x+1)$ & \ \ $(x^4+2x^3+2x^2+x+1)$ & $+6x+1$ \\
%-------------------------------------------------------------------- %--------------------------------------------------------------------
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
% %
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
\vspace{0.5em} \vspace{0.5em}
...@@ -1570,7 +1578,7 @@ $+2x^2+x+1)$ & \ \ $(x^4+2x^3+2x^2+x+1)$ & $+6x+1$ \\ ...@@ -1570,7 +1578,7 @@ $+2x^2+x+1)$ & \ \ $(x^4+2x^3+2x^2+x+1)$ & $+6x+1$ \\
%------------------------------------------- %-------------------------------------------
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
% %
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
\vspace{0.5em} \vspace{0.5em}
...@@ -1589,7 +1597,7 @@ w_{t+1}&=&w_t-\alpha v_t ...@@ -1589,7 +1597,7 @@ w_{t+1}&=&w_t-\alpha v_t
\parinterval 这里的``梯度''不再只是现在的损失函数的梯度,而是之前的梯度的加权和。在原始的梯度下降算法中,如果在某个参数状态下,梯度方向变化特别大,甚至与上一次参数更新中梯度方向成90度夹角,下一次参数更新中梯度方向可能又是一次90度的改变,这时参数优化路径将会成``锯齿''状(如图\ref{fig:5-46}所示),优化效率极慢。而Momentum梯度下降算法不会让梯度发生90度的变化,而是让梯度慢慢发生改变:如果当前的梯度方向与之前的梯度方向相同,在原梯度方向上加速更新参数;如果当前的梯度方向与之前的梯度方向相反,并不会产生一个急转弯,而是尽量把优化路径平滑地进行改变。这样做的优点也非常明显,一方面杜绝了``锯齿''状优化路径的出现,另一方面将优化幅度变得更加平滑,不会导致频频跳过最优点。 \parinterval 这里的``梯度''不再只是现在的损失函数的梯度,而是之前的梯度的加权和。在原始的梯度下降算法中,如果在某个参数状态下,梯度方向变化特别大,甚至与上一次参数更新中梯度方向成90度夹角,下一次参数更新中梯度方向可能又是一次90度的改变,这时参数优化路径将会成``锯齿''状(如图\ref{fig:5-46}所示),优化效率极慢。而Momentum梯度下降算法不会让梯度发生90度的变化,而是让梯度慢慢发生改变:如果当前的梯度方向与之前的梯度方向相同,在原梯度方向上加速更新参数;如果当前的梯度方向与之前的梯度方向相反,并不会产生一个急转弯,而是尽量把优化路径平滑地进行改变。这样做的优点也非常明显,一方面杜绝了``锯齿''状优化路径的出现,另一方面将优化幅度变得更加平滑,不会导致频频跳过最优点。
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
% %
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
\vspace{0.5em} \vspace{0.5em}
...@@ -1608,7 +1616,7 @@ w_{t+1}&=&w_t-\eta \frac{1}{\sqrt{z_t}}\cdot \frac{\partial L}{\partial w_t} ...@@ -1608,7 +1616,7 @@ w_{t+1}&=&w_t-\eta \frac{1}{\sqrt{z_t}}\cdot \frac{\partial L}{\partial w_t}
\parinterval 这里新出现了变量$ z $,它保存了以前的所有梯度值的平方和,在更新参数时,通过乘以$ \frac{1}{\sqrt{z_t}} $ ,就可以调整学习的尺度。这意味着,变动较大(被大幅度更新)的参数的学习率将变小。也就是说,可以按参数的元素进行学习率衰减,使变动大的参数的学习率逐渐减小。 \parinterval 这里新出现了变量$ z $,它保存了以前的所有梯度值的平方和,在更新参数时,通过乘以$ \frac{1}{\sqrt{z_t}} $ ,就可以调整学习的尺度。这意味着,变动较大(被大幅度更新)的参数的学习率将变小。也就是说,可以按参数的元素进行学习率衰减,使变动大的参数的学习率逐渐减小。
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
% %
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
\vspace{0.5em} \vspace{0.5em}
...@@ -1629,7 +1637,7 @@ w_{t+1}&=&w_t-\frac{\eta}{\sqrt{z_t+\epsilon}}\cdot \frac{\partial L}{\partial w ...@@ -1629,7 +1637,7 @@ w_{t+1}&=&w_t-\frac{\eta}{\sqrt{z_t+\epsilon}}\cdot \frac{\partial L}{\partial w
\parinterval RMSProp与AdaGrad相比,学习率的分母部分(即两种梯度下降算法迭代公式中的$ z $)的计算由累积方式变成了指数衰减移动平均。于是,每个参数的学习率并不是呈衰减趋势,而是既可以变小也可以变大,从而避免AdaGrad算法中学习率不断单调下降以至于过早衰减的缺点。 \parinterval RMSProp与AdaGrad相比,学习率的分母部分(即两种梯度下降算法迭代公式中的$ z $)的计算由累积方式变成了指数衰减移动平均。于是,每个参数的学习率并不是呈衰减趋势,而是既可以变小也可以变大,从而避免AdaGrad算法中学习率不断单调下降以至于过早衰减的缺点。
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
% %
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
\vspace{0.5em} \vspace{0.5em}
......
...@@ -2,6 +2,14 @@ ...@@ -2,6 +2,14 @@
% !TEX encoding = UTF-8 Unicode % !TEX encoding = UTF-8 Unicode
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
% 机器翻译:统计建模与深度学习方法
% Machine Translation: Statistical Modeling and Deep Learning Methods
%
% Copyright 2020
% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
%----------------------------------------------------------------------------------------
%----------------------------------------------------------------------------------------
% CONFIGURATIONS % CONFIGURATIONS
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
......
...@@ -2,6 +2,14 @@ ...@@ -2,6 +2,14 @@
% !TEX encoding = UTF-8 Unicode % !TEX encoding = UTF-8 Unicode
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
% 机器翻译:统计建模与深度学习方法
% Machine Translation: Statistical Modeling and Deep Learning Methods
%
% Copyright 2020
% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
%----------------------------------------------------------------------------------------
%----------------------------------------------------------------------------------------
% CONFIGURATIONS % CONFIGURATIONS
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
...@@ -443,7 +451,7 @@ y = f(x) ...@@ -443,7 +451,7 @@ y = f(x)
\parinterval 正则化的一种实现是在训练目标中引入一个正则项。在神经机器翻译中,引入正则项的训练目标为: \parinterval 正则化的一种实现是在训练目标中引入一个正则项。在神经机器翻译中,引入正则项的训练目标为:
\begin{eqnarray} \begin{eqnarray}
\hat{\mathbf{w}}=\argmax_{\mathbf{w}}L(\mathbf{w}) + \lambda R(\mathbf{w}) \widehat{\mathbf{w}}=\argmax_{\mathbf{w}}L(\mathbf{w}) + \lambda R(\mathbf{w})
\label{eq:7-2} \label{eq:7-2}
\end{eqnarray} \end{eqnarray}
...@@ -1854,11 +1862,13 @@ L_{\textrm{seq}} = - \textrm{logP}_{\textrm{s}}(\hat{\mathbf{y}} | \mathbf{x}) ...@@ -1854,11 +1862,13 @@ L_{\textrm{seq}} = - \textrm{logP}_{\textrm{s}}(\hat{\mathbf{y}} | \mathbf{x})
\vspace{0.5em} \vspace{0.5em}
\item 无指导机器翻译。无指导机器翻译由于其不需要双语语料即可训练翻译模型的特性,在稀缺资源机器翻译的场景中有非常大的潜力而得到广泛的关注。目前无指导机器翻译主要有两种范式:第一种先得到词典的翻译,然后得到短语表的翻译和相应的统计机器翻译系统,最后使用统计机器翻译系统生成伪双语平行语料训练神经机器翻译系统\cite{DBLP:conf/acl/ArtetxeLA19};第二种是先预训练语言模型来初始化神经机器翻译系统的编码器和解码器,然后使用翻译中回译以及降噪自编码器来训练神经机器翻译系统\cite{lample2019cross}。尽管目前无指导机器翻译在富资源的语种上取得了很大进展,但是离实际应用还有很远距离。比如,目前无指导系统都依赖于大量单语数据,而实际上稀缺资源的语种不但双语语料少,单语语料也少;此外,这些系统还无法在远距离如中英这些字母表重合少,需要大范围调序的语种对上取得可接受的结果;使用大量单语训练无指导系统还面临数据来自于不同领域的问题\cite{DBLP:journals/corr/abs-2004-05516}。设计更鲁棒,使用单语数据更高效的无指导机器翻译方法乃至新范式会是未来的趋势。 \item 无指导机器翻译。无指导机器翻译由于其不需要双语语料即可训练翻译模型的特性,在稀缺资源机器翻译的场景中有非常大的潜力而得到广泛的关注。目前无指导机器翻译主要有两种范式:第一种先得到词典的翻译,然后得到短语表的翻译和相应的统计机器翻译系统,最后使用统计机器翻译系统生成伪双语平行语料训练神经机器翻译系统\cite{DBLP:conf/acl/ArtetxeLA19};第二种是先预训练语言模型来初始化神经机器翻译系统的编码器和解码器,然后使用翻译中回译以及降噪自编码器来训练神经机器翻译系统\cite{lample2019cross}。尽管目前无指导机器翻译在富资源的语种上取得了很大进展,但是离实际应用还有很远距离。比如,目前无指导系统都依赖于大量单语数据,而实际上稀缺资源的语种不但双语语料少,单语语料也少;此外,这些系统还无法在远距离如中英这些字母表重合少,需要大范围调序的语种对上取得可接受的结果;使用大量单语训练无指导系统还面临数据来自于不同领域的问题\cite{DBLP:journals/corr/abs-2004-05516}。设计更鲁棒,使用单语数据更高效的无指导机器翻译方法乃至新范式会是未来的趋势。
\vspace{0.5em} \vspace{0.5em}
\item 更多上下文信息的建模。由于人类语言潜在的歧义性,传统的神经机器翻译在单句翻译中可能会出现歧义。为此,一些研究工作在翻译过程中尝试引入更多的上下文信息,比如多模态翻译、基于树的翻译或者篇章级翻译。多模态翻译的目标就是在给定一个图片和其源语描述的情况下,生成目标语言的描述。一般做法就是通过一个额外的编码器来提取图像特征\cite{DBLP:journals/corr/ElliottFH15,DBLP:conf/acl/HitschlerSR16},然后通过权重门控机制、注意力网络等融合到系统中\cite{DBLP:conf/wmt/HuangLSOD16} \item 图片翻译。由于人类语言潜在的歧义性,传统的神经机器翻译在单句翻译中可能会出现歧义。为此,一些研究工作在翻译过程中尝试引入更多的上下文信息,比如多模态翻译、基于树的翻译或者篇章级翻译。比如,图片翻译的目标就是在给定一个图片和其源语描述的情况下,生成目标语言的描述。一般做法就是通过一个额外的编码器来提取图像特征\cite{DBLP:journals/corr/ElliottFH15,DBLP:conf/acl/HitschlerSR16},然后通过权重门控机制、注意力网络等融合到系统中\cite{DBLP:conf/wmt/HuangLSOD16}
\parinterval 基于树的翻译是指在翻译模型中引入句法结构树或依存树,从而引入更多的句法信息。一种常用的做法是将句法树进行序列化,从而保留序列到序列的模型结构\cite{DBLP:conf/emnlp/CurreyH18,DBLP:conf/acl/SaundersSGB18}。在此基础上,一些研究工作引入了更多的解析结果\cite{DBLP:conf/acl/SumitaUZTM18,DBLP:conf/coling/ZaremoodiH18}。同时,也有一些研究工作直接使用Tree-LSTMs等网络结构\cite{DBLP:conf/acl/TaiSM15,DBLP:conf/iclr/ShenTSC19}来直接表示树结构,并将其应用到神经机器翻译模型中\cite{DBLP:conf/acl/EriguchiHT16,Yang2017TowardsBH,DBLP:conf/acl/ChenHCC17} \vspace{0.5em}
\item 基于树的翻译。这类方法在翻译模型中引入句法结构树或依存树,从而引入更多的句法信息。一种常用的做法是将句法树进行序列化,从而保留序列到序列的模型结构\cite{DBLP:conf/emnlp/CurreyH18,DBLP:conf/acl/SaundersSGB18}。在此基础上,一些研究工作引入了更多的解析结果\cite{DBLP:conf/acl/SumitaUZTM18,DBLP:conf/coling/ZaremoodiH18}。同时,也有一些研究工作直接使用Tree-LSTMs等网络结构\cite{DBLP:conf/acl/TaiSM15,DBLP:conf/iclr/ShenTSC19}来直接表示树结构,并将其应用到神经机器翻译模型中\cite{DBLP:conf/acl/EriguchiHT16,Yang2017TowardsBH,DBLP:conf/acl/ChenHCC17}
\parinterval 篇章级翻译是为了引入篇章级上下文信息,来处理篇章翻译中译文不连贯,主谓不一致等歧义现象。为此,一些研究人员针对该问题进行了改进,主要可以分为两类方法:一种是将当前句子与上下文进行句子级的拼接,不改变模型的结构\cite{DBLP:conf/discomt/TiedemannS17},另外一种是采用额外的编码器来捕获篇章信息\cite{DBLP:journals/corr/JeanLFC17,DBLP:journals/corr/abs-1805-10163,DBLP:conf/emnlp/ZhangLSZXZL18}。编码器的结构除了传统的RNN、自注意力网络,还有利用层级注意力来编码之前的多句上文\cite{Werlen2018DocumentLevelNM,tan-etal-2019-hierarchical},使用可选择的稀疏注意力机制对整个文档进行篇章建模\cite{DBLP:conf/naacl/MarufMH19},使用记忆网络、缓存机制等对篇章中的关键词进行提取\cite{DBLP:conf/coling/KuangXLZ18,DBLP:journals/tacl/TuLSZ18}或者采用两阶段解码的方式\cite{DBLP:conf/aaai/XiongH0W19,DBLP:conf/acl/VoitaST19}。除了从建模角度引入上下文信息,也有一些工作使用篇章级修正模型\cite{DBLP:conf/emnlp/VoitaST19}或者语言模型\cite{DBLP:journals/corr/abs-1910-00553}对句子级翻译模型的译文进行修正,或者通过自学习在解码过程中保持翻译连贯性\cite{DBLP:journals/corr/abs-2003-05259} \vspace{0.5em}
\item 篇章级翻译。可以通过引入篇章级上下文信息,来处理篇章翻译中译文不连贯,主谓不一致等问题。为此,一些研究人员针对该问题进行了改进,主要可以分为两类方法:一种是将当前句子与上下文进行句子级的拼接,不改变模型的结构\cite{DBLP:conf/discomt/TiedemannS17},另外一种是采用额外的编码器来捕获篇章信息\cite{DBLP:journals/corr/JeanLFC17,DBLP:journals/corr/abs-1805-10163,DBLP:conf/emnlp/ZhangLSZXZL18}。编码器的结构除了传统的RNN、自注意力网络,还有利用层级注意力来编码之前的多句上文\cite{Werlen2018DocumentLevelNM,tan-etal-2019-hierarchical},使用可选择的稀疏注意力机制对整个文档进行篇章建模\cite{DBLP:conf/naacl/MarufMH19},使用记忆网络、缓存机制等对篇章中的关键词进行提取\cite{DBLP:conf/coling/KuangXLZ18,DBLP:journals/tacl/TuLSZ18}或者采用两阶段解码的方式\cite{DBLP:conf/aaai/XiongH0W19,DBLP:conf/acl/VoitaST19}。除了从建模角度引入上下文信息,也有一些工作使用篇章级修正模型\cite{DBLP:conf/emnlp/VoitaST19}或者语言模型\cite{DBLP:journals/corr/abs-1910-00553}对句子级翻译模型的译文进行修正,或者通过自学习在解码过程中保持翻译连贯性\cite{DBLP:journals/corr/abs-2003-05259}
\vspace{0.5em} \vspace{0.5em}
\item 语音翻译。在日常生活中,语音翻译也是有很大的需求。针对语音到文本翻译的特点,最简单的做法是使用自动语音识别(ASR)将语音转换成文本,然后送入文本翻译模型进行翻译\cite{DBLP:conf/icassp/Ney99,DBLP:conf/interspeech/MatusovKN05}。然而为了避免流水线中的错误传播和高延迟问题,现在通常采用端到端的建模做法\cite{DBLP:conf/naacl/DuongACBC16,DBLP:journals/corr/BerardPSB16}。同时,针对语音翻译数据稀缺的问题,一些研究工作采用各种方法来进行缓解,包括预训练\cite{DBLP:conf/naacl/BansalKLLG19}、多任务学习\cite{Weiss2017SequencetoSequenceMC,DBLP:conf/icassp/BerardBKP18}、课程学习\cite{DBLP:conf/interspeech/KanoS017}、注意力传递\cite{DBLP:journals/tacl/SperberNNW19}和知识精炼\cite{DBLP:conf/interspeech/LiuXZHWWZ19,DBLP:conf/icassp/JiaJMWCCALW19} \item 语音翻译。在日常生活中,语音翻译也是有很大的需求。针对语音到文本翻译的特点,最简单的做法是使用自动语音识别(ASR)将语音转换成文本,然后送入文本翻译模型进行翻译\cite{DBLP:conf/icassp/Ney99,DBLP:conf/interspeech/MatusovKN05}。然而为了避免流水线中的错误传播和高延迟问题,现在通常采用端到端的建模做法\cite{DBLP:conf/naacl/DuongACBC16,DBLP:journals/corr/BerardPSB16}。同时,针对语音翻译数据稀缺的问题,一些研究工作采用各种方法来进行缓解,包括预训练\cite{DBLP:conf/naacl/BansalKLLG19}、多任务学习\cite{Weiss2017SequencetoSequenceMC,DBLP:conf/icassp/BerardBKP18}、课程学习\cite{DBLP:conf/interspeech/KanoS017}、注意力传递\cite{DBLP:journals/tacl/SperberNNW19}和知识精炼\cite{DBLP:conf/interspeech/LiuXZHWWZ19,DBLP:conf/icassp/JiaJMWCCALW19}
\vspace{0.5em} \vspace{0.5em}
......
\begin{tikzpicture} \begin{tikzpicture}
\tikzstyle{node} = [minimum height=1.0*1.2em,draw=teal,fill=teal!10] \tikzstyle{node} = [minimum height=1.0*1.2em,draw,fill=green!20]
\tikzstyle{legend} = [minimum height=1.0*1.2em,minimum width=1.0*1.2em,draw] \tikzstyle{legend} = [minimum height=1.0*1.2em,minimum width=1.0*1.2em,draw]
\tikzstyle{node2} = [minimum width=1.0*1.2em,minimum height=4.1*1.2em,draw=blue,fill=blue!10] \tikzstyle{node2} = [minimum width=1.0*1.2em,minimum height=4.1*1.2em,draw,fill=blue!20]
\node[node,minimum width=2.8*1.2em] (node1) at (0,0) {}; \node[node,minimum width=2.8*1.2em] (node1) at (0,0) {};
\node[node,minimum width=4.0*1.2em,anchor=north west] (node2) at (node1.south west) {}; \node[node,minimum width=4.0*1.2em,anchor=north west] (node2) at (node1.south west) {};
\node[node,minimum width=3.2*1.2em,anchor=north west] (node3) at (node2.south west) {}; \node[node,minimum width=3.2*1.2em,anchor=north west] (node3) at (node2.south west) {};
...@@ -12,12 +12,12 @@ ...@@ -12,12 +12,12 @@
\node[node,minimum width=2.8*1.2em,anchor=north west] (node6) at (node5.south west) {}; \node[node,minimum width=2.8*1.2em,anchor=north west] (node6) at (node5.south west) {};
\node[node,minimum width=3.2*1.2em,anchor=north west] (node7) at (node6.south west) {}; \node[node,minimum width=3.2*1.2em,anchor=north west] (node7) at (node6.south west) {};
\node[node,minimum width=4.0*1.2em,anchor=north west] (node8) at (node7.south west) {}; \node[node,minimum width=4.0*1.2em,anchor=north west] (node8) at (node7.south west) {};
\node[font=\footnotesize,anchor=east] (line1) at (node1.west) {gpu1}; \node[font=\footnotesize,anchor=east] (line1) at (node1.west) {GPU1};
\node[font=\footnotesize,anchor=east] (line2) at (node2.west) {gpu2}; \node[font=\footnotesize,anchor=east] (line2) at (node2.west) {GPU2};
\node[font=\footnotesize,anchor=east] (line3) at (node3.west) {gpu3}; \node[font=\footnotesize,anchor=east] (line3) at (node3.west) {GPU3};
\node[font=\footnotesize,anchor=east] (line4) at (node4.west) {gpu4}; \node[font=\footnotesize,anchor=east] (line4) at (node4.west) {GPU4};
\node[node2,anchor = north west] (grad2) at ([xshift=0.3em]node5.north east) {}; \node[node2,anchor = north west] (grad2) at ([xshift=0.3em]node5.north east) {};
\draw[->] (-1.4em*1.2,-3.62*1.2em) -- (9em*1.2,-3.62*1.2em); \draw[->,thick] (-1.4em*1.2,-3.62*1.2em) -- (9em*1.2,-3.62*1.2em);
\node[node,minimum width=2.8*1.2em] (node9) at (16em,0) {}; \node[node,minimum width=2.8*1.2em] (node9) at (16em,0) {};
\node[node,minimum width=4.0*1.2em,anchor=north west] (node10) at (node9.south west) {}; \node[node,minimum width=4.0*1.2em,anchor=north west] (node10) at (node9.south west) {};
...@@ -29,11 +29,11 @@ ...@@ -29,11 +29,11 @@
\node[node,minimum width=3.2*1.2em,anchor=north west] (node15) at (node11.north east) {}; \node[node,minimum width=3.2*1.2em,anchor=north west] (node15) at (node11.north east) {};
\node[node,minimum width=4.0*1.2em,anchor=north west] (node16) at (node12.north east) {}; \node[node,minimum width=4.0*1.2em,anchor=north west] (node16) at (node12.north east) {};
\node[node2,anchor = north west] (grad3) at ([xshift=0.5em]node13.north east) {}; \node[node2,anchor = north west] (grad3) at ([xshift=0.5em]node13.north east) {};
\node[font=\footnotesize,anchor=east] (line1) at (node9.west) {gpu1}; \node[font=\footnotesize,anchor=east] (line1) at (node9.west) {GPU1};
\node[font=\footnotesize,anchor=east] (line2) at (node10.west) {gpu2}; \node[font=\footnotesize,anchor=east] (line2) at (node10.west) {GPU2};
\node[font=\footnotesize,anchor=east] (line3) at (node11.west) {gpu3}; \node[font=\footnotesize,anchor=east] (line3) at (node11.west) {GPU3};
\node[font=\footnotesize,anchor=east] (line4) at (node12.west) {gpu4}; \node[font=\footnotesize,anchor=east] (line4) at (node12.west) {GPU4};
\draw[->] (13.6*1.2em,-3.62*1.2em) -- (20.5*1.2em,-3.62*1.2em); \draw[->,thick] (node12.south west) -- ([xshift=3em]node16.south east);
\begin{pgfonlayer}{background} \begin{pgfonlayer}{background}
\node [rectangle,inner sep=-0.0em,draw] [fit = (node1) (node2) (node3) (node4)] (box1) {}; \node [rectangle,inner sep=-0.0em,draw] [fit = (node1) (node2) (node3) (node4)] (box1) {};
\node [rectangle,inner sep=-0.0em,draw] [fit = (node5) (node6) (node7) (node8)] (box2) {}; \node [rectangle,inner sep=-0.0em,draw] [fit = (node5) (node6) (node7) (node8)] (box2) {};
...@@ -46,9 +46,9 @@ ...@@ -46,9 +46,9 @@
\node[legend] (legend3) at (2em,2em) {}; \node[legend] (legend3) at (2em,2em) {};
\node[font=\footnotesize,anchor=west] (idle) at (legend3.east) {:空闲}; \node[font=\footnotesize,anchor=west] (idle) at (legend3.east) {:空闲};
\node[legend,anchor=west,draw=teal,fill=teal!10] (legend4) at ([xshift = 2em]idle.east) {}; \node[legend,anchor=west,draw,fill=green!30] (legend4) at ([xshift = 2em]idle.east) {};
\node[font=\footnotesize,anchor=west] (FB) at (legend4.east) {:前向/反向}; \node[font=\footnotesize,anchor=west] (FB) at (legend4.east) {:前向/反向};
\node[legend,anchor=west,draw=blue,fill=blue!10] (legend5) at ([xshift = 2em]FB.east) {}; \node[legend,anchor=west,draw,fill=blue!30] (legend5) at ([xshift = 2em]FB.east) {};
\node[font=\footnotesize,anchor=west] (grad_sync) at (legend5.east) {:梯度更新}; \node[font=\footnotesize,anchor=west] (grad_sync) at (legend5.east) {:梯度更新};
\end{tikzpicture} \end{tikzpicture}
\ No newline at end of file
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
\draw [-,very thick,draw=ublue] ([xshift=0.7em,yshift=3em]n1.north) .. controls +(north:7em) and +(south:0em) .. ([xshift=17em,yshift=9em]n1.north); \draw [-,very thick,draw=ublue] ([xshift=0.7em,yshift=3em]n1.north) .. controls +(north:7em) and +(south:0em) .. ([xshift=17em,yshift=9em]n1.north);
{\footnotesize {\footnotesize
\node [anchor=south] (n4) at ([xshift=7em,yshift=5em]n1.north) {性能快速爬升阶段}; \node [anchor=south] (n4) at ([xshift=8em,yshift=5em]n1.north) {性能快速爬升阶段(红色)};
\node [anchor=west] (n5) at ([xshift=0em,yshift=-2em]n4.west) {数据的作用会非常明显}; \node [anchor=west] (n5) at ([xshift=0em,yshift=-2em]n4.west) {数据的作用会非常明显};
\draw [-,very thick,draw=red] ([xshift=0.7em,yshift=3em]n1.north) .. controls +(north:5.9em) and +(south:0em) .. ([xshift=10em,yshift=9.6em]n1.north); \draw [-,very thick,draw=red] ([xshift=0.7em,yshift=3em]n1.north) .. controls +(north:5.9em) and +(south:0em) .. ([xshift=10em,yshift=9.6em]n1.north);
......
\begin{tikzpicture} \begin{tikzpicture}
\tikzstyle{node} = [minimum height=1.0*1.2em,draw=teal,fill=teal!10] \tikzstyle{node} = [minimum height=1.0*1.2em,draw,fill=green!20]
\node[node,minimum width=2.0*1.2em] (sent1) at (0,0) {}; \node[node,minimum width=2.0*1.2em] (sent1) at (0,0) {};
\node[node,minimum width=5.0*1.2em,anchor=north west] (sent2) at (sent1.south west) {}; \node[node,minimum width=5.0*1.2em,anchor=north west] (sent2) at (sent1.south west) {};
\node[node,minimum width=1.0*1.2em,anchor=north west] (sent3) at (sent2.south west) {}; \node[node,minimum width=1.0*1.2em,anchor=north west] (sent3) at (sent2.south west) {};
...@@ -11,15 +11,15 @@ ...@@ -11,15 +11,15 @@
\node[node,minimum width=4.5*1.2em,anchor=north west] (sent7) at (sent6.south west) {}; \node[node,minimum width=4.5*1.2em,anchor=north west] (sent7) at (sent6.south west) {};
\node[node,minimum width=5*1.2em,anchor=north west] (sent8) at (sent7.south west) {}; \node[node,minimum width=5*1.2em,anchor=north west] (sent8) at (sent7.south west) {};
\node[font=\footnotesize,anchor=east] (line1) at (sent1.west) {sent1}; \node[font=\footnotesize,anchor=east] (line1) at (sent1.west) {句子1};
\node[font=\footnotesize,anchor=east] (line2) at (sent2.west) {sent2}; \node[font=\footnotesize,anchor=east] (line2) at (sent2.west) {句子2};
\node[font=\footnotesize,anchor=east] (line3) at (sent3.west) {sent3}; \node[font=\footnotesize,anchor=east] (line3) at (sent3.west) {句子3};
\node[font=\footnotesize,anchor=east] (line4) at (sent4.west) {sent4}; \node[font=\footnotesize,anchor=east] (line4) at (sent4.west) {句子4};
\node[font=\footnotesize,anchor=east] (line5) at (sent5.west) {sent1}; \node[font=\footnotesize,anchor=east] (line5) at (sent5.west) {句子1};
\node[font=\footnotesize,anchor=east] (line6) at (sent6.west) {sent2}; \node[font=\footnotesize,anchor=east] (line6) at (sent6.west) {句子2};
\node[font=\footnotesize,anchor=east] (line7) at (sent7.west) {sent3}; \node[font=\footnotesize,anchor=east] (line7) at (sent7.west) {句子3};
\node[font=\footnotesize,anchor=east] (line8) at (sent8.west) {sent4}; \node[font=\footnotesize,anchor=east] (line8) at (sent8.west) {句子4};
\begin{pgfonlayer}{background} \begin{pgfonlayer}{background}
\node [rectangle,inner sep=-0.0em,draw] [fit = (sent1) (sent2) (sent3) (sent4)] (box1) {}; \node [rectangle,inner sep=-0.0em,draw] [fit = (sent1) (sent2) (sent3) (sent4)] (box1) {};
\node [rectangle,inner sep=-0.0em,draw] [fit = (sent5) (sent6) (sent7) (sent8)] (box2) {}; \node [rectangle,inner sep=-0.0em,draw] [fit = (sent5) (sent6) (sent7) (sent8)] (box2) {};
......
...@@ -3,16 +3,16 @@ ...@@ -3,16 +3,16 @@
\node[] (do) at (0,0) {{\red do}}; \node[] (do) at (0,0) {{\red do}};
\node[anchor = west] (does) at ([xshift = 1em]do.east) {{\red do}es}; \node[anchor = west] (does) at ([xshift = 1em]do.east) {{\red do}es};
\node[anchor = west] (doing) at ([xshift = 0.7em]does.east) {{\red do}ing}; \node[anchor = west] (doing) at ([xshift = 0.7em]does.east) {{\red do}ing};
\node[anchor = north] (do_root) at ([yshift = -1em]does.south) {do}; \node[anchor = north] (do_root) at ([yshift = -1.5em]does.south) {do};
\node[anchor = west] (new) at ([xshift = 2em]doing.east) {{\red new}}; \node[anchor = west] (new) at ([xshift = 2em]doing.east) {{\red new}};
\node[anchor = west] (newer) at ([xshift = 1em]new.east) {{\red new}er}; \node[anchor = west] (newer) at ([xshift = 1em]new.east) {{\red new}er};
\node[anchor = west] (newest) at ([xshift = 0.7em]newer.east) {{\red new}est}; \node[anchor = west] (newest) at ([xshift = 0.7em]newer.east) {{\red new}est};
\node[anchor = north] (new_root) at ([yshift = -1em]newer.south) {new}; \node[anchor = north] (new_root) at ([yshift = -1.5em]newer.south) {new};
\draw [->] (do_root.north) .. controls +(north:0.4) and +(south:0.6) ..(do.south); \draw [->] ([yshift=0.2em]do_root.north) .. controls +(north:0.4) and +(south:0.6) ..(do.south);
\draw [->] (do_root.north) -- (does.south); \draw [->] (do_root.north) -- (does.south);
\draw [->] (do_root.north) .. controls +(north:0.4) and +(south:0.6) ..(doing.south); \draw [->] ([yshift=0.2em]do_root.north) .. controls +(north:0.4) and +(south:0.6) ..(doing.south);
\draw [->] (new_root.north) .. controls +(north:0.4) and +(south:0.6) ..(new.south); \draw [->] ([yshift=0.2em]new_root.north) .. controls +(north:0.4) and +(south:0.6) ..(new.south);
\draw [->] (new_root.north) -- (newer.south); \draw [->] (new_root.north) -- (newer.south);
\draw [->] (new_root.north) .. controls +(north:0.4) and +(south:0.6) ..(newest.south); \draw [->] ([yshift=0.2em]new_root.north) .. controls +(north:0.4) and +(south:0.6) ..(newest.south);
\end{tikzpicture} \end{tikzpicture}
\ No newline at end of file
...@@ -2,6 +2,14 @@ ...@@ -2,6 +2,14 @@
% !TEX encoding = UTF-8 Unicode % !TEX encoding = UTF-8 Unicode
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
% 机器翻译:统计建模与深度学习方法
% Machine Translation: Statistical Modeling and Deep Learning Methods
%
% Copyright 2020
% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
%----------------------------------------------------------------------------------------
%----------------------------------------------------------------------------------------
% CONFIGURATIONS % CONFIGURATIONS
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
...@@ -208,7 +216,7 @@ S = N(b^{\infty}(V(\mathbf{s}|\mathbf{t};2))) \cup (\mathop{\cup}\limits_{ij} N( ...@@ -208,7 +216,7 @@ S = N(b^{\infty}(V(\mathbf{s}|\mathbf{t};2))) \cup (\mathop{\cup}\limits_{ij} N(
\parinterval 为了理解这个公式,先介绍几个概念。 \parinterval 为了理解这个公式,先介绍几个概念。
\begin{itemize} \begin{itemize}
\item $V(\mathbf{s}|\mathbf{t})$表示Viterbi词对齐,$V(\mathbf{s}|\mathbf{t},1)$$V(\mathbf{s}|\mathbf{t},2)$$V(\mathbf{s}|\mathbf{t},3)$就分别对应了模型1、2 和3 的Viterbi 词对齐; \item $V(\mathbf{s}|\mathbf{t})$表示Viterbi词对齐,$V(\mathbf{s}|\mathbf{t},1)$$V(\mathbf{s}|\mathbf{t},2)$$V(\mathbf{s}|\mathbf{t},3)$就分别对应了模型1、2 和3 的Viterbi 词对齐;
\item 把那些满足第$j$个源语言单词对应第$i$个目标语言单词($a_j=i$)的词对齐构成的集合记为$\mathbf{A}_{i \leftrightarrow j}(\mathbf{s},\mathbf{t})$。通常称这些对齐中$j$$i$被``钉''在了一起。在$\mathbf{A}_{i \leftrightarrow j}(\mathbf{s},\mathbf{t})$中使$\textrm{P}(\mathbf{a}|\mathbf{s},\mathbf{t})$达到最大的那个词对齐被记为$V_{i \leftrightarrow j}(\mathbf{s},\mathbf{t})$ \item 把那些满足第$j$个源语言单词对应第$i$个目标语言单词($a_j=i$)的词对齐构成的集合记为$\mathbf{A}_{i \leftrightarrow j}(\mathbf{s},\mathbf{t})$。通常称这些对齐中$j$$i$被``钉''在了一起。在$\mathbf{A}_{i \leftrightarrow j}(\mathbf{s},\mathbf{t})$中使$\textrm{P}(\mathbf{a}|\mathbf{s},\mathbf{t})$达到最大的那个词对齐被记为$V_{i \leftrightarrow j}(\mathbf{s},\mathbf{t})$
\item 如果两个词对齐,通过交换两个词对齐连接就能互相转化,则称它们为邻居。一个词对齐$\mathbf{a}$的所有邻居记为$N(\mathbf{a})$ \item 如果两个词对齐,通过交换两个词对齐连接就能互相转化,则称它们为邻居。一个词对齐$\mathbf{a}$的所有邻居记为$N(\mathbf{a})$
\end{itemize} \end{itemize}
......
% !Mode:: "TeX:UTF-8" % !Mode:: "TeX:UTF-8"
% !TEX encoding = UTF-8 Unicode % !TEX encoding = UTF-8 Unicode
%----------------------------------------------------------------------------------------
% 机器翻译:统计建模与深度学习方法
% Machine Translation: Statistical Modeling and Deep Learning Methods
%
% Copyright 2020
% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
%----------------------------------------------------------------------------------------
\renewcommand\figurename{} \renewcommand\figurename{}
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
......
\indexentry{组合性翻译|hyperpage}{10} \indexentry{源语言|hyperpage}{17}
\indexentry{Compositional Translation|hyperpage}{10} \indexentry{Source Language|hyperpage}{17}
\indexentry{短语|hyperpage}{10} \indexentry{目标语言|hyperpage}{17}
\indexentry{短语切分|hyperpage}{15} \indexentry{Target Language|hyperpage}{17}
\indexentry{Phrasal Segmentation|hyperpage}{15} \indexentry{机器翻译|hyperpage}{18}
\indexentry{短语对|hyperpage}{15} \indexentry{Machine Translation|hyperpage}{18}
\indexentry{推导|hyperpage}{15} \indexentry{数据驱动|hyperpage}{23}
\indexentry{Derivation|hyperpage}{15} \indexentry{Data-Driven|hyperpage}{23}
\indexentry{生成式模型|hyperpage}{18} \indexentry{编码器-解码器|hyperpage}{30}
\indexentry{Generative Model|hyperpage}{18} \indexentry{encoder-decoder|hyperpage}{30}
\indexentry{判别式模型|hyperpage}{18} \indexentry{质量评价|hyperpage}{32}
\indexentry{Discriminative Model|hyperpage}{18} \indexentry{Quality Evaluation|hyperpage}{32}
\indexentry{对数线性模型|hyperpage}{19} \indexentry{无参考答案的评价|hyperpage}{32}
\indexentry{Log-linear Model|hyperpage}{19} \indexentry{Quality Estimation|hyperpage}{32}
\indexentry{短语抽取|hyperpage}{20} \indexentry{$n$元语法单元|hyperpage}{33}
\indexentry{Phrase Extraction|hyperpage}{20} \indexentry{$n$-gram准确率|hyperpage}{34}
\indexentry{词汇化翻译概率|hyperpage}{23} \indexentry{$n$-gram Precision|hyperpage}{34}
\indexentry{Lexical Translation Probability|hyperpage}{23} \indexentry{短句惩罚因子|hyperpage}{34}
\indexentry{短语表|hyperpage}{23} \indexentry{Brevity Penalty|hyperpage}{34}
\indexentry{Phrase Table|hyperpage}{23} \indexentry{分词|hyperpage}{50}
\indexentry{调序|hyperpage}{24} \indexentry{Segmentation|hyperpage}{50}
\indexentry{Reordering|hyperpage}{24} \indexentry{句法分析|hyperpage}{51}
\indexentry{模型训练|hyperpage}{28} \indexentry{Parsing|hyperpage}{51}
\indexentry{Model Training|hyperpage}{28} \indexentry{预处理|hyperpage}{51}
\indexentry{权重调优|hyperpage}{28} \indexentry{Pre-processing|hyperpage}{51}
\indexentry{Weight Tuning|hyperpage}{28} \indexentry{后处理|hyperpage}{51}
\indexentry{最小错误率训练|hyperpage}{28} \indexentry{Post-processing|hyperpage}{51}
\indexentry{Minimum Error Rate Training|hyperpage}{28} \indexentry{事件|hyperpage}{52}
\indexentry{调优集合|hyperpage}{28} \indexentry{Event|hyperpage}{52}
\indexentry{Tuning Set|hyperpage}{28} \indexentry{随机事件|hyperpage}{52}
\indexentry{线搜索|hyperpage}{29} \indexentry{随机变量|hyperpage}{52}
\indexentry{Line Search|hyperpage}{29} \indexentry{Random Variable|hyperpage}{52}
\indexentry{格搜索|hyperpage}{30} \indexentry{概率|hyperpage}{52}
\indexentry{Grid Search|hyperpage}{30} \indexentry{Probability|hyperpage}{52}
\indexentry{覆盖度模型|hyperpage}{32} \indexentry{估计|hyperpage}{52}
\indexentry{Coverage Model|hyperpage}{32} \indexentry{估计值|hyperpage}{52}
\indexentry{翻译候选|hyperpage}{32} \indexentry{Estimate|hyperpage}{52}
\indexentry{Translation Candidate|hyperpage}{32} \indexentry{概率分布函数|hyperpage}{53}
\indexentry{翻译假设|hyperpage}{33} \indexentry{概率密度函数|hyperpage}{53}
\indexentry{Translation Hypothesis|hyperpage}{33} \indexentry{联合概率|hyperpage}{53}
\indexentry{剪枝|hyperpage}{34} \indexentry{Joint Probability|hyperpage}{53}
\indexentry{Pruning|hyperpage}{34} \indexentry{条件概率|hyperpage}{53}
\indexentry{束剪枝|hyperpage}{34} \indexentry{Conditional Probability|hyperpage}{53}
\indexentry{Beam Pruning|hyperpage}{34} \indexentry{边缘概率|hyperpage}{54}
\indexentry{直方图剪枝|hyperpage}{34} \indexentry{marginal probability|hyperpage}{54}
\indexentry{Histogram Pruning|hyperpage}{34} \indexentry{全概率公式|hyperpage}{55}
\indexentry{阈值剪枝|hyperpage}{34} \indexentry{Law of Total Probability|hyperpage}{55}
\indexentry{Threshold Pruning|hyperpage}{34} \indexentry{贝叶斯法则|hyperpage}{56}
\indexentry{假设重组|hyperpage}{34} \indexentry{Bayes' rule|hyperpage}{56}
\indexentry{Hypothesis Recombination|hyperpage}{34} \indexentry{熵|hyperpage}{57}
\indexentry{基于层次短语的模型|hyperpage}{38} \indexentry{Entropy|hyperpage}{57}
\indexentry{Hierarchical Phrase-based Model|hyperpage}{38} \indexentry{自信息|hyperpage}{57}
\indexentry{同步上下文无关文法|hyperpage}{39} \indexentry{Self-information|hyperpage}{57}
\indexentry{Synchronous Context-free Grammar|hyperpage}{39} \indexentry{相对熵|hyperpage}{58}
\indexentry{基于层次短语的文法|hyperpage}{40} \indexentry{Relative Entropy|hyperpage}{58}
\indexentry{Hierarchical Phrase-based Grammar|hyperpage}{40} \indexentry{交叉熵|hyperpage}{58}
\indexentry{推导|hyperpage}{41} \indexentry{Cross-entropy|hyperpage}{58}
\indexentry{Derivation|hyperpage}{41} \indexentry{分词|hyperpage}{59}
\indexentry{胶水规则|hyperpage}{41} \indexentry{Segmentation|hyperpage}{59}
\indexentry{Glue Rule|hyperpage}{41} \indexentry{单词|hyperpage}{59}
\indexentry{乔姆斯基范式|hyperpage}{45} \indexentry{Word|hyperpage}{59}
\indexentry{Chomsky Normal Form|hyperpage}{45} \indexentry{词|hyperpage}{59}
\indexentry{跨度|hyperpage}{45} \indexentry{词法分析|hyperpage}{59}
\indexentry{Span|hyperpage}{45} \indexentry{Lexical Analysis|hyperpage}{59}
\indexentry{自下而上的分析|hyperpage}{46} \indexentry{标注数据|hyperpage}{61}
\indexentry{Top-down Parsing|hyperpage}{46} \indexentry{Annotated Data|hyperpage}{61}
\indexentry{束剪枝|hyperpage}{48} \indexentry{训练|hyperpage}{61}
\indexentry{Beam Pruning|hyperpage}{48} \indexentry{Training|hyperpage}{61}
\indexentry{立方剪枝|hyperpage}{50} \indexentry{推断|hyperpage}{61}
\indexentry{Cube Pruning|hyperpage}{50} \indexentry{Inference|hyperpage}{61}
\indexentry{序列化|hyperpage}{53} \indexentry{参数估计|hyperpage}{63}
\indexentry{线性化|hyperpage}{53} \indexentry{Parameter Estimation|hyperpage}{63}
\indexentry{Linearization|hyperpage}{53} \indexentry{偏置|hyperpage}{63}
\indexentry{树到串翻译规则|hyperpage}{55} \indexentry{Bias|hyperpage}{63}
\indexentry{Tree-to-String Translation Rule|hyperpage}{55} \indexentry{语言模型|hyperpage}{67}
\indexentry{树到树翻译规则|hyperpage}{55} \indexentry{Language Model|hyperpage}{67}
\indexentry{Tree-to-Tree Translation Rule|hyperpage}{55} \indexentry{语言建模|hyperpage}{67}
\indexentry{树片段|hyperpage}{56} \indexentry{Language Modeling|hyperpage}{67}
\indexentry{Tree Fragment|hyperpage}{56} \indexentry{极大似然估计|hyperpage}{68}
\indexentry{同步树替换文法规则|hyperpage}{57} \indexentry{人工神经网络方法|hyperpage}{68}
\indexentry{Synchronous Tree Substitution Grammar Rule|hyperpage}{57} \indexentry{未登录词|hyperpage}{69}
\indexentry{边缘集合|hyperpage}{63} \indexentry{Out-of-Vocabulary Word,OOV Word|hyperpage}{69}
\indexentry{Frontier Set|hyperpage}{63} \indexentry{加法平滑|hyperpage}{70}
\indexentry{最小规则|hyperpage}{64} \indexentry{Additive Smoothing|hyperpage}{70}
\indexentry{Minimal Rules|hyperpage}{64} \indexentry{古德-图灵估计法|hyperpage}{71}
\indexentry{二叉化|hyperpage}{67} \indexentry{Good-Turing Estimate|hyperpage}{71}
\indexentry{Binarization|hyperpage}{67} \indexentry{句法|hyperpage}{74}
\indexentry{基于短语的特征|hyperpage}{72} \indexentry{Syntax|hyperpage}{74}
\indexentry{基于句法的特征|hyperpage}{72} \indexentry{短语结构分析|hyperpage}{74}
\indexentry{有向超图|hyperpage}{73} \indexentry{Phrase Structure Parsing|hyperpage}{74}
\indexentry{Directed Hyper-graph|hyperpage}{73} \indexentry{依存分析|hyperpage}{74}
\indexentry{超边|hyperpage}{73} \indexentry{Dependency Parsing|hyperpage}{74}
\indexentry{Hyper-edge|hyperpage}{73} \indexentry{成分分析|hyperpage}{75}
\indexentry{半环分析|hyperpage}{73} \indexentry{完全分析|hyperpage}{75}
\indexentry{Semi-ring Parsing|hyperpage}{73} \indexentry{Full Parsing|hyperpage}{75}
\indexentry{组合|hyperpage}{75} \indexentry{终结符|hyperpage}{75}
\indexentry{Composition|hyperpage}{75} \indexentry{Terminal|hyperpage}{75}
\indexentry{基于串的解码|hyperpage}{76} \indexentry{预终结符|hyperpage}{75}
\indexentry{String-based Decoding|hyperpage}{76} \indexentry{Pre-terminal|hyperpage}{75}
\indexentry{基于树的解码|hyperpage}{76} \indexentry{非终结符|hyperpage}{75}
\indexentry{Tree-based Decoding|hyperpage}{76} \indexentry{Non-terminal|hyperpage}{75}
\indexentry{Lexicalized Norm Form|hyperpage}{79} \indexentry{上下文无关文法|hyperpage}{76}
\indexentry{Context-Free Grammar|hyperpage}{76}
\indexentry{产生式规则|hyperpage}{77}
\indexentry{Production Rule|hyperpage}{77}
\indexentry{推导|hyperpage}{78}
\indexentry{Derivation|hyperpage}{78}
\indexentry{句子|hyperpage}{79}
\indexentry{Sentence|hyperpage}{79}
\indexentry{语言|hyperpage}{79}
\indexentry{Language|hyperpage}{79}
\indexentry{歧义|hyperpage}{79}
\indexentry{Ambiguity|hyperpage}{79}
\indexentry{消歧|hyperpage}{79}
\indexentry{Disambiguation|hyperpage}{79}
\indexentry{最左优先推导|hyperpage}{79}
\indexentry{Left-most Derivation|hyperpage}{79}
\indexentry{概率上下文无关文法|hyperpage}{81}
\indexentry{Probabilistic Context-Free Grammar|hyperpage}{81}
\indexentry{树库|hyperpage}{82}
\indexentry{Treebank|hyperpage}{82}
\indexentry{生成模型|hyperpage}{83}
\indexentry{Generative Model|hyperpage}{83}
\indexentry{判别模型|hyperpage}{83}
\indexentry{Discriminative Model|hyperpage}{83}
\indexentry{流畅度|hyperpage}{88}
\indexentry{Fluency|hyperpage}{88}
\indexentry{准确性|hyperpage}{88}
\indexentry{Accuracy|hyperpage}{88}
\indexentry{充分性|hyperpage}{88}
\indexentry{Adequacy|hyperpage}{88}
\indexentry{翻译候选|hyperpage}{89}
\indexentry{Translation Candidate|hyperpage}{89}
\indexentry{训练|hyperpage}{91}
\indexentry{Training|hyperpage}{91}
\indexentry{解码|hyperpage}{91}
\indexentry{Decoding|hyperpage}{91}
\indexentry{推断|hyperpage}{91}
\indexentry{Inference|hyperpage}{91}
\indexentry{词对齐|hyperpage}{96}
\indexentry{Word Alignment|hyperpage}{96}
\indexentry{词对齐连接|hyperpage}{96}
\indexentry{解码|hyperpage}{99}
\indexentry{Decoding|hyperpage}{99}
\indexentry{噪声信道模型|hyperpage}{102}
\indexentry{Noise Channel Model|hyperpage}{102}
\indexentry{词对齐|hyperpage}{104}
\indexentry{Word Alignment|hyperpage}{104}
\indexentry{非对称的词对齐|hyperpage}{105}
\indexentry{Asymmetric Word Alignment|hyperpage}{105}
\indexentry{空对齐|hyperpage}{105}
\indexentry{拉格朗日乘数法|hyperpage}{113}
\indexentry{The Lagrange Multiplier Method|hyperpage}{113}
\indexentry{期望最大化|hyperpage}{115}
\indexentry{Expectation Maximization|hyperpage}{115}
\indexentry{期望频次|hyperpage}{116}
\indexentry{Expected Count|hyperpage}{116}
\indexentry{产出率|hyperpage}{119}
\indexentry{繁衍率|hyperpage}{119}
\indexentry{Fertility|hyperpage}{119}
\indexentry{扭曲度|hyperpage}{121}
\indexentry{Distortion|hyperpage}{121}
\indexentry{概念单元|hyperpage}{123}
\indexentry{概念|hyperpage}{123}
\indexentry{Concept|hyperpage}{123}
\indexentry{缺陷|hyperpage}{125}
\indexentry{Deficiency|hyperpage}{125}
\indexentry{凸函数|hyperpage}{129}
\indexentry{Convex function|hyperpage}{129}
\indexentry{对称化|hyperpage}{130}
\indexentry{Symmetrization|hyperpage}{130}
\indexentry{系统偏置|hyperpage}{131}
\indexentry{System Bias|hyperpage}{131}
\indexentry{组合性翻译|hyperpage}{136}
\indexentry{Compositional Translation|hyperpage}{136}
\indexentry{短语|hyperpage}{136}
\indexentry{短语切分|hyperpage}{141}
\indexentry{Phrasal Segmentation|hyperpage}{141}
\indexentry{短语对|hyperpage}{141}
\indexentry{推导|hyperpage}{141}
\indexentry{Derivation|hyperpage}{141}
\indexentry{生成式模型|hyperpage}{144}
\indexentry{Generative Model|hyperpage}{144}
\indexentry{判别式模型|hyperpage}{144}
\indexentry{Discriminative Model|hyperpage}{144}
\indexentry{对数线性模型|hyperpage}{145}
\indexentry{Log-linear Model|hyperpage}{145}
\indexentry{短语抽取|hyperpage}{146}
\indexentry{Phrase Extraction|hyperpage}{146}
\indexentry{词汇化翻译概率|hyperpage}{149}
\indexentry{Lexical Translation Probability|hyperpage}{149}
\indexentry{短语表|hyperpage}{150}
\indexentry{Phrase Table|hyperpage}{150}
\indexentry{调序|hyperpage}{150}
\indexentry{Reordering|hyperpage}{150}
\indexentry{模型训练|hyperpage}{154}
\indexentry{Model Training|hyperpage}{154}
\indexentry{权重调优|hyperpage}{154}
\indexentry{Weight Tuning|hyperpage}{154}
\indexentry{最小错误率训练|hyperpage}{154}
\indexentry{Minimum Error Rate Training|hyperpage}{154}
\indexentry{调优集合|hyperpage}{154}
\indexentry{Tuning Set|hyperpage}{154}
\indexentry{线搜索|hyperpage}{155}
\indexentry{Line Search|hyperpage}{155}
\indexentry{格搜索|hyperpage}{156}
\indexentry{Grid Search|hyperpage}{156}
\indexentry{覆盖度模型|hyperpage}{158}
\indexentry{Coverage Model|hyperpage}{158}
\indexentry{翻译候选|hyperpage}{158}
\indexentry{Translation Candidate|hyperpage}{158}
\indexentry{翻译假设|hyperpage}{159}
\indexentry{Translation Hypothesis|hyperpage}{159}
\indexentry{剪枝|hyperpage}{160}
\indexentry{Pruning|hyperpage}{160}
\indexentry{束剪枝|hyperpage}{160}
\indexentry{Beam Pruning|hyperpage}{160}
\indexentry{直方图剪枝|hyperpage}{160}
\indexentry{Histogram Pruning|hyperpage}{160}
\indexentry{阈值剪枝|hyperpage}{160}
\indexentry{Threshold Pruning|hyperpage}{160}
\indexentry{假设重组|hyperpage}{160}
\indexentry{Hypothesis Recombination|hyperpage}{160}
\indexentry{基于层次短语的模型|hyperpage}{164}
\indexentry{Hierarchical Phrase-based Model|hyperpage}{164}
\indexentry{同步上下文无关文法|hyperpage}{165}
\indexentry{Synchronous Context-free Grammar|hyperpage}{165}
\indexentry{基于层次短语的文法|hyperpage}{166}
\indexentry{Hierarchical Phrase-based Grammar|hyperpage}{166}
\indexentry{推导|hyperpage}{167}
\indexentry{Derivation|hyperpage}{167}
\indexentry{胶水规则|hyperpage}{167}
\indexentry{Glue Rule|hyperpage}{167}
\indexentry{乔姆斯基范式|hyperpage}{171}
\indexentry{Chomsky Normal Form|hyperpage}{171}
\indexentry{跨度|hyperpage}{171}
\indexentry{Span|hyperpage}{171}
\indexentry{自下而上的分析|hyperpage}{172}
\indexentry{Top-down Parsing|hyperpage}{172}
\indexentry{束剪枝|hyperpage}{174}
\indexentry{Beam Pruning|hyperpage}{174}
\indexentry{立方剪枝|hyperpage}{176}
\indexentry{Cube Pruning|hyperpage}{176}
\indexentry{序列化|hyperpage}{179}
\indexentry{线性化|hyperpage}{179}
\indexentry{Linearization|hyperpage}{179}
\indexentry{树到串翻译规则|hyperpage}{181}
\indexentry{Tree-to-String Translation Rule|hyperpage}{181}
\indexentry{树到树翻译规则|hyperpage}{181}
\indexentry{Tree-to-Tree Translation Rule|hyperpage}{181}
\indexentry{树片段|hyperpage}{182}
\indexentry{Tree Fragment|hyperpage}{182}
\indexentry{同步树替换文法规则|hyperpage}{183}
\indexentry{Synchronous Tree Substitution Grammar Rule|hyperpage}{183}
\indexentry{边缘集合|hyperpage}{189}
\indexentry{Frontier Set|hyperpage}{189}
\indexentry{最小规则|hyperpage}{190}
\indexentry{Minimal Rules|hyperpage}{190}
\indexentry{二叉化|hyperpage}{194}
\indexentry{Binarization|hyperpage}{194}
\indexentry{基于短语的特征|hyperpage}{198}
\indexentry{基于句法的特征|hyperpage}{198}
\indexentry{有向超图|hyperpage}{199}
\indexentry{Directed Hyper-graph|hyperpage}{199}
\indexentry{超边|hyperpage}{199}
\indexentry{Hyper-edge|hyperpage}{199}
\indexentry{半环分析|hyperpage}{200}
\indexentry{Semi-ring Parsing|hyperpage}{200}
\indexentry{组合|hyperpage}{201}
\indexentry{Composition|hyperpage}{201}
\indexentry{基于串的解码|hyperpage}{201}
\indexentry{String-based Decoding|hyperpage}{201}
\indexentry{基于树的解码|hyperpage}{201}
\indexentry{Tree-based Decoding|hyperpage}{201}
\indexentry{Lexicalized Norm Form|hyperpage}{205}
\indexentry{人工神经网络|hyperpage}{211}
\indexentry{Artificial Neural Networks|hyperpage}{211}
\indexentry{神经网络|hyperpage}{211}
\indexentry{Neural Networks|hyperpage}{211}
\indexentry{深度学习|hyperpage}{212}
\indexentry{Deep Learning|hyperpage}{212}
\indexentry{连接主义|hyperpage}{213}
\indexentry{Connectionism|hyperpage}{213}
\indexentry{分布式表示|hyperpage}{213}
\indexentry{Distributed representation|hyperpage}{213}
\indexentry{符号主义|hyperpage}{213}
\indexentry{Symbolicism|hyperpage}{213}
\indexentry{端到端学习|hyperpage}{215}
\indexentry{End-to-End Learning|hyperpage}{215}
\indexentry{表示学习|hyperpage}{215}
\indexentry{Representation Learning|hyperpage}{215}
\indexentry{分布式表示|hyperpage}{216}
\indexentry{Distributed Representation|hyperpage}{216}
\indexentry{标量|hyperpage}{217}
\indexentry{Scalar|hyperpage}{217}
\indexentry{向量|hyperpage}{217}
\indexentry{Vector|hyperpage}{217}
\indexentry{矩阵|hyperpage}{217}
\indexentry{Matrix|hyperpage}{217}
\indexentry{转置|hyperpage}{218}
\indexentry{Transpose|hyperpage}{218}
\indexentry{按元素加法|hyperpage}{218}
\indexentry{Element-wise Addition|hyperpage}{218}
\indexentry{数乘|hyperpage}{219}
\indexentry{Scalar Multiplication|hyperpage}{219}
\indexentry{按元素乘积|hyperpage}{220}
\indexentry{Element-wise Product|hyperpage}{220}
\indexentry{线性映射|hyperpage}{220}
\indexentry{Linear Mapping|hyperpage}{220}
\indexentry{线性变换|hyperpage}{220}
\indexentry{Linear Transformation|hyperpage}{220}
\indexentry{范数|hyperpage}{221}
\indexentry{Norm|hyperpage}{221}
\indexentry{欧几里得范数|hyperpage}{222}
\indexentry{Euclidean Norm|hyperpage}{222}
\indexentry{Frobenius 范数|hyperpage}{222}
\indexentry{Frobenius Norm|hyperpage}{222}
\indexentry{权重|hyperpage}{223}
\indexentry{weight|hyperpage}{223}
\indexentry{张量|hyperpage}{233}
\indexentry{Tensor|hyperpage}{233}
\indexentry{阶|hyperpage}{233}
\indexentry{Rank|hyperpage}{233}
\indexentry{广播机制|hyperpage}{237}
\indexentry{向量化|hyperpage}{237}
\indexentry{Vectorization|hyperpage}{237}
\indexentry{前向传播|hyperpage}{241}
\indexentry{计算图|hyperpage}{242}
\indexentry{Computation Graph|hyperpage}{242}
\indexentry{模型参数|hyperpage}{243}
\indexentry{Model Parameters|hyperpage}{243}
\indexentry{训练|hyperpage}{243}
\indexentry{Training|hyperpage}{243}
\indexentry{有标注数据|hyperpage}{243}
\indexentry{Annotated Data/Labeled Data|hyperpage}{243}
\indexentry{有指导的训练|hyperpage}{243}
\indexentry{有监督的训练|hyperpage}{243}
\indexentry{Supervised Training|hyperpage}{243}
\indexentry{训练数据集合|hyperpage}{244}
\indexentry{Training Data Set|hyperpage}{244}
\indexentry{损失函数|hyperpage}{244}
\indexentry{Loss Function|hyperpage}{244}
\indexentry{目标函数|hyperpage}{244}
\indexentry{Objective Function|hyperpage}{244}
\indexentry{代价函数|hyperpage}{246}
\indexentry{Cost Function|hyperpage}{246}
\indexentry{梯度下降方法|hyperpage}{246}
\indexentry{Gradient Descent Method|hyperpage}{246}
\indexentry{参数更新的规则|hyperpage}{246}
\indexentry{Update Rule|hyperpage}{246}
\indexentry{学习率|hyperpage}{246}
\indexentry{Learning Rate|hyperpage}{246}
\indexentry{基于梯度的方法|hyperpage}{246}
\indexentry{Gradient-based Method|hyperpage}{246}
\indexentry{批量梯度下降|hyperpage}{247}
\indexentry{Batch Gradient Descent|hyperpage}{247}
\indexentry{随机梯度下降|hyperpage}{247}
\indexentry{Stochastic Gradient Descent|hyperpage}{247}
\indexentry{小批量梯度下降|hyperpage}{247}
\indexentry{Mini-Batch Gradient Descent|hyperpage}{247}
\indexentry{数值微分|hyperpage}{248}
\indexentry{Numerical Differentiation|hyperpage}{248}
\indexentry{截断误差|hyperpage}{248}
\indexentry{Truncation Error|hyperpage}{248}
\indexentry{舍入误差|hyperpage}{248}
\indexentry{Round-off Error|hyperpage}{248}
\indexentry{符号微分|hyperpage}{249}
\indexentry{Symbolic Differentiation|hyperpage}{249}
\indexentry{表达式膨胀|hyperpage}{249}
\indexentry{Expression Swell|hyperpage}{249}
\indexentry{自动微分|hyperpage}{249}
\indexentry{Automatic Differentiation|hyperpage}{249}
\indexentry{反向模式|hyperpage}{250}
\indexentry{Backward Mode|hyperpage}{250}
\indexentry{学习率|hyperpage}{251}
\indexentry{Learning Rate|hyperpage}{251}
\indexentry{Momentum|hyperpage}{251}
\indexentry{AdaGrad|hyperpage}{252}
\indexentry{衰减|hyperpage}{252}
\indexentry{Decay|hyperpage}{252}
\indexentry{RMSprop|hyperpage}{252}
\indexentry{Adam|hyperpage}{253}
\indexentry{数据并行|hyperpage}{253}
\indexentry{同步更新|hyperpage}{254}
\indexentry{Synchronous Update|hyperpage}{254}
\indexentry{异步更新|hyperpage}{254}
\indexentry{Asynchronous Update|hyperpage}{254}
\indexentry{参数服务器|hyperpage}{254}
\indexentry{Parameter Server|hyperpage}{254}
\indexentry{梯度消失|hyperpage}{255}
\indexentry{Gradient Vanishing|hyperpage}{255}
\indexentry{梯度爆炸|hyperpage}{255}
\indexentry{Gradient Explosion|hyperpage}{255}
\indexentry{梯度裁剪|hyperpage}{256}
\indexentry{Gradient Clipping|hyperpage}{256}
\indexentry{批量归一化|hyperpage}{257}
\indexentry{Batch Normalization|hyperpage}{257}
\indexentry{层归一化|hyperpage}{257}
\indexentry{Layer Normalization|hyperpage}{257}
\indexentry{残差网络|hyperpage}{257}
\indexentry{Residual Networks|hyperpage}{257}
\indexentry{跳接|hyperpage}{257}
\indexentry{Shortcut Connection|hyperpage}{257}
\indexentry{过拟合|hyperpage}{258}
\indexentry{Overfitting|hyperpage}{258}
\indexentry{正则化|hyperpage}{258}
\indexentry{Regularization|hyperpage}{258}
\indexentry{反向传播|hyperpage}{259}
\indexentry{back propagation|hyperpage}{259}
\indexentry{神经语言模型|hyperpage}{265}
\indexentry{Neural Language Model|hyperpage}{265}
\indexentry{前馈神经网络语言模型|hyperpage}{266}
\indexentry{Feed-forward Neural Network Language Model|hyperpage}{266}
\indexentry{循环神经网络|hyperpage}{268}
\indexentry{Recurrent Neural Network|hyperpage}{268}
\indexentry{循环神经网络语言模型|hyperpage}{268}
\indexentry{RNNLM|hyperpage}{268}
\indexentry{循环单元|hyperpage}{268}
\indexentry{RNN Cell|hyperpage}{268}
\indexentry{自注意力机制|hyperpage}{270}
\indexentry{Self-Attention Mechanism|hyperpage}{270}
\indexentry{注意力权重|hyperpage}{270}
\indexentry{Attention Weight|hyperpage}{270}
\indexentry{困惑度|hyperpage}{271}
\indexentry{Perplexity|hyperpage}{271}
\indexentry{One-hot编码|hyperpage}{271}
\indexentry{独热编码|hyperpage}{271}
\indexentry{分布式表示|hyperpage}{272}
\indexentry{Distributed Representation|hyperpage}{272}
\indexentry{词嵌入|hyperpage}{272}
\indexentry{Word Embedding|hyperpage}{272}
\indexentry{句子表示模型|hyperpage}{274}
\indexentry{句子的表示|hyperpage}{274}
\indexentry{表示学习|hyperpage}{274}
\indexentry{Representation Learning|hyperpage}{274}
\indexentry{可解释机器学习|hyperpage}{278}
\indexentry{Explainable Machine Learning|hyperpage}{278}
\indexentry{神经机器翻译|hyperpage}{281}
\indexentry{Neural Machine Translation|hyperpage}{281}
\indexentry{分布式表示|hyperpage}{283}
\indexentry{Distributed Representation|hyperpage}{283}
\indexentry{特征工程|hyperpage}{289}
\indexentry{Feature Engineering|hyperpage}{289}
\indexentry{编码器-解码器模型|hyperpage}{290}
\indexentry{Encoder-Decoder Paradigm|hyperpage}{290}
\indexentry{编码器-解码器框架|hyperpage}{290}
\indexentry{循环神经网络|hyperpage}{295}
\indexentry{Recurrent Neural Network, RNN|hyperpage}{295}
\indexentry{词嵌入|hyperpage}{297}
\indexentry{Word Embedding|hyperpage}{297}
\indexentry{表示学习|hyperpage}{297}
\indexentry{Representation Learning|hyperpage}{297}
\indexentry{生成|hyperpage}{297}
\indexentry{Generation|hyperpage}{297}
\indexentry{长短时记忆|hyperpage}{302}
\indexentry{Long Short-Term Memory|hyperpage}{302}
\indexentry{遗忘|hyperpage}{302}
\indexentry{记忆更新|hyperpage}{303}
\indexentry{输出|hyperpage}{303}
\indexentry{门循环单元|hyperpage}{304}
\indexentry{Gated Recurrent Unit,GRU|hyperpage}{304}
\indexentry{注意力权重|hyperpage}{309}
\indexentry{Attention Weight|hyperpage}{309}
\indexentry{一阶矩估计|hyperpage}{315}
\indexentry{First Moment Estimation|hyperpage}{315}
\indexentry{二阶矩估计|hyperpage}{315}
\indexentry{Second Moment Estimation|hyperpage}{315}
\indexentry{学习率|hyperpage}{316}
\indexentry{Learning Rate|hyperpage}{316}
\indexentry{逐渐预热|hyperpage}{316}
\indexentry{Gradual Warmup|hyperpage}{316}
\indexentry{分段常数衰减|hyperpage}{317}
\indexentry{Piecewise Constant Decay|hyperpage}{317}
\indexentry{数据并行|hyperpage}{318}
\indexentry{模型并行|hyperpage}{318}
\indexentry{全搜索|hyperpage}{320}
\indexentry{Full Search|hyperpage}{320}
\indexentry{贪婪搜索|hyperpage}{320}
\indexentry{Greedy Search|hyperpage}{320}
\indexentry{束搜索|hyperpage}{320}
\indexentry{Beam Search|hyperpage}{320}
\indexentry{自回归模型|hyperpage}{320}
\indexentry{Autoregressive Model|hyperpage}{321}
\indexentry{非自回归模型|hyperpage}{321}
\indexentry{Non-autoregressive Model|hyperpage}{321}
\indexentry{自注意力机制|hyperpage}{326}
\indexentry{Self-Attention|hyperpage}{326}
\indexentry{特征提取|hyperpage}{327}
\indexentry{自注意力子层|hyperpage}{328}
\indexentry{Self-attention Sub-layer|hyperpage}{328}
\indexentry{前馈神经网络子层|hyperpage}{328}
\indexentry{Feed-forward Sub-layer|hyperpage}{328}
\indexentry{残差连接|hyperpage}{328}
\indexentry{Residual Connection|hyperpage}{328}
\indexentry{层正则化|hyperpage}{328}
\indexentry{Layer Normalization|hyperpage}{328}
\indexentry{编码-解码注意力子层|hyperpage}{329}
\indexentry{Encoder-decoder Attention Sub-layer|hyperpage}{329}
\indexentry{词嵌入|hyperpage}{329}
\indexentry{Word Embedding|hyperpage}{329}
\indexentry{位置编码|hyperpage}{329}
\indexentry{Position Embedding|hyperpage}{329}
\indexentry{点乘注意力|hyperpage}{333}
\indexentry{Scaled Dot-Product Attention|hyperpage}{333}
\indexentry{多头注意力|hyperpage}{335}
\indexentry{Multi-head Attention|hyperpage}{335}
\indexentry{残差连接|hyperpage}{336}
\indexentry{短连接|hyperpage}{337}
\indexentry{Short-cut Connection|hyperpage}{337}
\indexentry{后正则化|hyperpage}{338}
\indexentry{Post-norm|hyperpage}{338}
\indexentry{前正则化|hyperpage}{338}
\indexentry{Pre-norm|hyperpage}{338}
\indexentry{交叉熵损失|hyperpage}{339}
\indexentry{Cross Entropy Loss|hyperpage}{339}
\indexentry{预热|hyperpage}{339}
\indexentry{Warmup|hyperpage}{339}
\indexentry{小批量训练|hyperpage}{340}
\indexentry{Mini-batch Training|hyperpage}{340}
\indexentry{Dropout|hyperpage}{340}
\indexentry{过拟合|hyperpage}{340}
\indexentry{Over fitting|hyperpage}{340}
\indexentry{标签平滑|hyperpage}{340}
\indexentry{Label Smoothing|hyperpage}{340}
\indexentry{序列到序列的转换/生成问题|hyperpage}{342}
\indexentry{Sequence-to-Sequence Problem|hyperpage}{342}
\indexentry{未登录词|hyperpage}{353}
\indexentry{Out of Vocabulary Word,OOV Word|hyperpage}{353}
\indexentry{子词切分|hyperpage}{353}
\indexentry{Sub-word Segmentation|hyperpage}{353}
\indexentry{标准化|hyperpage}{353}
\indexentry{Normalization|hyperpage}{353}
\indexentry{数据清洗|hyperpage}{353}
\indexentry{Dada Cleaning|hyperpage}{353}
\indexentry{数据选择|hyperpage}{355}
\indexentry{Data Selection|hyperpage}{355}
\indexentry{数据过滤|hyperpage}{355}
\indexentry{Data Filtering|hyperpage}{355}
\indexentry{开放词表|hyperpage}{358}
\indexentry{Open-Vocabulary|hyperpage}{358}
\indexentry{子词|hyperpage}{359}
\indexentry{Sub-word|hyperpage}{359}
\indexentry{字节对编码|hyperpage}{359}
\indexentry{双字节编码|hyperpage}{359}
\indexentry{Byte Pair Encoding,BPE|hyperpage}{359}
\indexentry{正则化|hyperpage}{362}
\indexentry{Regularization|hyperpage}{362}
\indexentry{过拟合问题|hyperpage}{362}
\indexentry{Overfitting Problem|hyperpage}{362}
\indexentry{反问题|hyperpage}{362}
\indexentry{Inverse Problem|hyperpage}{362}
\indexentry{适定的|hyperpage}{363}
\indexentry{Well-posed|hyperpage}{363}
\indexentry{不适定问题|hyperpage}{363}
\indexentry{Ill-posed Problem|hyperpage}{363}
\indexentry{降噪|hyperpage}{363}
\indexentry{Denoising|hyperpage}{363}
\indexentry{泛化|hyperpage}{364}
\indexentry{Generalization|hyperpage}{364}
\indexentry{标签平滑|hyperpage}{365}
\indexentry{Label Smoothing|hyperpage}{365}
\indexentry{相互适应|hyperpage}{366}
\indexentry{Co-Adaptation|hyperpage}{366}
\indexentry{集成学习|hyperpage}{368}
\indexentry{Ensemble Learning|hyperpage}{368}
\indexentry{容量|hyperpage}{369}
\indexentry{Capacity|hyperpage}{369}
\indexentry{宽残差网络|hyperpage}{369}
\indexentry{Wide Residual Network|hyperpage}{369}
\indexentry{探测任务|hyperpage}{371}
\indexentry{Probing Task|hyperpage}{371}
\indexentry{表面信息|hyperpage}{371}
\indexentry{Surface Information|hyperpage}{371}
\indexentry{语法信息|hyperpage}{371}
\indexentry{Syntactic Information|hyperpage}{371}
\indexentry{语义信息|hyperpage}{371}
\indexentry{Semantic Information|hyperpage}{371}
\indexentry{词嵌入|hyperpage}{371}
\indexentry{Embedding|hyperpage}{371}
\indexentry{数据并行|hyperpage}{372}
\indexentry{Data Parallelism|hyperpage}{372}
\indexentry{模型并行|hyperpage}{372}
\indexentry{Model Parallelism|hyperpage}{372}
\indexentry{小批量训练|hyperpage}{372}
\indexentry{Mini-batch Training|hyperpage}{372}
\indexentry{课程学习|hyperpage}{374}
\indexentry{Curriculum Learning|hyperpage}{374}
\indexentry{推断|hyperpage}{375}
\indexentry{Inference|hyperpage}{375}
\indexentry{解码|hyperpage}{375}
\indexentry{Decoding|hyperpage}{375}
\indexentry{准确性|hyperpage}{375}
\indexentry{Accuracy|hyperpage}{375}
\indexentry{时延|hyperpage}{375}
\indexentry{Latency|hyperpage}{375}
\indexentry{时延|hyperpage}{375}
\indexentry{Memory|hyperpage}{375}
\indexentry{搜索错误|hyperpage}{375}
\indexentry{Search Error|hyperpage}{375}
\indexentry{模型错误|hyperpage}{375}
\indexentry{Modeling Error|hyperpage}{375}
\indexentry{重排序|hyperpage}{377}
\indexentry{Re-ranking|hyperpage}{377}
\indexentry{双向推断|hyperpage}{377}
\indexentry{Bidirectional Inference|hyperpage}{377}
\indexentry{批量推断|hyperpage}{381}
\indexentry{Batch Inference|hyperpage}{381}
\indexentry{批量处理|hyperpage}{381}
\indexentry{Batching|hyperpage}{381}
\indexentry{二值网络|hyperpage}{383}
\indexentry{Binarized Neural Networks|hyperpage}{383}
\indexentry{自回归翻译|hyperpage}{383}
\indexentry{Autoregressive Translation|hyperpage}{383}
\indexentry{非自回归翻译|hyperpage}{383}
\indexentry{Regressive Translation|hyperpage}{383}
\indexentry{繁衍率|hyperpage}{383}
\indexentry{Fertility|hyperpage}{383}
\indexentry{偏置|hyperpage}{385}
\indexentry{Bias|hyperpage}{385}
\indexentry{退化|hyperpage}{385}
\indexentry{Degenerate|hyperpage}{385}
\indexentry{过翻译|hyperpage}{386}
\indexentry{Over Translation|hyperpage}{386}
\indexentry{欠翻译|hyperpage}{386}
\indexentry{Under Translation|hyperpage}{386}
\indexentry{充分性|hyperpage}{387}
\indexentry{Adequacy|hyperpage}{387}
\indexentry{系统融合|hyperpage}{388}
\indexentry{System Combination|hyperpage}{388}
\indexentry{假设选择|hyperpage}{388}
\indexentry{Hypothesis Selection|hyperpage}{388}
\indexentry{多样性|hyperpage}{388}
\indexentry{Diversity|hyperpage}{388}
\indexentry{重排序|hyperpage}{389}
\indexentry{Re-ranking|hyperpage}{389}
\indexentry{混淆网络|hyperpage}{390}
\indexentry{Confusion Network|hyperpage}{390}
\indexentry{动态线性层聚合方法|hyperpage}{394}
\indexentry{Dynamic Linear Combination of Layers,DLCL|hyperpage}{394}
\indexentry{相互适应|hyperpage}{398}
\indexentry{Co-adaptation|hyperpage}{398}
\indexentry{数据增强|hyperpage}{401}
\indexentry{Data Augmentation|hyperpage}{401}
\indexentry{回译|hyperpage}{401}
\indexentry{Back Translation|hyperpage}{401}
\indexentry{迭代式回译|hyperpage}{401}
\indexentry{Iterative Back Translation|hyperpage}{401}
\indexentry{前向翻译|hyperpage}{402}
\indexentry{Forward Translation|hyperpage}{402}
\indexentry{预训练|hyperpage}{402}
\indexentry{Pre-training|hyperpage}{402}
\indexentry{微调|hyperpage}{402}
\indexentry{Fine-tuning|hyperpage}{402}
\indexentry{多任务学习|hyperpage}{404}
\indexentry{Multitask Learning|hyperpage}{404}
\indexentry{模型压缩|hyperpage}{405}
\indexentry{Model Compression|hyperpage}{405}
\indexentry{学习难度|hyperpage}{405}
\indexentry{Learning Difficulty|hyperpage}{406}
\indexentry{教师模型|hyperpage}{406}
\indexentry{Teacher Model|hyperpage}{406}
\indexentry{学生模型|hyperpage}{406}
\indexentry{Student Model|hyperpage}{406}
\indexentry{基于单词的知识精炼|hyperpage}{406}
\indexentry{Word-level Knowledge Distillation|hyperpage}{406}
\indexentry{基于序列的知识精炼|hyperpage}{407}
\indexentry{Sequence-level Knowledge Distillation|hyperpage}{407}
\indexentry{中间层输出|hyperpage}{408}
\indexentry{Hint-based Knowledge Transfer|hyperpage}{408}
\indexentry{注意力分布|hyperpage}{408}
\indexentry{Attention To Attention Transfer|hyperpage}{408}
\indexentry{循环一致性|hyperpage}{410}
\indexentry{Circle Consistency|hyperpage}{410}
\indexentry{翻译中回译|hyperpage}{411}
\indexentry{On-the-fly Back-translation|hyperpage}{411}
\indexentry{网络结构搜索技术|hyperpage}{414}
\indexentry{Neural Architecture Search;NAS|hyperpage}{414}
\boolfalse {citerequest}\boolfalse {citetracker}\boolfalse {pagetracker}\boolfalse {backtracker}\relax \boolfalse {citerequest}\boolfalse {citetracker}\boolfalse {pagetracker}\boolfalse {backtracker}\relax
\babel@toc {english}{}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {part}{\@mypartnumtocformat {I}{统计机器翻译}}{9}{part.1}% \select@language {english}
\defcounter {refsection}{0}\relax
\contentsline {part}{\@mypartnumtocformat {I}{机器翻译基础}}{15}{part.1}
\ttl@starttoc {default@1} \ttl@starttoc {default@1}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {chapter}{\numberline {1}基于词的机器翻译模型}{11}{chapter.1}% \contentsline {chapter}{\numberline {1}机器翻译简介}{17}{chapter.1}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {1.1}机器翻译的概念}{17}{section.1.1}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {1.2}机器翻译简史}{20}{section.1.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.2.1}人工翻译}{20}{subsection.1.2.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.2.2}机器翻译的萌芽}{21}{subsection.1.2.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.2.3}机器翻译的受挫}{22}{subsection.1.2.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.2.4}机器翻译的快速成长}{23}{subsection.1.2.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.2.5}机器翻译的爆发}{24}{subsection.1.2.5}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {1.3}机器翻译现状}{25}{section.1.3}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {1.4}机器翻译方法}{27}{section.1.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.4.1}基于规则的机器翻译}{27}{subsection.1.4.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.4.2}基于实例的机器翻译}{28}{subsection.1.4.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.4.3}统计机器翻译}{29}{subsection.1.4.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.4.4}神经机器翻译}{30}{subsection.1.4.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.4.5}对比分析}{31}{subsection.1.4.5}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {1.5}翻译质量评价}{32}{section.1.5}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.5.1}人工评价}{32}{subsection.1.5.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.5.2}自动评价}{33}{subsection.1.5.2}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{BLEU}{33}{section*.17}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{TER}{35}{section*.18}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于检测点的评价}{35}{section*.19}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {1.6}机器翻译应用}{36}{section.1.6}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {1.7}开源项目与评测}{38}{section.1.7}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.7.1}开源机器翻译系统}{38}{subsection.1.7.1}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{统计机器翻译开源系统}{39}{section*.21}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{神经机器翻译开源系统}{40}{section*.22}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.7.2}常用数据集及公开评测任务}{42}{subsection.1.7.2}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {1.8}推荐学习资源}{44}{section.1.8}
\defcounter {refsection}{0}\relax
\contentsline {chapter}{\numberline {2}词法、语法及统计建模基础}{49}{chapter.2}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {2.1}问题概述 }{50}{section.2.1}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {2.2}概率论基础}{51}{section.2.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.2.1}随机变量和概率}{52}{subsection.2.2.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.2.2}联合概率、条件概率和边缘概率}{53}{subsection.2.2.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.2.3}链式法则}{54}{subsection.2.2.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.2.4}贝叶斯法则}{55}{subsection.2.2.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.2.5}KL距离和熵}{57}{subsection.2.2.5}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{信息熵}{57}{section*.29}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{KL距离}{58}{section*.31}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{交叉熵}{58}{section*.32}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {2.3}中文分词}{59}{section.2.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.3.1}基于词典的分词方法}{60}{subsection.2.3.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.3.2}基于统计的分词方法}{61}{subsection.2.3.2}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{统计模型的学习与推断}{61}{section*.36}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{掷骰子游戏}{62}{section*.38}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{全概率分词方法}{64}{section*.42}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {2.4}$n$-gram语言模型 }{66}{section.2.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.4.1}建模}{67}{subsection.2.4.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.4.2}未登录词和平滑算法}{69}{subsection.2.4.2}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{加法平滑方法}{70}{section*.48}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{古德-图灵估计法}{71}{section*.50}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{Kneser-Ney平滑方法}{72}{section*.52}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {2.5}句法分析(短语结构分析)}{74}{section.2.5}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.5.1}句子的句法树表示}{74}{subsection.2.5.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.5.2}上下文无关文法}{76}{subsection.2.5.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {2.5.3}规则和推导的概率}{81}{subsection.2.5.3}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {2.6}小结及深入阅读}{83}{section.2.6}
\defcounter {refsection}{0}\relax
\contentsline {part}{\@mypartnumtocformat {II}{统计机器翻译}}{85}{part.2}
\ttl@stoptoc {default@1}
\ttl@starttoc {default@2}
\defcounter {refsection}{0}\relax
\contentsline {chapter}{\numberline {3}基于词的机器翻译模型}{87}{chapter.3}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {3.1}什么是基于词的翻译模型}{87}{section.3.1}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {3.2}构建一个简单的机器翻译系统}{89}{section.3.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.2.1}如何进行翻译?}{89}{subsection.3.2.1}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{机器翻译流程}{90}{section*.65}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{人工翻译 vs. 机器翻译}{91}{section*.67}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.2.2}基本框架}{91}{subsection.3.2.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.2.3}单词翻译概率}{92}{subsection.3.2.3}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{什么是单词翻译概率?}{92}{section*.69}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{如何从一个双语平行数据中学习?}{93}{section*.71}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{如何从大量的双语平行数据中学习?}{94}{section*.72}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.2.4}句子级翻译模型}{95}{subsection.3.2.4}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基础模型}{95}{section*.74}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{生成流畅的译文}{97}{section*.76}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.2.5}解码}{99}{subsection.3.2.5}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {3.3}基于词的翻译建模}{101}{section.3.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.3.1}噪声信道模型}{101}{subsection.3.3.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.3.2}统计机器翻译的三个基本问题}{104}{subsection.3.3.2}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{词对齐}{104}{section*.86}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于词对齐的翻译模型}{105}{section*.89}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于词对齐的翻译实例}{107}{section*.91}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {3.4}IBM模型1-2}{107}{section.3.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.4.1}IBM模型1}{108}{subsection.3.4.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.4.2}IBM模型2}{109}{subsection.3.4.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.4.3}解码及计算优化}{110}{subsection.3.4.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.4.4}训练}{112}{subsection.3.4.4}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{目标函数}{112}{section*.96}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{优化}{113}{section*.98}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {3.5}IBM模型3-5及隐马尔可夫模型}{119}{section.3.5}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.5.1}基于产出率的翻译模型}{119}{subsection.3.5.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.5.2}IBM 模型3}{122}{subsection.3.5.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.5.3}IBM 模型4}{123}{subsection.3.5.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.5.4} IBM 模型5}{125}{subsection.3.5.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.5.5}隐马尔可夫模型}{126}{subsection.3.5.5}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{隐马尔可夫模型}{126}{section*.110}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{词对齐模型}{127}{section*.112}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.5.6}解码和训练}{128}{subsection.3.5.6}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {3.6}问题分析}{129}{section.3.6}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.6.1}词对齐及对称化}{129}{subsection.3.6.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.6.2}Deficiency}{130}{subsection.3.6.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.6.3}句子长度}{131}{subsection.3.6.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {3.6.4}其他问题}{131}{subsection.3.6.4}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {3.7}小结及深入阅读}{132}{section.3.7}
\defcounter {refsection}{0}\relax
\contentsline {chapter}{\numberline {4}基于短语和句法的机器翻译模型}{135}{chapter.4}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {4.1}翻译中的结构信息}{135}{section.4.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.1.1}更大粒度的翻译单元}{136}{subsection.4.1.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.1.2}句子的结构信息}{138}{subsection.4.1.2}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {4.2}基于短语的翻译模型}{140}{section.4.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.2.1}机器翻译中的短语}{140}{subsection.4.2.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.2.2}数学建模及判别式模型}{143}{subsection.4.2.2}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于翻译推导的建模}{143}{section*.124}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{对数线性模型}{144}{section*.125}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{搭建模型的基本流程}{145}{section*.126}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.2.3}短语抽取}{146}{subsection.4.2.3}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{与词对齐一致的短语}{147}{section*.129}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{获取词对齐}{148}{section*.133}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{度量双语短语质量}{149}{section*.135}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.2.4}调序}{150}{subsection.4.2.4}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于距离的调序}{151}{section*.139}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于方向的调序}{151}{section*.141}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于分类的调序}{152}{section*.144}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.2.5}特征}{153}{subsection.4.2.5}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.2.6}最小错误率训练}{154}{subsection.4.2.6}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.2.7}栈解码}{157}{subsection.4.2.7}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{翻译候选匹配}{158}{section*.149}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{翻译假设扩展}{159}{section*.151}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{剪枝}{160}{section*.153}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{解码中的栈结构}{161}{section*.155}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {4.3}基于层次短语的模型}{162}{section.4.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.3.1}同步上下文无关文法}{164}{subsection.4.3.1}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{文法定义}{165}{section*.160}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{推导}{166}{section*.161}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{胶水规则}{167}{section*.162}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{处理流程}{168}{section*.163}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.3.2}层次短语规则抽取}{168}{subsection.4.3.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.3.3}翻译模型及特征}{170}{subsection.4.3.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.3.4}CKY解码}{171}{subsection.4.3.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.3.5}立方剪枝}{174}{subsection.4.3.5}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {4.4}基于语言学句法的模型}{177}{section.4.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.4.1}基于句法的翻译模型分类}{179}{subsection.4.4.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.4.2}基于树结构的文法}{179}{subsection.4.4.2}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{树到树翻译规则}{182}{section*.179}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于树结构的翻译推导}{183}{section*.181}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{树到串翻译规则}{185}{section*.184}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.4.3}树到串翻译规则抽取}{186}{subsection.4.4.3}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{树的切割与最小规则}{186}{section*.186}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{空对齐处理}{190}{section*.192}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{组合规则}{191}{section*.194}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{SPMT规则}{191}{section*.196}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{句法树二叉化}{192}{section*.198}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.4.4}树到树翻译规则抽取}{194}{subsection.4.4.4}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于节点对齐的规则抽取}{195}{section*.202}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于对齐矩阵的规则抽取}{195}{section*.205}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.4.5}句法翻译模型的特征}{196}{subsection.4.4.5}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.4.6}基于超图的推导空间表示}{199}{subsection.4.4.6}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {4.4.7}基于树的解码 vs 基于串的解码}{201}{subsection.4.4.7}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于树的解码}{202}{section*.213}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于串的解码}{204}{section*.216}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {4.5}小结及深入阅读}{206}{section.4.5}
\defcounter {refsection}{0}\relax
\contentsline {part}{\@mypartnumtocformat {III}{神经机器翻译}}{209}{part.3}
\ttl@stoptoc {default@2}
\ttl@starttoc {default@3}
\defcounter {refsection}{0}\relax
\contentsline {chapter}{\numberline {5}人工神经网络和神经语言建模}{211}{chapter.5}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {5.1}深度学习与人工神经网络}{212}{section.5.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.1.1}发展简史}{212}{subsection.5.1.1}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{早期的人工神经网络和第一次寒冬}{212}{section*.218}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{神经网络的第二次高潮和第二次寒冬}{213}{section*.219}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{深度学习和神经网络方法的崛起}{214}{section*.220}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.1.2}为什么需要深度学习}{215}{subsection.5.1.2}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{端到端学习和表示学习}{215}{section*.222}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{深度学习的效果}{216}{section*.224}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {5.2}神经网络基础}{216}{section.5.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.2.1}线性代数基础}{216}{subsection.5.2.1}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{标量、向量和矩阵}{217}{section*.226}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{矩阵的转置}{218}{section*.227}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{矩阵加法和数乘}{218}{section*.228}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{矩阵乘法和矩阵点乘}{219}{section*.229}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{线性映射}{220}{section*.230}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{范数}{221}{section*.231}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.2.2}人工神经元和感知机}{222}{subsection.5.2.2}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{感知机\ \raisebox {0.5mm}{------}\ 最简单的人工神经元模型}{223}{section*.234}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{神经元内部权重}{224}{section*.237}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{神经元的输入\ \raisebox {0.5mm}{------}\ 离散 vs 连续}{225}{section*.239}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{神经元内部的参数学习}{225}{section*.241}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.2.3}多层神经网络}{226}{subsection.5.2.3}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{线性变换和激活函数}{226}{section*.243}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{单层神经网络$\rightarrow $多层神经网络}{229}{section*.250}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.2.4}函数拟合能力}{230}{subsection.5.2.4}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {5.3}神经网络的张量实现}{233}{section.5.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.3.1} 张量及其计算}{233}{subsection.5.3.1}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{\ 张量}{233}{section*.259}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{张量的矩阵乘法}{235}{section*.262}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{张量的单元操作}{236}{section*.264}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.3.2}张量的物理存储形式}{237}{subsection.5.3.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.3.3}使用开源框架实现张量计算}{238}{subsection.5.3.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.3.4}前向传播与计算图}{241}{subsection.5.3.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.3.5}神经网络实例}{242}{subsection.5.3.5}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {5.4}神经网络的参数训练}{243}{section.5.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.4.1}损失函数}{244}{subsection.5.4.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.4.2}基于梯度的参数优化}{245}{subsection.5.4.2}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{梯度下降}{246}{section*.278}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{梯度获取}{248}{section*.280}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于梯度的方法的变种和改进}{250}{section*.284}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.4.3}参数更新的并行化策略}{253}{subsection.5.4.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.4.4}梯度消失、梯度爆炸和稳定性训练}{255}{subsection.5.4.4}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{易于优化的激活函数}{255}{section*.287}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{梯度裁剪}{256}{section*.291}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{稳定性训练}{257}{section*.292}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.4.5}过拟合}{258}{subsection.5.4.5}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.4.6}反向传播}{259}{subsection.5.4.6}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{输出层的反向传播}{260}{section*.295}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{隐藏层的反向传播}{262}{section*.299}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{程序实现}{264}{section*.302}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {5.5}神经语言模型}{265}{section.5.5}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.5.1}基于神经网络的语言建模}{265}{subsection.5.5.1}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于前馈神经网络的语言模型}{266}{section*.305}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于循环神经网络的语言模型}{268}{section*.308}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于自注意力机制的语言模型}{269}{section*.310}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{语言模型的评价}{271}{section*.312}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.5.2}单词表示模型}{271}{subsection.5.5.2}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{One-hot编码}{271}{section*.313}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{分布式表示}{272}{section*.315}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {5.5.3}句子表示模型及预训练}{273}{subsection.5.5.3}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{简单的上下文表示模型}{274}{section*.319}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{ELMO模型}{275}{section*.322}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{GPT模型}{275}{section*.324}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{BERT模型}{276}{section*.326}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{为什么要预训练?}{277}{section*.328}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {5.6}小结及深入阅读}{278}{section.5.6}
\defcounter {refsection}{0}\relax
\contentsline {chapter}{\numberline {6}神经机器翻译模型}{281}{chapter.6}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {6.1}神经机器翻译的发展简史}{281}{section.6.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.1.1}神经机器翻译的起源}{283}{subsection.6.1.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.1.2}神经机器翻译的品质 }{285}{subsection.6.1.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.1.3}神经机器翻译的优势 }{288}{subsection.6.1.3}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {6.2}编码器-解码器框架}{290}{section.6.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.2.1}框架结构}{290}{subsection.6.2.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.2.2}表示学习}{291}{subsection.6.2.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.2.3}简单的运行实例}{292}{subsection.6.2.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.2.4}机器翻译范式的对比}{293}{subsection.6.2.4}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {6.3}基于循环神经网络的翻译模型及注意力机制}{294}{section.6.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.3.1}建模}{295}{subsection.6.3.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.3.2}输入(词嵌入)及输出(Softmax)}{298}{subsection.6.3.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.3.3}循环神经网络结构}{301}{subsection.6.3.3}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{循环神经单元(RNN)}{301}{section*.351}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{长短时记忆网络(LSTM)}{302}{section*.352}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{门控循环单元(GRU)}{304}{section*.355}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{双向模型}{305}{section*.357}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{多层循环神经网络}{306}{section*.359}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.3.4}注意力机制}{306}{subsection.6.3.4}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{翻译中的注意力机制}{307}{section*.362}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{上下文向量的计算}{309}{section*.365}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{注意力机制的解读}{312}{section*.370}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.3.5}训练}{313}{subsection.6.3.5}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{损失函数}{314}{section*.373}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{参数初始化}{314}{section*.374}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{优化策略}{315}{section*.375}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{梯度裁剪}{315}{section*.377}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{学习率策略}{316}{section*.378}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{并行训练}{317}{section*.381}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.3.6}推断}{320}{subsection.6.3.6}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{贪婪搜索}{321}{section*.386}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{束搜索}{321}{section*.389}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{长度惩罚}{323}{section*.391}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.3.7}实例-GNMT}{324}{subsection.6.3.7}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {6.4}Transformer}{324}{section.6.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.4.1}自注意力模型}{326}{subsection.6.4.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.4.2}Transformer架构}{328}{subsection.6.4.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.4.3}位置编码}{330}{subsection.6.4.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.4.4}基于点乘的注意力机制}{332}{subsection.6.4.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.4.5}掩码操作}{334}{subsection.6.4.5}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.4.6}多头注意力}{335}{subsection.6.4.6}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.4.7}残差网络和层正则化}{336}{subsection.6.4.7}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.4.8}前馈全连接网络子层}{338}{subsection.6.4.8}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.4.9}训练}{339}{subsection.6.4.9}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.4.10}推断}{341}{subsection.6.4.10}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {6.5}序列到序列问题及应用}{342}{section.6.5}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.5.1}自动问答}{342}{subsection.6.5.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.5.2}自动文摘}{343}{subsection.6.5.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.5.3}文言文翻译}{343}{subsection.6.5.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.5.4}对联生成}{344}{subsection.6.5.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {6.5.5}古诗生成}{344}{subsection.6.5.5}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {6.6}小结及深入阅读}{346}{section.6.6}
\defcounter {refsection}{0}\relax
\contentsline {chapter}{\numberline {7}神经机器翻译实战 \ \raisebox {0.5mm}{------}\ 参加一次比赛}{349}{chapter.7}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {7.1}神经机器翻译并不简单}{349}{section.7.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.1.1}影响神经机器翻译性能的因素}{350}{subsection.7.1.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.1.2}搭建神经机器翻译系统的步骤 }{351}{subsection.7.1.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.1.3}架构选择 }{352}{subsection.7.1.3}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {7.2}数据处理}{352}{section.7.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.2.1}分词}{353}{subsection.7.2.1}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.2.2}标准化}{354}{subsection.7.2.2}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.2.3}数据清洗}{355}{subsection.7.2.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.2.4}子词切分}{357}{subsection.7.2.4}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{大词表和OOV问题}{358}{section*.428}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{子词}{358}{section*.430}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{双字节编码(BPE)}{359}{section*.432}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{其他方法}{362}{section*.435}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {7.3}建模与训练}{362}{section.7.3}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.3.1}正则化}{362}{subsection.7.3.1}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{L1/L2正则化}{364}{section*.437}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{标签平滑}{365}{section*.438}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{Dropout}{366}{section*.440}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{Layer Dropout}{368}{section*.443}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.3.2}增大模型容量}{369}{subsection.7.3.2}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{宽网络}{369}{section*.445}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{深网络}{370}{section*.447}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{增大输入层和输出层表示能力}{371}{section*.449}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{大模型的分布式计算}{372}{section*.450}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.3.3}大批量训练}{372}{subsection.7.3.3}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{为什么需要大批量训练}{372}{section*.451}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{如何构建批次}{373}{section*.454}
\defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {7.4}推断}{375}{section.7.4}
\defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {7.4.1}推断优化}{375}{subsection.7.4.1}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{推断系统的架构}{375}{section*.456}
\defcounter {refsection}{0}\relax
\contentsline {subsubsection}{自左向右推断 vs 自右向左推断}{376}{section*.458}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {1.1}什么是基于词的翻译模型}{11}{section.1.1}% \contentsline {subsubsection}{推断加速}{377}{section*.459}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {1.2}构建一个简单的机器翻译系统}{13}{section.1.2}% \contentsline {subsection}{\numberline {7.4.2}译文长度控制}{384}{subsection.7.4.2}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.2.1}如何进行翻译?}{13}{subsection.1.2.1}% \contentsline {subsubsection}{长度惩罚因子}{385}{section*.465}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsubsection}{机器翻译流程}{14}{section*.7}% \contentsline {subsubsection}{译文长度范围约束}{386}{section*.467}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsubsection}{人工翻译 vs. 机器翻译}{15}{section*.9}% \contentsline {subsubsection}{覆盖度模型}{386}{section*.468}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.2.2}基本框架}{15}{subsection.1.2.2}% \contentsline {subsection}{\numberline {7.4.3}多模型集成}{387}{subsection.7.4.3}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.2.3}单词翻译概率}{16}{subsection.1.2.3}% \contentsline {subsubsection}{假设选择}{388}{section*.469}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsubsection}{什么是单词翻译概率?}{16}{section*.11}% \contentsline {subsubsection}{局部预测融合}{389}{section*.471}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsubsection}{如何从一个双语平行数据中学习?}{17}{section*.13}% \contentsline {subsubsection}{译文重组}{390}{section*.473}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsubsection}{如何从大量的双语平行数据中学习?}{18}{section*.14}% \contentsline {section}{\numberline {7.5}进阶技术}{391}{section.7.5}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.2.4}句子级翻译模型}{19}{subsection.1.2.4}% \contentsline {subsection}{\numberline {7.5.1}深层模型}{391}{subsection.7.5.1}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基础模型}{19}{section*.16}% \contentsline {subsubsection}{Post-Norm vs Pre-Norm}{392}{section*.476}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsubsection}{生成流畅的译文}{21}{section*.18}% \contentsline {subsubsection}{层聚合}{394}{section*.479}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.2.5}解码}{23}{subsection.1.2.5}% \contentsline {subsubsection}{深层模型的训练加速}{395}{section*.481}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {1.3}基于词的翻译建模}{26}{section.1.3}% \contentsline {subsubsection}{渐进式训练}{395}{section*.482}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.3.1}噪声信道模型}{26}{subsection.1.3.1}% \contentsline {subsubsection}{分组稠密连接}{396}{section*.484}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.3.2}统计机器翻译的三个基本问题}{28}{subsection.1.3.2}% \contentsline {subsubsection}{学习率重置策略}{397}{section*.486}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsubsection}{词对齐}{29}{section*.27}% \contentsline {subsubsection}{深层模型的鲁棒性训练}{398}{section*.488}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于词对齐的翻译模型}{30}{section*.30}% \contentsline {subsection}{\numberline {7.5.2}单语数据的使用}{400}{subsection.7.5.2}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsubsection}{基于词对齐的翻译实例}{31}{section*.32}% \contentsline {subsubsection}{伪数据}{401}{section*.492}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {1.4}IBM模型1-2}{32}{section.1.4}% \contentsline {subsubsection}{预训练}{402}{section*.495}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.4.1}IBM模型1}{32}{subsection.1.4.1}% \contentsline {subsubsection}{联合训练}{404}{section*.498}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.4.2}IBM模型2}{34}{subsection.1.4.2}% \contentsline {subsection}{\numberline {7.5.3}知识精炼}{404}{subsection.7.5.3}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.4.3}解码及计算优化}{35}{subsection.1.4.3}% \contentsline {subsubsection}{什么是知识精炼}{405}{section*.500}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.4.4}训练}{36}{subsection.1.4.4}% \contentsline {subsubsection}{知识精炼的基本方法}{406}{section*.501}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsubsection}{目标函数}{36}{section*.37}% \contentsline {subsubsection}{机器翻译中的知识精炼}{408}{section*.503}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsubsection}{优化}{37}{section*.39}% \contentsline {subsection}{\numberline {7.5.4}双向训练}{408}{subsection.7.5.4}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {1.5}IBM模型3-5及隐马尔可夫模型}{42}{section.1.5}% \contentsline {subsubsection}{有监督对偶学习}{410}{section*.505}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.5.1}基于产出率的翻译模型}{44}{subsection.1.5.1}% \contentsline {subsubsection}{无监督对偶学习}{410}{section*.506}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.5.2}IBM 模型3}{46}{subsection.1.5.2}% \contentsline {subsubsection}{翻译中回译}{411}{section*.508}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.5.3}IBM 模型4}{48}{subsection.1.5.3}% \contentsline {section}{\numberline {7.6}小结及深入阅读}{412}{section.7.6}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.5.4} IBM 模型5}{49}{subsection.1.5.4}% \contentsline {part}{\@mypartnumtocformat {IV}{附录}}{417}{part.4}
\ttl@stoptoc {default@3}
\ttl@starttoc {default@4}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.5.5}隐马尔可夫模型}{51}{subsection.1.5.5}% \contentsline {chapter}{\numberline {A}附录A}{419}{Appendix.1.A}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsubsection}{隐马尔可夫模型}{51}{section*.51}% \contentsline {section}{\numberline {A.1}基准数据集}{419}{section.1.A.1}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsubsection}{词对齐模型}{52}{section*.53}% \contentsline {section}{\numberline {A.2}平行语料}{420}{section.1.A.2}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.5.6}解码和训练}{53}{subsection.1.5.6}% \contentsline {section}{\numberline {A.3}相关工具}{421}{section.1.A.3}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {1.6}问题分析}{54}{section.1.6}% \contentsline {subsection}{\numberline {A.3.1}数据预处理工具}{421}{subsection.1.A.3.1}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.6.1}词对齐及对称化}{54}{subsection.1.6.1}% \contentsline {subsection}{\numberline {A.3.2}评价工具}{422}{subsection.1.A.3.2}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.6.2}Deficiency}{55}{subsection.1.6.2}% \contentsline {chapter}{\numberline {B}附录B}{423}{Appendix.2.B}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.6.3}句子长度}{56}{subsection.1.6.3}% \contentsline {section}{\numberline {B.1}IBM模型3训练方法}{423}{section.2.B.1}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {subsection}{\numberline {1.6.4}其他问题}{56}{subsection.1.6.4}% \contentsline {section}{\numberline {B.2}IBM模型4训练方法}{425}{section.2.B.2}
\defcounter {refsection}{0}\relax \defcounter {refsection}{0}\relax
\contentsline {section}{\numberline {1.7}小结及深入阅读}{57}{section.1.7}% \contentsline {section}{\numberline {B.3}IBM模型5训练方法}{427}{section.2.B.3}
\contentsfinish \contentsfinish
% !Mode:: "TeX:UTF-8" % !Mode:: "TeX:UTF-8"
% !TEX encoding = UTF-8 Unicode % !TEX encoding = UTF-8 Unicode
%----------------------------------------------------------------------------------------
% 机器翻译:统计建模与深度学习方法
% Machine Translation: Statistical Modeling and Deep Learning Methods
%
% Copyright 2020
% 肖桐(xiaotong@mail.neu.edu.cn) 朱靖波 (zhujingbo@mail.neu.edu.cn)
%----------------------------------------------------------------------------------------
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
% BASIC CONFIGURATIONS % BASIC CONFIGURATIONS
...@@ -122,14 +129,14 @@ ...@@ -122,14 +129,14 @@
% CHAPTERS % CHAPTERS
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
%\include{Chapter1/chapter1} \include{Chapter1/chapter1}
%\include{Chapter2/chapter2} \include{Chapter2/chapter2}
%\include{Chapter3/chapter3} \include{Chapter3/chapter3}
\include{Chapter4/chapter4} \include{Chapter4/chapter4}
%\include{Chapter5/chapter5} \include{Chapter5/chapter5}
%\include{Chapter6/chapter6} \include{Chapter6/chapter6}
%\include{Chapter7/chapter7} \include{Chapter7/chapter7}
%\include{ChapterAppend/chapterappend} \include{ChapterAppend/chapterappend}
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论