Commit 2828f8df by 孟霞

合并分支 'master' 到 'mengxia'

Master

查看合并请求 !1067
parents 1643d36b badf7cdd
......@@ -8,7 +8,7 @@
};
\node[font=\footnotesize,anchor=north] (l1) at ([xshift=0em,yshift=-1em]top.south) {(a) 符号合并表};
\node[font=\scriptsize,anchor=west] (n1) at ([xshift=-4.5em,yshift=-6em]top.west) {l\ o\ w\ e\ r\ $<$e$>$};
\node[font=\scriptsize,anchor=west] (n1) at ([xshift=-3em,yshift=-6em]top.west) {l\ o\ w\ e\ r\ $<$e$>$};
\node[font=\scriptsize,anchor=west] (n2) at ([xshift=2.6em]n1.east) {l\ o\ w\ e\ {\red r$<$e$>$}};
\node[font=\scriptsize,anchor=west] (n3) at ([xshift=2.6em]n2.east) {{\red lo}\ w\ e\ r$<$e$>$};
\node[font=\scriptsize,anchor=west] (n4) at ([xshift=2.6em]n3.east) {{\red low}\ e\ r$<$e$>$};
......@@ -24,7 +24,7 @@
\node[font=\scriptsize,anchor=north east] (s1) at ([yshift=0.1em]n1.north west) {样例1:};
\node[font=\scriptsize,anchor=north east] (s1) at ([yshift=0.1em]t1.north west) {样例2:};
\node[font=\footnotesize,anchor=north] (l2) at ([xshift=2em,yshift=-1em]t3.south) {(b) 合并样例};
\node[font=\footnotesize,anchor=north] (l2) at ([xshift=0.5em,yshift=-1em]t3.south) {(b) 合并样例};
\draw[->,thick](n1.east) -- (n2.west);
\draw[->,thick](n2.east) -- (n3.west);
......
......@@ -38,7 +38,7 @@
\end{scope}
\begin{scope}[xshift=1.85in]
\begin{scope}[xshift=1.75in]
\node [anchor=west,stnode] (r1) at (0, 0) {第1层};
\node [anchor=south,tnode] (r2) at ([xshift=0em,yshift=1em]r1.north){第2层};
......@@ -60,11 +60,11 @@
\draw[->,thick] ([xshift=0em,yshift=0em]r3.north)--([xshift=0em,yshift=0em]r4.south);
\draw[->,thick] ([xshift=0em,yshift=0em]r4.north)--([xshift=0em,yshift=0em]output.south);
\node [anchor=north,font=\small] (label) at ([xshift=-1.5em,yshift=-0.7em]input.south) {(b)原始Transformer模型};
\node [anchor=north,font=\small] (label) at ([xshift=-1.7em,yshift=-0.7em]input.south) {(b)原始Transformer模型};
\end{scope}
\begin{scope}[xshift=3.9in]
\begin{scope}[xshift=3.85in]
\node [anchor=west,stnode] (r1) at (0, 0) {第1层};
\node [anchor=south,stnode] (r2) at ([xshift=0em,yshift=1em]r1.north){第2层};
......@@ -89,7 +89,7 @@
\draw[->,thick] ([xshift=0em,yshift=0em]wr2.east)--([xshift=0em,yshift=0em]r2.west);
\draw[->,thick] ([xshift=0em,yshift=0em]wr3.east)--([xshift=0em,yshift=0em]r4.west);
\node [anchor=north,font=\small,align=left] (label) at ([xshift=-3em,yshift=-0.7em]input.south) {(c)共享权重的\\ Transformer模型};
\node [anchor=north,font=\small] (label) at ([xshift=-3em,yshift=-0.7em]input.south) {(c)共享权重的Transformer模型};
\end{scope}
......
......@@ -14,7 +14,7 @@
\node [anchor=south west,manode] (a1) at ([xshift=0em,yshift=1em]e1.north west){Attention};
\node [anchor=south east,manode] (c1) at ([xshift=0em,yshift=1em]e1.north east){Conv};
\node [anchor=south west,ebnode] (e2) at ([xshift=0em,yshift=1em]a1.north west){Embedding};
\node [anchor=south,draw,circle,inner sep=4pt] (add1) at ([xshift=0em,yshift=0.5em]e2.north){};
\node [anchor=south,draw,circle,inner sep=4pt,thick] (add1) at ([xshift=0em,yshift=0.5em]e2.north){};
\node [anchor=south,ffnnode] (f2) at ([xshift=0em,yshift=0.5em]add1.north){FFN};
\node [anchor=south,inner sep=0mm,minimum height=1.8em] (op) at ([xshift=0em,yshift=0.5em]f2.north){output};
......@@ -29,8 +29,8 @@
\draw[->,thick] ([xshift=0em,yshift=0em]f2.north)--([xshift=0em,yshift=0.3em]op.south);
\draw[-] ([xshift=0em,yshift=0em]add1.west)--([xshift=-0em,yshift=0em]add1.east);
\draw[-] ([xshift=0em,yshift=0em]add1.south)--([xshift=-0em,yshift=-0em]add1.north);
\draw[-,thick] ([xshift=0em,yshift=0em]add1.west)--([xshift=-0em,yshift=0em]add1.east);
\draw[-,thick] ([xshift=0em,yshift=0em]add1.south)--([xshift=-0em,yshift=-0em]add1.north);
\draw[->,thick,rectangle,rounded corners=5pt] ([xshift=0em,yshift=0.5em]f1.north)--([xshift=-6em,yshift=0.5em]f1.north)--([xshift=-5.45em,yshift=0em]add1.west)--([xshift=0em,yshift=0em]add1.west);
......
......@@ -10,10 +10,10 @@
\begin{scope}[]
\node [anchor=east,circle,fill=black,inner sep = 2pt] (n1) at (-0, 0) {};
\node [anchor=west,draw,circle,inner sep=5pt] (n2) at ([xshift=13em,yshift=0em]n1.east){};
\node [anchor=west,draw,circle,inner sep=5pt,thick] (n2) at ([xshift=13em,yshift=0em]n1.east){};
\node [anchor=west,lnnode] (n3) at ([xshift=1.5em,yshift=0em]n2.east){LN};
\node [anchor=west,circle,fill=black,inner sep=2pt] (n4) at ([xshift=1.5em,yshift=0em]n3.east){};
\node [anchor=west,draw,circle,inner sep=5pt] (n5) at ([xshift=5em,yshift=0em]n4.east){};
\node [anchor=west,draw,circle,inner sep=5pt,thick] (n5) at ([xshift=5em,yshift=0em]n4.east){};
\node [anchor=west,lnnode] (n6) at ([xshift=1.5em,yshift=0em]n5.east){LN};
\node [anchor=west,manode] (a1) at ([xshift=1.5em,yshift=2em]n1.east){Multi-Head Attention};
......
......@@ -7,14 +7,14 @@
\node [anchor=east] (x1) at (-0.5em, 0) {$\mathbi{x}_l$};
\node [anchor=west,draw,fill=red!20,inner xsep=5pt,rounded corners=2pt,thick] (F1) at ([xshift=4em]x1.east){\small{$F$}};
\node [anchor=west,circle,draw,minimum size=1em] (n1) at ([xshift=4em]F1.east) {};
\node [anchor=west,circle,draw,minimum size=1em,thick] (n1) at ([xshift=4em]F1.east) {};
\node [anchor=west,draw,fill=green!20,inner xsep=5pt,rounded corners=2pt,thick] (ln1) at ([xshift=4em]n1.east){\small{\textrm{LN}}};
\node [anchor=west] (x2) at ([xshift=4em]ln1.east) {$\mathbi{x}_{l+1}$};
\node [anchor=north] (x3) at ([yshift=-5em]x1.south) {$\mathbi{x}_l$};
\node [anchor=west,draw,fill=green!20,inner xsep=5pt,rounded corners=2pt,thick] (F2) at ([xshift=4em]x3.east){\small{\textrm{LN}}};
\node [anchor=west,draw,fill=red!20,inner xsep=5pt,rounded corners=2pt,thick] (ln2) at ([xshift=4em]F2.east){\small{$F$}};
\node [anchor=west,circle,draw,,minimum size=1em] (n2) at ([xshift=4em]ln2.east){};
\node [anchor=west,circle,draw,,minimum size=1em,thick] (n2) at ([xshift=4em]ln2.east){};
\node [anchor=west] (x4) at ([xshift=4em]n2.east) {$\mathbi{x}_{l+1}$};
\draw[->, line width=1pt] ([xshift=-0.1em]x1.east)--(F1.west);
......@@ -27,10 +27,10 @@
\draw[->, line width=1pt] (n2.east)--(x4.west);
\draw[->,rounded corners,line width=1pt] ([yshift=-0.2em]x1.north) -- ([yshift=1em]x1.north) -- ([yshift=1.4em]n1.north) -- (n1.north);
\draw[->,rounded corners,line width=1pt] ([yshift=-0.2em]x3.north) -- ([yshift=1em]x3.north) -- ([yshift=1.4em]n2.north) -- (n2.north);
\draw[-] (n1.west)--(n1.east);
\draw[-] (n1.north)--(n1.south);
\draw[-] (n2.west)--(n2.east);
\draw[-] (n2.north)--(n2.south);
\draw[-,thick] (n1.west)--(n1.east);
\draw[-,thick] (n1.north)--(n1.south);
\draw[-,thick] (n2.west)--(n2.east);
\draw[-,thick] (n2.north)--(n2.south);
\node [anchor=south] (k1) at ([yshift=-0.1em]x1.north){};
\node [anchor=south] (k2) at ([yshift=-0.1em]x3.north){};
......
......@@ -8,11 +8,11 @@
\node [anchor=east] (x1) at (-0.5em, 0) {$\mathbi{x}_l$};
\node [anchor=west,draw,fill=red!30,inner xsep=5pt,rounded corners=2pt,draw,thick] (ln1) at ([xshift=1em]x1.east){\small{\textrm{LN}}};
\node [anchor=west,draw,fill=green!30,inner xsep=5pt,rounded corners=2pt,draw,thick] (f1) at ([xshift=0.6em]ln1.east){\small{$F$}};
\node [anchor=west,circle,draw,,minimum size=1em] (n1) at ([xshift=3em]f1.east){};
\node [anchor=west,circle,draw,,minimum size=1em,thick] (n1) at ([xshift=3em]f1.east){};
\node [anchor=west] (x2) at ([xshift=1em]n1.east) {$\mathbi{x}_{l+1}$};
\node [anchor=west,draw,fill=red!30,inner xsep=5pt,rounded corners=2pt,draw,thick] (ln12) at ([xshift=1em]x2.east){\small{\textrm{LN}}};
\node [anchor=west,draw,fill=green!30,inner xsep=5pt,rounded corners=2pt,draw,thick] (f12) at ([xshift=0.6em]ln12.east){\small{$F$}};
\node [anchor=west,circle,draw,,minimum size=1em] (n12) at ([xshift=3em]f12.east){};
\node [anchor=west,circle,draw,,minimum size=1em,thick] (n12) at ([xshift=3em]f12.east){};
\node [anchor=west] (x22) at ([xshift=1em]n12.east) {$\mathbi{x}_{l+2}$};
\node [anchor=north] (x3) at ([yshift=-5em]x1.south) {$\mathbi{x}_l$};
......@@ -20,13 +20,13 @@
\node [anchor=west,draw,fill=green!30,inner xsep=5pt,rounded corners=2pt,draw,thick] (f2) at ([xshift=0.6em]ln2.east){\small{$F$}};
\node [anchor=west,minimum size=1em] (p1) at ([xshift=1em]f2.east){};
\node [anchor=north] (m1) at ([yshift=0.6em]p1.south){\footnotesize{\red{Mask=1}}};
\node [anchor=west,circle,draw,,minimum size=1em] (n2) at ([xshift=3em]f2.east){};
\node [anchor=west,circle,draw,,minimum size=1em,thick] (n2) at ([xshift=3em]f2.east){};
\node [anchor=west] (x4) at ([xshift=1em]n2.east) {$\mathbi{x}_{l+1}$};
\node [anchor=west,draw,fill=red!30,inner xsep=5pt,rounded corners=2pt,draw,thick] (ln22) at ([xshift=1em]x4.east){\small{\textrm{LN}}};
\node [anchor=west,draw,fill=green!30,inner xsep=5pt,rounded corners=2pt,draw,thick] (f22) at ([xshift=0.6em]ln22.east){\small{$F$}};
\node [anchor=west,minimum size=1em] (p2) at ([xshift=1em]f22.east){};
\node [anchor=north] (m2) at ([yshift=0.6em]p2.south){\footnotesize{\red{Mask=0}}};
\node [anchor=west,circle,draw,,minimum size=1em] (n22) at ([xshift=3em]f22.east){};
\node [anchor=west,circle,draw,,minimum size=1em,thick] (n22) at ([xshift=3em]f22.east){};
\node [anchor=west] (x42) at ([xshift=1em]n22.east) {$\mathbi{x}_{l+2}$};
\draw[->, line width=1pt] ([xshift=-0.1em]x1.east)--(ln1.west);
......@@ -41,10 +41,10 @@
\draw[->, line width=1pt] (n2.east)--(x4.west);
\draw[->,rounded corners,line width=1pt] ([yshift=-0.2em]x1.north) -- ([yshift=1em]x1.north) -- ([yshift=1.4em]n1.north) -- (n1.north);
\draw[->,rounded corners,line width=1pt] ([yshift=-0.2em]x3.north) -- ([yshift=1em]x3.north) -- ([yshift=1.4em]n2.north) -- (n2.north);
\draw[-] (n1.west)--(n1.east);
\draw[-] (n1.north)--(n1.south);
\draw[-] (n2.west)--(n2.east);
\draw[-] (n2.north)--(n2.south);
\draw[-,thick] (n1.west)--(n1.east);
\draw[-,thick] (n1.north)--(n1.south);
\draw[-,thick] (n2.west)--(n2.east);
\draw[-,thick] (n2.north)--(n2.south);
\draw[->, line width=1pt] ([xshift=-0.1em]x2.east)--(ln12.west);
\draw[->, line width=1pt] ([xshift=-0.1em]ln12.east)--(f12.west);
......@@ -58,10 +58,10 @@
\draw[->, line width=1pt] (n22.east)--(x42.west);
\draw[->,rounded corners,line width=1pt] ([yshift=-0.2em]x2.north) -- ([yshift=1em]x2.north) -- ([yshift=1.4em]n12.north) -- (n12.north);
\draw[->,rounded corners,line width=1pt] ([yshift=-0.2em]x4.north) -- ([yshift=1em]x4.north) -- ([yshift=1.4em]n22.north) -- (n22.north);
\draw[-] (n12.west)--(n12.east);
\draw[-] (n12.north)--(n12.south);
\draw[-] (n22.west)--(n22.east);
\draw[-] (n22.north)--(n22.south);
\draw[-,thick] (n12.west)--(n12.east);
\draw[-,thick] (n12.north)--(n12.south);
\draw[-,thick] (n22.west)--(n22.east);
\draw[-,thick] (n22.north)--(n22.south);
\node [anchor=south] (k1) at ([yshift=-0.1em]x1.north){};
\node [anchor=south] (k2) at ([yshift=-0.1em]x3.north){};
......
......@@ -44,7 +44,7 @@
\node [anchor=east,font=\small] (r1) at ([xshift=-2em,yshift=0em]box1.west) {混合RNN};
{\small
\node [anchor=south west,wnode] (l1) at ([xshift=1em,yshift=5em]r1.north west) {先序遍历句法树,得到序列:};
\node [anchor=south west,wnode] (l1) at ([xshift=0em,yshift=5em]r1.north west) {先序遍历句法树,得到序列:};
\node [anchor=north west,wnode,align=center] (l2) at ([xshift=0.5em,yshift=-0.6em]l1.north east) {S\\[0.5em]$l_1$};
\node [anchor=north west,wnode,align=center] (l3) at ([xshift=0.5em,yshift=0em]l2.north east) {NP\\[0.5em]$l_2$};
\node [anchor=north west,wnode,align=center] (l4) at ([xshift=0.5em,yshift=0em]l3.north east) {PRN\\[0.5em]$l_3$};
......
......@@ -59,7 +59,7 @@
\node [anchor=west,fill=green!20,minimum width=1.5em](d1-1) at ([xshift=-0.0em]d1.east){};
\node [anchor=west,fill=red!20,minimum width=1.5em](d2-1) at ([xshift=-0.0em]d2.east){};
\node [anchor=west,fill=yellow!20,minimum width=1.5em](d3-1) at ([xshift=-0.0em]d3.east){};
\node [anchor=north] (d4) at ([xshift=1em]d1.south) {\small{训练:}};
\node [anchor=north] (d4) at ([xshift=1.82em]d1.south) {\small{训练:}};
\node [anchor=north] (d5) at ([xshift=0.5em]d2.south) {\small{推断:}};
\draw [->,thick] ([xshift=0em]d4.east)--([xshift=1.5em]d4.east);
\draw [->,thick,dashed] ([xshift=0em]d5.east)--([xshift=1.5em]d5.east);
......
......@@ -30,7 +30,7 @@
\draw[->,thick](encoder-2.north)to(decoder_1-2.south);
\draw[->,thick](decoder_1-2.north)to(decoder_2-2.south);
\draw[->,thick](decoder_2-2.north)to(y-2.south);
\node [anchor=north,scale = 1.2](pos2) at (s-2.south) {(b) 级联码器};
\node [anchor=north,scale = 1.2](pos2) at (s-2.south) {(b) 级联码器};
%%%%%%%%%%%%%%%%%%%%%%%%联合
\node(encoder-3)[coder]at([xshift=10.0em]encoder-2.east){\large{编码器}};
\node(decoder_1-3)[coder,above of =encoder-3,xshift=-1.6cm,yshift=2.8cm,fill=blue!20]{\large{解码器}};
......@@ -43,5 +43,5 @@
\draw[->,thick](decoder_2-3.north)to(y-3.south);
\draw[->,thick](encoder-3.north)--([yshift=0.7cm]encoder-3.north)--([xshift=-4.16em,yshift=0.7cm]encoder-3.north)--(decoder_1-3.south);
\draw[->,thick](encoder-3.north)--([yshift=0.7cm]encoder-3.north)--([xshift=4.16em,yshift=0.7cm]encoder-3.north)--(decoder_2-3.south);
\node [anchor=north,scale = 1.2](pos3) at (s-3.south) {(c) 联合编码器};
\node [anchor=north,scale = 1.2](pos3) at (s-3.south) {(c) 联合编码器};
\end{tikzpicture}
\ No newline at end of file
......@@ -753,13 +753,13 @@ g(\seq{s},\seq{t}) & \equiv & \prod_{j,i \in \widehat{A}}{\funp{P}(s_j,t_i)} \ti
\label{eq:5-22}
\end{eqnarray}
\item 源语单词$s_j$的生成概率$\funp{P}(s_j|a_1^{j},s_1^{j-1},m,\seq{t})$仅依赖与其对齐的译文单词$t_{a_j}$,即词汇翻译概率$f(s_j|t_{a_j})$。此时词汇翻译概率满足$\sum_{s_j}{f(s_j|t_{a_j})}=1$。比如在图\ref{fig:5-18}表示的例子中,源语单词“上”出现的概率只和与它对齐的单词“on”有关系,与其他单词没有关系。
\item 源语单词$s_j$的生成概率$\funp{P}(s_j|a_1^{j},s_1^{j-1},m,\seq{t})$仅依赖与其对齐的译文单词$t_{a_j}$,即单词翻译概率$f(s_j|t_{a_j})$。此时单词翻译概率满足$\sum_{s_j}{f(s_j|t_{a_j})}=1$。比如在图\ref{fig:5-18}表示的例子中,源语单词“上”出现的概率只和与它对齐的单词“on”有关系,与其他单词没有关系。
\begin{eqnarray}
\funp{P}(s_j|a_1^{j},s_1^{j-1},m,\seq{t})& \equiv & f(s_j|t_{a_j})
\label{eq:5-23}
\end{eqnarray}
用一个简单的例子对公式\eqref{eq:5-23}进行说明。比如,在图\ref{fig:5-18}中,“桌子”对齐到“table”,可被描述为$f(s_2 |t_{a_2})=f(\textrm{“桌子”}|\textrm{“table”})$,表示给定“table”翻译为“桌子”的概率。通常,$f(s_2 |t_{a_2})$被认为是一种概率词典,它反应了两种语言词汇一级的对应关系。
用一个简单的例子对公式\eqref{eq:5-23}进行说明。比如,在图\ref{fig:5-18}中,“桌子”对齐到“table”,可被描述为$f(s_2 |t_{a_2})=f(\textrm{“桌子”}|\textrm{“table”})$,表示给定“table”翻译为“桌子”的概率。通常,$f(s_2 |t_{a_2})$被认为是一种概率词典,它反应了两种语言单词一级的对应关系。
\end{itemize}
\parinterval 将上述三个假设和公式\eqref{eq:5-19}代入公式\eqref{eq:5-18}中,得到$\funp{P}(\seq{s}|\seq{t})$的表达式:
......
......@@ -103,7 +103,7 @@
\label{eq:6-4}
\end{eqnarray}
\parinterval 类似于模型1,模型2的表达式\eqref{eq:6-4}也能被拆分为两部分进行理解。第一部分:遍历所有的$\seq{a}$;第二部分:对于每个$\seq{a}$累加对齐概率$\funp{P}(\seq{s},\seq{a}| \seq{t})$,即计算对齐概率$a(a_j|j,m,l)$词汇翻译概率$f(s_j|t_{a_j})$对于所有源语言位置的乘积。
\parinterval 类似于模型1,模型2的表达式\eqref{eq:6-4}也能被拆分为两部分进行理解。第一部分:遍历所有的$\seq{a}$;第二部分:对于每个$\seq{a}$累加对齐概率$\funp{P}(\seq{s},\seq{a}| \seq{t})$,即计算对齐概率$a(a_j|j,m,l)$单词翻译概率$f(s_j|t_{a_j})$对于所有源语言位置的乘积。
\parinterval 同样的,模型2的解码及训练优化和模型1的十分相似,在此不再赘述,详细推导过程可以参看{\chapterfive}\ref{IBM-model1}小节解码及计算优化部分。这里直接给出IBM模型2的最终表达式:
\begin{eqnarray}
......@@ -232,7 +232,7 @@
\vspace{0.5em}
\item 第二部分:对$i=0$时的产出率建模({\color{blue!70} 蓝色}),即空标记$t_0$的产出率生成概率。它依赖于$\seq{t}$和区间$[1,i-1]$的目标语单词的产出率$\varphi_1^l$
\vspace{0.5em}
\item 第三部分:对词汇翻译建模({\color{green!70} 绿色}),目标语言单词$t_i$生成第$k$个源语言单词$\tau_{ik}$时的概率,依赖于$\seq{t}$、所有目标语言单词的产出率$\varphi_0^l$、区间$i\in[1,l]$的目标语言单词生成的源语言单词$\tau_1^{i-1}$和目标语单词$t_i$生成的前$k$个源语言单词$\tau_{i1}^{k-1}$
\item 第三部分:对单词翻译建模({\color{green!70} 绿色}),目标语言单词$t_i$生成第$k$个源语言单词$\tau_{ik}$时的概率,依赖于$\seq{t}$、所有目标语言单词的产出率$\varphi_0^l$、区间$i\in[1,l]$的目标语言单词生成的源语言单词$\tau_1^{i-1}$和目标语单词$t_i$生成的前$k$个源语言单词$\tau_{i1}^{k-1}$
\vspace{0.5em}
\item 第四部分:对于每个$i\in[1,l]$的目标语言单词生成的源语言单词的扭曲度建模({\color{yellow!70!black} 黄色}),即第$i$个目标语言单词生成的第$k$个源语言单词在源文中的位置$\pi_{ik}$ 的概率。其中$\pi_1^{i-1}$ 表示区间$[1,i-1]$的目标语言单词生成的源语言单词的扭曲度,$\pi_{i1}^{k-1}$表示第$i$目标语言单词生成的前$k-1$个源语言单词的扭曲度。
\vspace{0.5em}
......@@ -445,7 +445,7 @@ p_0+p_1 & = & 1 \label{eq:6-21}
\parinterval 在IBM模型中,$\funp{P}(\seq{t})\funp{P}(\seq{s}| \seq{t})$会随着目标语言句子长度的增加而减少,因为这种模型有多个概率化的因素组成,乘积项越多结果的值越小。这也就是说,IBM模型会更倾向选择长度短一些的目标语言句子。显然这种对短句子的偏向性并不是机器翻译所期望的。
\parinterval 这个问题在很多机器翻译系统中都存在。它实际上也是一种{\small\bfnew{系统偏置}}\index{系统偏置}(System Bias)\index{System Bias}的体现。为了消除这种偏置,可以通过在模型中增加一个短句子惩罚因子来抵消掉模型对短句子的倾向性。比如,可以定义一个惩罚因子,它的值随着长度的减少而增加。不过,简单引入这样的惩罚因子会导致模型并不符合一个严格的噪声信道模型。它对应一个基于判别式框架的翻译模型,这部分内容会在{\chapterseven}进行介绍。
\parinterval 这个问题在很多机器翻译系统中都存在。它实际上也是一种{\small\bfnew{系统偏置}}\index{系统偏置}(System Bias)\index{System Bias}的体现。为了消除这种偏置,可以通过在模型中增加一个短句子惩罚因子来抵消掉模型对短句子的倾向性。比如,可以定义一个惩罚因子,它的值随着长度的减少而增加。不过,简单引入这样的惩罚因子会导致模型并不符合一个严格的噪声信道模型。它对应一个基于判别式框架的翻译模型,这部分内容会在{\chapterseven}进行介绍。
%----------------------------------------------------------------------------------------
% NEW SUB-SECTION
......
......@@ -41,7 +41,7 @@
\node[scale=0.9,anchor=west,minimum size=18pt] (tw13) at ([yshift=0.1em,xshift=0.5em]tw12.east){worried};
\node[scale=0.9,anchor=west,minimum size=18pt] (tw14) at ([xshift=0.5em]tw13.east){about};
\node[scale=0.9,anchor=west,minimum size=18pt] (tw15) at ([xshift=0.5em]tw14.east){the};
\node[scale=0.9,anchor=west,minimum size=18pt] (tw16) at ([yshift=-0.1em,xshift=0.5em]tw15.east){situation};
\node[scale=0.9,anchor=west,minimum size=18pt] (tw16) at ([xshift=0.5em]tw15.east){situation};
\draw[dashed] ([xshift=-0.3em]cfrag1.south) -- ([yshift=-0.3em]tw11.north);
\draw[dashed] (cfrag2.south) -- ([yshift=-0.4em]tw14.north);
......
......@@ -245,7 +245,7 @@ r_3:\quad \funp{X}\ &\to\ &\langle \ \text{大幅度},\quad \textrm{drastically}
r_4:\quad \funp{X}\ &\to\ &\langle \ \text{},\quad \textrm{have}\ \rangle \nonumber
\end{eqnarray}
\noindent 其中,规则$r_1$$r_2$是含有变量的规则,这些变量可以被其他规则的右部替换;规则$r_2$是调序规则;规则$r_3$$r_4$是纯词汇化规则,表示单词或者短语的翻译。
\noindent 其中,规则$r_1$$r_2$是含有变量的规则,这些变量可以被其他规则的右部替换;规则$r_2$是调序规则;规则$r_3$$r_4$是纯单词化规则,表示单词或者短语的翻译。
\parinterval 对于一个双语句对:
\begin{eqnarray}
......@@ -389,7 +389,7 @@ y&=&\beta_0 y_{\pi_1} ... \beta_{m-1} y_{\pi_m} \beta_m
\vspace{0.5em}
\item ($h_{1-2}$)短语翻译概率(取对数),即$\textrm{log}(\funp{P}(\alpha \mid \beta))$$\textrm{log}(\funp{P}(\beta \mid \alpha))$,特征的计算与基于短语的模型完全一样;
\vspace{0.5em}
\item ($h_{3-4}$)词汇化翻译概率(取对数),即$\textrm{log}(\funp{P}_{\textrm{lex}}(\alpha \mid \beta))$$\textrm{log}(\funp{P}_{\textrm{lex}}(\beta \mid \alpha))$,特征的计算与基于短语的模型完全一样;
\item ($h_{3-4}$)单词化翻译概率(取对数),即$\textrm{log}(\funp{P}_{\textrm{lex}}(\alpha \mid \beta))$$\textrm{log}(\funp{P}_{\textrm{lex}}(\beta \mid \alpha))$,特征的计算与基于短语的模型完全一样;
\vspace{0.5em}
\item ($h_{5}$)翻译规则数量,让模型自动学习对规则数量的偏好,同时避免使用过少规则造成分数偏高的现象;
\vspace{0.5em}
......@@ -696,8 +696,8 @@ span\textrm{[0,4]}&=&\textrm{“猫} \quad \textrm{喜欢} \quad \textrm{吃} \q
&都是基于串的解码方法 \\
\rule{0pt}{15pt}基于森林 &(源语言)使用句法森林,这里森林只是对多个句法树的一 \\
& 种压缩结构表示 \\
\rule{0pt}{15pt}词汇化规则 & 含有终结符的规则 \\
\rule{0pt}{15pt}词汇规则 & 不含有终结符的规则 \\
\rule{0pt}{15pt}单词化规则 & 含有终结符的规则 \\
\rule{0pt}{15pt}单词规则 & 不含有终结符的规则 \\
\rule{0pt}{15pt}句法软约束 & 不强制规则推导匹配语言学句法树,通常把句法信息作为特\\
&征使用 \\
\rule{0pt}{15pt}句法硬约束 & 要求推导必须符合语言学句法树,不符合的推导会被过滤掉 \\
......@@ -1333,7 +1333,7 @@ r_9: \quad \textrm{IP(}\textrm{NN}_1\ \textrm{VP}_2) \rightarrow \textrm{S(}\tex
\end{eqnarray}
\noindent 于是,可以定义短语翻译概率特征为$\log(\textrm{P(}\tau( \alpha_r )|\tau( \beta_r )))$$\log(\textrm{P(}\tau( \beta_r )|\tau( \alpha_r )))$。它们的计算方法与基于短语的系统是完全一样的\footnote[9]{对于树到串规则,$\tau( \beta_r )$就是规则目标语言端的符号串。}
\vspace{0.5em}
\item ($h_{3-4}$) 词汇化翻译概率(取对数),即$\log(\funp{P}_{\textrm{lex}}(\tau( \alpha_r )|\tau( \beta_r )))$$\log(\funp{P}_{\textrm{lex}}(\tau( \beta_r )|\tau( \alpha_r )))$。这两个特征的计算方法与基于短语的系统也是一样的。
\item ($h_{3-4}$) 单词化翻译概率(取对数),即$\log(\funp{P}_{\textrm{lex}}(\tau( \alpha_r )|\tau( \beta_r )))$$\log(\funp{P}_{\textrm{lex}}(\tau( \beta_r )|\tau( \alpha_r )))$。这两个特征的计算方法与基于短语的系统也是一样的。
\vspace{0.5em}
\end{itemize}
......@@ -1362,7 +1362,7 @@ r_9: \quad \textrm{IP(}\textrm{NN}_1\ \textrm{VP}_2) \rightarrow \textrm{S(}\tex
\vspace{0.5em}
\item ($h_{11}$)组合规则的数量,学习对组合规则的偏好;
\vspace{0.5em}
\item ($h_{12}$)词汇化规则的数量,学习对含有终结符规则的偏好;
\item ($h_{12}$)单词化规则的数量,学习对含有终结符规则的偏好;
\vspace{0.5em}
\item ($h_{13}$)低频规则的数量,学习对训练数据中出现频次低于3的规则的偏好。低频规则大多不可靠,设计这个特征的目的也是为了区分不同质量的规则。
\end{itemize}
......@@ -1571,7 +1571,7 @@ d_1 & = & {d'} \circ {r_5}
\parinterval 对于这个问题,有两种常用的解决办法:
\begin{itemize}
\vspace{0.5em}
\item 对文法进行限制。比如,可以限制规则中变量的数量;或者不允许连续的变量,这样的规则也被称作满足{\small\bfnew{词汇化标准形式}}\index{词汇化标准形式}(Lexicalized Norm Form)\index{Lexicalized Norm Form} (LNF)的规则。比如,层次短语规则就是LNF规则。由于LNF 中单词(终结符)可以作为锚点,因此规则匹配时所有变量的匹配范围是固定的;
\item 对文法进行限制。比如,可以限制规则中变量的数量;或者不允许连续的变量,这样的规则也被称作满足{\small\bfnew{单词化标准形式}}\index{单词化标准形式}(Lexicalized Norm Form)\index{Lexicalized Norm Form} (LNF)的规则。比如,层次短语规则就是LNF规则。由于LNF 中单词(终结符)可以作为锚点,因此规则匹配时所有变量的匹配范围是固定的;
\vspace{0.5em}
\item 对规则进行二叉化,使用CKY方法进行分析。这个方法也是句法分析中常用的策略。所谓规则二叉化是把规则转化为最多只含两个变量或连续词串的规则(串到树规则)。比如,对于如下的规则:
\begin{eqnarray}
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论