Commit cbd21156 by xiaotong

tree-to-tree rule extraction

parent e1a8ace5
...@@ -146,171 +146,375 @@ ...@@ -146,171 +146,375 @@
\subsection{翻译规则抽取} \subsection{翻译规则抽取}
%%%------------------------------------------------------------------------------------------------------------ %%%------------------------------------------------------------------------------------------------------------
%%% tree binarization \subsection{引入双语句法信息}
\begin{frame}{更多的规则 - 句法树二叉化}
%%%------------------------------------------------------------------------------------------------------------
%%% 树到树规则抽取
\begin{frame}{引入双语句法信息}
\begin{itemize}
\item 对于树到树模型,源语和目标语端都有句法树,需要使用树片段到树片段的映射来描述翻译过程,这种映射关系被描述为树到树翻译规则。这里,把\\
\vspace{-1.3em}
\begin{eqnarray}
\langle\ \textrm{VP}, \textrm{VP}\ \rangle & \to & \langle\ \textrm{VP(PP}_{1}\ \textrm{VP(VV(表示) NN}_{2})), \nonumber \\
& & \ \ \textrm{VP(VBZ(was) VP(VBZ}_{2}\ \textrm{PP}_{1}))\ \rangle \nonumber
\end{eqnarray}
表示为\alert{树片段到树片段}的映射形式\\
\vspace{-1.3em}
\begin{eqnarray}
& & \textrm{VP(PP}_{1}\ \textrm{VP(VV(表示) NN}_{2})) \nonumber \\
& \to & \textrm{VP(VBZ(was) VP(VBZ}_{2}\ \textrm{PP}_{1})) \nonumber
\end{eqnarray}
\item<2-> 可以通过扩展GHKM方法进行树到树规则抽取
\begin{itemize}
\item 双语端进行可信节点的识别,之后找到节点之间的对应
\item 基于对应的节点获得树片段的对应,即抽取树到树规则
\item 规则组合、SPMT等方法同样适用
\end{itemize}
\end{itemize}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% 方法1:利用词对齐归纳句法映射
\begin{frame}{方法1:利用词对齐归纳树到树规则}
\begin{itemize} \begin{itemize}
\item 句法分析器生成的句法树可能会非常平坦,这会导致抽取的规则很``大''而且规则无法继续被分解 \item 简单直接的方法是把GHKM方法扩展到双语的情况,利用词对齐归纳树到树映射
\begin{itemize} \begin{itemize}
\item 比如,在CTB中经常会看到很宽的子树结构 \item<3-> 但是词对齐的错误往往会导致很多规则无法抽取
\end{itemize} \end{itemize}
\end{itemize} \end{itemize}
\begin{minipage}[c][5cm][t]{0.47\textwidth}
\begin{center} \begin{center}
\begin{tikzpicture} \begin{tikzpicture}
\begin{scope}
\begin{scope}[scale=0.65, level distance=27pt]
\Tree[.S
[.NP
[.DT \node(ew1){the}; ]
[.NNS \node(ew2){imports}; ]
]
[.VP
[.VBZ \node(ew3){have}; ]
[.ADVP
[.RB \node(ew4){drastically}; ]
[.VBN \node(ew5){fallen}; ]
]
]
]
\end{scope}
{\scriptsize \begin{scope}[scale=0.65, level distance=27pt, grow'=up, xshift=-13pt, yshift=-3.5in, sibling distance=22pt]
\begin{scope}[scale = 0.9, sibling distance=20pt, level distance=30pt]
{\footnotesize
\Tree[.IP \Tree[.IP
[.NP ] [.NN \node(cw1){进口}; ]
[.VP ] [.VP
[., ] [.AD \node(cw2){大幅度}; ]
[.VP ] [.VP
[., ] [.VV \node(cw3){下降}; ]
[.VP ] [.AS \node(cw4){}; ]
[., ] ]
[.VP ] ]
[.{.{\color{white} V}} ]
] ]
}
\end{scope} \end{scope}
\visible<2->{
\draw[-, dashed] (cw1) -- (ew2);
\draw[-, dashed] (cw2) -- (ew4);
\draw[-, dashed] (cw3) -- (ew5);
\draw[-, dashed] (cw4) .. controls +(north:1.0) and +(south:1.6) .. (ew1);
} }
\visible<3->{
\draw[-, red, dashed,thick] (cw4) .. controls +(north:1.0) and +(south:1.6) .. (ew1);
}
\end{scope}
\end{tikzpicture} \end{tikzpicture}
\end{center} \end{center}
\end{minipage}
\begin{minipage}[c][5cm][t]{0.50\textwidth}
\visible<2->{
\begin{tabular}{l l}
\multicolumn{2}{l}{\textbf{\scriptsize{抽取得到的规则}}} \\
\hline
\scriptsize{$r_1$} & \scriptsize{AS(了) $\rightarrow$ DT(the)} \\
\scriptsize{$r_2$} & \scriptsize{NN(进口) $\rightarrow$ NNS(imports)} \\
\scriptsize{$r_3$} & \scriptsize{AD(大幅度) $\rightarrow$ RB(drastically)} \\
\scriptsize{$r_4$} & \scriptsize{VV(下降) $\rightarrow$ VBN(fallen)} \\
\scriptsize{$r_5$} & \scriptsize{IP(NN$_1$ VP(AD$_2$ VP(VV$_3$ AS$_4$))) $\rightarrow$} \\
\multicolumn{2}{l}{\tiny{S(NP(DT$_4$ NNS$_1$) VP(VBZ(have) ADVP(RB$_2$ VBN$_3$)))}} \\
\end{tabular}
}
\visible<3->{
\vspace{0.5em}
\begin{tabular}{l l}
\multicolumn{2}{l}{\textbf{\scriptsize{无法得到的规则}}} \\
\hline
\scriptsize{$r_{?}$} & \scriptsize{AS(了) $\rightarrow$ VBZ(have)} \\
\scriptsize{$r_{?}$} & \scriptsize{NN(进口) $\rightarrow$} \\
& \scriptsize{NP(DT(the) NNS(imports))} \\
\scriptsize{$r_{?}$} & \scriptsize{IP(NN$_1$ VP$_2$) $\rightarrow$ S(NP$_1$ VP$_2$)} \\
\end{tabular}
}
\end{minipage}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% 方法2:直接进行节点对齐然后归纳句法映射
\begin{frame}{方法2:利用节点对齐抽取树到树规则}
\begin{itemize} \begin{itemize}
\item<2-> 一个例子 \item 另一种思路是直接获取源语言树节点到目标语树节点的对应关系,然后直接抽取规则,这样可避免词对齐错误
\begin{itemize}
\item 节点对齐可以更准确地捕捉双语结构的对应
\end{itemize}
\end{itemize} \end{itemize}
\vspace{-1.0em} \begin{minipage}[c][5cm][t]{0.47\textwidth}
\begin{center} \begin{center}
\begin{tikzpicture} \begin{tikzpicture}
\visible<2->{ \only<1>{
{\scriptsize \begin{scope}
\begin{scope}[sibling distance=4pt, level distance=25pt] \begin{scope}[scale=0.65, level distance=27pt]
\Tree[.S
{\footnotesize [.NP
\Tree[.\node(n1){NP}; [.DT \node(ew1){the}; ]
[.NNP \node(sw1){美国}; ] [.NNS \node(ew2){imports}; ]
[.NN \node(sw2){总统}; ] ]
[.NN \node(sw3){唐纳德}; ] [.VP
[.NN \node(sw4){特朗普}; ] [.VBZ \node(ew3){have}; ]
[.ADVP
[.RB \node(ew4){drastically}; ]
[.VBN \node(ew5){fallen}; ]
]
]
] ]
} \end{scope}
\node [anchor=north] (tw1) at ([yshift=-2em]sw1.south) {U.S.}; \begin{scope}[scale=0.65, level distance=27pt, grow'=up, xshift=-13pt, yshift=-3.5in, sibling distance=22pt]
\node [anchor=north] (tw2) at ([yshift=-2em]sw2.south) {President}; \Tree[.IP
\node [anchor=north] (tw3) at ([yshift=-2em]sw3.south) {Trump}; [.NN \node(cw1){进口}; ]
[.VP
[.AD \node(cw2){大幅度}; ]
[.VP
[.VV \node(cw3){下降}; ]
[.AS \node(cw4){}; ]
]
]
]
\end{scope}
\draw[-, dashed] (cw1) -- (ew2);
\draw[-, dashed] (cw2) -- (ew4);
\draw[-, dashed] (cw3) -- (ew5);
\draw[-, dashed] (cw4) .. controls +(north:1.0) and +(south:1.6) .. (ew1);
\draw [-,dashed] (sw1.south) -- (tw1.north); \end{scope}
\draw [-,dashed] (sw2.south) -- (tw2.north); }
\draw [-,dashed] (sw3.south) -- (tw3.north);
\draw [-,dashed] (sw4.south) -- (tw3.north);
\node [anchor=west] (rulelabel1) at ([xshift=1in,yshift=0.3em]n1.east) {\footnotesize{\textbf{抽取到的规则:}}}; \begin{scope}
\node [anchor=north west] (rule1) at (rulelabel1.south west) {NP(NNP$_1$ NN$_2$ NN(唐纳德) NN(特朗普))};
\node [anchor=north west] (rule1t) at ([yshift=0.2em]rule1.south west) {$\to$ NNP$_1$ NN$_2$ Trump};
\node [anchor=north west] (rule2) at (rule1t.south west) {NP(NNP$_1$ NN(总统) NN(唐纳德) NN(特朗普))};
\node [anchor=north west] (rule2t) at ([yshift=0.2em]rule2.south west) {$\to$ NNP$_1$ President Trump};
\node [anchor=north west] (rulelabel2) at ([yshift=-0.3em]rule2t.south west) {\footnotesize{\textbf{\alert{不能}抽取到的规则:}}};
\node [anchor=north west] (rule3) at (rulelabel2.south west) {NP(NN(唐纳德) NN(特朗普)) $\to$ Trump};
\visible<2->{
\begin{scope}[scale=0.65, level distance=27pt]
\Tree[.\node[draw](en1){S};
[.\node[draw](en2){NP};
[.DT the ]
[.NNS imports ]
]
[.\node[draw](en3){VP};
[.\node[draw](en4){VBZ}; have ]
[.ADVP
[.\node[draw](en5){RB}; drastically ]
[.\node[draw](en6){VBN}; fallen ]
]
]
]
\end{scope}
\begin{scope}[scale=0.65, level distance=27pt, grow'=up, xshift=-13pt, yshift=-3.5in, sibling distance=22pt]
\Tree[.\node[draw](cn1){\ \ IP\ \ };
[.\node[draw](cn2){NN}; 进口 ]
[.\node[draw](cn3){VP};
[.\node[draw](cn4){AD}; 大幅度 ]
[.VP
[.\node[draw](cn5){VV}; 下降 ]
[.\node[draw](cn6){AS}; 了 ]
]
]
]
\end{scope} \end{scope}
} }
\visible<3->{
\draw[latex-latex, dotted, thick, red] (cn4.east) .. controls +(east:0.5) and +(west:0.5) .. (en5.west);
\draw[latex-latex, dotted, thick, red] (cn5.east) .. controls +(east:0.5) and +(south:0.5) .. (en6.south west);
\draw[latex-latex, dotted, thick, red] (cn6.north west) .. controls +(north:1.5) and +(south:2.5) .. (en4.south west);
\draw[latex-latex, dotted, thick, red] (cn3.north west) -- (en3.south west);
\draw[latex-latex, dotted, thick, red] (cn2.west) .. controls +(west:0.6) and +(west:0.6) .. (en2.west);
\draw[latex-latex, dotted, thick, red] (cn1.north west) .. controls +(north:4) and +(south:5.5) .. (en1.south west);
} }
\end{scope}
\end{tikzpicture} \end{tikzpicture}
\end{center} \end{center}
\end{minipage}
\begin{minipage}[c][5cm][t]{0.50\textwidth}
\only<1>{
\begin{tabular}{l l}
\multicolumn{2}{l}{\textbf{\scriptsize{抽取得到的规则(词对齐)}}} \\
\hline
\scriptsize{$r_1$} & \scriptsize{AS(了) $\rightarrow$ DT(the)} \\
\scriptsize{$r_2$} & \scriptsize{NN(进口) $\rightarrow$ NNS(imports)} \\
\scriptsize{$r_3$} & \scriptsize{AD(大幅度) $\rightarrow$ RB(drastically)} \\
\scriptsize{$r_4$} & \scriptsize{VV(下降) $\rightarrow$ VBN(fallen)} \\
\scriptsize{$r_5$} & \scriptsize{IP(NN$_1$ VP(AD$_2$ VP(VV$_3$ AS$_4$))) $\rightarrow$} \\
\multicolumn{2}{l}{\tiny{S(NP(DT$_4$ NNS$_1$) VP(VBZ(have) ADVP(RB$_2$ VBN$_3$)))}} \\
\end{tabular}
}
\visible<4->{
\begin{tabular}{l l}
\multicolumn{2}{l}{\textbf{\scriptsize{抽取得到的规则(子树对齐)}}} \\
\hline
{\color{gray!70} \scriptsize{$r_1$}} & {\color{gray!70} \scriptsize{AS(了) $\rightarrow$ DT(the)}} \\
{\color{gray!70} \scriptsize{$r_2$}} & {\color{gray!70}\scriptsize{NN(进口) $\rightarrow$ NNS(imports)}} \\
\scriptsize{$r_3$} & \scriptsize{AD(大幅度) $\rightarrow$ RB(drastically)} \\
\scriptsize{$r_4$} & \scriptsize{VV(下降) $\rightarrow$ VBN(fallen)} \\
{\color{gray!70} \scriptsize{$r_5$}} & {\color{gray!70} \scriptsize{IP(NN$_1$ VP(AD$_2$ VP(VV$_3$ AS$_4$))) $\rightarrow$}} \\
\multicolumn{2}{l}{{\color{gray!70} \tiny{S(NP(DT$_4$ NNS$_1$) VP(VBZ(have) ADVP(RB$_2$ VBN$_3$)))}}} \\
\alert{\scriptsize{$r_6$}} & \alert{\scriptsize{AS(了) $\rightarrow$ VBZ(have)}} \\
\alert{\scriptsize{$r_7$}} & \alert{\scriptsize{NN(进口) $\rightarrow$ }} \\
& \alert{\scriptsize{NP(DT(the) NNS(imports))}}\\
\alert{\scriptsize{$r_8$}} & \alert{\scriptsize{VP(AD$_1$ VP(VV$_2$ AS$_3$)) $\rightarrow$}} \\
& \alert{\scriptsize{VP(VBZ$_3$ ADVP(RB$_1$ VBN$_2$))}} \\
\alert{\scriptsize{$r_9$}} & \alert{\scriptsize{IP(NN$_1$ VP$_2$) $\rightarrow$ S(NP$_1$ VP$_2$)}} \\
\end{tabular}
}
\end{minipage}
\end{frame} \end{frame}
%%%------------------------------------------------------------------------------------------------------------ %%%------------------------------------------------------------------------------------------------------------
%%% tree binarization (cont.) %%% 抽取更多的规则:节点对齐矩阵
\begin{frame}{更多的规则 - 句法树二叉化(续)} \begin{frame}{节点对齐矩阵}
\begin{itemize} \begin{itemize}
\item 一种解决问题的思路是用二叉化方法把树结构变得更深 \item 节点对齐的自动获取:1)基于分类模型的方法;2)无指导节点对齐的方法
\item 使用节点对齐的另一个好处是,我们可以直接用节点对齐矩阵进行规则抽取,而不是用单一的对齐结果
\begin{itemize}
\item 对齐矩阵可以帮助抽取更多样的规则
\end{itemize}
\end{itemize}
\vspace{-1.5em} \vspace{-0.2em}
\begin{center} \centering
\begin{tikzpicture} \begin{tikzpicture}
{\scriptsize \begin{scope}[scale=0.7]
\begin{scope}[sibling distance=4pt, level distance=25pt]
\Tree[.\node(n1){NP}; \begin{scope}[sibling distance=17pt, level distance=25pt]
[.NNP \node(sw1){美国}; ] \Tree[.\node(en1){VP$^{[1]}$};
[.NN \node(sw2){总统}; ] [.\node(en2){VBZ$^{[2]}$}; have ]
[.NN \node(sw3){唐纳德}; ] [.\node(en3){ADVP$^{[3]}$};
[.NN \node(sw4){特朗普}; ] [.\node(en4){RB$^{[4]}$}; drastically ]
[.\node(en5){VBN$^{[5]}$}; fallen ]
]
] ]
\node [anchor=north] (tw1) at ([yshift=-2em]sw1.south) {U.S.};
\node [anchor=north] (tw2) at ([yshift=-2em]sw2.south) {President};
\node [anchor=north] (tw3) at ([yshift=-2em]sw3.south) {Trump};
\draw [-,dashed] (sw1.south) -- (tw1.north);
\draw [-,dashed] (sw2.south) -- (tw2.north);
\draw [-,dashed] (sw3.south) -- (tw3.north);
\draw [-,dashed] (sw4.south) -- (tw3.north);
\draw [->,very thick] ([xshift=1em]sw4.east) -- ([xshift=5em]sw4.east) node [pos=0.5,above] {\tiny{二叉化}};
\end{scope} \end{scope}
\begin{scope}[xshift=2.2in,sibling distance=10pt, level distance=15pt] \begin{scope}[grow'=up, yshift=-2.7in, sibling distance=32pt, level distance=25pt]
\Tree[.\node(cn1){VP$^{[1]}$};
\Tree[.\node(n1){NP}; [.\node(cn2){AD$^{[2]}$}; 大幅度 ]
[.NNP \node(sw1){美国}; ] [.\node(cn3){VP$^{[3]}$};
[.NP-BAR [.\node(cn4){VV$^{[4]}$}; 下降 ]
[.NN \node(sw2){总统}; ] [.\node(cn5){AS$^{[5]}$}; 了 ]
[.NP-BAR ]
[.NN \node(sw3){唐纳德}; ]
[.NN \node(sw4){特朗普}; ]
]
]
] ]
\end{scope}
\node [anchor=north] (tw1) at ([yshift=-4.5em]sw1.south) {U.S.}; \begin{scope}[xshift=1.7in, yshift=-0.4in]
\node [anchor=north] (tw2) at ([yshift=-2.75em]sw2.south) {President}; \node[anchor=west, rotate=60] at (0.8,-0.6) {VP$^{[1]}$};
\node [anchor=north] (tw3) at ([yshift=-1em]sw3.south) {Trump}; \node[anchor=west, rotate=60] at (1.8,-0.6) {VBZ$^{[2]}$};
\node[anchor=west, rotate=60] at (2.8,-0.6) {ADVP$^{[3]}$};
\node[anchor=west, rotate=60] at (3.8,-0.6) {RB$^{[4]}$};
\node[anchor=west, rotate=60] at (4.8,-0.6) {VBN$^{[5]}$};
\node[] at (6.2,-1) {VP$^{[1]}$};
\node[] at (6.2,-2) {AD$^{[2]}$};
\node[] at (6.2,-3) {VP$^{[3]}$};
\node[] at (6.2,-4) {VV$^{[4]}$};
\node[] at (6.2,-5) {AS$^{[5]}$};
\foreach \i in {1,...,5}{
\foreach \j in {-5,...,-1}{
\node[fill=blue,scale=0.2] at (\i,\j) {};
}
}
\draw [-,dashed] (sw1.south) -- (tw1.north); \visible<2-3>{
\draw [-,dashed] (sw2.south) -- (tw2.north); \node[fill=blue, scale=1.2] at (1,-1) {};
\draw [-,dashed] (sw3.south) -- (tw3.north); \node[fill=blue, scale=1.2] at (4,-2) {};
\draw [-,dashed] (sw4.south) -- (tw3.north); \node[fill=blue, scale=1.2] at (2,-5) {};
}
\end{scope} \visible<2>{
\node[fill=blue, scale=1.2] at (5,-4) {};
}
\visible<3>{
\node[fill=red, scale=1.2] at (5,-4) {};
} }
\end{tikzpicture} \visible<4-5>{
\end{center} \node[fill=blue, scale=1.1] at (1,-1) {};
\node[fill=blue, scale=0.5] at (1,-3) {};
\node[fill=blue, scale=0.6] at (2,-2) {};
\node[fill=blue, scale=0.7] at (2,-3) {};
\node[fill=blue, scale=0.7] at (2,-5) {};
\node[fill=blue, scale=0.4] at (3,-1) {};
\node[fill=blue, scale=0.6] at (3,-2) {};
\node[fill=blue, scale=0.5] at (3,-3) {};
\node[fill=blue, scale=0.9] at (4,-2) {};
\node[fill=blue, scale=0.7] at (5,-3) {};
\node[fill=blue, scale=0.4] at (5,-5) {};
}
\visible<2->{ \visible<4>{
\small{二叉化增加了更多的可信节点,这也带来了新的规则} \node[fill=blue, scale=0.6] at (3,-4) {};
\node[fill=blue, scale=0.8] at (5,-4) {};
}
\begin{center} \visible<5>{
{\footnotesize \node[fill=red, scale=0.6] at (3,-4) {};
\vspace{0.3em} \node[fill=red, scale=0.8] at (5,-4) {};
NP-BAR(NN(唐纳德) NN(特朗普)) $\to$ Trump \\
\vspace{0.3em}
NP-BAR(NN$_1$ NP-BAR$_2$) $\to$ NN$_1$ NP-BAR$_2$
\vspace{0.3em}
} }
\end{center}
\visible<2-3>{
\node[] at (4,-5.8) {\footnotesize{{\color{blue} $\blacksquare$} = extractable node-pair}};
} }
\item<3-> 树二叉化已经成为基于句法机器翻译模型的常用方法 \visible<4-5>{
\begin{itemize} \node[] at (4,-5.8) {\footnotesize{{\color{blue} $\blacksquare$} = possible alignment}};
\item 有很多策略:左优先、右优先、head优先等等 }
\item 二叉化可以得到更多(细粒度)规则,保证规则的覆盖度
\end{itemize}
\end{itemize} \end{scope}
\visible<3>{\draw[<->, red, thick] (cn4.east) .. controls +(east:0.9) and +(west:0.9) .. (en5.west);}
\visible<5>{\draw[<->, red, dotted, very thick] (cn4.east) .. controls +(east:0.9) and +(west:0.9) .. (en5.west);}
\visible<5>{\draw[<->, red, dotted, very thick] (cn4.west) .. controls +(west:1.0) and +(west:2) .. (en3.west);}
\end{scope}
\end{tikzpicture}
\end{frame} \end{frame}
......
...@@ -2713,7 +2713,7 @@ $\textrm{VP(VV(提高) NN}_1) \to \textrm{increases\ NN}_1$ \\ ...@@ -2713,7 +2713,7 @@ $\textrm{VP(VV(提高) NN}_1) \to \textrm{increases\ NN}_1$ \\
\end{frame} \end{frame}
%%%------------------------------------------------------------------------------------------------------------ %%%------------------------------------------------------------------------------------------------------------
\subsection{翻译规则抽取} \subsection{树到串翻译规则抽取}
%%%------------------------------------------------------------------------------------------------------------ %%%------------------------------------------------------------------------------------------------------------
%%% 基于树结构的翻译文法 - 树到串/串到树 %%% 基于树结构的翻译文法 - 树到串/串到树
...@@ -3426,6 +3426,8 @@ NP(PN$_1$) $\to$ PN$_1$ ...@@ -3426,6 +3426,8 @@ NP(PN$_1$) $\to$ PN$_1$
\end{itemize} \end{itemize}
\end{frame} \end{frame}
\subsection{更多样的规则}
%%%------------------------------------------------------------------------------------------------------------ %%%------------------------------------------------------------------------------------------------------------
%%% 处理空对齐单词 %%% 处理空对齐单词
\begin{frame}{更多的规则 - 处理空对齐} \begin{frame}{更多的规则 - 处理空对齐}
...@@ -3710,7 +3712,7 @@ NP(PN$_1$) $\to$ PN$_1$ ...@@ -3710,7 +3712,7 @@ NP(PN$_1$) $\to$ PN$_1$
\end{itemize} \end{itemize}
\end{itemize} \end{itemize}
\vspace{-0.3em} \vspace{-0.1em}
\begin{minipage}[b]{0.47\textwidth} \begin{minipage}[b]{0.47\textwidth}
{\footnotesize {\footnotesize
\visible<3->{ \visible<3->{
...@@ -3912,7 +3914,7 @@ VP(P(对) NP(NN(局势)) VP$_1$) $\to$ VP$_1$ about the situation \\ ...@@ -3912,7 +3914,7 @@ VP(P(对) NP(NN(局势)) VP$_1$) $\to$ VP$_1$ about the situation \\
\item 一种解决问题的思路是用二叉化方法把树结构变得更深 \item 一种解决问题的思路是用二叉化方法把树结构变得更深
\vspace{-1.5em} \vspace{-0.5em}
\begin{center} \begin{center}
\begin{tikzpicture} \begin{tikzpicture}
...@@ -3992,13 +3994,380 @@ NP-BAR(NN$_1$ NP-BAR$_2$) $\to$ NN$_1$ NP-BAR$_2$ ...@@ -3992,13 +3994,380 @@ NP-BAR(NN$_1$ NP-BAR$_2$) $\to$ NN$_1$ NP-BAR$_2$
\end{frame} \end{frame}
\subsection{引入双语句法信息}
%%%------------------------------------------------------------------------------------------------------------
%%% Tree-to-tree rule extraction: introduces tree-to-tree translation rules as
%%% mappings from a source tree fragment to a target tree fragment.
%%% Overlays: slide 1 shows the two notations; slide 2 adds the GHKM-extension bullet.
\begin{frame}{引入双语句法信息}
\begin{itemize}
% First bullet: a synchronous rule <VP, VP> -> <source fragment, target fragment>
% is rewritten below in the shorter "source fragment -> target fragment" form.
% The negative \vspace pulls each display up against the preceding text line.
% NOTE(review): variable 2 is labelled NN on the source side but VBZ on the
% target side (VBZ also labels "was"); possibly VBN was intended -- confirm with the author.
\item 对于树到树模型,源语和目标语端都有句法树,需要使用树片段到树片段的映射来描述翻译过程,这种映射关系被描述为树到树翻译规则。这里,把\\
\vspace{-1.3em}
\begin{eqnarray}
\langle\ \textrm{VP}, \textrm{VP}\ \rangle & \to & \langle\ \textrm{VP(PP}_{1}\ \textrm{VP(VV(表示) NN}_{2})), \nonumber \\
& & \ \ \textrm{VP(VBZ(was) VP(VBZ}_{2}\ \textrm{PP}_{1}))\ \rangle \nonumber
\end{eqnarray}
表示为\alert{树片段到树片段}的映射形式\\
\vspace{-1.3em}
\begin{eqnarray}
& & \textrm{VP(PP}_{1}\ \textrm{VP(VV(表示) NN}_{2})) \nonumber \\
& \to & \textrm{VP(VBZ(was) VP(VBZ}_{2}\ \textrm{PP}_{1})) \nonumber
\end{eqnarray}
% Second bullet (from slide 2 on): extraction by extending GHKM to both language sides.
\item<2-> 可以通过扩展GHKM方法进行树到树规则抽取
\begin{itemize}
\item 双语端进行可信节点的识别,之后找到节点之间的对应
\item 基于对应的节点获得树片段的对应,即抽取树到树规则
\item 规则组合、SPMT等方法同样适用
\end{itemize}
\end{itemize}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% Method 1: induce tree-to-tree rules from word alignments (GHKM extended to both sides).
%%% Overlays: <1> the two parse trees only;
%%%           <2-> dashed word-alignment links plus the table of extracted rules;
%%%           <3-> the 了--the link is re-drawn in red and the table of rules
%%%                that cannot be extracted appears.
\begin{frame}{方法1:利用词对齐归纳树到树规则}
\begin{itemize}
\item 简单直接的方法是把GHKM方法扩展到双语的情况,利用词对齐归纳树到树映射
\begin{itemize}
\item<3-> 但是词对齐的错误往往会导致很多规则无法抽取
\end{itemize}
\end{itemize}
% Left column: English tree (grows downwards) above the Chinese tree (grows upwards),
% so the leaf words of both trees face each other and can be linked.
\begin{minipage}[c][5cm][t]{0.47\textwidth}
\begin{center}
\begin{tikzpicture}
\begin{scope}
\begin{scope}[scale=0.65, level distance=27pt]
\Tree[.S
[.NP
[.DT \node(ew1){the}; ]
[.NNS \node(ew2){imports}; ]
]
[.VP
[.VBZ \node(ew3){have}; ]
[.ADVP
[.RB \node(ew4){drastically}; ]
[.VBN \node(ew5){fallen}; ]
]
]
]
\end{scope}
\begin{scope}[scale=0.65, level distance=27pt, grow'=up, xshift=-13pt, yshift=-3.5in, sibling distance=22pt]
\Tree[.IP
[.NN \node(cw1){进口}; ]
[.VP
[.AD \node(cw2){大幅度}; ]
[.VP
[.VV \node(cw3){下降}; ]
% Fix: this leaf was an empty node; the rules on the right refer to it as AS(了).
[.AS \node(cw4){了}; ]
]
]
]
\end{scope}
% Word alignment links (Chinese word -- English word).
\visible<2->{
\draw[-, dashed] (cw1) -- (ew2);
\draw[-, dashed] (cw2) -- (ew4);
\draw[-, dashed] (cw3) -- (ew5);
\draw[-, dashed] (cw4) .. controls +(north:1.0) and +(south:1.6) .. (ew1);
}
% Same path as the last link above, drawn again in red on top of it from slide 3 on.
\visible<3->{
\draw[-, red, dashed,thick] (cw4) .. controls +(north:1.0) and +(south:1.6) .. (ew1);
}
\end{scope}
\end{tikzpicture}
\end{center}
\end{minipage}
% Right column: rule tables.
\begin{minipage}[c][5cm][t]{0.50\textwidth}
\visible<2->{
\begin{tabular}{l l}
\multicolumn{2}{l}{\textbf{\scriptsize{抽取得到的规则}}} \\
\hline
\scriptsize{$r_1$} & \scriptsize{AS(了) $\rightarrow$ DT(the)} \\
\scriptsize{$r_2$} & \scriptsize{NN(进口) $\rightarrow$ NNS(imports)} \\
\scriptsize{$r_3$} & \scriptsize{AD(大幅度) $\rightarrow$ RB(drastically)} \\
\scriptsize{$r_4$} & \scriptsize{VV(下降) $\rightarrow$ VBN(fallen)} \\
% Fix: both sides of r5 were missing their final closing parenthesis.
\scriptsize{$r_5$} & \scriptsize{IP(NN$_1$ VP(AD$_2$ VP(VV$_3$ AS$_4$))) $\rightarrow$} \\
\multicolumn{2}{l}{\tiny{S(NP(DT$_4$ NNS$_1$) VP(VBZ(have) ADVP(RB$_2$ VBN$_3$)))}} \\
\end{tabular}
}
\visible<3->{
\vspace{0.5em}
\begin{tabular}{l l}
\multicolumn{2}{l}{\textbf{\scriptsize{无法得到的规则}}} \\
\hline
\scriptsize{$r_{?}$} & \scriptsize{AS(了) $\rightarrow$ VBZ(have)} \\
\scriptsize{$r_{?}$} & \scriptsize{NN(进口) $\rightarrow$} \\
& \scriptsize{NP(DT(the) NNS(imports))} \\
\scriptsize{$r_{?}$} & \scriptsize{IP(NN$_1$ VP$_2$) $\rightarrow$ S(NP$_1$ VP$_2$)} \\
\end{tabular}
}
\end{minipage}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% Method 2: align tree nodes directly, then extract tree-to-tree rules from the node alignment.
%%% Overlays: <1> trees with word-alignment links and the word-alignment rule table;
%%%           <2-> trees re-drawn with boxed nodes;
%%%           <3-> red dotted node-alignment arrows;
%%%           <4-> rule table for node alignment (some word-alignment rules greyed out,
%%%                newly extracted rules r6--r9 shown with \alert).
\begin{frame}{方法2:利用节点对齐抽取树到树规则}
\begin{itemize}
\item 另一种思路是直接获取源语言树节点到目标语树节点的对应关系,然后直接抽取规则,这样可避免词对齐错误
\begin{itemize}
% Fix: typo 对其 -> 对齐, and adverbial 的 -> 地.
\item 节点对齐可以更准确地捕捉双语结构的对应
\end{itemize}
\end{itemize}
\begin{minipage}[c][5cm][t]{0.47\textwidth}
\begin{center}
\begin{tikzpicture}
% Slide 1 only: the same trees and word-alignment links as on the previous frame.
\only<1>{
\begin{scope}
\begin{scope}[scale=0.65, level distance=27pt]
\Tree[.S
[.NP
[.DT \node(ew1){the}; ]
[.NNS \node(ew2){imports}; ]
]
[.VP
[.VBZ \node(ew3){have}; ]
[.ADVP
[.RB \node(ew4){drastically}; ]
[.VBN \node(ew5){fallen}; ]
]
]
]
\end{scope}
\begin{scope}[scale=0.65, level distance=27pt, grow'=up, xshift=-13pt, yshift=-3.5in, sibling distance=22pt]
\Tree[.IP
[.NN \node(cw1){进口}; ]
[.VP
[.AD \node(cw2){大幅度}; ]
[.VP
[.VV \node(cw3){下降}; ]
% Fix: this leaf was an empty node; the boxed tree below and the rules use 了.
[.AS \node(cw4){了}; ]
]
]
]
\end{scope}
\draw[-, dashed] (cw1) -- (ew2);
\draw[-, dashed] (cw2) -- (ew4);
\draw[-, dashed] (cw3) -- (ew5);
\draw[-, dashed] (cw4) .. controls +(north:1.0) and +(south:1.6) .. (ew1);
\end{scope}
}
\begin{scope}
% From slide 2 on: same trees, with the nodes that take part in the node alignment boxed
% (en* = English nodes, cn* = Chinese nodes).
\visible<2->{
\begin{scope}[scale=0.65, level distance=27pt]
\Tree[.\node[draw](en1){S};
[.\node[draw](en2){NP};
[.DT the ]
[.NNS imports ]
]
[.\node[draw](en3){VP};
[.\node[draw](en4){VBZ}; have ]
[.ADVP
[.\node[draw](en5){RB}; drastically ]
[.\node[draw](en6){VBN}; fallen ]
]
]
]
\end{scope}
\begin{scope}[scale=0.65, level distance=27pt, grow'=up, xshift=-13pt, yshift=-3.5in, sibling distance=22pt]
\Tree[.\node[draw](cn1){\ \ IP\ \ };
[.\node[draw](cn2){NN}; 进口 ]
[.\node[draw](cn3){VP};
[.\node[draw](cn4){AD}; 大幅度 ]
[.VP
[.\node[draw](cn5){VV}; 下降 ]
[.\node[draw](cn6){AS}; 了 ]
]
]
]
\end{scope}
}
% Node alignment arrows: AD--RB, VV--VBN, AS--VBZ, VP--VP, NN--NP, IP--S.
\visible<3->{
\draw[latex-latex, dotted, thick, red] (cn4.east) .. controls +(east:0.5) and +(west:0.5) .. (en5.west);
\draw[latex-latex, dotted, thick, red] (cn5.east) .. controls +(east:0.5) and +(south:0.5) .. (en6.south west);
\draw[latex-latex, dotted, thick, red] (cn6.north west) .. controls +(north:1.5) and +(south:2.5) .. (en4.south west);
\draw[latex-latex, dotted, thick, red] (cn3.north west) -- (en3.south west);
\draw[latex-latex, dotted, thick, red] (cn2.west) .. controls +(west:0.6) and +(west:0.6) .. (en2.west);
\draw[latex-latex, dotted, thick, red] (cn1.north west) .. controls +(north:4) and +(south:5.5) .. (en1.south west);
}
\end{scope}
\end{tikzpicture}
\end{center}
\end{minipage}
\begin{minipage}[c][5cm][t]{0.50\textwidth}
% Slide 1 only: rules obtained from the word alignment.
\only<1>{
\begin{tabular}{l l}
\multicolumn{2}{l}{\textbf{\scriptsize{抽取得到的规则(词对齐)}}} \\
\hline
\scriptsize{$r_1$} & \scriptsize{AS(了) $\rightarrow$ DT(the)} \\
\scriptsize{$r_2$} & \scriptsize{NN(进口) $\rightarrow$ NNS(imports)} \\
\scriptsize{$r_3$} & \scriptsize{AD(大幅度) $\rightarrow$ RB(drastically)} \\
\scriptsize{$r_4$} & \scriptsize{VV(下降) $\rightarrow$ VBN(fallen)} \\
% Fix: both sides of r5 were missing their final closing parenthesis.
\scriptsize{$r_5$} & \scriptsize{IP(NN$_1$ VP(AD$_2$ VP(VV$_3$ AS$_4$))) $\rightarrow$} \\
\multicolumn{2}{l}{\tiny{S(NP(DT$_4$ NNS$_1$) VP(VBZ(have) ADVP(RB$_2$ VBN$_3$)))}} \\
\end{tabular}
}
% From slide 4 on: rules under node alignment; r1, r2 and r5 are greyed out, r6--r9 are new.
\visible<4->{
\begin{tabular}{l l}
\multicolumn{2}{l}{\textbf{\scriptsize{抽取得到的规则(子树对齐)}}} \\
\hline
{\color{gray!70} \scriptsize{$r_1$}} & {\color{gray!70} \scriptsize{AS(了) $\rightarrow$ DT(the)}} \\
{\color{gray!70} \scriptsize{$r_2$}} & {\color{gray!70}\scriptsize{NN(进口) $\rightarrow$ NNS(imports)}} \\
\scriptsize{$r_3$} & \scriptsize{AD(大幅度) $\rightarrow$ RB(drastically)} \\
\scriptsize{$r_4$} & \scriptsize{VV(下降) $\rightarrow$ VBN(fallen)} \\
% Fix: balanced parentheses in r5 (as in the slide-1 table).
{\color{gray!70} \scriptsize{$r_5$}} & {\color{gray!70} \scriptsize{IP(NN$_1$ VP(AD$_2$ VP(VV$_3$ AS$_4$))) $\rightarrow$}} \\
\multicolumn{2}{l}{{\color{gray!70} \tiny{S(NP(DT$_4$ NNS$_1$) VP(VBZ(have) ADVP(RB$_2$ VBN$_3$)))}}} \\
\alert{\scriptsize{$r_6$}} & \alert{\scriptsize{AS(了) $\rightarrow$ VBZ(have)}} \\
\alert{\scriptsize{$r_7$}} & \alert{\scriptsize{NN(进口) $\rightarrow$ }} \\
& \alert{\scriptsize{NP(DT(the) NNS(imports))}}\\
\alert{\scriptsize{$r_8$}} & \alert{\scriptsize{VP(AD$_1$ VP(VV$_2$ AS$_3$)) $\rightarrow$}} \\
% Fix: the target side of r8 was missing its final closing parenthesis.
& \alert{\scriptsize{VP(VBZ$_3$ ADVP(RB$_1$ VBN$_2$))}} \\
\alert{\scriptsize{$r_9$}} & \alert{\scriptsize{IP(NN$_1$ VP$_2$) $\rightarrow$ S(NP$_1$ VP$_2$)}} \\
\end{tabular}
}
\end{minipage}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------ %%%------------------------------------------------------------------------------------------------------------
%%% 基于树结构的翻译文法 - 树到树 %%% 抽取更多的规则:节点对齐矩阵
\begin{frame}{树到树规则抽取} \begin{frame}{节点对齐矩阵}
% 我的博士论文 \begin{itemize}
\item 节点对齐的自动获取:1)基于分类模型的方法;2)无指导节点对齐的方法
\item 使用节点对齐的另一个好处是,我们可以直接用节点对齐矩阵进行规则抽取,而不是用单一的对齐结果
\begin{itemize}
\item 对齐矩阵可以帮助抽取更多样的规则
\end{itemize}
\end{itemize}
\vspace{-0.2em}
\centering
% Picture for the node-alignment-matrix frame: the English VP subtree (top left),
% the Chinese VP subtree growing upwards (bottom left), and a 5x5 node alignment
% matrix (right). Node indices [1]..[5] in the trees match the matrix labels.
\begin{tikzpicture}
\begin{scope}[scale=0.7]
% English subtree; nodes en1..en5.
\begin{scope}[sibling distance=17pt, level distance=25pt]
\Tree[.\node(en1){VP$^{[1]}$};
[.\node(en2){VBZ$^{[2]}$}; have ]
[.\node(en3){ADVP$^{[3]}$};
[.\node(en4){RB$^{[4]}$}; drastically ]
[.\node(en5){VBN$^{[5]}$}; fallen ]
]
]
\end{scope}
% Chinese subtree (grows upwards); nodes cn1..cn5.
\begin{scope}[grow'=up, yshift=-2.7in, sibling distance=32pt, level distance=25pt]
\Tree[.\node(cn1){VP$^{[1]}$};
[.\node(cn2){AD$^{[2]}$}; 大幅度 ]
[.\node(cn3){VP$^{[3]}$};
[.\node(cn4){VV$^{[4]}$}; 下降 ]
[.\node(cn5){AS$^{[5]}$}; 了 ]
]
]
\end{scope}
% Alignment matrix: column x = English node index (labels along the top, rotated),
% row y = minus the Chinese node index (labels on the right).
\begin{scope}[xshift=1.7in, yshift=-0.4in]
\node[anchor=west, rotate=60] at (0.8,-0.6) {VP$^{[1]}$};
\node[anchor=west, rotate=60] at (1.8,-0.6) {VBZ$^{[2]}$};
\node[anchor=west, rotate=60] at (2.8,-0.6) {ADVP$^{[3]}$};
\node[anchor=west, rotate=60] at (3.8,-0.6) {RB$^{[4]}$};
\node[anchor=west, rotate=60] at (4.8,-0.6) {VBN$^{[5]}$};
\node[] at (6.2,-1) {VP$^{[1]}$};
\node[] at (6.2,-2) {AD$^{[2]}$};
\node[] at (6.2,-3) {VP$^{[3]}$};
\node[] at (6.2,-4) {VV$^{[4]}$};
\node[] at (6.2,-5) {AS$^{[5]}$};
% Background grid: one small blue dot per matrix cell, visible on every slide.
\foreach \i in {1,...,5}{
\foreach \j in {-5,...,-1}{
\node[fill=blue,scale=0.2] at (\i,\j) {};
}
}
% Slides 2-3: a single alignment, four equally sized squares
% (VP--VP, RB--AD, VBZ--AS, VBN--VV).
\visible<2-3>{
\node[fill=blue, scale=1.2] at (1,-1) {};
\node[fill=blue, scale=1.2] at (4,-2) {};
\node[fill=blue, scale=1.2] at (2,-5) {};
}
% The VBN--VV cell is blue on slide 2 and red on slide 3 (matching the red arrow below).
\visible<2>{
\node[fill=blue, scale=1.2] at (5,-4) {};
}
\visible<3>{
\node[fill=red, scale=1.2] at (5,-4) {};
}
% Slides 4-5: many cells filled with squares of different sizes.
% NOTE(review): square size presumably encodes alignment strength/probability -- confirm.
\visible<4-5>{
\node[fill=blue, scale=1.1] at (1,-1) {};
\node[fill=blue, scale=0.5] at (1,-3) {};
\node[fill=blue, scale=0.6] at (2,-2) {};
\node[fill=blue, scale=0.7] at (2,-3) {};
\node[fill=blue, scale=0.7] at (2,-5) {};
\node[fill=blue, scale=0.4] at (3,-1) {};
\node[fill=blue, scale=0.6] at (3,-2) {};
\node[fill=blue, scale=0.5] at (3,-3) {};
\node[fill=blue, scale=0.9] at (4,-2) {};
\node[fill=blue, scale=0.7] at (5,-3) {};
\node[fill=blue, scale=0.4] at (5,-5) {};
}
% The two cells in the VV row (ADVP--VV and VBN--VV) are blue on slide 4 and red on slide 5.
\visible<4>{
\node[fill=blue, scale=0.6] at (3,-4) {};
\node[fill=blue, scale=0.8] at (5,-4) {};
}
\visible<5>{
\node[fill=red, scale=0.6] at (3,-4) {};
\node[fill=red, scale=0.8] at (5,-4) {};
}
% Legend below the matrix; its wording changes between the two views.
\visible<2-3>{
\node[] at (4,-5.8) {\footnotesize{{\color{blue} $\blacksquare$} = extractable node-pair}};
}
\visible<4-5>{
\node[] at (4,-5.8) {\footnotesize{{\color{blue} $\blacksquare$} = possible alignment}};
}
\end{scope}
% Arrows between tree nodes for the red cells: VV--VBN on slide 3;
% VV--VBN and VV--ADVP (dotted) on slide 5.
\visible<3>{\draw[<->, red, thick] (cn4.east) .. controls +(east:0.9) and +(west:0.9) .. (en5.west);}
\visible<5>{\draw[<->, red, dotted, very thick] (cn4.east) .. controls +(east:0.9) and +(west:0.9) .. (en5.west);}
\visible<5>{\draw[<->, red, dotted, very thick] (cn4.west) .. controls +(west:1.0) and +(west:2) .. (en3.west);}
\end{scope}
\end{tikzpicture}
\end{frame} \end{frame}
\subsection{翻译特征}
%%%------------------------------------------------------------------------------------------------------------ %%%------------------------------------------------------------------------------------------------------------
%%% 翻译特征 %%% 翻译特征
\begin{frame}{翻译特征} \begin{frame}{翻译特征}
...@@ -4021,6 +4390,9 @@ NP-BAR(NN$_1$ NP-BAR$_2$) $\to$ NN$_1$ NP-BAR$_2$ ...@@ -4021,6 +4390,9 @@ NP-BAR(NN$_1$ NP-BAR$_2$) $\to$ NN$_1$ NP-BAR$_2$
\end{frame} \end{frame}
%%%------------------------------------------------------------------------------------------------------------ %%%------------------------------------------------------------------------------------------------------------
\subsection{改进方法}
%%%------------------------------------------------------------------------------------------------------------
%%% 改进1 %%% 改进1
\begin{frame}{改进:基于森林的翻译模型} \begin{frame}{改进:基于森林的翻译模型}
\end{frame} \end{frame}
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论