binarized trees

e1a8ace5 · xiaotong · c5d54017 · e1a8ace5 · e1a8ace5
Commit e1a8ace5 authored Jan 02, 2020 by xiaotong
--- a/Section04-Phrasal-and-Syntactic-Models/section04-test.tex
+++ b/Section04-Phrasal-and-Syntactic-Models/section04-test.tex
@@ -181,7 +181,7 @@
 \end{center}

 \begin{itemize}
-\item<2-> 一个具体的例子
+\item<2-> 一个例子
 \end{itemize}

 \vspace{-1.0em}
@@ -206,16 +206,16 @@
 \node [anchor=north] (tw3) at ([yshift=-2em]sw3.south) {Trump};

 \draw [-,dashed] (sw1.south) -- (tw1.north);
-\draw [-] (sw2.south) -- (tw2.north);
-\draw [-] (sw3.south) -- (tw3.north);
-\draw [-] (sw4.south) -- (tw3.north);
+\draw [-,dashed] (sw2.south) -- (tw2.north);
+\draw [-,dashed] (sw3.south) -- (tw3.north);
+\draw [-,dashed] (sw4.south) -- (tw3.north);

-\node [anchor=west] (rulelabel1) at ([xshift=1in,yshift=0em]n1.east) {\footnotesize{\textbf{抽取到的规则：}}};
+\node [anchor=west] (rulelabel1) at ([xshift=1in,yshift=0.3em]n1.east) {\footnotesize{\textbf{抽取到的规则：}}};
 \node [anchor=north west] (rule1) at (rulelabel1.south west) {NP(NNP$_1$ NN$_2$ NN(唐纳德) NN(特朗普))};
 \node [anchor=north west] (rule1t) at ([yshift=0.2em]rule1.south west) {$\to$ NNP$_1$ NN$_2$ Trump};
 \node [anchor=north west] (rule2) at (rule1t.south west) {NP(NNP$_1$ NN(总统) NN(唐纳德) NN(特朗普))};
 \node [anchor=north west] (rule2t) at ([yshift=0.2em]rule2.south west) {$\to$ NNP$_1$ President Trump};
-\node [anchor=north west] (rulelabel2) at (rule2t.south west) {\footnotesize{\textbf{\alert{不能}抽取到的规则：}}};
+\node [anchor=north west] (rulelabel2) at ([yshift=-0.3em]rule2t.south west) {\footnotesize{\textbf{\alert{不能}抽取到的规则：}}};
 \node [anchor=north west] (rule3) at (rulelabel2.south west) {NP(NN(唐纳德) NN(特朗普)) $\to$ Trump};

 \end{scope}
@@ -231,11 +231,87 @@
 %%%  tree binarization (cont.)
 \begin{frame}{更多的规则 - 句法树二叉化(续)}
 \begin{itemize}
-\item 句法分析器生成的句法树可能会非常平坦，这会导致抽取的规则很``大''而且规则无法继续被分解
+\item 一种解决问题的思路是用二叉化方法把树结构变得更深
+
+
+\vspace{-1.5em} % negative skip: moves the picture up toward the bullet text
+\begin{center}
+\begin{tikzpicture} % one picture, two scopes: original tree (left) and binarized tree (right)
+
+{\scriptsize
+\begin{scope}[sibling distance=4pt, level distance=25pt] % left scope: the flat NP with four children
+
+\Tree[.\node(n1){NP};
+     	[.NNP \node(sw1){美国}; ]
+     	[.NN \node(sw2){总统}; ]
+        [.NN \node(sw3){唐纳德}; ]
+        [.NN \node(sw4){特朗普}; ]
+     ]
+
+\node [anchor=north] (tw1) at ([yshift=-2em]sw1.south) {U.S.}; % target words tw1..tw3 are placed 2em below source leaves
+\node [anchor=north] (tw2) at ([yshift=-2em]sw2.south) {President};
+\node [anchor=north] (tw3) at ([yshift=-2em]sw3.south) {Trump};
+
+\draw [-,dashed] (sw1.south) -- (tw1.north); % dashed links from source leaves to target words
+\draw [-,dashed] (sw2.south) -- (tw2.north);
+\draw [-,dashed] (sw3.south) -- (tw3.north);
+\draw [-,dashed] (sw4.south) -- (tw3.north); % sw3 and sw4 both link to tw3
+
+\draw [->,very thick] ([xshift=1em]sw4.east) -- ([xshift=5em]sw4.east) node [pos=0.5,above] {\tiny{二叉化}}; % labelled rightward arrow, from 1em to 5em east of sw4
+
+\end{scope}
+
+\begin{scope}[xshift=2.2in,sibling distance=10pt, level distance=15pt] % right scope, shifted 2.2in: tree with NP-BAR nodes inserted
+
+\Tree[.\node(n1){NP}; % NOTE: n1/sw1..sw4/tw1..tw3 reuse the names from the left scope; references below resolve to these newer nodes
+     	[.NNP \node(sw1){美国}; ]
+	[.NP-BAR
+     	    [.NN \node(sw2){总统}; ]
+	    [.NP-BAR
+                [.NN \node(sw3){唐纳德}; ]
+                [.NN \node(sw4){特朗普}; ]
+             ]
+         ]
+     ]
+
+\node [anchor=north] (tw1) at ([yshift=-4.5em]sw1.south) {U.S.}; % yshift differs per word (-4.5em/-2.75em/-1em); presumably compensates for leaf depth so the words line up -- TODO confirm visually
+\node [anchor=north] (tw2) at ([yshift=-2.75em]sw2.south) {President};
+\node [anchor=north] (tw3) at ([yshift=-1em]sw3.south) {Trump};
+
+\draw [-,dashed] (sw1.south) -- (tw1.north); % same four dashed links as in the left scope
+\draw [-,dashed] (sw2.south) -- (tw2.north);
+\draw [-,dashed] (sw3.south) -- (tw3.north);
+\draw [-,dashed] (sw4.south) -- (tw3.north);
+
+\end{scope}
+
+}
+
+\end{tikzpicture}
+\end{center}
+
+\visible<2->{ % sentence and rule list below are shown from overlay 2 onward
+\small{二叉化增加了更多的可信节点，这也带来了新的规则}
+
+\begin{center} % two rules that use the NP-BAR label from the right-hand tree
+{\footnotesize
+\vspace{0.3em}
+NP-BAR(NN(唐纳德) NN(特朗普)) $\to$ Trump \\
+\vspace{0.3em}
+NP-BAR(NN$_1$ NP-BAR$_2$) $\to$ NN$_1$ NP-BAR$_2$
+\vspace{0.3em}
+}
+\end{center}
+}
+
+\item<3-> 树二叉化已经成为基于句法机器翻译模型的常用方法
 	\begin{itemize}
-	\item 比如，在CTB中经常会看到很宽的子树结构
+	\item 有很多策略：左优先、右优先、head优先等等
+	\item 二叉化可以得到更多(细粒度)规则，保证规则的覆盖度
 	\end{itemize}
+
 \end{itemize}
+
 \end{frame}

 %%%------------------------------------------------------------------------------------------------------------

--- a/Section04-Phrasal-and-Syntactic-Models/section04.tex
+++ b/Section04-Phrasal-and-Syntactic-Models/section04.tex
@@ -3823,6 +3823,175 @@ VP(P(对) NP(NN(局势)) VP$_1$) $\to$ VP$_1$ about the situation \\

 \end{frame}

+%%%------------------------------------------------------------------------------------------------------------
+%%%  tree binarization: a flat parse tree picture, then a rule-extraction example on a flat NP
+\begin{frame}{更多的规则 - 句法树二叉化}
+\begin{itemize}
+\item 句法分析器生成的句法树可能会非常平坦，这会导致抽取的规则很``大''而且规则无法继续被分解
+	\begin{itemize}
+	\item 比如，在CTB中经常会看到很宽的子树结构
+	\end{itemize}
+\end{itemize}
+
+\begin{center}
+\begin{tikzpicture} % picture 1: an IP node with nine children on a single level
+
+{\scriptsize
+\begin{scope}[scale = 0.9, sibling distance=20pt, level distance=30pt]
+
+{\footnotesize
+\Tree[.IP
+     	[.NP ]
+     	[.VP ]
+        [., ]
+        [.VP ]
+        [., ]
+        [.VP ]
+        [., ]
+        [.VP ]
+        [.{.{\color{white} V}} ] % last child: label is "." followed by a V typeset in white
+     ]
+}
+\end{scope}
+}
+
+\end{tikzpicture}
+\end{center}
+
+\begin{itemize}
+\item<2-> 一个例子
+\end{itemize}
+
+\vspace{-1.0em} % negative skip: moves the example picture up toward the bullet
+\begin{center}
+\begin{tikzpicture} % picture 2: flat NP tree, its target words, and the rule list to its right
+
+\visible<2->{ % the whole example is shown from overlay 2 onward
+{\scriptsize
+\begin{scope}[sibling distance=4pt, level distance=25pt]
+
+{\footnotesize
+\Tree[.\node(n1){NP}; % n1 (root) and sw1..sw4 (leaves) are named so later nodes/draws can reference them
+     	[.NNP \node(sw1){美国}; ]
+     	[.NN \node(sw2){总统}; ]
+        [.NN \node(sw3){唐纳德}; ]
+        [.NN \node(sw4){特朗普}; ]
+     ]
+}
+
+\node [anchor=north] (tw1) at ([yshift=-2em]sw1.south) {U.S.}; % target words tw1..tw3 are placed 2em below source leaves
+\node [anchor=north] (tw2) at ([yshift=-2em]sw2.south) {President};
+\node [anchor=north] (tw3) at ([yshift=-2em]sw3.south) {Trump};
+
+\draw [-,dashed] (sw1.south) -- (tw1.north); % dashed links from source leaves to target words
+\draw [-,dashed] (sw2.south) -- (tw2.north);
+\draw [-,dashed] (sw3.south) -- (tw3.north);
+\draw [-,dashed] (sw4.south) -- (tw3.north); % sw3 and sw4 both link to tw3
+
+\node [anchor=west] (rulelabel1) at ([xshift=1in,yshift=0.3em]n1.east) {\footnotesize{\textbf{抽取到的规则：}}}; % rule list starts 1in right of the root n1; each later node hangs off the previous one's south west corner
+\node [anchor=north west] (rule1) at (rulelabel1.south west) {NP(NNP$_1$ NN$_2$ NN(唐纳德) NN(特朗普))};
+\node [anchor=north west] (rule1t) at ([yshift=0.2em]rule1.south west) {$\to$ NNP$_1$ NN$_2$ Trump}; % right-hand side of rule1, nudged up 0.2em
+\node [anchor=north west] (rule2) at (rule1t.south west) {NP(NNP$_1$ NN(总统) NN(唐纳德) NN(特朗普))};
+\node [anchor=north west] (rule2t) at ([yshift=0.2em]rule2.south west) {$\to$ NNP$_1$ President Trump}; % right-hand side of rule2, nudged up 0.2em
+\node [anchor=north west] (rulelabel2) at ([yshift=-0.3em]rule2t.south west) {\footnotesize{\textbf{\alert{不能}抽取到的规则：}}}; % second heading, 0.3em extra gap below rule2t
+\node [anchor=north west] (rule3) at (rulelabel2.south west) {NP(NN(唐纳德) NN(特朗普)) $\to$ Trump};
+
+\end{scope}
+}
+}
+
+\end{tikzpicture}
+\end{center}
+
+\end{frame}
+
+%%%------------------------------------------------------------------------------------------------------------
+%%%  tree binarization (cont.): flat NP next to its binarized form, then the rules listed for the NP-BAR nodes
+\begin{frame}{更多的规则 - 句法树二叉化(续)}
+\begin{itemize}
+\item 一种解决问题的思路是用二叉化方法把树结构变得更深
+
+
+\vspace{-1.5em} % negative skip: moves the picture up toward the bullet text
+\begin{center}
+\begin{tikzpicture} % one picture, two scopes: original tree (left) and binarized tree (right)
+
+{\scriptsize
+\begin{scope}[sibling distance=4pt, level distance=25pt] % left scope: the flat NP with four children
+
+\Tree[.\node(n1){NP};
+     	[.NNP \node(sw1){美国}; ]
+     	[.NN \node(sw2){总统}; ]
+        [.NN \node(sw3){唐纳德}; ]
+        [.NN \node(sw4){特朗普}; ]
+     ]
+
+\node [anchor=north] (tw1) at ([yshift=-2em]sw1.south) {U.S.}; % target words tw1..tw3 are placed 2em below source leaves
+\node [anchor=north] (tw2) at ([yshift=-2em]sw2.south) {President};
+\node [anchor=north] (tw3) at ([yshift=-2em]sw3.south) {Trump};
+
+\draw [-,dashed] (sw1.south) -- (tw1.north); % dashed links from source leaves to target words
+\draw [-,dashed] (sw2.south) -- (tw2.north);
+\draw [-,dashed] (sw3.south) -- (tw3.north);
+\draw [-,dashed] (sw4.south) -- (tw3.north); % sw3 and sw4 both link to tw3
+
+\draw [->,very thick] ([xshift=1em]sw4.east) -- ([xshift=5em]sw4.east) node [pos=0.5,above] {\tiny{二叉化}}; % labelled rightward arrow, from 1em to 5em east of sw4
+
+\end{scope}
+
+\begin{scope}[xshift=2.2in,sibling distance=10pt, level distance=15pt] % right scope, shifted 2.2in: tree with NP-BAR nodes inserted
+
+\Tree[.\node(n1){NP}; % NOTE: n1/sw1..sw4/tw1..tw3 reuse the names from the left scope; references below resolve to these newer nodes
+     	[.NNP \node(sw1){美国}; ]
+	[.NP-BAR
+     	    [.NN \node(sw2){总统}; ]
+	    [.NP-BAR
+                [.NN \node(sw3){唐纳德}; ]
+                [.NN \node(sw4){特朗普}; ]
+             ]
+         ]
+     ]
+
+\node [anchor=north] (tw1) at ([yshift=-4.5em]sw1.south) {U.S.}; % yshift differs per word (-4.5em/-2.75em/-1em); presumably compensates for leaf depth so the words line up -- TODO confirm visually
+\node [anchor=north] (tw2) at ([yshift=-2.75em]sw2.south) {President};
+\node [anchor=north] (tw3) at ([yshift=-1em]sw3.south) {Trump};
+
+\draw [-,dashed] (sw1.south) -- (tw1.north); % same four dashed links as in the left scope
+\draw [-,dashed] (sw2.south) -- (tw2.north);
+\draw [-,dashed] (sw3.south) -- (tw3.north);
+\draw [-,dashed] (sw4.south) -- (tw3.north);
+
+\end{scope}
+
+}
+
+\end{tikzpicture}
+\end{center}
+
+\visible<2->{ % sentence and rule list below are shown from overlay 2 onward
+\small{二叉化增加了更多的可信节点，这也带来了新的规则}
+
+\begin{center} % two rules that use the NP-BAR label from the right-hand tree
+{\footnotesize
+\vspace{0.3em}
+NP-BAR(NN(唐纳德) NN(特朗普)) $\to$ Trump \\
+\vspace{0.3em}
+NP-BAR(NN$_1$ NP-BAR$_2$) $\to$ NN$_1$ NP-BAR$_2$
+\vspace{0.3em}
+}
+\end{center}
+}
+
+\item<3-> 树二叉化已经成为基于句法机器翻译模型的常用方法
+	\begin{itemize}
+	\item 有很多策略：左优先、右优先、head优先等等
+	\item 二叉化可以得到更多(细粒度)规则，保证规则的覆盖度
+	\end{itemize}
+
+\end{itemize}
+
+\end{frame}
+

 %%%------------------------------------------------------------------------------------------------------------
 %%%  基于树结构的翻译文法 - 树到树