FFN pages

a1c93eed · xiaotong · a10f27e4 · a1c93eed · a1c93eed
Commit a1c93eed authored Oct 22, 2019 by xiaotong
--- a/Section05-Neural-Networks-and-Language-Modeling/section05-test.tex
+++ b/Section05-Neural-Networks-and-Language-Modeling/section05-test.tex
@@ -120,76 +120,42 @@


 %%%------------------------------------------------------------------------------------------------------------
-%%% FNNLM implementation
-\begin{frame}{前馈神经网络语言模型的实现}
+%%% 循环神经网络
+\begin{frame}{循环神经网络(Recurrent Neural Networks)}

 \begin{itemize}
-\item 实现非常简单，几行代码
-    \begin{itemize}
-    \item 细节1：做batching时可以把$w[i]$进行扩展，比如放入多个词
-    \item 细节2：TanH一般会用HardTanH实现，因为TanH容易溢出
-    \end{itemize}
-\end{itemize}
-
-\begin{tcolorbox}
-[bicolor,sidebyside,righthand width=3.8cm,size=title,frame engine=empty,
- colback=blue!10!white,colbacklower=black!5!white]
- {\scriptsize
-\begin{tabbing}
-\texttt{XTensor w[3], e[3], h0, y;} \\
-\texttt{XTensor C, H, d, U;} \\
-\texttt{...}\\
-
-\texttt{} \\
-\texttt{for(unsigned i = 0; i < 3; i++)\{} \\
-\texttt{\ \ \ \ e[i] = MMul(w[i], C);}\\
-\texttt{\}}\\
-\texttt{e01 = Concatenate(e[0], e[1], -1);}\\
-\texttt{e = Concatenate(e01, e[2], -1);}\\
-
-\texttt{} \\
-\texttt{h0 = TanH(MMul(e, H) + d);}\\
-\texttt{y = Softmax(MMul(h0, U));}\\
-
-\texttt{} \\
-\texttt{for(unsigned k = 0; k < size; k++)\{} \\
-\texttt{} \ \ \ \ ... // \alert{\texttt{y}}的第$k$元素表示 $\textrm{P}(w|...)$\\
-\texttt{} \ \ \ \ ... // $w$为词汇表里第$k$个词\\
-\texttt{\}}
-
-\end{tabbing}
-}
-\tcblower
+\item FFN LM固然有效，但是和传统的$n$-gram LM一样，需要依赖\alert{有限上下文}假设
 \begin{center}
 \begin{tikzpicture}
 \begin{scope}
-\node [anchor=west] (w0) at (0,0) {\scriptsize{$w_{i-3}$}};
-\node [anchor=west] (w1) at ([xshift=0.5em]w0.east) {\scriptsize{$w_{i-2}$}};
-\node [anchor=west] (w2) at ([xshift=0.5em]w1.east) {\scriptsize{$w_{i-1}$}};
-\node [anchor=north] (index0) at ([yshift=0.5em]w0.south) {\tiny(index)};
-\node [anchor=north] (index1) at ([yshift=0.5em]w1.south) {\tiny(index)};
-\node [anchor=north] (index2) at ([yshift=0.5em]w2.south) {\tiny(index)};
-\node [anchor=south,draw,inner sep=3pt,align=left] (e0) at ([yshift=1.0em]w0.north) {\tiny{$e_0:$}\\\tiny{$w_{i-3} \textbf{C}$}};
-\node [anchor=south,draw,inner sep=3pt,align=left] (e1) at ([yshift=1.0em]w1.north) {\tiny{$e_1:$}\\\tiny{$w_{i-2} \textbf{C}$}};
-\node [anchor=south,draw,inner sep=3pt,align=left] (e2) at ([yshift=1.0em]w2.north) {\tiny{$e_2:$}\\\tiny{$w_{i-1} \textbf{C}$}};
-\node [anchor=south,draw,minimum width=9em,inner sep=3pt] (h0) at ([yshift=1.5em]e1.north) {\tiny{$h_0=\textrm{Tanh}([e_0,e_1,e_2] \textbf{H} + \textbf{d})$}};
-\node [anchor=south,draw,minimum width=9em,inner sep=3pt] (h1) at ([yshift=1.5em]h0.north) {\tiny{$y=\textrm{Softmax}(h_0 \textbf{U})$}};
-\node [anchor=south] (ylabel) at ([yshift=1em]h1.north) {\scriptsize{$\textrm{P}(w_i|w_{i-3}w_{i-2}w_{i-1})$}};
-
-\draw [->] ([yshift=0.1em]w0.north) -- ([yshift=-0.1em]e0.south);
-\draw [->] ([yshift=0.1em]w1.north) -- ([yshift=-0.1em]e1.south);
-\draw [->] ([yshift=0.1em]w2.north) -- ([yshift=-0.1em]e2.south);
-\draw [->] ([yshift=0.1em]e0.north) -- ([xshift=-2em,yshift=-0.1em]h0.south);
-\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]h0.south);
-\draw [->] ([yshift=0.1em]e2.north) -- ([xshift=2em,yshift=-0.1em]h0.south);
-\draw [->] ([yshift=0.1em]h0.north) -- ([yshift=-0.1em]h1.south);
-\draw [->] ([yshift=0.1em]h1.north) -- ([yshift=-0.1em]ylabel.south);
+\node [anchor=west] (w0) at (0,0) {$w_1$};
+\node [anchor=west] (w1) at ([xshift=0.5em]w0.east) {$w_2$};
+\node [anchor=west] (w2) at ([xshift=0.5em]w1.east) {$...$};
+\node [anchor=west] (w3) at ([xshift=0.5em]w2.east) {$w_{m-n+1}$};
+\node [anchor=west] (w4) at ([xshift=0.5em]w3.east) {$...$};
+\node [anchor=west,fill=green!20!white] (w5) at ([xshift=0.5em]w4.east) {$w_{m}$};
+\draw [->,thick,ublue] (w5.south).. controls +(210:0.5) and +(-30:0.5) .. (w3.south);
+\draw [->,thick,red] (w5.north).. controls +(150:1) and +(30:1) .. (w1.north);
+\draw [->,very thick,ublue] ([xshift=-5em,yshift=1em]w0.west) -- ([xshift=-6.5em,yshift=1em]w0.west) node [pos=0,right] {\scriptsize{依赖}};
+\draw [->,very thick,red] ([xshift=-5em,yshift=-0.5em]w0.west) -- ([xshift=-6.5em,yshift=-0.5em]w0.west) node [pos=0,right] {\scriptsize{不依赖}};
+
 \end{scope}
 \end{tikzpicture}
 \end{center}
-\end{tcolorbox}
-\vspace{-0.5em}
-\footnotesize{注: size表示词汇表大小}
+\item<2-> 能否直接对原始问题建模，即定义函数$g$，对于任意的$w_{1} ... w_{m}$有
+    \vspace{-0.5em}
+    \begin{displaymath}
+    g(w_{1} ... w_{m}) \approx \textrm{P}(w_m | w_{1} ... w_{m-1})
+    \end{displaymath}
+\item<3-> \textbf{循环神经网络(RNNs)}可以很好地解决上述问题，因此也被成功地应用于语言建模任务
+	\begin{itemize}
+	\item 它假设每个词的生成都依赖已经生成的所有词
+	\item 对于不同位置的词的生成概率都可以用同一个函数描述
+	\end{itemize}
+	
+        \textbf{Recurrent Neural Network Based Language Model}\\
+        \textbf{Mikolov et al., 2010, In Proc. of Interspeech, 1045-1048}
+\end{itemize}

 \end{frame}


--- a/Section05-Neural-Networks-and-Language-Modeling/section05.tex
+++ b/Section05-Neural-Networks-and-Language-Modeling/section05.tex
@@ -3847,14 +3847,14 @@ NLP问题的\alert{隐含结构}假设 & 无隐含结构假设，\alert{端到
    \item 随着$n$的增大，\alert{数据稀疏}问题会非常严重，因为绝大多数$n$-gram是没见过的
    \item 因为要维护$n$-gram的索引，存储消耗大
    \end{itemize}
-\item<2-> 另一种思路是直接对$\textrm{P}(w_m | w_{m-n+1} ... w_{m-1})$进行连续空间建模，即定义函数$g$使得对于任意的$w_{m-n+1} ... w_{m}$
+\item<2-> 另一种思路是直接对$\textrm{P}(w_m | w_{m-n+1} ... w_{m-1})$进行连续空间建模，即定义函数$g$，对于任意的$w_{m-n+1} ... w_{m}$有
    \begin{displaymath}
    g(w_{m-n+1} ... w_{m}) \approx \textrm{P}(w_m | w_{m-n+1} ... w_{m-1})
    \end{displaymath}



-\item<3-> 最具代表性的方法是基于前馈神经网络的语言模型
+\item<3-> 最具代表性的方法是前馈神经网络(FFN)语言模型
    \begin{itemize}
    \item 经典中的经典，对现代神经语言模型的设计产生深远影响
    \end{itemize}
@@ -3972,7 +3972,7 @@ NLP问题的\alert{隐含结构}假设 & 无隐含结构假设，\alert{端到

 %%%------------------------------------------------------------------------------------------------------------
 %%% FNNLM implementation
-\begin{frame}{前馈神经网络语言模型的实现}
+\begin{frame}{前馈神经网络语言模型(FFN LM)的实现}

 \begin{itemize}
 \item 实现非常简单，几行代码
@@ -3992,9 +3992,8 @@ NLP问题的\alert{隐含结构}假设 & 无隐含结构假设，\alert{端到
 \texttt{...}\\

 \texttt{} \\
-\texttt{for(unsigned i = 0; i < 3; i++)\{} \\
+\texttt{for(unsigned i = 0; i < 3; i++)} \\
 \texttt{\ \ \ \ e[i] = MMul(w[i], C);}\\
-\texttt{\}}\\
 \texttt{e01 = Concatenate(e[0], e[1], -1);}\\
 \texttt{e = Concatenate(e01, e[2], -1);}\\

@@ -4045,6 +4044,73 @@ NLP问题的\alert{隐含结构}假设 & 无隐含结构假设，\alert{端到
 \end{frame}

 %%%------------------------------------------------------------------------------------------------------------
+%%% 神经语言模型给我们带来了什么
+\begin{frame}{神经语言建模的意义}
+
+\begin{itemize}
+\item Bengio et al. (2003)中有待讨论的问题
+    \begin{enumerate}
+    \item 神经网络每一层究竟学到了什么 \\
+    词汇、句法？还是其它一些知识？如何解释？
+    \item 网络的层数变多会怎样 - 10层、20层、100层的网络 \\
+    \# of layers: 10 $\to$ 20 $\to$ 100 $\to$ 1000
+    \item 超参(比如隐藏层大小)如何选择 - 不同任务的最优设置\\
+    单词的分布式表示维度多大好？\\
+    隐层多大好？\\
+    激活函数如何选择？\\
+    ...
+    \end{enumerate}
+\item<2-> 从FFN LM得到的启发
+    \begin{itemize}
+    \item 重新定义词是什么 - 非词典里的一项，而是一个实数向量
+    \item 多层神经网络可以很好地表示单词之间的(短距离)依赖
+    \item $n$-gram的生成概率可以使用连续空间函数描述，缓解数据稀疏问题，模型并不需要记录完整的$n$-gram
+    \end{itemize}
+\end{itemize}
+
+\end{frame}
+
+%%%------------------------------------------------------------------------------------------------------------
+%%% 循环神经网络
+\begin{frame}{循环神经网络(Recurrent Neural Networks)}
+
+\begin{itemize}
+\item FFN LM固然有效，但是和传统的$n$-gram LM一样，需要依赖\alert{有限上下文}假设
+\begin{center}
+\begin{tikzpicture}
+\begin{scope}
+\node [anchor=west] (w0) at (0,0) {$w_1$};
+\node [anchor=west] (w1) at ([xshift=0.5em]w0.east) {$w_2$};
+\node [anchor=west] (w2) at ([xshift=0.5em]w1.east) {$...$};
+\node [anchor=west] (w3) at ([xshift=0.5em]w2.east) {$w_{m-n+1}$};
+\node [anchor=west] (w4) at ([xshift=0.5em]w3.east) {$...$};
+\node [anchor=west,fill=green!20!white] (w5) at ([xshift=0.5em]w4.east) {$w_{m}$};
+\draw [->,thick,ublue] (w5.south).. controls +(210:0.5) and +(-30:0.5) .. (w3.south);
+\draw [->,thick,red] (w5.north).. controls +(150:1) and +(30:1) .. (w1.north);
+\draw [->,very thick,ublue] ([xshift=-5em,yshift=1em]w0.west) -- ([xshift=-6.5em,yshift=1em]w0.west) node [pos=0,right] {\scriptsize{依赖}};
+\draw [->,very thick,red] ([xshift=-5em,yshift=-0.5em]w0.west) -- ([xshift=-6.5em,yshift=-0.5em]w0.west) node [pos=0,right] {\scriptsize{不依赖}};
+
+\end{scope}
+\end{tikzpicture}
+\end{center}
+\item<2-> 能否直接对原始问题建模，即定义函数$g$，对于任意的$w_{1} ... w_{m}$有
+    \vspace{-0.5em}
+    \begin{displaymath}
+    g(w_{1} ... w_{m}) \approx \textrm{P}(w_m | w_{1} ... w_{m-1})
+    \end{displaymath}
+\item<3-> \textbf{循环神经网络(RNNs)}可以很好地解决上述问题，因此也被成功地应用于语言建模任务
+	\begin{itemize}
+	\item 它假设每个词的生成都依赖已经生成的所有词
+	\item 对于不同位置的词的生成概率都可以用同一个函数描述
+	\end{itemize}
+	
+        \textbf{Recurrent Neural Network Based Language Model}\\
+        \textbf{Mikolov et al., 2010, In Proc. of Interspeech, 1045-1048}
+\end{itemize}
+
+\end{frame}
+
+%%%------------------------------------------------------------------------------------------------------------
 \subsection{词嵌入}

 %%%------------------------------------------------------------------------------------------------------------