NiuTrans / Toy-MT-Introduction · Commits

Commit a10f27e4, authored Oct 22, 2019 by xiaotong

    new pages

parent c7c13816

Showing 2 changed files, with 321 additions and 21 deletions:
- Section05-Neural-Networks-and-Language-Modeling/section05-test.tex (+69 −21)
- Section05-Neural-Networks-and-Language-Modeling/section05.tex (+252 −0)

Section05-Neural-Networks-and-Language-Modeling/section05-test.tex (view file @ a10f27e4)
...
...
@@ -118,31 +118,79 @@
%%%------------------------------------------------------------------------------------------------------------
\subsection{Feed-forward, Recurrent, and Self-attention Neural Networks}
%%%------------------------------------------------------------------------------------------------------------
%%% n-gram language models
\begin{frame}{$n$-gram Language Models}
\begin{itemize}
\item \textbf{Chain rule}
\begin{eqnarray}
\textrm{P}(w_1 w_2 ... w_m) & = & \textrm{P}(w_1) \textrm{P}(w_2|w_1) \textrm{P}(w_3 | w_1 w_2) ... \nonumber \\
& & \textrm{P}(w_m|w_1 ... w_{m-1}) \nonumber
\end{eqnarray}
\item \textbf{Traditional $n$-gram language model}: the current word depends only on the previous $n-1$ words
\begin{eqnarray}
\textrm{P}(w_1 w_2 ... w_m) & = & \textrm{P}(w_1) \textrm{P}(w_2|w_1) \textrm{P}(w_3 | w_1 w_2) ... \nonumber \\
& & \textrm{P}(w_m|\underbrace{w_{m-n+1} ... w_{m-1}}_{\text{previous $n-1$ words}}) \nonumber
\end{eqnarray}
\ \ \ \ \ \ where
\begin{displaymath}
\textrm{P}(w_m | w_{m-n+1} ... w_{m-1}) = \frac{\textrm{count}(w_{m-n+1} ... w_m)}{\textrm{count}(w_{m-n+1} ... w_{m-1})}
\end{displaymath}
\ \ \ \ \ \ $\textrm{count}(\cdot)$ denotes the frequency counted on the training data
\end{itemize}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% FNNLM implementation
\begin{frame}{Implementing the Feed-forward Neural Network Language Model}
\begin{itemize}
\item The implementation is very simple: just a few lines of code
\begin{itemize}
\item Detail 1: for batching, $w[i]$ can be extended to hold multiple words at once
\item Detail 2: TanH is usually implemented as HardTanH, because TanH overflows easily
\end{itemize}
\end{itemize}
\end{frame}
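The HardTanH detail mentioned above can be sketched in a few lines of plain C++ (a hypothetical stand-alone helper, not the NiuTensor implementation): clipping to $[-1, 1]$ replaces the exp-based tanh computation, which can overflow for large activations.

```cpp
#include <algorithm>
#include <cassert>

// HardTanh: a piecewise-linear surrogate for tanh that simply clips to [-1, 1].
// Unlike tanh, it never evaluates exp(), so large inputs cannot overflow.
double hard_tanh(double x) {
    return std::max(-1.0, std::min(1.0, x));
}
```

In the saturated regions both functions return values at or near ±1, and near zero both are approximately linear, so the surrogate changes the model little in practice.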
%%%------------------------------------------------------------------------------------------------------------
%%% feed-forward neural network language model
\begin{frame}{Neural Network Modeling of $n$-gram Probabilities}
\begin{tcolorbox}
[bicolor,sidebyside,righthand width=3.8cm,size=title,frame engine=empty,
colback=blue!10!white,colbacklower=black!5!white]
{\scriptsize
\begin{tabbing}
\texttt{XTensor w[3], e[3], h0, y;} \\
\texttt{XTensor C, H, d, U;} \\
\texttt{...} \\
\texttt{} \\
\texttt{for(unsigned i = 0; i < 3; i++)\{} \\
\texttt{\ \ \ \ e[i] = MMul(w[i], C);} \\
\texttt{\}} \\
\texttt{e01 = Concatenate(e[0], e[1], -1);} \\
\texttt{e = Concatenate(e01, e[2], -1);} \\
\texttt{} \\
\texttt{h0 = TanH(MMul(e, H) + d);} \\
\texttt{y = Softmax(MMul(h0, U));} \\
\texttt{} \\
\texttt{for(unsigned k = 0; k < size; k++)\{} \\
\texttt{}\ \ \ \ ... // the $k$-th element of \alert{\texttt{y}} is $\textrm{P}(w|...)$ \\
\texttt{}\ \ \ \ ... // $w$ is the $k$-th word in the vocabulary \\
\texttt{\}}
\end{tabbing}
}
\tcblower
\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=west] (w0) at (0,0) {\scriptsize{$w_{i-3}$}};
\node [anchor=west] (w1) at ([xshift=0.5em]w0.east) {\scriptsize{$w_{i-2}$}};
\node [anchor=west] (w2) at ([xshift=0.5em]w1.east) {\scriptsize{$w_{i-1}$}};
\node [anchor=north] (index0) at ([yshift=0.5em]w0.south) {\tiny (index)};
\node [anchor=north] (index1) at ([yshift=0.5em]w1.south) {\tiny (index)};
\node [anchor=north] (index2) at ([yshift=0.5em]w2.south) {\tiny (index)};
\node [anchor=south,draw,inner sep=3pt,align=left] (e0) at ([yshift=1.0em]w0.north) {\tiny{$e_0:$} \\ \tiny{$w_{i-3}\textbf{C}$}};
\node [anchor=south,draw,inner sep=3pt,align=left] (e1) at ([yshift=1.0em]w1.north) {\tiny{$e_1:$} \\ \tiny{$w_{i-2}\textbf{C}$}};
\node [anchor=south,draw,inner sep=3pt,align=left] (e2) at ([yshift=1.0em]w2.north) {\tiny{$e_2:$} \\ \tiny{$w_{i-1}\textbf{C}$}};
\node [anchor=south,draw,minimum width=9em,inner sep=3pt] (h0) at ([yshift=1.5em]e1.north) {\tiny{$h_0 = \textrm{Tanh}([e_0,e_1,e_2]\textbf{H} + \textbf{d})$}};
\node [anchor=south,draw,minimum width=9em,inner sep=3pt] (h1) at ([yshift=1.5em]h0.north) {\tiny{$y = \textrm{Softmax}(h_0\textbf{U})$}};
\node [anchor=south] (ylabel) at ([yshift=1em]h1.north) {\scriptsize{$\textrm{P}(w_i|w_{i-3} w_{i-2} w_{i-1})$}};
\draw [->] ([yshift=0.1em]w0.north) -- ([yshift=-0.1em]e0.south);
\draw [->] ([yshift=0.1em]w1.north) -- ([yshift=-0.1em]e1.south);
\draw [->] ([yshift=0.1em]w2.north) -- ([yshift=-0.1em]e2.south);
\draw [->] ([yshift=0.1em]e0.north) -- ([xshift=-2em,yshift=-0.1em]h0.south);
\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]h0.south);
\draw [->] ([yshift=0.1em]e2.north) -- ([xshift=2em,yshift=-0.1em]h0.south);
\draw [->] ([yshift=0.1em]h0.north) -- ([yshift=-0.1em]h1.south);
\draw [->] ([yshift=0.1em]h1.north) -- ([yshift=-0.1em]ylabel.south);
\end{scope}
\end{tikzpicture}
\end{center}
\end{tcolorbox}
\vspace{-0.5em}
\footnotesize{Note: size is the vocabulary size}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
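The XTensor pseudo-code on the slide above can be mirrored with plain `std::vector` arithmetic. The following is only a sketch of the same forward pass (embedding lookup, concatenation, one TanH layer, then Softmax); all names and dimensions are illustrative and this is not NiuTensor's actual API.

```cpp
#include <algorithm>
#include <cassert>
#include <cmath>
#include <vector>

using Vec = std::vector<double>;
using Mat = std::vector<Vec>;  // row-major: Mat[i] is row i

// Row-vector times matrix: (1 x n) * (n x m) -> (1 x m).
Vec mmul(const Vec& v, const Mat& M) {
    Vec out(M[0].size(), 0.0);
    for (size_t i = 0; i < v.size(); ++i)
        for (size_t j = 0; j < out.size(); ++j)
            out[j] += v[i] * M[i][j];
    return out;
}

// Numerically stable softmax over a vector.
Vec softmax(Vec v) {
    double mx = *std::max_element(v.begin(), v.end());
    double sum = 0.0;
    for (double& x : v) { x = std::exp(x - mx); sum += x; }
    for (double& x : v) x /= sum;
    return v;
}

// Forward pass of the 4-gram FNNLM: three context word indices in,
// a probability distribution over the vocabulary out.
Vec fnnlm_forward(const std::vector<int>& ctx,  // indices of w[0..2]
                  const Mat& C,                 // embedding table
                  const Mat& H, const Vec& d,   // hidden layer
                  const Mat& U) {               // output layer
    Vec e;  // e = [e0, e1, e2]: concatenated embeddings
    for (int w : ctx) e.insert(e.end(), C[w].begin(), C[w].end());
    Vec h0 = mmul(e, H);
    for (size_t i = 0; i < h0.size(); ++i) h0[i] = std::tanh(h0[i] + d[i]);
    return softmax(mmul(h0, U));  // y[k] = P(k-th word | context)
}
```

With toy parameters the output is a proper distribution: its entries are positive and sum to 1, matching the Softmax step on the slide.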
%%%------------------------------------------------------------------------------------------------------------
...
...
Section05-Neural-Networks-and-Language-Modeling/section05.tex (view file @ a10f27e4)
...
...
@@ -3750,6 +3750,28 @@ $+2x^2+x+1)$ & \ \ $(x^4+2x^3+2x^2+x+1)$ & $+6x+1$ \\
\section{Neural Language Models}
%%%------------------------------------------------------------------------------------------------------------
%%% outline: neural language modeling
\begin{frame}{Getting to the Main Topic}
\vspace{6em}
\begin{tcolorbox}
[enhanced,size=normal,left=2mm,right=1mm,colback=red!5!white,colframe=red!75!black,drop fuzzy shadow]
{\Large
\textbf{How can neural networks be applied to NLP?}
\vspace{0.4em}
\textbf{- Neural network modeling of language models}
}
\end{tcolorbox}
\vspace{2em}
\begin{center}
\begin{tikzpicture}
\end{tikzpicture}
\end{center}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
\subsection{Feed-forward, Recurrent, and Self-attention Neural Networks}
%%%------------------------------------------------------------------------------------------------------------
...
...
@@ -3793,6 +3815,236 @@ NLP问题的\alert{隐含结构}假设 & 无隐含结构假设,\alert{端到
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% n-gram language models
\begin{frame}{$n$-gram Language Models}
\begin{itemize}
\item \textbf{Chain rule}
\begin{eqnarray}
\textrm{P}(w_1 w_2 ... w_m) & = & \textrm{P}(w_1) \textrm{P}(w_2|w_1) \textrm{P}(w_3 | w_1 w_2) ... \nonumber \\
& & \textrm{P}(w_m|w_1 ... w_{m-1}) \nonumber
\end{eqnarray}
\item<2-> \textbf{Traditional $n$-gram language model}: the current word depends only on the previous $n-1$ words
\begin{eqnarray}
\textrm{P}(w_1 w_2 ... w_m) & = & \textrm{P}(w_1) \textrm{P}(w_2|w_1) \textrm{P}(w_3 | w_1 w_2) ... \nonumber \\
& & \textrm{P}(w_m|\underbrace{w_{m-n+1} ... w_{m-1}}_{\text{previous $n-1$ words}}) \nonumber
\end{eqnarray}
\vspace{-1.0em}
\ \ \ \ \ \ where
\begin{displaymath}
\textrm{P}(w_m | w_{m-n+1} ... w_{m-1}) = \frac{\textrm{count}(w_{m-n+1} ... w_m)}{\textrm{count}(w_{m-n+1} ... w_{m-1})}
\end{displaymath}
\ \ \ \ \ \ $\textrm{count}(\cdot)$ denotes the frequency counted on the training data
\end{itemize}
\end{frame}
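The relative-frequency estimate in the count formula above can be sketched directly as a toy helper over a tokenized corpus (names here are illustrative, and no smoothing is applied):

```cpp
#include <cassert>
#include <string>
#include <vector>

// Count how often `pattern` occurs as a contiguous subsequence of `corpus`.
int count_occ(const std::vector<std::string>& corpus,
              const std::vector<std::string>& pattern) {
    if (pattern.empty() || pattern.size() > corpus.size()) return 0;
    int c = 0;
    for (size_t i = 0; i + pattern.size() <= corpus.size(); ++i) {
        bool match = true;
        for (size_t j = 0; j < pattern.size(); ++j)
            if (corpus[i + j] != pattern[j]) { match = false; break; }
        if (match) ++c;
    }
    return c;
}

// Relative-frequency estimate:
// P(w | history) = count(history + w) / count(history)
double ngram_prob(const std::vector<std::string>& corpus,
                  std::vector<std::string> history,
                  const std::string& w) {
    int denom = count_occ(corpus, history);
    history.push_back(w);
    int num = count_occ(corpus, history);
    return denom == 0 ? 0.0 : static_cast<double>(num) / denom;
}
```

The zero-denominator guard is exactly where the data-sparseness problem discussed on the next slide shows up: unseen histories get no usable estimate without smoothing.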
%%%------------------------------------------------------------------------------------------------------------
%%% n-gram lm => neural lm
\begin{frame}
{$
n
$
-gram生成概率的神经网络建模
}
\begin{itemize}
\item
传统的
$
n
$
-gram语言模型实际上就是一个查询表,用
$
w
_{
m
-
n
+
1
}
... w
_{
m
}$
查询
$
n
$
-gram概率
$
\textrm
{
P
}
(
w
_
m | w
_{
m
-
n
+
1
}
... w
_{
m
-
1
}
)
$
\begin{itemize}
\item
这张表本质上是一种
$
w
_{
m
-
n
+
1
}
... w
_{
m
}$
的
\alert
{
离散表示
}
\item
随着
$
n
$
的增大,
\alert
{
数据稀疏
}
问题会非常严重,因为绝大多数
$
n
$
-gram是没见过的
\item
因为要维护
$
n
$
-gram的索引,存储消耗大
\end{itemize}
\item
<2-> 另一种思路是直接对
$
\textrm
{
P
}
(
w
_
m | w
_{
m
-
n
+
1
}
... w
_{
m
-
1
}
)
$
进行连续空间建模,即定义函数
$
g
$
使得对于任意的
$
w
_{
m
-
n
+
1
}
... w
_{
m
}$
\begin{displaymath}
g(w
_{
m-n+1
}
... w
_{
m
}
)
\approx
\textrm
{
P
}
(w
_
m | w
_{
m-n+1
}
... w
_{
m-1
}
)
\end{displaymath}
\item
<3-> 最具代表性的方法是基于前馈神经网络的语言模型
\begin{itemize}
\item
经典中的经典,对现代神经语言模型的设计产生深远影响
\end{itemize}
\textbf
{
A Neural Probabilistic Language Model
}
\\
\textbf
{
Bengio et al., 2003, Journal of Machine Learning Research 3: 1137-1155
}
\end{itemize}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% FNNLM architecture
\begin{frame}{The Feed-forward Neural Network Language Model (Bengio et al., 2003)}
\begin{itemize}
\item Take a 4-gram language model as an example
\end{itemize}
\vspace{-1em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=west] (w0) at (0,0) {\footnotesize{$w_{i-3}$}};
\node [anchor=west] (w1) at ([xshift=2em]w0.east) {\footnotesize{$w_{i-2}$}};
\node [anchor=west] (w2) at ([xshift=2em]w1.east) {\footnotesize{$w_{i-1}$}};
\node [anchor=north] (index0) at ([yshift=0.5em]w0.south) {\tiny (index)};
\node [anchor=north] (index1) at ([yshift=0.5em]w1.south) {\tiny (index)};
\node [anchor=north] (index2) at ([yshift=0.5em]w2.south) {\tiny (index)};
\node [anchor=south,draw,inner sep=3pt] (e0) at ([yshift=1em]w0.north) {\tiny{$e_0 = w_{i-3}\textbf{C}$}};
\node [anchor=south,draw,inner sep=3pt] (e1) at ([yshift=1em]w1.north) {\tiny{$e_1 = w_{i-2}\textbf{C}$}};
\node [anchor=south,draw,inner sep=3pt] (e2) at ([yshift=1em]w2.north) {\tiny{$e_2 = w_{i-1}\textbf{C}$}};
\node [anchor=south,draw,minimum width=9em,inner sep=3pt] (h0) at ([yshift=1.5em]e1.north) {\tiny{$h_0 = \textrm{Tanh}([e_0,e_1,e_2]\textbf{H} + \textbf{d})$}};
\node [anchor=south,draw,minimum width=9em,inner sep=3pt] (h1) at ([yshift=1.5em]h0.north) {\tiny{$y = \textrm{Softmax}(h_0\textbf{U})$}};
\node [anchor=south] (ylabel) at ([yshift=1em]h1.north) {\footnotesize{$\textrm{P}(w_i|w_{i-3} w_{i-2} w_{i-1})$}};
\draw [->] ([yshift=0.1em]w0.north) -- ([yshift=-0.1em]e0.south);
\draw [->] ([yshift=0.1em]w1.north) -- ([yshift=-0.1em]e1.south);
\draw [->] ([yshift=0.1em]w2.north) -- ([yshift=-0.1em]e2.south);
\draw [->] ([yshift=0.1em]e0.north) -- ([xshift=-2em,yshift=-0.1em]h0.south);
\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]h0.south);
\draw [->] ([yshift=0.1em]e2.north) -- ([xshift=2em,yshift=-0.1em]h0.south);
\draw [->] ([yshift=0.1em]h0.north) -- ([yshift=-0.1em]h1.south);
\draw [->] ([yshift=0.1em]h1.north) -- ([yshift=-0.1em]ylabel.south);
\visible<6->{
\draw [->,dashed,red,thick] ([xshift=1em,yshift=0.1em]e1.north) -- ([xshift=1em,yshift=-0.1em]h1.south);
\draw [->,dashed,red,thick] ([xshift=-1em,yshift=0.1em]e0.north) .. controls +(north:2) and +(south:1) .. ([xshift=-3em,yshift=-0.1em]h1.south);
\draw [->,dashed,red,thick] ([xshift=1em,yshift=0.1em]e2.north) .. controls +(north:2) and +(south:1) .. ([xshift=3em,yshift=-0.1em]h1.south);
}
\begin{pgfonlayer}{background}
\visible<2->{
\node [rectangle,inner sep=0.1em,fill=ugreen!20!white] [fit = (w0) (index0)] (wordbox0) {};
\node [rectangle,inner sep=0.1em,fill=ugreen!20!white] [fit = (w1) (index1)] (wordbox1) {};
\node [rectangle,inner sep=0.1em,fill=ugreen!20!white] [fit = (w2) (index2)] (wordbox2) {};
}
\end{pgfonlayer}
\visible<3->{
\node [anchor=south,draw,inner sep=3pt,fill=blue!20!white] (e0) at ([yshift=1em]w0.north) {\tiny{$e_0 = w_{i-3}\textbf{C}$}};
\node [anchor=south,draw,inner sep=3pt,fill=blue!20!white] (e1) at ([yshift=1em]w1.north) {\tiny{$e_1 = w_{i-2}\textbf{C}$}};
\node [anchor=south,draw,inner sep=3pt,fill=blue!20!white] (e2) at ([yshift=1em]w2.north) {\tiny{$e_2 = w_{i-1}\textbf{C}$}};
}
\visible<5->{
\node [anchor=south,draw,minimum width=9em,inner sep=3pt,fill=orange!20!white] (h0) at ([yshift=1.5em]e1.north) {\tiny{$h_0 = \textrm{Tanh}([e_0,e_1,e_2]\textbf{H} + \textbf{d})$}};
\node [anchor=south,draw,minimum width=9em,inner sep=3pt,fill=orange!20!white] (h1) at ([yshift=1.5em]h0.north) {\tiny{$y = \textrm{Softmax}(h_0\textbf{U})$}};
}
\visible<2->{
\node [anchor=north west] (indexlabel0) at ([yshift=-0.5em,xshift=-1.2em]index0.south west) {\scriptsize{{\color{ugreen} \textbf{One-hot representation}}}};
\node [anchor=north west] (indexlabel1) at ([yshift=0.3em]indexlabel0.south west) {\scriptsize{Each word is represented by a 0-1 vector of vocabulary size,}};
\node [anchor=north west] (indexlabel2) at ([yshift=0.3em]indexlabel1.south west) {\scriptsize{with exactly one position set to 1 and the rest 0, e.g.:}};
\node [anchor=north west] (indexlabel3) at ([yshift=0.0em]indexlabel2.south west) {\scriptsize{$(0,0,{\red 1},0,0,0,0,0,0,0,0,0)$}};
\node [anchor=north west] (indexlabel4) at ([xshift=1em,yshift=0.0em]indexlabel3.south west) {\scriptsize{the 3rd word in the vocabulary}};
\draw [->] ([xshift=1.2em,yshift=-0.2em]indexlabel4.north west) -- ([xshift=1.2em,yshift=0.3em]indexlabel4.north west);
}
\visible<3->{
\node [anchor=west] (embedinglabel0) at ([xshift=1em,yshift=-1em]e2.east) {\scriptsize{{\blue \textbf{Distributed word representation}}}};
\node [anchor=north west] (embedinglabel1) at ([yshift=0.3em]embedinglabel0.south west) {\scriptsize{the 0-1 vector of a word is multiplied by a matrix $\textbf{C}$; here we can}};
\node [anchor=north west] (embedinglabel2) at ([yshift=0.3em]embedinglabel1.south west) {\scriptsize{view $\textbf{C}$ as a lookup table}};
}
\visible<4->{
\node [anchor=north west] (wordvector) at ([yshift=-1em]embedinglabel2.south west) {\tiny{$(0,0,{\red 1},...)$}};
\node [anchor=west] (timeslabel) at ([xshift=-0.3em]wordvector.east) {\footnotesize{$\times$}};
\node [anchor=north west,inner sep=2pt] (embeddingmatrix) at ([xshift=1em]wordvector.north east) {\tiny{$\begin{pmatrix} 0 & 1 & 3 \\ .2 & -1 & .3 \\ 1 & 7 & .3 \\ ... \end{pmatrix}$}};
\node [anchor=south,inner sep=1pt] (wordvectorlabel) at (wordvector.north) {\scriptsize{$w_{i-1}$}};
\node [anchor=south,inner sep=1pt] (embeddingmatrixlabel) at (embeddingmatrix.north) {\scriptsize{$\textbf{C}$}};
\node [anchor=north west] (selectedlabel) at ([yshift=-2em]wordvector.south west) {\scriptsize{then the row of $\textbf{C}$ selected by the index is output (i.e., $e_{i-1}$)}};
\begin{pgfonlayer}{background}
\visible<4->{
\node [anchor=north west,fill=blue!20!white,minimum height=0.6em,minimum width=5.0em] (selected) at ([yshift=-1.3em]embeddingmatrix.north west) {};
}
\end{pgfonlayer}
\draw [->] ([xshift=0.15em,yshift=0.3em]wordvector.south) .. controls +(south:0.3) and +(west:0.5) .. (selected.west);
}
\visible<5->{
\node [anchor=south west] (hiddenlabel0) at ([yshift=5em]embedinglabel0.north west) {\scriptsize{{\color{orange} \textbf{Multi-layer neural network}}}};
\node [anchor=north west] (hiddenlabel1) at ([yshift=0.3em]hiddenlabel0.south west) {\scriptsize{$[e_0,e_1,e_2]$ denotes the concatenation of the three vectors,}};
\node [anchor=north west] (hiddenlabel2) at ([yshift=0.3em]hiddenlabel1.south west) {\scriptsize{which then goes through two layers, with a Softmax output at the end}};
\node [anchor=north west] (hiddenlabel3) at ([yshift=0.3em]hiddenlabel2.south west) {\scriptsize{note that $h_0\textbf{U}$ yields a vector covering all words,}};
\node [anchor=north west] (hiddenlabel4) at ([yshift=0.3em]hiddenlabel3.south west) {\scriptsize{and Softmax ensures the output is a distribution over the vocabulary}};
}
\visible<6->{
\node [anchor=south west] (directlabel0) at ([yshift=1em]hiddenlabel0.north west) {\scriptsize{\alert{\textbf{Direct connections from lower layers to upper layers (optional)}}}};
}
\end{scope}
\end{tikzpicture}
\end{center}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
%%% FNNLM implementation
\begin{frame}{Implementing the Feed-forward Neural Network Language Model}
\begin{itemize}
\item The implementation is very simple: just a few lines of code
\begin{itemize}
\item Detail 1: for batching, $w[i]$ can be extended to hold multiple words at once
\item Detail 2: TanH is usually implemented as HardTanH, because TanH overflows easily
\end{itemize}
\end{itemize}
\begin{tcolorbox}
[bicolor,sidebyside,righthand width=3.8cm,size=title,frame engine=empty,
colback=blue!10!white,colbacklower=black!5!white]
{\scriptsize
\begin{tabbing}
\texttt{XTensor w[3], e[3], h0, y;} \\
\texttt{XTensor C, H, d, U;} \\
\texttt{...} \\
\texttt{} \\
\texttt{for(unsigned i = 0; i < 3; i++)\{} \\
\texttt{\ \ \ \ e[i] = MMul(w[i], C);} \\
\texttt{\}} \\
\texttt{e01 = Concatenate(e[0], e[1], -1);} \\
\texttt{e = Concatenate(e01, e[2], -1);} \\
\texttt{} \\
\texttt{h0 = TanH(MMul(e, H) + d);} \\
\texttt{y = Softmax(MMul(h0, U));} \\
\texttt{} \\
\texttt{for(unsigned k = 0; k < size; k++)\{} \\
\texttt{}\ \ \ \ ... // the $k$-th element of \alert{\texttt{y}} is $\textrm{P}(w|...)$ \\
\texttt{}\ \ \ \ ... // $w$ is the $k$-th word in the vocabulary \\
\texttt{\}}
\end{tabbing}
}
\tcblower
\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=west] (w0) at (0,0) {\scriptsize{$w_{i-3}$}};
\node [anchor=west] (w1) at ([xshift=0.5em]w0.east) {\scriptsize{$w_{i-2}$}};
\node [anchor=west] (w2) at ([xshift=0.5em]w1.east) {\scriptsize{$w_{i-1}$}};
\node [anchor=north] (index0) at ([yshift=0.5em]w0.south) {\tiny (index)};
\node [anchor=north] (index1) at ([yshift=0.5em]w1.south) {\tiny (index)};
\node [anchor=north] (index2) at ([yshift=0.5em]w2.south) {\tiny (index)};
\node [anchor=south,draw,inner sep=3pt,align=left] (e0) at ([yshift=1.0em]w0.north) {\tiny{$e_0:$} \\ \tiny{$w_{i-3}\textbf{C}$}};
\node [anchor=south,draw,inner sep=3pt,align=left] (e1) at ([yshift=1.0em]w1.north) {\tiny{$e_1:$} \\ \tiny{$w_{i-2}\textbf{C}$}};
\node [anchor=south,draw,inner sep=3pt,align=left] (e2) at ([yshift=1.0em]w2.north) {\tiny{$e_2:$} \\ \tiny{$w_{i-1}\textbf{C}$}};
\node [anchor=south,draw,minimum width=9em,inner sep=3pt] (h0) at ([yshift=1.5em]e1.north) {\tiny{$h_0 = \textrm{Tanh}([e_0,e_1,e_2]\textbf{H} + \textbf{d})$}};
\node [anchor=south,draw,minimum width=9em,inner sep=3pt] (h1) at ([yshift=1.5em]h0.north) {\tiny{$y = \textrm{Softmax}(h_0\textbf{U})$}};
\node [anchor=south] (ylabel) at ([yshift=1em]h1.north) {\scriptsize{$\textrm{P}(w_i|w_{i-3} w_{i-2} w_{i-1})$}};
\draw [->] ([yshift=0.1em]w0.north) -- ([yshift=-0.1em]e0.south);
\draw [->] ([yshift=0.1em]w1.north) -- ([yshift=-0.1em]e1.south);
\draw [->] ([yshift=0.1em]w2.north) -- ([yshift=-0.1em]e2.south);
\draw [->] ([yshift=0.1em]e0.north) -- ([xshift=-2em,yshift=-0.1em]h0.south);
\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]h0.south);
\draw [->] ([yshift=0.1em]e2.north) -- ([xshift=2em,yshift=-0.1em]h0.south);
\draw [->] ([yshift=0.1em]h0.north) -- ([yshift=-0.1em]h1.south);
\draw [->] ([yshift=0.1em]h1.north) -- ([yshift=-0.1em]ylabel.south);
\end{scope}
\end{tikzpicture}
\end{center}
\end{tcolorbox}
\vspace{-0.5em}
\footnotesize{Note: size is the vocabulary size}
\end{frame}
%%%------------------------------------------------------------------------------------------------------------
\subsection{Word Embedding}
%%%------------------------------------------------------------------------------------------------------------
...
...