new pages

7c79a698 · xiaotong · 0c5005e4 · 7c79a698 · 7c79a698
Commit 7c79a698 authored Oct 25, 2019 by xiaotong
--- a/Section05-Neural-Networks-and-Language-Modeling/section05-test.tex
+++ b/Section05-Neural-Networks-and-Language-Modeling/section05-test.tex
@@ -19,6 +19,7 @@
 \usepackage{pgfplots}
 \usepackage{subfigure}
 \usepackage{tikz-3dplot}
+\usepackage{esvect}
 \usepackage{tcolorbox}
 \tcbuselibrary{skins}
@@ -116,29 +117,20 @@
 \section{神经语言模型}
 %%%------------------------------------------------------------------------------------------------------------
-\subsection{前馈、循环、自注意力神经网络}
+\subsection{词嵌入}
 %%%------------------------------------------------------------------------------------------------------------
-%%% 循环单元的设计、梯度消失、训练等问题
+%%% 用实例理解词的分布式表示
-\begin{frame}{进一步的问题}
+\begin{frame}{分布式表示的可视化}
 \begin{itemize}
-\item \textbf{循环单元设计}：循环单元就是一个函数，入读当前时刻的输入和上一时刻的状态，生成当前时刻的状态
+\item \textbf{一个著名的例子}：国王 $\to$ 王后\\
    \begin{displaymath}
-    \textbf{h}_t = g(\textbf{x}_t, \textbf{h}_{t-1}; \theta)
+    \vv{\textrm{国王}} - \vv{\textrm{男人}} + \vv{\textrm{女人}} = \vv{\textrm{王后}}
    \end{displaymath}
-    很多种方式设计$g(\cdot)$，如著名的LSTM、GRU等
+    这里，$\vv{\textrm{word}}$表示单词的分布式向量表示
-\item<2-> \textbf{梯度消失/爆炸}：随着序列变长，在反向传播时循环神经网络会产生更多的局部梯度相乘计算，这会导致\alert{梯度消失/爆炸问题}
+\item 更多的词的可视化：相似的词聚在一起
-    \begin{displaymath}
-    \underbrace{0.2 \times 0.3 \times ... \times 0.2 \times 0.1}_{\text{100项}} \approx 0
-    \end{displaymath}
-    \vspace{-0.8em}
-    \begin{itemize}
-    \item 可以考虑梯度裁剪，限制梯度的大小
-    \item 也可以引入short-cut connection，如残差网络
-    \end{itemize}
-\item<2-> \textbf{训练}：有了自动微分，这不是个大问题 :)
 \end{itemize}
 \end{frame}
 %%%------------------------------------------------------------------------------------------------------------

--- a/Section05-Neural-Networks-and-Language-Modeling/section05.tex
+++ b/Section05-Neural-Networks-and-Language-Modeling/section05.tex
@@ -19,6 +19,7 @@
 \usepackage{pgfplots}
 \usepackage{subfigure}
 \usepackage{tikz-3dplot}
+\usepackage{esvect}
 \usepackage{tcolorbox}
 \tcbuselibrary{skins}
@@ -4366,11 +4367,95 @@ $\textbf{V}, \textbf{U}, \textbf{W}$: 参数
 \end{itemize}
 \end{frame}
 %%%------------------------------------------------------------------------------------------------------------
 \subsection{词嵌入}
 %%%------------------------------------------------------------------------------------------------------------
+%%% 词的one-hot和distributed表示
+\begin{frame}{单词的表示}
+\begin{itemize}
+\item 如何表示一个单词？
+    \begin{itemize}
+    \item \textbf{One-hot}: 假如有一个词典$V$，里面包含10k个单词，并进行编号。每个单词都可以表示为10k维的one-hot向量，仅在编号那个维度为1，其它为0
+    \item<2-> \textbf{Distributed}: 类似于神经语言模型，每个单词可以被表示为一个实数向量，每一维都对应一种``属性'' - \alert{词嵌入}
+    \end{itemize}
+\end{itemize}
+\begin{center}
+\begin{tikzpicture}
+\begin{scope}
+\node [anchor=north west] (o1) at (0,0) {\footnotesize{$\begin{bmatrix} 0 \\ 1 \\ 0 \\ 0 \\ 0 \\ ... \\ 0 \end{bmatrix}$}};
+\node [anchor=north west] (o2) at ([xshift=1em]o1.north east) {\footnotesize{$\begin{bmatrix} 0 \\ 0 \\ 0 \\ 1 \\ 0 \\ ... \\ 0 \end{bmatrix}$}};
+\node [anchor=north east] (v) at ([xshift=-0em]o1.north west) {\footnotesize{$\begin{matrix} \textrm{\ \ \ \ \ 你}_1 \\ \textrm{\ \ 桌子}_2 \\ \textrm{\ \ \ \ \ 他}_3 \\ \textrm{\ \ 椅子}_4 \\ \textrm{\ \ 我们}_5 \\ ... \\ \textrm{你好}_{10k} \end{matrix}$}};
+\node [anchor=south] (w1) at (o1.north) {\footnotesize{桌子}};
+\node [anchor=south] (w2) at (o2.north) {\footnotesize{椅子}};
+\node [anchor=north] (label) at (o1.south) {\footnotesize{单词的one-hot表示}};
+\visible<3->{
+\node [anchor=south,fill=red!20!white] (cosine) at (w1.north) {\footnotesize{$cosine(\textrm{`桌子'},\textrm{`椅子'})=0$}};
+}
+\end{scope}
+\visible<2->{
+\begin{scope}[xshift=2in]
+\node [anchor=north west] (o1) at (0,0) {\footnotesize{$\begin{bmatrix} .1 \\ -1 \\ 2 \\ ... \\ 0 \end{bmatrix}$}};
+\node [anchor=north west] (o2) at ([xshift=1em]o1.north east) {\footnotesize{$\begin{bmatrix} 1 \\ 2 \\ .2 \\ ... \\ -1 \end{bmatrix}$}};
+\node [anchor=north east] (v) at ([xshift=-0em]o1.north west) {\footnotesize{$\begin{matrix} \textrm{\ \ \ 属性}_1 \\ \textrm{\ \ \ 属性}_2 \\ \textrm{\ \ \ 属性}_3 \\ ... \\ \textrm{属性}_{512} \end{matrix}$}};
+\node [anchor=south] (w1) at (o1.north) {\footnotesize{桌子}};
+\node [anchor=south] (w2) at (o2.north) {\footnotesize{椅子}};
+\node [anchor=north] (label) at ([yshift=-2em]o1.south) {\footnotesize{单词的分布式表示(词嵌入)}};
+\visible<3->{
+\node [anchor=south,fill=red!20!white] (cosine) at (w1.north) {\footnotesize{$cosine(\textrm{`桌子'},\textrm{`椅子'})=0.5$}};
+}
+\end{scope}
+}
+\end{tikzpicture}
+\end{center}
+\end{frame}
+%%%------------------------------------------------------------------------------------------------------------
+%%% 分布式表示的优点
+\begin{frame}{为什么需要分布式表示？}
+\begin{itemize}
+\item \textbf{一个自然的问题}：分布式表示中每一维都是什么意思？
+    \begin{itemize}
+    \item 可以把每一维都理解为一个属性，比如：性别、身高等
+    \item 但是，模型更多的是把一个维度看做是事物的一种``刻画''，是一种统计意义上的``语义''，而非人工归纳的属性
+    \end{itemize}
+\item<2-> 那这种方法有什么好处？
+    \begin{itemize}
+    \item 更容易刻画词语之间的\alert{相似性}
+    \item 连续空间表示模型可以更准确地刻画客观事物，而不是非零即一的判断
+    \end{itemize}
+\item<2-> 预测下一个词任务
+    \begin{itemize}
+    \item 分布式表示很容易知道``桌子''和``椅子''是相似的
+    \item 即使``椅子''没在这个句型中出现过，系统仍然可以通过它和``桌子''的相似性进行预测
+    \end{itemize}
+    \begin{tabular}{l | l}
+    屋里 要 摆放 一个 \_\_\_\_\_ & 预测下个词 \\ \hline
+    屋里 要 摆放 一个 \alert{桌子} & 见过 \\
+    屋里 要 摆放 一个 \blue{椅子} & 没见过，但是仍然是合理预测
+    \end{tabular}
+\end{itemize}
+\end{frame}
+%%%------------------------------------------------------------------------------------------------------------
+%%% 用实例理解词的分布式表示
+\begin{frame}{分布式表示的可视化}
+\begin{itemize}
+\item \textbf{一个著名的例子}：国王 $\to$ 王后\\
+    \begin{displaymath}
+    \vv{\textrm{国王}} - \vv{\textrm{男人}} + \vv{\textrm{女人}} = \vv{\textrm{王后}}
+    \end{displaymath}
+    这里，$\vv{\textrm{word}}$表示单词的分布式向量表示
+\item 更多的词的可视化：相似的词聚在一起
+\end{itemize}
+\end{frame}
+%%%------------------------------------------------------------------------------------------------------------
 \subsection{句子表示模型及预训练}
 \end{CJK}