% !Mode:: "TeX:UTF-8"
% !TEX encoding = UTF-8 Unicode

\def\CTeXPreproc{Created by ctex v0.2.13, don't edit!}
\documentclass[cjk,t,compress,12pt]{beamer}
\usepackage{pstricks}
\usepackage{etex}
\usepackage{eso-pic,graphicx}
\usepackage{fancybox}
\usepackage{amsmath,amssymb}
\usepackage{setspace}
\usepackage{xcolor}
\usepackage{array,multirow}
\usepackage{CJK}
\usepackage{tikz}
\usepackage{tikz-qtree}
\usepackage{hyperref}
\usepackage{changepage}
\usepackage{pgfplots}
\usepackage{subfigure}
\usepackage{tikz-3dplot}
\usepackage{esvect}

\usepackage{tcolorbox}
\tcbuselibrary{skins}

\usetikzlibrary{calc,intersections}
\usetikzlibrary{matrix}
\usetikzlibrary{arrows,decorations.pathreplacing}
\usetikzlibrary{shadows} % LATEX and plain TEX when using Tik Z
\usetikzlibrary{shadows.blur}

\usepgflibrary{arrows} % LATEX and plain TEX and pure pgf
\usetikzlibrary{arrows} % LATEX and plain TEX when using Tik Z
\usetikzlibrary{decorations}
\usetikzlibrary{arrows,shapes}

\usetikzlibrary{positioning,fit,calc}

\usetikzlibrary{mindmap,backgrounds} % mind map

\DeclareMathOperator*{\argmax}{arg\,max}
\DeclareMathOperator*{\argmin}{arg\,min}

\setbeamertemplate{items}[ball]
\usefonttheme[onlymath]{serif}  % fout of math

\definecolor{ugreen}{rgb}{0,0.5,0}
\definecolor{lgreen}{rgb}{0.9,1,0.8}
\definecolor{xtgreen1}{rgb}{0.824,0.898,0.8}
\definecolor{xtgreen}{rgb}{0.914,0.945,0.902}
\definecolor{lightgray}{gray}{0.85}

\setbeamercolor{uppercol}{fg=white,bg=ugreen}
\setbeamercolor{lowercol}{fg=black,bg=xtgreen}

\definecolor{ublue}{rgb}{0.152,0.250,0.545}
\setbeamercolor{uppercolblue}{fg=white,bg=ublue}
\setbeamercolor{lowercolblue}{fg=black,bg=blue!10}


%\usetheme{default}
%\usetheme{Darmstadt}
%\usetheme{Madrid}
%\usetheme{Frankfurt}
%\usetheme{Dresden}
%\usetheme{Boadilla}
%\usecolortheme{dolphin}

\newcounter{mycount1}
\newcounter{mycount2}
\newcounter{mycount3}
\newcounter{mycount4}

\usefonttheme[onlylarge]{structurebold}

\IfFileExists{C:/WINDOWS/win.ini}
{\newcommand{\mycfont}{you}}
{\newcommand{\mycfont}{gbsn}}

\begin{CJK}{UTF8}{\mycfont}
\end{CJK}

\setbeamerfont*{frametitle}{size=\large,series=\bfseries}
\setbeamertemplate{navigation symbols}{\begin{CJK}{UTF8}{\mycfont} 第五章 神经网络和语言模型 \hspace*{2em} 肖桐\&朱靖波 \end{CJK} \hspace*{2em} \today \hspace*{2em} \insertframenumber{}/\inserttotalframenumber}

\setbeamertemplate{itemize items}[circle] % if you want a circle
\setbeamertemplate{itemize subitem}[triangle] % if you wnat a triangle
\setbeamertemplate{itemize subsubitem}[ball] % if you want a ball

\begin{document}

\begin{CJK}{UTF8}{\mycfont}

\title{\Large{神经网络和语言模型}}
\author{\large{\textbf{肖桐\ \ 朱靖波}}}
\institute{
\blue{\url{xiaotong@mail.neu.edu.cn}} \black{} \\
\blue{\url{zhujingbo@mail.neu.edu.cn}} \black{} \\
\vspace{1.0em}
东北大学 自然语言处理实验室 \\
\blue{\underline{\url{http://www.nlplab.com}}} \black{} \\
\vspace{0.2cm}
\hspace{0.1cm} \includegraphics[scale=0.1]{../Figures/logo.pdf}
}
\date{}

\maketitle

\setlength{\leftmargini}{1em}
\setlength{\leftmarginii}{1em}

%%%------------------------------------------------------------------------------------------------------------
\section{为什么要谈神经网络}

%%%------------------------------------------------------------------------------------------------------------
\subsection{历史}

%%%------------------------------------------------------------------------------------------------------------
%%% 为什么要谈神经网络
\begin{frame}{为什么要谈神经网络}
\begin{itemize}
\item 近些年\textbf{深度学习（Deep Learning）}体现了巨大的潜力
    \begin{itemize}
    \item 席卷了包括机器翻译在内的很多NLP任务
    \item 已经成为了NLP中方法的新范式
    \item 衍生出\textbf{神经机器翻译}等新一代方法（下一章内容）
    \end{itemize}
\vspace{0.2em}
\begin{center}
\includegraphics[scale=0.45]{./Figures/deeplearning.jpg}
\end{center}
\vspace{0.5em}

\item<2-> \textbf{人工神经网络（Artificial Neural Network）}是深度学习的实践基础

\end{itemize}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 简单的历史
\begin{frame}{神经网络和深度学习的概念（1940s-1970s）}
\begin{itemize}
\item \textbf{神经网络}最早出现在控制论中（Cybernetics），随后更多地在连接主义（Connectionism）中被提及
    \begin{itemize}
    \item \textbf{最初的想法}：模拟大脑的生物学习机制进行计算机建模
    \item<2-> 比如使用线性加权函数来描述输入$\textbf{x}$和结果$\textbf{y}$之间的联系
    \vspace{-0.5em}
    \begin{displaymath}
    f(\textbf{x},\textbf{w})=x_1 \cdot w_1 + ... + x_n \cdot w_n
    \end{displaymath}\\
    \vspace{-0.5em}
    其中$\textbf{w}$是权重。这类模型也影响了随机梯度下降等现在机器学习方法的发展。
    \item<3-> 这类方法的局限也很明显，无法描述非线性问题，如著名的异或函数（XOR）学习问题
    \end{itemize}

\end{itemize}

\vspace{-0.5em}
\begin{center}
\includegraphics[scale=0.21]{./Figures/concept-history.jpg}\\
\scriptsize{图片引自《Deep Learning》}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 深度学习的发展
\begin{frame}{神经网络和深度学习的发展（1980s-1990s）}
\begin{itemize}
\item 现在，生物学属性已经不是神经网络的唯一灵感来源。深度学习也进入了新的发展阶段。两类思潮影响巨大：
    \begin{itemize}
    \item<2-> \textbf{连接主义（Connectionism）}。在认知学科中，早期的符号主义（Symbolicism）很难解释大脑如何使用神经元进行推理。连接主义的核心思想是：“大量简单的计算单元连接到一起可以实现智能行为”。\\
        这也推动了反向传播等训练多层神经网络方法的应用，并发展了包括长短时记忆模型在内的经典建模方法。
    \item<3-> \textbf{分布式表示（Distributed representation）}：一个复杂系统的任何部分的输入都应该是多个特征共同表示的结果。比如，一个单词并非一个词条，而是由成百上千个特征共同描述出来，而每个特征都描述了这个词的"某个"方面。
    \end{itemize}
\item<4-> \alert{遗憾的是}，上世纪90年代后期，在很多应用中人们对神经网络方法期望过高，但是结果并没有达到预期。特别是，核方法、图模型等机器学习方法取得了很好的效果，神经网络研究进入又一次低谷。

\end{itemize}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 深度学习的第三次浪潮
\begin{frame}{第三次浪潮（2000s-now）}
\begin{itemize}
\item \textbf{深度学习的爆发}源于2006年Hinton等人成功训练了一个深度信念网络（deep belief network）。之后，深度学习的浪潮逐步席卷了机器学习及人工智能应用领域，延续至今。现代深度学习的成功有三方面原因：
    \begin{enumerate}
    \item \textbf{模型和算法}的完善与改进
    \item \textbf{并行计算能力}的提升使大规模实践变为了可能
    \item 以Hinton等人为代表的学者的\textbf{坚持与持续投入}
    \end{enumerate}
\item<2-> \textbf{从应用的角度}，数据量的快速提升和模型容量的增加也为深度学习的成功提供了条件
\end{itemize}

\visible<2->{
\begin{center}
\begin{tikzpicture}
\scriptsize{
\begin{semilogyaxis}[
    width=.95\textwidth,
    height=.38\textwidth,
    yticklabel style={/pgf/number format/precision=1,/pgf/number format/fixed zerofill},
    xticklabel style={/pgf/number format/1000 sep=},
    xlabel style={yshift=0.5em},
    xlabel={\footnotesize{Year}},ylabel={\footnotesize{\# of sents.}},
    ymin=1,ymax=1000000000000,
    xmin=1999,xmax=2020,xtick={2000,2005,2010,2015,2020},
    legend style={yshift=-5em,xshift=0em,legend cell align=left,legend plot pos=right}
]

\addplot[purple,mark=square,mark=star,very thick] coordinates {(2001,10000) (2005,2000000) (2008,8000000) (2009,9000000) (2011,10000000) (2012,12000000) (2014,20000000) (2016,30000000) (2018,40000000) };
\addlegendentry{\tiny{Bi-text used in MT papers}\ \ \ \ \ \ \ \ \ \ }
\only<3->{
\addplot[ublue,mark=otimes*,very thick] coordinates {(2005,10000000) (2008,100000000) (2012,3000000000) (2016,5000000000) (2019,10000000000) };
\addlegendentry{\tiny{Bi-text used in practical systems}}
}

\end{semilogyaxis}
}
\end{tikzpicture}
\end{center}
}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
\subsection{深度学习的优势}

%%%------------------------------------------------------------------------------------------------------------
%%% 端到端学习
\begin{frame}{端到端学习}
\begin{itemize}
\item 深度神经网络给我们提供了一种机制，可以直接从学习输入到输出的关系，称之为\alert{端到端学习}
    \begin{itemize}
    \item<2-> \textbf{基于特征工程的方法}：需要大量人工定义的特征，这个过程往往会带来对问题的隐含假设
    \item<3-> \textbf{基于端到端学习的方法}：没有人工定义的特征，整个过程完全由神经网络建模
    \end{itemize}
\end{itemize}
\vspace{-0.5em}
\begin{center}
\visible<2->{
\includegraphics[scale=0.31]{./Figures/end2end-learning-1.jpg}\\
}
\visible<3->{
\Large{\textbf{VS.}}\\
\vspace{0.3em}
\includegraphics[scale=0.31]{./Figures/end2end-learning-2.jpg}\\
}
\end{center}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 深度学习的表现
\begin{frame}{深度学习的表现 - 以语言建模为例}
\begin{itemize}
\item \textbf{比如}，在语言建模（LM）任务上，基于神经网络和深度学习的方法体现了巨大优势，在PTB数据上PPL值已经得到惊人的下降（PPL越低越好）
	\begin{itemize}
	\item 传统$n$元语法模型面临数据稀疏等问题
	\item<2-> 神经语言模型可以更好地描述序列生成问题
	\end{itemize}
\end{itemize}
\begin{tabular}{l | l | l | r}
模型 & 作者 & 年份 & PPL  \\ \hline
3-gram LM & Brown et al. & 1992 & 178.0 \pause \\ \hline
Feed-forward Neural LM & Bengio et al. & 2003 & 162.2 \\
Recurrent NN-based LM & Mikolov et al. & 2010 & 124.7 \\
Recurrent NN-LDA & Mikolov et al. & 2012 & 92.0 \\
LSTM & Zaremba et al. & 2014 & 78.4 \\
RHN & Zilly et al. & 2016 & 65.4 \\
AWD-LSTM & Merity et al. & 2018 & 58.8 \\
GPT-2 (Transformer) & Radford et al. & 2019 & \alert{35.7}
\end{tabular}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
\section{神经网络基础}

%%%------------------------------------------------------------------------------------------------------------
\subsection{简单的例子}

%%%------------------------------------------------------------------------------------------------------------
%%% 神经元
\begin{frame}{神经网络的基本单元 - 神经元}
\begin{itemize}
\item 生物学上，神经元是神经系统的基本组成单元。很多人想象的神经网络应该是这样的\\
\begin{center}
\includegraphics[scale=0.25]{./Figures/neuron-real.jpg}\\
\end{center}
\item<2-> 但我们这里说的是\textbf{人工神经元}，实际上是这样的 :)
    \begin{itemize}
    \item 输入$\textbf{x}$经过$\textbf{w}$进行线性变化，之后加上偏移$\textbf{b}$，在经过激活函数$f$，最后得到$\textbf{y}$ - 啥东东？？？
    \end{itemize}
{\Large
\begin{displaymath}
\textbf{y} = f(\textbf{x} \cdot \textbf{w} + \textbf{b})
\end{displaymath}
}
\\
\vspace{-0.5em}
\end{itemize}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 感知机
\begin{frame}{最简单的人工神经元模型 - 感知机（Perceptron）}
\begin{itemize}
\item 感知机是人工神经元的一种实例。在上世纪50-60年代被提出后，对神经网络研究产生了深远影响。
    \begin{itemize}
    \item<2-> \textbf{输入}是若干个二值变量，$x_i=0$ or $1$
    \item<3-> 每一个输入变量对应一个\textbf{权重}$w_i$（实数）
    \item<4-> \textbf{输出}也是一个二值结果，$y=0$ or $1$。 判断的依据是，输入和加权和是否大于（或者小于）一个阈值$\sigma$：
    \begin{displaymath}
    y = \left\{ \begin{array}{ll}
    0 & \sum_i {w_i \cdot x_i} < \sigma \\
    1 & \sum_i {w_i \cdot x_i} \ge \sigma
    \end{array} \right.
    \end{displaymath}
    \end{itemize}
\end{itemize}

\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=center,circle,draw,ublue,very thick,minimum size=3.5em,fill=white,drop shadow={shadow xshift=0.1em,shadow yshift=-0.1em}] (neuron) at (0,0) {};
\node [anchor=east] (x1) at ([xshift=-6em]neuron.west) {\Large{$x_1$}};
\node [anchor=center] (x0) at ([yshift=3em]x1.center) {\Large{$x_0$}};
\node [anchor=center] (x2) at ([yshift=-3em]x1.center) {\Large{$x_2$}};
\node [anchor=west] (y) at ([xshift=6em]neuron.east) {\Large{$y$}};

\draw [->,thick] (x0.east) -- (neuron.150) node [pos=0.5,above] {$w_0$};
\draw [->,thick] (x1.east) -- (neuron.180) node [pos=0.5,above] {$w_1$};
\draw [->,thick] (x2.east) -- (neuron.210) node [pos=0.5,above] {$w_2$};
\draw [->,thick] (neuron.east) -- (y.west);

\visible<2>{
\draw [->,thick,red] (x0.east) -- (neuron.150) node [pos=0.5,above] {\black{$w_0$}};
\draw [->,thick,red] (x1.east) -- (neuron.180) node [pos=0.5,above] {\black{$w_1$}};
\draw [->,thick,red] (x2.east) -- (neuron.210) node [pos=0.5,above] {\black{$w_2$}};
}

\visible<3>{
\draw [->,thick] (x0.east) -- (neuron.150) node [pos=0.5,above] {\red{$w_0$}};
\draw [->,thick] (x1.east) -- (neuron.180) node [pos=0.5,above] {\red{$w_1$}};
\draw [->,thick] (x2.east) -- (neuron.210) node [pos=0.5,above] {\red{$w_2$}};
}

\visible<4->{
\node [anchor=center] (neuronmath) at (neuron.center) {\red{\small{$\sum \ge \sigma$}}};
}

\visible<5->{
\node [anchor=south] (prediction) at ([xshift=-2em,yshift=1em]y.north west) {\footnotesize{\red{$x_0 w_0 + x_1 w_1 + x_2 w_2 \ge \sigma$}}};
\draw [->,thick,red] (neuron.east) -- (y.west);
\node [anchor=west] (yvalue) at ([yshift=0.2em]y.east) {\Large{$=1$}};
}

\end{scope}
\end{tikzpicture}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 感知机 - 一个例子
\begin{frame}{一个例子}
\begin{itemize}
\item 一个非常简单的例子。比如，有一个音乐会，你正在纠结是否去参加，有三个因素会影响你的决定
    \begin{itemize}
    \item $x_0$：剧场是否离你足够近？
    \item $x_1$：票价是否低于300元？
    \item $x_2$：女朋友是否喜欢音乐会？
    \end{itemize}
\item<2-> 如何决定？比如，女朋友很希望和你一起，但是剧场很远而且票价500元。如果这些因素对你的决策都是同等重要的，那么会有一个综合得分：
    \begin{displaymath}
    x_0 \cdot w_0 + x_1 \cdot w_1 + x_2 \cdot w_2 = 0 \cdot 1 + 0 \cdot 1 + 1 \cdot 1 = 1
    \end{displaymath}
\item<3-> 如果你不是十分纠结，能够接受不完美的事情，你可能会有$\sigma=1$，于是
    \begin{displaymath}
    \sum_i x_i \cdot w_i \ge \sigma
    \end{displaymath}
    \textbf{那么}，你会去参加音乐会
\end{itemize}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 感知机 - 一个例子：权重
\begin{frame}{一个例子 - 权重}
\begin{itemize}
\item 可以看出，实际上这个决策过程本质上就是一个感知机
\item<2-> 但是，人并不是完美的，往往对有些事情会更在意一些。如果你是\textbf{守财奴}，因此会对票价看的更重一些，这时你会用不均匀的权重计算每个因素的影响，比如：$w_0=0.5$，$w_1=2$，$w_2=0.5$
\item<3-> 女友很希望和你一起，但是剧场很远而且票价500元，会导致你\alert{选择不去}看音乐会（女朋友都不要了，咋整）
    \begin{displaymath}
    \sum_i x_i \cdot w_i = 0 \cdot 0.5 + 0 \cdot 2 + 1 \cdot 0.5 = 0.5 < \sigma = 1
    \end{displaymath}
\end{itemize}

\vspace{-1.8em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=center,circle,draw,ublue,very thick,minimum size=3.5em,fill=white,drop shadow={shadow xshift=0.1em,shadow yshift=-0.1em}] (neuron) at (0,0) {};
\node [anchor=east] (x1) at ([xshift=-6em]neuron.west) {$x_1$:票价够低？};
\node [anchor=center] (x0) at ([yshift=3em]x1.center) {$x_0$:距离够近？};
\node [anchor=center] (x2) at ([yshift=-3em]x1.center) {$x_2$:女友喜欢？};
\node [anchor=west] (y) at ([xshift=2em]neuron.east) {$y$:去？还是不去？};

\visible<1>{
\draw [->,thick] (x0.east) -- (neuron.150) node [pos=0.5,above,yshift=0.2em] {\small{$w_0=1$}};
\draw [->,thick] (x1.east) -- (neuron.180) node [pos=0.5,above,yshift=-0.1em] {\small{$w_1=1$}};
\draw [->,thick] (x2.east) -- (neuron.210) node [pos=0.5,above,yshift=0.1em] {\small{$w_2=1$}};
}
\draw [->,thick] (neuron.east) -- (y.west);

\node [anchor=center] (neuronmath) at (neuron.center) {\small{$\sum \ge \sigma$}};

\visible<2->{
\draw [->,thin,red] (x0.east) -- (neuron.150) node [pos=0.5,above,yshift=0.2em] {\small{$w_0=.5$}};
\draw [->,line width=0.8mm,red] (x1.east) -- (neuron.180) node [pos=0.5,above,yshift=-0.1em] {\textbf{\small{$w_1=2$}}};
\draw [->,thin,red] (x2.east) -- (neuron.210) node [pos=0.5,above,yshift=0.1em] {\small{$w_2=.5$}};
}

\visible<3->{
\node [anchor=south] (ylabel) at (y.north) {\red{\textbf{不去了！}}};
}

\end{scope}
\end{tikzpicture}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 感知机 - 一个例子：输入形式
\begin{frame}{一个例子 - 输入形式}

\begin{itemize}
\item 在遭受了女友一万点伤害之后，你意识到决策不应该只考虑非0即1的因素，应该把``程度''考虑进来：
    \begin{itemize}
    \item $x_0$：10/距离
    \item $x_1$：150/票价
    \item $x_2$：女朋友是否喜欢？(这条不敢改)
    \end{itemize}
\item<2-> 新模型中，$x_0$和$x_1$是连续变量，$x_2$是一个离散变量
\end{itemize}

\visible<2->{
\begin{tikzpicture}

\begin{scope}
\draw [->,thick] (0,0) -- (2.5,0);
\draw [->,thick] (0,0) -- (0, 1.5);
\draw [-,very thick,ublue,domain=0.6:2,samples=100] plot (\x,{ 1/\x - 0.2});
\node [anchor=east] (ylabel) at (0, 3.2em) {\footnotesize{$x_0$}};
\node [anchor=north] (xlabel) at (5em, 0em) {\scriptsize{距离(km)}};
\end{scope}

\begin{scope}[xshift=9em]
\draw [->,thick] (0,0) -- (2.5,0);
\draw [->,thick] (0,0) -- (0, 1.5);
\draw [-,very thick,ublue,domain=0.4:2,samples=100] plot (\x,{ 0.5/\x});
\node [anchor=east] (ylabel) at (0, 3.2em) {\footnotesize{$x_1$}};
\node [anchor=north] (xlabel) at (5em, 0em) {\scriptsize{票价(元)}};
\end{scope}

\begin{scope}[xshift=18em]
\draw [->,thick] (0,0) -- (2.5,0);
\draw [->,thick] (0,0) -- (0, 1.5);
\node [anchor=east] (ylabel) at (0, 3.2em) {\footnotesize{$x_2$}};
\node [anchor=south, fill=ublue, minimum width=1.5em, minimum height=0.1em, inner sep=0] (histogram1) at (1.5em, 0) {};
\node [anchor=south, fill=ublue, minimum width=1.5em, minimum height=3em, inner sep=0] (histogram2) at (4.0em, 0) {};
\node [anchor=north] (hlabel1) at (histogram1.south) {\tiny{女友不去}};
\node [anchor=north] (hlabel2) at (histogram2.south) {\tiny{女友去}};
\end{scope}

\end{tikzpicture}
}

\begin{itemize}
\item<3-> 女朋友很希望和你一起，但是剧场有20km远而且票价500元。于是有\ $x_0 = 10/20 = 0.5$，$x_1=150/500 = 0.3$, $x_2=1$。综合来看$\sum_i x_i \cdot w_i \ge \sigma$，还是{\color{red} 去听音乐会} :)
    \begin{displaymath}
    \sum_i x_i \cdot w_i = 0.5 \cdot 0.5 + 0.3 \cdot 2 + 1 \cdot 0.5 = 1.35 \ge \sigma = 1
    \end{displaymath}
\end{itemize}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 感知机 - 一个例子 - 学习
\begin{frame}{一个例子 - 学习}

\begin{itemize}
\item 一次成功的音乐会之后，你似乎掌握了真理：只要女朋友开心就好，为何不把这个因素的权重调大。最简单的方式是把$w_0$，$w_1$的权重都置0，同时令$w_3 > 0$
\item<3-> 很快又有一场音乐会，距你1000公里，票价（不含路费）3000元，当然你女友是一直是喜欢音乐会的。根据新的决策模型，你义无反顾地\alert{决定去听}这场音乐会
\item<4-> \textbf{之后}，你女朋友又给了你1万点伤害，痛啊！！！
    \begin{itemize}
    \item \alert{结果你发现}：女友既要浪漫，同时也爱财
    \end{itemize}
\end{itemize}

\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=center,circle,draw,ublue,very thick,minimum size=3.5em,fill=white,drop shadow={shadow xshift=0.1em,shadow yshift=-0.1em}] (neuron) at (0,0) {};
\node [anchor=east] (x1) at ([xshift=-6em]neuron.west) {$x_1$:便宜程度\ \ \ \ };
\node [anchor=center] (x0) at ([yshift=3em]x1.center) {$x_0$:远近程度\ \ \ \ };
\node [anchor=center] (x2) at ([yshift=-3em]x1.center) {$x_2$:女友喜欢？};
\node [anchor=west] (y) at ([xshift=2em]neuron.east) {$y$:去？还是不去？};

\draw [->,thick] (neuron.east) -- (y.west);

\node [anchor=center] (neuronmath) at (neuron.center) {\small{$\sum \ge \sigma$}};

\visible<1>{
\draw [->,thin] (x0.east) -- (neuron.150) node [pos=0.5,above,yshift=0.2em] {\small{$w_0=.5$}};
\draw [->,line width=0.8mm] (x1.east) -- (neuron.180) node [pos=0.5,above,yshift=-0.1em] {\textbf{\small{$w_1=2$}}};
\draw [->,thin] (x2.east) -- (neuron.210) node [pos=0.5,above,yshift=0.1em] {\small{$w_2=.5$}};
}

\visible<2->{
\draw [->,dotted] (x0.east) -- (neuron.150) node [pos=0.5,above,yshift=0.2em] {\small{$w_0=0$}};
\draw [->,dotted] (x1.east) -- (neuron.180) node [pos=0.5,above,yshift=-0.1em] {\textbf{\small{$w_1=0$}}};
\draw [->,line width=1mm] (x2.east) -- (neuron.210) node [pos=0.5,above,yshift=0.1em] {\small{$w_2=10$}};
}


\end{scope}
\end{tikzpicture}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 感知机 - 一个例子 - 学习(cont)
\begin{frame}{一个例子 - 权重学习}
\begin{itemize}
\item \textbf{痛定思痛}，你发现每个因素的权重需要准确地设置才能达到最好的决策效果
    \begin{itemize}
    \item 如何确定最好的权重？
    \end{itemize}
\item<2-> \textbf{当然}，你是一个勇于实践的人
    \begin{itemize}
    \item 方法很简单：不断地尝试，根据结构不断地调整权重
    \item<10-> 在进行了很多次实验后，发现了相对好的一组权重
    \end{itemize}
\end{itemize}

\visible<2->{
\begin{center}
\begin{tikzpicture}

\begin{scope}[scale=0.6]
\visible<3->{
\draw [->,thick] (0,0) -- (2.5,0);
\draw [->,thick] (0,0) -- (0, 1.5);
\draw [-,very thick,ublue,domain=0.6:2,samples=100] plot (\x,{ 1/\x - 0.2});
\node [anchor=east] (ylabel) at (0, 3.2em) {\footnotesize{$x_0$}};
\node [anchor=north] (xlabel) at (5em, 0em) {\tiny{距离(km)}};
}

\visible<4->{
\draw [-,thick] (0.25,-1.5) -- (2.25,-1.5);
\node [anchor=east] (wlabel) at (0.25,-1.5) {\footnotesize{$w_0$}};
}

\visible<5>{\node [anchor=north,fill=ugreen,minimum height=0.5em,minimum width=1.5em] (w0) at (1.25,-1.5) {};}
\visible<6>{\node [anchor=north,fill=ugreen,minimum height=0.3em,minimum width=1.5em] (w0) at (1.25,-1.5) {};}
\visible<7>{\node [anchor=north,fill=ugreen,minimum height=1.8em,minimum width=1.5em] (w0) at (1.25,-1.5) {};}
\visible<8>{\node [anchor=north,fill=ugreen,minimum height=0.3em,minimum width=1.5em] (w0) at (1.25,-1.5) {};}
\visible<9>{\node [anchor=north,fill=ugreen,minimum height=0.3em,minimum width=1.5em] (w0) at (1.25,-1.5) {};}
\visible<10->{\node [anchor=north,fill=ugreen,minimum height=0.8em,minimum width=1.5em] (w0) at (1.25,-1.5) {};}

\end{scope}

\begin{scope}[scale=0.6,xshift=12em]
\visible<3->{
\draw [->,thick] (0,0) -- (2.5,0);
\draw [->,thick] (0,0) -- (0, 1.5);
\draw [-,very thick,ublue,domain=0.4:2,samples=100] plot (\x,{ 0.5/\x});
\node [anchor=east] (ylabel) at (0, 3.2em) {\footnotesize{$x_1$}};
\node [anchor=north] (xlabel) at (5em, 0em) {\tiny{票价(元)}};
}

\visible<4->{
\draw [-,thick] (0.25,-1.5) -- (2.25,-1.5);
\node [anchor=east] (wlabel) at (0.25,-1.5) {\footnotesize{$w_1$}};
}

\visible<5>{\node [anchor=north,fill=ugreen,minimum height=0.5em,minimum width=1.5em] (w1) at (1.25,-1.5) {};}
\visible<6>{\node [anchor=north,fill=ugreen,minimum height=1.5em,minimum width=1.5em] (w1) at (1.25,-1.5) {};}
\visible<7>{\node [anchor=north,fill=ugreen,minimum height=0.8em,minimum width=1.5em] (w1) at (1.25,-1.5) {};}
\visible<8>{\node [anchor=north,fill=ugreen,minimum height=0.1em,minimum width=1.5em] (w1) at (1.25,-1.5) {};}
\visible<9>{\node [anchor=north,fill=ugreen,minimum height=1.0em,minimum width=1.5em] (w1) at (1.25,-1.5) {};}
\visible<10->{\node [anchor=north,fill=ugreen,minimum height=0.3em,minimum width=1.5em] (w1) at (1.25,-1.5) {};}

\end{scope}

\begin{scope}[scale=0.6,xshift=24em]
\visible<3->{
\draw [->,thick] (0,0) -- (2.5,0);
\draw [->,thick] (0,0) -- (0, 1.5);
\node [anchor=east] (ylabel) at (0, 3.2em) {\footnotesize{$x_2$}};
\node [anchor=south, fill=ublue, minimum width=0.8em, minimum height=0.1em, inner sep=0] (histogram1) at (1.5em, 0) {};
\node [anchor=south, fill=ublue, minimum width=0.8em, minimum height=2em, inner sep=0] (histogram2) at (4.0em, 0) {};
\node [anchor=north,align=left] (hlabel1) at (histogram1.south) {\tiny{女友no}};
\node [anchor=north,align=left] (hlabel2) at ([xshift=0.5em]histogram2.south) {\tiny{女友yes}};
}

\visible<4->{
\draw [-,thick] (0.25,-1.5) -- (2.25,-1.5);
\node [anchor=east] (wlabel) at (0.25,-1.5) {\footnotesize{$w_2$}};
}

\visible<5>{\node [anchor=north,fill=ugreen,minimum height=0.5em,minimum width=1.5em] (w2) at (1.25,-1.5) {};}
\visible<6>{\node [anchor=north,fill=ugreen,minimum height=1.2em,minimum width=1.5em] (w2) at (1.25,-1.5) {};}
\visible<7>{\node [anchor=north,fill=ugreen,minimum height=0.8em,minimum width=1.5em] (w2) at (1.25,-1.5) {};}
\visible<8>{\node [anchor=north,fill=ugreen,minimum height=1.2em,minimum width=1.5em] (w2) at (1.25,-1.5) {};}
\visible<9>{\node [anchor=north,fill=ugreen,minimum height=1.5em,minimum width=1.5em] (w2) at (1.25,-1.5) {};}
\visible<10->{\node [anchor=north,fill=ugreen,minimum height=1.3em,minimum width=1.5em] (w2) at (1.25,-1.5) {};}

\end{scope}

\end{tikzpicture}
\end{center}

}

\visible<5->{
\begin{center}
\begin{tabular}{c<{\onslide<5->}c<{\onslide<6->}c<{\onslide<7->}c<{\onslide<8->}c<{\onslide<9->}c<{\onslide<10->}c<{\onslide}}
实验 & 1 & 2 & 3 & 4 & ... & 10k \\
结果 & 失败 & 成功 & 失败 & 失败 & ... & 成功
\end{tabular}
\end{center}
}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 感知机 - 一个例子 - 总结
\begin{frame}{一个例子 - 总结}
\begin{itemize}
\item 即便对于一个简单问题，如何设计一种合理方法的准确的进行决策并不简单。在上面这个模型中，还有一些\alert{问题}需要回答
    \begin{itemize}
    \item<2-> 对问题建模，即：定义输入$\{x_i\}$的形式
    \item<3-> 设计有效的决策模型，即：定义$y$
    \item<4-> 决定模型所涉及的参数（如权重$\{w_i\}$）的最优值
    \end{itemize}
\end{itemize}

\vspace{-2em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=center,circle,draw,ublue,very thick,minimum size=3.5em,fill=white,drop shadow={shadow xshift=0.1em,shadow yshift=-0.1em}] (neuron) at (0,0) {};

\visible<2->{
\node [anchor=east] (x1) at ([xshift=-6em]neuron.west) {$x_1$:便宜程度\ \ \ \ };
\node [anchor=center] (x0) at ([yshift=3em]x1.center) {$x_0$:远近程度\ \ \ \ };
\node [anchor=center] (x2) at ([yshift=-3em]x1.center) {$x_2$:女友喜欢？};
}

\visible<3->{
\node [anchor=west] (y) at ([xshift=2em]neuron.east) {$y$:去？还是不去？};
\node [anchor=center] (neuronmath) at (neuron.center) {\small{$\sum \ge \sigma$}};
}

\draw [->,thick] (neuron.east) -- (y.west);

\draw [->,thick] (x0.east) -- (neuron.150);
\draw [->,thick] (x1.east) -- (neuron.180);
\draw [->,thick] (x2.east) -- (neuron.210);

\visible<4->{
\draw [->,thick] (x0.east) -- (neuron.150) node [pos=0.5,above,yshift=0.2em] {$w_0$};
\draw [->,thick] (x1.east) -- (neuron.180) node [pos=0.5,above,yshift=-0.1em] {$w_1$};
\draw [->,thick] (x2.east) -- (neuron.210) node [pos=0.5,above,yshift=0.1em] {$w_2$};
}

\end{scope}
\end{tikzpicture}
\end{center}

\vspace{-0.5em}

\begin{itemize}
\item<5-> \textbf{当然}，后面的内容会涉及上面的问题，而且不止这些 :)
\end{itemize}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
\subsection{人工神经元}

%%%------------------------------------------------------------------------------------------------------------
%%% outline
\begin{frame}{入门人工神经网络(深度学习)的三个基本问题}

\vspace{1em}

\begin{tcolorbox}[enhanced,size=normal,left=2mm,right=1mm,colback=red!5!white,colframe=red!75!black,drop fuzzy shadow]
{\Large
\textbf{1. 人工神经网络的基本单元是什么,}

\vspace{0.4em}
\textbf{\hspace{0.9em} 如何组合出更强大的模型？}
}
\end{tcolorbox}

\vspace{0.5em}

\begin{tcolorbox}[enhanced,size=normal,left=2mm,right=1mm,colback=red!5!white,colframe=red!75!black,drop fuzzy shadow]
{\Large
\textbf{2. 人工神经网络的数学描述是什么,}

\vspace{0.4em}
\textbf{\hspace{0.9em} 如何编程实现这种数学模型？}
}
\end{tcolorbox}

\vspace{0.5em}

\begin{tcolorbox}[enhanced,size=normal,left=2mm,right=1mm,colback=red!5!white,colframe=red!75!black,drop fuzzy shadow]
{\Large
\textbf{3. 如何对模型中的参数进行学习,}

\vspace{0.4em}
\textbf{\hspace{0.9em} 之后使用学习到的模型进行推断？}
}
\end{tcolorbox}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% outline: problem 1
\begin{frame}{首先}

\vspace{6em}

\begin{tcolorbox}[enhanced,size=normal,left=2mm,right=1mm,colback=red!5!white,colframe=red!75!black,drop fuzzy shadow]
{\Large
\textbf{人工神经网络的基本单元是什么,}

\vspace{0.4em}
\textbf{如何组合出更强大的模型？}
}
\end{tcolorbox}

\vspace{2em}
\begin{center}
\begin{tikzpicture}
\node [fill=blue!10] (label) at (0,0) {\Large{$\textbf{y} = ?(\textbf{x})$ }};
\end{tikzpicture}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 线性代数基础
\begin{frame}{预热 - 线性代数知识}
\begin{itemize}
\item \textbf{矩阵}：我们用$a$表示一个标量(一个数)，用粗体$\textbf{a}$表示一个矩阵(或向量)，其中$a_{ij}$表示$\textbf{a}$第$i$行、第$j$列的元素\\
    \begin{displaymath}
    a = 5 \hspace{3em} \textbf{a} = \begin{pmatrix} a_{11} & a_{12} \\ a_{21} & a_{22} \end{pmatrix} = \begin{pmatrix} 1 & 2 \\ 3 & 4 \end{pmatrix}
    \end{displaymath}
\item \textbf{向量}：一种特殊的矩阵，只有一行或者一列，这里默认使用行向量，比如$\textbf{a} = (a_1,a_2,a_3) = (10, 20, 30)$，$\textbf{a}$对应的列向量记为$\textbf{a}^T$
\item<2-> \textbf{代数运算}：矩阵可以按位进行+、-等代数运算，对于$\textbf{a} = \begin{pmatrix} 1 & 2 \\ 3 & 4 \end{pmatrix}$，$\textbf{b} = \begin{pmatrix} 1 & 1 \\ 1 & 1 \end{pmatrix}$，有$\textbf{a} + \textbf{b} = \begin{pmatrix} 2 & 3 \\ 4 & 5 \end{pmatrix}$
\item<3-> \textbf{矩阵的微分}：按位进行，对于矩阵$\textbf{c}$和标量$x$有
    \begin{displaymath}
    \frac{\partial \textbf{c}}{\partial x} = \begin{pmatrix} \frac{\partial c_{11}}{\partial x} & \frac{\partial c_{12}}{\partial x} \\ \frac{\partial c_{21}}{\partial x} & \frac{\partial c_{22}}{\partial x} \end{pmatrix} \hspace{2em} \frac{\partial x}{\partial \textbf{c}} = \begin{pmatrix} \frac{\partial x}{\partial c_{11}} & \frac{\partial x}{\partial c_{12}} \\ \frac{\partial x}{\partial c_{21}} & \frac{\partial x}{\partial c_{22}} \end{pmatrix}
    \end{displaymath}
\end{itemize}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 线性代数基础
\begin{frame}{预热 - 线性代数知识(续)}
\begin{itemize}


\item \textbf{矩阵的乘法}：对于$\textbf{a} \in \mathbb{R}^{n \times k}$和$\textbf{b} \in \mathbb{R}^{k \times m}$，用$\textbf{c} = \textbf{a} \textbf{b} \in \mathbb{R}^{n \times m}$表示\textbf{a}和\textbf{b}的矩阵乘法，其中
    \begin{displaymath}
    c_{pq} = \sum_{i = 1}^k a_{pi} b_{iq}
    \end{displaymath}
    对于方程$\left\{ \begin{array}{l} 5x_{1} + 2x_{2} = y_{1} \\ 3x_{1} + x_{2} = y_{2}\end{array} \right.$，可以表示为$\textbf{a} \textbf{x}^T = \textbf{y}^T$ 其中$\textbf{a}=\begin{pmatrix} 5 & 2 \\ 3 & 1 \end{pmatrix}$，$\textbf{x}^T =\begin{pmatrix} x_1 \\ x_2 \end{pmatrix}$，$\textbf{y}^T =\begin{pmatrix} y_1 \\ y_2 \end{pmatrix}$
\item<2-> \textbf{其它}
    \begin{itemize}
    \item \textbf{单位矩阵}：方阵$\textbf{I}$，$I_{ij} = 1$当且仅当$i=j$，否则$I_{ij} = 0$
    \item \textbf{转置}：$\textbf{a}$的转置记为$\textbf{a}^T$，有$a^T_{ji}=a_{ij}$
    \item \textbf{逆矩阵}：方阵$\textbf{a}$的逆矩阵记为$\textbf{a}^{-1}$，有$\textbf{a} \textbf{a}^{-1} = \textbf{a}^{-1} \textbf{a} = \textbf{I}$
    \item \textbf{向量(矩阵)的范数}：$||\textbf{a}||_p = \big( \sum_i |a_i|^p \big)^{\frac{1}{p}}$
    \end{itemize}

\end{itemize}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 人工神经元的函数形式
\begin{frame}{人工神经元即一个函数}

\begin{itemize}
\item 神经元：
\end{itemize}

\vspace{-1em}
\begin{center}
\begin{tikzpicture}

\node [anchor=center] (y) at (0,0) {\LARGE{$\textbf{y}$}};
\node [anchor=west] (eq) at (y.east) {\LARGE{$=$}};
\node [anchor=west] (func) at (eq.east) {\LARGE{$f$}};
\node [anchor=west] (brace01) at (func.east) {\LARGE{$($}};
\node [anchor=west] (x) at (brace01.east) {\LARGE{$\textbf{x}$}};
\node [anchor=west] (dot) at (x.east) {\LARGE{$\cdot$}};
\node [anchor=west] (w) at (dot.east) {\LARGE{$\textbf{w}$}};
\node [anchor=west] (plus) at (w.east) {\LARGE{$+$}};
\node [anchor=west] (b) at (plus.east) {\LARGE{$\textbf{b}$}};
\node [anchor=west] (brace02) at (b.east) {\LARGE{$)$}};

\visible<2->{
\node [anchor=center,fill=yellow!30] (x2) at (x) {\LARGE{$\textbf{x}$}};
\node [anchor=south] (xlabel) at ([yshift=1.5em]x.north) {输入};
\draw [<-] ([yshift=0.2em]x2.north) -- (xlabel.south);
}

\visible<3->{
\node [anchor=center,fill=green!20] (w2) at (w) {\LARGE{$\textbf{w}$}};
\node [anchor=north] (wlabel) at ([yshift=-1.5em]w.south) {参数(权重)};
\draw [<-] ([yshift=-0.2em]w2.south) -- (wlabel.north);
}

\visible<4->{
\node [anchor=center,fill=purple!20] (b2) at (b) {\LARGE{$\textbf{b}$}};
\node [anchor=south] (blabel) at ([yshift=1.3em]b.north) {偏移};
\draw [<-] ([yshift=0.2em]b2.north) -- (blabel.south);
}

\visible<5->{
\node [anchor=center,fill=blue!20] (func2) at (func) {\LARGE{$f$}};
\node [anchor=north] (funclabel) at ([yshift=-1.1em]func.south) {激活函数};
\draw [<-] ([yshift=-0.2em]func2.south) -- (funclabel.north);
}

\visible<6->{
\node [anchor=center,fill=red!20] (y2) at (y) {\LARGE{$\textbf{y}$}};
\node [anchor=south] (ylabel) at ([yshift=1.3em]y.north) {输出};
\draw [<-] ([yshift=0.2em]y2.north) -- (ylabel.south);
}

\end{tikzpicture}
\end{center}

\vspace{-1em}
\begin{itemize}
\item<7-> 以感知机为例
	\begin{itemize}
	\item 输入：$\textbf{x}=(x_0,...,x_n)$
	\item 权重：$\textbf{w}=(w_0,...,w_n)$
	\item 偏移：$\textbf{b} = (-\sigma)$
	\item 激活函数：$f(z)=1$ 当$z \ge 0$, 其它情况$f(z)=0$
	\item 输出：$\textbf{y}=f(\textbf{x} \cdot \textbf{w} - \sigma)$
	\end{itemize}
\end{itemize}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 层的概念
\begin{frame}{``层"的概念}
\begin{itemize}
\item 对于一个问题（相同输入），可能会有多个输出，这时可以把\alert{多个相同的神经元并列起来}，构成一\alert{``层"}
    \begin{itemize}
    \item 比如，天气预报需要同时预测湿度和温度
    \end{itemize}
\end{itemize}

\vspace{-2em}

\begin{center}
\begin{tikzpicture}
\begin{scope}

\tikzstyle{neuronnode} = [minimum size=1.5em,circle,draw,ublue,very thick,fill=white,drop shadow={shadow xshift=0.1em,shadow yshift=-0.1em}]

\node [anchor=center,neuronnode] (neuron00) at (0,0) {};
\visible<2->{
\node [anchor=center,neuronnode] (neuron01) at ([yshift=-3em]neuron00) {};
}
\visible<3->{
\node [anchor=center,neuronnode] (neuron02) at ([yshift=-3em]neuron01) {};
}

\node [anchor=east] (x0) at ([xshift=-6em]neuron00.west) {$x_0$};
\node [anchor=east] (x1) at ([xshift=-6em]neuron01.west) {$x_1$};
\node [anchor=east] (x2) at ([xshift=-6em]neuron02.west) {$b$};

\node [anchor=west] (y0) at ([xshift=4em]neuron00.east) {$y_0$};

\draw [->] (x0.east) -- (neuron00.180) node [pos=0.1,above] {\tiny{$w_{00}$}};
\draw [->] (x1.east) -- (neuron00.200) node [pos=0.1,above] {\tiny{$w_{10}$}};
\draw [->] (x2.east) -- (neuron00.220) node [pos=0.05,above,yshift=0.3em] {\tiny{$b_{0}$}};
\draw [->] (neuron00.east) -- (y0.west);

\visible<2->{
\node [anchor=west] (y1) at ([xshift=4em]neuron01.east) {$y_1$};
\draw [->] (x0.east) -- (neuron01.160) node [pos=0.4,above] {\tiny{$w_{01}$}};
\draw [->] (x1.east) -- (neuron01.180) node [pos=0.35,above,yshift=-0.2em] {\tiny{$w_{11}$}};
\draw [->] (x2.east) -- (neuron01.200) node [pos=0.3,below,yshift=0.2em] {\tiny{$b_{1}$}};
\draw [->] (neuron01.east) -- (y1.west);
}

\visible<3->{
\node [anchor=west] (y2) at ([xshift=4em]neuron02.east) {$y_2$};
\draw [->] (x0.east) -- (neuron02.140) node [pos=0.1,below,yshift=-0.2em] {\tiny{$w_{02}$}};
\draw [->] (x1.east) -- (neuron02.160) node [pos=0.1,below] {\tiny{$w_{12}$}};
\draw [->] (x2.east) -- (neuron02.180) node [pos=0.3,below] {\tiny{$b_{2}$}};
\draw [->] (neuron02.east) -- (y2.west);
}

\visible<4->{
\node [anchor=east,align=left] (inputlabel) at ([xshift=-0.1em]x1.west) {输入向量:\\\small{$\textbf{x}=(x_0,x_1)$}};
}
\visible<5->{
\node [anchor=west,align=left] (outputlabel) at ([xshift=0.1em]y1.east) {输出向量:\\\small{$\textbf{y}=(y_0,y_1,y_2)$}};
}

\begin{pgfonlayer}{background}
\visible<6->{
\node [rectangle,inner sep=0.4em,fill=red!20] [fit = (neuron00) (neuron01) (neuron02)] (layer) {};
\node [anchor=south] (layerlabel) at ([yshift=0.2em]layer.north) {一层神经元};
}

\visible<4->{
\node [rectangle,inner sep=0.1em,fill=ugreen!20] [fit = (x0) (x1)] (inputshadow) {};
}
\visible<5->{
\node [rectangle,inner sep=0.1em,fill=blue!20] [fit = (y0) (y1) (y2)] (outputshadow) {};
}
\end{pgfonlayer}

\visible<7->{
\node [anchor=north west] (wlabel) at ([yshift=-1em,xshift=-7em]x2.south) {参数(矩阵):$\textbf{w} = \Big( \begin{array}{lll} w_{00} & w_{01} & w_{02} \\ w_{10} & w_{11} & w_{12} \end{array} \Big)$};
}
\visible<8->{
\node [anchor=west] (blabel) at (wlabel.east) {参数(向量):$\textbf{b} = (b_0, b_1, b_2)$};
}

\end{scope}
\end{tikzpicture}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 神经网络的作用
\begin{frame}{神经网络：线性变换 + 激活函数}
\begin{itemize}
\item 对于向量$\textbf{x} \in \mathbb{R}^m$，一层神经网络首先把他经过\textbf{\alert{线性变换}}映射到$\mathbb{R}^m$，之后经过\textbf{{\color{blue}激活函数}}变换成$\textbf{y} \in \mathbb{R}^n$
\end{itemize}

\vspace{1em}

\begin{center}
\begin{tikzpicture}

\node [anchor=center] (y) at (0,0) {\Large{$\textbf{y}$}};
\node [anchor=west] (eq) at (y.east) {\Large{$=$}};
\node [anchor=west] (func) at (eq.east) {\Large{$f$}};
\node [anchor=west] (brace01) at (func.east) {\Large{$($}};
\node [anchor=west] (x) at (brace01.east) {\Large{$\textbf{x}$}};
\node [anchor=west] (dot) at (x.east) {\Large{$\cdot$}};
\node [anchor=west] (w) at (dot.east) {\Large{$\textbf{w}$}};
\node [anchor=west] (plus) at (w.east) {\Large{$+$}};
\node [anchor=west] (b) at (plus.east) {\Large{$\textbf{b}$}};
\node [anchor=west] (brace02) at (b.east) {\Large{$)$}};

\node [anchor=center,fill=blue!20] (func2) at (func) {\LARGE{$f$}};
\node [anchor=north] (funclabel) at ([yshift=-1.1em]func.south) {\blue{激活函数}};
\draw [<-] ([yshift=-0.2em]func2.south) -- (funclabel.north);

\begin{pgfonlayer}{background}
\node [rectangle,inner sep=0.2em,fill=red!20] [fit = (x) (w) (b)] (linear) {};
\node [anchor=north] (linearlabel) at ([yshift=-1.1em]linear.south) {\alert{线性变换}};
\draw [<-] ([yshift=-0.2em]linear.south) -- (linearlabel.north);
\end{pgfonlayer}

\end{tikzpicture}

\begin{figure}[htp!]

\includegraphics[scale=0.24]{./Figures/wf.png}
% \begin{tikzpicture}
%     \node [rectangle,inner sep=0.2em,fill=red!20] [fit = (x) (w) (b)] (linear) {};
%     \node [anchor=north] (linearlabel) at ([yshift=-1.1em]linear.south) {\alert{线性变换}}
\end{figure}
\tikz {\node () at (0,0) {}; \node () at (0,10) {};}
\end{center}
\end{frame}


%%%------------------------------------------------------------------------------------------------------------
%%% 线性变换
\begin{frame}{线性变换}
\begin{itemize}
\item 对于线性空间$V$，任意$\textbf{a}$，$\textbf{b} \in V$和数域中的任意$\alpha$，线性变换$T(\cdot)$需满足
\begin{eqnarray}
T(\textbf{a} + \textbf{b}) & = & T(\textbf{a}) + T(\textbf{b}) \nonumber \\
T(\alpha \textbf{a}) & = & \alpha T(\textbf{a}) \nonumber
\end{eqnarray}
\item<2-> 线性变换的一种几何解释：
\end{itemize}

\vspace{-0.5em}
\visible<2->{
\begin{center}
\begin{tikzpicture}

\node [anchor=west] (x) at (0,0) {\Large{$\textbf{x}$}};
\node [anchor=west] (dot) at (x.east) {\Large{$\cdot$}};
\node [anchor=west] (w) at (dot.east) {\Large{$\textbf{w}$}};
\node [anchor=west] (plus) at (w.east) {\Large{$+$}};
\node [anchor=west] (b) at (plus.east) {\Large{$\textbf{b}$}};

\tikzstyle{neuron} = [rectangle,draw,thick,fill=red!30,red!35,minimum height=2em,minimum width=2em,font=\small]
\node[neuron,anchor=north] (a1) at ([xshift=-6em,yshift=-4em]x.south) {};
\draw[->,thick] ([xshift=-2em,yshift=0em]a1.south) to ([xshift=3em,yshift=0em]a1.south);
\draw[->,thick] ([xshift=0em,yshift=-4em]a1.west) to ([xshift=0em,yshift=2em]a1.west);
\node[below] at ([xshift=0.5em,yshift=-1em]a1.west){0};
\node[below] at ([xshift=2em,yshift=-1em]a1.west){1};
\node[below] at ([xshift=-0.5em,yshift=2em]a1.west){1};
\node [anchor=west] (x) at ([xshift=-0.7em,yshift=1em]a1.south) {\Large{$\textbf{F}$}};

\visible<3->{
\node [anchor=center,fill=green!20] (w2) at (w) {\Large{$\textbf{w}$}};
\node [anchor=north,inner sep=1pt] (wlabel) at ([yshift=-0.7em]w.south) {\small{旋转(rotation)}};
\draw [<-] ([yshift=-0.2em]w2.south) -- (wlabel.north);

\tikzstyle{neuron} = [rectangle,draw,thick,fill=red!30,red!35,minimum height=2em,minimum width=2em,font=\small]
\node[neuron,anchor=north] (a2) at ([xshift=10em,yshift=0em]a1.south) {};
\draw[->,thick] ([xshift=-2em,yshift=0em]a2.north) to ([xshift=3em,yshift=0em]a2.north);
\draw[->,thick] ([xshift=0em,yshift=-2em]a2.west) to ([xshift=0em,yshift=4em]a2.west);
\node[above] at ([xshift=0.5em,yshift=1em]a2.west){0};
\node[above] at ([xshift=2em,yshift=1em]a2.west){1};
\node[below] at ([xshift=-0.5em,yshift=0em]a2.west){-1};
\node [anchor=west] (x) at ([xshift=-3.5cm,yshift=2em]a2.north) {\scriptsize{
    $w=\begin{bmatrix}
    1&0&0\\
    0&-1&0\\
    0&0&1
    \end{bmatrix}$}
    };

\node [anchor=west,rotate = 180] (x) at ([xshift=0.7em,yshift=1em]a2.south) {\Large{$\textbf{F}$}};


\draw[-stealth, line width=2pt,dashed] ([xshift=4em,yshift=0em]a1.south) to ([xshift=-3em,yshift=0em]a2.north);
}

\visible<4->{
\node [anchor=center,fill=purple!20] (b2) at (b) {\Large{$\textbf{b}$}};
\node [anchor=west] (blabel) at ([xshift=1.5em]b2.east) {平移(shift)};
\draw [<-] ([xshift=0.2em]b2.east) -- (blabel.west);

\tikzstyle{neuron} = [rectangle,draw,thick,fill=red!30,red!35,minimum height=2em,minimum width=2em,font=\small]
\node[neuron,anchor=north] (a3) at ([xshift=11em,yshift=2.05em]a2.south) {};
\draw[->,thick] ([xshift=-3em,yshift=0em]a3.north) to ([xshift=2em,yshift=0em]a3.north);
\draw[->,thick] ([xshift=-1em,yshift=-2em]a3.west) to ([xshift=-1em,yshift=4em]a3.west);
\node[above] at ([xshift=-0.5em,yshift=1em]a3.west){0};
\node[above] at ([xshift=1em,yshift=1em]a3.west){1};
\node[left] at ([xshift=-0.75em,yshift=-0.5em]a3.west){-1};
\node [anchor=west,rotate = 180] (x) at ([xshift=0.7em,yshift=1em]a3.south) {\Large{$\textbf{F}$}};


\node [anchor=west] (x) at ([xshift=-4cm,yshift=2em]a3.north) {\scriptsize{
    $b=\begin{bmatrix}
    0.5&0&0\\
    0&0&0\\
    0&0&0
    \end{bmatrix}$}
    };
\draw[-stealth, line width=2pt,dashed] ([xshift=3em,yshift=1em]a2.east) to ([xshift=-3em,yshift=1em]a3.west);
}

\end{tikzpicture}
\end{center}
}


\end{frame}


%%%------------------------------------------------------------------------------------------------------------
%%% 线性变换：更复杂的实例
\begin{frame}[fragile]{线性变换（续）}
\begin{itemize}
\item 线性变换也适用于更加复杂的情况，这也给神经网络提供了拟合不同数据分布的能力
    \begin{itemize}
    \item 比如，我们可以把三维图形投影到二维平面上
    \item 再比如，我们也可以把二维平面上的图形映射到三维平面
    \end{itemize}
\end{itemize}
\begin{tiny}
$$
\begin{smallmatrix}  \underbrace{
    \left\{
        \begin{smallmatrix}
            \left[
            \begin{array}{cccc}
             1& 0 &0 \\
             0& 1 &0 \\
             0& 0 &1
            \end{array}
            \right ]
            \cdots
            \left[
            \begin{array}{cccc}
                1& 0 &0 \\
                0& 1 &0 \\
                0& 0 &1
            \end{array}
            \right]
        \end{smallmatrix}
        \right\}
     }\\5
\end{smallmatrix}
\times
\begin{smallmatrix}
\left[
    \begin{array}{cccc}
    1\\
    1\\
    1
    \end{array}
\right ]
\end{smallmatrix}
=
\begin{smallmatrix}  \underbrace{
    \left\{
        \begin{smallmatrix}
            \left[
            \begin{array}{cccc}
             1 \\
             1 \\
             1
            \end{array}
            \right ]
            \cdots
            \left[
            \begin{array}{cccc}
                1 \\
                1 \\
                1
            \end{array}
            \right]
        \end{smallmatrix}
        \right\}
     }\\5
\end{smallmatrix}
$$
\end{tiny}
%\vspace{1em}

\newcommand{\plane}[1]{
(-1.95, #1, 1.35) --
++(3.6, 0.6, 0.0) --
++(0.3, -1.8, -2.7) --
++(-3.6, -0.6, -0.0) --
cycle}
\newcommand{\nullspacepicture}{
% bottom part of the row space line
\draw (0,0,0) -- (0.3,-1.8,1.233);
% five planes
\draw[fill=gray!20]\plane{-0.2};
\draw[fill=gray!20]\plane{0.2};
\draw[fill=blue!70!gray]\plane{0.6};
\draw[fill=gray!20]\plane{1};
\draw[fill=gray!20]\plane{1.4};
% top part of the row space line
\draw (-.094,.562,-.385) -- (-0.3,1.8,-1.233);
}
\newcommand{\rangepicture}[1]{
% axes
\draw[help lines,->] (-2,0) -- (2,0);
\draw[help lines,->] (0,-2) -- (0,2);
% the line and circles
\draw (1,-2) -- (-1,2);
\draw[fill=#1] (0,0) circle (2.5pt);
\draw[fill=gray!50] (0.2,-0.4) circle (2.5pt);
\draw[fill=gray!50] (0.4,-0.8) circle (2.5pt);
\draw[fill=gray!50] (-0.2,0.4) circle (2.5pt);
\draw[fill=gray!50] (-0.4,0.8) circle (2.5pt);
}

\begin{tikzpicture}[scale=0.95]
\centering
\nullspacepicture
% the label
\node at (-2,1.8) {$\mathbb{R}^3$};
% arrow between diagrams
\path[->] (3,0) edge[bend left] node[above] {线性变换} (4.5,0);
\begin{scope}[xshift=7cm]
\rangepicture{blue!70!gray}
\node at (1.8,1.8) {$\mathbb{R}^2$};
\end{scope}
\end{tikzpicture}
\end{frame}


%%%------------------------------------------------------------------------------------------------------------
%%% 激活函数
\begin{frame}{激活函数}
\begin{itemize}
\item 激活函数更多地是为了解决实际问题中的\alert{非线性}变换
    \begin{itemize}
    \item 非线性部分提供了拟合任意函数的能力（稍后介绍）
    \end{itemize}
\end{itemize}

\vspace{-1em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
\draw [line width=3pt,ublue,-](0,0) -- (-2.0,1);
\node [anchor=north] (linelabel) at (-1.0,-0.5) {\footnotesize{我是一根筷子}};
\end{scope}

\begin{scope}[xshift=10em]
\draw [line width=3pt,ublue,-,line cap=round](0,0) .. controls (-0.5,-0.25) and (-0.5,1).. (-1.3,0.3) .. controls (-2.3,-0.3) and (-1.1,1.8).. (-2.0,1);
\node [] at (-2,1) {\white{$\cdot$}};
\node [anchor=north] (linelabel) at (-1.0,-0.5) {\footnotesize{我是一只蚯蚓}};
\end{scope}
\end{tikzpicture}
\end{center}

\begin{itemize}
\item<2-> 简单的非线性函数
\end{itemize}

\vspace{-1em}

\visible<2->{
\begin{center}
\begin{tikzpicture}
\begin{scope}[]
\draw [->,thick] (-1.5,0) -- (1.5,0);
\draw [->,thick] (0,-0.1) -- (0,1.5);
\draw [-,very thick,ublue,domain=-1.2:1.2,samples=100] plot (\x,{0.5 * (\x -0.3)^2 + 0.2});
\node [anchor=west] (ylabel) at (0,1.3) {$y$};
\node [anchor=north] (xlabel) at (1.3,0) {$x$};
\node [anchor=north] (func) at (0,-0.8) {\footnotesize{$y = \frac{1}{2}  (x - 0.3)^2 + 0.2$}};
\node [anchor=south west] (flabel) at (func.north west) {\footnotesize{Quadratic:}};
\end{scope}

\begin{scope}[xshift=9.5em]
\draw [->,thick] (-1.5,0) -- (1.5,0);
\draw [->,thick] (0,-0.1) -- (0,1.5);
\draw [-,very thick,ublue,domain=-1.2:1.2,samples=100] plot (\x, {0.5 * exp(\x)});
\node [anchor=west] (ylabel) at (0,1.3) {$y$};
\node [anchor=north] (xlabel) at (1.3,0) {$x$};
\node [anchor=north] (func) at (0,-0.8) {\footnotesize{$y = 0.5 \cdot  \exp(x)$}};
\node [anchor=south west] (flabel) at ([xshift=-1.8em]func.north west) {\footnotesize{Exponential:}};
\end{scope}

\begin{scope}[xshift=19em]
\draw [->,thick] (-1.5,0) -- (1.5,0);
\draw [->,thick] (0,-0.1) -- (0,1.5);
\draw [-,very thick,ublue,domain=-1.1:1.2,samples=100] plot (\x,{abs(\x -0.2) + 0.1});
\node [anchor=west] (ylabel) at (0,1.3) {$y$};
\node [anchor=north] (xlabel) at (1.3,0) {$x$};
\node [anchor=north] (func) at (0,-0.8) {\footnotesize{$y = |x - 0.3| + 0.1$}};
\node [anchor=south west] (flabel) at ([xshift=-0.4em]func.north west) {\footnotesize{Absolute:}};
\end{scope}
\end{tikzpicture}
\end{center}
}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 常用的激活函数
\begin{frame}{常用的激活函数}
    \begin{itemize}
    \item 好多好多，列举不全 ...
    \end{itemize}
    \vspace{-1em}
    \begin{figure}
    \subfigure[softplus]{
    \centering
    \begin{minipage}{.2\textwidth}
        \begin{tikzpicture}
        \draw[->](-1.2,0)--(1.2,0)node[left,below,font=\tiny]{$x$};
        \draw[->](0,-1.2)--(0,1.2)node[right,font=\tiny]{$y$};
        \foreach \x in {-1.0,-0.5,0.0,0.5,1.0}{\draw(\x,0)--(\x,0.05)node[below,outer sep=2pt,font=\tiny]at(\x,0){\x};}
        \foreach \y in {1.0,0.5}{\draw(0,\y)--(0.05,\y)node[left,outer sep=2pt,font=\tiny]at(0,\y){\y};}
        \draw[color=red ,domain=-1.2:1]plot(\x,{ln(1+(exp(\x))});
        \node[black,anchor=south] at (0,1.2) {\small $y = ln(1+e^x)$};
        \end{tikzpicture}
    \end{minipage}%
    }
    \hfill
    \subfigure[sigmoid]{
    \centering
    \begin{minipage}{.2\textwidth}
        \begin{tikzpicture}
        \draw[->](-1.2,0)--(1.2,0)node[left,below,font=\tiny]{$x$};
        \draw[->](0,-1.2)--(0,1.2)node[right,font=\tiny]{$y$};
        \draw[dashed](-1.2,1)--(1.2,1);
        \foreach \x in {-1,-0.5,0,0.5,1}{\draw(\x,0)--(\x,0.05)node[below,outer sep=2pt,font=\tiny]at(\x,0){
            \pgfmathparse{(\x)*5}
            \pgfmathresult};}
        \foreach \y in {0.5,1.0}{\draw(0,\y)--(0.05,\y)node[left,outer sep=2pt,font=\tiny]at(0,\y){\y};}
        \draw[color=red,domain=-1.2:1.2]plot(\x,{1/(1+(exp(-5*\x)))});
        \node[black,anchor=south] at (0,1.2) {\small $y = \frac{1}{1+e^{-x}}$};
        \end{tikzpicture}
    \end{minipage}%
    }
    \hfill
    \subfigure[tanh]{
    \centering
    \begin{minipage}{.2\textwidth}
        \begin{tikzpicture}
        \draw[->](-1.2,0)--(1.2,0)node[left,below,font=\tiny]{$x$};
        \draw[->](0,-1.2)--(0,1.2)node[right,font=\tiny]{$y$};
        \draw[dashed](-1.2,1)--(1.2,1);
        \draw[dashed](-1.2,-1)--(1.2,-1);
        \foreach \x in {-1.0,-0.5,0.0,0.5,1.0}{\draw(\x,0)--(\x,0.05)node[below,outer sep=2pt,font=\tiny]at(\x,0){\x};}
        \foreach \y in {0.5,1.0}{\draw(0,\y)--(0.05,\y)node[left,outer sep=2pt,font=\tiny]at(0,\y){\y};}
        \draw[color=red ,domain=-1.2:1.2]plot(\x,{tanh(\x)});
        \node[black,anchor=south] at (0,1.2) {\small $y = \frac{e^{x}-e^{-x}}{e^{x}+e^{-x}}$};
        \end{tikzpicture}
    \end{minipage}
    }
    \end{figure}
    \vspace{-1em}
    \begin{figure}
    \subfigure[relu]{
    \centering
    \begin{minipage}{.2\textwidth}
        \begin{tikzpicture}
        \draw[->](-1.2,0)--(1.2,0)node[left,below,font=\tiny]{$x$};
        \draw[->](0,-1.2)--(0,1.2)node[right,font=\tiny]{$y$};
        \draw[dashed](-1.2,1)--(1.2,1);
        \draw[dashed](-1.2,-1)--(1.2,-1);
        \foreach \x in {-1.0,-0.5,0.0,0.5,1.0}{\draw(\x,0)--(\x,0.05)node[below,outer sep=2pt,font=\tiny]at(\x,0){\x};}
        \foreach \y in {0.5,1.0}{\draw(0,\y)--(0.05,\y)node[left,outer sep=2pt,font=\tiny]at(0,\y){\y};}
        \draw[color=red ,domain=-1.2:1.2]plot(\x,{max(\x,0)});
        \node[black,anchor=south] at (0,1.2) {\small $y =\max (0, x)$};
        \end{tikzpicture}
    \end{minipage}%
    }
    \hfill
    \subfigure[gaussian]{
    \centering
    \begin{minipage}{.2\textwidth}
        \begin{tikzpicture}
        \draw[->](-1.2,0)--(1.2,0)node[left,below,font=\tiny]{$x$};
        \draw[->](0,-1.2)--(0,1.2)node[right,font=\tiny]{$y$};
        \draw[dashed](-1.2,1)--(1.2,1);
        \foreach \x in {-1.0,-0.5,0.0,0.5,1.0}{\draw(\x,0)--(\x,0.05)node[below,outer sep=2pt,font=\tiny]at(\x,0){\x};}
        \foreach \y in {0.5,1.0}{\draw(0,\y)--(0.05,\y)node[left,outer sep=2pt,font=\tiny]at(0,\y){\y};}
        \draw[color=red ,domain=-1.2:1.2]plot(\x,{exp(-1*((\x)^2))});
        \node[black,anchor=south] at (0,1.2) {\small $y =e^{-x^2}$};
        \end{tikzpicture}
    \end{minipage}%
    }
    \hfill
    \subfigure[identity]{
    \centering
    \begin{minipage}{.2\textwidth}
        \begin{tikzpicture}
        \draw[->](-1.2,0)--(1.2,0)node[left,below,font=\tiny]{$x$};
        \draw[->](0,-1.2)--(0,1.2)node[right,font=\tiny]{$y$};
        \foreach \x in {-1.0,-0.5,0.0,0.5,1.0}{\draw(\x,0)--(\x,0.05)node[below,outer sep=2pt,font=\tiny]at(\x,0){\x};}
        \foreach \y in {0.5,1.0}{\draw(0,\y)--(0.05,\y)node[left,outer sep=2pt,font=\tiny]at(0,\y){\y};}
        \draw[color=red ,domain=-1:1]plot(\x,\x);
        \node[black,anchor=south] at (0,1.2) {\small $y =x$};
        \end{tikzpicture}
    \end{minipage}
    }

    \end{figure}

\end{frame}


%%%------------------------------------------------------------------------------------------------------------
\subsection{多层神经网络}

%%%------------------------------------------------------------------------------------------------------------
%%% 一层 -> 多层
\begin{frame}{更多的层}
\begin{itemize}
\item \textbf{单层神经网络}：线性变换 + 激活函数（非线性）
\item 我们可以重复上面的过程，构建\textbf{多层神经网络}
\end{itemize}

\vspace{-1.0em}
\begin{center}
\begin{tikzpicture}
\begin{scope}[]

\def\neuronsep{1.6}

\tikzstyle{neuronnode} = [minimum size=1.7em,circle,draw,ublue,very thick,inner sep=1pt, fill=white,align=center,drop shadow={shadow xshift=0.1em,shadow yshift=-0.1em}]

%%% layer 1
\foreach \n in {1,...,5}{
    \node [neuronnode] (neuron0\n) at (\n * \neuronsep,0) {\tiny{$f_1$}\\[-1ex] \tiny{$\sum$}};
    \draw [-,ublue] (neuron0\n.east) -- (neuron0\n.west);
}

\foreach \n in {1,...,5}{
    \foreach \m in {1,...,5}{
        \draw [<-] (neuron0\m.south) -- ([yshift=-2em]neuron0\n.south);
    }
    \node [anchor=north] (x\n) at ([yshift=-2em]neuron0\n.south) {$x_\n$};
    \visible<1>{
    \draw [<-,thick] ([yshift=1.5em]neuron0\n.north) -- (neuron0\n.north);
    \node [anchor=south] (y\n) at ([yshift=1.5em]neuron0\n.north) {$y_\n$};
    }
}

\node [anchor=west] (w1label) at ([xshift=-0.5em,yshift=0.5em]x5.north east) {$\textbf{w}_1$};

\begin{pgfonlayer}{background}
\node [rectangle,inner sep=0.2em,fill=red!20] [fit = (neuron01) (neuron05)] (layer01) {};
\end{pgfonlayer}

\node [anchor=west] (layer00label) at ([xshift=1.25em]x5.east) {\alert{输入层}};

\visible<2->{
\node [anchor=west] (layer01label) at ([xshift=1em]layer01.east) {第二层};
}
\visible<4->{
\node [anchor=west] (layer01label2) at (layer01label.east) {(\alert{隐层})};
}

%%% layer 2
\visible<2->{
\foreach \n in {2,...,4}{
    \node [neuronnode] (neuron1\n) at (\n * \neuronsep,4em) {\tiny{$f_2$}\\[-1ex] \tiny{$\sum$}};
    \draw [-,ublue] (neuron1\n.east) -- (neuron1\n.west);
}

\foreach \n in {2,...,4}{
    \foreach \m in {1,...,5}{
        \draw [<-] (neuron1\n.south) -- (neuron0\m.north);
    }
    \visible<2>{
    \draw [<-,thick] ([yshift=1.5em]neuron1\n.north) -- (neuron1\n.north);
    \node [anchor=south] (y\n) at ([yshift=1.5em]neuron1\n.north) {$y_\n$};
    }
}

\node [anchor=west] (w2label) at ([xshift=-2.5em,yshift=5.0em]x5.north east) {$\textbf{w}_2$};

\begin{pgfonlayer}{background}
\visible<2->{
\node [rectangle,inner sep=0.2em,fill=ugreen!20] [fit = (neuron12) (neuron14)] (layer02) {};
}
\end{pgfonlayer}

\node [anchor=west] (layer02label) at ([xshift=4.9em]layer02.east) {第三层};
\visible<4->{
\node [anchor=west] (layer02label2) at (layer02label.east) {(\alert{隐层})};
}
}

%%% layer 3
\visible<3->{
\foreach \n in {1,...,5}{
    \node [neuronnode] (neuron2\n) at (\n * \neuronsep,8em) {\tiny{$f_3$}\\[-1ex] \tiny{$\sum$}};
    \draw [-,ublue] (neuron2\n.east) -- (neuron2\n.west);
}

\foreach \n in {1,...,5}{
    \foreach \m in {2,...,4}{
        \draw [<-] (neuron2\n.south) -- (neuron1\m.north);
    }

    \node [anchor=south] (y\n) at ([yshift=1.5em]neuron2\n.north) {$y_\n$};
    \draw [<-,thick] ([yshift=1.5em]neuron2\n.north) -- (neuron2\n.north);
}

\node [anchor=west] (w3label) at ([xshift=-2.5em,yshift=8.5em]x5.north east) {$\textbf{w}_3$};

\begin{pgfonlayer}{background}
\visible<3->{
\node [rectangle,inner sep=0.2em,fill=blue!20] [fit = (neuron21) (neuron25)] (layer03) {};
}
\end{pgfonlayer}

\node [anchor=west] (layer03label) at ([xshift=1em]layer03.east) {第四层};
\visible<4->{
\node [anchor=west] (layer03label2) at (layer03label.east) {(\alert{输出层})};
}
}

\end{scope}
\end{tikzpicture}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 两层神经网络可以逼近任何函数
\begin{frame}{多层神经网络可以逼近任意函数}
\begin{itemize}
\item 以一个简单的三层网络为例（隐层激活函数：sigmoid）
\end{itemize}

\begin{center}
\begin{tikzpicture}

%% a two-layer neural network
\begin{scope}
\tikzstyle{neuronnode} = [minimum size=1.7em,circle,draw,ublue,very thick,inner sep=1pt, fill=white,align=center,drop shadow={shadow xshift=0.1em,shadow yshift=-0.1em}]

%% input and hidden layers
\node [neuronnode] (n10) at (0,0) {\tiny{$f$}\\[-1ex] \tiny{$\sum$}};
\node [neuronnode] (n11) at (1.5,0) {\tiny{$f$}\\[-1ex] \tiny{$\sum$}};
\draw [-,ublue] (n10.west) -- (n10.east);
\draw [-,ublue] (n11.west) -- (n11.east);
\node [anchor=north] (x1) at ([yshift=-6em]n11.south) {$x_1$};
\node [anchor=north] (b) at ([yshift=-6em]n10.south) {$b$};
\visible<1-10>{
\draw [->,thick] (b.north) -- ([yshift=-0.1em]n10.south);
\draw [->,thick] (x1.north) -- ([yshift=-0.1em]n10.290);
}
\visible<1>{
\draw [->,thick] (b.north) -- ([yshift=-0.1em]n11.250);
\draw [->,thick] (x1.north) -- ([yshift=-0.1em]n11.south);
}

\visible<11->{
\draw [->,thick,red] (b.north) -- ([yshift=-0.1em]n10.south);
\draw [->,thick,ugreen] (x1.north) -- ([yshift=-0.1em]n10.290);
}

\visible<2->{
\draw [->,thick,blue] (b.north) -- ([yshift=-0.1em]n11.250);
\draw [->,thick,purple] (x1.north) -- ([yshift=-0.1em]n11.south);
}

\visible<15->{
\node [neuronnode] (n12) at (2.7,0) {\tiny{$f$}\\[-1ex] \tiny{$\sum$}};
\node [neuronnode] (n13) at (3.8,0) {\tiny{$f$}\\[-1ex] \tiny{$\sum$}};
\draw [-,ublue] (n12.west) -- (n12.east);
\draw [-,ublue] (n13.west) -- (n13.east);
\draw [->,thick] (b.north) -- ([yshift=-0.1em]n12.250);
\draw [->,thick] (x1.north) -- ([yshift=-0.1em]n12.270);
\draw [->,thick] (b.north) -- ([yshift=-0.1em]n13.230);
\draw [->,thick] (x1.north) -- ([yshift=-0.1em]n13.250);
}

\visible<16->{
\node [anchor=west] (morenodes) at (n13.east) {...};
}

%% output layers
\node [neuronnode] (n20) at (0.75,5em) {\scriptsize{$\sum$}};
\visible<1-10>{\draw [->,thick] ([yshift=0.1em]n10.north) -- ([yshift=-0.1em]n20.250);}
\visible<1-8>{\draw [->,thick] ([yshift=0.1em]n11.north) -- ([yshift=-0.1em]n20.290);}

\visible<11->{\draw [->,thick,brown] ([yshift=0.1em]n10.north) -- ([yshift=-0.1em]n20.250);}
\visible<9->{\draw [->,thick,orange] ([yshift=0.1em]n11.north) -- ([yshift=-0.1em]n20.290);}

\node [] (y) at ([yshift=3em]n20.north) {$y$};
\draw [->,thick] ([yshift=0.1em]n20.north) -- (y.south);

\visible<15->{
\draw [->,thick] ([yshift=0.1em]n12.north) -- ([yshift=-0.1em]n20.310);
\draw [->,thick] ([yshift=0.1em]n13.north) -- ([yshift=-0.1em]n20.330);
}

%% weight and bias
\visible<11->{\node [anchor=center,rotate=90,fill=white,inner sep=1pt] (b0) at ([yshift=3em,xshift=-0.5em]b.north) {\tiny{$b=-6$}};}
\visible<11->{\node [anchor=center,rotate=-59,fill=white,inner sep=1pt] (w2) at ([yshift=1.2em,xshift=-1.2em]x1.north) {\tiny{$w=100$}};}

\visible<2-6>{\node [anchor=center,rotate=59,fill=white,inner sep=1pt] (b1) at ([yshift=4.9em,xshift=2.2em]b.north) {\tiny{$b=0$}};}
\visible<7>{\node [anchor=center,rotate=59,fill=white,inner sep=1pt] (b1) at ([yshift=4.9em,xshift=2.2em]b.north) {\tiny{$b=-2$}};}
\visible<8->{\node [anchor=center,rotate=59,fill=white,inner sep=1pt] (b1) at ([yshift=4.9em,xshift=2.2em]b.north) {\tiny{$b=-4$}};}
\visible<2-4>{\node [anchor=center,rotate=90,fill=white,inner sep=1pt] (w1) at ([yshift=3em,xshift=0.5em]x1.north) {\tiny{$w=1$}};}
\visible<5>{\node [anchor=center,rotate=90,fill=white,inner sep=1pt] (w1) at ([yshift=3em,xshift=0.5em]x1.north) {\tiny{$w=10$}};}
\visible<6->{\node [anchor=center,rotate=90,fill=white,inner sep=1pt] (w1) at ([yshift=3em,xshift=0.5em]x1.north) {\tiny{$w=100$}};}

\visible<11>{\node [anchor=center,rotate=62,fill=white,inner sep=1pt] (w21) at ([yshift=2em,xshift=0.5em]n10.north) {\tiny{$w'=0.7$}};}
\visible<12->{\node [anchor=center,rotate=62,fill=white,inner sep=1pt] (w21) at ([yshift=2em,xshift=0.5em]n10.north) {\tiny{$w'=-0.7$}};}

\visible<2-8>{\node [anchor=center,rotate=-62,fill=white,inner sep=1pt] (w22) at ([yshift=2em,xshift=-0.5em]n11.north) {\tiny{$w'=1$}};}
\visible<9>{\node [anchor=center,rotate=-62,fill=white,inner sep=1pt] (w22) at ([yshift=2em,xshift=-0.5em]n11.north) {\tiny{$w'=0.9$}};}
\visible<10->{\node [anchor=center,rotate=-62,fill=white,inner sep=1pt] (w22) at ([yshift=2em,xshift=-0.5em]n11.north) {\tiny{$w'=0.7$}};}


%% sigmoid box
\begin{scope}
\visible<3->{
\node [anchor=west] (flabel) at ([xshift=1.2in]y.east) {\footnotesize{sigmoid:}};
\node [anchor=north east] (slabel) at ([xshift=0]flabel.south east) {\footnotesize{sum:}};

\node [anchor=west,inner sep=2pt] (flabel2) at (flabel.east) {\footnotesize{$f(s)=1/(1+e^{-s})$}};
\node [anchor=west,inner sep=2pt] (flabel3) at (slabel.east) {\footnotesize{$s=x_1 \cdot w + b$}};
\draw [->,thick,dotted] ([yshift=-0.3em,xshift=-0.1em]n11.60)  .. controls +(east:1) and +(west:2) ..  ([xshift=-0.2em]flabel.west) ;

\begin{pgfonlayer}{background}
\visible<3->{
\node [rectangle,inner sep=0.2em,fill=blue!20,drop shadow={shadow xshift=0.1em,shadow yshift=-0.1em}] [fit = (flabel) (flabel2) (flabel3)] (funcbox) {};
}
\end{pgfonlayer}
}
\end{scope}

%% output illustration
\begin{scope}[xshift=2.8in,yshift=0.1in]
\visible<4->{
\draw [->,thick] (-2.2,0) -- (2.2,0);
\draw [->,thick] (0,0) -- (0,2);
\draw [-] (-0.05,1) -- (0.05,1);
\node [anchor=east,inner sep=1pt] (label1) at (0,1) {\tiny{1}};
\node [anchor=south east,inner sep=1pt] (label2) at (0,0) {\tiny{0}};
}

\visible<4>{\draw [-,very thick,ublue,domain=-2:2,samples=100] plot (\x,{1/(1+exp(-2*\x))});}
\visible<5>{\draw [-,very thick,ublue,domain=-2:2,samples=100] plot (\x,{1/(1+exp(-4*\x))});}
\visible<6>{\draw [-,very thick,ublue,rounded corners=0.1em] (-2,0) -- (0,0) -- (0,1) -- (2,1);}
\visible<7>{\draw [-,very thick,ublue,rounded corners=0.1em] (-2,0) -- (0.25,0) -- (0.25,1) -- (2,1);}
\visible<8>{\draw [-,very thick,ublue,rounded corners=0.1em] (-2,0) -- (0.5,0) -- (0.5,1) -- (2,1);}
\visible<9>{\draw [-,very thick,ublue,rounded corners=0.1em] (-2,0) -- (0.5,0) -- (0.5,0.9) -- (2,0.9);}
\visible<10>{\draw [-,very thick,ublue,rounded corners=0.1em] (-2,0) -- (0.5,0) -- (0.5,0.7) -- (2,0.7);}
\visible<11>{\draw [-,very thick,ublue,rounded corners=0.1em] (-2,0) -- (0.5,0) -- (0.5,0.7) -- (0.7,0.7) -- (0.7,1.4) -- (2,1.4);}
\visible<12->{\draw [-,very thick,ublue,rounded corners=0.1em] (-2,0) -- (0.5,0) -- (0.5,0.7) -- (0.7,0.7) -- (0.7,0) -- (2,0);}
\visible<15->{\draw [-,very thick,ublue,rounded corners=0.1em] (-2,0) -- (0.7,0) -- (0.7,0.6) -- (0.9,0.6) -- (0.9,0) -- (2,0);}

\visible<14>{\draw [->,dashed] (0.6,-0.05) -- (0.6,-0.96in);}
\visible<15->{\draw [->,dashed] (0.8,-0.05) -- (0.8,-0.98in);}

\visible<4>{\node [anchor=north west,align=left] (wblabel) at (-2,2) {\scriptsize{$w_1=1$}\\[-0ex] \scriptsize{\ $b_1=0$}};}
\visible<5>{\node [anchor=north west,align=left] (wblabel) at (-2,2) {\alert{\scriptsize{$w_1=10$}}\\[-0ex] \scriptsize{\ $b_1=0$}};}
\visible<6>{\node [anchor=north west,align=left] (wblabel) at (-2,2) {\alert{\scriptsize{$w_1=100$}}\\[-0ex] \scriptsize{\ $b_1=0$}};}
\visible<7>{\node [anchor=north west,align=left] (wblabel) at (-2,2) {\scriptsize{$w_1=100$}\\[-0ex] \alert{\scriptsize{\ $b_1=-2$}}};}
\visible<8>{\node [anchor=north west,align=left] (wblabel) at (-2,2) {\scriptsize{$w_1=100$}\\[-0ex] \alert{\scriptsize{\ $b_1=-4$}}};}
\visible<9>{\node [anchor=north west,align=left] (wblabel) at (-2,2) {\alert{\scriptsize{$w'_1=0.9$}}};}
\visible<10>{\node [anchor=north west,align=left] (wblabel) at (-2,2) {\alert{\scriptsize{$w'_1=0.7$}}};}
\visible<11>{\node [anchor=north west,align=left] (wblabel) at (-2,2) {\alert{\scriptsize{$w_2=100$}}\\[-0ex] \alert{\scriptsize{\ $b_2=-6$}}\\[-0ex] \alert{\scriptsize{\ $w'_2=0.7$}}};}
\visible<12>{\node [anchor=north west,align=left] (wblabel) at (-2,2) {\scriptsize{$w_2=100$}\\[-0ex] \scriptsize{\ $b_2=-6$}\\[-0ex] \alert{\scriptsize{\ $w'_2=-0.7$}}};}
\visible<13->{\node [anchor=north west,align=left] (wblabel) at (-2.5,2) {\scriptsize{这是一个}\\[-1ex] \scriptsize{step function}};}
\end{scope}

\begin{scope}[xshift=2.8in,yshift=-1.2in]

\visible<13->{
\draw [->,thick] (-2.2,0) -- (2.2,0);
\draw [->,thick] (0,0) -- (0,2);
\draw [-,very thick,red,domain=-1.98:2,samples=100] plot (\x,{0.2 * (\x +0.4)^3 + 1.2 - 0.3 *(\x + 0.8)^2});
}

\visible<14->{
\foreach \n in {0.5}{
    \pgfmathsetmacro{\result}{0.2 * (\n + 0.1 + 0.4)^3 + 1.2 - 0.3 *(\n + 0.1 + 0.8)^2};
    \draw [-,ublue,thick] (\n,0) -- (\n, \result) -- (\n + 0.2, \result) -- (\n + 0.2, 0);
}
}

\visible<15->{
\foreach \n in {0.7}{
    \pgfmathsetmacro{\result}{0.2 * (\n + 0.1 + 0.4)^3 + 1.2 - 0.3 *(\n + 0.1 + 0.8)^2};
    \draw [-,ublue,thick] (\n,0) -- (\n, \result) -- (\n + 0.2, \result) -- (\n + 0.2, 0);
}
}

\visible<16->{
\foreach \n in {-1.9,-1.7,...,1.9}{
    \pgfmathsetmacro{\result}{0.2 * (\n + 0.1 + 0.4)^3 + 1.2 - 0.3 *(\n + 0.1 + 0.8)^2};
    \draw [-,ublue,thick] (\n,0) -- (\n, \result) -- (\n + 0.2, \result) -- (\n + 0.2, 0);
}
}

\visible<14>{\node [anchor=north west,align=left] (wblabel) at (-2.5,2.5) {\scriptsize{函数的每一段都可}\\[-1ex] \scriptsize{由step function}\\[-1ex] \scriptsize{近似}};}
\visible<15>{\node [anchor=north west,align=left] (wblabel) at (-2.5,2.5) {\scriptsize{增加因层神经元}\\[-1ex] \scriptsize{可以拟合更多的}\\[-1ex] \scriptsize{部分}};}
\visible<16>{\node [anchor=north west,align=left] (wblabel) at (-2.5,2.5) {\scriptsize{理论上足够多的}\\[-1ex] \scriptsize{隐层神经元可以}\\[-1ex] \scriptsize{拟合\alert{任意函数}}};}

\end{scope}

\end{scope}

\end{tikzpicture}
\end{center}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
\subsection{神经网络的简单实现 - 张量计算}

%%%------------------------------------------------------------------------------------------------------------
%%% outline: problem 2
\begin{frame}{然后}

\vspace{6em}
\begin{tcolorbox}[enhanced,size=normal,left=2mm,right=1mm,colback=red!5!white,colframe=red!75!black,drop fuzzy shadow]
{\Large
\textbf{人工神经网络的数学描述是什么,}

\vspace{0.4em}
\textbf{如何编程实现这种数学模型？}
}
\end{tcolorbox}

\vspace{1em}
\begin{center}
\begin{tikzpicture}
\begin{scope}[yshift=6.5em,xshift=1em]
\setcounter{mycount1}{1}
\draw[step=0.5cm,color=orange,thick] (-1,-1) grid (0.5,0.5);
\foreach \y in {+0.25,-0.25,-0.75}
  \foreach \x in {-0.75,-0.25,0.25}{
    \node [fill=orange!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {\number\value{mycount1}};
    \addtocounter{mycount1}{1};
  }
\end{scope}

\begin{scope}[yshift=6em,xshift=0.5em]
\setcounter{mycount2}{2}
\draw[step=0.5cm,color=blue,thick] (-1,-1) grid (0.5,0.5);
\foreach \y in {+0.25,-0.25,-0.75}
  \foreach \x in {-0.75,-0.25,0.25}{
    \node [fill=blue!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {\number\value{mycount2}};
    \addtocounter{mycount2}{1};
  }
\end{scope}

\begin{scope}[yshift=5.5em,xshift=0em]
\setcounter{mycount3}{3}
\draw[step=0.5cm,color=ugreen,thick] (-1,-1) grid (0.5,0.5);
\foreach \y in {+0.25,-0.25,-0.75}
  \foreach \x in {-0.75,-0.25,0.25}{
    \node [fill=green!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {\number\value{mycount3}};
    \addtocounter{mycount3}{1};
  }
\end{scope}

\begin{scope}[yshift=5em,xshift=-0.5em]
\setcounter{mycount4}{4}
\draw[step=0.5cm,color=red,thick] (-1,-1) grid (0.5,0.5);
\foreach \y in {+0.25,-0.25,-0.75}
  \foreach \x in {-0.75,-0.25,0.25}{
    \node [fill=red!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {\number\value{mycount4}};
    \addtocounter{mycount4}{1};
  }
\end{scope}
\end{tikzpicture}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 张量
\begin{frame}{如何描述神经网络 - 张量计算}
\begin{itemize}
\item 对于神经网络，输入$\textbf{x}$和输出$\textbf{y}$的形式并不仅仅是向量
\end{itemize}

\begin{center}
\begin{tikzpicture}

\node [anchor=center] (y) at (0,0) {\LARGE{$\textbf{y}$}};
\node [anchor=west] (eq) at (y.east) {\LARGE{$=$}};
\node [anchor=west] (func) at (eq.east) {\LARGE{$f$}};
\node [anchor=west] (brace01) at (func.east) {\LARGE{$($}};
\node [anchor=west] (x) at (brace01.east) {\LARGE{$\textbf{x}$}};
\node [anchor=west] (dot) at (x.east) {\LARGE{$\cdot$}};
\node [anchor=west] (w) at (dot.east) {\LARGE{$\textbf{w}$}};
\node [anchor=west] (plus) at (w.east) {\LARGE{$+$}};
\node [anchor=west] (b) at (plus.east) {\LARGE{$\textbf{b}$}};
\node [anchor=west] (brace02) at (b.east) {\LARGE{$)$}};

\visible<2->{
\node [anchor=center,fill=yellow!30] (x2) at (x) {\LARGE{$\textbf{x}$}};
\node [anchor=south] (xlabel) at ([xshift=-3em,yshift=1.5em]x.north) {\alert{向量？矩阵？...}};
\draw [<-] ([yshift=0.2em,xshift=-0.5em]x2.north) -- ([xshift=1em]xlabel.south);

\node [anchor=center,fill=red!20] (y2) at (y) {\LARGE{$\textbf{y}$}};
\draw [<-] ([yshift=0.2em,xshift=0.5em]y2.north) -- ([xshift=-1em]xlabel.south);

\node [anchor=center,fill=green!20] (w2) at (w) {\LARGE{$\textbf{w}$}};
\node [anchor=north] (wlabel) at ([yshift=-1.0em]w.south) {矩阵 e.g.,};
\draw [<-] ([yshift=-0.2em]w2.south) -- (wlabel.north);
\node [anchor=west] (wsample) at ([xshift=-0.5em]wlabel.east) {\footnotesize{$\left(\begin{array}{c c} 1 & 2 \\ 3 & 4 \end{array}\right)$}};

\node [anchor=center,fill=purple!20] (b2) at (b) {\LARGE{$\textbf{b}$}};
\node [anchor=south] (blabel) at ([yshift=1.3em]b.north) {向量 e.g.,};
\draw [<-] ([yshift=0.2em]b2.north) -- (blabel.south);
\node [anchor=west] (bsample) at ([xshift=-0.5em]blabel.east) {\footnotesize{$(1, 3)$}};
}

\end{tikzpicture}
\end{center}

\begin{itemize}
\item<3-> $\textbf{x}$和$\textbf{y}$实际上是一个叫tensor的东西，即\textbf{张量}。比如，
\end{itemize}

\begin{center}
\begin{tikzpicture}
\begin{scope}
\visible<4->{\node [anchor=west] (vector) at (0,0) {$\textbf{x} = (1,  3)$};}
\visible<5->{\node [anchor=west] (matrix) at ([xshift=0.1in]vector.east) {$\textbf{x} = \left(\begin{array}{c c} -1 & 3 \\ 0.2 & 2 \end{array}\right)$};}
\visible<6->{\node [anchor=west] (tensor3d) at ([xshift=0.1in]matrix.east) {啥？$\textbf{x} = \left(\begin{array}{c} \left(\begin{array}{c c} -1 & 3 \\ 0.2 & 2 \end{array}\right) \\ \left(\begin{array}{c c} -1 & 3 \\ 0.2 & 2 \end{array}\right) \end{array}\right)$};}
\end{scope}
\end{tikzpicture}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 张量的简单定义
\begin{frame}{张量是什么}
\begin{itemize}
\item \textbf{深度学习}中，张量被``简单"地定义为\alert{多维数组}
    \begin{itemize}
    \item 张量的阶（rank）表示有多少个独立的方向，每个方向可以由多个维度表示
    \end{itemize}
\end{itemize}

\begin{center}
\begin{tikzpicture}

\begin{scope}
\visible<2->{
\node [anchor=north] (label) at (0,0) {标量};
\node [anchor=center] (label2) at ([yshift=-0.7em]label.south) {scalar};
\node [anchor=center] (rank) at ([yshift=-1.5em]label2.center) {(rank=0)};
\node [anchor=center] (scalar) at ([yshift=5em]label.north) {\Huge{3}};
}
\end{scope}

\begin{scope}[xshift=1in]
\visible<3->{
\node [anchor=north] (label) at (0,0) {向量};
\node [anchor=center] (label2) at ([yshift=-0.7em]label.south) {vector};
\node [anchor=center] (rank) at ([yshift=-1.5em]label2.center) {(rank=1)};
\node [anchor=center] (scalar) at ([yshift=5em]label.north) {$\begin{pmatrix} 2 \\ .3 \\ -8 \\ .2\end{pmatrix}$};
}
\end{scope}

\begin{scope}[xshift=2in]
\visible<4->{
\node [anchor=north] (label) at (0,0) {矩阵};
\node [anchor=center] (label2) at ([yshift=-0.7em]label.south) {matrix};
\node [anchor=center] (rank) at ([yshift=-1.5em]label2.center) {(rank=2)};
\node [anchor=center] (scalar) at ([yshift=5em]label.north) {$\begin{pmatrix} 1 & 1 & 9 \\ 1 & 0 & 0 \\ 1 & -4 & 7 \end{pmatrix}$};
}
\end{scope}

\begin{scope}[xshift=3.2in]
\visible<5->{
\node [anchor=north] (label) at (0,0) {3阶张量};
\node [anchor=center] (label2) at ([yshift=-0.7em]label.south) {tensor};
\node [anchor=center] (rank) at ([yshift=-1.5em]label2.center) {(rank=3)};
}
\begin{scope}[yshift=6.5em,xshift=1em]
\visible<5->{
\setcounter{mycount1}{1}
\draw[step=0.5cm,color=orange,thick] (-1,-1) grid (1,1);
\foreach \y in {+0.75,+0.25,-0.25,-0.75}
  \foreach \x in {-0.75,-0.25,0.25,0.75}{
    \node [fill=orange!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {\number\value{mycount1}};
    \addtocounter{mycount1}{1};
  }
}
\end{scope}

\begin{scope}[yshift=6em,xshift=0.5em]
\visible<5->{
\setcounter{mycount2}{1}
\draw[step=0.5cm,color=blue,thick] (-1,-1) grid (1,1);
\foreach \y in {+0.75,+0.25,-0.25,-0.75}
  \foreach \x in {-0.75,-0.25,0.25,0.75}{
    \node [fill=blue!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {\number\value{mycount2}};
    \addtocounter{mycount2}{1};
  }
}
\end{scope}

\begin{scope}[yshift=5.5em,xshift=0em]
\visible<5->{
\setcounter{mycount3}{1}
\draw[step=0.5cm,color=ugreen,thick] (-1,-1) grid (1,1);
\foreach \y in {+0.75,+0.25,-0.25,-0.75}
  \foreach \x in {-0.75,-0.25,0.25,0.75}{
    \node [fill=green!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {\number\value{mycount3}};
    \addtocounter{mycount3}{1};
  }
}
\end{scope}

\begin{scope}[yshift=5em,xshift=-0.5em]
\visible<5->{
\setcounter{mycount4}{1}
\draw[step=0.5cm,color=red,thick] (-1,-1) grid (1,1);
\foreach \y in {+0.75,+0.25,-0.25,-0.75}
  \foreach \x in {-0.75,-0.25,0.25,0.75}{
    \node [fill=red!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {\number\value{mycount4}};
    \addtocounter{mycount4}{1};
  }
}
\end{scope}

\end{scope}
\end{tikzpicture}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 张量是一个多维线性函数
\begin{frame}{事实上，张量不是简单的多维数组 - 别慌，了解下 :)}
\begin{itemize}
\item \textbf{非常负责任的说}，张量\alert{不是}向量和矩阵的简单扩展，甚至说，多维数组\alert{也不是}张量所必须的表达形式
\item<2-> 严格意义上，张量是：
    \begin{enumerate}
    \item<2-> \textbf{看不懂的定义}：由若干坐标系改变时满足一定坐标转化关系的抽象对象，它是一个不随参照系的坐标变换而变化的几何量（几何定义）
    \item<3-> \textbf{还是看不懂的定义}：若干向量和协向量通过张量乘法定义的量（代数定义）
    \item<4-> \textbf{还可以解释的定义}：\alert{张量是多重线性函数}，是定义在一些向量空间和笛卡儿积上的多重线性映射
        \begin{itemize}
        \item 张量记为$T(v_0,...,v_r)$，其中输入是$r$个向量$\{v_0,...,v_r\}$
        \item 多重线性是指，对于每个输入，函数都是线性的，比如，对于一个$v_i$，我们有
        \vspace{-0.3em}
        \begin{displaymath}
        T(v_0,...,v_i+c \cdot u,...,v_r) = T(v_0,...,v_i,...,v_r) + c \cdot T(v_0,...,u,...,v_r)
        \end{displaymath}
        其中，$c$为任意数。这个性质非常重要，它可以推导出前面的其它定义。
        \end{itemize}
    \end{enumerate}
\end{itemize}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 进一步解释一下张量的定义
\begin{frame}{张量是一个``量''，不是``矩阵''}
\begin{itemize}
\item 再理解一下，
    \begin{itemize}
    \item 如果一个物理量，在物体的某个位置上只是一个单值，它就是标量，比如密度
    \item 如果它在同一个位置、从多个的方向上看，有不同的值，而且这个数恰好用矩阵乘观察方向来算出来，就是张量(rank$>$1)，比如应力张量
    \end{itemize}
\end{itemize}

\vspace{-0.8em}
\begin{center}
\tdplotsetmaincoords{50}{140}
\begin{tikzpicture}[scale=2,tdplot_main_coords]
\visible<3->{
\draw[thick,->] (0,0,0) -- (1,0,0) node[anchor=north east]{$a$};
\draw[thick,->] (0,0,0) -- (0,1,0) node[anchor=north west]{$b$};
\draw[thick,->] (0,0,0) -- (0,0,1) node[anchor=south]{$c$};
}
\pgfmathsetmacro{\ax}{2}
\pgfmathsetmacro{\ay}{2}
\pgfmathsetmacro{\az}{1}
\tdplotsetrotatedcoords{20}{40}{00}
\visible<4->{
\draw[thick,color=red,tdplot_rotated_coords,->] (0,0,0)
        -- (.7,0,0) node[anchor=east]{$a'$};
\draw[thick,color=green!50!black,tdplot_rotated_coords,->] (0,0,0)
        -- (0,.7,0) node[anchor=west]{$b'$};
\draw[thick,color=blue,tdplot_rotated_coords,->] (0,0,0)
        -- (0,0,.7) node[anchor=south]{$c'$};
}
\tdplottransformmainrot{\ax}{\ay}{\az}

\visible<3->{\node [anchor=west,inner sep=2pt] (coord1) at (-0.40in,-0.4in) {\footnotesize{方向$v=(a,b,c)$}};}
\visible<4->{\node [anchor=north west,inner sep=2pt] (coord2) at (coord1.south west) {\footnotesize{方向$u=(\red{a'}\black{,}{\color{ugreen} b'}\black{,}\blue{c'}\black{)}$}};}

\begin{scope}[xshift=0.4in,yshift=0.35in]
\visible<2->{
\node [anchor=west,inner sep = 2pt] (description) at (0,0) {\small{$T(v,u)$是一个三维空间$(x,y,z)$上的}};
\node [anchor=north west,inner sep = 2pt] (description2) at (description.south west) {\small{2阶张量，其中$v$和$u$是两个向量}};
}

\visible<5->{
\node [anchor=north west,inner sep=2pt] (T) at ([yshift=-2em]description2.south west) {\small{$T(v,u)=$}};
\node [anchor=west,inner sep=1pt] (T2) at (T.east) {\footnotesize{$\begin{pmatrix} v_x \\ v_y \\ v_z \end{pmatrix}^T$}};
\node [anchor=west,inner sep=1pt] (T3) at ([xshift=2pt]T2.east) {\footnotesize{$\begin{pmatrix} T_{xx} & T_{xy} & T_{xz} \\ T_{yx} & T_{yy} & T_{yz} \\ T_{zx} & T_{zy} & T_{zz} \end{pmatrix}$}};
\node [anchor=west,inner sep=1pt] (T4) at ([xshift=2pt]T3.east) {\footnotesize{$\begin{pmatrix} u_x \\ u_y \\ u_z \end{pmatrix}$}};
}
\begin{pgfonlayer}{background}
\visible<7->{
\node [rectangle,inner sep=0pt,fill=red!20,minimum height=3.5em,minimum width=7em] [fit = (T3) ] (TBox) {};
}
\visible<6->{
\node [rectangle,inner sep=0pt,fill=green!20,minimum height=3.5em,minimum width=3em] [fit = (T2) ] (VBox) {};
\node [rectangle,inner sep=0pt,fill=blue!20,minimum height=3.5em,minimum width=2.5em] [fit = (T4) ] (UBox) {};
}
\end{pgfonlayer}

\visible<6->{
\draw [<-] (VBox.north) -- ([yshift=0.3em]VBox.north);
\node [anchor=south,align=left] (Vlabel) at ([yshift=0.3em]VBox.north) {\scriptsize{$v$在基向量上的投影}};
\draw [<-] (UBox.north) -- ([yshift=0.3em]UBox.north);
\node [anchor=south,align=left] (Ulabel) at ([yshift=0.3em,xshift=-1em]UBox.north) {\scriptsize{$u$在基向量上的投影}};
}
\visible<7->{
\draw [<-] (TBox.south) -- ([yshift=-0.3em]TBox.south);
\node [anchor=north,align=left] (Vlabel) at ([xshift=-0.5em,yshift=-0.3em]TBox.south) {\scriptsize{张量在$3 \times 3$个方向上的分量，恰巧用``矩阵''表示，}};
\node [anchor=north west,align=left] (Vlabel2) at ([yshift=0.2em]Vlabel.south west) {\scriptsize{记为$[T]$，想象一下坐标系的旋转}};
}
\end{scope}
\end{tikzpicture}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 如何在深度学习中定义一个张量
\begin{frame}{``矩阵''是``张量''的扩展：在神经网络中使用张量}
\begin{itemize}
\item 但是前面的可以忽略 - 在这里，``\alert{张量就是多维数组}''
    \begin{itemize}
    \item 向量、矩阵都可以看做数学上的``张量''的扩展
    \end{itemize}

\item<2-> 张量$T(1:3)$表示一个向量，有三个元素\\
\vspace{0.5em}
\begin{tikzpicture}
\begin{scope}
\node [anchor=north east, inner sep=1pt] (label) at (0,0) {物理存储：};
\draw[step=0.5cm,thick] (0,-0.5) grid (1.5,0);
\setcounter{mycount1}{1}
\foreach \x in {0.25,0.75,1.25}{
    \node [fill=green!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,-0.25) {$\number\value{mycount1}$};
    \addtocounter{mycount1}{1};
}
\end{scope}
\end{tikzpicture}

\item<3-> 张量$T(1:2,1:3)$表示一个$3 \times 2$的矩阵\\
\vspace{0.5em}
\begin{tikzpicture}
\begin{scope}
\node [anchor=north east, inner sep=1pt] (label) at (0,0) {物理存储：};
\draw[step=0.5cm,thick] (0,-0.5) grid (3.0,0);
\setcounter{mycount2}{1}
\foreach \x in {0.25,0.75,1.25}{
    \node [fill=green!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,-0.25) {$\number\value{mycount2}$};
    \addtocounter{mycount2}{1};
}
\foreach \x in {1.75,2.25,2.75}{
    \node [fill=red!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,-0.25) {$\number\value{mycount2}$};
    \addtocounter{mycount2}{1};
}
\end{scope}
\end{tikzpicture}

\item<4-> 张量$T(1:2,1:2,1:3)$表示一个三阶张量，大小是$3 \times 2 \times 2$\\
\vspace{0.5em}
\begin{tikzpicture}
\begin{scope}
\node [anchor=north east, inner sep=1pt] (label) at (0,0) {物理存储：};
\draw[step=0.5cm,thick] (0,-0.5) grid (6.0,0);
\setcounter{mycount3}{1}
\foreach \x in {0.25,0.75,1.25}{
    \node [fill=green!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,-0.25) {$\number\value{mycount3}$};
    \addtocounter{mycount3}{1};
}
\foreach \x in {1.75,2.25,2.75}{
    \node [fill=red!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,-0.25) {$\number\value{mycount3}$};
    \addtocounter{mycount3}{1};
}
\foreach \x in {3.25,3.75,4.25}{
    \node [fill=green!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,-0.25) {$\number\value{mycount3}$};
    \addtocounter{mycount3}{1};
}
\foreach \x in {4.75,5.25,5.75}{
    \node [fill=red!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,-0.25) {$\number\value{mycount3}$};
    \addtocounter{mycount3}{1};
}
\draw[decorate,thick,decoration={brace,mirror,raise=0.2em}] (0,-0.50) -- (2.95,-0.50);
\draw[decorate,thick,decoration={brace,mirror,raise=0.2em}] (3.05,-0.50) -- (6,-0.50);
\node [anchor=north] (subtensor1) at (1.5,-0.6) {\footnotesize{$3 \times 2$ sub-tensor}};
\node [anchor=north] (subtensor1) at (4.5,-0.6) {\footnotesize{$3 \times 2$ sub-tensor}};

\end{scope}
\end{tikzpicture}

\item<5-> 高阶张量：数组！数组！数组！
    \begin{itemize}
    \item 和C++、Python中的多维数组一模一样
    \end{itemize}
\end{itemize}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 张量矩阵乘法
\begin{frame}{张量的矩阵乘法}
\begin{itemize}
\item 对于神经网络$\textbf{y}=f(\textbf{x}\cdot \textbf{w} + \textbf{b})$，$\textbf{x} \cdot \textbf{w}$或$\textbf{x} \times \textbf{w}$是线性变换，其中$\textbf{x}$是输入张量，$\textbf{w}$是一个矩阵
    \begin{itemize}
    \item $\textbf{x} \cdot \textbf{w}$表示的是矩阵乘法（或记为$\times$）
    \item 注意，这里不是张量乘法，因为张量乘法还有其它定义
    \item $\textbf{w}$是$n \times m$的矩阵，$\textbf{x}$的形状是$... \times n$，即$\textbf{x}$的第一维度需要和$\textbf{w}$的行数大小相等\\
    \vspace{0.5em}
    $\textbf{x}(1:4,1:4,\alert{1:4}) \times \textbf{w}(\alert{1:4},1:2) = \textbf{s}(1:4,1:4,1:2)$
    \end{itemize}
\end{itemize}

\begin{center}
\begin{tikzpicture}

\begin{scope}[yshift=6.5em,xshift=1em]
\visible<2->{
\setcounter{mycount1}{1}
\draw[step=0.5cm,color=orange,thick] (-1,-1) grid (1,1);
\foreach \y in {+0.75,+0.25,-0.25,-0.75}
  \foreach \x in {-0.75,-0.25,0.25,0.75}{
    \node [fill=orange!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {$\number\value{mycount1}$};
    \addtocounter{mycount1}{1};
  }
}
\end{scope}

\begin{scope}[yshift=6em,xshift=0.5em]
\visible<2->{
\setcounter{mycount2}{2}
\draw[step=0.5cm,color=blue,thick] (-1,-1) grid (1,1);
\foreach \y in {+0.75,+0.25,-0.25,-0.75}
  \foreach \x in {-0.75,-0.25,0.25,0.75}{
    \node [fill=blue!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {$\number\value{mycount2}$};
    \addtocounter{mycount2}{1};
  }
}
\end{scope}

\begin{scope}[yshift=5.5em,xshift=0em]
\visible<2->{
\setcounter{mycount3}{3}
\draw[step=0.5cm,color=ugreen,thick] (-1,-1) grid (1,1);
\foreach \y in {+0.75,+0.25,-0.25,-0.75}
  \foreach \x in {-0.75,-0.25,0.25,0.75}{
    \node [fill=green!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {$\number\value{mycount3}$};
    \addtocounter{mycount3}{1};
  }
}
\end{scope}

\begin{scope}[yshift=5em,xshift=-0.5em]
\visible<2->{
\setcounter{mycount4}{4}
\draw[step=0.5cm,color=red,thick] (-1,-1) grid (1,1);
\foreach \y in {+0.75,+0.25,-0.25,-0.75}
  \foreach \x in {-0.75,-0.25,0.25,0.75}{
    \node [fill=red!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {$\number\value{mycount4}$};
    \addtocounter{mycount4}{1};
  }
\node [anchor=north] (xlabel) at (0,-1.2) {$\textbf{x}$};
}
\end{scope}

\begin{scope}[yshift=5em,xshift=1.5in]
\visible<2->{
\draw[step=0.5cm,thick] (-0.5,-1) grid (0.5,1.0);
\node [fill=black!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (-0.25,0.75) {\small{$-1$}};
\node [fill=black!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (-0.25,0.25) {$0$};
\node [fill=black!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (-0.25,-0.25) {$1$};
\node [fill=black!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (-0.25,-0.75) {$0$};
\node [fill=black!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (0.25,0.75) {$0$};
\node [fill=black!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (0.25,0.25) {\small{$-1$}};
\node [fill=black!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (0.25,-0.25) {$1$};
\node [fill=black!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (0.25,-0.75) {$0$};
\node [anchor=north] (xlabel) at (0,-1.2) {$\textbf{w}$};
}

\visible<3>{\draw [->,thick,dashed] (-1.5in+2em+1.5em,-0.3) .. controls +(east:2) and +(west:1) .. (-0.55,0.8) node [pos=0.5,left] {\scriptsize{\textbf{矩阵乘}}};}
\visible<4>{\draw [->,thick,dashed] (-1.5in+2em+1.0em,-0.5) .. controls +(east:2) and +(west:1) .. (-0.55,0.8) node [pos=0.5,left] {\scriptsize{\textbf{矩阵乘}}};}
\visible<5>{\draw [->,thick,dashed] (-1.5in+2em+0.5em,-0.7) .. controls +(east:2.5) and +(west:1) .. (-0.55,0.8) node [pos=0.5,left] {\scriptsize{\textbf{矩阵乘}}};}
\visible<6->{\draw [->,thick,dashed] (-1.5in+2em,-0.9) .. controls +(east:3) and +(west:1) .. (-0.55,0.8) node [pos=0.5,left] {\scriptsize{\textbf{矩阵乘}}};}
\end{scope}

\begin{scope}[yshift=6.5em,xshift=1em+3in]
\visible<3->{
\draw[step=0.5cm,color=orange,thick] (-0.5,-1) grid (0.5,1.0);
\foreach \y in {+0.75,+0.25,-0.25,-0.75}{
  \setcounter{mycount1}{2}
  \foreach \x in {-0.25,0.25}{
    \node [fill=orange!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {$\number\value{mycount1}$};
    \addtocounter{mycount1}{-1};
  }
}
}
\end{scope}

\begin{scope}[yshift=6em,xshift=0.5em+3in]
\visible<4->{
\draw[step=0.5cm,color=blue,thick] (-0.5,-1) grid (0.5,1.0);
\foreach \y in {+0.75,+0.25,-0.25,-0.75}{
  \setcounter{mycount1}{2}
  \foreach \x in {-0.25,0.25}{
    \node [fill=blue!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {$\number\value{mycount1}$};
    \addtocounter{mycount1}{-1};
  }
}
}
\end{scope}

\begin{scope}[yshift=5.5em,xshift=0em+3in]
\visible<5->{
\draw[step=0.5cm,color=ugreen,thick] (-0.5,-1) grid (0.5,1.0);
\foreach \y in {+0.75,+0.25,-0.25,-0.75}{
  \setcounter{mycount1}{2}
  \foreach \x in {-0.25,0.25}{
    \node [fill=green!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {$\number\value{mycount1}$};
    \addtocounter{mycount1}{-1};
  }
}
}
\end{scope}

\begin{scope}[yshift=5.0em,xshift=-0.5em+3in]
\visible<6->{
\draw[step=0.5cm,color=red,thick] (-0.5,-1) grid (0.5,1.0);
\foreach \y in {+0.75,+0.25,-0.25,-0.75}{
  \setcounter{mycount1}{2}
  \foreach \x in {-0.25,0.25}{
    \node [fill=red!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {$\number\value{mycount1}$};
    \addtocounter{mycount1}{-1};
  }
}
}

\visible<3->{
\node [anchor=north] (xlabel) at (0,-1.2) {$\textbf{x} \cdot \textbf{w}$};
\node [anchor=center] (elabel) at (-0.7in,0) {\Huge{$\textbf{=}$}};
}
\end{scope}

\end{tikzpicture}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 张量的单元操作
\begin{frame}{张量的单元操作}
\begin{itemize}
\item 神经网络$\textbf{y}=f(\textbf{x}\cdot \textbf{w} + \textbf{b})$也包括一些张量的单元操作（element-wise operation）
	\begin{itemize}
	\item 加法：$\textbf{s}+\textbf{b}$，其中$\textbf{s} = \textbf{x}\cdot \textbf{w}$
	\item 激活函数：$f(\cdot)$
	\end{itemize}
\item<2-> \textbf{单元加}就是对张量中的每个位置都进行加法
	\begin{itemize}
	\item<3-> 扩展：加法的\textbf{广播}，重复利用一个张量进行加法，并不要求两个张量形状相同
	\end{itemize}
\end{itemize}

\vspace{-1.5em}
\begin{center}
\begin{tikzpicture}
\visible<3->{
\begin{scope}
\setcounter{mycount1}{1}
\draw[step=0.5cm,color=orange,thick] (-1,-0.5) grid (1,0.5);
\foreach \y in {+0.25,-0.25}
  \foreach \x in {-0.75,-0.25,0.25,0.75}{
    \node [fill=orange!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {$\number\value{mycount1}$};
    \addtocounter{mycount1}{1};
  }
\node [anchor=south] (varlabel) at (0,0.6) {$\textbf{s}$};
\end{scope}
\begin{scope}[xshift=1.5in]
\setcounter{mycount1}{1}
\draw[step=0.5cm,color=ugreen,thick] (-1,-0) grid (1,0.5);
\foreach \y in {+0.25}
  \foreach \x in {-0.75,-0.25,0.25,0.75}{
    \node [fill=green!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {$1$};
    \addtocounter{mycount1}{1};
  }
\node [anchor=center] (plabel) at (-4.5em,0) {\huge{$\textbf{+}$}};
\node [anchor=south] (varlabel) at (0,0.6) {$\textbf{b}$};
\end{scope}
\begin{scope}[xshift=3in]
\setcounter{mycount1}{2}
\draw[step=0.5cm,color=orange,thick] (-1,-0.5) grid (1,0.5);
\foreach \y in {+0.25,-0.25}
  \foreach \x in {-0.75,-0.25,0.25,0.75}{
    \node [fill=orange!20,inner sep=0pt,minimum height=0.49cm,minimum width=0.49cm] at (\x,\y) {$\number\value{mycount1}$};
    \addtocounter{mycount1}{1};
  }
\node [anchor=center] (plabel) at (-4.5em,0) {\huge{$\textbf{=}$}};
\node [anchor=south] (varlabel) at (0,0.6) {$\textbf{s+b}$};
\end{scope}
}

\end{tikzpicture}
\end{center}

\vspace{-0.3em}

\begin{itemize}
\item<4-> 类似的，我们可以做减法、乘法，也包括激活函数。这也被称作函数的向量化（vectorization）
\end{itemize}

\vspace{-0.5em}
\visible<4->{
\begin{displaymath}
\textrm{Relu} \Big( \begin{pmatrix} 2 \\ -.3 \end{pmatrix} \Big) = \begin{pmatrix} 2 \\ 0 \end{pmatrix}
\end{displaymath}
}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 深度学习工具包
\begin{frame}{如何实现？- 开源张量计算框架}
\begin{itemize}
\item 实现神经网络的开源系统很多，一个简单好用的工具包NumPy \url{https://numpy.org/}
    \begin{itemize}
    \item Python接口，多维数组的定义使用方便
    \item 提供了张量表示和使用的范式
    \end{itemize}
\item<2-> 最近，很火的两个框架：TensorFlow和PyTorch
    \begin{itemize}
    \item Google和Facebook出品，质量有保证
    \item 功能强大，接口丰富
    \item 可以进行大规模部署和应用
    \item 大量可参考的实例
    \end{itemize}

    \includegraphics[scale=0.13]{./Figures/tensorflowpytorch.jpg}
\item<3-> 还有其它还在更新的优秀框架： CNTK、MXNet、PaddlePaddle、Keras、Chainer、 dl4j、NiuTensor等
\end{itemize}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% NiuTrans.Tensor工具包
\begin{frame}{NiuTensor}
\begin{itemize}
\item 这里使用我们自研的NiuTensor工具包进行教学 \url{http://www.niutrans.com/opensource/niutensor/index.html}
    \begin{itemize}
    \item 简单小巧，易于修改
    \item C++语言编写，代码高度优化
    \item 同时支持CPU和GPU设备
    \item 丰富的张量计算接口
    \end{itemize}
\end{itemize}

\includegraphics[scale=0.35]{./Figures/niutensor.jpg}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 使用NiuTensor
\begin{frame}{使用NiuTensor}
\begin{itemize}
\item NiuTensor的使用很简单，下面是一个C++例子
\end{itemize}

\begin{tcolorbox}[enhanced,frame engine=empty,boxrule=0.1mm,size=title,colback=blue!10!white]
\begin{flushleft}
{\scriptsize
\begin{tabbing}
\texttt{\#include "source/tensor/XTensor.h"} \hspace{4em} \= // 引用XTensor定义的头文件 \\

\texttt{using namespace nts;} \> // 引用nts命名空间 \\
\ \\

\texttt{int main(int argc, const char ** argv)\{} \\
\ \ \ \ \texttt{XTensor tensor;} \> // 声明张量tensor \\

\ \ \ \ \texttt{InitTensor2D(\&tensor, 2, 2, X\_FLOAT);} \> // 定义张量为2*2的矩阵 \\

\ \ \ \ \texttt{tensor.SetDataRand();} \> // [0,1]均匀分布初始化张量 \\

\ \ \ \ \texttt{tensor.Dump(stdout);} \> // 输出张量内容 \\

\ \ \ \ \texttt{return 0;}\\
\texttt{\}}

\end{tabbing}
}
\end{flushleft}
\end{tcolorbox}

\begin{itemize}
\item<2-> 运行这个程序会显示张量每个元素的值
\begin{itemize}
\item<2-> 二阶张量(order=2)，形状是$2 \times 2$ (dimsize=2,2)，数据类型是单精度浮点(dtype=X\_FLOAT)，非稀疏(dense=1.00)
\end{itemize}
\end{itemize}

\vspace{-0em}
\visible<2->{
\begin{tcolorbox}[enhanced,frame engine=empty,boxrule=0.1mm,size=title,colback=black!10!white]
\begin{flushleft}
{\scriptsize
\begin{tabbing}
\texttt{order=2 dimsize=2,2 dtype=X\_FLOAT dense=1.000000} \\
\texttt{3.605762e-001 2.992340e-001 1.393780e-001 7.301248e-001}
\end{tabbing}
}
\end{flushleft}
\end{tcolorbox}
}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 定义XTensor
\begin{frame}{定义XTensor}
\begin{itemize}
\item 张量由类XTensor表示，利用InitTensor定义，参数：
    \begin{itemize}
    \item 指向XTensor类型变量的指针
    \item 张量的阶
    \item 各个方向维度的大小（与传统多维数组约定一样）
    \item 张量的数据类型等（有缺省值）
    \end{itemize}
\end{itemize}

\vspace{-0.3em}
\begin{tcolorbox}[enhanced,frame engine=empty,boxrule=0.1mm,size=title,colback=blue!10!white]
\begin{flushleft}
{\scriptsize
\begin{tabbing}
\texttt{XTensor tensor;} \hspace{12em} \= // 声明张量tensor \\
\texttt{int sizes[6] = \{2,3,4,2,3,4\};} \> // 张量的形状为2*3*4*2*3*4 \\
\texttt{InitTensor(\&tensor, 6, sizes, X\_FLOAT);} \> // 定义形状为sizes的6阶张量
\end{tabbing}
}
\end{flushleft}
\end{tcolorbox}

\visible<2->{
\begin{itemize}
\item 更简便的定义方式
\end{itemize}

\vspace{-0.2em}
\begin{tcolorbox}[enhanced,frame engine=empty,boxrule=0.1mm,size=title,colback=blue!10!white]
\begin{flushleft}
{\scriptsize
\begin{tabbing}
\texttt{XTensor a, b, c;} \hspace{11.5em} \= // 声明张量tensor \\
\texttt{InitTensor1D(\&a, 10, X\_INT);} \> // 10维的整数型向量\\
\texttt{InitTensor1D(\&b, 10);} \> // 10维的向量，缺省类型(浮点)\\
\texttt{InitTensor4D(\&c, 10, 20, 30, 40);} \> // 10*20*30*40的4阶张量(浮点)
\end{tabbing}
}
\end{flushleft}
\end{tcolorbox}
}

\visible<3->{
\begin{itemize}
\item 直接在GPU上定义张量
\end{itemize}

\vspace{-0.2em}
\begin{tcolorbox}[enhanced,frame engine=empty,boxrule=0.1mm,size=title,colback=blue!10!white]
\begin{flushleft}
{\scriptsize
\begin{tabbing}
\texttt{XTensor tensorGPU;} \hspace{10.5em} \= // 声明张量tensor \\
\texttt{InitTensor2D(\&tensorGPU, 10, 20,} $\backslash$ \> // 在编号为0的GPU上定义张量 \\
\hspace{6.7em} \texttt{X\_FLOAT, 0);}
\end{tabbing}
}
\end{flushleft}
\end{tcolorbox}
}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% XTensor的代数运算
\begin{frame}{代数运算}
\begin{itemize}
\item 各种单元算子（1阶运算）+、-、*、$\backslash$、Log、Exp、 Power、Absolute等，还有Sigmoid、Softmax等激活函数
\end{itemize}

\vspace{-0.2em}
\begin{tcolorbox}[enhanced,frame engine=empty,boxrule=0.1mm,size=title,colback=blue!10!white]
\begin{flushleft}
{\scriptsize
\begin{tabbing}
\texttt{XTensor a, b, c, d, e;} \hspace{7em} \= // 声明张量tensor \\
\texttt{InitTensor3D(\&a, 2, 3, 4);} \> // a为2*3*4的3阶张量 \\
\texttt{InitTensor3D(\&b, 2, 3, 4);} \> // b为2*3*4的3阶张量 \\
\texttt{InitTensor3D(\&c, 2, 3, 4);} \> // c为2*3*4的3阶张量 \\
\texttt{a.SetDataRand();} \> // 随机初始化a \\
\texttt{b.SetDataRand();} \> // 随机初始化b \\
\texttt{c.SetDataRand();} \> // 随机初始化c \\
\texttt{d = a + b * c;} \> // d被赋值为 a + b * c \\
\texttt{d = ((a + b) * d - b / c ) * d;} \> // d可以被嵌套使用 \\
\texttt{e = Sigmoid(d);} \> // d经过激活函数Sigmoid赋值给e
\end{tabbing}
}
\end{flushleft}
\end{tcolorbox}

\visible<2->{
\begin{itemize}
\item 高阶运算，最常用的是矩阵乘法(MMul)
\end{itemize}

\vspace{-0.2em}
\begin{tcolorbox}[enhanced,frame engine=empty,boxrule=0.1mm,size=title,colback=blue!10!white]
\begin{flushleft}
{\scriptsize
\begin{tabbing}
\texttt{XTensor a, b, c;} \hspace{10.0em} \= // 声明张量tensor \\
\texttt{InitTensor4D(\&a, 2, 2, 3, 4);} \> // a为2*2*3*4的4阶张量 \\
\texttt{InitTensor2D(\&b, 4, 5);} \> // b为4*5的矩阵 \\
\texttt{a.SetDataRand();} \> // 随机初始化a \\
\texttt{b.SetDataRand();} \> // 随机初始化b \\
\texttt{c = MMul(a, b);} \> // 矩阵乘的结果为2*2*3*5的4阶张量
\end{tabbing}
}
\end{flushleft}
\end{tcolorbox}
}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% XTensor的其它函数
\begin{frame}{其它常用函数}
\begin{itemize}
\item 其它函数，列不全，可以参考网站上的详细说明
\end{itemize}

\footnotesize{
\begin{center}
\begin{tabular}{l|l}
函数 & 描述 \\ \hline
\texttt{a.Reshape(o, s)} & 把a变换为阶为o、形状为s的张量\\
\texttt{a.Get(pos)} & 取张量中位置为pos的元素 \\
\texttt{a.Set(v, pos)} & 把张量中位置为pos的元素的值设为v \\
\texttt{a.Dump(file)} & 把张量存到file中，file为文件句柄 \\
\texttt{a.Read(file)} & 从file中读取张量，file为文件句柄 \\ \hline
\texttt{Power(a, p)}  & 计算指数$\textrm{a}^{\textrm{p}}$ \\
\texttt{Linear(a, s, b)}  & 计算 a * s + b，s和b都是一个数 \\
\texttt{CopyValues(a)} & 构建a的一个拷贝 \\
\texttt{ReduceMax(a, d)} & 对a沿着方向d进行规约，得到最大值 \\
\texttt{ReduceSum(a, d)} & 对a沿着方向d进行规约，得到和 \\
\texttt{Concatenate(a, b, d)} & 把两个张量a和b沿d方向级联\\
\texttt{Merge(a, d)} & 对张量a沿d方向合并\\
\texttt{Split(a, d, n)} & 对张量a沿d方向分裂成n份\\ \hline
\texttt{Sigmoid(a)}  & 对a进行Sigmoid变换 \\
\texttt{Softmax(a)}  & 对a进行Softmax变换，沿最后一个方向 \\
\texttt{HardTanH(a)} & 对a进行hard tanh变换(双曲正切的近似)\\
\texttt{Relu(a)}     & 对a进行Relu变换\\
\end{tabular}
\end{center}
}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 利用XTensor构建神经网络
\begin{frame}{构建神经网络}
\begin{itemize}
\item 可以很方便的构建一个单层网络
\end{itemize}

\begin{tcolorbox}
[bicolor,sidebyside,righthand width=4cm,size=title,frame engine=empty,
 colback=blue!10!white,colbacklower=black!5!white]
 {\scriptsize
\begin{tabbing}
\texttt{XTensor x, y, w, b;} \\
\texttt{InitTensor3D(\&x, 3, 4, 5);} \\
\texttt{InitTensor2D(\&w, 5, 3);} \\
\texttt{InitTensor1D(\&b, 3);} \\
\texttt{...} \\
\texttt{y = Sigmoid(MMul(x, w) + b);}
\end{tabbing}
}
\tcblower
\begin{center}
\begin{tikzpicture}
\node [draw,circle,inner sep=2pt,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (x) at (0,0) {\footnotesize{$\textrm{x}$}};
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=4em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (layer) at ([yshift=0.7em]x.north) {\scriptsize{layer}};
\node [anchor=south,draw,circle,inner sep=2pt,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (y) at ([yshift=0.7em]layer.north) {\scriptsize{$\textrm{y}$}};
\draw [thick,->] (x.north) -- (layer.south);
\draw [thick,->] (layer.north) -- (y.south);
\node [anchor=west,align=left] (xshape) at (x.east) {\tiny{shape: 3*4*5}};
\node [anchor=west,align=left] (yshape) at (y.east) {\tiny{shape: 3*4*3}};
\end{tikzpicture}
\end{center}
\end{tcolorbox}

\visible<2->{
\begin{itemize}
\item 一个多层网络
\end{itemize}

\begin{tcolorbox}
[bicolor,sidebyside,righthand width=4cm,size=title,frame engine=empty,
 colback=blue!10!white,colbacklower=black!5!white]
 {\scriptsize
\begin{tabbing}
\texttt{XTensor x, y, h1, h2;} \\
\texttt{XTensor w1, b1, w2, w3;} \\
\texttt{InitTensor3D(\&x, 3, 4, 5);} \\
\texttt{InitTensor2D(\&w1, 5, 3);} \\
\texttt{InitTensor1D(\&b1, 3);} \\
\texttt{InitTensor2D(\&w2, 3, 6);} \\
\texttt{InitTensor2D(\&w3, 6, 4);} \\
\texttt{...} \\
\texttt{h1 = Sigmoid(MMul(x, w1) + b1);} \\
\texttt{h2 = HandTanH(MMul(h1, w2));} \\
\texttt{y = Relu(MMul(h2, w3));}
\end{tabbing}
}
\tcblower
\begin{center}
\begin{tikzpicture}
\node [draw,circle,inner sep=2pt,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (x) at (0,0) {\footnotesize{$\textrm{x}$}};
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=4em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (layer1) at ([yshift=0.7em]x.north) {\scriptsize{layer1}};
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=4em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (layer2) at ([yshift=1.0em]layer.north) {\scriptsize{layer2}};
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=4em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (layer3) at ([yshift=1.0em]layer2.north) {\scriptsize{layer3}};
\node [anchor=south,draw,circle,inner sep=2pt,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (y) at ([yshift=0.7em]layer3.north) {\scriptsize{$\textrm{y}$}};
\draw [thick,->] (x.north) -- (layer1.south);
\draw [thick,->] (layer1.north) -- (layer2.south);
\draw [thick,->] (layer2.north) -- (layer3.south);
\draw [thick,->] (layer3.north) -- (y.south);
\node [anchor=west,align=left] (xshape) at (x.east) {\tiny{shape: 3*4*5}};
\node [anchor=west,align=left] (yshape) at (y.east) {\tiny{shape: 3*4*4}};
\node [anchor=south west,align=left,inner sep=2pt] (l1shape) at (layer1.north) {\tiny{shape: 3*4*3}};
\node [anchor=south west,align=left,inner sep=2pt] (l2shape) at (layer2.north) {\tiny{shape: 3*4*6}};
\end{tikzpicture}
\end{center}
\end{tcolorbox}
}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 利用XTensor构建更复杂的神经网络
\begin{frame}{更复杂一点的例子}
\begin{itemize}
\item 任何网络都可以构建，比如RNN、Transformer 等
\end{itemize}

\begin{tcolorbox}
[bicolor,sidebyside,righthand width=4cm,size=title,frame engine=empty,
 colback=blue!10!white,colbacklower=black!5!white]
 {\scriptsize
\begin{tabbing}
\texttt{XTensor x[3], y[3], r, wh;} \\
\texttt{XTensor h1, h2, w1, b1, h3, h4;} \\
\texttt{XList splits;} \\
\texttt{...} \\
\texttt{for(unsigned i = 0; i < 3; i++)\{} \\
\texttt{\hspace{2em}r = Concatenate(x[i] + r) * wh;}\\
\texttt{\hspace{2em}splits.Add(\&r);}\\
\texttt{\}}\\
\visible<2->{
\texttt{} \\
\texttt{h1 = Merge(splits, 0);}\\
\texttt{h2 = Relu(h1 * w1 + b1);}\\
\texttt{h3 = h1 + h2;} \\
\texttt{h4 = Softmax(h3);} \\
}
\visible<3->{
\texttt{} \\
\texttt{Split(h4, splits, 0);} \\
\texttt{} \\
\texttt{for(unsigned i = 0; i < 3; i++)\{} \\
\texttt{\hspace{2em}y[i] = *(XTensor*)splits.Get(i);}\\
\texttt{\hspace{2em}y[i].Dump(stdout);}\\
\texttt{\}}
}
\end{tabbing}
}
\tcblower
\begin{center}
\begin{tikzpicture}
\node [draw,circle,inner sep=1pt,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (x1) at (0,0) {\footnotesize{$\textrm{x}_1$}};
\node [anchor=west,draw,circle,inner sep=1pt,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (x2) at ([xshift=2em]x1.east) {\footnotesize{$\textrm{x}_2$}};
\node [anchor=west,draw,circle,inner sep=1pt,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (x3) at ([xshift=2em]x2.east) {\footnotesize{$\textrm{x}_3$}};
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=2.5em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (rlayer1) at ([yshift=1em]x1.north) {\tiny{rlayer}};
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=2.5em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (rlayer2) at ([yshift=1em]x2.north) {\tiny{rlayer}};
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=2.5em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (rlayer3) at ([yshift=1em]x3.north) {\tiny{rlayer}};
\draw [->,thick] (x1.north) -- (rlayer1.south);
\draw [->,thick] (x2.north) -- (rlayer2.south);
\draw [->,thick] (x3.north) -- (rlayer3.south);
\draw [->,thick] (rlayer1.east) -- (rlayer2.west);
\draw [->,thick] (rlayer2.east) -- (rlayer3.west);
\draw [->,thick] (rlayer1.north) -- ([yshift=1em]rlayer1.north);
\draw [->,thick] (rlayer2.north) -- ([yshift=1em]rlayer2.north);
\draw [->,thick] (rlayer3.north) -- ([yshift=1em]rlayer3.north);

\visible<2->{
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=9.4em,minimum height=1.0em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h1) at ([yshift=1em]rlayer2.north) {\tiny{h1 = Merge($\cdot$)}};
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=9.4em,minimum height=1.0em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h2) at ([yshift=1em]h1.north) {\tiny{h2 = Relu($\cdot$)}};
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=9.4em,minimum height=1.0em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h3) at ([yshift=1em]h2.north) {\tiny{h3 = Sum($\cdot$)}};
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=9.4em,minimum height=1.0em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h4) at ([yshift=1em]h3.north) {\tiny{h4 = Softmax($\cdot$)}};
\draw [->,thick] (h1.north) -- (h2.south);
\draw [->,thick] (h2.north) -- (h3.south);
\draw [->,thick] (h3.north) -- (h4.south);
\draw [->,thick,rounded corners] (h1.east) -- ([xshift=0.5em]h1.east) -- ([xshift=0.5em,yshift=0.5em]h2.north east) -- ([xshift=-2em,yshift=0.5em]h2.north east) -- ([xshift=-2em,yshift=1em]h2.north east);
}

\visible<3->{
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=9.4em,minimum height=1.0em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (slayer) at ([yshift=1em]h4.north) {\tiny{Split($\cdot$)}};
\node [anchor=south,draw,circle,inner sep=1pt,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (y2) at ([yshift=1em]slayer.north) {\footnotesize{$\textrm{y}_2$}};
\node [anchor=east,draw,circle,inner sep=1pt,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (y1) at ([xshift=-2em]y2.west) {\footnotesize{$\textrm{y}_1$}};
\node [anchor=west,draw,circle,inner sep=1pt,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (y3) at ([xshift=2em]y2.east) {\footnotesize{$\textrm{y}_3$}};
\draw [<-,thick] (y1.south) -- ([yshift=-1em]y1.south);
\draw [<-,thick] (y2.south) -- ([yshift=-1em]y2.south);
\draw [<-,thick] (y3.south) -- ([yshift=-1em]y3.south);
}

\visible<2->{
\draw [->,thick] (h4.north) -- (slayer.south);
}

\end{tikzpicture}
\end{center}
\end{tcolorbox}

\end{frame}


%%%------------------------------------------------------------------------------------------------------------
\subsection{参数学习 - 反向传播}

%%%------------------------------------------------------------------------------------------------------------
%%% outline: problem 3
\begin{frame}{还有一个问题}

\vspace{6em}
\begin{tcolorbox}[enhanced,size=normal,left=2mm,right=1mm,colback=red!5!white,colframe=red!75!black,drop fuzzy shadow]
{\Large
\textbf{如何对模型中的参数进行学习,}

\vspace{0.4em}
\textbf{之后使用学习到的模型进行推断？}
}
\end{tcolorbox}

\vspace{2em}
\begin{center}
\begin{tikzpicture}
\node [fill=blue!10] (label) at (0,0) {\LARGE{$\frac{\partial L(\textbf{w})}{\partial \textbf{w}} = $ ? }};
\end{tikzpicture}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 神经网络 = 表达式
\begin{frame}{神经网络 = 函数表达式}
\begin{itemize}
\item 所有的神经网络都可以看做由变量和函数组成的表达式\\
\end{itemize}

\begin{center}
\begin{tikzpicture}
\node [anchor=north west] (eq1) at (0,0) {$\textbf{y} = \textbf{x} + \textbf{b}$};
\node [anchor=north west] (eq2) at (eq1.south west) {$\textbf{y} = \textrm{Relu}(\textbf{x} \cdot \textbf{w} + \textbf{b})$};
\node [anchor=north west] (eq3) at (eq2.south west) {$\textbf{y} = (\textrm{Relu}(\textbf{x} \cdot \textbf{w}_1 + \textbf{b}) + \textbf{x}) \cdot \textbf{w}_2$};
\node [anchor=north west] (eq4) at (eq3.south west) {$\textbf{y} = \textrm{Sigmoid}(\textrm{Relu}(\textbf{x} \cdot \textbf{w}_1 + \textbf{b}_1) + \textbf{x}) \cdot \textbf{w}_2 + \textbf{b}_2$};

\visible<2->{
\node [anchor=north west,minimum height=1.2em,minimum width=1.2em,fill=green!30!white] (xlabel) at ([yshift=-0.5em,xshift=0.3em]eq4.south west) {};
\node [anchor=west] (xlabel2) at (xlabel.east) {输入变量 - 由用户指定};
}

\begin{pgfonlayer}{background}
\visible<2->{
\node [anchor=south, minimum height=1.6em,minimum width=0.8em,fill=green!30!white] (x1) at ([xshift=-1.3em]eq4.south) {};
\node [anchor=south, minimum height=1.6em,minimum width=0.8em,fill=green!30!white] (x2) at ([xshift=4.9em]eq4.south) {};
}
\end{pgfonlayer}

\visible<3->{
\node [anchor=north west,minimum height=1.2em,minimum width=1.2em,fill=red!30!white] (wlabel) at ([yshift=-0.3em]xlabel.south west) {};
\node [anchor=west] (wlabel2) at (wlabel.east) {模型参数 - 怎么设置？？？};
}

\begin{pgfonlayer}{background}
\visible<3->{
\node [anchor=south, minimum height=1.6em,minimum width=1.2em,fill=red!30!white] (w1) at ([xshift=0.2em]eq4.south) {};
\node [anchor=south, minimum height=1.6em,minimum width=1.2em,fill=red!30!white] (b1) at ([xshift=2.5em]eq4.south) {};
\node [anchor=south, minimum height=1.6em,minimum width=1.2em,fill=red!30!white] (w2) at ([xshift=6.85em]eq4.south) {};
\node [anchor=south, minimum height=1.6em,minimum width=1.2em,fill=red!30!white] (b2) at ([xshift=9.2em]eq4.south) {};
}
\end{pgfonlayer}

\end{tikzpicture}
\end{center}

\visible<4->{
\begin{tcolorbox}[enhanced,size=normal,left=2mm,right=1mm,colback=blue!5!white,colframe=blue!75!black,drop fuzzy shadow]
{\Large
\textbf{问题来了：}

\vspace{0.4em}
\textbf{如何确定w和b，使x与y对应得更好？}
}
\end{tcolorbox}
}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 学习的目标是什么
\begin{frame}{目标函数和损失函数}
\begin{itemize}
\item 这是一个典型的优化问题，有两个基本问题\\
    \begin{enumerate}
    \item 优化的目标是什么？
    \item 如何调整参数$\textbf{w}$和$\textbf{b}$达成目标？
    \end{enumerate}
\item<2-> \textbf{定义目标}：对于给定$\textbf{x}$，什么样的$\textbf{y}$是好的
    \begin{itemize}
    \item 假设：多个输入样本$\{\textbf{x}_1,...,\textbf{x}_n\}$，每个$\textbf{x}_i$都对应\alert{正确答案}$\tilde{\textbf{y}}_i$
    \item 对于一个神经网络$\textbf{y}=f(\textbf{x})$，每个$\textbf{x}_i$也会有一个输出$\textbf{y}_i$
    \item 如果可以度量答案$\tilde{\textbf{y}}_i$和网络输出$\textbf{y}_i$之间的偏差，进而调整网络参数减小这种偏差，就可以得到更好的模型
    \end{itemize}
\end{itemize}

\visible<3->{
\vspace{-0.7em}
\begin{center}
\begin{tikzpicture}
\begin{scope}[yscale=0.2,xscale=0.8]
\draw[-,very thick,ublue,domain=-4.2:3.5,samples=100] plot (\x,{ - 1/14 * (\x + 4) * (\x + 1) * (\x - 1) * (\x - 3)});
\visible<4->{
\draw[-,very thick,ugreen,domain=-3.8:3.0,samples=100] plot (\x,{ - 1/14 * (4*\x*\x*\x + 3*\x*\x - 26*\x - 1)});
}
\draw[->,thick] (-6,0) -- (5,0);
\draw[->,thick] (-5,-4) -- (-5,5);

\draw [<-] (-2.5,4) -- (-2,5) node [pos=1,right,inner sep=2pt] {\footnotesize{答案$\tilde{\textbf{y}}_i$}};
\visible<4->{
\draw [<-] (-3,-3) -- (-2.5,-2) node [pos=0,left,inner sep=2pt] {\footnotesize{预测$\textbf{y}_i$}};}

\visible<5->{
\draw [<-] (2.3,1) -- (3.3,2) node [pos=1,right,inner sep=2pt] {\footnotesize{偏差$|\tilde{\textbf{y}}_i - \textbf{y}_i|$}};
\foreach \x in {-3.8,-3.7,...,3.0}{
    \pgfmathsetmacro{\p}{- 1/14 * (\x + 4) * (\x + 1) * (\x - 1) * (\x - 3)};
    \pgfmathsetmacro{\q}{- 1/14 * (4*\x*\x*\x + 3*\x*\x - 26*\x - 1)};
    \draw [-] (\x,\p) -- (\x, \q);
}
}

\end{scope}
\end{tikzpicture}
\end{center}
}

\vspace{-0.3em}
\begin{itemize}
\item<6-> 这个过程就是\alert{参数优化/训练}，而$\tilde{\textbf{y}}_i$和$\textbf{y}_i$之间偏差的度量就是一种\alert{损失函数}，也称作训练的\alert{目标函数}，而优化的目标就是\textbf{最小化损失函数}
\end{itemize}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 常见的目标函数
\begin{frame}{常见的损失函数}

\begin{itemize}
\item 损失函数记为$Loss(\tilde{\textbf{y}}_i,\textbf{y}_i)$，简记为$L$，以下是常用的定义
\end{itemize}

\vspace{0.5em}

\footnotesize{
\renewcommand{\arraystretch}{1.2}
\begin{tabular}{l | l | l | l}
名称 & 定义 & NiuTensor实现(\texttt{yh}表示$\tilde{\textbf{y}}_i$) & 应用 \\ \hline
0-1 & $L = \left\{ \begin{array}{ll} 0 & \tilde{\textbf{y}}_i = \textbf{y}_i \\ 1 & \tilde{\textbf{y}}_i \ne \textbf{y}_i \end{array} \right.$ & \scriptsize{\texttt{L = Sign(Absolute(yh - y))}} & 感知机 \\
Hinge & $L=\max(0,1-\tilde{\textbf{y}}_i \cdot \textbf{y}_i)$ & \scriptsize{\texttt{L = Max(0, 1 - yh * y))}} & SVM \\
绝对值 & $L=|\tilde{\textbf{y}}_i - \textbf{y}_i|$ & \scriptsize{\texttt{L = Absolute(yh - y)}} & 回归 \\
Logistic & $L=\log(1 + \tilde{\textbf{y}}_i \cdot \textbf{y}_i)$ & \scriptsize{\texttt{L = Log(1 + yh * y)}} & 回归 \\
平方 & $L=(\tilde{\textbf{y}}_i - \textbf{y}_i)^2$ & \scriptsize{\texttt{L = Power(yh - y, 2)}} & 回归 \\
指数 & $L=\exp(- \tilde{\textbf{y}}_i \cdot \textbf{y}_i) $ & \scriptsize{\texttt{L = Exp(Negate(yh * y))}} & \scriptsize{AdaBoost} \\
交叉熵 & $L=-\sum_k \textbf{y}_i^{[k]} \log \tilde{\textbf{y}}_i^{[k]} $ & \scriptsize{\texttt{L = CrossEntropy(y, yh)}} & 多分类 \\
       & \scriptsize{$\textbf{y}_i^{[k]}$: $\textbf{y}_i$的第$k$维} & & \\
\end{tabular}
\renewcommand{\arraystretch}{1.0}
}

\vspace{-0.5em}
\begin{itemize}
\item 注意：
    \begin{itemize}
    \item 损失函数可以根据问题不同进行选择，没有固定要求
    \item 有些损失函数对网络输出有约束，比如交叉熵要求$\tilde{\textbf{y}}_i$和$\textbf{y}_i$都是概率分布
    \end{itemize}
\end{itemize}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 优化目标函数
\begin{frame}{参数优化}

\begin{itemize}
\item 对于第$i$个输入样本($\textbf{x}_i$,$\tilde{\textbf{y}}_i$)，如果把损失函数看做参数$\textbf{w}$的函数（把$\textbf{b}$也作为一种$\textbf{w}$），记为$L(\textbf{x}_i,\tilde{\textbf{y}}_i;\textbf{w})$，则参数学习可以被描述为：\\

\begin{displaymath}
\hat{\textbf{w}} = \argmin_{\textbf{w}} \frac{1}{n} \sum_{i=1}^{n} L(\textbf{x}_i,\tilde{\textbf{y}}_i;\textbf{w})
\end{displaymath}

$\hat{\textbf{w}}$表示在训练集上使得损失的平均值达到最小的参数。$\frac{1}{n} \sum_{i=1}^{n} L(\textbf{x}_i,\tilde{\textbf{y}}_i;\textbf{w})$被称作代价函数(cost function)，它是损失函数均值期望的估计。

\vspace{0.5em}

\item<2-> 核心问题：\textbf{求解$\argmin$，即找到代价函数最小值点}
    \begin{itemize}
    \item 这是非常常见的问题，回忆一下第三章的IBM模型，当时使用的是EM算法
    \item 但是这里并不是一个生成模型
    \item 需要一种更加通用的求解方法
    \end{itemize}

\end{itemize}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 梯度下降
\begin{frame}{梯度下降（Gradient Descent）}

\begin{itemize}
\item 如果把目标函数看做是参数$\textbf{w}$的函数，记为$J(\textbf{w})$。优化目标是：找到使$J(\textbf{w})$达到最小的$\textbf{w}$
\item 注意，$\textbf{w}$可能包含几亿个实数，不能是SMT中MERT之类的调参方法。这里可以考虑一种更加适合大量实数参数的优化方法，其核心思想是\alert{梯度下降}：
    \begin{itemize}
    \item<2-> 如果$J(\textbf{w})$对于$\textbf{w}$可微分，$\frac{\partial J(\textbf{w})}{\partial \textbf{w}}$表示$J$在$\textbf{w}$处变化最大的方向
    \item<2-> $\textbf{w}$沿着梯度方向更新，新的$\textbf{w}$可以使函数更接近极值
    \end{itemize}
\end{itemize}

\pgfplotsset{%
  colormap={whitered}{color(-1cm)=(orange!75!red);color(1cm)=(white)}
}


\begin{center}
\begin{tikzpicture}[
  declare function = {mu1=1;},
  declare function = {mu2=2;},
  declare function = {sigma1=0.5;},
  declare function = {sigma2=1;},
  declare function = {normal(\m,\s)=1/(2*\s*sqrt(pi))*exp(-(x-\m)^2/(2*\s^2));},
  declare function = {bivar(\ma,\sa,\mb,\sb)=1/(2*pi*\sa*\sb) * exp(-((x-\ma)^2/\sa^2 + (y-\mb)^2/\sb^2))/2;}]
  \footnotesize{
  \visible<2->{
  \begin{scope}
  \begin{axis}[
    colormap name  = whitered,
    width          = 8cm,
    height         = 5cm,
    view           = {20}{45},
    enlargelimits  = false,
    grid           = major,
    domain         = -1:3,
    y domain       = 0:4,
    samples        = 30,
    xlabel         = $\textbf{w}^{[1]}$,
    ylabel         = $\textbf{w}^{[2]}$,
    xlabel style   = {xshift=0em,yshift=0.8em},
    ylabel style   = {xshift=0.2em,yshift=0.8em},
    zlabel         = {$J(\textbf{w})$},
    ztick          = {-0.1},
    colorbar,
    colorbar style = {
      at     = {(1.2,0.5)},
      anchor = north west,
      ytick  = {0,-0.1},
      height = 0.25*\pgfkeysvalueof{/pgfplots/parent axis height},
      title  = {}
    }
  ]

    \addplot3 [surf] {-bivar(mu1,sigma1,mu2,sigma2)};

    \node [circle,fill=red,minimum size=3pt,inner sep=1.5pt] () at (axis cs:0.5,2,-0.01) {};

    \draw [->,very thick,ublue] (axis cs:0.5,2,-0.01) -- (axis cs:0.8,1.6,-0.03) node [pos=1,right,inner sep=2pt] {\tiny{-$\frac{\partial J(\textbf{w})}{\partial \textbf{w}}$}};
    \draw [->,very thick,dotted] (axis cs:0.5,2,-0.01) -- (axis cs:0.2,1.5,-0.03);
    \draw [->,very thick,dotted] (axis cs:0.5,2,-0.01) -- (axis cs:0.2,3.5,-0.03);
    %\draw [black!50] (axis cs:0,-1,0) -- (axis cs:0,4,0);

  \end{axis}
  \end{scope}
  }
  }
\end{tikzpicture}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 梯度下降的变种
\begin{frame}{梯度下降的不同实现方式}

\begin{itemize}
\item \textbf{梯度下降}：我们可以沿着梯度方向更新$\textbf{w}$一小步，之后得到更好的$\textbf{w}$，之后重新计算梯度，不断重复上述过程

\begin{displaymath}
\textbf{w}_{t+1} = \textbf{w}_t - \alpha \cdot \frac{\partial J(\textbf{w}_t)}{\partial \textbf{w}_t}
\end{displaymath}

其中$t$表示更新的步数，$\alpha$是一个参数，表示更新步幅的大小。$\alpha$的设置需要根据任务进行调整。而$J(\textbf{w}_t)$的形式决定了具体的算法具体的实现。

\item<2-> \textbf{批量梯度下降(Batch Gradient Descent)}：

\begin{displaymath}
J(\textbf{w}_t) = \frac{1}{n} \sum_{i=1}^{n} L(\textbf{x}_i,\tilde{\textbf{y}}_i;\textbf{w}_t)
\end{displaymath}

这种方法训练稳定，但是由于每次更新需要对所有训练样本进行遍历，效率低（比如$n$很大），大规模数据上很少使用

\end{itemize}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 梯度下降的变种
\begin{frame}{梯度下降的不同实现方式(续)}

\begin{itemize}
\item \textbf{随机梯度下降(Stochastic Gradient Descent)}：

\begin{displaymath}
J(\textbf{w}_t) = L(\textbf{x}_i,\tilde{\textbf{y}}_i;\textbf{w}_t)
\end{displaymath}

大名鼎鼎的SGD，所有机器学习的课程里几乎都有介绍。每次随机选取一个样本进行梯度计算和参数更新，更新的计算代价低，而且适用于利用少量样本进行在线学习(online learning)，不过方法收敛慢


\vspace{0.3em}

\item<2-> \textbf{小批量梯度下降(Mini-batch Gradient Descent)}：

\begin{displaymath}
J(\textbf{w}_t) = \frac{1}{m} \sum_{i=j}^{j+m-1} L(\textbf{x}_i,\tilde{\textbf{y}}_i;\textbf{w}_t)
\end{displaymath}

每次随机使用若干样本进行参数更新(数量不会特别大)，算是一种折中方案，当今最常用的方法之一

\end{itemize}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 关于梯度下降的改进
\begin{frame}{一些改进}

\begin{itemize}
\item \textbf{变种和改进}：提高基于梯度的方法的收敛速度、训练稳定性等，可以google一下
    \begin{itemize}
    \item Momentum, Adagrad, Adadelta, RMSprop, Adam, AdaMax, Nadam, AMSGrad等等
    \item \footnotesize{\url{http://ruder.io/optimizing-gradient-descent}}
    \end{itemize}
\item<2-> \textbf{并行化}：大规模数据处理需要分布式计算，梯度更新的策略需要设计
    \begin{itemize}
    \item \textbf{同步更新}：所有计算节点完成计算后，统一汇总并更新参数。效果稳定，但是并行度低
    \item \textbf{异步更新}：每个节点可以随时更新。并行度高，但是由于节点间参数可能不同步，方法不十分稳定
    \end{itemize}
\item<3-> \textbf{其它}
    \begin{itemize}
    \item 深度网络梯度消失和爆炸的问题，使用梯度裁剪、残差连接等
    \item 引入正则化因子，可以对外部知识建模，比如引入噪声让训练更稳定
    \end{itemize}
\end{itemize}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 如何计算梯度
\begin{frame}{如何计算梯度? - 数值微分}

\begin{itemize}
\item \textbf{还有一个核心问题}：如何计算梯度
\begin{displaymath}
\frac{\partial L(\textbf{w})}{\partial \textbf{w}} = ?
\end{displaymath}

\vspace{0.5em}

\item<2-> \textbf{数值微分} - 简单粗暴的方法
\begin{displaymath}
\frac{\partial L(\textbf{w})}{\partial \textbf{w}} = \lim_{\Delta \textbf{w} \to 0} \frac{L(\textbf{w} + \Delta \textbf{w}) - L(\textbf{w} - \Delta \textbf{w}) }{2\Delta \textbf{w}}
\end{displaymath}

最基本的微分公式，我们可以将$\textbf{w}$变化一点儿（用$\Delta \textbf{w}$表示），之后看$L(\cdot)$的变化

    \begin{itemize}
    \item<3-> \textbf{优点很明显}：方法真的非常简单，易于实现
    \item<3-> \textbf{缺点也很明显}：效率太低，对于复杂网络、参数量稍微大一些的模型基本上无法使用
    \end{itemize}

\end{itemize}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 如何计算梯度 - 符号微分
\begin{frame}{如何计算梯度? - 符号微分}

\begin{itemize}
\item \textbf{符号微分}：类似于手写出微分表达式，最后带入变量的值，得到微分结果。比如，对于如下表达式
\begin{displaymath}
L(\textbf{w}) = \textbf{x} \cdot \textbf{w} + 2 \textbf{w}^2
\end{displaymath}

\visible<2->{
\vspace{0.5em}
可以手动推导出微分表达式

\begin{displaymath}
\frac{\partial L(\textbf{w})}{\partial \textbf{w}} = \textbf{x} + 4 \textbf{w}
\end{displaymath}
}

\visible<3->{
\vspace{0.5em}
最后，带入$\textbf{x} = \begin{pmatrix} 2 \\ -3 \end{pmatrix}$和$\textbf{w} = \begin{pmatrix} -1 \\ 1 \end{pmatrix}$，得到微分结果\\

\vspace{1em}

\begin{displaymath}
\frac{\partial L(\textbf{w})}{\partial \textbf{w}} =  \begin{pmatrix} 2 \\ -3 \end{pmatrix} + 4 \begin{pmatrix} -1 \\ 1 \end{pmatrix} = \begin{pmatrix} -2 \\ 1 \end{pmatrix}
\end{displaymath}
}

\end{itemize}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 符号微分的膨胀问题
\begin{frame}{符号微分的膨胀问题}

\begin{itemize}
\item \textbf{Expression Swell}：深层函数的微分表达式会非常复杂
	\begin{itemize}
	\item 表达式冗长不易存储和管理
	\item 真正需要的是微分的\alert{结果值}，而不是微分表达式
	\end{itemize}
\end{itemize}

\vspace{0.5em}

{\small
\begin{tabular} {l | l | l}
函数 & 微分表达式 & 化简的微分表达式 \\ \hline
$x$ & $1$ & $1$ \\ \hline
$x(x+1)$ & $(x+1)+x$ & $2x + 1$ \\ \hline
$x(x+1)$ & $(x+1)(x^2+x+1)$ & $4x^3+6x^2$ \\
$(x^2+x+1)$ & $+x(x^2+x+1)$ & $+4x+1$ \\
                     & $+x(x+1)(2x+1)$ & \\ \hline
$(x^2+x)$ & $(2x+1)(x^2+x+1)$ & $8x^7+28x^6$ \\
$(x^2+x+1)$ & $(x^4+2x^3+2x^2+x+1)$ & $+48x^5+50x^4$ \\
$(x^4+2x^3$ & $+(2x+1)(x^2+x)$ & $+36x^3+18x^2$ \\
$+2x^2+x+1)$ & \ \ $(x^4+2x^3+2x^2+x+1)$ & $+6x+1$ \\
 & $+(x^2+x)(x^2+x+1)$ & \\
 & \ \ $(4x^3+6x^2+4x+1)$ & \\


\end{tabular}
}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 自动微分
\begin{frame}{如何计算梯度? - 自动微分}

\begin{itemize}
\item \textbf{自动微分}：复杂的微分变成简单的步骤，这些步骤完全自动化，而且容易进行存储、计算。这可以用一种反向模式进行描述（也就是\alert{反向传播}思想），包括两步
	\begin{enumerate}
	\item \textbf{前向计算}：从神经网络的输入，逐层计算每层网络的输出值，这也是神经网络的标准使用方式
	\item \textbf{反向计算}：从神经网络的输出，逆向逐层计算每层网络输入（输出）所对应的微分
	\end{enumerate}
\end{itemize}

\visible<2->{
\vspace{-1em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
\tikzstyle{layernode} = [draw,thick,fill=ugreen!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}];

\node [anchor=center,layernode,minimum height=4em,minimum width=1em] (layer01) at (0,0) {};
\node [anchor=north west,layernode,minimum height=3em,minimum width=1em] (layer02) at ([xshift=3em]layer01.north east) {};
\node [anchor=south west,layernode,minimum height=3em,minimum width=1em] (layer03) at ([xshift=7em]layer01.south east) {};
\node [anchor=south west,layernode,minimum height=4em,minimum width=1em] (layer04) at ([xshift=11em]layer01.south east) {};
\node [anchor=south west,layernode,minimum height=4em,minimum width=1em] (layer05) at ([xshift=3em]layer04.south east) {};

\node [anchor=east] (input) at ([xshift=-1em]layer01.west){\scriptsize{输入}};
\node [anchor=west] (output) at ([xshift=1em]layer05.east){\scriptsize{输出}};

\draw [->] ([xshift=-1em]layer01.west) -- ([xshift=-0.1em]layer01.west);
\draw [->] ([xshift=0.1em,yshift=-0.5em]layer01.north east) -- ([xshift=-0.1em,yshift=-0.5em]layer02.north west);
\draw [->] ([xshift=0.1em,yshift=0.5em]layer01.south east) -- ([xshift=-0.1em,yshift=0.5em]layer03.south west);
\draw [->] ([xshift=0.1em,yshift=-0.5em]layer02.north east) -- ([xshift=-0.1em,yshift=-0.5em]layer04.north west);
\draw [->] ([xshift=0.1em,yshift=0.5em]layer03.south east) -- ([xshift=-0.1em,yshift=0.5em]layer04.south west);
\draw [->] ([xshift=0.1em]layer04.east) -- ([xshift=-0.1em]layer05.west);
\draw [->] ([xshift=0.1em]layer05.east) -- ([xshift=1.0em]layer05.east);

\visible<3->{
\draw [->,very thick,ublue] ([xshift=-1em]layer01.west) -- ([xshift=-0.1em]layer01.west);
}
\visible<4->{
\draw [->,very thick,ublue] ([xshift=0.1em,yshift=-0.5em]layer01.north east) -- ([xshift=-0.1em,yshift=-0.5em]layer02.north west);
}
\visible<5->{
\draw [->,very thick,ublue] ([xshift=0.1em,yshift=0.5em]layer01.south east) -- ([xshift=-0.1em,yshift=0.5em]layer03.south west);
}
\visible<6->{
\draw [->,very thick,ublue] ([xshift=0.1em,yshift=-0.5em]layer02.north east) -- ([xshift=-0.1em,yshift=-0.5em]layer04.north west);
\draw [->,very thick,ublue] ([xshift=0.1em,yshift=0.5em]layer03.south east) -- ([xshift=-0.1em,yshift=0.5em]layer04.south west);
\draw [->,very thick,ublue] ([xshift=0.1em]layer04.east) -- ([xshift=-0.1em]layer05.west);
\draw [->,very thick,ublue] ([xshift=0.1em]layer05.east) -- ([xshift=1.0em]layer05.east);
}

\visible<8->{
\draw [<-,very thick,red] ([xshift=-1em,yshift=-0.3em]layer01.west) -- ([xshift=-0.1em,yshift=-0.3em]layer01.west);
\draw [<-,very thick,red] ([xshift=0.1em,yshift=-0.8em]layer01.north east) -- ([xshift=-0.1em,yshift=-0.8em]layer02.north west);
\draw [<-,very thick,red] ([xshift=0.1em,yshift=0.2em]layer01.south east) -- ([xshift=-0.1em,yshift=0.2em]layer03.south west);
\draw [<-,very thick,red] ([xshift=0.1em,yshift=-0.8em]layer02.north east) -- ([xshift=-0.1em,yshift=-0.8em]layer04.north west);
\draw [<-,very thick,red] ([xshift=0.1em,yshift=0.2em]layer03.south east) -- ([xshift=-0.1em,yshift=0.2em]layer04.south west);
\draw [<-,very thick,red] ([xshift=0.1em,yshift=-0.3em]layer04.east) -- ([xshift=-0.1em,yshift=-0.3em]layer05.west);
\draw [<-,very thick,red] ([xshift=0.1em,yshift=-0.3em]layer05.east) -- ([xshift=1.0em,yshift=-0.3em]layer05.east);
}

\visible<7->{
\draw [<-,thin] ([xshift=0.3em,yshift=0.3em]layer04.east) .. controls +(35:1) and +(215:1) .. ([xshift=-2em,yshift=0.3em]layer05.north west) node [pos=1,above] {\scriptsize{前向：层$i$ 的输出$h_{i}$}};
}
\visible<9->{
\draw [<-,thin] ([xshift=0.3em,yshift=-0.7em]layer04.east) .. controls +(-35:1) and +(145:1) .. ([xshift=-2em,yshift=-0.3em]layer05.south west) node [pos=1,below] {\scriptsize{反向：$h_{i}$ 处的梯度$\frac{\partial L}{\partial h_i}$}};
}

\end{scope}
\end{tikzpicture}
\end{center}
}

\vspace{-1em}
\begin{itemize}
\item<10-> 自动微分可以用\alert{计算图}实现(TensorFlow、 NiuTensor 等)，不过计算图超出了课程的范围，建议大家自行学习
\end{itemize}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 反向传播 - 符号说明
\begin{frame}{符号说明}

\begin{itemize}
\item 以一个$K$层神经网络为例重新明确一下符号
    \begin{itemize}
    \item 这里假设每层神经网络中都不含偏置项（不含$\textbf{b}$）
    \end{itemize}
\end{itemize}

\vspace{-1em}
\begin{center}
\begin{tikzpicture}
\begin{scope}

\def\neuronsep{1}
\tikzstyle{neuronnode} = [minimum size=1.2em,circle,draw,ublue,very thick,inner sep=1pt, fill=white,align=center,drop shadow={shadow xshift=0.1em,shadow yshift=-0.1em}];

%%% layer 1
\foreach \n in {1,...,4}{
    \node [neuronnode] (neuron0\n) at (\n * \neuronsep,0) {};
    \draw [->] ([yshift=-0.8em]neuron0\n.south) -- ([yshift=-0.1em]neuron0\n.south) node [pos=0,below] {\tiny{...}};
}


\begin{pgfonlayer}{background}
\node [rectangle,inner sep=0.2em,fill=red!20] [fit = (neuron01) (neuron04)] (layer01) {};
\node [anchor=east] (layer01label) at (layer01.west) {\scriptsize{层$k-1$}};
\end{pgfonlayer}

%%% layer 2
\foreach \n in {1,...,4}{
    \node [neuronnode] (neuron1\n) at (\n * \neuronsep,3em) {};
}

\foreach \n in {1,...,4}{
    \foreach \m in {1,...,4}{
        \draw [<-] (neuron1\n.south) -- (neuron0\m.north);
    }
}

\begin{pgfonlayer}{background}
\node [rectangle,inner sep=0.2em,fill=ugreen!20] [fit = (neuron11) (neuron14)] (layer02) {};
\node [anchor=east] (layer02label) at (layer02.west) {\scriptsize{层$k$}};
\end{pgfonlayer}

%%% layer 3
\foreach \n in {1,...,4}{
    \node [neuronnode] (neuron2\n) at (\n * \neuronsep,6em) {};
    \draw [<-] ([yshift=0.8em]neuron2\n.north) -- ([yshift=0.0em]neuron2\n.north) node [pos=0,above] {\tiny{...}};
}

\foreach \n in {1,...,4}{
    \foreach \m in {1,...,4}{
        \draw [<-] (neuron2\n.south) -- (neuron1\m.north);
    }
}

\begin{pgfonlayer}{background}
\node [rectangle,inner sep=0.2em,fill=blue!20] [fit = (neuron21) (neuron24)] (layer03) {};
\node [anchor=east] (layer03label) at (layer03.west) {\scriptsize{层$k+1$}};
\end{pgfonlayer}

%%% output layer
\foreach \n in {1,...,4}{
    \node [neuronnode] (neuron3\n) at (\n * \neuronsep,9.4em) {};
    \visible<1-3,5->{
    \draw [<-] ([yshift=0.6em]neuron3\n.north) -- ([yshift=0.0em]neuron3\n.north) node [pos=0,above] {\tiny{output}};
    }
    \visible<4>{
    \draw [<-,red,very thick] ([yshift=0.6em]neuron3\n.north) -- ([yshift=0.0em]neuron3\n.north) node [pos=0,above] {\tiny{output}};
    }
    \draw [->] ([yshift=-0.6em]neuron3\n.south) -- ([yshift=0.0em]neuron3\n.south);
}

\begin{pgfonlayer}{background}
\node [rectangle,inner sep=0.2em,fill=ugreen!20] [fit = (neuron31) (neuron34)] (layer04) {};
\node [anchor=east] (layer04label) at (layer04.west) {\scriptsize{层$K$(输出)}};
\end{pgfonlayer}

\visible<2->{
\node [neuronnode,draw=red,fill=red!20!white,inner sep=1pt] (neuron12new) at (2 * \neuronsep,3em) {};
\node [anchor=east] (neuronsamplelabel) at ([yshift=-1em]layer02label.south east) {\alert{\textbf{\tiny{第$k$层, 第$i$个神经元}}}};
\draw [->,dashed,very thick,red] ([xshift=-0.2em,yshift=0.2em]neuronsamplelabel.east) .. controls +(30:1) and +(220:1) .. ([xshift=-0em,yshift=-0em]neuron12new.210);
}

\visible<3>{
\foreach \n in {1,...,4}{
\draw [<-,thick,red] (neuron2\n.south) -- (neuron12.north);
}
}

\visible<5->{
\draw [<-,thick,red] (neuron14.south) -- (neuron04.north);
\node [anchor=north] (wlabel) at (layer02.south east) {\alert{\scriptsize{$w_{4,4}^{k}$}}};
}

\visible<3->{
\node [anchor=west,align=left] (line01) at ([xshift=1em,yshift=1em]layer04.east) {\footnotesize{$h_{i}^{k}$：第$k$层, 第$i$个神经元的输出}};
\node [anchor=north west,align=left] (line02) at (line01.south west) {\footnotesize{$\textbf{h}^{k}$：第$k$层的输出}};
\node [anchor=north west,align=left] (line03) at (line02.south west) {\footnotesize{$\textbf{s}^{k}$：第$k$层的线性变换$\textbf{s}^k=\textbf{h}^{k-1}\textbf{w}^k$}};
\node [anchor=north west,align=left] (line04) at (line03.south west) {\footnotesize{$f^{k}$：第$k$层的激活函数$\textbf{h}^k=f^k(\textbf{s}^k)$}};
}
\visible<4->{
\node [anchor=north west,align=left] (line05) at (line04.south west) {\footnotesize{$\textbf{h}^{K}$：网络最后的输出}};
}
\visible<5->{
\node [anchor=north west,align=left] (line06) at (line05.south west) {\footnotesize{$w_{j,i}^{k}$：第$k-1$层神经元$j$与}\\\footnotesize{第$k$层神经元$i$的连接权重}};
\node [anchor=north west,align=left] (line07) at (line06.south west) {\footnotesize{$\textbf{w}^{k}$：第$k-1$层与第$k$层的}\\\footnotesize{连接权重}};
}

\end{scope}
\end{tikzpicture}
\end{center}

\vspace{-1.5em}

\visible<6->{
\begin{displaymath} \textrm{对于第}k\textrm{层}: \textbf{h}^k = f^k(\textbf{s}^k) = f^k(\sum_j h_{j}^{k-1}w_{j,i}^k) = f^k(\textbf{h}^{k-1} \textbf{w}^k) \end{displaymath}
}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 输出层的反向传播
\begin{frame}{反向传播 - 输出层}

\begin{itemize}
\item 输出层(两个阶段)
\end{itemize}

\vspace{-1.0em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=center,minimum height=1.7em,fill=yellow!20,draw] (h) at (0,0) {$\textbf{h}^{K-1}$};
\node [anchor=west,minimum height=1.7em,fill=blue!20,draw] (s) at ([xshift=5.5em]h.east) {$\textbf{s}^{K}$};
\node [anchor=west,minimum height=1.7em,fill=green!20,draw] (h2) at ([xshift=5.5em]s.east) {$\textbf{h}^{K}$};
\node [anchor=west,minimum height=1.7em,fill=orange!20,draw] (l) at ([xshift=5.5em]h2.east) {$L$};
\draw [->] (h.east) -- (s.west);
\draw [->] (s.east) -- (h2.west);
\draw [->] (h2.east) -- (l.west) node [pos=0.5,above] {\tiny{损失}};

\node [anchor=south west,inner sep=2pt] (step100) at ([xshift=0.5em,yshift=-0.8em]h.north east) {\tiny{$\textbf{s}^K = \textbf{h}^{K-1} \textbf{w}^K$}};
\node [anchor=south west,inner sep=2pt] (step101) at (step100.north west) {\tiny{线性变换}};

\node [anchor=south west,inner sep=2pt] (step200) at ([xshift=0.5em,yshift=-0.8em]s.north east) {\tiny{$\textbf{h}^K = f^K(\textbf{s}^K)$}};
\node [anchor=south west,inner sep=2pt] (step201) at (step200.north west) {\tiny{激活函数}};

\node [anchor=south,inner sep=1pt] (outputlabel) at ([yshift=0.0em]h2.north) {\tiny{\textbf{输出层}}};

\visible<2->{
\draw[decorate,thick,decoration={brace,mirror,raise=0.4em,amplitude=2mm}] (h.south west) -- (s.south west) node [pos=0.5,below,yshift=-1em] {\scriptsize{\textbf{第一阶段：线性变换}}};
}
\visible<3->{
\draw[decorate,thick,decoration={brace,mirror,raise=0.4em,amplitude=2mm}] ([xshift=0.2em]s.south west) -- (l.south east) node [pos=0.5,below,yshift=-1em] (step2) {\scriptsize{\textbf{第二阶段：激活函数+损失函数}}};
}

\visible<4->{
\draw [->,very thick,red] ([yshift=1em,xshift=-0.1em]l.north) -- ([yshift=1em,xshift=0.1em]s.north) node [pos=0.5,above] {\tiny{反向求梯度\alert{$\frac{\partial L}{\partial \textbf{s}^K} = ?$}}};
\draw [-,very thick,red] ([yshift=0.5em]l.north) -- ([yshift=1.5em]l.north);
\draw [-,very thick,red] ([yshift=0.5em]s.north) -- ([yshift=1.5em]s.north);
}

\end{scope}

\end{tikzpicture}
\end{center}

\begin{itemize}
\item<4-> 反向传播从输出向输入传播梯度，因此我们先考虑阶段二\visible<5->{。令$\pi^k = \frac{\partial L}{\partial \textbf{s}^k}$表示损失$L$在第$k$层激活函数输入处的梯度，利用链式法有}

\vspace{-1.5em}
\visible<5->{
\begin{eqnarray}
\pi^K & = & \frac{\partial L}{\partial \textbf{s}^K} \nonumber \\
         & = & \frac{\partial L}{\partial \textbf{h}^K} \cdot \frac{\partial \textbf{h}^K}{\partial \textbf{s}^K}  \nonumber \\
         & = & \frac{\partial L}{\partial \textbf{h}^K} \cdot \frac{\partial f^K(\textbf{s}^K)}{\partial \textbf{s}^K}  \nonumber
%\frac{\partial L}{\partial \textbf{w}^K} & = & \frac{\partial L}{\partial \textbf{h}^K} \cdot \frac{\partial \textbf{h}^K}{\partial \textbf{w}^K} \nonumber \\
%                                                         & \visible<4->{=} & \visible<4->{\frac{\partial L}{\partial \textbf{h}^K} \cdot \frac{\partial f^K(\textbf{h}^{K-1} \textbf{w}^K)}{\partial \textbf{w}^K} \ \ \ \ (\textrm{因为}\textbf{h}^K=f^K(\textbf{h}^{K-1} \textbf{w}^K))} \nonumber \\
%                                                         & \visible<5->{=} & \visible<5->{\frac{\partial L}{\partial \textbf{h}^K} \cdot \frac{\partial f^K(\textbf{s}^K)}{\partial \textbf{s}^K} \cdot \frac{\partial \textbf{s}^K}{\partial \textbf{w}^K}  \ \ \ (\textrm{因为}\textbf{s}^K=\textbf{h}^{K-1} \textbf{w}^K)} \nonumber
\end{eqnarray}
}

\end{itemize}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 输出层的反向传播 - 各个因子的意义
\begin{frame}{反向传播 - 输出层($\textbf{s}^K$处的梯度)}
\begin{center}
\begin{tikzpicture}

\begin{scope}
\node [anchor=center] (factor00) at (0,0) {${\displaystyle \pi^K \ = }$};
\node [anchor=west] (factor01) at (factor00.east) {${\displaystyle \frac{\partial L}{\partial \textbf{h}^K}}$};
\node [anchor=west,inner sep=1pt] (factor02) at (factor01.east) {${\displaystyle \cdot}$};
\node [anchor=west] (factor03) at (factor02.east) {${\displaystyle \frac{\partial f^K(\textbf{s}^K)}{\partial \textbf{s}^K}}$};

\begin{pgfonlayer}{background}
\visible<2-4>{
\node [rectangle,inner sep=0em,fill=red!20] [fit = (factor01)] (p1) {};
}
\visible<3-4>{
\node [rectangle,inner sep=0em,fill=blue!20] [fit = (factor03)] (p2) {};
}
\visible<5->{
\node [circle,inner sep=0em,fill=green!20] [fit = (factor02)] (p3) {};
}
\end{pgfonlayer}

\end{scope}
\end{tikzpicture}
\end{center}

\begin{itemize}
\item<2-> \raisebox{-0.7em}{\tikz{\node [anchor=west,fill=red!20] (factor01) at (factor00.east) {$\frac{\partial L}{\partial \textbf{h}^K}$};}} 表示损失$L$相对网络输出的变化率，比如，对于$L = \frac{1}{2} ||\tilde{\textbf{y}} - \textbf{h}^K||^2$，有$\frac{\partial L}{\partial \textbf{h}^K} = \tilde{\textbf{y}} - \textbf{h}^K$
\item<3-> \raisebox{-0.7em}{\tikz{\node [anchor=west,fill=blue!20] (factor01) at (factor00.east) {$\frac{\partial f^K(\textbf{s}^K)}{\partial \textbf{s}^K}$};}} 表示激活函数相对于它自己的输入的变化率，比如，对于$f(\textbf{s}) = \frac{1}{1+\exp(-\textbf{s})}$，有$\frac{\partial f(\textbf{s})}{\partial \textbf{s}} = f(\textbf{s})(1-f(\textbf{s}))$
\item<4-> 这个结果符合直觉，在$s^K$出的梯度相当于在损失函数微分($\frac{\partial L}{\partial \textbf{h}^K}$)和激活函数微分($\frac{\partial f^K(\textbf{s}^K)}{\partial \textbf{s}^K}$) 的乘积\visible<5->{，注意这里所有操作都是单元级，比如张量按单元乘法}

\end{itemize}

\visible<4->{
\vspace{-0.5em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=west,minimum height=1.7em,fill=blue!20,draw] (s) at (0,0) {$\textbf{s}^{K}$};
\node [anchor=west,minimum height=1.7em,fill=green!20,draw] (h2) at ([xshift=5.5em]s.east) {$\textbf{h}^{K}$};
\node [anchor=west,minimum height=1.7em,fill=orange!20,draw] (l) at ([xshift=5.5em]h2.east) {$L$};
\draw [->] (s.east) -- (h2.west);
\draw [->] (h2.east) -- (l.west);

\draw [->,very thick,red] ([yshift=1em,xshift=-0.1em]l.north) -- ([yshift=1em,xshift=0.1em]h2.north) node [pos=0.5,above] {\tiny{求梯度\alert{$\frac{\partial L}{\partial \textbf{h}^K} = ?$}}};
\draw [->,very thick,red] ([yshift=1em,xshift=-0.1em]h2.north) -- ([yshift=1em,xshift=0.1em]s.north) node [pos=0.5,above] {\tiny{求梯度\alert{$\frac{\partial f^K(\textbf{s}^K)}{\partial \textbf{s}^K} = ?$}}};
\draw [-,very thick,red] ([yshift=0.5em]l.north) -- ([yshift=1.5em]l.north);
\draw [-,very thick,red] ([yshift=0.5em]h2.north) -- ([yshift=1.5em]h2.north);
\draw [-,very thick,red] ([yshift=0.5em]s.north) -- ([yshift=1.5em]s.north);

\end{scope}

\end{tikzpicture}
\end{center}
}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 输出层的反向传播 - 求 dL/dw
\begin{frame}{反向传播 - 输出层($\textbf{h}^{K-1}$处的梯度)}

\begin{itemize}
\item 已经得到$\textbf{s}^K$处的梯度\visible<2->{，下面求解两个问题}
	\begin{enumerate}
	\item<2-> 计算损失$L$对于第$K$层参数矩阵$\textbf{w}^K$的梯度，$\frac{\partial L}{\partial \textbf{w}^K}$
	\item<2-> 计算损失$L$对于第$K$层输入$\textbf{h}^{K-1}$的梯度，$\frac{\partial L}{\partial \textbf{h}^{K-1}}$
	\end{enumerate}
\end{itemize}

\vspace{-0.8em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=center,minimum height=1.7em,fill=yellow!20,draw] (h) at (0,0) {$\textbf{h}^{K-1}$};
\node [anchor=west,minimum height=1.7em,fill=blue!20,draw] (s) at ([xshift=5.5em]h.east) {$\textbf{s}^{K}$};
\node [anchor=west,minimum height=1.7em,fill=green!20,draw] (h2) at ([xshift=5.5em]s.east) {$\textbf{h}^{K}$};
\node [anchor=west,minimum height=1.7em,fill=orange!20,draw] (l) at ([xshift=5.5em]h2.east) {$L$};
\draw [->] (h.east) -- (s.west);
\draw [->] (s.east) -- (h2.west);
\draw [->] (h2.east) -- (l.west) node [pos=0.5,above] {\tiny{损失}};

\node [anchor=south west,inner sep=2pt] (step100) at ([xshift=0.5em,yshift=-0.8em]h.north east) {\tiny{$\textbf{s}^K = \textbf{h}^{K-1} \textbf{w}^K$}};

\node [anchor=south west,inner sep=2pt] (step200) at ([xshift=0.5em,yshift=-0.8em]s.north east) {\tiny{$\textbf{h}^K = f^K(\textbf{s}^K)$}};

\node [anchor=south,inner sep=1pt] (outputlabel) at ([yshift=0.0em]h2.north) {\tiny{\textbf{输出层}}};
\node [anchor=south west] (slabel) at ([yshift=1em,xshift=0.3em]s.north) {\scriptsize{\textbf{\alert{已经得到：$\pi^K = \frac{\partial L}{\partial \textbf{s}^K}$}}}};
\draw [->,red] ([yshift=0.3em]slabel.south) .. controls +(south:0.5) and +(north:0.5) .. ([xshift=0.5em]s.north);

\visible<2->{
\draw [->,very thick,red] ([yshift=1em,xshift=-0.1em]s.north) -- ([yshift=1em,xshift=0.1em]h.north) node [pos=0.5,above] {\tiny{\alert{$\frac{\partial L}{\partial \textbf{w}^K} = ?$, $\frac{\partial L}{\partial \textbf{h}^{K-1}} = ?$}}};
\draw [-,very thick,red] ([yshift=0.5em]h.north) -- ([yshift=1.5em]h.north);
\draw [-,very thick,red] ([yshift=0.5em]s.north) -- ([yshift=1.5em]s.north);
}

\end{scope}

\end{tikzpicture}
\end{center}

\begin{itemize}
\item<3-> 由于$\textbf{s}^K = \textbf{h}^{K-1} \textbf{w}^K$，而且$\pi^K = \frac{\partial L}{\partial \textbf{s}^K}$已经求解，可以得到(需要一些数学分析和线性代数的知识，推导一下!)：

\vspace{-1.2em}

\begin{eqnarray}
\frac{\partial L}{\partial \textbf{w}^K}      & = & [\textbf{h}^{K-1}]^T \pi^K \nonumber \\
\frac{\partial L}{\partial \textbf{h}^{K-1}} & = & \pi^K  [\textbf{w}^K]^T\nonumber
\end{eqnarray}

这里，$[\textbf{A}]^T$表示$\textbf{A}$的转置，$\pi^K  [\textbf{w}^K]^T$表示张量$\pi^K$\alert{矩阵乘}$\textbf{w}^K$的转置
\end{itemize}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 隐层的反向传播
\begin{frame}{反向传播 - 隐层}
\begin{itemize}
\item 对于任意隐层$k$，$\textbf{h}^k = f^k(\textbf{s}^k) = f^k(\textbf{h}^{k-1}\textbf{w}^k)$。给定：隐层输出处的梯度$\pi^k=\frac{\partial L}{\partial \textbf{h}^{k}}$，需要
	\begin{enumerate}
	\item 计算损失$L$对于第$k$层参数矩阵$\textbf{w}^k$的梯度，$\frac{\partial L}{\partial \textbf{w}^k}$
	\item 计算损失$L$对于第$k$层输入$\textbf{h}^{k-1}$的梯度，$\frac{\partial L}{\partial \textbf{h}^{k-1}}$
	\end{enumerate}
\item<2-> 直接套用上一页的方法，可以将$\pi^k=\frac{\partial L}{\partial \textbf{h}^{k}} \frac{\partial f^k(\textbf{s}^k)}{\partial \textbf{s}^{k}}$反向传播
\vspace{-0.0em}
\begin{eqnarray}
\frac{\partial L}{\partial \textbf{w}^k}      & = & [\textbf{h}^{k-1}]^T \pi^k \nonumber \\
\frac{\partial L}{\partial \textbf{h}^{k-1}} & = & \pi^k  [\textbf{w}^k]^T\nonumber
\end{eqnarray}

\end{itemize}

\visible<3->{
\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=center,draw,fill=red!20,minimum height=1.8em,minimum width=2.5em] (h) at (0,0) {$\textbf{h}^{k-1}$};
\node [anchor=west,draw,fill=blue!20,minimum height=1.8em,minimum width=2.5em] (s) at ([xshift=6em]h.east) {$\textbf{s}^{k}$};
\node [anchor=west,draw,fill=green!20,minimum height=1.8em,minimum width=2.5em] (h2) at ([xshift=6em]s.east) {$\textbf{h}^{k}$};
\node [anchor=east] (prev) at ([xshift=-2em]h.west) {...};
\node [anchor=west] (next) at ([xshift=2em]h2.east) {...};
\draw [->,thick] ([xshift=0.1em]prev.east) -- ([xshift=-0.1em]h.west);
\draw [->,thick] ([xshift=0.1em]h.east) -- ([xshift=-0.1em]s.west) node [pos=0.5,below] {\tiny{$\textbf{s}^k = \textbf{h}^{k-1}\textbf{w}^k$}};
\draw [->,thick] ([xshift=0.1em]s.east) -- ([xshift=-0.1em]h2.west) node [pos=0.5,below] {\tiny{$\textbf{h}^k = f(\textbf{s}^{k})$}};
\draw [->,thick] ([xshift=0.1em]h2.east) -- ([xshift=-0.1em]next.west);

\visible<4->{
\draw [<-,thick,red] ([xshift=0.1em,yshift=0.4em]h2.east) -- ([xshift=-0.1em,yshift=0.4em]next.west) node [pos=0.8,above] {\tiny{反向传播}};
}

\visible<5->{
\draw [<-,thick,red] ([xshift=0.1em,yshift=0.4em]s.east) -- ([xshift=-0.1em,yshift=0.4em]h2.west) node [pos=0.5,above] {\tiny{反向传播}};
}

\visible<6->{
\draw [<-,thick,red] ([xshift=0.1em,yshift=0.4em]h.east) -- ([xshift=-0.1em,yshift=0.4em]s.west) node [pos=0.5,above] {\tiny{反向传播}};
}

\visible<7->{
\draw [->,thick,red,dashed] ([yshift=-0.1em]h.south) -- ([yshift=-1em]h.south) -- ([yshift=-1em]h2.south) -- ([yshift=-0.1em]h2.south);
\node [anchor=north,red] (recur) at ([yshift=-1em]s.south) {\scriptsize{$k=k-1$重复上述过程}};
}

\visible<4->{
\node [anchor=south] (h2label) at (h2.north) {$\frac{\partial L}{\partial \textbf{h}^{k}}$};
}

\visible<5->{
\node [anchor=south] (slabel) at (s.north) {$\pi^k = \frac{\partial L}{\partial \textbf{s}^{k}}$};
}

\visible<6->{
\node [anchor=south] (hlabel) at (h.north) {$\frac{\partial L}{\partial \textbf{h}^{k-1}}$, $\frac{\partial L}{\partial \textbf{w}^{k}}$};
}

\end{scope}
\end{tikzpicture}
\end{center}
}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 反向传播实例
\begin{frame}{反向传播的实现}
\begin{itemize}
\item 对于一个多层神经网络很容易实现反向传播
\end{itemize}

\begin{tcolorbox}
[bicolor,sidebyside,righthand width=3.5cm,size=title,frame engine=empty,
 colback=blue!10!white,colbacklower=black!5!white]
 {\scriptsize
\begin{tabbing}
\texttt{XTensor x, y, gold, h[5], w[5], s[5];} \\
\texttt{XTensor dh[5], dw[5], ds[5];} \\
\texttt{...} // 前向过程 \\
\texttt{h[0] = x;} \\
\texttt{y = h[4];} \\

\visible<2->{
\texttt{} \\
\texttt{CrossEntropyBackward(dh[4], y, gold);} \\
\texttt{SoftmaxBackward(y, s[4], dh[4], ds[4]);}\\
\texttt{MMul(h[3], {\tiny X\_TRANS}, ds[4], {\tiny X\_NOTRANS}, dw[4]);}\\
\texttt{MMul(ds[4], {\tiny X\_NOTRANS}, w[4], {\tiny X\_RANS}, dh[3]);}\\
}

\visible<3->{
\texttt{} \\
\texttt{dh[2] = dh[3];}\\
\texttt{ReluBackward(h[2], s[2], dh[2], ds[2]);}\\
\texttt{MMul(h[1], {\tiny X\_TRANS}, ds[2], {\tiny X\_NOTRANS}, dw[2]);}\\
\texttt{MMul(ds[2], {\tiny X\_NOTRANS}, w[2], {\tiny X\_TRANS}, dh[2]);}\\
}

\visible<4->{
\texttt{} \\
\texttt{dh[1] = dh[1] + dh[3];}\\
}

\visible<5->{
\texttt{...} // 继续反向传播 \\
\texttt{} \\
\texttt{for(unsigned i = 0; i < 5; i++)\{} \\
\texttt{} \ \ \ \ ... // 通过\alert{\texttt{dw[i]}}访问参数的梯度\\
\texttt{\}}
}

\end{tabbing}
}
\tcblower
\begin{center}
\begin{tikzpicture}


\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.2em,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h1) at (0,0) {\tiny{x (input)}};
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.2em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h2) at ([yshift=1.5em]h1.north) {\tiny{h1 = Relu(x * w1)}};
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.2em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h3) at ([yshift=1.5em]h2.north) {\tiny{h2 = Relu(h1 * w2)}};
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.2em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h4) at ([yshift=1.5em]h3.north) {\tiny{h3 = h2 + h1}};

\visible<1-4>{\draw [->,thick] (h1.north) -- (h2.south);}
\visible<1-2>{\draw [->,thick] (h2.north) -- (h3.south);}
\visible<1-2>{\draw [->,thick] (h3.north) -- (h4.south);}
\visible<1-3>{\draw [->,thick,rounded corners] (h2.east) -- ([xshift=0.5em]h2.east) -- ([xshift=0.5em,yshift=0.5em]h3.north east) -- ([xshift=-2em,yshift=0.5em]h3.north east) -- ([xshift=-2em,yshift=1.5em]h3.north east);}

\visible<5>{\draw [<-,very thick,red] (h1.north) -- (h2.south);}
\visible<3->{\draw [<-,very thick,red] (h2.north) -- (h3.south);}
\visible<3->{\draw [<-,very thick,red] (h3.north) -- (h4.south);}
\visible<4->{\draw [<-,very thick,red,rounded corners] (h2.east) -- ([xshift=0.5em]h2.east) -- ([xshift=0.5em,yshift=0.5em]h3.north east) -- ([xshift=-2em,yshift=0.5em]h3.north east) -- ([xshift=-2em,yshift=1.5em]h3.north east);}

\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8.0em,minimum height=1.2em,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (slayer) at ([yshift=1.5em]h4.north) {\tiny{h4 = Softmax(h3 * w4) (output)}};
\node [anchor=south] (losslabel) at (slayer.north) {\scriptsize{\textbf{Cross Entropy Loss}}};

\visible<1>{\draw [->,thick] (h4.north) -- (slayer.south);}
\visible<2->{\draw [<-,very thick,red] (h4.north) -- (slayer.south);}

\end{tikzpicture}
\end{center}
\end{tcolorbox}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 自动微分的实现
\begin{frame}{更简单的实现}
\begin{itemize}
\item 幸运的是，现在几乎所有的主流深度学习框架都实现了自动微分，一个函数可以搞定
\end{itemize}

\begin{tcolorbox}
[bicolor,sidebyside,righthand width=3.5cm,size=title,frame engine=empty,
 colback=blue!10!white,colbacklower=black!5!white]
 {\scriptsize
\begin{tabbing}
\texttt{XTensor x, loss, gold, h[5], w[5], b[5];} \\
\texttt{...} \\

\texttt{} \\
\texttt{h[1] = Relu(MMul(x, w[1]) + b[1]);} \\
\texttt{h[2] = Relu(MMul(h[1], w[2]) + b[2]);} \\
\texttt{h[3] = HardTanH(h[2]);} \\
\texttt{h[4] = Softmax(MMul(h[3], w[3]));} \\
\texttt{loss = CrossEntropy(h[4], gold);} \\

\texttt{} \\
\texttt{XNet net;}\\
\alert{\texttt{net.Backward(loss);} //一行代码实现自动微分}\\

\texttt{} \\
\texttt{for(unsigned i = 0; i < 5; i++)\{} \\
\texttt{} \ \ \ \ ... // 通过\alert{\texttt{w[i].grad}}访问参数的梯度\\
\texttt{\}}

\end{tabbing}
}
\tcblower
\begin{center}
\begin{tikzpicture}


\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.0em,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h1) at (0,0) {\tiny{x (input)}};
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.0em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h2) at ([yshift=1.0em]h1.north) {\tiny{h1 = Relu(x * w1 + b1)}};
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.0em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h3) at ([yshift=1.0em]h2.north) {\tiny{h2 = Relu(h1 * w2 + b2)}};
\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8em,minimum height=1.0em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (h4) at ([yshift=1.0em]h3.north) {\tiny{h3 = HardTanh(h2)}};

\draw [->,thick] (h1.north) -- (h2.south);
\draw [->,thick] (h2.north) -- (h3.south);
\draw [->,thick] (h3.north) -- (h4.south);

\node [anchor=south,draw,rounded corners,inner sep=2pt,minimum width=8.0em,minimum height=1.0em,fill=red!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}] (slayer) at ([yshift=1.0em]h4.north) {\tiny{h4 = Softmax(h3 * w4) (output)}};
\node [anchor=south] (losslabel) at (slayer.north) {\scriptsize{\textbf{Cross Entropy Loss}}};

\draw [->,thick] (h4.north) -- (slayer.south);

\end{tikzpicture}
\end{center}
\end{tcolorbox}

\begin{itemize}
\item 其它优秀的自动微分实现也可以参考TensorFlow、 PyTorch等工具
\end{itemize}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 前向计算过程及其它值得关注的问题
\begin{frame}{前向计算及其它问题}
\begin{itemize}
\item \alert{前向计算}实际上就是网络构建的过程，有两种常用方式
    \begin{itemize}
    \item \textbf{动态图}(如PyTorch、NiuTensor)：写完函数表达式，前向计算即完成，易于调试
    \item \textbf{静态图}(如TensorFlow)：函数表达式完成后，并不能得到前向计算结果，需要显性调用一个Forward函数，但是计算图可以进行深度优化，执行效率较高
    \end{itemize}
\item<2-> 其它一些深度学习系统实现的问题，值得关注，不过这些都超出了本课程的范围
    \begin{itemize}
    \item \textbf{分布式训练}：对于复杂模型的海量数据训练，需要利用多个设备（多机、多卡）同时训练
    \item \textbf{低精度计算}：为了提高效率可以采用半精度或者定点数进行计算
    \item \textbf{模型压缩}：减少冗余，可以压缩模型，使得模型易于存储同时提高系统运行效率
    \item \textbf{训练方法和超参选择}：不同任务往往需要不同的训练策略，包括超参设置，坑很多，需要积累经验
    \end{itemize}
\end{itemize}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
\section{神经语言模型}

%%%------------------------------------------------------------------------------------------------------------
%%% outline: neural language modeling
\begin{frame}{进入正题}

\vspace{6em}
\begin{tcolorbox}[enhanced,size=normal,left=2mm,right=1mm,colback=red!5!white,colframe=red!75!black,drop fuzzy shadow]
{\Large
\textbf{如何将神经元网络应用到NLP？}

\vspace{0.4em}
\textbf{- 语言模型的神经网络建模}
}
\end{tcolorbox}

\vspace{2em}
\begin{center}
\begin{tikzpicture}
\end{tikzpicture}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
\subsection{前馈、循环、自注意力神经网络}

%%%------------------------------------------------------------------------------------------------------------
%%% 在NLP中神经网络能干什么？
\begin{frame}{自然语言处理遇到神经网络}
\begin{itemize}
\item 神经网络方法给自然语言处理(NLP)带来了新的思路
\end{itemize}

\begin{tabular} {l | l}
\textbf{传统基于统计的方法} & \textbf{深度学习方法} \\ \hline
基于\alert{离散}空间的表示模型 & 基于\alert{连续}空间的表示模型 \\
NLP问题的\alert{隐含结构}假设 & 无隐含结构假设，\alert{端到端}学习 \\
\alert{特征工程}为主 & 无显性特征，但需要\alert{设计网络} \\
特征、规则的\alert{存储耗资源} & 模型存储相对小，但\alert{计算慢}
\end{tabular}

\vspace{0em}

\begin{itemize}
\item<2-> 语言模型任务也可以使用深度学习方法(效果非常好)
    \begin{itemize}
    \item 语言模型要回答的问题是如何评价一个词串的好坏
    \item 可以回忆一下第二章提到的$n$元语法模型
    \end{itemize}
    \vspace{0.5em}
    \begin{displaymath}
    \textbf{P}(w_0 w_1 ... w_m) = ?
    \end{displaymath}

\end{itemize}

\visible<3->{
\begin{tcolorbox}[enhanced,size=normal,left=2mm,right=1mm,colback=blue!5!white,colframe=blue!75!black,drop fuzzy shadow]
{\Large
\textbf{如何对词串的生成概率进行建模？}
}
\end{tcolorbox}
}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% n-gram语言模型
\begin{frame}{$n$-gram语言模型}
\begin{itemize}
\item \textbf{链式法则}
\begin{eqnarray}
\textrm{P}(w_1 w_2 ... w_m)  & = & \textrm{P}(w_1) \textrm{P}(w_2|w_1) \textrm{P}(w_3 | w_1 w_2) ... \nonumber \\
                                               &    & \textrm{P}(w_m|w_1...w_{m-1}) \nonumber
\end{eqnarray}
\item<2-> \textbf{传统$n$-gram语言模型}：当前词仅依赖于前面$n-1$个词
\begin{eqnarray}
\textrm{P}(w_1 w_2 ... w_m)  & = & \textrm{P}(w_1) \textrm{P}(w_2|w_1) \textrm{P}(w_3 | w_1 w_2) ... \nonumber \\
                                               &    & \textrm{P}(w_m|\underbrace{w_{m-n+1}...w_{m-1}}_{\text{前面$n-1$个词}}) \nonumber
\end{eqnarray}
\vspace{-1.0em}
\ \ \ \ \ \ 其中
\begin{displaymath}
\textrm{P}(w_m | w_{m-n+1} ... w_{m-1})  = \frac{\textrm{count}(w_{m-n+1}...w_{m})}{\textrm{count}(w_{m-n+1}...w_{m-1})}
\end{displaymath}
\ \ \ \ \ \ $\textrm{count}(\cdot)$表示在训练数据上统计的频次
\end{itemize}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% n-gram lm => neural lm
\begin{frame}{$n$-gram生成概率的神经网络建模}
\begin{itemize}
\item 传统的$n$-gram语言模型实际上就是一个查询表，用$w_{m-n+1} ... w_{m}$查询$n$-gram概率$\textrm{P}(w_m | w_{m-n+1} ... w_{m-1})$
    \begin{itemize}
    \item 这张表本质上是一种$w_{m-n+1} ... w_{m}$的\alert{离散表示}
    \item 随着$n$的增大，\alert{数据稀疏}问题会非常严重，因为绝大多数$n$-gram是没见过的
    \item 因为要维护$n$-gram的索引，存储消耗大
    \end{itemize}
\item<2-> 另一种思路是直接对$\textrm{P}(w_m | w_{m-n+1} ... w_{m-1})$进行连续空间建模，即定义函数$g$，对于任意的$w_{m-n+1} ... w_{m}$有
    \begin{displaymath}
    g(w_{m-n+1} ... w_{m}) \approx \textrm{P}(w_m | w_{m-n+1} ... w_{m-1})
    \end{displaymath}



\item<3-> 最具代表性的方法是前馈神经网络(FNN)语言模型
    \begin{itemize}
    \item 经典中的经典，对现代神经语言模型的设计产生深远影响
    \end{itemize}

    \textbf{A Neural Probabilistic Language Model}\\
    \textbf{Bengio et al., 2003, Journal of Machine Learning Research 3: 1137-1155}
\end{itemize}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% FNNLM architecture
\begin{frame}{前馈神经网络语言模型(Bengio et al., 2003)}
\begin{itemize}
\item 以4-gram语言模型为例
\end{itemize}

\vspace{-1em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=west] (w0) at (0,0) {\footnotesize{$w_{i-3}$}};
\node [anchor=west] (w1) at ([xshift=2em]w0.east) {\footnotesize{$w_{i-2}$}};
\node [anchor=west] (w2) at ([xshift=2em]w1.east) {\footnotesize{$w_{i-1}$}};
\node [anchor=north] (index0) at ([yshift=0.5em]w0.south) {\tiny(index)};
\node [anchor=north] (index1) at ([yshift=0.5em]w1.south) {\tiny(index)};
\node [anchor=north] (index2) at ([yshift=0.5em]w2.south) {\tiny(index)};
\node [anchor=south,draw,inner sep=3pt] (e0) at ([yshift=1em]w0.north) {\tiny{$\textbf{e}_0=w_{i-3} \textbf{C}$}};
\node [anchor=south,draw,inner sep=3pt] (e1) at ([yshift=1em]w1.north) {\tiny{$\textbf{e}_1=w_{i-2} \textbf{C}$}};
\node [anchor=south,draw,inner sep=3pt] (e2) at ([yshift=1em]w2.north) {\tiny{$\textbf{e}_2=w_{i-1} \textbf{C}$}};
\node [anchor=south,draw,minimum width=9em,inner sep=3pt] (h0) at ([yshift=1.5em]e1.north) {\tiny{$\textbf{h}_0=\textrm{Tanh}([\textbf{e}_0,\textbf{e}_1,\textbf{e}_2] \textbf{H} + \textbf{d})$}};
\node [anchor=south,draw,minimum width=9em,inner sep=3pt] (h1) at ([yshift=1.5em]h0.north) {\tiny{$\textbf{y}=\textrm{Softmax}(\textbf{h}_0 \textbf{U})$}};
\node [anchor=south] (ylabel) at ([yshift=1em]h1.north) {\footnotesize{$\textrm{P}(w_i|w_{i-3}w_{i-2}w_{i-1})$}};

\draw [->] ([yshift=0.1em]w0.north) -- ([yshift=-0.1em]e0.south);
\draw [->] ([yshift=0.1em]w1.north) -- ([yshift=-0.1em]e1.south);
\draw [->] ([yshift=0.1em]w2.north) -- ([yshift=-0.1em]e2.south);
\draw [->] ([yshift=0.1em]e0.north) -- ([xshift=-2em,yshift=-0.1em]h0.south);
\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]h0.south);
\draw [->] ([yshift=0.1em]e2.north) -- ([xshift=2em,yshift=-0.1em]h0.south);
\draw [->] ([yshift=0.1em]h0.north) -- ([yshift=-0.1em]h1.south);
\draw [->] ([yshift=0.1em]h1.north) -- ([yshift=-0.1em]ylabel.south);

\visible<6->{
\draw [->,dashed,red,thick] ([xshift=1em,yshift=0.1em]e1.north) -- ([xshift=1em,yshift=-0.1em]h1.south);
\draw [->,dashed,red,thick] ([xshift=-1em,yshift=0.1em]e0.north) .. controls +(north:2) and +(south:1) .. ([xshift=-3em,yshift=-0.1em]h1.south);
\draw [->,dashed,red,thick] ([xshift=1em,yshift=0.1em]e2.north) .. controls +(north:2) and +(south:1) .. ([xshift=3em,yshift=-0.1em]h1.south);
}

\begin{pgfonlayer}{background}
\visible<2->{
\node [rectangle,inner sep=0.1em,fill=ugreen!20!white] [fit = (w0) (index0)] (wordbox0) {};
\node [rectangle,inner sep=0.1em,fill=ugreen!20!white] [fit = (w1) (index1)] (wordbox1) {};
\node [rectangle,inner sep=0.1em,fill=ugreen!20!white] [fit = (w2) (index2)] (wordbox2) {};
}
\end{pgfonlayer}

\visible<3->{
\node [anchor=south,draw,inner sep=3pt,fill=blue!20!white] (e0) at ([yshift=1em]w0.north) {\tiny{$\textbf{e}_0=w_{i-3} \textbf{C}$}};
\node [anchor=south,draw,inner sep=3pt,fill=blue!20!white] (e1) at ([yshift=1em]w1.north) {\tiny{$\textbf{e}_1=w_{i-2} \textbf{C}$}};
\node [anchor=south,draw,inner sep=3pt,fill=blue!20!white] (e2) at ([yshift=1em]w2.north) {\tiny{$\textbf{e}_2=w_{i-1} \textbf{C}$}};
}
\visible<5->{
\node [anchor=south,draw,minimum width=9em,inner sep=3pt,fill=orange!20!white] (h0) at ([yshift=1.5em]e1.north) {\tiny{$\textbf{h}_0=\textrm{Tanh}([\textbf{e}_0,\textbf{e}_1,\textbf{e}_2] \textbf{H} + \textbf{d})$}};
\node [anchor=south,draw,minimum width=9em,inner sep=3pt,fill=orange!20!white] (h1) at ([yshift=1.5em]h0.north) {\tiny{$\textbf{y}=\textrm{Softmax}(\textbf{h}_0 \textbf{U})$}};
}

\visible<2->{
\node [anchor=north west] (indexlabel0) at ([yshift=-0.5em,xshift=-1.2em]index0.south west) {\scriptsize{{\color{ugreen} \textbf{One-hot表示}}}};
\node [anchor=north west] (indexlabel1) at ([yshift=0.3em]indexlabel0.south west) {\scriptsize{每个词用一个词汇表大小的0-1向量表示，}};
\node [anchor=north west] (indexlabel2) at ([yshift=0.3em]indexlabel1.south west) {\scriptsize{仅一位为1，其余为0，比如：}};
\node [anchor=north west] (indexlabel3) at ([yshift=0.0em]indexlabel2.south west) {\scriptsize{$(0,0,{\red 1},0,0,0,0,0,0,0,0,0)$}};
\node [anchor=north west] (indexlabel4) at ([xshift=1em,yshift=0.0em]indexlabel3.south west) {\scriptsize{词表中第3个词}};
\draw [->] ([xshift=1.2em,yshift=-0.2em]indexlabel4.north west) -- ([xshift=1.2em,yshift=0.3em]indexlabel4.north west);
}

\visible<3->{
\node [anchor=west] (embedinglabel0) at ([xshift=1em,yshift=-1em]e2.east) {\scriptsize{{\blue \textbf{词的分布式表示}}}};
\node [anchor=north west] (embedinglabel1) at ([yshift=0.3em]embedinglabel0.south west) {\scriptsize{词的0-1表示乘一个矩阵$\textbf{C}$，这里可以}};
\node [anchor=north west] (embedinglabel2) at ([yshift=0.3em]embedinglabel1.south west) {\scriptsize{把$\textbf{C}$看做一个查询表}};
}

\visible<4->{
\node [anchor=north west] (wordvector) at ([yshift=-1em]embedinglabel2.south west) {\tiny{$(0,0,{\red 1},...)$}};
\node [anchor=west] (timeslabel) at ([xshift=-0.3em]wordvector.east) {\footnotesize{$\times$}};
\node [anchor=north west,inner sep=2pt] (embeddingmatrix) at ([xshift=1em]wordvector.north east) {\tiny{$\begin{pmatrix} 0 & 1 & 3 \\ .2 & -1 & .3 \\ 1 & 7 & .3 \\ ... \end{pmatrix}$}};
\node [anchor=south,inner sep=1pt] (wordvectorlabel) at (wordvector.north) {\scriptsize{$w_{i-1}$}};
\node [anchor=south,inner sep=1pt] (embeddingmatrixlabel) at (embeddingmatrix.north) {\scriptsize{$\textbf{C}$}};
\node [anchor=north west] (selectedlabel) at ([yshift=-2em]wordvector.south west) {\scriptsize{在把$\textbf{C}$中索引到的行输出(i.e., $e_{i-1}$)}};

\begin{pgfonlayer}{background}
\visible<4->{
\node [anchor=north west,fill=blue!20!white,minimum height=0.6em,minimum width=5.0em] (selected) at ([yshift=-1.3em]embeddingmatrix.north west) {};
}
\end{pgfonlayer}
\draw [->] ([xshift=0.15em,yshift=0.3em]wordvector.south) .. controls +(south:0.3) and +(west:0.5) .. (selected.west);
}

\visible<5->{
\node [anchor=south west] (hiddenlabel0) at ([yshift=5em]embedinglabel0.north west) {\scriptsize{{\color{orange} \textbf{多层神经网络}}}};
\node [anchor=north west] (hiddenlabel1) at ([yshift=0.3em]hiddenlabel0.south west) {\scriptsize{$[e_0,e_1,e_2]$表示把三个向量级联在一起，}};
\node [anchor=north west] (hiddenlabel2) at ([yshift=0.3em]hiddenlabel1.south west) {\scriptsize{之后经过两层网络，最后通过Softmax输出}};
\node [anchor=north west] (hiddenlabel3) at ([yshift=0.3em]hiddenlabel2.south west) {\scriptsize{注意，$h_0\textbf{U}$得到所有词的表示(向量)，}};
\node [anchor=north west] (hiddenlabel4) at ([yshift=0.3em]hiddenlabel3.south west) {\scriptsize{Softmax确保输出词汇表上的一个分布}};
}

\visible<6->{
\node [anchor=south west] (directlabel0) at ([yshift=1em]hiddenlabel0.north west) {\scriptsize{\alert{\textbf{底层向上层的直接连接(可选)}}}};
}

\end{scope}
\end{tikzpicture}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% FNNLM implementation
\begin{frame}{前馈神经网络语言模型(FNN LM)的实现}

\begin{itemize}
\item 实现非常简单，几行代码
    \begin{itemize}
    \item 细节1：做batching时可以把$w[i]$进行扩展，比如放入多个词
    \item 细节2：TanH一般会用HardTanH实现，因为TanH容易溢出
    \end{itemize}
\end{itemize}

\begin{tcolorbox}
[bicolor,sidebyside,righthand width=3.8cm,size=title,frame engine=empty,
 colback=blue!10!white,colbacklower=black!5!white]
 {\scriptsize
\begin{tabbing}
\texttt{XTensor w[3], e[3], h0, y;} \\
\texttt{XTensor C, H, d, U;} \\
\texttt{...}\\

\texttt{} \\
\texttt{for(unsigned i = 0; i < 3; i++)} \\
\texttt{\ \ \ \ e[i] = MMul(w[i], C);}\\
\texttt{e01 = Concatenate(e[0], e[1], -1);}\\
\texttt{e = Concatenate(e01, e[2], -1);}\\

\texttt{} \\
\texttt{h0 = TanH(MMul(e, H) + d);}\\
\texttt{y = Softmax(MMul(h0, U));}\\

\texttt{} \\
\texttt{for(unsigned k = 0; k < size; k++)\{} \\
\texttt{} \ \ \ \ ... // \alert{\texttt{y}}的第$k$元素表示 $\textrm{P}(w|...)$\\
\texttt{} \ \ \ \ ... // $w$为词汇表里第$k$个词\\
\texttt{\}}

\end{tabbing}
}
\tcblower
\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=west] (w0) at (0,0) {\scriptsize{$w_{i-3}$}};
\node [anchor=west] (w1) at ([xshift=0.5em]w0.east) {\scriptsize{$w_{i-2}$}};
\node [anchor=west] (w2) at ([xshift=0.5em]w1.east) {\scriptsize{$w_{i-1}$}};
\node [anchor=north] (index0) at ([yshift=0.5em]w0.south) {\tiny(index)};
\node [anchor=north] (index1) at ([yshift=0.5em]w1.south) {\tiny(index)};
\node [anchor=north] (index2) at ([yshift=0.5em]w2.south) {\tiny(index)};
\node [anchor=south,draw,inner sep=3pt,align=left] (e0) at ([yshift=1.0em]w0.north) {\tiny{$e_0:$}\\\tiny{$w_{i-3} \textbf{C}$}};
\node [anchor=south,draw,inner sep=3pt,align=left] (e1) at ([yshift=1.0em]w1.north) {\tiny{$e_1:$}\\\tiny{$w_{i-2} \textbf{C}$}};
\node [anchor=south,draw,inner sep=3pt,align=left] (e2) at ([yshift=1.0em]w2.north) {\tiny{$e_2:$}\\\tiny{$w_{i-1} \textbf{C}$}};
\node [anchor=south,draw,minimum width=9em,inner sep=3pt] (h0) at ([yshift=1.5em]e1.north) {\tiny{$h_0=\textrm{Tanh}([e_0,e_1,e_2] \textbf{H} + \textbf{d})$}};
\node [anchor=south,draw,minimum width=9em,inner sep=3pt] (h1) at ([yshift=1.5em]h0.north) {\tiny{$y=\textrm{Softmax}(h_0 \textbf{U})$}};
\node [anchor=south] (ylabel) at ([yshift=1em]h1.north) {\scriptsize{$\textrm{P}(w_i|w_{i-3}w_{i-2}w_{i-1})$}};

\draw [->] ([yshift=0.1em]w0.north) -- ([yshift=-0.1em]e0.south);
\draw [->] ([yshift=0.1em]w1.north) -- ([yshift=-0.1em]e1.south);
\draw [->] ([yshift=0.1em]w2.north) -- ([yshift=-0.1em]e2.south);
\draw [->] ([yshift=0.1em]e0.north) -- ([xshift=-2em,yshift=-0.1em]h0.south);
\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]h0.south);
\draw [->] ([yshift=0.1em]e2.north) -- ([xshift=2em,yshift=-0.1em]h0.south);
\draw [->] ([yshift=0.1em]h0.north) -- ([yshift=-0.1em]h1.south);
\draw [->] ([yshift=0.1em]h1.north) -- ([yshift=-0.1em]ylabel.south);
\end{scope}
\end{tikzpicture}
\end{center}
\end{tcolorbox}
\vspace{-0.5em}
\footnotesize{注: size表示词汇表大小}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 神经语言模型给我们带来了什么
\begin{frame}{神经语言建模的意义}

\begin{itemize}
\item Bengio el al. (2003)中有待讨论的问题
    \begin{enumerate}
    \item 神经网络每一层究竟学到了什么 \\
    词汇、句法？还是其它一些知识？如何解释？
    \item 网络的层数变多会怎样 - 10层、20层、100层的网络 \\
    \# of layers: 10 $\to$ 20 $\to$ 100 $\to$ 1000
    \item 超参(比如隐藏层大小)如何选择 - 不同任务的最优设置\\
    单词的分布式表示维度多大好？\\
    隐层多大好？\\
    激活函数如何选择？\\
    ...
    \end{enumerate}
\item<2-> 从FNN LM得到的启发
    \begin{itemize}
    \item 重新定义词是什么 - 非词典里的一项，而是一个实数向量
    \item 多层神经网络可以很好的表示单词之间的(短距离)依赖
    \item $n$-gram的生成概率可以使用连续空间函数描述，缓解数据稀疏问题，模型并不需要记录完整的$n$-gram
    \end{itemize}
\end{itemize}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 循环神经网络
\begin{frame}{循环神经网络(Recurrent Neural Networks)}

\begin{itemize}
\item FNN LM固然有效，但是和传统的$n$-gram LM一样，需要依赖\alert{有限上下文}假设
\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=west] (w0) at (0,0) {$w_1$};
\node [anchor=west] (w1) at ([xshift=0.5em]w0.east) {$w_2$};
\node [anchor=west] (w2) at ([xshift=0.5em]w1.east) {$...$};
\node [anchor=west] (w3) at ([xshift=0.5em]w2.east) {$w_{m-n+1}$};
\node [anchor=west] (w4) at ([xshift=0.5em]w3.east) {$...$};
\node [anchor=west,fill=green!20!white] (w5) at ([xshift=0.5em]w4.east) {$w_{m}$};
\draw [->,thick,ublue] (w5.south).. controls +(210:0.5) and +(-30:0.5) .. (w3.south);
\draw [->,thick,red] (w5.north).. controls +(150:1) and +(30:1) .. (w1.north);
\draw [->,very thick,ublue] ([xshift=-5em,yshift=1em]w0.west) -- ([xshift=-6.5em,yshift=1em]w0.west) node [pos=0,right] {\scriptsize{依赖}};
\draw [->,very thick,red] ([xshift=-5em,yshift=-0.5em]w0.west) -- ([xshift=-6.5em,yshift=-0.5em]w0.west) node [pos=0,right] {\scriptsize{不依赖}};

\end{scope}
\end{tikzpicture}
\end{center}
\item<2-> 能否直接对原始问题建模，即定义函数$g$，对于任意的$w_{1} ... w_{m}$有
    \vspace{-0.5em}
    \begin{displaymath}
    g(w_{1} ... w_{m}) \approx \textrm{P}(w_m | w_{1} ... w_{m-1})
    \end{displaymath}
\item<3-> \textbf{循环神经网络(RNNs)}可以很好的解决上述问题，因此也被成功的应用于语言建模任务
	\begin{itemize}
	\item 它假设每个词的生成都依赖已经生成的所有词
	\item 对于不同位置的词的生成概率都可以用同一个函数描述
	\end{itemize}
	
        \textbf{Recurrent Neural Network Based Language Model}\\
        \textbf{Mikolov et al., 2010, In Proc. of Interspeech, 1045-1048}
\end{itemize}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 循环神经网络的结构
\begin{frame}{循环单元}

\begin{itemize}
\item 有输入序列$(\textbf{x}_0,\textbf{x}_1,...,\textbf{x}_t,...)$，其中$\textbf{x}_t$表示序列中第$t$个元素，也被称作\alert{时刻$t$}输入。它所对应的输出序列是$(\textbf{y}_0,\textbf{y}_1,...,\textbf{y}_t,...)$。 在循环神经网络中，每个时刻的输出都可以用同一个\alert{循环单元}来描述。\visible<2->{对于语言模型，一种简单的结构：}

\visible<2->{
{\small
\begin{tcolorbox}
[bicolor,sidebyside,righthand width=4.3cm,size=title,frame engine=empty,
 colback=blue!10!white,colbacklower=black!5!white]

\begin{eqnarray}
\textbf{y}_t & = & \textrm{Softmax}(\textbf{h}_t \textbf{V}) \nonumber \\
\textbf{h}_t & = & \textrm{TanH}(\textbf{x}_t \textbf{U} + \textbf{h}_{t-1} \textbf{W}) \nonumber
\end{eqnarray}
\footnotesize{$\textbf{h}_t$: $t$时刻的隐层状态\\
$\textbf{h}_{t-1}$: $t-1$时刻的隐层状态\\
$\textbf{V}, \textbf{U}, \textbf{W}$: 参数
}
\tcblower
\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=west,inner sep=3pt,minimum width=8em] (h) at (0,0) {\tiny{$\textbf{h}_t  =  \textrm{TanH}(\textbf{x}_t \textbf{U} + \textbf{h}_{t-1} \textbf{W})$}};
\node [anchor=south west,inner sep=3pt] (r) at ([yshift=-0.2em]h.north west) {\tiny{循环单元:}};
\begin{pgfonlayer}{background}
\node [rectangle,draw,inner sep=0em,fill=green!20!white] [fit = (r) (h)] (rbox) {};
\end{pgfonlayer}
\node [anchor=south,draw,minimum width=8em,fill=green!20!white] (y) at ([yshift=1.5em]rbox.north) {\tiny{$\textbf{y}_t = \textrm{Softmax}(\textbf{h}_t \textbf{V})$}};
\node [anchor=south,inner sep=2pt] (output) at ([yshift=1em]y.north) {\scriptsize{$\textbf{y}_t$}};
\node [anchor=north,inner sep=2pt] (input) at ([yshift=-1em]h.south) {\scriptsize{$\textbf{x}_t$}};
\draw [->,thick] (input.north) -- ([yshift=-0.1em]rbox.south);
\draw [->,thick] ([yshift=0.1em]rbox.north) -- ([yshift=-0.1em]y.south) node [pos=0.5,left] {\tiny{$\textbf{h}_t$}};
\draw [->,thick] ([yshift=0.1em]y.north) -- (output.south);
\draw [->,thick] ([xshift=0.1em]rbox.east) -- ([xshift=1em]rbox.east) node [pos=1,above] {\tiny{$\textbf{h}_t$}};
\draw [->,thick] ([xshift=-1em]rbox.west) -- ([xshift=-0.1em]rbox.west) node [pos=0,above] {\tiny{$\textbf{h}_{t-1}$}};

\end{scope}
\end{tikzpicture}
\end{center}
\end{tcolorbox}
}
}

\item<3-> \textbf{如何体现循环？}$t$时刻的状态是$t-1$时刻状态的函数，这个过程可以不断被执行
\end{itemize}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 循环神经网络的“记忆”
\begin{frame}{循环神经网络的``记忆''}
\begin{itemize}
\item 循环神经网络可以记忆任意长度的历史，因此可以非常适合处理不定长的序列，比如自然语言句子
    \begin{itemize}
    \item 注意：$\textbf{h}_{t-1}$可以被传递到后续状态
    \end{itemize}
\end{itemize}

\vspace{-1em}
\begin{eqnarray}
\textbf{h}_t & = & \textrm{TanH}(\textbf{x}_t \textbf{U} + \alert{\textbf{h}_{t-1}} \textbf{W}) \nonumber \\
\visible<2->{
\textbf{h}_{t+1} & = & \textrm{TanH}(\textbf{x}_{t+1} \textbf{U} + \textbf{h}_{t} \textbf{W}) \nonumber \\
                 & = & \textrm{TanH}(\textbf{x}_{t+1} \textbf{U} + \textrm{TanH}(\textbf{x}_t \textbf{U} + \alert{\textbf{h}_{t-1}} \textbf{W}) \textbf{W}) \nonumber \\
                 }
\visible<3->{
\textbf{h}_{t+2} & = & \textrm{TanH}(\textbf{x}_{t+2} \textbf{U} + \textbf{h}_{t+1} \textbf{W}) \nonumber \\
                 & = & \textrm{TanH}(\textbf{x}_{t+2} \textbf{U} + \nonumber \\
                 &   & \textrm{TanH}(\textbf{x}_{t+1} \textbf{U} + \textrm{TanH}(\textbf{x}_t \textbf{U} + \alert{\textbf{h}_{t-1}} \textbf{W}) \textbf{W}) \textbf{W}) \nonumber
                 }
\end{eqnarray}

\vspace{-1em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
\tikzstyle{rnnnode} = [draw,inner sep=5pt,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}]
\node [anchor=west,rnnnode] (node1) at (0,0) {\scriptsize{RNN Cell}};
\visible<2->{
\node [anchor=west,rnnnode] (node2) at ([xshift=4.5em]node1.east) {\scriptsize{RNN Cell}};
}
\visible<3->{
\node [anchor=west,rnnnode] (node3) at ([xshift=4.5em]node2.east) {\scriptsize{RNN Cell}};
}
\node [anchor=north] (x1) at ([yshift=-1em]node1.south) {\footnotesize{$\textbf{x}_{t}$}};
\visible<2->{
\node [anchor=north] (x2) at ([yshift=-1em]node2.south) {\footnotesize{$\textbf{x}_{t+1}$}};
}
\visible<3->{
\node [anchor=north] (x3) at ([yshift=-1em]node3.south) {\footnotesize{$\textbf{x}_{t+2}$}};
}
\node [anchor=south] (h1) at ([yshift=1em]node1.north) {\footnotesize{$\textbf{h}_{t}$}};
\visible<2->{
\node [anchor=south] (h2) at ([yshift=1em]node2.north) {\footnotesize{$\textbf{h}_{t+1}$}};
}
\visible<3->{
\node [anchor=south] (h3) at ([yshift=1em]node3.north) {\footnotesize{$\textbf{h}_{t+2}$}};
}
\draw [->,thick] ([xshift=-1.0em]node1.west)--([xshift=-0.1em]node1.west) node [pos=0,left] {\scriptsize{$\alert{\textbf{h}_{t-1}}$}};
\visible<3->{
\draw [->,thick] ([xshift=0.1em]node3.east)--([xshift=1.0em]node3.east) node [pos=1,right] {\scriptsize{$\textbf{h}_{t+2}$}};
}
\draw [->,thick] ([xshift=0.1em]node1.east)--([xshift=-0.1em]node2.west) node [pos=0.5,above] {\tiny{$\textbf{h}_{t}(\alert{\textbf{h}_{t-1}})$}};
\visible<2->{
\draw [->,thick] ([xshift=0.1em]node2.east)--([xshift=-0.1em]node3.west) node [pos=0.5,above] {\tiny{$\textbf{h}_{t+1}(\textbf{h}_{t}(\alert{\textbf{h}_{t-1}}))$}};
}
\draw [->,thick] (x1.north)--([yshift=-0.1em]node1.south);
\visible<2->{
\draw [->,thick] (x2.north)--([yshift=-0.1em]node2.south);
}
\visible<3->{
\draw [->,thick] (x3.north)--([yshift=-0.1em]node3.south);
}
\draw [->,thick] ([yshift=0.1em]node1.north)--(h1.south);
\visible<2->{
\draw [->,thick] ([yshift=0.1em]node2.north)--(h2.south);
}
\visible<3->{
\draw [->,thick] ([yshift=0.1em]node3.north)--(h3.south);
}

\end{scope}
\end{tikzpicture}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 基于循环神经网络的语言模型
\begin{frame}{基于循环神经网络的语言模型(RNN LM)}
\begin{itemize}
\item 循环神经网络可以被直接用于语言模型
    \begin{itemize}
    \item<2-> 与FNN LM类似，首先把词从one-hot表示转换成分布式表示
    \item<3-> $t$时刻预测$\textrm{P}(x_{t+1}|x_1...x_{t})$
    \item<4-> 可以叠加更多的层
    \end{itemize}
\end{itemize}

\visible<2->{
\begin{center}
\begin{tikzpicture}
\begin{scope}
\tikzstyle{rnnnode} = [draw,inner sep=5pt,minimum width=4em,minimum height=1.5em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}]
\visible<3->{
\node [anchor=west,rnnnode] (node11) at (0,0) {\scriptsize{RNN Cell}};
\node [anchor=west,rnnnode] (node12) at ([xshift=2em]node11.east) {\scriptsize{RNN Cell}};
\node [anchor=west,rnnnode] (node13) at ([xshift=2em]node12.east) {\scriptsize{RNN Cell}};
\node [anchor=west,rnnnode] (node14) at ([xshift=2em]node13.east) {\scriptsize{RNN Cell}};
}
\node [anchor=north,rnnnode,fill=red!30!white] (e1) at ([yshift=-1.2em]node11.south) {\tiny{$e_1=w_1\textbf{C}$}};
\node [anchor=north,rnnnode,fill=red!30!white] (e2) at ([yshift=-1.2em]node12.south) {\tiny{$e_2=w_2\textbf{C}$}};
\node [anchor=north,rnnnode,fill=red!30!white] (e3) at ([yshift=-1.2em]node13.south) {\tiny{$e_3=w_3\textbf{C}$}};
\node [anchor=north,rnnnode,fill=red!30!white] (e4) at ([yshift=-1.2em]node14.south) {\tiny{$e_4=w_4\textbf{C}$}};
\node [anchor=north] (w1) at ([yshift=-1em]e1.south) {\footnotesize{$w_1$}};
\node [anchor=north] (w2) at ([yshift=-1em]e2.south) {\footnotesize{$w_2$}};
\node [anchor=north] (w3) at ([yshift=-1em]e3.south) {\footnotesize{$w_3$}};
\node [anchor=north] (w4) at ([yshift=-1em]e4.south) {\footnotesize{$w_4$}};

\draw [->,thick] ([yshift=0.1em]w1.north)--([yshift=-0.1em]e1.south);
\draw [->,thick] ([yshift=0.1em]w2.north)--([yshift=-0.1em]e2.south);
\draw [->,thick] ([yshift=0.1em]w3.north)--([yshift=-0.1em]e3.south);
\draw [->,thick] ([yshift=0.1em]w4.north)--([yshift=-0.1em]e4.south);

\draw [->,thick] ([yshift=0.1em]e1.north)--([yshift=-0.1em]node11.south);
\draw [->,thick] ([yshift=0.1em]e2.north)--([yshift=-0.1em]node12.south);
\draw [->,thick] ([yshift=0.1em]e3.north)--([yshift=-0.1em]node13.south);
\draw [->,thick] ([yshift=0.1em]e4.north)--([yshift=-0.1em]node14.south);

\visible<4->{
\node [anchor=south,rnnnode] (node21) at ([yshift=1.5em]node11.north) {\scriptsize{RNN Cell}};
\node [anchor=south,rnnnode] (node22) at ([yshift=1.5em]node12.north) {\scriptsize{RNN Cell}};
\node [anchor=south,rnnnode] (node23) at ([yshift=1.5em]node13.north) {\scriptsize{RNN Cell}};
\node [anchor=south,rnnnode] (node24) at ([yshift=1.5em]node14.north) {\scriptsize{RNN Cell}};

\node [anchor=south,rnnnode,fill=blue!30!white] (node31) at ([yshift=1.5em]node21.north) {\scriptsize{Softmax($\cdot$)}};
\node [anchor=south,rnnnode,fill=blue!30!white] (node32) at ([yshift=1.5em]node22.north) {\scriptsize{Softmax($\cdot$)}};
\node [anchor=south,rnnnode,fill=blue!30!white] (node33) at ([yshift=1.5em]node23.north) {\scriptsize{Softmax($\cdot$)}};
\node [anchor=south,rnnnode,fill=blue!30!white] (node34) at ([yshift=1.5em]node24.north) {\scriptsize{Softmax($\cdot$)}};
}

\visible<3>{
\node [anchor=south,rnnnode,fill=blue!30!white] (node21) at ([yshift=1.5em]node11.north) {\scriptsize{Softmax($\cdot$)}};
\node [anchor=south,rnnnode,fill=blue!30!white] (node22) at ([yshift=1.5em]node12.north) {\scriptsize{Softmax($\cdot$)}};
\node [anchor=south,rnnnode,fill=blue!30!white] (node23) at ([yshift=1.5em]node13.north) {\scriptsize{Softmax($\cdot$)}};
\node [anchor=south,rnnnode,fill=blue!30!white] (node24) at ([yshift=1.5em]node14.north) {\scriptsize{Softmax($\cdot$)}};

\draw [->,thick] ([yshift=0.1em]node21.north)--([yshift=-0.1em]node31.south) node[pos=1,above] {\scriptsize{$\textrm{P}(w_2)$}};
\draw [->,thick] ([yshift=0.1em]node22.north)--([yshift=-0.1em]node32.south) node[pos=1,above] {\scriptsize{$\textrm{P}(w_3|w_2)$}};
\draw [->,thick] ([yshift=0.1em]node23.north)--([yshift=-0.1em]node33.south) node[pos=1,above] {\scriptsize{$\textrm{P}(w_4|w_2 w_3)$}};
\draw [->,thick] ([yshift=0.1em]node24.north)--([yshift=-0.1em]node34.south) node[pos=1,above] {\scriptsize{$\textrm{P}(w_5|w_2 w_3 w_4)$}};
}

\visible<4->{
\draw [->,thick] ([yshift=0.1em]node31.north)--([yshift=1em]node31.north) node[pos=1,above] {\scriptsize{$\textrm{P}(w_2)$}};
\draw [->,thick] ([yshift=0.1em]node32.north)--([yshift=1em]node32.north) node[pos=1,above] {\scriptsize{$\textrm{P}(w_3|w_2)$}};
\draw [->,thick] ([yshift=0.1em]node33.north)--([yshift=1em]node33.north) node[pos=1,above] {\scriptsize{$\textrm{P}(w_4|w_2 w_3)$}};
\draw [->,thick] ([yshift=0.1em]node34.north)--([yshift=1em]node34.north) node[pos=1,above] {\scriptsize{$\textrm{P}(w_5|w_2 w_3 w_4)$}};

\draw [->,thick] ([yshift=0.1em]node21.north)--([yshift=-0.1em]node31.south);
\draw [->,thick] ([yshift=0.1em]node22.north)--([yshift=-0.1em]node32.south);
\draw [->,thick] ([yshift=0.1em]node23.north)--([yshift=-0.1em]node33.south);
\draw [->,thick] ([yshift=0.1em]node24.north)--([yshift=-0.1em]node34.south);

\draw [->,thick] ([xshift=-1em]node21.west)--([xshift=-0.1em]node21.west);
\draw [->,thick] ([xshift=0.1em]node21.east)--([xshift=-0.1em]node22.west);
\draw [->,thick] ([xshift=0.1em]node22.east)--([xshift=-0.1em]node23.west);
\draw [->,thick] ([xshift=0.1em]node23.east)--([xshift=-0.1em]node24.west);
\draw [->,thick] ([xshift=0.1em]node24.east)--([xshift=1em]node24.east);
}

\visible<3->{
\draw [->,thick] ([yshift=0.1em]node11.north)--([yshift=-0.1em]node21.south);
\draw [->,thick] ([yshift=0.1em]node12.north)--([yshift=-0.1em]node22.south);
\draw [->,thick] ([yshift=0.1em]node13.north)--([yshift=-0.1em]node23.south);
\draw [->,thick] ([yshift=0.1em]node14.north)--([yshift=-0.1em]node24.south);

\draw [->,thick] ([xshift=-1em]node11.west)--([xshift=-0.1em]node11.west);
\draw [->,thick] ([xshift=0.1em]node11.east)--([xshift=-0.1em]node12.west);
\draw [->,thick] ([xshift=0.1em]node12.east)--([xshift=-0.1em]node13.west);
\draw [->,thick] ([xshift=0.1em]node13.east)--([xshift=-0.1em]node14.west);
\draw [->,thick] ([xshift=0.1em]node14.east)--([xshift=1em]node14.east);
}

\end{scope}
\end{tikzpicture}
\end{center}
}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 循环单元的设计、梯度消失、训练等问题
\begin{frame}{进一步的问题}
\begin{itemize}
\item \textbf{循环单元设计}：循环单元就是一个函数，入读当前时刻的输入和上一时刻的状态，生成当前时刻的状态
    \begin{displaymath}
    \textbf{h}_t = g(\textbf{x}_t, \textbf{h}_{t-1}; \theta)
    \end{displaymath}
    很多种方式设计$g(\cdot)$，如著名的LSTM、GRU等
\item<2-> \textbf{梯度消失/爆炸}：随着序列变长，在反向传播时循环神经网络会产生更多的局部梯度相乘计算，这会导致\alert{梯度消失/爆炸问题}
    \begin{displaymath}
    \underbrace{0.2 \times 0.3 \times ... \times 0.2 \times 0.1}_{\text{100项}} \approx 0
    \end{displaymath}
    \vspace{-0.8em}
    \begin{itemize}
    \item 可以考虑梯度裁剪，限制梯度的大小
    \item 也可以引入short-cut connection，如残差网络
    \end{itemize}
\item<2-> \textbf{训练}：有了自动微分，这不是个大问题 :)
\end{itemize}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 自注意力机制
\begin{frame}{自注意力机制(Self-Attention)}

\begin{itemize}
\item RNN LM效果很好，但是当序列过长,词汇之间信息传递路径过长，容易出现梯度消失、梯度爆炸的问题。
\vspace{0.5em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=west] (w0) at (0,0) {$w_1$};
\node [anchor=west] (w1) at ([xshift=0.5em]w0.east) {$w_2$};
\node [anchor=west] (w2) at ([xshift=0.5em]w1.east) {$w_3$};
\node [anchor=west] (w3) at ([xshift=0.5em]w2.east) {$...$};
\node [anchor=west] (w4) at ([xshift=0.5em]w3.east) {$w_{m-1}$};
\node [anchor=west,fill=green!20!white] (w5) at ([xshift=0.5em]w4.east) {$w_{m}$};
\draw [->,thick,red] (w1.north).. controls +(130:0.5) and +(50:0.5) .. (w0.north);
\draw [->,thick,red] (w2.north).. controls +(130:0.5) and +(50:0.5) .. (w1.north);
\draw [->,thick,red] ([yshift=0.2em]w3.north).. controls +(130:0.5) and +(50:0.5) .. (w2.north);
\draw [->,thick,red] (w4.north).. controls +(130:0.5) and +(50:0.5) .. ([yshift=0.2em]w3.north);
\draw [->,thick,red] (w5.north).. controls +(130:0.5) and +(50:0.5) .. (w4.north);
\draw [->,very thick,red] ([xshift=-5em]w0.west) -- ([xshift=-6.5em]w0.west) node [pos=0,right] {\scriptsize{信息传递}};


\end{scope}

\end{tikzpicture}
\end{center}
\item<2-> 能否将不同位置之间的词汇间信息传递的距离拉近为1？


\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=west] (w0) at (0,-2) {$w_1$};
\node [anchor=west] (w1) at ([xshift=0.5em]w0.east) {$w_2$};
\node [anchor=west] (w2) at ([xshift=0.5em]w1.east) {$w_3$};
\node [anchor=west] (w3) at ([xshift=0.5em]w2.east) {$...$};
\node [anchor=west] (w4) at ([xshift=0.5em]w3.east) {$w_{m-1}$};
\node [anchor=west,fill=green!20!white] (w5) at ([xshift=0.5em]w4.east) {$w_{m}$};
\draw [->,thick,red] (w5.north).. controls +(100:0.8) and +(50:0.8) .. (w0.north);
\draw [->,thick,red] (w5.north).. controls +(110:0.7) and +(50:0.7) .. (w1.north);
\draw [->,thick,red] (w5.north).. controls +(120:0.6) and +(50:0.6) .. ([yshift=0.2em]w3.north);
\draw [->,thick,red] (w5.north).. controls +(130:0.5) and +(50:0.5) .. (w4.north);
\draw [->,very thick,red] ([xshift=-5em]w0.west) -- ([xshift=-6.5em]w0.west) node [pos=0,right] {\scriptsize{信息传递}};


\end{scope}

\end{tikzpicture}
\end{center}
\item<3-> \textbf{自注意力机制(Self-Attention)}可以很好的解决长距离依赖问题，在长距离语言建模任务取得了很好的效果
	\begin{itemize}
	\item 更充分的表示序列不同位置之间的复杂关系
	\item 并行训练，提高效率
	\end{itemize}
	
        \textbf{Attention Is All You Need}\\
        \textbf{Vaswani et al., 2017, In Proc. of Neural Information Processing Systems, 6000-6010}
\end{itemize}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% Transformer architecture
\begin{frame}{Transformer语言模型(Vaswani et al., 2017)}
\begin{itemize}
\item 一个简单的例子
\end{itemize}

\vspace{-2em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=west] (w0) at (0,0) {\footnotesize{$w_{0}$}};
\node [anchor=west] (w1) at ([xshift=5em]w0.east) {\footnotesize{$w_{1}$}};
\node [anchor=west] (w2) at ([xshift=5em]w1.east) {\footnotesize{$w_{2}$}};
\node [anchor=west] (w3) at ([xshift=5em]w2.east) {\footnotesize{$w_{3}$}};
\node [anchor=north] (index0) at ([yshift=0.5em]w0.south) {\tiny(index)};
\node [anchor=north] (index1) at ([yshift=0.5em]w1.south) {\tiny(index)};
\node [anchor=north] (index2) at ([yshift=0.5em]w2.south) {\tiny(index)};
\node [anchor=north] (index3) at ([yshift=0.5em]w3.south) {\tiny(index)};
\node [anchor=south,draw,inner sep=3pt] (e0) at ([yshift=1em]w0.north) {\tiny{$\textbf{e}_0=w_{0} \textbf{C} + \textrm{PE}(0)$}};
\node [anchor=south,draw,inner sep=3pt] (e1) at ([yshift=1em]w1.north) {\tiny{$\textbf{e}_1=w_{1} \textbf{C} + \textrm{PE}(1)$}};
\node [anchor=south,draw,inner sep=3pt] (e2) at ([yshift=1em]w2.north) {\tiny{$\textbf{e}_2=w_{2} \textbf{C} + \textrm{PE}(2)$}};
\node [anchor=south,draw,inner sep=3pt] (e3) at ([yshift=1em]w3.north) {\tiny{$\textbf{e}_3=w_{3} \textbf{C} + \textrm{PE}(3)$}};

\node [anchor=south,draw,inner sep=3pt] (h0) at ([xshift=-0.5em, yshift=1.5em]e0.north) {\tiny{$\textbf{h}_{0}=\textrm{SelfAtt}(\textbf{e}_0,\textbf{e}_3)$}};
\node [anchor=south,draw,inner sep=3pt] (h1) at ([xshift=0.5em, yshift=1.5em]e1.north) {\tiny{$\textbf{h}_{1}=\textrm{SelfAtt}(\textbf{e}_1,\textbf{e}_3)$}};
\node [anchor=south,draw,inner sep=3pt] (h2) at ([xshift=1.5em, yshift=1.5em]e2.north) {\tiny{$\textbf{h}_{2}=\textrm{SelfAtt}(\textbf{e}_2,\textbf{e}_3)$}};
\node [anchor=south,draw,minimum width=9em,inner sep=3pt] (f1) at ([xshift=0.5em, yshift=1.5em]h2.north) {\tiny{$\textbf{f}_3=\textrm{FNN}([\textbf{h}_0,\textbf{h}_1,\textbf{h}_2,\textbf{e}_3])$}};
\node [anchor=south,draw,minimum width=9em,inner sep=3pt] (o1) at ([yshift=1em]f1.north) {\tiny{$\textbf{y}=\textrm{Softmax}(f_3 \textbf{U})$}};
\node [anchor=south] (ylabel) at ([yshift=1em]o1.north) {\footnotesize{$\textrm{P}(w_4|w_{0}w_{1}w_{2}w_{3})$}};

\draw [->] ([yshift=0.1em]w0.north) -- ([yshift=-0.1em]e0.south);
\draw [->] ([yshift=0.1em]w1.north) -- ([yshift=-0.1em]e1.south);
\draw [->] ([yshift=0.1em]w2.north) -- ([yshift=-0.1em]e2.south);
\draw [->] ([yshift=0.1em]w3.north) -- ([yshift=-0.1em]e3.south);
\draw [->] ([yshift=0.1em]e0.north) -- ([xshift=0em,yshift=-0.1em]h0.south);
\draw [->] ([yshift=0.1em]e1.north) -- ([xshift=-0.5em,yshift=-0.1em]h1.south);
\draw [->] ([yshift=0.1em]e2.north) -- ([xshift=-1em,yshift=-0.1em]h2.south);
\draw [->] ([yshift=0.1em]e3.north) -- ([xshift=1em,yshift=-0.1em]h0.south);
\draw [->] ([yshift=0.1em]e3.north) -- ([xshift=1em,yshift=-0.1em]h1.south);
\draw [->] ([yshift=0.1em]e3.north) -- ([xshift=1em,yshift=-0.1em]h2.south);

\draw [->] ([yshift=0.1em]h0.north) -- ([xshift=-2em,yshift=-0.1em]f1.south);
\draw [->] ([yshift=0.1em]e3.north) -- ([xshift=2em,yshift=-0.1em]f1.south);
\draw [->] ([yshift=0.1em]h1.north) -- ([xshift=-1em,yshift=-0.1em]f1.south);
\draw [->] ([yshift=0.1em]h2.north) -- ([xshift=0em,yshift=-0.1em]f1.south);
\draw [->] ([yshift=0.1em]f1.north) -- ([yshift=-0.1em]o1.south);
\draw [->] ([yshift=0.1em]o1.north) -- ([yshift=-0.1em]ylabel.south);

\visible<2->{
\node [anchor=south,draw,inner sep=3pt,fill=blue!20!white] (e0) at ([yshift=1em]w0.north) {\tiny{$\textbf{e}_0=w_{0} \textbf{C} + \textrm{PE}(0)$}};
\node [anchor=south,draw,inner sep=3pt,fill=blue!20!white] (e1) at ([yshift=1em]w1.north) {\tiny{$\textbf{e}_1=w_{1} \textbf{C} + \textrm{PE}(1)$}};
\node [anchor=south,draw,inner sep=3pt,fill=blue!20!white] (e2) at ([yshift=1em]w2.north) {\tiny{$\textbf{e}_2=w_{2} \textbf{C} + \textrm{PE}(2)$}};
\node [anchor=south,draw,inner sep=3pt,fill=blue!20!white] (e3) at ([yshift=1em]w3.north) {\tiny{$\textbf{e}_3=w_{3} \textbf{C} + \textrm{PE}(3)$}};
}

\visible<2->{
\node [anchor=west] (embedinglabel0) at ([xshift=-5em,yshift=-2em]w0.south) {\scriptsize{{\blue \textbf{词的分布式表示}}}};
\node [anchor=north west] (embedinglabel1) at ([yshift=0.3em]embedinglabel0.south west) {\scriptsize{前面已经介绍过！}};
\node [anchor=north west] (embedinglabel2) at ([yshift=0.3em]embedinglabel1.south west) {\scriptsize{基于One-hot表示获得}};
\node [anchor=north west] (embedinglabel3) at ([yshift=0.3em]embedinglabel2.south west) {\scriptsize{新加入位置向量PE}};
}

\visible<3->{
\node [anchor=south,draw,inner sep=3pt,fill=ugreen!20!white] (h0) at ([xshift=-0.5em, yshift=1.5em]e0.north) {\tiny{$\textbf{h}_{0}=\textrm{SelfAtt}(\textbf{e}_0,\textbf{e}_3)$}};
\node [anchor=south,draw,inner sep=3pt,fill=ugreen!20!white] (h1) at ([xshift=0.5em, yshift=1.5em]e1.north) {\tiny{$\textbf{h}_{1}=\textrm{SelfAtt}(\textbf{e}_1,\textbf{e}_3)$}};
\node [anchor=south,draw,inner sep=3pt,fill=ugreen!20!white] (h2) at ([xshift=1.5em, yshift=1.5em]e2.north) {\tiny{$\textbf{h}_{2}=\textrm{SelfAtt}(\textbf{e}_2,\textbf{e}_3)$}};
}

\visible<3->{
\node [anchor=west] (selfattlabel0) at ([xshift=3em]embedinglabel0.east) {\scriptsize{{\color{ugreen} \textbf{自注意力机制}}}};
\node [anchor=west] (selfattlabel1) at ([yshift=-0.3em]selfattlabel0.south west) {\scriptsize{计算词汇之间的相关度}};
\node [anchor=west] (selfattlabel2) at ([yshift=-0.3em]selfattlabel1.south west) {\scriptsize{多头自注意力机制}};
\node [anchor=west] (directlabel0) at ([yshift=-0.3em]selfattlabel2.south west) {\scriptsize{\alert{\textbf{后面将会介绍}}}};
}

\visible<4->{
\node [anchor=south,draw,minimum width=9em,inner sep=3pt,fill=orange!20!white] (f1) at ([xshift=0.5em, yshift=1.5em]h2.north) {\tiny{$\textbf{f}_3=\textrm{FNN}([\textbf{h}_0,\textbf{h}_1,\textbf{h}_2,\textbf{e}_3])$}};
\node [anchor=south,draw,minimum width=9em,inner sep=3pt,fill=orange!20!white] (o1) at ([yshift=1em]f1.north) {\tiny{$\textbf{y}=\textrm{Softmax}(f_3 \textbf{U})$}};
}

\visible<4->{
\node [anchor=west] (ffnlabel0) at ([xshift=3em]selfattlabel0.east) {\scriptsize{{\color{orange} \textbf{前馈神经网络和输出层}}}};
\node [anchor=west] (ffnlabel1) at ([yshift=-0.3em]ffnlabel0.south west) {\scriptsize{双层全连接网络}};
\node [anchor=west] (ffnlabel2) at ([yshift=-0.3em]ffnlabel1.south west) {\scriptsize{激活函数为Relu}};
\node [anchor=west] (ffnlabel3) at ([yshift=-0.3em]ffnlabel2.south west) {\scriptsize{最后通过Softmax输出}};
}


\end{scope}
\end{tikzpicture}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% Transformer architecture
\begin{frame}{Transformer语言模型(Vaswani et al., 2017)}
\begin{itemize}
\item 多头注意力机制
\end{itemize}

\vspace{-1.5em}
\begin{center}
\begin{tikzpicture}
\begin{scope}

\node [anchor=west,draw=black!30,inner sep=4pt,fill=ugreen!20!white] (Linear0) at (0,0) {\tiny{Linear}};
\node [anchor=south west,draw=black!50,fill=ugreen!20!white,draw,inner sep=4pt] (Linear01) at ([shift={(-0.2em,-0.2em)}]Linear0.south west) {\tiny{Linear}};
\node [anchor=south west,fill=ugreen!20!white,draw,inner sep=4pt] (Linear02) at ([shift={(-0.2em,-0.2em)}]Linear01.south west) {\tiny{Linear}};
\node [anchor=north] (Q) at ([xshift=0em,yshift=-1em]Linear02.south) {\footnotesize{$Q$}};

\node [anchor=west,draw=black!30,inner sep=4pt,fill=ugreen!20!white] (Linear1) at ([xshift=1.5em]Linear0.east) {\tiny{Linear}};
\node [anchor=south west,draw=black!50,fill=ugreen!20!white,draw,inner sep=4pt] (Linear11) at ([shift={(-0.2em,-0.2em)}]Linear1.south west) {\tiny{Linear}};
\node [anchor=south west,fill=ugreen!20!white,draw,inner sep=4pt] (Linear12) at ([shift={(-0.2em,-0.2em)}]Linear11.south west) {\tiny{Linear}};
\node [anchor=north] (K) at ([xshift=0em,yshift=-1em]Linear12.south) {\footnotesize{$K$}};

\node [anchor=west,draw=black!30,inner sep=4pt,fill=ugreen!20!white] (Linear2) at ([xshift=1.5em]Linear1.east) {\tiny{Linear}};
\node [anchor=south west,draw=black!50,fill=ugreen!20!white,draw,inner sep=4pt] (Linear21) at ([shift={(-0.2em,-0.2em)}]Linear2.south west) {\tiny{Linear}};
\node [anchor=south west,fill=ugreen!20!white,draw,inner sep=4pt] (Linear22) at ([shift={(-0.2em,-0.2em)}]Linear21.south west) {\tiny{Linear}};
\node [anchor=north] (V) at ([xshift=0em,yshift=-1em]Linear22.south) {\footnotesize{$V$}};

\node [anchor=south,draw=black!30,minimum width=9em,inner sep=4pt,fill=blue!20!white] (Scale) at ([yshift=1em]Linear1.north) {\tiny{Scaled Dot-Product Attention}};
\node [anchor=south west,draw=black!50,minimum width=9em,fill=blue!20!white,draw,inner sep=4pt] (Scale1) at ([shift={(-0.2em,-0.2em)}]Scale.south west) {\tiny{Scaled Dot-Product Attention}};
\node [anchor=south west,fill=blue!20!white,draw,minimum width=9em,inner sep=4pt] (Scale2) at ([shift={(-0.2em,-0.2em)}]Scale1.south west) {\tiny{Scaled Dot-Product Attention}};

\node [anchor=south,draw,minimum width=4em,inner sep=4pt,fill=yellow!30] (Concat) at ([yshift=1em]Scale2.north) {\tiny{Concat}};

\node [anchor=south,draw,minimum width=4em,inner sep=4pt,fill=ugreen!20!white] (Linear) at ([yshift=1em]Concat.north) {\tiny{Linear}};


\draw [->] ([yshift=0.1em]Q.north) -- ([yshift=-0.1em]Linear02.south);
\draw [-,draw=black!50] ([yshift=0.1em]Q.north) -- ([xshift=0.2em,yshift=-0.1em]Linear02.south);
\draw [-,draw=black!30] ([yshift=0.1em]Q.north) -- ([xshift=0.4em,yshift=-0.1em]Linear02.south);

\draw [->] ([yshift=0.1em]K.north) -- ([yshift=-0.1em]Linear12.south);
\draw [-,draw=black!50] ([yshift=0.1em]K.north) -- ([xshift=0.2em,yshift=-0.1em]Linear12.south);
\draw [-,draw=black!30] ([yshift=0.1em]K.north) -- ([xshift=0.4em,yshift=-0.1em]Linear12.south);

\draw [->] ([yshift=0.1em]V.north) -- ([yshift=-0.1em]Linear22.south);
\draw [-,draw=black!50] ([yshift=0.1em]V.north) -- ([xshift=0.2em,yshift=-0.1em]Linear22.south);
\draw [-,draw=black!30] ([yshift=0.1em]V.north) -- ([xshift=0.4em,yshift=-0.1em]Linear22.south);

\draw [->] ([yshift=0em]Linear02.north) -- ([yshift=1em]Linear02.north);
\draw [-,draw=black!50] ([yshift=0em]Linear01.north) -- ([yshift=0.8em]Linear01.north);
\draw [-,draw=black!30] ([yshift=0em]Linear0.north) -- ([yshift=0.6em]Linear0.north);

\draw [->] ([yshift=0em]Linear12.north) -- ([yshift=1em]Linear12.north);
\draw [-,draw=black!50] ([yshift=0em]Linear11.north) -- ([yshift=0.8em]Linear11.north);
\draw [-,draw=black!30] ([yshift=0em]Linear1.north) -- ([yshift=0.6em]Linear1.north);

\draw [->] ([yshift=0em]Linear22.north) -- ([yshift=1em]Linear22.north);
\draw [-,draw=black!50] ([yshift=0em]Linear21.north) -- ([yshift=0.8em]Linear21.north);
\draw [-,draw=black!30] ([yshift=0em]Linear2.north) -- ([yshift=0.6em]Linear2.north);

\draw [->] ([yshift=0em]Scale2.north) -- ([yshift=0em]Concat.south);
\draw [-,draw=black!50] ([yshift=0em]Scale1.north) -- ([yshift=0.8em]Scale1.north);
\draw [-,draw=black!30] ([yshift=0em]Scale.north) -- ([yshift=0.6em]Scale.north);

\draw [->] ([yshift=0em]Concat.north) -- ([yshift=0em]Linear.south);
\draw [->] ([yshift=0em]Linear.north) -- ([yshift=1em]Linear.north);

\node [anchor=west] (Multiheadlabel0) at ([xshift=-5em,yshift=-1.2em]Q.south) {\scriptsize{{\blue \textbf{多头注意力}}}};
\node [anchor=north west] (Multiheadlabel1) at ([yshift=0em]Multiheadlabel0.south west) {\scriptsize{$MultiHead(Q,K,V)=Concat(head_1,...head_n)W^0$}};
\node [anchor=north west] (Multiheadlabel2) at ([yshift=0.2em]Multiheadlabel1.south west) {\scriptsize{把输入压缩成多个维度较小的输出，分别做自注意力}};
\node [anchor=north west] (Multiheadlabel3) at ([yshift=0.2em]Multiheadlabel2.south west) {\scriptsize{再把结果级联，经过线性变换得到最终输出}};


\visible<2->{
\node [anchor=south west,fill=white,draw,inner sep=4pt,minimum width=3.5em,fill=blue!20!white] (MatMul) at ([xshift=8em]Linear22.south west) {\tiny{MatMul}};
\node [anchor=north] (Q1) at ([xshift=-1em,yshift=-1em]MatMul.south) {\footnotesize{$Q$}};
\node [anchor=north] (K1) at ([xshift=1em,yshift=-1em]MatMul.south) {\footnotesize{$K$}};
\node [anchor=south,draw,inner sep=4pt,fill=yellow!30] (Scale3) at ([yshift=1em]MatMul.north) {\tiny{Scale}};
\node [anchor=south,draw,inner sep=4pt,fill=purple!20,minimum width=3.5em] (Mask) at ([yshift=0.8em]Scale3.north) {\tiny{Mask(opt.)}};
\node [anchor=south,draw,inner sep=4pt,fill=ugreen!20!white] (SoftMax) at ([yshift=1em]Mask.north) {\tiny{SoftMax}};
\node [anchor=south,draw,minimum width=3.5em,inner sep=4pt,fill=blue!20!white] (MatMul1) at ([xshift=1.5em,yshift=1em]SoftMax.north) {\tiny{MatMul}};
\node [anchor=north] (V1) at ([xshift=2em]K1.north) {\footnotesize{$V$}};
\node [anchor=north] (null) at ([yshift=0.8em]MatMul1.north) {};

\node [rectangle,draw, densely dashed,inner sep=0.4em] [fit = (MatMul) (MatMul1) (Q1) (K1) (V1) (null)] (inputshadow) {};

\draw [->] ([yshift=0.1em]Q1.north) -- ([xshift=-1em,yshift=-0.1em]MatMul.south);
\draw [->] ([yshift=0.1em]K1.north) -- ([xshift=1em,yshift=-0.1em]MatMul.south);
\draw [->] ([yshift=0.1em]MatMul.north) -- ([yshift=-0.1em]Scale3.south);
\draw [->] ([yshift=0.1em]Scale3.north) -- ([yshift=-0.1em]Mask.south);
\draw [->] ([yshift=0.1em]Mask.north) -- ([yshift=-0.1em]SoftMax.south);
\draw [->] ([yshift=0.1em]SoftMax.north) -- ([yshift=0.9em]SoftMax.north);
\draw [->] ([yshift=0.1em]V1.north) -- ([yshift=9.1em]V1.north);
\draw [->] ([yshift=0.1em]MatMul1.north) -- ([yshift=0.8em]MatMul1.north);

\draw [->,dashed,red,thick] ([xshift=0.1em]Scale.east) .. controls +(east:1) and +(west:1) .. ([xshift=-0.1em,yshift=1em]inputshadow.west);

\node [anchor=west] (Attentionlabel0) at ([xshift=-2em,yshift=-1.2em]Q1.south) {\scriptsize{{\color{ugreen} \textbf{基于点乘的自注意力}}}};
\node [anchor=north west] (Attentionlabel1) at ([yshift=0.3em]Attentionlabel0.south west) {\scriptsize{$head_i=softmax(\frac{QK^{T}}{\sqrt{d_k}})V$}};
\node [anchor=north west] (Attentionlabel2) at ([yshift=0.6em]Attentionlabel1.south west) {\scriptsize{计算得到位置向量的加权和}};
\node [anchor=north west] (Attentionlabel3) at ([yshift=0.2em]Attentionlabel2.south west) {\scriptsize{Q,K,V都是相同的}};
}

\end{scope}
\end{tikzpicture}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% evaluation
\begin{frame}{语言模型评价}
\begin{itemize}
\item 语言模型的评价指标 - 困惑度(Perplexity, PPL)
\begin{itemize}
\item 语言模型预测一个语言样本的能力
\item 困惑度越低，建模的效果越好
\end{itemize}
\vspace{0.5em}
\begin{displaymath}
\textrm{PPL}(w_1 ... w_m)=\textrm{P}(w_1 ... w_m)^{-1/m}
\end{displaymath}
\vspace{-0.5em}
\item<2-> Penn Treebank(PTB)上的评价结果
\end{itemize}
\vspace{0.0em}
\visible<2->{
\begin{tabular}{l | l | l | r}
模型 & 作者 & 年份 & PPL \\ \hline
FNN LM & Bengio et al. & 2003 & 162.2 \\
RNN LM & Mikolov et al. & 2010 & 124.7 \\
RNN-LDA LM & Mikolov et al. & 2012 & 92.0 \\
RNN(LSTM) LM & Zaremba et al. & 2014 & 78.4 \\
RHN & Zilly et al. & 2016 & 65.4 \\
RNN(AWD-LSTM) LM & Merity et al. & 2018 & 58.8 \\
GPT-2 (Transformer) & Radford et al. & 2019 & 35.7
\end{tabular}
}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
\subsection{词嵌入}

%%%------------------------------------------------------------------------------------------------------------
%%% 词的one-hot和distributed表示
\begin{frame}{单词的表示}
\begin{itemize}
\item 如何表示一个单词？
    \begin{itemize}
    \item \textbf{One-hot}: 假如有一个词典$V$，里面包含10k个单词，并进行编号。每个单词都可以表示为10k维的one-hot向量，仅在编号那个维度为1，其它为0
    \item<2-> \textbf{Distributed}: 类似于神经语言模型，每个单词可以被表示为一个实数向量，每一维都对应一种``属性'' - \alert{词嵌入}
    \end{itemize}
\end{itemize}

\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=north west] (o1) at (0,0) {\footnotesize{$\begin{bmatrix} 0 \\ 1 \\ 0 \\ 0 \\ 0 \\ ... \\ 0 \end{bmatrix}$}};
\node [anchor=north west] (o2) at ([xshift=1em]o1.north east) {\footnotesize{$\begin{bmatrix} 0 \\ 0 \\ 0 \\ 1 \\ 0 \\ ... \\ 0 \end{bmatrix}$}};
\node [anchor=north east] (v) at ([xshift=-0em]o1.north west) {\footnotesize{$\begin{matrix} \textrm{\ \ \ \ \ 你}_1 \\ \textrm{\ \ 桌子}_2 \\ \textrm{\ \ \ \ \ 他}_3 \\ \textrm{\ \ 椅子}_4 \\ \textrm{\ \ 我们}_5 \\ ... \\ \textrm{你好}_{10k} \end{matrix}$}};
\node [anchor=south] (w1) at (o1.north) {\footnotesize{桌子}};
\node [anchor=south] (w2) at (o2.north) {\footnotesize{椅子}};
\node [anchor=north] (label) at (o1.south) {\footnotesize{单词的one-hot表示}};
\visible<3->{
\node [anchor=south,fill=red!20!white] (cosine) at (w1.north) {\footnotesize{$\textrm{cosine}(\textrm{`桌子'},\textrm{`椅子'})=0$}};
}
\end{scope}

\visible<2->{
\begin{scope}[xshift=2in]
\node [anchor=north west] (o1) at (0,0) {\footnotesize{$\begin{bmatrix} .1 \\ -1 \\ 2 \\ ... \\ 0 \end{bmatrix}$}};
\node [anchor=north west] (o2) at ([xshift=1em]o1.north east) {\footnotesize{$\begin{bmatrix} 1 \\ 2 \\ .2 \\ ... \\ -1 \end{bmatrix}$}};
\node [anchor=north east] (v) at ([xshift=-0em]o1.north west) {\footnotesize{$\begin{matrix} \textrm{\ \ \ 属性}_1 \\ \textrm{\ \ \ 属性}_2 \\ \textrm{\ \ \ 属性}_3 \\ ... \\ \textrm{属性}_{512} \end{matrix}$}};
\node [anchor=south] (w1) at (o1.north) {\footnotesize{桌子}};
\node [anchor=south] (w2) at (o2.north) {\footnotesize{椅子}};
\node [anchor=north] (label) at ([yshift=-2em]o1.south) {\footnotesize{单词的分布式表示(词嵌入)}};
\visible<3->{
\node [anchor=south,fill=red!20!white] (cosine) at (w1.north) {\footnotesize{$\textrm{cosine}(\textrm{`桌子'},\textrm{`椅子'})=0.5$}};
}
\end{scope}
}
\end{tikzpicture}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 分布式表示的优点
\begin{frame}{为什么需要分布式表示？}
\begin{itemize}
\item \textbf{一个自然的问题}：分布式表示中每一维都是什么意思
    \begin{itemize}
    \item 可以把每一维都理解为一个属性，比如：性别、身高等
    \item 但是，模型更多的是把一个维度看做是事物的一种``刻画''，是一种统计意义上的``语义''，而非人工归纳的属性
    \end{itemize}
\item<2-> 那这种方法有什么好处？
    \begin{itemize}
    \item 更容易刻画词语之间的\alert{相似性}
    \item 连续空间表示模型可以更准确的刻画客观事物，而不是非零即一的判断
    \end{itemize}
\item<2-> 预测下一个词任务
    \begin{itemize}
    \item 分布式表示很容易指导``桌子''和``椅子''是相似的
    \item 即使``椅子''没在这个句型中出现过，系统仍然可以通过它和``桌子''的相似性进行预测
    \end{itemize}
    \begin{tabular}{l | l}
    屋里 要 摆放 一个 \_\_\_\_\_ & 预测下个词 \\ \hline
    屋里 要 摆放 一个 \alert{桌子} & 见过 \\
    屋里 要 摆放 一个 \blue{椅子} & 没见过，但是仍然是合理预测
    \end{tabular}
\end{itemize}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 用实例理解词的分布式表示
\begin{frame}{分布式表示的可视化}
\begin{itemize}
\item \textbf{一个著名的例子}：国王 $\to$ 王后\\
    \begin{displaymath}
    \vv{\textrm{国王}} - \vv{\textrm{男人}} + \vv{\textrm{女人}} = \vv{\textrm{王后}}
    \end{displaymath}
    这里，$\vv{\textrm{word}}$表示单词的分布式向量表示
\item 更多的词的可视化：相似的词聚在一起
\end{itemize}
\begin{center}
\includegraphics[scale=0.4]{./Figures/word-graph.png}
\end{center}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 神经语言模型中的词嵌入
\begin{frame}{神经语言模型中的词嵌入}
\begin{itemize}
\item 在神经语言模型中，需要把词表示成它的分布式表示
    \begin{itemize}
    \item<2-> 其中$\textbf{C}$是词嵌入矩阵，每一行对应一个词的分布式表示
    \item<3-> $\textbf{C}$可以用语言模型训练，也可以利用其它模型训练，固定词嵌入，让语言模型专注长片段的学习
    \end{itemize}
\end{itemize}

\vspace{-0.5em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=center,inner sep=2pt] (e) at (0,0) {\small{$e=w$}};
\node [anchor=west,inner sep=2pt] (c) at (e.east) {\small{$\textbf{C}$}};

\begin{pgfonlayer}{background}
\node [rectangle,inner sep=0.4em,draw,fill=blue!20!white] [fit = (e) (c)] (box) {};
\end{pgfonlayer}

\draw [->,thick] ([yshift=-1em]box.south)--([yshift=-0.1em]box.south) node [pos=0,below] (bottom1) {\small{单词$w$}};
\draw [->,thick] ([yshift=0.1em]box.north)--([yshift=1em]box.north) node [pos=1,above] (top1) {\scriptsize{$e$=(8,.2,-1,.9,...,1)}};
\node [anchor=north] (bottom2) at ([yshift=0.3em]bottom1.south) {\scriptsize{$w$=(0,0,1,0,...,0)}};
\node [anchor=south] (top2) at ([yshift=-0.3em]top1.north) {\small{$w$的分布式表示}};

\visible<2->{
\node [anchor=north west,fill=red!20!white] (cmatrix) at ([xshift=3em,yshift=1.0em]c.north east) {\scriptsize{$\begin{pmatrix} 1 & .2 & -.2 & 8 & ... & 0 \\ .6 & .8 & -2 & 1 & ... & -.2 \\ 8 & .2 & -1 & .9 & ... & 2.3 \\ 1 & 1.2 & -.9 & 3 & ... & .2 \\ ... & ... & ... & ... & ... & ... \\ 1 & .3 & 3 & .9 & ... & 5.1 \end{pmatrix}$}};
\node [anchor=west,inner sep=2pt,fill=red!30!white] (c) at (e.east) {\small{$\textbf{C}$}};
\draw [<-,thick] (c.east) -- ([xshift=3em]c.east);
}

\visible<3->{
\node [anchor=south,draw,fill=green!20!white] (e2) at ([yshift=1.5em]cmatrix.north) {\scriptsize{外部词嵌入系统得到的$\textbf{C}$}};
\draw [->,very thick,dashed] (e2.south) -- (cmatrix.north);
}

\end{scope}
\end{tikzpicture}
\end{center}

\vspace{-1.0em}

\begin{itemize}
\item<4-> 词嵌入如何学习得到？
    \begin{itemize}
    \item 可以和语言模型的其它部分一起训练，不过速度较慢
    \item 也可以考虑使用效率更高的外部模型，如word2vec、 Glove等，这样可以使用更大规模的数据
    \end{itemize}
\end{itemize}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
\subsection{句子表示模型及预训练}

%%%------------------------------------------------------------------------------------------------------------
%%% 词嵌入的问题
\begin{frame}{不仅如``词''}
\begin{itemize}
\item 词嵌入已经成为诸多NLP系统的标配，当然也衍生出各种花式玩法，甚至有``embed everything''的口号，但是词嵌入也有问题
    \begin{itemize}
    \item 每个词都对应唯一的向量表示，但是对于一词多义现象，词义需要通过上下文进行区分。一个著名的例子：
    \end{itemize}
    \vspace{0.3em}
    \hspace{6em} Jobs was the CEO of \alert{\underline{apple}}.\\
    \hspace{6em} He finally ate the \alert{\underline{apple}}.
\item<2-> 引入上下文信息
    \begin{itemize}
    \item 上述问题引发了新的思考：不能简单地考虑词的表示，应同时考虑其上下文信息
    \item 对于句子中的一个词(或者位置)，同时表示词和上下文
    \end{itemize}
\end{itemize}

\visible<2->{
\begin{center}
\begin{tikzpicture}
\begin{scope}
\node [anchor=west] (node1) at (0,0) {\footnotesize{Jobs was the CEO of}};
\node [anchor=west] (node2) at ([xshift=-0.2em,yshift=-0.05em]node1.east) {\footnotesize{\alert{\underline{apple}}}};
\node [anchor=west] (node3) at ([xshift=-0.2em,yshift=-0.1em]node2.east) {\footnotesize{.}};
\node [anchor=south,inner sep=2pt,minimum width=2.4em,fill=red!20!white] (node4) at ([yshift=1.5em]node2.north) {\scriptsize{词}};
\node [anchor=north] (label) at ([xshift=1em]node1.south) {\scriptsize{\textbf{词表示模型}}};
\draw [->,thick] (node2.north) -- (node4.south);
\end{scope}
\begin{scope}[xshift=2in]
\node [anchor=west] (node1) at (0,0) {\footnotesize{Jobs was the CEO of}};
\node [anchor=west] (node2) at ([xshift=-0.2em,yshift=-0.05em]node1.east) {\footnotesize{\alert{\underline{apple}}}};
\node [anchor=west] (node3) at ([xshift=-0.2em,yshift=-0.1em]node2.east) {\footnotesize{.}};
\node [anchor=south,inner sep=2pt,minimum width=2.4em,fill=red!20!white] (node4) at ([yshift=1.5em]node2.north) {\scriptsize{词}};
\node [anchor=south,inner sep=2pt,minimum width=2.4em,fill=blue!20!white] (node5) at (node4.north) {\scriptsize{上下文}};
\node [anchor=north] (label) at ([xshift=1em]node1.south) {\scriptsize{\textbf{词+上下文表示模型}}};
\draw [->,thick] (node2.north) -- (node4.south);
\draw [->] ([xshift=1em]node1.north west) .. controls +(north:1) and +(west:2) .. ([yshift=0.2em]node5.west);
\draw [->] ([xshift=3em]node1.north west) .. controls +(north:0.8) and +(west:1.5) .. ([yshift=-0.2em]node5.west);
\node [anchor=east] (morelines) at ([xshift=-1.5em]node4.west) {...};
\end{scope}
\end{tikzpicture}
\end{center}
}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 上下文表示模型
\begin{frame}{表示更长的片段 - 上下文表示模型}
\begin{itemize}
\item 在语言模型中已经包含了每个位置的上下文表示信息
    \begin{itemize}
    \item 以RNN LM为例，位置$i$的隐层输出就是一种$w_1...w_i$的表示
    \end{itemize}
\end{itemize}

\vspace{-0.5em}
\begin{center}
\begin{tikzpicture}
\begin{scope}
\tikzstyle{rnnnode} = [draw,inner sep=5pt,minimum width=4em,minimum height=1.5em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}]
\node [anchor=west,rnnnode] (node11) at (0,0) {\scriptsize{RNN Cell}};
\node [anchor=west,rnnnode] (node12) at ([xshift=2em]node11.east) {\scriptsize{RNN Cell}};
\node [anchor=west,rnnnode] (node13) at ([xshift=2em]node12.east) {\scriptsize{RNN Cell}};
\node [anchor=west,rnnnode] (node14) at ([xshift=2em]node13.east) {\scriptsize{RNN Cell}};

\node [anchor=north,rnnnode,fill=red!30!white] (e1) at ([yshift=-1.2em]node11.south) {\scriptsize{embedding}};
\node [anchor=north,rnnnode,fill=red!30!white] (e2) at ([yshift=-1.2em]node12.south) {\scriptsize{embedding}};
\node [anchor=north,rnnnode,fill=red!30!white] (e3) at ([yshift=-1.2em]node13.south) {\scriptsize{embedding}};
\node [anchor=north,rnnnode,fill=red!30!white] (e4) at ([yshift=-1.2em]node14.south) {\scriptsize{embedding}};
\node [anchor=north] (w1) at ([yshift=-1em]e1.south) {\footnotesize{乔布斯}};
\node [anchor=north] (w2) at ([yshift=-1em]e2.south) {\footnotesize{任职}};
\node [anchor=north] (w3) at ([yshift=-1em]e3.south) {\footnotesize{于}};
\node [anchor=north] (w4) at ([yshift=-1em]e4.south) {\footnotesize{苹果}};

\draw [->,thick] ([yshift=0.1em]w1.north)--([yshift=-0.1em]e1.south);
\draw [->,thick] ([yshift=0.1em]w2.north)--([yshift=-0.1em]e2.south);
\draw [->,thick] ([yshift=0.1em]w3.north)--([yshift=-0.1em]e3.south);
\draw [->,thick] ([yshift=0.1em]w4.north)--([yshift=-0.1em]e4.south);

\draw [->,thick] ([yshift=0.1em]e1.north)--([yshift=-0.1em]node11.south);
\draw [->,thick] ([yshift=0.1em]e2.north)--([yshift=-0.1em]node12.south);
\draw [->,thick] ([yshift=0.1em]e3.north)--([yshift=-0.1em]node13.south);
\draw [->,thick] ([yshift=0.1em]e4.north)--([yshift=-0.1em]node14.south);

\node [anchor=south,rnnnode] (node21) at ([yshift=1.5em]node11.north) {\scriptsize{RNN Cell}};
\node [anchor=south,rnnnode] (node22) at ([yshift=1.5em]node12.north) {\scriptsize{RNN Cell}};
\node [anchor=south,rnnnode] (node23) at ([yshift=1.5em]node13.north) {\scriptsize{RNN Cell}};
\node [anchor=south,rnnnode] (node24) at ([yshift=1.5em]node14.north) {\scriptsize{RNN Cell}};

\node [anchor=south] (node31) at ([yshift=1.0em]node21.north) {\scriptsize{的表示}};
\node [anchor=south west] (node31new) at ([yshift=-0.3em]node31.north west) {\scriptsize{``乔布斯''}};
\node [anchor=south] (node32) at ([yshift=1.0em]node22.north) {\scriptsize{的表示\ \ \ }};
\node [anchor=south west] (node32new) at ([yshift=-0.3em]node32.north west) {\scriptsize{``乔布斯 任职''}};
\node [anchor=south] (node33) at ([yshift=1.0em]node23.north) {\scriptsize{的表示\ \ \ \ \ \ \ \ }};
\node [anchor=south west] (node33new) at ([yshift=-0.3em]node33.north west) {\scriptsize{``乔布斯 任职 于''}};
\node [anchor=south] (node34) at ([yshift=1.0em]node24.north) {\scriptsize{的表示\ \ \ \ \ \ \ \ }};
\node [anchor=south west] (node34new) at ([yshift=-0.3em]node34.north west) {\scriptsize{``乔布斯 任职 于 苹果''}};

\draw [->,thick] ([yshift=0.1em]node21.north)--([yshift=-0.1em]node31.south);
\draw [->,thick] ([yshift=0.1em]node22.north)--([yshift=-0.1em]node32.south);
\draw [->,thick] ([yshift=0.1em]node23.north)--([yshift=-0.1em]node33.south);
\draw [->,thick] ([yshift=0.1em]node24.north)--([yshift=-0.1em]node34.south);

\draw [->,thick] ([xshift=-1em]node21.west)--([xshift=-0.1em]node21.west);
\draw [->,thick] ([xshift=0.1em]node21.east)--([xshift=-0.1em]node22.west);
\draw [->,thick] ([xshift=0.1em]node22.east)--([xshift=-0.1em]node23.west);
\draw [->,thick] ([xshift=0.1em]node23.east)--([xshift=-0.1em]node24.west);
\draw [->,thick] ([xshift=0.1em]node24.east)--([xshift=1em]node24.east);

\draw [->,thick] ([yshift=0.1em]node11.north)--([yshift=-0.1em]node21.south);
\draw [->,thick] ([yshift=0.1em]node12.north)--([yshift=-0.1em]node22.south);
\draw [->,thick] ([yshift=0.1em]node13.north)--([yshift=-0.1em]node23.south);
\draw [->,thick] ([yshift=0.1em]node14.north)--([yshift=-0.1em]node24.south);

\draw [->,thick] ([xshift=-1em]node11.west)--([xshift=-0.1em]node11.west);
\draw [->,thick] ([xshift=0.1em]node11.east)--([xshift=-0.1em]node12.west);
\draw [->,thick] ([xshift=0.1em]node12.east)--([xshift=-0.1em]node13.west);
\draw [->,thick] ([xshift=0.1em]node13.east)--([xshift=-0.1em]node14.west);
\draw [->,thick] ([xshift=0.1em]node14.east)--([xshift=1em]node14.east);

\visible<2->{
\node [anchor=south] (toplabel1) at ([yshift=2em,xshift=-2em]node32new.north) {\footnotesize{``苹果''的表示：}};
\node [anchor=west,fill=blue!20!white,minimum width=3em] (toplabel2) at (toplabel1.east) {\footnotesize{上下文}};
}
\visible<3->{
\node [anchor=west,fill=red!20!white,minimum width=3em] (toplabel3) at (toplabel2.east) {\footnotesize{词}};
}

\begin{pgfonlayer}{background}
\visible<3->{
\node [rectangle,inner sep=2pt,draw,thick,dashed,red] [fit = (e4)] (r2) {};
\draw [->,thick,red] (r2.west) .. controls +(west:0.8) and +(south:2) .. ([xshift=1.3em]toplabel3.south);
}
\visible<2->{
\node [rectangle,inner sep=2pt,draw,thick,dashed,ublue,fill=white] [fit = (node33) (node33new)] (r1) {};
\draw [->,thick,ublue] ([xshift=-2em]r1.north) .. controls +(north:0.7) and +(south:0.7) .. ([xshift=-0.5em]toplabel2.south);
}
\end{pgfonlayer}

\end{scope}
\end{tikzpicture}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 更强大的表示模型 - ELMO
\begin{frame}{更强的表示模型 - ELMO}
\begin{itemize}
\item \textbf{ELMO}(Embedding from Language Models)可以说是掀起了基于语言模型的预训练的热潮
    \begin{itemize}
    \item 仍然使用RNN结构，不过循环单元换成了LSTM
    \item 同时考虑自左向右和自右向左的建模方式，同时表示一个词左端和右端的上下文
    \item 融合所有层的输出，送给下游应用，提供了更丰富的信息
    \end{itemize}
\end{itemize}
\vspace{0.5em}
\begin{center}
\begin{tikzpicture}
\begin{scope}[scale=1.2]

\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white] (Lstm0) at (0,0) {\scriptsize{LSTM}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white] (Lstm1) at ([xshift=1em]Lstm0.east) {\scriptsize{LSTM}};
\node [anchor=west,inner sep=4pt] (sep) at ([xshift=0.5em]Lstm1.east) {\scriptsize{...}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white] (Lstm2) at ([xshift=0.5em]sep.east) {\scriptsize{LSTM}};

\node [anchor=south,draw,inner sep=4pt,fill=blue!20!white] (Lstm3) at ([yshift=1em]Lstm0.north) {\scriptsize{LSTM}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white] (Lstm4) at ([xshift=1em]Lstm3.east) {\scriptsize{LSTM}};
\node [anchor=west,inner sep=4pt] (sep1) at ([xshift=0.5em]Lstm4.east) {\scriptsize{...}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white] (Lstm5) at ([xshift=0.5em]sep1.east) {\scriptsize{LSTM}};

\node [rectangle,rounded corners,draw=black!50,densely dashed,inner sep=0.4em] [fit = (Lstm0) (Lstm2) (Lstm3) (Lstm5)] (inputshadow) {};

\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e1) at ([xshift=-2em,yshift=-1em]Lstm2.south) {\scriptsize{$\textbf{e}_1$}};
\node [anchor=west,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e2) at ([xshift=1em]e1.east) {\scriptsize{$\textbf{e}_2$}};
\node [anchor=west,inner sep=4pt] (sep5) at ([xshift=1em]e2.east) {\scriptsize{...}};
\node [anchor=west,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e3) at ([xshift=1em]sep5.east) {\scriptsize{$\textbf{e}_m$}};

\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t1) at ([xshift=-2em,yshift=1em]Lstm5.north) {\scriptsize{$\textbf{h}_1$}};
\node [anchor=west,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t2) at ([xshift=1em]t1.east) {\scriptsize{$\textbf{h}_2$}};
\node [anchor=west,inner sep=4pt] (sep6) at ([xshift=1em]t2.east) {\scriptsize{...}};
\node [anchor=west,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t3) at ([xshift=1em]sep6.east) {\scriptsize{$\textbf{h}_m$}};

\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white] (Lstm6) at ([xshift=1.5em]Lstm2.east) {\scriptsize{LSTM}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white] (Lstm7) at ([xshift=1em]Lstm6.east) {\scriptsize{LSTM}};
\node [anchor=west,inner sep=4pt] (sep3) at ([xshift=0.5em]Lstm7.east) {\scriptsize{...}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white] (Lstm8) at ([xshift=0.5em]sep3.east) {\scriptsize{LSTM}};

\node [anchor=south,draw,inner sep=4pt,fill=blue!20!white] (Lstm9) at ([yshift=1em]Lstm6.north) {\scriptsize{LSTM}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white] (Lstm10) at ([xshift=1em]Lstm9.east) {\scriptsize{LSTM}};
\node [anchor=west,inner sep=4pt] (sep4) at ([xshift=0.5em]Lstm10.east) {\scriptsize{...}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white] (Lstm11) at ([xshift=0.5em]sep4.east) {\scriptsize{LSTM}};

\node [rectangle,rounded corners,draw=black!50,densely dashed,inner sep=0.4em] [fit = (Lstm6) (Lstm8) (Lstm9) (Lstm11)] (inputshadow) {};

\draw [->] ([xshift=0.1em]Lstm0.east) -- ([xshift=-0.1em]Lstm1.west);
\draw [->] ([xshift=0.1em]Lstm1.east) -- ([xshift=0.1em]sep.west);
\draw [->] ([xshift=-0.1em]sep.east) -- ([xshift=-0.1em]Lstm2.west);

\draw [->] ([xshift=0.1em]Lstm3.east) -- ([xshift=-0.1em]Lstm4.west);
\draw [->] ([xshift=0.1em]Lstm4.east) -- ([xshift=0.1em]sep1.west);
\draw [->] ([xshift=-0.1em]sep1.east) -- ([xshift=-0.1em]Lstm5.west);

\draw [->] ([yshift=0.1em]Lstm0.north) -- ([yshift=-0.1em]Lstm3.south);
\draw [->] ([yshift=0.1em]Lstm1.north) -- ([yshift=-0.1em]Lstm4.south);
\draw [->] ([yshift=0.1em]Lstm2.north) -- ([yshift=-0.1em]Lstm5.south);

\draw [->] ([xshift=0.1em]Lstm6.east) -- ([xshift=-0.1em]Lstm7.west);
\draw [->] ([xshift=0.1em]Lstm7.east) -- ([xshift=0.1em]sep3.west);
\draw [->] ([xshift=-0.1em]sep3.east) -- ([xshift=-0.1em]Lstm8.west);

\draw [->] ([xshift=0.1em]Lstm9.east) -- ([xshift=-0.1em]Lstm10.west);
\draw [->] ([xshift=0.1em]Lstm10.east) -- ([xshift=0.1em]sep4.west);
\draw [->] ([xshift=-0.1em]sep4.east) -- ([xshift=-0.1em]Lstm11.west);

\draw [->] ([yshift=0.1em]Lstm6.north) -- ([yshift=-0.1em]Lstm9.south);
\draw [->] ([yshift=0.1em]Lstm7.north) -- ([yshift=-0.1em]Lstm10.south);
\draw [->] ([yshift=0.1em]Lstm8.north) -- ([yshift=-0.1em]Lstm11.south);

\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]Lstm0.south);
\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]Lstm6.south);
\draw [->] ([yshift=0.1em]e2.north) -- ([yshift=-0.1em]Lstm1.south);
\draw [->] ([yshift=0.1em]e2.north) -- ([yshift=-0.1em]Lstm7.south);
\draw [->] ([yshift=0.1em]e3.north) -- ([yshift=-0.1em]Lstm2.south);
\draw [->] ([yshift=0.1em]e3.north) -- ([yshift=-0.1em]Lstm8.south);

\draw [->] ([yshift=0.1em]Lstm3.north) -- ([xshift=-0.05em,yshift=-0.1em]t1.south);
\draw [->] ([yshift=0.1em]Lstm9.north) -- ([yshift=-0.1em]t1.south);
\draw [->] ([yshift=0.1em]Lstm4.north) -- ([xshift=-0.05em,yshift=-0.1em]t2.south);
\draw [->] ([yshift=0.1em]Lstm10.north) -- ([yshift=-0.1em]t2.south);
\draw [->] ([yshift=0.1em]Lstm5.north) -- ([xshift=-0.05em,yshift=-0.1em]t3.south);
\draw [->] ([yshift=0.1em]Lstm11.north) -- ([yshift=-0.1em]t3.south);

\end{scope}
\end{tikzpicture}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 更强大的表示模型 - GTP
\begin{frame}{更强的表示模型 - GPT}
\begin{itemize}
\item \textbf{GPT}(Generative Pre-Training)也是一种基于语言模型的表示模型
    \begin{itemize}
    \item 架构换成了Transformer，特征抽取能力更强
    \item 基于Pre-training + Fine-tuning的框架，预训练作为下游系统部件的参数初始值，因此可以更好的适应目标任务
    \end{itemize}
\end{itemize}
\vspace{0.5em}
\begin{center}
\begin{tikzpicture}
\begin{scope}[scale=1.2]

\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm0) at (0,0) {\scriptsize{TRM}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm1) at ([xshift=1em]Trm0.east) {\scriptsize{TRM}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm2) at ([xshift=1em]Trm1.east) {\scriptsize{TRM}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm3) at ([xshift=1em]Trm2.east) {\scriptsize{TRM}};
\node [anchor=west,inner sep=4pt] (sep) at ([xshift=1em]Trm3.east) {\scriptsize{...}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm4) at ([xshift=1em]sep.east) {\scriptsize{TRM}};

\node [anchor=south,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm5) at ([yshift=1em]Trm0.north) {\scriptsize{TRM}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm6) at ([xshift=1em]Trm5.east) {\scriptsize{TRM}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm7) at ([xshift=1em]Trm6.east) {\scriptsize{TRM}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm8) at ([xshift=1em]Trm7.east) {\scriptsize{TRM}};
\node [anchor=west,inner sep=4pt] (sep1) at ([xshift=1em]Trm8.east) {\scriptsize{...}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm9) at ([xshift=1em]sep1.east) {\scriptsize{TRM}};

\node [rectangle,rounded corners,draw=black!50,densely dashed,inner sep=0.4em] [fit = (Trm0) (Trm4) (Trm5) (Trm9)] (inputshadow) {};

\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e1) at ([yshift=-1em]Trm0.south) {\scriptsize{$\textbf{e}_1$}};
\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e2) at ([yshift=-1em]Trm1.south) {\scriptsize{$\textbf{e}_2$}};
\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e3) at ([yshift=-1em]Trm2.south) {\scriptsize{$\textbf{e}_3$}};
\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e4) at ([yshift=-1em]Trm3.south) {\scriptsize{$\textbf{e}_4$}};
\node [anchor=north,inner sep=4pt] (sep5) at ([yshift=-1em]sep.south) {\scriptsize{...}};
\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e5) at ([yshift=-1em]Trm4.south) {\scriptsize{$\textbf{e}_m$}};

\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t1) at ([yshift=1em]Trm5.north) {\scriptsize{$\textbf{h}_1$}};
\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t2) at ([yshift=1em]Trm6.north) {\scriptsize{$\textbf{h}_2$}};
\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t3) at ([yshift=1em]Trm7.north) {\scriptsize{$\textbf{h}_3$}};
\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t4) at ([yshift=1em]Trm8.north) {\scriptsize{$\textbf{h}_4$}};
\node [anchor=south,inner sep=4pt] (sep6) at ([yshift=1em]sep1.north) {\scriptsize{...}};
\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t5) at ([yshift=1em]Trm9.north) {\scriptsize{$\textbf{h}_m$}};

\node [anchor=west,draw,inner sep=3pt,fill=blue!20!white,minimum width=1em] (Lt1) at ([yshift=1.5em]t1.west) {\tiny{TRM}};
\node [anchor=west] (Lt2) at ([xshift=-0.1em]Lt1.east) {\tiny{: Transformer}};

\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]Trm0.south);
\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]Trm1.south);
\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]Trm2.south);
\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]Trm3.south);
\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]Trm4.south);
\draw [->] ([yshift=0.1em]e2.north) -- ([yshift=-0.1em]Trm1.south);
\draw [->] ([yshift=0.1em]e3.north) -- ([yshift=-0.1em]Trm2.south);
\draw [->] ([yshift=0.1em]e3.north) -- ([yshift=-0.1em]Trm3.south);
\draw [->] ([yshift=0.1em]e3.north) -- ([yshift=-0.1em]Trm4.south);
\draw [->] ([yshift=0.1em]e4.north) -- ([yshift=-0.1em]Trm3.south);
\draw [->] ([yshift=0.1em]e5.north) -- ([yshift=-0.1em]Trm4.south);

\draw [->] ([yshift=0.1em]Trm0.north) -- ([yshift=-0.1em]Trm5.south);
\draw [->] ([yshift=0.1em]Trm0.north) -- ([yshift=-0.1em]Trm6.south);
\draw [->] ([yshift=0.1em]Trm0.north) -- ([yshift=-0.1em]Trm7.south);
\draw [->] ([yshift=0.1em]Trm0.north) -- ([yshift=-0.1em]Trm8.south);
\draw [->] ([yshift=0.1em]Trm0.north) -- ([yshift=-0.1em]Trm9.south);
\draw [->] ([yshift=0.1em]Trm1.north) -- ([yshift=-0.1em]Trm6.south);
\draw [->] ([yshift=0.1em]Trm2.north) -- ([yshift=-0.1em]Trm7.south);
\draw [->] ([yshift=0.1em]Trm2.north) -- ([yshift=-0.1em]Trm8.south);
\draw [->] ([yshift=0.1em]Trm2.north) -- ([yshift=-0.1em]Trm9.south);
\draw [->] ([yshift=0.1em]Trm3.north) -- ([yshift=-0.1em]Trm8.south);
\draw [->] ([yshift=0.1em]Trm4.north) -- ([yshift=-0.1em]Trm9.south);

\draw [->] ([yshift=0.1em]Trm5.north) -- ([yshift=-0.1em]t1.south);
\draw [->] ([yshift=0.1em]Trm6.north) -- ([yshift=-0.1em]t2.south);
\draw [->] ([yshift=0.1em]Trm7.north) -- ([yshift=-0.1em]t3.south);
\draw [->] ([yshift=0.1em]Trm8.north) -- ([yshift=-0.1em]t4.south);
\draw [->] ([yshift=0.1em]Trm9.north) -- ([yshift=-0.1em]t5.south);

\end{scope}
\end{tikzpicture}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 更强大的表示模型 - BERT
\begin{frame}{更强的表示模型 - BERT}
\begin{itemize}
\item \textbf{BERT}( Bidirectional Encoder Representations from Transformers)是最近非常火爆的表示模型
    \begin{itemize}
    \item 仍然基于Transformer但是考虑了左右两端的上下文(可以对比GPT)
    \item 使用了Mask方法来增加训练得到模型的健壮性，这个方法几乎成为了预训练表示模型的新范式
    \end{itemize}
\end{itemize}
\vspace{0.5em}
\begin{center}
\begin{tikzpicture}
\begin{scope}[scale=1.2]

\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm0) at (0,0) {\scriptsize{Trm}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm1) at ([xshift=1em]Trm0.east) {\scriptsize{TRM}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm2) at ([xshift=1em]Trm1.east) {\scriptsize{TRM}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm3) at ([xshift=1em]Trm2.east) {\scriptsize{TRM}};
\node [anchor=west,inner sep=4pt] (sep) at ([xshift=1em]Trm3.east) {\scriptsize{...}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm4) at ([xshift=1em]sep.east) {\scriptsize{TRM}};

\node [anchor=south,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm5) at ([yshift=1em]Trm0.north) {\scriptsize{TRM}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm6) at ([xshift=1em]Trm5.east) {\scriptsize{TRM}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm7) at ([xshift=1em]Trm6.east) {\scriptsize{TRM}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm8) at ([xshift=1em]Trm7.east) {\scriptsize{TRM}};
\node [anchor=west,inner sep=4pt] (sep1) at ([xshift=1em]Trm8.east) {\scriptsize{...}};
\node [anchor=west,draw,inner sep=4pt,fill=blue!20!white,minimum width=3em] (Trm9) at ([xshift=1em]sep1.east) {\scriptsize{TRM}};

\node [rectangle,rounded corners,draw=black!50,densely dashed,inner sep=0.4em] [fit = (Trm0) (Trm4) (Trm5) (Trm9)] (inputshadow) {};

\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e1) at ([yshift=-1em]Trm0.south) {\scriptsize{$\textbf{e}_1$}};
\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e2) at ([yshift=-1em]Trm1.south) {\scriptsize{$\textbf{e}_2$}};
\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e3) at ([yshift=-1em]Trm2.south) {\scriptsize{$\textbf{e}_3$}};
\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e4) at ([yshift=-1em]Trm3.south) {\scriptsize{$\textbf{e}_4$}};
\node [anchor=north,inner sep=4pt] (sep5) at ([yshift=-1em]sep.south) {\scriptsize{...}};
\node [anchor=north,draw,inner sep=4pt,fill=ugreen!20!white,minimum width=2em] (e5) at ([yshift=-1em]Trm4.south) {\scriptsize{$\textbf{e}_m$}};

\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t1) at ([yshift=1em]Trm5.north) {\scriptsize{$\textbf{h}_1$}};
\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t2) at ([yshift=1em]Trm6.north) {\scriptsize{$\textbf{h}_2$}};
\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t3) at ([yshift=1em]Trm7.north) {\scriptsize{$\textbf{h}_3$}};
\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t4) at ([yshift=1em]Trm8.north) {\scriptsize{$\textbf{h}_4$}};
\node [anchor=south,inner sep=4pt] (sep6) at ([yshift=1em]sep1.north) {\scriptsize{...}};
\node [anchor=south,draw,inner sep=4pt,fill=yellow!30,minimum width=2em] (t5) at ([yshift=1em]Trm9.north) {\scriptsize{$\textbf{h}_m$}};

\node [anchor=west,draw,inner sep=3pt,fill=blue!20!white,minimum width=1em] (Lt1) at ([yshift=1.5em]t1.west) {\tiny{TRM}};
\node [anchor=west] (Lt2) at ([xshift=-0.1em]Lt1.east) {\tiny{: Transformer}};

\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]Trm0.south);
\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]Trm1.south);
\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]Trm2.south);
\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]Trm3.south);
\draw [->] ([yshift=0.1em]e1.north) -- ([yshift=-0.1em]Trm4.south);
\draw [->] ([yshift=0.1em]e2.north) -- ([yshift=-0.1em]Trm1.south);
\draw [->] ([yshift=0.1em]e3.north) -- ([yshift=-0.1em]Trm0.south);
\draw [->] ([yshift=0.1em]e3.north) -- ([yshift=-0.1em]Trm1.south);
\draw [->] ([yshift=0.1em]e3.north) -- ([yshift=-0.1em]Trm2.south);
\draw [->] ([yshift=0.1em]e3.north) -- ([yshift=-0.1em]Trm3.south);
\draw [->] ([yshift=0.1em]e3.north) -- ([yshift=-0.1em]Trm4.south);
\draw [->] ([yshift=0.1em]e4.north) -- ([yshift=-0.1em]Trm3.south);
\draw [->] ([yshift=0.1em]e5.north) -- ([yshift=-0.1em]Trm4.south);

\draw [->] ([yshift=0.1em]Trm0.north) -- ([yshift=-0.1em]Trm5.south);
\draw [->] ([yshift=0.1em]Trm0.north) -- ([yshift=-0.1em]Trm6.south);
\draw [->] ([yshift=0.1em]Trm0.north) -- ([yshift=-0.1em]Trm7.south);
\draw [->] ([yshift=0.1em]Trm0.north) -- ([yshift=-0.1em]Trm8.south);
\draw [->] ([yshift=0.1em]Trm0.north) -- ([yshift=-0.1em]Trm9.south);
\draw [->] ([yshift=0.1em]Trm1.north) -- ([yshift=-0.1em]Trm6.south);
\draw [->] ([yshift=0.1em]Trm2.north) -- ([yshift=-0.1em]Trm5.south);
\draw [->] ([yshift=0.1em]Trm2.north) -- ([yshift=-0.1em]Trm6.south);
\draw [->] ([yshift=0.1em]Trm2.north) -- ([yshift=-0.1em]Trm7.south);
\draw [->] ([yshift=0.1em]Trm2.north) -- ([yshift=-0.1em]Trm8.south);
\draw [->] ([yshift=0.1em]Trm2.north) -- ([yshift=-0.1em]Trm9.south);
\draw [->] ([yshift=0.1em]Trm3.north) -- ([yshift=-0.1em]Trm8.south);
\draw [->] ([yshift=0.1em]Trm4.north) -- ([yshift=-0.1em]Trm9.south);

\draw [->] ([yshift=0.1em]Trm5.north) -- ([yshift=-0.1em]t1.south);
\draw [->] ([yshift=0.1em]Trm6.north) -- ([yshift=-0.1em]t2.south);
\draw [->] ([yshift=0.1em]Trm7.north) -- ([yshift=-0.1em]t3.south);
\draw [->] ([yshift=0.1em]Trm8.north) -- ([yshift=-0.1em]t4.south);
\draw [->] ([yshift=0.1em]Trm9.north) -- ([yshift=-0.1em]t5.south);

\end{scope}
\end{tikzpicture}
\end{center}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 预训练
\begin{frame}{预训练}
\begin{itemize}
\item 语言模型可以使用大量无标注数据进行训练，得到的模型可以被直接用于下游系统，以序列到序列任务为例

\begin{center}
\begin{tikzpicture}
\node [anchor=south,minimum width=17em,fill=red!20!white] (encoder) at (0,0) {Encoder (语言模型预先训练)};
\node [anchor=south,minimum width=17em,fill=blue!20!white] (decoder) at (encoder.north) {Decoder (目标任务正常训练)};
\end{tikzpicture}
\end{center}

\item<2-> 衍生出了非常火爆的\alert{范式}：大规模语言模型pre-training + 目标任务fine-tuning
	\begin{itemize}
	\item 许多NLP任务都可以被描述为语言建模，在外部训练得到的语言模型作为模块放入目标系统中(参数初始化)
	\end{itemize}
\end{itemize}

\visible<2->{
\begin{center}
\begin{tikzpicture}

\begin{scope}

\node [anchor=west,draw,thick,minimum width=4em,minimum height=1.7em,fill=blue!20] (encoder) at (0,0) {模块};
\node [anchor=south,minimum width=4em,minimum height=1.7em] (space) at ([yshift=0.3em]encoder.north) {\footnotesize{目标系统}};

\begin{pgfonlayer}{background}
\node [rectangle,draw,thick,fill=red!20] [fit = (encoder) (space)] (system) {};
\end{pgfonlayer}

\node [anchor=north] (data) at ([yshift=-1em]system.south) {\scriptsize{\textbf{目标任务有标注数据}}};
\draw [->,thick] (data.north) -- ([yshift=-0.1em]system.south);
\node [anchor=north] (label) at ([yshift=-0em]data.south) {\scriptsize{(a) standard method}};

\end{scope}

\begin{scope}[xshift=2.8in]

\node [anchor=west,draw,dashed,thick,minimum width=4em,minimum height=1.7em,fill=blue!20] (encoder) at (0,0) {模块};
\node [anchor=south,minimum width=4em,minimum height=1.7em] (space) at ([yshift=0.3em]encoder.north) {\footnotesize{目标系统}};
\node [anchor=center,draw,thick,minimum width=4em,minimum height=1.7em,fill=green!20] (encoderpre) at ([xshift=-7em]encoder.center) {\footnotesize{语言模型}};
\draw [->,thick] (encoderpre.east) -- (encoder.west);

\begin{pgfonlayer}{background}
\node [rectangle,draw,thick,fill=red!20] [fit = (encoder) (space)] (system) {};
\end{pgfonlayer}

\node [anchor=north] (data) at ([yshift=-1em]system.south) {\scriptsize{\textbf{目标任务有标注数据}}};
\draw [->,thick] (data.north) -- ([yshift=-0.1em]system.south);
\node [anchor=north] (data2) at ([yshift=-1em,xshift=-7em]system.south) {\scriptsize{\textbf{大规模无标注数据}}};
\draw [->,thick] (data2.north) -- ([yshift=-0.1em]encoderpre.south);
\node [anchor=north] (label) at ([yshift=-0em,xshift=-4em]data.south) {\scriptsize{(b) pre-training + fine-tuning}};

\end{scope}

\end{tikzpicture}
\end{center}
}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 预训练带来的新思路
\begin{frame}{预训练带来的新思路}
\begin{itemize}
\item 预训练模型刷榜各种任务的同时，引发了一些思考：\\
      预训练究竟给我们带来了什么？
    \begin{itemize}
    \item 有标注数据量有限，预训练提供使用超大规模数据的方法
    \item 从大规模无标注数据中学习通用知识，提升泛化能力
    \item 神经网络复杂且不容易训练，预训练可以使模型关注优质解的高密度区域
    \end{itemize}
\end{itemize}

\visible<2->{
\begin{center}
\begin{tikzpicture}
\draw[name path=ellipse,thick] (0,0) circle[x radius = 2, y radius = 1];
\node[rectangle,minimum size=0.1em,inner sep=2pt,fill=red] (p1) at (0.2,0.5) {};
\node[rectangle,minimum size=0.1em,inner sep=2pt,fill=red] (p2) at (0.3,0.6) {};
\node[rectangle,minimum size=0.1em,inner sep=2pt,fill=red] (p3) at (0.1,-0.1) {};
\node[rectangle,minimum size=0.1em,inner sep=2pt,fill=red] (p4) at (0.4,0) {};
\node[rectangle,minimum size=0.1em,inner sep=2pt,fill=red] (p5) at (0.5,0.3) {};
\node[rectangle,minimum size=0.1em,inner sep=2pt,fill=red] (p6) at (0.6,0.1) {};
\node[rectangle,minimum size=0.1em,inner sep=2pt,fill=red] (p7) at (0.7,-0.1) {};
\node[rectangle,minimum size=0.1em,inner sep=2pt,fill=red] (p8) at (-1.2,0.4) {};
\node[rectangle,minimum size=0.1em,inner sep=2pt,fill=red] (p9) at (-1.0,-0.3) {};
\node[rectangle,minimum size=0.1em,inner sep=2pt,fill=red] (p10) at (-0.1,-0.8) {};

\begin{pgfonlayer}{background}
\visible<4->{
\node [rectangle,inner sep=0.4em,draw,blue] [fit = (p1) (p2) (p3) (p4) (p5) (p6)] (area) {};
}
\end{pgfonlayer}

\draw [->] (2.5,-0.7) -- (1.8,-0.5) node [pos=0,right] {\scriptsize{模型参数解空间}};

\visible<4->{
\draw [->] (2.0,0.7) -- (area.20) node [pos=0,right] {\scriptsize{优质解高密度区域(预训练)}};
}
\visible<3->{
\draw [->] (-2.0,0.7) -- (p8.west) node [pos=0,left] {\scriptsize{游离的解}};
}

\end{tikzpicture}
\end{center}
}

\begin{itemize}
\item<5-> 机器翻译中的预训练
    \begin{itemize}
    \item 机器翻译中预训练还没有屠榜，一方面由于很多机器翻译任务训练数据量并不小，另一方面也反应出翻译的双语建模对预训练也提出了新的要求
    \end{itemize}
\end{itemize}

\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% 总结
\begin{frame}{总结 - 长出一口气}
\begin{itemize}
\item 讲了很多，累呀累，再整理一下主要观点
    \begin{itemize}
    \item 神经网络没有那么复杂，入门并不难
    \item 简单的网络结构可以组合成强大的模型
    \item 语言模型可以用神经网络实现，效果很好，最近出现的预训练等范式证明了神经语言模型的潜力
    \end{itemize}
\item<2-> 仍然有很多问题需要讨论
    \begin{itemize}
    \item 常见的神经网络结构(面向NLP)\\
          google一下LSTM、GRU、CNN
    \item 深层模型和训练方法。深度学习如何体现``深''?\\
          深层网络可以带来什么？\\
          如何有效的训练深层模型？
    \item 如何把神经网络用于包括机器翻译在内的其它NLP任务？\\
          比如encoder-decoder框架
    \item 深度学习的实践技巧\\
          ``炼金术''了解下，因为不同任务调参和模型设计都有技巧\\
          ...
    \end{itemize}
\end{itemize}
\end{frame}

%%%------------------------------------------------------------------------------------------------------------
%%% last slide
\begin{frame}{又结束一章内容~}

\vspace{2em}

\begin{center}
\textbf{内容很多，开个了个头}\\
\textbf{学习深度学习技术需要实践和经验的积累！}

\vspace{2em}

\begin{tikzpicture}

\tikzstyle{rnnnode} = [draw,inner sep=5pt,minimum width=4em,minimum height=1.5em,fill=green!30!white,blur shadow={shadow xshift=1pt,shadow yshift=-1pt}]
\node [anchor=west,rnnnode] (node11) at (0,0) {\tiny{RNN Cell}};
\node [anchor=west,rnnnode] (node12) at ([xshift=2em]node11.east) {\tiny{RNN Cell}};
\node [anchor=west,rnnnode] (node13) at ([xshift=2em]node12.east) {\tiny{RNN Cell}};
\node [anchor=west,rnnnode] (node14) at ([xshift=2em]node13.east) {\tiny{RNN Cell}};

\node [anchor=north,rnnnode,fill=red!30!white] (e1) at ([yshift=-1.2em]node11.south) {\tiny{embedding}};
\node [anchor=north,rnnnode,fill=red!30!white] (e2) at ([yshift=-1.2em]node12.south) {\tiny{embedding}};
\node [anchor=north,rnnnode,fill=red!30!white] (e3) at ([yshift=-1.2em]node13.south) {\tiny{embedding}};
\node [anchor=north,rnnnode,fill=red!30!white] (e4) at ([yshift=-1.2em]node14.south) {\tiny{embedding}};
\node [anchor=north] (w1) at ([yshift=-1em]e1.south) {\footnotesize{$<$s$>$}};
\node [anchor=north] (w2) at ([yshift=-1em]e2.south) {\footnotesize{谢谢}};
\node [anchor=north] (w3) at ([yshift=-1em]e3.south) {\footnotesize{大家}};
\node [anchor=north] (w4) at ([yshift=-1em]e4.south) {\footnotesize{聆听}};

\draw [->,thick] ([yshift=0.1em]w1.north)--([yshift=-0.1em]e1.south);
\draw [->,thick] ([yshift=0.1em]w2.north)--([yshift=-0.1em]e2.south);
\draw [->,thick] ([yshift=0.1em]w3.north)--([yshift=-0.1em]e3.south);
\draw [->,thick] ([yshift=0.1em]w4.north)--([yshift=-0.1em]e4.south);

\draw [->,thick] ([yshift=0.1em]e1.north)--([yshift=-0.1em]node11.south);
\draw [->,thick] ([yshift=0.1em]e2.north)--([yshift=-0.1em]node12.south);
\draw [->,thick] ([yshift=0.1em]e3.north)--([yshift=-0.1em]node13.south);
\draw [->,thick] ([yshift=0.1em]e4.north)--([yshift=-0.1em]node14.south);

\node [anchor=south,rnnnode,fill=red!30!white] (node21) at ([yshift=1.0em]node11.north) {\tiny{Softmax($\cdot$)}};
\node [anchor=south,rnnnode,fill=red!30!white] (node22) at ([yshift=1.0em]node12.north) {\tiny{Softmax($\cdot$)}};
\node [anchor=south,rnnnode,fill=red!30!white] (node23) at ([yshift=1.0em]node13.north) {\tiny{Softmax($\cdot$)}};
\node [anchor=south,rnnnode,fill=red!30!white] (node24) at ([yshift=1.0em]node14.north) {\tiny{Softmax($\cdot$)}};

\node [anchor=south] (output1) at ([yshift=1em]node21.north) {\Large{\textbf{谢谢}}};
\node [anchor=south] (output2) at ([yshift=1em]node22.north) {\Large{\textbf{大家}}};
\node [anchor=south] (output3) at ([yshift=1em]node23.north) {\Large{\textbf{聆听}}};
\node [anchor=south] (output4) at ([yshift=1em]node24.north) {\Large{\textbf{$<$/s$>$}}};

\draw [->,thick] ([yshift=0.1em]node21.north)--([yshift=-0.1em]output1.south);
\draw [->,thick] ([yshift=0.1em]node22.north)--([yshift=-0.1em]output2.south);
\draw [->,thick] ([yshift=0.1em]node23.north)--([yshift=-0.1em]output3.south);
\draw [->,thick] ([yshift=0.1em]node24.north)--([yshift=-0.1em]output4.south);

\draw [->,thick] ([yshift=0.1em]node11.north)--([yshift=-0.1em]node21.south);
\draw [->,thick] ([yshift=0.1em]node12.north)--([yshift=-0.1em]node22.south);
\draw [->,thick] ([yshift=0.1em]node13.north)--([yshift=-0.1em]node23.south);
\draw [->,thick] ([yshift=0.1em]node14.north)--([yshift=-0.1em]node24.south);

\draw [->,thick] ([xshift=-1em]node11.west)--([xshift=-0.1em]node11.west);
\draw [->,thick] ([xshift=0.1em]node11.east)--([xshift=-0.1em]node12.west);
\draw [->,thick] ([xshift=0.1em]node12.east)--([xshift=-0.1em]node13.west);
\draw [->,thick] ([xshift=0.1em]node13.east)--([xshift=-0.1em]node14.west);
\draw [->,thick] ([xshift=0.1em]node14.east)--([xshift=1em]node14.east);

\end{tikzpicture}

\end{center}

\end{frame}

\end{CJK}
\end{document}