%%% 注意力机制
%%% 注意力机制的简单示例
\item 这里$C_i$表示第$i$个目标语单词所使用的源语表示
% Source-side anchors: empty nodes s1..s6 placed \mystep apart on the baseline.
% A brace-less \foreach governs only the single statement that follows it, so
% the stale duplicate definitions (one copy using \step, one using \mystep)
% left the second copy outside the loop with \x undefined and redefined the
% same node names. Only the \mystep version is kept, matching the
% encoder-decoder figure later in this file.
\foreach \x in {1,2,...,6}
\node[] (s\x) at (\x * \mystep,0) {};
% Source words sitting on the anchors (only ws1, ws2 and ws6 appear in this
% fragment).
% NOTE(review): the text of ws1/ws2 was empty here; it is restored from the
% word-by-word correspondence with the English side ("This", "is") -- confirm
% against the original slides.
\node [] (ws1) at (s1) {\scriptsize{这}};
\node [] (ws2) at (s2) {\scriptsize{是}};
\node [] (ws6) at (s6) {\scriptsize{句子}};
% Target-side anchors t1..t6, offset 2.4in to the right of the source side.
\foreach \x in {1,2,...,6}
\node[] (t\x) at (\x * \mystep + 2.4in,0) {};
\node [] (wt1) at (t1) {\scriptsize{This}};
\node [] (wt2) at (t2) {\scriptsize{is}};
% Short arrows ending just above each of the first five target anchors.
\foreach \x in {1,2,...,5}
\draw[<-] ([yshift=0.1em]t\x.north) -- ([yshift=1.2em]t\x.north);
% The sixth arrow is shifted 1em to the right; it was previously drawn twice
% with identical coordinates, so the redundant copy is dropped.
\draw[<-] ([yshift=0.1em,xshift=1em]t6.north) -- ([yshift=1.2em,xshift=1em]t6.north);
%%% 模型结构
%%% 简单的编码-解码框架的问题
\item 将源语言句子编码为一个实数向量确实很神奇,但是也有明显问题
\item 整个句子编码到一个向量里可能会有信息丢失
\item 缺少源语单词与目标语单词之间的对应。某种意义上讲,一个目标语单词的生成无法区分不同源语单词的贡献
\item<2-> 但是,翻译是具有很强的\alert{局部性}的,有些词之间会有更紧密的关系
\item 源语词和目标语词的对应并不是均匀的,甚至非常稀疏
\item 比如,一些短语的生成仅依赖于源文中的少数词
\item<3-> 这些关系可以在表示模型中考虑
% Source-side anchors: empty nodes s1..s6 placed \mystep apart on the baseline
% (\mystep is defined outside this excerpt).
\foreach \x in {1,2,...,6}
\node[] (s\x) at (\x * \mystep,0) {};
% Source sentence, one word per anchor.
% NOTE(review): ws1, ws2, ws3 and ws5 had empty text, which left four blank
% slots in the six-word source sentence. The single-character words are
% restored from the one-to-one correspondence with the English side
% "This is a very long sentence" -- confirm against the original slides.
\node [] (ws1) at (s1) {\scriptsize{这}};
\node [] (ws2) at (s2) {\scriptsize{是}};
\node [] (ws3) at (s3) {\scriptsize{个}};
\node [] (ws4) at (s4) {\scriptsize{很长}};
\node [] (ws5) at (s5) {\scriptsize{的}};
\node [] (ws6) at (s6) {\scriptsize{句子}};
% Target-side anchors t1..t6, offset 2.4in to the right of the source side.
\foreach \x in {1,2,...,6}
\node[] (t\x) at (\x * \mystep + 2.4in,0) {};
% Target sentence. wt3 and wt4 are nudged down slightly and wt6 is shifted
% 1em to the right of its anchor.
\node [] (wt1) at (t1) {\scriptsize{This}};
\node [] (wt2) at (t2) {\scriptsize{is}};
\node [] (wt3) at ([yshift=-1pt]t3) {\scriptsize{a}};
\node [] (wt4) at ([yshift=-0.1em]t4) {\scriptsize{very}};
\node [] (wt5) at (t5) {\scriptsize{long}};
\node [] (wt6) at ([xshift=1em]t6) {\scriptsize{sentence}};
% Encoder box: its south-west corner sits 1.0em above the north-west corner
% of the first source word (ws1).
\node [
  anchor=south west,
  fill=red!30,
  minimum width=1.6in,
  minimum height=1.5em
] (encoder) at ([yshift=1.0em]ws1.north west) {\footnotesize{Encoder}};
% Decoder box: placed 4.5em to the right of the encoder's east edge.
\node [
  anchor=west,
  fill=blue!30,
  minimum width=1.9in,
  minimum height=1.5em
] (decoder) at ([xshift=4.5em]encoder.east) {\footnotesize{Decoder}};
% Representation box: placed 1em to the right of the encoder's east edge,
% i.e. in the gap between the two boxes. It is declared after the decoder,
% so it is painted after it.
\node [
  anchor=west,
  fill=green!30,
  minimum height=1.5em
] (representation) at ([xshift=1em]encoder.east) {\footnotesize{表示}};
% Thick arrows encoder -> representation -> decoder, each end pulled back by
% 1pt from the box edge.
\draw [->,thick]
  ([xshift=1pt]encoder.east) -- ([xshift=-1pt]representation.west);
\draw [->,thick]
  ([xshift=1pt]representation.east) -- ([xshift=-1pt]decoder.west);
% Upward arrows above each of the six source anchors.
\foreach \x in {1,...,6} {
  \draw[->] ([yshift=0.1em]s\x.north) -- ([yshift=1.2em]s\x.north);
}
% Downward-pointing arrows above the first five target anchors.
\foreach \x in {1,...,5} {
  \draw[<-] ([yshift=0.1em]t\x.north) -- ([yshift=1.2em]t\x.north);
}
% The sixth target arrow is drawn separately with an extra 1em shift to the
% right, the same horizontal offset that wt6 has from t6.
\draw[<-]
  ([yshift=0.1em,xshift=1em]t6.north) -- ([yshift=1.2em,xshift=1em]t6.north);
% Curved double-headed links running below the words, from source word ws4
% to target words wt4 and wt5 (the colour ublue is defined outside this
% excerpt).
\draw [<->,ublue,thick]
  ([xshift=0.3em]ws4.south)
  .. controls +(-60:1) and +(south:1) ..
  (wt4.south);
\draw [<->,ublue,thick]
  (ws4.south)
  .. controls +(south:1.0) and +(south:1.5) ..
  (wt5.south);
% Caption box for the links, hung 3.4em below the representation box.
\node [
  anchor=north,
  fill=green!30,
  draw=ublue
] (attentionlabel) at ([yshift=-3.4em]representation.south) {\footnotesize{词语的关注度}};
% Dotted arrow from the caption box up to the representation box.
\draw [->,dotted,very thick,ublue]
  ([yshift=0.1em]attentionlabel.north) -- ([yshift=-0.1em]representation.south);
%%% 注意力机制
\item 关注的``局部性''在图像处理、语音识别等领域也有广泛讨论,比如,对于下图
\item 关注的顺序:大狗的帽子 $\to$ 大狗 $\to$ 小狗的帽子 $\to$ 小狗
\item 人往往不是``均匀地''看图像中的所有位置,翻译是一个道理,生成一个目标语单词时参考的源语单词不会太多
\item<2-> \alert{注意力机制}在机器翻译中已经成功应用,经典的论文\\
\textbf{Neural Machine Translation by Jointly Learning to Align and Translate}\\
\textbf{Bahdanau et al., 2015, In Proc.\ of ICLR}
