Commit ef2c73d1 by 孟霞

合并分支 'caorunzhe' 到 'mengxia'

Caorunzhe

查看合并请求 !898
parents c4d60e0e 131763e6
...@@ -325,11 +325,11 @@ ...@@ -325,11 +325,11 @@
\begin{itemize} \begin{itemize}
\vspace{0.5em} \vspace{0.5em}
\item 首先,将$\mathbi{Q}$、$\mathbi{K}$、$\mathbi{V}$分别通过线性(Linear)变换的方式映射为$h$个子集。即$\mathbi{Q}_i = \mathbi{Q}\mathbi{W}_i^{\,Q} $、$\mathbi{K}_i = \mathbi{K}\mathbi{W}_i^{\,K} $、$\mathbi{V}_i = \mathbi{V}\mathbi{W}_i^{\,V} $,其中$i$表示第$i$个头, $\mathbi{W}_i^{\,Q} \in \mathbb{R}^{d_{model} \times d_k}$, $\mathbi{W}_i^{\,K} \in \mathbb{R}^{d_{model} \times d_k}$, $\mathbi{W}_i^{\,V} \in \mathbb{R}^{d_{model} \times d_v}$是参数矩阵; $d_k=d_v=d_{model} / h$,对于不同的头采用不同的变换矩阵,这里$d_{model}$表示每个隐层向量的维度; \item 首先,将$\mathbi{Q}$、$\mathbi{K}$、$\mathbi{V}$分别通过线性(Linear)变换的方式映射为$h$个子集。即$\mathbi{Q}_i = \mathbi{Q}\mathbi{W}_i^{\,Q} $、$\mathbi{K}_i = \mathbi{K}\mathbi{W}_i^{\,K} $、$\mathbi{V}_i = \mathbi{V}\mathbi{W}_i^{\,V} $,其中$i$表示第$i$个头, $\mathbi{W}_i^{\,Q} \in \mathbb{R}^{d_{\textrm{model}} \times d_k}$, $\mathbi{W}_i^{\,K} \in \mathbb{R}^{d_{\textrm{model}} \times d_k}$, $\mathbi{W}_i^{\,V} \in \mathbb{R}^{d_{\textrm{model}} \times d_v}$是参数矩阵; $d_k=d_v=d_{\textrm{model}} / h$,对于不同的头采用不同的变换矩阵,这里$d_{\textrm{model}}$表示每个隐层向量的维度;
\vspace{0.5em} \vspace{0.5em}
\item 其次,对每个头分别执行点乘注意力操作,并得到每个头的注意力操作的输出$\mathbi{head}_i$ \item 其次,对每个头分别执行点乘注意力操作,并得到每个头的注意力操作的输出$\mathbi{head}_i$
\vspace{0.5em} \vspace{0.5em}
\item 最后,将$h$个头的注意力输出在最后一维$d_v$进行拼接(Concat)重新得到维度为$hd_v$的输出,并通过对其右乘一个权重矩阵$\mathbi{W}^{\,o}$进行线性变换,从而对多头计算得到的信息进行融合,且将多头注意力输出的维度映射为模型的隐层大小(即$d_{model}$),这里参数矩阵$\mathbi{W}^{\,o} \in \mathbb{R}^{h d_v \times d_{model}}$ \item 最后,将$h$个头的注意力输出在最后一维$d_v$进行拼接(Concat)重新得到维度为$hd_v$的输出,并通过对其右乘一个权重矩阵$\mathbi{W}^{\,o}$进行线性变换,从而对多头计算得到的信息进行融合,且将多头注意力输出的维度映射为模型的隐层大小(即$d_{\textrm{model}}$),这里参数矩阵$\mathbi{W}^{\,o} \in \mathbb{R}^{h d_v \times d_{\textrm{model}}}$
\vspace{0.5em} \vspace{0.5em}
\end{itemize} \end{itemize}
......
...@@ -10,7 +10,7 @@ ...@@ -10,7 +10,7 @@
\node(process_2)[process,fill=blue!20,right of = process_1,xshift=7.0cm,text width=4cm,align=center]{\baselineskip=4pt\LARGE{[[0.2,...,0.3], \qquad ..., \qquad 0.3,...,0.5]]}\par}; \node(process_2)[process,fill=blue!20,right of = process_1,xshift=7.0cm,text width=4cm,align=center]{\baselineskip=4pt\LARGE{[[0.2,...,0.3], \qquad ..., \qquad 0.3,...,0.5]]}\par};
\node(text_2)[below of = process_2,yshift=-2cm,scale=1.5]{语音特征}; \node(text_2)[below of = process_2,yshift=-2cm,scale=1.5]{语音特征};
\node(process_3)[process,fill=orange!20,minimum width=6cm,minimum height=5cm,right of = process_2,xshift=8.2cm,text width=4cm,align=center]{}; \node(process_3)[process,fill=orange!20,minimum width=6cm,minimum height=5cm,right of = process_2,xshift=8.2cm,text width=4cm,align=center]{};
\node(text_3)[below of = process_3,yshift=-3cm,scale=1.5]{源语文本及其词格}; \node(text_3)[below of = process_3,yshift=-3cm,scale=1.5]{源语文本及其词格};
\node(cir_s)[cir,very thick, below of = process_3,xshift=-2.2cm,yshift=1.1cm]{\LARGE S}; \node(cir_s)[cir,very thick, below of = process_3,xshift=-2.2cm,yshift=1.1cm]{\LARGE S};
\node(cir_a)[cir,right of = cir_s,xshift=1cm,yshift=0.8cm]{\LARGE a}; \node(cir_a)[cir,right of = cir_s,xshift=1cm,yshift=0.8cm]{\LARGE a};
\node(cir_c)[cir,right of = cir_a,xshift=1.2cm,yshift=0cm]{\LARGE c}; \node(cir_c)[cir,right of = cir_a,xshift=1.2cm,yshift=0cm]{\LARGE c};
......
...@@ -229,7 +229,6 @@ ...@@ -229,7 +229,6 @@
%---------------------------------------------- %----------------------------------------------
\begin{itemize} \begin{itemize}
\vspace{0.5em}
\item 第一部分:对每个$i\in[1,l]$的目标语单词的产出率建模({\color{red!70} 红色}),即$\varphi_i$的生成概率。它依赖于$\seq{t}$和区间$[1,i-1]$的目标语单词的产出率$\varphi_1^{i-1}$\footnote{这里约定,当$i=1$ 时,$\varphi_1^0$ 表示空。} \item 第一部分:对每个$i\in[1,l]$的目标语单词的产出率建模({\color{red!70} 红色}),即$\varphi_i$的生成概率。它依赖于$\seq{t}$和区间$[1,i-1]$的目标语单词的产出率$\varphi_1^{i-1}$\footnote{这里约定,当$i=1$ 时,$\varphi_1^0$ 表示空。}
\vspace{0.5em} \vspace{0.5em}
\item 第二部分:对$i=0$时的产出率建模({\color{blue!70} 蓝色}),即空标记$t_0$的产出率生成概率。它依赖于$\seq{t}$和区间$[1,l]$的目标语单词的产出率$\varphi_1^l$ \item 第二部分:对$i=0$时的产出率建模({\color{blue!70} 蓝色}),即空标记$t_0$的产出率生成概率。它依赖于$\seq{t}$和区间$[1,l]$的目标语单词的产出率$\varphi_1^l$
...@@ -248,7 +247,7 @@ ...@@ -248,7 +247,7 @@
\subsection{IBM 模型3} \subsection{IBM 模型3}
\parinterval IBM模型3通过一些假设对图\ref{fig:6-7}所表示的基本模型进行了化简。具体来说,对于每个$i\in[1,l]$,假设$\funp{P}(\varphi_i |\varphi_1^{i-1},\seq{t})$仅依赖于$\varphi_i$和$t_i$,$\funp{P}(\pi_{ik}|\pi_{i1}^{k-1},\pi_1^{i-1},\tau_0^l,\varphi_0^l,\seq{t})$仅依赖于$\pi_{ik}$、$i$、$m$和$l$。而对于所有的$i\in[0,l]$,假设$\funp{P}(\tau_{ik}|\tau_{i1}^{k-1},\tau_1^{i-1},\varphi_0^l,\seq{t})$仅依赖于$\tau_{ik}$和$t_i$。这些假设的形式化描述为: \parinterval IBM模型3通过一些假设对图\ref{fig:6-7}所表示的基本模型进行了化简。具体来说,对于每个$i\in[1,l]$,假设$\funp{P}(\varphi_i |\varphi_1^{i-1},\seq{t})$仅依赖于$\varphi_i$和$t_i$,$\funp{P}(\pi_{ik}|\pi_{i1}^{k-1},\pi_1^{i-1},\tau_0^l,\varphi_0^l,\seq{t})$仅依赖于$\pi_{ik}$、$i$、$m$和$l$。而对于所有的$i\in[0,l]$,假设$\funp{P}(\tau_{ik}|\tau_{i1}^{k-1},\tau_1^{i-1},\varphi_0^l,\seq{t})$仅依赖于$\tau_{ik}$和$t_i$。这些假设的形式化描述为:
\vspace{-0.5em}
\begin{eqnarray} \begin{eqnarray}
\funp{P}(\varphi_i|\varphi_1^{i-1},\seq{t}) & = &{\funp{P}(\varphi_i|t_i)} \label{eq:6-10} \\ \funp{P}(\varphi_i|\varphi_1^{i-1},\seq{t}) & = &{\funp{P}(\varphi_i|t_i)} \label{eq:6-10} \\
\funp{P}(\tau_{ik} = s_j |\tau_{i1}^{k-1},\tau_{1}^{i-1},\varphi_0^l,\seq{t}) & = & t(s_j|t_i) \label{eq:6-11} \\ \funp{P}(\tau_{ik} = s_j |\tau_{i1}^{k-1},\tau_{1}^{i-1},\varphi_0^l,\seq{t}) & = & t(s_j|t_i) \label{eq:6-11} \\
...@@ -265,7 +264,6 @@ ...@@ -265,7 +264,6 @@
\end{eqnarray} \end{eqnarray}
否则 否则
\begin{eqnarray} \begin{eqnarray}
\funp{P}(\pi_{0k}=j|\pi_{01}^{k-1},\pi_1^l,\tau_0^l,\varphi_0^l,\seq{t}) & = & 0 \funp{P}(\pi_{0k}=j|\pi_{01}^{k-1},\pi_1^l,\tau_0^l,\varphi_0^l,\seq{t}) & = & 0
\label{eq:6-14} \label{eq:6-14}
...@@ -308,7 +306,6 @@ m-\varphi_0\\ ...@@ -308,7 +306,6 @@ m-\varphi_0\\
p_0+p_1 & = & 1 \label{eq:6-21} p_0+p_1 & = & 1 \label{eq:6-21}
\end{eqnarray} \end{eqnarray}
} }
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
% NEW SUB-SECTION % NEW SUB-SECTION
%---------------------------------------------------------------------------------------- %----------------------------------------------------------------------------------------
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论