Commit 7f9d37b8 by 孟霞

合并分支 'caorunzhe' 到 'mengxia'

Caorunzhe

查看合并请求 !728
parents abd366de 0953c883
......@@ -20,11 +20,11 @@
\node[layer,anchor=south,fill=blue!20] (sf) at ([yshift=1.6em]de_ffn.north){Softmax};
%\node[layer,anchor=south,fill=orange!20] (output) at ([yshift=1.4em]sf.north){STLoss};
\node[anchor=north,font=\scriptsize,align=center] (en_input) at ([yshift=-1em]en_cnn.south){Speech Feature\\(FBank/MFCC)};
\node[anchor=north,font=\scriptsize,align=center] (de_input) at ([yshift=-1.1em]de_add.south){Target Text\\(Embedding)};
\node[anchor=north,font=\scriptsize,align=center] (en_input) at ([yshift=-1em]en_cnn.south){语音特征\\(FBank/MFCC)};
\node[anchor=north,font=\scriptsize,align=center] (de_input) at ([yshift=-1.1em]de_add.south){标注文本\\编码表示};
\node[anchor=east,font=\scriptsize,align=center] (en_pos) at ([xshift=-2em]en_add.west){Position\\(Embedding)};
\node[anchor=west,font=\scriptsize,align=center] (de_pos) at ([xshift=2em]de_add.east){Position\\(Embedding)};
\node[anchor=east,font=\scriptsize,align=center] (en_pos) at ([xshift=-2em]en_add.west){位置编码};
\node[anchor=west,font=\scriptsize,align=center] (de_pos) at ([xshift=2em]de_add.east){位置编码};
\draw[->] (en_input.90) -- ([yshift=-0.1em]en_cnn.-90);
\draw[->] ([yshift=0.1em]en_cnn.90) -- ([yshift=-0.1em]en_add.-90);
......@@ -47,4 +47,6 @@
\node[anchor=east,font=\scriptsize,text=ugreen] at ([xshift=-0.1em]box1.west){$N \times$};
\node[anchor=west,font=\scriptsize,text=red] at ([xshift=0.1em]box2.east){$\times N$};
\node[anchor=east,font=\scriptsize] at ([xshift=-0.1em]en_cnn.west){$2 \times$};
\node[anchor=east,font=\scriptsize,align=center,text=ugreen] at ([xshift=-0.1em,yshift=3em]box1.west){ST\\ 编码器};
\node[anchor=west,font=\scriptsize,align=center,text=red] at ([xshift=0.1em,yshift=5em]box2.east){ST\\解码器};
\end{tikzpicture}
\ No newline at end of file
\tikzstyle{coder} = [rectangle,rounded corners,minimum height=2.2em,minimum width=4.3em,text centered,draw=black,fill=red!25]
\begin{tikzpicture}[node distance = 0,scale = 1]
\tikzstyle{every node}=[scale=1]
\begin{tikzpicture}[node distance = 0,scale = 0.75]
\tikzstyle{every node}=[scale=0.75]
\node(x)[]{x};
\node(encoder)[coder, above of = x,yshift=4em]{{编码器}};
\node(decoder_left)[coder, above of = encoder, yshift=6em,fill=ugreen!25]{{解码器}};
\node(decoder_left)[coder, above of = encoder, yshift=6em,fill=blue!25]{{解码器}};
\node(y_hat)[above of = decoder_left, yshift=4em]{{$\rm y'$}};
\node(y)[above of = decoder_left, xshift=-6em]{{$\rm y$}};
\node(decoder_right)[coder, above of = encoder, xshift=11em,fill=yellow!25]{{解码器}};
......@@ -11,9 +11,9 @@
\node(figure)[draw=white,above of = decoder_right,yshift=6.5em,scale=0.25] {\includegraphics[width=0.62\textwidth]{./Chapter17/Figures/figure-bank-without-attention.png}};
\draw[->,thick](x)to(encoder);
\draw[->,thick](encoder)to(decoder_left)node[right,xshift=-0.1cm,yshift=-1.25cm,scale=1.0]{翻译};
\draw[->,thick](encoder)to(decoder_left)node[right,xshift=-0.1cm,yshift=-1.25cm,scale=1.2]{\small{翻译}};
\draw[->,thick](decoder_left)to(y_hat);
\draw[->,thick](y)to(decoder_left);
\draw[->,thick](encoder)to(decoder_right)node[left,xshift=-3.1em,yshift=0.25cm,scale=1.0]{生成图片};
\draw[->,thick](encoder)to(decoder_right)node[left,xshift=-3.1em,yshift=0.25cm,scale=1.2]{\small{生成图片}};
\draw[->,thick](decoder_right)to(figure);
\end{tikzpicture}
\ No newline at end of file
......@@ -21,7 +21,7 @@
\draw [very thick,rounded corners=10pt]([xshift=-2.2cm,yshift=-1cm]process_1.center)--([xshift=-1.8cm,yshift=1cm]process_1.center)--([xshift=-1.4cm,yshift=0cm]process_1.center)--([xshift=-1.1cm,yshift=0.8cm]process_1.center)--([xshift=-0.8cm,yshift=-0.4cm]process_1.center)--([xshift=-0.5cm,yshift=0.4cm]process_1.center);
\draw [->,very thick]([xshift=-0.3cm]process_1.center)to([xshift=0.3cm]process_1.center);
\draw [very thick,rounded corners=10pt,densely dotted]([xshift=0.5cm,yshift=-1cm]process_1.center)--([xshift=0.9cm,yshift=1cm]process_1.center)--([xshift=1.3cm,yshift=0cm]process_1.center)--([xshift=1.6cm,yshift=0.8cm]process_1.center)--([xshift=1.9cm,yshift=-0.4cm]process_1.center)--([xshift=2.2cm,yshift=0.4cm]process_1.center);
\node(process_2)[process,fill=ugreen!20,right of = process_1,xshift=6.6cm]{};
\node(process_2)[process,fill=blue!20,right of = process_1,xshift=6.6cm]{};
\node(text_2)[below of = process_2,yshift=-2cm,scale=1.3]{量化};
\draw [very thick,rounded corners=10pt,densely dotted]([xshift=-2.2cm,yshift=-1cm]process_2.center)--([xshift=-1.8cm,yshift=1cm]process_2.center)--([xshift=-1.4cm,yshift=0cm]process_2.center)--([xshift=-1.1cm,yshift=0.8cm]process_2.center)--([xshift=-0.8cm,yshift=-0.4cm]process_2.center)--([xshift=-0.5cm,yshift=0.4cm]process_2.center);
\draw [->,very thick]([xshift=-0.3cm]process_2.center)to([xshift=0.3cm]process_2.center);
......
......@@ -36,11 +36,11 @@
\node[draw,thick,rounded corners=2pt,densely dashed,inner ysep=1.2em,inner xsep=0.4em,label={above:图像特征向量}][fit=(h2_1)(h2_2)(h2_n)](box2){};
\end{pgfonlayer}
\node[anchor=west,draw,rounded corners=2pt,fill=red!20,minimum height=2.2em,minimum width=4.3em] (decoder1)at ([xshift=3em]box1.east){解码器};
\node[anchor=west,draw,rounded corners=2pt,fill=blue!20,minimum height=2.2em,minimum width=4.3em] (decoder1)at ([xshift=6em]box1.east){解码器};
\node[anchor=west,draw,circle,inner sep=0pt,minimum size=1.4em] (add)at ([xshift=2em,yshift=1.6em]box2.east){};
\draw[] (add.0) -- (add.180);
\draw[] (add.90) -- (add.-90);
\node[anchor=west,draw,rounded corners=2pt,fill=red!20,minimum height=2.2em,minimum width=4.3em] (decoder2)at ([xshift=6em]box2.east){解码器};
\node[anchor=west,draw,rounded corners=2pt,fill=blue!20,minimum height=2.2em,minimum width=4.3em] (decoder2)at ([xshift=6em]box2.east){解码器};
\draw[->,thick] ([xshift=-2.7em]cnn1.180) -- ([xshift=-0.1em]cnn1.180);
......
......@@ -29,7 +29,7 @@
\node[draw,anchor=south,rounded corners=2pt,minimum width=4.0cm,minimum height=2.2em,fill=red!20](encoder) at ([yshift=2.6em,xshift=2.2em]cnn1.north){编码器};
\node[anchor=north,font=\Large](x) at ([xshift=2.5em,yshift=-3.4em]encoder.south){$\seq{x}$};
\node[draw,anchor=south,rounded corners=2pt,minimum width=4.0cm,minimum height=2.2em,fill=ugreen!20](decoder) at ([yshift=2.6em,xshift=2.2em]cnn2.north){解码器};
\node[draw,anchor=south,rounded corners=2pt,minimum width=4.0cm,minimum height=2.2em,fill=blue!20](decoder) at ([yshift=2.6em,xshift=2.2em]cnn2.north){解码器};
\node[anchor=north,font=\Large](y) at ([xshift=2.5em,yshift=-3.4em]decoder.south){$\seq{y}$};
\node[anchor=south,font=\Large](y_1) at ([yshift=3em]decoder.north){$\seq{y}'$};
......@@ -41,7 +41,8 @@
\draw[->,thick] ([yshift=0.3em]y.north) -- ([yshift=4.7em]y.south);
\draw[->,thick] ([xshift=0.1em]encoder.east) -- ([xshift=-0.1em]decoder.west);
\draw[->,thick] ([yshift=0.1em]decoder.north) -- ([yshift=-0.1em]y_1.south);
\node [anchor=south,scale=1.2] (node1) at ([xshift=-2.0em,yshift=2.5em]encoder.north) {{$x,y$:双语数据}};
\node [anchor=south,scale=1.2] (node1) at ([xshift=-0.7em,yshift=4em]encoder.north) {\small{$x$:源语言文本数据}};
\node [anchor=north,scale=1.2] (node2) at ([xshift=0.57em]node1.south){\small{$y$:目标语言文本数据}};
\end{tikzpicture}
%------------------------------------------------------------------------------------------------------------
......@@ -20,11 +20,11 @@
\node[layer,anchor=south,fill=blue!20] (sf) at ([yshift=1.6em]de_ffn.north){Softmax};
%\node[layer,anchor=south,fill=orange!20] (output) at ([yshift=1.4em]sf.north){Output Probabilities};
\node[anchor=north,font=\scriptsize,align=center] (en_input) at ([yshift=-1em]en_cnn.south){Speech Feature\\(FBank/MFCC)};
\node[anchor=north,font=\scriptsize,align=center] (de_input) at ([yshift=-1.1em]de_add.south){Transcription\\(Embedding)};
\node[anchor=north,font=\scriptsize,align=center] (en_input) at ([yshift=-1em]en_cnn.south){语音特征\\(FBank/MFCC)};
\node[anchor=north,font=\scriptsize,align=center] (de_input) at ([yshift=-1.1em]de_add.south){标注文本\\编码表示};
\node[anchor=east,font=\scriptsize,align=center] (en_pos) at ([xshift=-2em]en_add.west){Position\\(Embedding)};
\node[anchor=west,font=\scriptsize,align=center] (de_pos) at ([xshift=2em]de_add.east){Position\\(Embedding)};
\node[anchor=east,font=\scriptsize,align=center] (en_pos) at ([xshift=-2em]en_add.west){位置编码};
\node[anchor=west,font=\scriptsize,align=center] (de_pos) at ([xshift=2em]de_add.east){位置编码};
\draw[->] (en_input.90) -- ([yshift=-0.1em]en_cnn.-90);
\draw[->] ([yshift=0.1em]en_cnn.90) -- ([yshift=-0.1em]en_add.-90);
......@@ -47,6 +47,6 @@
\node[anchor=east,font=\scriptsize,text=ugreen] at ([xshift=-0.1em]box1.west){$N \times$};
\node[anchor=west,font=\scriptsize,text=red] at ([xshift=0.1em]box2.east){$\times N$};
\node[anchor=east,font=\scriptsize] at ([xshift=-0.1em]en_cnn.west){$2 \times$};
\node[anchor=east,font=\scriptsize,align=center,text=ugreen] at ([xshift=-0.1em,yshift=3em]box1.west){ASR \\ Encoder};
\node[anchor=west,font=\scriptsize,align=center,text=red] at ([xshift=0.1em,yshift=5em]box2.east){ASR \\ Decoder};
\node[anchor=east,font=\scriptsize,align=center,text=ugreen] at ([xshift=-0.1em,yshift=3em]box1.west){ASR \\ 编码器};
\node[anchor=west,font=\scriptsize,align=center,text=red] at ([xshift=0.1em,yshift=5em]box2.east){ASR \\ 解码器};
\end{tikzpicture}
\ No newline at end of file
......@@ -22,11 +22,11 @@
\node[layer,anchor=south,fill=orange!20] (en_output) at ([yshift=1.4em]en_sf.north){CTC Output};
%\node[layer,anchor=south,fill=orange!20] (output) at ([yshift=1.4em]sf.north){ST Output};
\node[anchor=north,font=\scriptsize,align=center] (en_input) at ([yshift=-1em]en_cnn.south){Speech Feature\\(FBank/MFCC)};
\node[anchor=north,font=\scriptsize,align=center] (de_input) at ([yshift=-1em]de_add.south){Target Text\\(Embedding)};
\node[anchor=north,font=\scriptsize,align=center] (en_input) at ([yshift=-1em]en_cnn.south){语音特征\\(FBank/MFCC)};
\node[anchor=north,font=\scriptsize,align=center] (de_input) at ([yshift=-1em]de_add.south){标注文本\\编码表示};
\node[anchor=east,font=\scriptsize,align=center] (en_pos) at ([xshift=-2em]en_add.west){Position\\(Embedding)};
\node[anchor=west,font=\scriptsize,align=center] (de_pos) at ([xshift=2em]de_add.east){Position\\(Embedding)};
\node[anchor=east,font=\scriptsize,align=center] (en_pos) at ([xshift=-2em]en_add.west){位置编码};
\node[anchor=west,font=\scriptsize,align=center] (de_pos) at ([xshift=2em]de_add.east){位置编码};
\draw[->] (en_input.90) -- ([yshift=-0.1em]en_cnn.-90);
\draw[->] ([yshift=0.1em]en_cnn.90) -- ([yshift=-0.1em]en_add.-90);
......@@ -51,6 +51,6 @@
\node[anchor=east,font=\scriptsize,text=ugreen] at ([xshift=-0.1em]box1.west){$N \times$};
\node[anchor=west,font=\scriptsize,text=red] at ([xshift=0.1em]box2.east){$\times N$};
\node[anchor=east,font=\scriptsize] at ([xshift=-0.1em]en_cnn.west){$2 \times$};
\node[anchor=east,font=\scriptsize,align=center,text=ugreen] at ([xshift=-0.1em,yshift=3em]box1.west){ST\\Encoder};
\node[anchor=west,font=\scriptsize,align=center,text=red] at ([xshift=0.1em,yshift=5em]box2.east){ST\\Decoder};
\node[anchor=east,font=\scriptsize,align=center,text=ugreen] at ([xshift=-0.1em,yshift=3em]box1.west){ST\\ 编码器};
\node[anchor=west,font=\scriptsize,align=center,text=red] at ([xshift=0.1em,yshift=5em]box2.east){ST\\解码器};
\end{tikzpicture}
\ No newline at end of file
......@@ -2,16 +2,16 @@
\begin{tikzpicture}[node distance = 0,scale = 0.9]
\tikzstyle{every node}=[scale=0.9]
\node(figure)[draw=white,scale=0.4] {\includegraphics[width=0.62\textwidth]{./Chapter17/Figures/figure-bank-without-attention.png}};
\node(river)[word, right of = figure, xshift=5cm, yshift=0.35cm, fill=ugreen!45]{river};
\node(mountain)[word, above of = river, yshift=0.75cm, fill=ugreen!45]{mountain};
\node(child)[word, above of = mountain, yshift=0.75cm, fill=ugreen!15]{child};
\node(man)[word, above of = child, yshift=0.75cm, fill=ugreen!25]{man};
\node(jump)[word, below of = river, yshift=-0.75cm, fill=ugreen!30]{jump};
\node(bank)[word, below of = jump, yshift=-0.75cm, fill=ugreen!65]{bank};
\node(sky)[word, below of = bank, yshift=-0.75cm, fill=ugreen!30]{sky};
\node(tree)[word, below of = sky, yshift=-0.75cm, fill=ugreen!15]{tree};
\node(river)[word, right of = figure, xshift=5cm, yshift=0.35cm, fill=blue!45]{river};
\node(mountain)[word, above of = river, yshift=0.75cm, fill=blue!45]{mountain};
\node(child)[word, above of = mountain, yshift=0.75cm, fill=blue!15]{child};
\node(man)[word, above of = child, yshift=0.75cm, fill=blue!25]{man};
\node(jump)[word, below of = river, yshift=-0.75cm, fill=blue!30]{jump};
\node(bank)[word, below of = jump, yshift=-0.75cm, fill=blue!65]{bank};
\node(sky)[word, below of = bank, yshift=-0.75cm, fill=blue!30]{sky};
\node(tree)[word, below of = sky, yshift=-0.75cm, fill=blue!15]{tree};
\node(cir)[circle,thick, minimum width=0.6cm, xshift=8cm, draw=black]{};
\node(decoder)[rectangle, rounded corners, minimum height=2.2em,minimum width=4.3em, right of = cir,xshift=3cm, draw=black, fill=ugreen!25]{\large{解码器}};
\node(decoder)[rectangle, rounded corners, minimum height=2.2em,minimum width=4.3em, right of = cir,xshift=3cm, draw=black, fill=blue!25]{\large{解码器}};
\node(yn_1)[below of = decoder,yshift=-2cm,scale=1.2]{$\rm y_{n-1}$};
\node(yn_2)[above of = decoder,yshift=2cm,scale=1.2]{$\rm y'_{n-1}$(bank)};
......@@ -22,12 +22,12 @@
\draw[->, thick](yn_1)to([yshift=-0.1cm]decoder.south);
\draw[->, thick]([yshift=0.1cm]decoder.north)to(yn_2);
\draw[->, thick, color=ugreen!45]([xshift=0.05cm]river.east)to([xshift=-0.05cm]cir.west);
\draw[->, thick, color=ugreen!45]([xshift=0.05cm]mountain.east)to([xshift=-0.05cm]cir.west);
\draw[->, thick, color=ugreen!15]([xshift=0.05cm]child.east)to([xshift=-0.05cm]cir.west);
\draw[->, thick, color=ugreen!25]([xshift=0.05cm]man.east)to([xshift=-0.05cm]cir.west);
\draw[->, thick, color=ugreen!30]([xshift=0.05cm]jump.east)to([xshift=-0.05cm]cir.west);
\draw[->, very thick, color=ugreen!65]([xshift=0.05cm]bank.east)to([xshift=-0.05cm]cir.west);
\draw[->, thick, color=ugreen!30]([xshift=0.05cm]sky.east)to([xshift=-0.05cm]cir.west);
\draw[->, thick, color=ugreen!15]([xshift=0.05cm]tree.east)to([xshift=-0.05cm]cir.west);
\draw[->, thick, color=blue!45]([xshift=0.05cm]river.east)to([xshift=-0.05cm]cir.west);
\draw[->, thick, color=blue!45]([xshift=0.05cm]mountain.east)to([xshift=-0.05cm]cir.west);
\draw[->, thick, color=blue!15]([xshift=0.05cm]child.east)to([xshift=-0.05cm]cir.west);
\draw[->, thick, color=blue!25]([xshift=0.05cm]man.east)to([xshift=-0.05cm]cir.west);
\draw[->, thick, color=blue!30]([xshift=0.05cm]jump.east)to([xshift=-0.05cm]cir.west);
\draw[->, very thick, color=blue!65]([xshift=0.05cm]bank.east)to([xshift=-0.05cm]cir.west);
\draw[->, thick, color=blue!30]([xshift=0.05cm]sky.east)to([xshift=-0.05cm]cir.west);
\draw[->, thick, color=blue!15]([xshift=0.05cm]tree.east)to([xshift=-0.05cm]cir.west);
\end{tikzpicture}
\ No newline at end of file
......@@ -4,7 +4,7 @@
\tikzstyle{every node}=[scale=0.75]
\node(encoder)[coder]at (0,0){\large{编码器}};
\node(decoder_1)[coder,above of =encoder,xshift=-1.6cm,yshift=2.8cm,fill=ugreen!20]{\large{解码器}};
\node(decoder_1)[coder,above of =encoder,xshift=-1.6cm,yshift=2.8cm,fill=blue!20]{\large{解码器}};
\node(decoder_2)[coder,above of =encoder, xshift=1.6cm,yshift=2.8cm,fill=yellow!20]{\large{解码器}};
\node(s)[below of = encoder,yshift=-1.8cm,scale=1.2]{$s$};
\node(x)[above of = decoder_1,yshift=1.8cm,scale=1.2]{$x$};
......@@ -16,11 +16,12 @@
\draw[->,thick](encoder.north)--([yshift=0.7cm]encoder.north)--([xshift=-4.16em,yshift=0.7cm]encoder.north)--(decoder_1.south);
\draw[->,thick](encoder.north)--([yshift=0.7cm]encoder.north)--([xshift=4.16em,yshift=0.7cm]encoder.north)--(decoder_2.south);
\node [anchor=north,scale = 1.2](pos1) at (s.south) {(a) 单编码器-双解码器};
\node [anchor=south,scale=1.2] (node1) at ([xshift=-2.0em,yshift=6em]decoder_1.north) {{$x,y$:语言数据}};
\node [anchor=north,scale=1.2] (node2) at ([xshift=0.6em]node1.south){{$s$:语音数据}};
\node [anchor=south,scale=1.2] (node1) at ([xshift=-2.0em,yshift=9em]decoder_1.north) {{$x$:源语言文本数据}};
\node [anchor=north,scale=1.2] (node2) at ([xshift=0.63em]node1.south){{$y$:目标语言文本数据}};
\node [anchor=north,scale=1.2] (node3) at ([xshift=-0.57em]node2.south){{$s$:源语言语音数据}};
%%%%%%%%%%%%%%%%%%%%%%%%级联
\node(encoder-2)[coder]at ([xshift=12.0em]encoder.east){\large{编码器}};
\node(decoder_1-2)[coder,above of =encoder-2,yshift=1.4cm,fill=ugreen!20]{\large{解码器}};
\node(decoder_1-2)[coder,above of =encoder-2,yshift=1.4cm,fill=blue!20]{\large{解码器}};
\node(decoder_2-2)[coder,above of =decoder_1-2, yshift=1.4cm,fill=yellow!20]{\large{解码器}};
\node(s-2)[below of = encoder-2,yshift=-1.8cm,scale=1.2]{$s$};
\node(y-2)[above of = decoder_2-2,yshift=1.8cm,scale=1.2]{$y$};
......@@ -32,7 +33,7 @@
\node [anchor=north,scale = 1.2](pos2) at (s-2.south) {(b) 级联编码器};
%%%%%%%%%%%%%%%%%%%%%%%%联合
\node(encoder-3)[coder]at([xshift=10.0em]encoder-2.east){\large{编码器}};
\node(decoder_1-3)[coder,above of =encoder-3,xshift=-1.6cm,yshift=2.8cm,fill=ugreen!20]{\large{解码器}};
\node(decoder_1-3)[coder,above of =encoder-3,xshift=-1.6cm,yshift=2.8cm,fill=blue!20]{\large{解码器}};
\node(decoder_2-3)[coder,above of =encoder-3, xshift=1.6cm,yshift=2.8cm,fill=yellow!20]{\large{解码器}};
\node(s-3)[below of = encoder-3,yshift=-1.8cm,scale=1.2]{$s$};
\node(y-3)[above of = decoder_2-3,yshift=1.8cm,scale=1.2]{$y$};
......
......@@ -75,7 +75,7 @@
\parinterval 经过上面的描述可以看出,音频的表示实际上是一个非常长的采样点序列,这导致了直接使用现有的深度学习技术处理音频序列较为困难。并且,原始的音频信号中可能包含着较多的噪声、环境声或冗余信息,也会对模型产生干扰。因此,一般会对音频序列进行处理来提取声学特征,具体为将长序列的采样点序列转换为短序列的特征向量序列,再用于下游系统模块。虽然已有一些工作不依赖特征提取,直接在原始的采样点序列上进行声学建模和模型训练\upcite{DBLP:conf/interspeech/SainathWSWV15},但目前的主流方法仍然是基于声学特征进行建模\upcite{DBLP:conf/icassp/MohamedHP12}。
\parinterval 声学特征提取的第一步是预处理。其流程主要是对音频进行预加重、分帧和加窗。预加重用来提升音频信号中的高频部分,目的是使频谱更加平滑。分帧(原理如图\ref{fig:17-3})是基于短时平稳假设,即根据生物学特征,语音信号是一个缓慢变化的过程,10ms$\thicksim$30ms的信号片段是相对平稳的。基于这个假设,一般将每25ms作为一帧来提取特征,这个时间称为{\small\bfnew{帧长}}\index{帧长}(Frame Length)\index{Frame Length}。同时,为了保证不同帧之间的信号平滑性,使每两个相邻帧之间存在一定的重合部分。一般每隔10ms取一帧,这个时长称为{\small\bfnew{帧移}}\index{帧移}(Frame Shift)\index{Frame Shift}。为了缓解分帧带来的频谱泄漏,对每帧的信号进行加窗处理使其幅度在两段渐变到0,一般采用的是{\small\bfnew{汉明窗}}\index{汉明窗}(Hamming)\index{Hamming}{\color{red} 参考文献!!!}
\parinterval 声学特征提取的第一步是预处理。其流程主要是对音频进行预加重、分帧和加窗。预加重用来提升音频信号中的高频部分,目的是使频谱更加平滑。分帧(原理如图\ref{fig:17-3})是基于短时平稳假设,即根据生物学特征,语音信号是一个缓慢变化的过程,10ms$\thicksim$30ms的信号片段是相对平稳的。基于这个假设,一般将每25ms作为一帧来提取特征,这个时间称为{\small\bfnew{帧长}}\index{帧长}(Frame Length)\index{Frame Length}。同时,为了保证不同帧之间的信号平滑性,使每两个相邻帧之间存在一定的重合部分。一般每隔10ms取一帧,这个时长称为{\small\bfnew{帧移}}\index{帧移}(Frame Shift)\index{Frame Shift}。为了缓解分帧带来的频谱泄漏,对每帧的信号进行加窗处理使其幅度在两端渐变到0,一般采用的是{\small\bfnew{汉明窗}}\index{汉明窗}(Hamming)\index{Hamming}\upcite{洪青阳2020语音识别原理与应用}。
%----------------------------------------------------------------------------------------------------
\begin{figure}[htp]
\centering
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论