Commit 6c6d089a by xuchen

modify the implementation of conformer

parent 922ef3d9
train-subset: train_st
valid-subset: dev_st
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-params:
#load-pretrained-encoder-from:
arch: s2t_conformer_s
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
#adam_betas: (0.9,0.98)
ctc-weight: 0.3
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
conv-kernel-sizes: 5,5
conv-channels: 1024
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
macaron-style: true
use-cnn-module: true
cnn-module-kernel: 31
#decoder-embed-dim: 256
#decoder-ffn-embed-dim: 2048
#decoder-attention-heads: 4
#attention-dropout: 0.1
#activation-dropout: 0.1
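
The recipe above is a flat key/value file, presumably consumed by a launcher script that expands each key into a fairseq-train flag. A minimal sketch under that assumption (the helper below is illustrative, not part of this commit): boolean keys become bare switches, everything else becomes a flag/value pair, and commented-out keys are skipped by the YAML parser.

import sys
import yaml  # requires PyYAML

def yaml_to_flags(path):
    # Expand a flat training YAML (like the recipe above) into CLI flags.
    # True values become bare switches (e.g. --no-progress-bar); everything
    # else becomes "--key value". Lines starting with "#" are skipped.
    with open(path) as f:
        cfg = yaml.safe_load(f)
    flags = []
    for key, value in cfg.items():
        if value is True:
            flags.append(f"--{key}")
        else:
            flags.extend([f"--{key}", str(value)])
    return flags

if __name__ == "__main__":
    # e.g. python yaml_to_flags.py conformer.yaml | xargs fairseq-train DATA_DIR
    print(" ".join(yaml_to_flags(sys.argv[1])))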
@@ -92,7 +92,7 @@ class S2TConformerEncoder(S2TTransformerEncoder):
     def __init__(self, args, task=None, embed_tokens=None):
         super().__init__(args, task, embed_tokens)
-        self.transformer_layers = nn.ModuleList(
+        self.conformer_layers = nn.ModuleList(
             [ConformerEncoderLayer(args) for _ in range(args.encoder_layers)]
         )
@@ -107,7 +107,7 @@ class S2TConformerEncoder(S2TTransformerEncoder):
         x = self.dropout_module(x)
         positions = self.dropout_module(positions)
-        for layer in self.transformer_layers:
+        for layer in self.conformer_layers:
             x = layer(x, encoder_padding_mask, pos_emb=positions)
         if self.layer_norm is not None:
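
One practical consequence of this rename: module parameters are registered under the attribute name, so checkpoints saved before this commit carry keys like transformer_layers.0.fc1.weight while the new code expects conformer_layers.0.fc1.weight. A minimal sketch of the key remapping that loading an old checkpoint would need (hypothetical helper, not part of the commit):

def remap_old_encoder_keys(state_dict):
    # Checkpoints saved before the rename use "transformer_layers.N.*";
    # rewrite those keys to the new "conformer_layers.N.*" layout.
    return {
        key.replace("transformer_layers.", "conformer_layers."): value
        for key, value in state_dict.items()
    }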
@@ -9,7 +9,6 @@ import torch
 import torch.nn as nn
 from fairseq import utils
 from fairseq.modules import LayerNorm, MultiheadAttention, RelPositionMultiheadAttention, ConvolutionModule
-# from .layer_norm import LayerNorm
 from fairseq.modules.fairseq_dropout import FairseqDropout
 from fairseq.modules.quant_noise import quant_noise
 from torch import Tensor
@@ -66,17 +65,17 @@ class ConformerEncoderLayer(nn.Module):
                 self.quant_noise_block_size,
             )
             self.macaron_norm = LayerNorm(self.embed_dim)
-            self.ff_scale = 0.5
+            self.ffn_scale = 0.5
         else:
             self.macaron_fc1 = None
             self.macaron_fc2 = None
             self.macaron_norm = None
-            self.ff_scale = 1.0
+            self.ffn_scale = 1.0
         if args.use_cnn_module:
             self.conv_norm = LayerNorm(self.embed_dim)
             self.conv_module = ConvolutionModule(self.embed_dim, args.cnn_module_kernel, self.activation_fn)
-            self.final_norm(self.embed_dim)
+            self.final_norm = LayerNorm(self.embed_dim)
         else:
             self.conv_norm = False
             self.conv_module = None
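
Note that the old line self.final_norm(self.embed_dim) called a non-existent attribute instead of assigning it; the new line creates the LayerNorm properly. ConvolutionModule is constructed with the embedding dimension, the kernel size from --cnn-module-kernel (31 in the recipe above), and the layer's activation. A self-contained sketch of the standard Conformer convolution block it corresponds to (pointwise conv with GLU gating, depthwise conv, norm, activation, pointwise conv); the fork's actual module may differ in detail:

import torch.nn as nn
import torch.nn.functional as F

class ConvolutionModuleSketch(nn.Module):
    # Sketch of the standard Conformer convolution block, not the fork's
    # exact code: pointwise conv + GLU, depthwise conv, BatchNorm,
    # activation, pointwise conv.
    def __init__(self, embed_dim=256, kernel_size=31):
        super().__init__()
        self.pointwise_conv1 = nn.Conv1d(embed_dim, 2 * embed_dim, 1)
        self.depthwise_conv = nn.Conv1d(
            embed_dim, embed_dim, kernel_size,
            padding=(kernel_size - 1) // 2, groups=embed_dim,
        )
        self.norm = nn.BatchNorm1d(embed_dim)
        self.activation = nn.SiLU()  # "swish" in the Conformer paper
        self.pointwise_conv2 = nn.Conv1d(embed_dim, embed_dim, 1)

    def forward(self, x):  # x: (batch, time, channels)
        x = x.transpose(1, 2)  # -> (batch, channels, time) for Conv1d
        x = F.glu(self.pointwise_conv1(x), dim=1)  # gating halves channels
        x = self.activation(self.norm(self.depthwise_conv(x)))
        x = self.pointwise_conv2(x)
        return x.transpose(1, 2)  # back to (batch, time, channels)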
@@ -96,7 +95,7 @@ class ConformerEncoderLayer(nn.Module):
                 self.quant_noise_block_size,
             )
-        self.ff_norm = LayerNorm(self.embed_dim)
+        self.ffn_norm = LayerNorm(self.embed_dim)

     def build_fc1(self, input_dim, output_dim, q_noise, qn_block_size):
         return quant_noise(
@@ -178,7 +177,7 @@ class ConformerEncoderLayer(nn.Module):
             if self.normalize_before:
                 x = self.macaron_norm(x)
             x = self.macaron_fc2(self.activation_dropout_module(self.activation_fn(self.macaron_fc1(x))))
-            x = residual + self.ff_scale * self.dropout_module(x)
+            x = residual + self.ffn_scale * self.dropout_module(x)
             if not self.normalize_before:
                 x = self.macaron_norm(x)
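
This is the macaron half-step feed-forward: the FFN output is added back with ffn_scale = 0.5 when macaron style is enabled (1.0 otherwise, per the __init__ hunk above). A self-contained sketch of that residual pattern, with dimensions taken from the recipe (256/2048) but otherwise illustrative:

import torch
import torch.nn as nn

class MacaronFFN(nn.Module):
    # Minimal sketch of the macaron half-step FFN residual used above:
    #   x = x + ffn_scale * Dropout(FFN(LayerNorm(x))), ffn_scale = 0.5.
    def __init__(self, embed_dim=256, ffn_dim=2048, dropout=0.1):
        super().__init__()
        self.norm = nn.LayerNorm(embed_dim)
        self.fc1 = nn.Linear(embed_dim, ffn_dim)
        self.fc2 = nn.Linear(ffn_dim, embed_dim)
        self.dropout = nn.Dropout(dropout)
        self.ffn_scale = 0.5  # half-step residual, as in the layer above

    def forward(self, x):
        residual = x
        x = self.norm(x)  # pre-norm path (normalize_before)
        x = self.fc2(self.dropout(torch.relu(self.fc1(x))))
        return residual + self.ffn_scale * self.dropout(x)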
@@ -214,23 +213,23 @@ class ConformerEncoderLayer(nn.Module):
         if self.conv_module is not None:
             residual = x
             if self.normalize_before:
-                x = self.norm_conv(x)
+                x = self.conv_norm(x)
             x = residual + self.dropout_module(self.conv_module(x))
             if not self.normalize_before:
-                x = self.norm_conv(x)
+                x = self.conv_norm(x)

         residual = x
         if self.normalize_before:
-            x = self.ff_norm(x)
+            x = self.ffn_norm(x)
         x = self.activation_fn(self.fc1(x))
         x = self.activation_dropout_module(x)
         x = self.fc2(x)
         x = self.dropout_module(x)
-        x = self.residual_connection(x, residual)
+        x = self.residual_connection(self.ffn_scale * x, residual)
         if not self.normalize_before:
-            x = self.ff_norm(x)
+            x = self.ffn_norm(x)

         if self.conv_module is not None:
-            x = self.norm_final(x)
+            x = self.final_norm(x)
         return x
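
Taken together, the renamed attributes spell out the standard Conformer block (Gulati et al., 2020). In the pre-norm configuration (normalize_before), the forward pass paraphrases to the ordering below; the self-attention sub-block sits between the hunks shown, and the imported RelPositionMultiheadAttention supplies the relative positional encoding:

# Pre-norm Conformer block, paraphrased from the forward pass above
# (ffn_scale is 0.5 with macaron style, 1.0 otherwise):
# x = x + ffn_scale * FFN_macaron(LN_macaron(x))  # half-step macaron FFN
# x = x + MHSA(LN_attn(x), pos_emb)               # relative-position self-attention
# x = x + Conv(LN_conv(x))                        # convolution module (kernel 31)
# x = x + ffn_scale * FFN(LN_ffn(x))              # second half-step FFN
# x = LN_final(x)                                 # final_norm, only when use_cnn_module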