Commit 6c6d089a by xuchen

modify the implementation of conformer

parent 922ef3d9
train-subset: train_st
valid-subset: dev_st
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-params:
#load-pretrained-encoder-from:
arch: s2t_conformer_s
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
#adam_betas: (0.9,0.98)
ctc-weight: 0.3
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
conv-kernel-sizes: 5,5
conv-channels: 1024
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
macaron-style: true
use-cnn-module: true
cnn-module-kernel: 31
#decoder-embed-dim: 256
#decoder-ffn-embed-dim: 2048
#decoder-attention-heads: 4
#attention-dropout: 0.1
#activation-dropout: 0.1
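
The settings above form the YAML training configuration for the s2t_conformer_s model. As a rough illustration of how such a flat key-value file maps onto fairseq-train command-line options, here is a minimal Python sketch; the helper and its conversion rules are assumptions for illustration, and the repository's own run scripts may handle these keys differently.

# Hypothetical helper: turn the YAML training config above into
# fairseq-train style "--key value" arguments. Illustrative only;
# the repository's run scripts are the authoritative way to launch training.
import sys
import yaml  # requires PyYAML


def yaml_to_cli_args(path):
    with open(path) as f:
        config = yaml.safe_load(f)
    args = []
    for key, value in config.items():
        flag = "--" + key.replace("_", "-")
        if value is True:
            # boolean switches such as no-progress-bar are passed without a value
            args.append(flag)
        elif value is False or value is None:
            continue
        else:
            args.extend([flag, str(value)])
    return args


if __name__ == "__main__":
    # e.g. python config_to_args.py conformer.yaml
    # (file names here are hypothetical, not taken from the commit)
    print(" ".join(yaml_to_cli_args(sys.argv[1])))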
@@ -92,7 +92,7 @@ class S2TConformerEncoder(S2TTransformerEncoder):
     def __init__(self, args, task=None, embed_tokens=None):
         super().__init__(args, task, embed_tokens)
-        self.transformer_layers = nn.ModuleList(
+        self.conformer_layers = nn.ModuleList(
             [ConformerEncoderLayer(args) for _ in range(args.encoder_layers)]
         )
@@ -107,7 +107,7 @@ class S2TConformerEncoder(S2TTransformerEncoder):
         x = self.dropout_module(x)
         positions = self.dropout_module(positions)
 
-        for layer in self.transformer_layers:
+        for layer in self.conformer_layers:
             x = layer(x, encoder_padding_mask, pos_emb=positions)
 
         if self.layer_norm is not None:
@@ -9,7 +9,6 @@ import torch
 import torch.nn as nn
 from fairseq import utils
 from fairseq.modules import LayerNorm, MultiheadAttention, RelPositionMultiheadAttention, ConvolutionModule
-# from .layer_norm import LayerNorm
 from fairseq.modules.fairseq_dropout import FairseqDropout
 from fairseq.modules.quant_noise import quant_noise
 from torch import Tensor
@@ -66,17 +65,17 @@ class ConformerEncoderLayer(nn.Module):
                 self.quant_noise_block_size,
             )
             self.macaron_norm = LayerNorm(self.embed_dim)
-            self.ff_scale = 0.5
+            self.ffn_scale = 0.5
         else:
             self.macaron_fc1 = None
             self.macaron_fc2 = None
             self.macaron_norm = None
-            self.ff_scale = 1.0
+            self.ffn_scale = 1.0
 
         if args.use_cnn_module:
             self.conv_norm = LayerNorm(self.embed_dim)
             self.conv_module = ConvolutionModule(self.embed_dim, args.cnn_module_kernel, self.activation_fn)
-            self.final_norm(self.embed_dim)
+            self.final_norm = LayerNorm(self.embed_dim)
         else:
             self.conv_norm = False
             self.conv_module = None
@@ -96,7 +95,7 @@ class ConformerEncoderLayer(nn.Module):
             self.quant_noise_block_size,
         )
-        self.ff_norm = LayerNorm(self.embed_dim)
+        self.ffn_norm = LayerNorm(self.embed_dim)
 
     def build_fc1(self, input_dim, output_dim, q_noise, qn_block_size):
         return quant_noise(
@@ -178,7 +177,7 @@ class ConformerEncoderLayer(nn.Module):
             if self.normalize_before:
                 x = self.macaron_norm(x)
             x = self.macaron_fc2(self.activation_dropout_module(self.activation_fn(self.macaron_fc1(x))))
-            x = residual + self.ff_scale * self.dropout_module(x)
+            x = residual + self.ffn_scale * self.dropout_module(x)
             if not self.normalize_before:
                 x = self.macaron_norm(x)
@@ -214,23 +213,23 @@ class ConformerEncoderLayer(nn.Module):
         if self.conv_module is not None:
             residual = x
             if self.normalize_before:
-                x = self.norm_conv(x)
+                x = self.conv_norm(x)
             x = residual + self.dropout_module(self.conv_module(x))
             if not self.normalize_before:
-                x = self.norm_conv(x)
+                x = self.conv_norm(x)
 
         residual = x
         if self.normalize_before:
-            x = self.ff_norm(x)
+            x = self.ffn_norm(x)
         x = self.activation_fn(self.fc1(x))
         x = self.activation_dropout_module(x)
         x = self.fc2(x)
         x = self.dropout_module(x)
-        x = self.residual_connection(x, residual)
+        x = self.residual_connection(self.ffn_scale * x, residual)
         if not self.normalize_before:
-            x = self.ff_norm(x)
+            x = self.ffn_norm(x)
 
         if self.conv_module is not None:
-            x = self.norm_final(x)
+            x = self.final_norm(x)
 
         return x
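
For orientation, the layer above follows the macaron-style Conformer block of Gulati et al. (2020): a half-step feed-forward module scaled by ffn_scale = 0.5, multi-head self-attention, the convolution module, a second half-step feed-forward, and a closing LayerNorm. Below is a minimal pre-norm sketch of that computation order; the sub-modules are simplified stand-ins (a plain depthwise Conv1d instead of ConvolutionModule, standard MultiheadAttention instead of the relative-position variant), not the repository's actual API.

import torch
import torch.nn as nn


class MacaronConformerBlockSketch(nn.Module):
    """Illustrative macaron-style Conformer block (pre-norm):
    1/2 FFN -> self-attention -> convolution -> 1/2 FFN -> final LayerNorm."""

    def __init__(self, dim=256, ffn_dim=2048, heads=4, kernel=31, dropout=0.1):
        super().__init__()
        self.ffn_scale = 0.5  # half-step feed-forward scaling (macaron style)

        def ffn():
            # pre-norm feed-forward sub-block
            return nn.Sequential(
                nn.LayerNorm(dim),
                nn.Linear(dim, ffn_dim), nn.ReLU(), nn.Dropout(dropout),
                nn.Linear(ffn_dim, dim), nn.Dropout(dropout),
            )

        self.macaron_ffn = ffn()                  # first half-step FFN
        self.attn_norm = nn.LayerNorm(dim)
        self.attn = nn.MultiheadAttention(dim, heads, dropout=dropout)
        self.conv_norm = nn.LayerNorm(dim)
        # depthwise conv as a stand-in for ConvolutionModule; padding keeps length
        self.conv = nn.Conv1d(dim, dim, kernel, padding=kernel // 2, groups=dim)
        self.ffn = ffn()                          # second half-step FFN
        self.final_norm = nn.LayerNorm(dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, key_padding_mask=None):
        # x: (seq_len, batch, dim), matching fairseq's time-first convention
        x = x + self.ffn_scale * self.macaron_ffn(x)
        attn_in = self.attn_norm(x)
        attn_out, _ = self.attn(attn_in, attn_in, attn_in,
                                key_padding_mask=key_padding_mask)
        x = x + self.dropout(attn_out)
        conv_in = self.conv_norm(x).permute(1, 2, 0)        # (batch, dim, seq)
        x = x + self.dropout(self.conv(conv_in).permute(2, 0, 1))
        x = x + self.ffn_scale * self.ffn(x)
        return self.final_norm(x)

In the actual ConformerEncoderLayer, the attention is RelPositionMultiheadAttention fed with pos_emb, and ConvolutionModule typically wraps a pointwise/GLU/depthwise convolution stack; only the residual and half-step scaling structure is reproduced here.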