Commit 919a3e57 by xuchen

add covost recipes

parent 9e958e0c
#!/usr/bin/env bash
set -e
eval=1
lcrm=1
tokenizer=0
vocab_type=unigram
vocab_size=5000
use_raw_audio=0
speed_perturb=0
dataset=mustc
root_dir=~/st
code_dir=${root_dir}/Fairseq-S2T
org_data_dir=${root_dir}/data/${dataset}
data_dir=${root_dir}/data/${dataset}/st
use_specific_dict=0
specific_prefix=st
specific_dir=${root_dir}/data/mustc/st
asr_vocab_prefix=spm_unigram10000_st_share
src_lang=en
tgt_lang=zh
subsets=(2019)
splits=$(echo ${subsets[*]} | sed 's/ /_/g')
cmd="python ${code_dir}/examples/speech_to_text/prep_audio_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}
--task asr
--src-lang ${src_lang}
--tgt-lang ${tgt_lang}
--splits ${splits}
--vocab-type ${vocab_type}
--vocab-size ${vocab_size}"
if [[ ${use_raw_audio} -eq 1 ]]; then
cmd="$cmd
--raw"
fi
if [[ ${use_specific_dict} -eq 1 ]]; then
cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}
cmd="$cmd
--asr-prefix ${asr_vocab_prefix}"
fi
if [[ ${speed_perturb} -eq 1 ]]; then
cmd="$cmd
--speed-perturb"
fi
if [[ ${lcrm} -eq 1 ]]; then
cmd="$cmd
--lowercase-src
--rm-punc-src"
fi
if [[ ${tokenizer} -eq 1 ]]; then
cmd="$cmd
--tokenizer"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
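# Note on the lcrm flag above: it expands to --lowercase-src --rm-punc-src, i.e. the
# source transcript is lowercased and stripped of punctuation before vocabulary learning.
# A minimal Python sketch of that normalization, assuming simple ASCII punctuation rules
# (the authoritative logic lives in prep_audio_data.py):

import string

def lcrm(text: str) -> str:
    # Lowercase + remove punctuation, as --lowercase-src --rm-punc-src are
    # assumed to do; sketch only, see prep_audio_data.py for the real rules.
    text = text.lower()
    return text.translate(str.maketrans("", "", string.punctuation)).strip()

print(lcrm("Hello, World!"))  # -> "hello world"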
# Append-based Interpolation Augmentation
inter-mixup: True
inter-mixup-layer: -1
inter-mixup-decoder-layer: 0
inter-mixup-prob: 1.0
inter-mixup-ratio: 1.0
inter-mixup-beta: 0.2
inter-mixup-keep-org: True
inter-mixup-decoder-emb: True
ctc-mixup-consistent-weight: 0
inter-ctc-mixup-consistent-weight: 0
mixup-consistent-weight: 0
mixup-no-hard-loss: False
no-specaugment: False
layer-out-norm: False
inter-mixup-ratio-decay: False
inter-mixup-ratio-decay-params: 20000,40000,0
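# The block above configures append-based interpolation augmentation (inter-mixup):
# a coefficient drawn from Beta(inter-mixup-beta, inter-mixup-beta) interpolates pairs
# of utterances at the chosen layer (-1 = the input features), and
# inter-mixup-keep-org: True appends the mixed samples to the original batch instead
# of replacing it. A hedged PyTorch sketch of that step:

import torch

def append_mixup(x, beta=0.2, keep_org=True):
    # x: (batch, time, dim) padded features; pair each item with a shuffled partner
    lam = torch.distributions.Beta(beta, beta).sample()
    perm = torch.randperm(x.size(0))
    mixed = lam * x + (1 - lam) * x[perm]
    # keep-org appends rather than replaces; targets are interpolated with the same lam
    return (torch.cat([x, mixed], dim=0) if keep_org else mixed), perm, lam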
# Append-based Interpolation Augmentation
inter-mixup: True
inter-mixup-layer: -1
inter-mixup-decoder-layer: 0
inter-mixup-prob: 1.0
inter-mixup-ratio: 1.0
inter-mixup-beta: 0.2
inter-mixup-keep-org: True
inter-mixup-decoder-emb: True
ctc-mixup-consistent-weight: 0.15
inter-ctc-mixup-consistent-weight: 0.1
mixup-consistent-weight: 0.5
mixup-no-hard-loss: False
no-specaugment: False
layer-out-norm: False
inter-mixup-ratio-decay: False
inter-mixup-ratio-decay-params: 20000,40000,0
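# This variant additionally enables consistency regularization: with nonzero
# ctc-/inter-ctc-/mixup-consistent-weight, the prediction on a mixed input is pulled
# toward the lam-interpolation of the predictions on its two source utterances.
# One such term, sketched under that assumption:

import torch.nn.functional as F

def mixup_consistency(logp_mixed, logp_a, logp_b, lam, weight=0.5):
    # logp_*: log-probabilities; the target is the interpolated distribution
    target = lam * logp_a.exp() + (1 - lam) * logp_b.exp()
    return weight * F.kl_div(logp_mixed, target, reduction="batchmean")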
arch: s2t_transformer_s
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-embed-norm: True
encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
attention-dropout: 0.1
activation-dropout: 0.1
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
train-subset: train_mix_1,train_mix_2,train_mix_3,train_mix_4,train_mix_5,train_mix_6,train_mix_7,train_mix_8,train_mix_9,train_mix_10,train_mix_11,train_mix_12,train_mix_13,train_mix_14,train_mix_15,train_mix_16
#train-subset: train_all
valid-subset: dev_en-de
ignore-prefix-size: 1
sharded-data-load: True
max-epoch: 300
max-update: 300000
patience: 20
post-process: sentencepiece
#best_checkpoint_metric: loss
#maximize_best_checkpoint_metric: False
eval-bleu: True
eval-bleu-args: {"beam": 5, "lenpen": 1.0, "prefix_size": 1}
eval-bleu-detok: moses
eval-bleu-remove-bpe: sentencepiece
eval-bleu-print-samples: True
best_checkpoint_metric: bleu
maximize_best_checkpoint_metric: True
# no-epoch-checkpoints: True
validate-interval: 4
keep-last-epochs: 10
keep-best-checkpoints: 10
num-workers: 8
no-progress-bar: True
log-interval: 100
seed: 1
label-smoothing: 0.1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
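# These recipe files are not read by fairseq directly: the repo's run scripts flatten
# every key: value pair into a --key value argument for fairseq-train (bare True values
# become store_true flags). Roughly, and only as an illustration of that convention:

def yaml_to_flags(path):
    # Flatten "key: value" recipe lines into fairseq-train CLI flags (sketch);
    # values like (0.9,0.98) must additionally be quoted for the shell.
    flags = []
    for line in open(path):
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        key, _, value = line.partition(":")
        key, value = key.strip(), value.strip()
        flags.append(f"--{key}" if value == "True" else f"--{key} {value}")
    return " ".join(flags)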
arch: s2t_transformer_m
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-embed-norm: True
encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
layer-padding-mask: True
ctc-weight: 0.3
share-ctc-and-embed: True
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
text-no-pos-emb: True
textual-encoder-embed-norm: False
textual-encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
acoustic-encoder: transformer
adapter: none
# load-pretrained-encoder-from:
# load-pretrained-acoustic-encoder-from:
# load-pretrained-text-encoder-from:
# load-pretrained-decoder-from:
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
text-no-pos-emb: True
textual-encoder-embed-norm: False
textual-encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
acoustic-encoder: transformer
adapter: none
# load-pretrained-encoder-from:
# load-pretrained-acoustic-encoder-from:
# load-pretrained-text-encoder-from:
# load-pretrained-decoder-from:
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
text-no-pos-emb: True
textual-encoder-embed-norm: False
textual-encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
acoustic-encoder: pds
adapter: none
encoder-embed-dim: 512
pds-stages: 4
pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 512_512_512_512
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 4_4_4_4
pds-attn-heads: 8_8_8_8
# load-pretrained-encoder-from:
# load-pretrained-acoustic-encoder-from:
# load-pretrained-text-encoder-from:
# load-pretrained-decoder-from:
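# With acoustic-encoder: pds, downsampling is done progressively across the stages
# rather than by the conv1d frontend: each stage shrinks the sequence by its entry in
# pds-ratios, so 2_2_1_2 yields an overall 8x reduction (matching the *_8 suffix of the
# pdss2t recipes below, just as 2_2_2_2 gives 16x and 2_2_2_2_2 gives 32x). A quick check:

from functools import reduce

pds_ratios = [2, 2, 1, 2]  # pds-ratios: 2_2_1_2
print(reduce(lambda a, b: a * b, pds_ratios))  # 8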
use-enc-dlcl: True
use-dec-dlcl: True
arch: s2t_dual
asr-encoder: transformer
mt-encoder-layers: 6
mt-encoder: transformer
encoder-collaboration-mode: parallel
decoder-collaboration-mode: parallel
encoder-league-s1-ratio: 0.5
encoder-league-s2-ratio: 0.5
encoder-league-drop-net: False
encoder-league-drop-net-prob: 0.2
encoder-league-drop-net-mix: False
decoder-league-s1-ratio: 0.5
decoder-league-s2-ratio: 0.5
decoder-league-drop-net: False
decoder-league-drop-net-prob: 0.0
decoder-league-drop-net-mix: False
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: join_speech_and_text_loss
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-asr-encoder-from:
#load-pretrained-mt-encoder-from:
#load-pretrained-decoder-from:
compression-metric: threshold
compression-mode: create
compression-layers: 6,9
compression-threshold: 0.99
compression-norm: True
compression-pos: True
inter-ctc-weight: 0.2
inter-ctc-layers: 6,9
share-inter-ctc: True
ctc-pae: none
# ctc-pae: inter_league
# ctc-pae-ground-truth-ratio: 0.1
# pae-gumbel: True
# pae-distribution-hard: True
# pae-drop-prob: 0.0
# pae-distribution-cutoff: 10
# share-pae-and-ctc: True
# pae-embed-norm: True
# pae-out-norm: True
# ctc-self-distill-weight: 1
# target-ctc-self-distill-weight: 1
# ctc-self-distill-prob: 0.1
# cal-all-ctc: True
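# inter-ctc-weight attaches auxiliary CTC losses to the listed intermediate encoder
# layers (6 and 9 here), with share-inter-ctc reusing a single projection across them;
# the final objective is assumed to combine them additively with the main losses:

def total_loss(ce, ctc, inter_ctcs, ctc_weight=0.3, inter_ctc_weight=0.2):
    # inter_ctcs: CTC losses taken after the layers in inter-ctc-layers,
    # averaged before weighting (assumed combination rule)
    return ce + ctc_weight * ctc + inter_ctc_weight * sum(inter_ctcs) / len(inter_ctcs)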
inter-mixup: True
inter-mixup-layer: -1
inter-mixup-decoder-layer: 0
inter-mixup-prob: 1.0
inter-mixup-ratio: 1.0
inter-mixup-beta: 0.2
inter-mixup-keep-org: False
inter-mixup-decoder-emb: False
ctc-mixup-consistent-weight: 0
inter-ctc-mixup-consistent-weight: 0
mixup-consistent-weight: 0
mixup-no-hard-loss: False
no-specaugment: False
layer-out-norm: False
inter-mixup-ratio-decay: False
inter-mixup-ratio-decay-params: 20000,40000,0
inter-ctc-mlo: 1:2:3
arch: s2t_multibranch
junior-acoustic-encoder: transformer
acoustic-adapter: none
textual-adapter: none
junior-acoustic-encoder-layers: 12
senior-acoustic-encoder-layers: 6
textual-encoder-layers: 6
# collaboration-direction: none
# collaboration-direction: acoustic
# collaboration-direction: textual
collaboration-direction: both
collaboration-start: 0:0
collaboration-step: 1:1
# encoder-collaboration-mode: serial
# decoder-collaboration-mode: serial
encoder-collaboration-mode: parallel
decoder-collaboration-mode: parallel
use-raw-text: False
modality-switch: False
text-to-hidden-progress: 1:0:500
encoder-league-s1-ratio: 0.5
encoder-league-s2-ratio: 0.5
encoder-league-out-norm: False
encoder-league-gated: False
encoder-league-drop-net: False
encoder-league-drop-net-prob: 0.2
encoder-league-drop-net-mix: False
decoder-league-s1-ratio: 0.5
decoder-league-s2-ratio: 0.5
decoder-league-out-norm: False
decoder-league-gated: False
decoder-league-drop-net: False
decoder-league-drop-net-prob: 0.0
decoder-league-drop-net-mix: False
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
encoder-embed-norm: True
encoder-no-scale-embedding: True
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-attention-heads: 4
decoder-layers: 6
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
attention-dropout: 0.1
activation-dropout: 0.1
#adapter: inter_league
#adapter: none
#adapter-embed-norm: True
#adapter-out-norm: True
#share-adapter-and-ctc: True
#share-adapter-and-embed: True
#load-pretrained-junior-acoustic-encoder-from:
#load-pretrained-senior-acoustic-encoder-from:
#load-pretrained-textual-encoder-from:
#load-pretrained-decoder-from:
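# The league options above govern how the two branches of s2t_multibranch are fused:
# in parallel collaboration mode the branch outputs are combined as a weighted sum with
# the s1/s2 ratios, and drop-net randomly keeps a single branch during training.
# A sketch, assuming the ratios act as fixed interpolation weights:

import torch

def league_fuse(x1, x2, s1_ratio=0.5, s2_ratio=0.5,
                drop_net=False, drop_prob=0.2, training=True):
    if drop_net and training and torch.rand(()) < drop_prob:
        # drop-net: fall back to one randomly chosen branch
        return x1 if torch.rand(()) < 0.5 else x2
    return s1_ratio * x1 + s2_ratio * x2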
arch: s2t_ctc
encoder-type: sate
criterion: ctc
zero_infinity: True
xctc-weight: 1.0
ctc-weight: 1.0
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
text-no-pos-emb: True
textual-encoder-embed-norm: False
textual-encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
acoustic-encoder: transformer
adapter: none
arch: s2t_ctc
encoder-type: sate
criterion: ctc
zero_infinity: True
xctc-weight: 1.0
ctc-weight: 1.0
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
text-no-pos-emb: True
textual-encoder-embed-norm: False
textual-encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 8
acoustic-encoder: transformer
adapter: none
arch: s2t_ctc
encoder-type: sate
criterion: ctc
zero_infinity: True
xctc-weight: 1.0
ctc-weight: 1.0
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
text-no-pos-emb: True
textual-encoder-embed-norm: False
textual-encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-layers: 12
text-encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 8
acoustic-encoder: pds
adapter: none
encoder-embed-dim: 512
pds-stages: 4
pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 512_512_512_512
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 4_4_4_4
pds-attn-heads: 8_8_8_8
arch: pdss2t_transformer_s_16
encoder-embed-dim: 256
pds-stages: 4
pds-layers: 2_2_6_2
pds-ratios: 2_2_2_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: pdss2t_transformer_s_32
encoder-embed-dim: 256
pds-stages: 5
pds-layers: 2_2_3_3_2
pds-ratios: 2_2_2_2_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1_1
pds-kernel-sizes: 5_5_5_5_5
pds-ffn-ratios: 8_8_8_8_8
pds-attn-heads: 4_4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: pdss2t_transformer_s_8
encoder-embed-dim: 256
pds-stages: 3
pds-layers: 4_4_4
pds-ratios: 2_2_1
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1
pds-kernel-sizes: 5_5_5
pds-ffn-ratios: 8_8_8
pds-attn-heads: 4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: pdss2t_transformer_s_8
encoder-embed-dim: 256
pds-stages: 4
pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: pdss2t_transformer_s_8
encoder-embed-dim: 256
pds-stages: 3
pds-layers: 4_4_4
pds-ratios: 2_2_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1
pds-kernel-sizes: 5_5_5
pds-ffn-ratios: 8_8_8
pds-attn-heads: 4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: pdss2t_transformer_m_16
encoder-embed-dim: 512
pds-stages: 4
pds-layers: 2_2_6_2
pds-ratios: 2_2_2_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 512_512_512_512
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 4_4_4_4
pds-attn-heads: 8_8_8_8
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
dropout: 0.15
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: pdss2t_transformer_m_32
encoder-embed-dim: 512
pds-stages: 5
pds-layers: 2_2_3_3_2
pds-ratios: 2_2_2_2_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 512_512_512_512_512
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1_1
pds-kernel-sizes: 5_5_5_5_5
pds-ffn-ratios: 4_4_4_4_4
pds-attn-heads: 8_8_8_8_8
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
dropout: 0.15
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
arch: pdss2t_transformer_m_8
encoder-embed-dim: 512
pds-stages: 4
pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 512_512_512_512
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 4_4_4_4
pds-attn-heads: 8_8_8_8
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
dropout: 0.15
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: pdss2t_transformer_s_16
encoder-embed-dim: 512
pds-stages: 4
pds-layers: 3_3_8_4
pds-ratios: 2_2_2_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 512_512_512_512
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 8_8_8_8
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
dropout: 0.15
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 18
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: pdss2t_transformer_s_16
encoder-embed-dim: 512
pds-stages: 5
pds-layers: 3_3_4_4_4
pds-ratios: 2_2_2_2_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 512_512_512_512_512
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1_1
pds-kernel-sizes: 5_5_5_5_5
pds-ffn-ratios: 8_8_8_8_8
pds-attn-heads: 8_8_8_8_8
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
dropout: 0.15
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 18
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: pdss2t_transformer_s_8
encoder-embed-dim: 512
pds-stages: 4
pds-layers: 4_5_5_4
pds-ratios: 2_2_1_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 512_512_512_512
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 8_8_8_8
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
dropout: 0.15
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 18
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: pdss2t_transformer_sd_16
encoder-embed-dim: 256
pds-stages: 4
pds-layers: 3_3_8_4
pds-ratios: 2_2_2_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 18
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: pdss2t_transformer_sd_32
encoder-embed-dim: 256
pds-stages: 5
pds-layers: 3_3_4_4_4
pds-ratios: 2_2_2_2_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1_1
pds-kernel-sizes: 5_5_5_5_5
pds-ffn-ratios: 8_8_8_8_8
pds-attn-heads: 4_4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 18
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: pdss2t_transformer_sd_8
encoder-embed-dim: 256
pds-stages: 4
pds-layers: 4_5_5_4
pds-ratios: 2_2_1_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 18
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
# Append-based Interpolation Augmentation
inter-mixup: True
inter-mixup-layer: -1
inter-mixup-decoder-layer: 0
inter-mixup-prob: 1.0
inter-mixup-ratio: 1.0
inter-mixup-beta: 0.2
inter-mixup-keep-org: True
inter-mixup-decoder-emb: True
mixup-no-hard-loss: False
no-specaugment: False
layer-out-norm: False
inter-mixup-ratio-decay: False
inter-mixup-ratio-decay-params: 20000,40000,0
# Bilingual CTC
share-ctc-and-embed: True
share-xctc-and-embed: True
ctc-weight: 0.2
xctc-weight: 0.1
# InterCTC
inter-ctc-weight: 0.1
inter-ctc-layers: 6,9
share-inter-ctc: True
inter-xctc-weight: 0.05
inter-xctc-layers: 6,9
# Prediction-aware encoding
ctc-pae: inter_league
xctc-pae: inter_league
pae-unnorm-input: True
ctc-mixup-consistent-weight: 0.1
inter-ctc-mixup-consistent-weight: 0.05
xctc-mixup-consistent-weight: 0.05
inter-xctc-mixup-consistent-weight: 0.25
mixup-consistent-weight: 0.5
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
layer-padding-mask: True
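# Here the mixup recipe is combined with bilingual CTC: ctc-weight supervises the
# encoder with the source transcript while xctc-weight adds a CTC branch over
# target-language text, each with intermediate (inter-) variants at layers 6 and 9.
# Assuming the weights simply scale additive loss terms:

def bilingual_ctc_loss(ce, ctc_src, xctc_tgt, inter_ctc, inter_xctc):
    return (ce
            + 0.2 * ctc_src       # ctc-weight: source-transcript CTC
            + 0.1 * xctc_tgt      # xctc-weight: target-text (cross-lingual) CTC
            + 0.1 * inter_ctc     # inter-ctc-weight at layers 6,9
            + 0.05 * inter_xctc)  # inter-xctc-weight at layers 6,9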
# Append-based Interpolation Augmentation
inter-mixup: True
inter-mixup-layer: -1
inter-mixup-decoder-layer: 0
inter-mixup-prob: 1.0
inter-mixup-ratio: 1.0
inter-mixup-beta: 0.2
inter-mixup-keep-org: True
inter-mixup-decoder-emb: True
mixup-no-hard-loss: True
no-specaugment: False
layer-out-norm: False
inter-mixup-ratio-decay: False
inter-mixup-ratio-decay-params: 20000,40000,0
# Bilingual CTC
share-ctc-and-embed: True
share-xctc-and-embed: True
ctc-weight: 0.2
xctc-weight: 0.1
# InterCTC
inter-ctc-weight: 0.1
inter-ctc-layers: 6,9
share-inter-ctc: True
inter-xctc-weight: 0.05
inter-xctc-layers: 6,9
# Prediction-aware encoding
ctc-pae: inter_league
xctc-pae: inter_league
pae-unnorm-input: True
ctc-mixup-consistent-weight: 0.1
inter-ctc-mixup-consistent-weight: 0.05
xctc-mixup-consistent-weight: 0.05
inter-xctc-mixup-consistent-weight: 0.25
mixup-consistent-weight: 0.5
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
layer-padding-mask: True
# Append-based Interpolation Augmentation
inter-mixup: True
inter-mixup-layer: -1
inter-mixup-decoder-layer: 0
inter-mixup-prob: 1.0
inter-mixup-ratio: 1.0
inter-mixup-beta: 0.2
inter-mixup-keep-org: True
inter-mixup-decoder-emb: True
mixup-no-hard-loss: False
no-specaugment: False
layer-out-norm: False
inter-mixup-ratio-decay: False
inter-mixup-ratio-decay-params: 20000,40000,0
# MTL
ctc-weight: 0.3
inter-ctc-weight: 0.2
inter-ctc-layers: 6,9
share-inter-ctc: True
share-ctc-and-embed: True
ctc-pae: inter_league
pae-unnorm-input: True
ctc-mixup-consistent-weight: 0.15
inter-ctc-mixup-consistent-weight: 0.1
mixup-consistent-weight: 0.5
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
layer-padding-mask: True
# Append-based Interpolation Augmentation
inter-mixup: True
inter-mixup-layer: -1
inter-mixup-decoder-layer: 0
inter-mixup-prob: 1.0
inter-mixup-ratio: 1.0
inter-mixup-beta: 0.2
inter-mixup-keep-org: True
inter-mixup-decoder-emb: True
mixup-no-hard-loss: True
no-specaugment: False
layer-out-norm: False
inter-mixup-ratio-decay: False
inter-mixup-ratio-decay-params: 20000,40000,0
# MTL
ctc-weight: 0.3
share-ctc-and-embed: True
inter-ctc-weight: 0.2
inter-ctc-layers: 6,9
share-inter-ctc: True
ctc-pae: inter_league
pae-unnorm-input: True
ctc-mixup-consistent-weight: 0.15
inter-ctc-mixup-consistent-weight: 0.1
mixup-consistent-weight: 0.5
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
layer-padding-mask: True
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
textual-encoder-embed-norm: True
textual-encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
acoustic-encoder: transformer
adapter: inter_league
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
# Bilingual CTC
share-ctc-and-embed: True
share-xctc-and-embed: True
ctc-weight: 0.2
xctc-weight: 0.1
# InterCTC
inter-ctc-weight: 0.1
inter-ctc-layers: 6,9
share-inter-ctc: True
inter-xctc-weight: 0.05
inter-xctc-layers: 4
# Prediction-aware encoding
ctc-pae: inter_league
xctc-pae: inter_league
pae-unnorm-input: True
# Curriculum learning mixing
xctc-pae-ground-truth-ratio: 0.1
xctc-pae-ground-truth-only-mistake: True
pae-oracle-smooth: True
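# The curriculum learning mixing block feeds ground truth back into the
# prediction-aware encoding: with xctc-pae-ground-truth-ratio: 0.1, about 10% of
# positions swap the model's XCTC prediction for the reference label, and
# ground-truth-only-mistake restricts the swap to wrongly predicted positions
# (pae-oracle-smooth then smooths the injected one-hot labels). A hedged sketch:

import torch

def mix_ground_truth(pred_ids, gold_ids, ratio=0.1, only_mistake=True):
    # pred_ids/gold_ids: (batch, time) aligned label ids (assumed layout)
    mask = torch.rand(pred_ids.shape) < ratio
    if only_mistake:
        mask &= pred_ids != gold_ids  # only touch wrong predictions
    return torch.where(mask, gold_ids, pred_ids)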
arch: s2t_transformer_m
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-embed-norm: True
encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 18
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
# Bilingual CTC
share-ctc-and-embed: True
share-xctc-and-embed: True
ctc-weight: 0.2
ctc-layer: 12
xctc-weight: 0.1
# InterCTC
inter-ctc-weight: 0.1
inter-ctc-layers: 6,9
share-inter-ctc: True
inter-xctc-weight: 0.05
inter-xctc-layers: 16
# Prediction-aware encoding
ctc-pae: inter_league
xctc-pae: inter_league
pae-unnorm-input: True
# Curriculum learning mixing
xctc-pae-ground-truth-ratio: 0.1
xctc-pae-ground-truth-only-mistake: True
pae-oracle-smooth: True
arch: s2t_transformer_m
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-embed-norm: True
encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 18
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
# Bilingual CTC
share-ctc-and-embed: True
share-xctc-and-embed: True
ctc-weight: 0.2
ctc-layer: 12
#xctc-weight: 0.1
# InterCTC
inter-ctc-weight: 0.1
inter-ctc-layers: 6,9
share-inter-ctc: True
#inter-xctc-weight: 0.05
#inter-xctc-layers: 16
# Prediction-aware encoding
ctc-pae: inter_league
#xctc-pae: inter_league
pae-unnorm-input: True
# Curriculum learning mixing
#xctc-pae-ground-truth-ratio: 0.1
#xctc-pae-ground-truth-only-mistake: True
#pae-oracle-smooth: True
arch: s2t_transformer_m
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-embed-norm: True
encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 18
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
# Bilingual CTC
share-ctc-and-embed: True
share-xctc-and-embed: True
ctc-weight: 0.2
xctc-weight: 0.1
# InterCTC
inter-ctc-weight: 0.1
inter-ctc-layers: 6,9,12,15
share-inter-ctc: True
inter-xctc-weight: 0.05
inter-xctc-layers: 6,9,12,15
# Prediction-aware encoding
ctc-pae: inter_league
xctc-pae: inter_league
pae-unnorm-input: True
# Curriculum learning mixing
xctc-pae-ground-truth-ratio: 0.1
xctc-pae-ground-truth-only-mistake: True
pae-oracle-smooth: True
arch: s2t_transformer_m
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-embed-norm: True
encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 18
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
# Bilingual CTC
share-ctc-and-embed: True
share-xctc-and-embed: True
ctc-weight: 0.2
xctc-weight: 0.1
# InterCTC
inter-ctc-weight: 0.1
inter-ctc-layers: 6,9,12,15
share-inter-ctc: True
inter-xctc-weight: 0.05
inter-xctc-layers: 6,9,12,15
# Prediction-aware encoding
ctc-pae: inter_league
xctc-pae: inter_league
pae-unnorm-input: True
# Curriculum learning mixing
xctc-pae-ground-truth-ratio: 0.1
xctc-pae-ground-truth-only-mistake: True
pae-oracle-smooth: True
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
text-no-pos-emb: True
textual-encoder-embed-norm: False
textual-encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
acoustic-encoder: transformer
adapter: inter_league
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
# CTC & XCTC
share-ctc-and-embed: True
share-xctc-and-embed: True
ctc-weight: 0.2
xctc-weight: 0.2
# InterCTC
share-inter-ctc: True
inter-ctc-weight: 0.1
inter-ctc-layers: 6,9
inter-xctc-weight: 0.1
inter-xctc-layers: 4
# Prediction-aware encoding
ctc-pae: inter_league
xctc-pae: inter_league
pae-unnorm-input: True
# Cross-layer attn
xctc-cross-attn: True
cross-attn-start-layer: 3
cross-attn-layer: 2
cross-attn-collaboration-mode: serial
cross-attn-league-drop-net: True
cross-attn-league-drop-net-prob: 0.1
# Curriculum learning mixing
xctc-pae-ground-truth-ratio: 0.5
xctc-pae-ground-truth-only-mistake: True
pae-oracle-smooth: True
arch: s2t_ctc
encoder-type: sate
criterion: ctc
zero_infinity: True
xctc-weight: 1.0
ctc-weight: 1.0
share-ctc-and-embed: True
share-xctc-and-embed: True
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
text-no-pos-emb: True
textual-encoder-embed-norm: False
textual-encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 8
acoustic-encoder: transformer
adapter: inter_league
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
# InterCTC
share-inter-ctc: True
inter-ctc-weight: 1.0
inter-ctc-layers: 6,9
inter-xctc-weight: 1.0
inter-xctc-layers: 6,9
# Prediction-aware encoding
ctc-pae: inter_league
xctc-pae: inter_league
# Cross-layer attn
xctc-cross-attn: True
cross-attn-start-layer: 4
cross-attn-layer: 3
cross-attn-collaboration-mode: serial
cross-attn-league-drop-net: True
cross-attn-league-drop-net-prob: 0.1
# Curriculum learning mixing
xctc-pae-ground-truth-ratio: 0.8
xctc-pae-ground-truth-only-mistake: True
pae-oracle-smooth: True
arch: s2t_ctc
encoder-type: transformer
criterion: ctc
zero_infinity: True
xctc-weight: 1.0
ctc-weight: 1.0
share-ctc-and-embed: True
share-xctc-and-embed: True
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 18
encoder-attention-heads: 8
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
# InterCTC
share-inter-ctc: True
inter-ctc-weight: 1.0
inter-ctc-layers: 6,9,12,15
inter-xctc-weight: 1.0
inter-xctc-layers: 6,9,12,15
# Prediction-aware encoding
ctc-pae: inter_league
xctc-pae: inter_league
pae-unnorm-input: True
# Cross-layer attn
# xctc-cross-attn: True
# cross-attn-start-layer: 4
# cross-attn-layer: 3
# cross-attn-collaboration-mode: serial
# cross-attn-league-drop-net: True
# cross-attn-league-drop-net-prob: 0.1
# Curriculum learning mixing
xctc-pae-ground-truth-ratio: 0.8
xctc-pae-ground-truth-only-mistake: True
pae-oracle-smooth: True
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
ctc-weight: 0.3
share-ctc-and-embed: True
criterion: label_smoothed_cross_entropy_with_ctc
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
textual-encoder-embed-norm: True
textual-encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
acoustic-encoder: transformer
adapter: inter_league
# adapter: none
# adapter-embed-norm: True
# adapter-out-norm: True
# load-pretrained-encoder-from:
# load-pretrained-acoustic-encoder-from:
# load-pretrained-text-encoder-from:
# load-pretrained-decoder-from:
encoder-attention-type: rel_pos
# encoder-attention-type: relative
# decoder-attention-type: relative
# max-encoder-relative-length: 100
# max-decoder-relative-length: 20
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
textual-encoder-embed-norm: True
textual-encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
acoustic-encoder: transformer
adapter: inter_league
#adapter: none
#adapter-embed-norm: True
#adapter-out-norm: True
#share-adapter-and-ctc: True
#share-adapter-and-embed: True
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
textual-encoder-embed-norm: True
textual-encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
acoustic-encoder: transformer
adapter: inter_league
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-embed-norm: True
encoder-no-scale-embedding: True
textual-encoder-embed-norm: True
textual-encoder-no-scale-embedding: True
encoder-normalize-before: True
decoder-normalize-before: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
acoustic-encoder: pds
adapter: inter_league
encoder-embed-dim: 512
pds-stages: 4
pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 512_512_512_512
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 4_4_4_4
pds-attn-heads: 8_8_8_8
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
textual-encoder-embed-norm: True
textual-encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 18
text-encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
acoustic-encoder: transformer
adapter: inter_league
#adapter-embed-norm: True
#adapter-out-norm: True
#share-adapter-and-ctc: True
#share-adapter-and-embed: True
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-embed-norm: True
encoder-no-scale-embedding: True
textual-encoder-embed-norm: True
textual-encoder-no-scale-embedding: True
encoder-normalize-before: True
decoder-normalize-before: True
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
acoustic-encoder: pds
adapter: inter_league
encoder-embed-dim: 256
pds-stages: 4
pds-layers: 2_2_6_2
pds-ratios: 2_2_2_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-embed-norm: True
encoder-no-scale-embedding: True
textual-encoder-embed-norm: True
textual-encoder-no-scale-embedding: True
encoder-normalize-before: True
decoder-normalize-before: True
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
acoustic-encoder: pds
adapter: inter_league
encoder-embed-dim: 256
pds-stages: 4
pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-embed-norm: True
encoder-no-scale-embedding: True
textual-encoder-embed-norm: True
textual-encoder-no-scale-embedding: True
encoder-normalize-before: True
decoder-normalize-before: True
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
acoustic-encoder: pds
adapter: inter_league
encoder-embed-dim: 256
pds-stages: 3
pds-layers: 4_4_4
pds-ratios: 2_2_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1
pds-kernel-sizes: 5_5_5
pds-ffn-ratios: 8_8_8
pds-attn-heads: 4_4_4
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-embed-norm: True
encoder-no-scale-embedding: True
textual-encoder-embed-norm: True
textual-encoder-no-scale-embedding: True
encoder-normalize-before: True
decoder-normalize-before: True
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
acoustic-encoder: pds
adapter: inter_league
encoder-embed-dim: 256
pds-stages: 4
pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-embed-norm: True
encoder-no-scale-embedding: True
textual-encoder-embed-norm: True
textual-encoder-no-scale-embedding: True
encoder-normalize-before: True
decoder-normalize-before: True
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 18
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
acoustic-encoder: pds
adapter: inter_league
encoder-embed-dim: 256
pds-stages: 4
pds-layers: 4_5_5_4
pds-ratios: 2_2_1_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_w2v2_transformer
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-embed-norm: True
encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
attention-dropout: 0.1
activation-dropout: 0.1
w2v2-model-path: /home/xuchen/st/models/w2v2/wav2vec_small.pt
freeze-w2v: False
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
xctc-weight: 0.3
share-xctc-and-embed: True
\ No newline at end of file
inter-xctc-weight: 0.2
inter-xctc-layers: 6,9
xctc-pae: none
# xctc-pae: inter_league
xctc-cross-attn: False
cross-attn-start-layer: 7
cross-attn-layer: 6
cross-attn-collaboration-mode: parallel
cross-attn-league-s1-ratio: 0.5
cross-attn-league-s2-ratio: 0.5
cross-attn-league-out-norm: False
cross-attn-league-gated: False
cross-attn-league-drop-net: False
cross-attn-league-drop-net-prob: 0.2
cross-attn-league-drop-net-mix: False
# xctc-pae-ground-truth-ratio: 0.1
# xctc-pae-ground-truth-ratio-adaptive: True
# xctc-pae-ground-truth-only-mistake: True
# pae-oracle-smooth: True
# pae-gumbel: True
# pae-distribution-hard: True
# pae-drop-prob: 0.0
# pae-distribution-cutoff: 10
# share-pae-and-xctc: True
# pae-embed-norm: True
# pae-out-norm: True
# ctc-self-distill-weight: 1
# target-ctc-self-distill-weight: 1
# ctc-self-distill-prob: 0.1
# cal-all-ctc: True
\ No newline at end of file
#!/usr/bin/env bash
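# Decode ${test_subset} with run.sh stage 2; optionally pass the experiment
# name as the first argument.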
gpu_num=1
data_tag=st
test_subset=(dev_en-de test_en-de)
exp_name=
if [ "$#" -eq 1 ]; then
exp_name=$1
fi
sacrebleu=1
ctc_infer=0
n_average=1
beam_size=5
infer_ctc_weight=0
len_penalty=1.0
max_tokens=20000
batch_size=0
infer_debug=0
dec_model=checkpoint_best.pt
cmd="./run.sh
--stage 2
--stop_stage 2
--gpu_num ${gpu_num}
--exp_name ${exp_name}
--sacrebleu ${sacrebleu}
--n_average ${n_average}
--beam_size ${beam_size}
--len_penalty ${len_penalty}
--batch_size ${batch_size}
--max_tokens ${max_tokens}
--dec_model ${dec_model}
--ctc_infer ${ctc_infer}
--infer_ctc_weight ${infer_ctc_weight}
--infer_debug ${infer_debug}
"
if [[ -n ${data_tag} ]]; then
cmd="$cmd --data_tag ${data_tag}"
fi
if [[ ${#test_subset[@]} -ne 0 ]]; then
subsets=$(echo ${test_subset[*]} | sed 's/ /,/g')
cmd="$cmd --test_subset ${subsets}"
fi
echo $cmd
eval $cmd
set -e
gpu_num=1
root_dir=/home/xuchen/st/Fairseq-S2T
ckpt=/home/xuchen/st/checkpoints/mustc-v2/st
model_txt=$1
set=$2
test_subset=$3
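# ${model_txt} lists one checkpoint directory (relative to ${ckpt}) per line;
# avg_10_checkpoint.pt is preferred over checkpoint_best.pt, and the selected
# models are joined with ':' for fairseq ensemble decoding.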
#data_dir=/home/xuchen/st/data/mustc-v2/st_lcrm/en-de
#test_subset=(tst-COMMON)
data_dir=/media/data/tst/$set/en-de
#test_subset=(office)
#test_subset=(webrtc1)
#test_subset=(adap2)
data_config=config_st_share.yaml
result_file=./result
beam_size=5
lenpen=0.6
max_tokens=10000
models=()
i=0
for line in `cat $model_txt`; do
i=`expr $i + 1`
model_dir=$ckpt/$line
[[ ! -d $model_dir ]] && echo $model_dir && exit 1;
if [[ -f $model_dir/avg_10_checkpoint.pt ]]; then
model=$model_dir/avg_10_checkpoint.pt
else
model=$model_dir/checkpoint_best.pt
fi
[[ ! -f $model ]] && echo $model && exit 1;
models[$i]=$model
done
models=`echo ${models[*]} | sed 's/ /:/g'`
res_dir=$ckpt/ensemble/$set
i=0
while :
do
if [[ -d $res_dir/$i ]]; then
i=`expr $i + 1`
else
res_dir=$res_dir/$i
break
fi
done
mkdir -p $res_dir
cp $model_txt $res_dir
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then
device=()
else
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
fi
export CUDA_VISIBLE_DEVICES=${device}
for subset in ${test_subset[@]}; do
subset=${subset}_st
cmd="python ${root_dir}/fairseq_cli/generate.py
${data_dir}
--config-yaml ${data_config}
--gen-subset ${subset}
--task speech_to_text
--path ${models}
--results-path ${res_dir}
--skip-invalid-size-inputs-valid-test
--max-tokens ${max_tokens}
--beam ${beam_size}
--lenpen ${lenpen}
--scoring sacrebleu"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
eval $cmd
tail -n 1 ${res_dir}/generate-${subset}.txt
cd $res_dir
evaluate.sh translation-${subset}.txt $set
cd -
done
#!/usr/bin/env bash
set -e
ref=$1
gen=$2
tokenizer=$3
lang=$4
lang_pair=en-${lang}
record=$(mktemp -t temp.record.XXXXXX)
if [[ ${tokenizer} -eq 1 ]]; then
echo "MultiBLEU" > ${record}
cmd="multi-bleu.perl ${ref} < ${gen}"
eval $cmd | head -n 1 >> ${record}
cmd="detokenizer.perl -q -l ${lang} --threads 32 < ${ref} > ${ref}.detok"
eval $cmd
cmd="detokenizer.perl -q -l ${lang} --threads 32 < ${gen} > ${gen}.detok"
eval $cmd
ref=${ref}.detok
gen=${gen}.detok
fi
echo "SacreBLEU" >> ${record}
cmd="cat ${gen} | sacrebleu ${ref} -m bleu -w 4 -l ${lang_pair} | jq -r .score"
eval $cmd >> ${record}
cat ${record}
rm ${record}
#!/usr/bin/env bash
set -e
infer_dir=$1
tag=$2
s2s_infer_file=${infer_dir}/$3
org_ctc_infer_file=${infer_dir}/$4
ref=$5
tokenizer=$6
lang=$7
idx=${infer_dir}/${tag}_idx
ctc_infer=${infer_dir}/${tag}_ctc_infer
ctc_infer_sort=${infer_dir}/${tag}_ctc_infer_sort
if [[ ! -f ${ctc_infer_sort} ]]; then
cut -f1 ${s2s_infer_file} > ${idx}
paste ${idx} ${org_ctc_infer_file} > ${ctc_infer}
sort -n -t $'\t' ${ctc_infer} | cut -f2 > ${ctc_infer_sort}
fi
gen=${ctc_infer_sort}
./cal_bleu.sh ${ref} ${gen} ${tokenizer} ${lang}
\ No newline at end of file
import sys
import jiwer
import jiwer.transforms as tr
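# Usage: python3 cal_wer.py <ref-file> <hyp-file>
# One sentence per line; both sides are standardized (lowercased,
# punctuation removed) before jiwer computes WER and CER.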
ref_file = sys.argv[1]
hyp_file = sys.argv[2]
wer_standardize = tr.Compose(
[
tr.SubstituteRegexes({r"<<unk>>": r"@"}),
tr.ToLowerCase(),
tr.RemovePunctuation(),
tr.Strip(),
tr.ExpandCommonEnglishContractions(),
tr.RemoveKaldiNonWords(),
tr.RemoveWhiteSpace(replace_by_space=True),
tr.ReduceToListOfListOfWords(),
]
)
cer_standardize = tr.Compose(
[
tr.SubstituteRegexes({r"<<unk>>": r"@"}),
tr.ToLowerCase(),
tr.RemovePunctuation(),
tr.Strip(),
tr.ReduceToListOfListOfChars(),
]
)
with open(ref_file, "r") as f:
    ref_lines = f.readlines()
with open(hyp_file, "r") as f:
    hyp_lines = f.readlines()
wer = jiwer.wer(ref_lines, hyp_lines,
truth_transform=wer_standardize,
hypothesis_transform=wer_standardize,
)
cer = jiwer.cer(ref_lines, hyp_lines,
truth_transform=cer_standardize,
hypothesis_transform=cer_standardize,
)
print("WER: %.4f" % wer)
print("CER: %.4f" % cer)
#!/usr/bin/env bash
set -e
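# Usage: cal_wer.sh <infer-dir> <tag> <s2s-infer-file> <ctc-infer-file> <ref>
# Restores the original utterance order of the CTC output, then computes
# WER/CER against <ref> with cal_wer.py.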
infer_dir=$1
tag=$2
s2s_infer_file=${infer_dir}/$3
org_ctc_infer_file=${infer_dir}/$4
ref=$5
idx=${infer_dir}/${tag}_idx
ctc_infer=${infer_dir}/${tag}_ctc_infer
ctc_infer_sort=${infer_dir}/${tag}_ctc_infer_sort
cut -f1 ${s2s_infer_file} > ${idx}
paste ${idx} ${org_ctc_infer_file} > ${ctc_infer}
sort -n -t $'\t' ${ctc_infer} | cut -f2 > ${ctc_infer_sort}
python3 ./cal_wer.py ${ref} ${ctc_infer_sort}
\ No newline at end of file
import sys
import csv
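# Usage: python3 extract_txt_from_tsv.py <in.tsv> <out-file> <column-name>
# Writes the given column of a tab-separated file (QUOTE_NONE), one value per line.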
tsv_file = sys.argv[1]
out_file = sys.argv[2]
extract_item = sys.argv[3]
with open(tsv_file) as f:
reader = csv.DictReader(
f,
delimiter="\t",
quotechar=None,
doublequote=False,
lineterminator="\n",
quoting=csv.QUOTE_NONE,
)
samples = [dict(e) for e in reader]
with open(out_file, "w", encoding="utf-8") as fw:
    for s in samples:
        if extract_item in s:
            fw.write("%s\n" % s[extract_item])
        else:
            print("Error in sample:")
            print(s)
            sys.exit(1)
#!/usr/bin/env bash
gpu_num=4
cmd="sh train.sh"
while :
do
record=$(mktemp -t temp.record.XXXXXX)
gpustat > $record
all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
count=0
for dev in ${all_devices[@]}
do
line=$((dev + 2))
use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
if [[ $use -lt 100 ]]; then
device[$count]=$dev
count=$((count + 1))
if [[ $count -eq $gpu_num ]]; then
break
fi
fi
done
if [[ ${#device[@]} -lt $gpu_num ]]; then
sleep 60s
else
echo "Run $cmd"
eval $cmd
sleep 10s
exit
fi
done
#!/usr/bin/env bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey);
# Arnab Ghoshal, Karel Vesely
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED ON AN *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Parse command-line options.
# To be sourced by another script (as in ". parse_options.sh").
# Option format is: --option-name arg
# and shell variable "option_name" gets set to value "arg."
# The exception is --help, which takes no arguments, but prints the
# $help_message variable (if defined).
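#
# Example: a script that defines "gpu_num=1" and sources this file can be
# invoked as "./run.sh --gpu_num 4", which sets the shell variable gpu_num to 4.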
###
### The --config file options have lower priority to command line
### options, so we need to import them first...
###
# Now import all the configs specified by command-line, in left-to-right order
for ((argpos=1; argpos<$#; argpos++)); do
if [ "${!argpos}" == "--config" ]; then
argpos_plus1=$((argpos+1))
config=${!argpos_plus1}
[ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
. $config # source the config file.
fi
done
###
### Now we process the command line options
###
i=1
argv="$@"
while true; do
key=${!i}
j=$(($i + 1))
value=${!j}
[ -z "${!i:-}" ] && break; # break if there are no arguments
case "${key}" in
# If the enclosing script is called with --help option, print the help
# message and exit. Scripts should put help messages in $help_message
--help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
else printf "$help_message\n" 1>&2 ; fi;
exit 0 ;;
--*=*) echo "$0: options to scripts must be of the form --name value, got '${key}'"
exit 1 ;;
# If the first command-line argument begins with "--" (e.g. --foo-bar),
# then work out the variable name as $name, which will equal "foo_bar".
--*) name=`echo "${key}" | sed s/^--// | sed s/-/_/g`;
# Next we test whether the variable in question is undefined -- if so it's
# an invalid option and we die. Note: $0 evaluates to the name of the
# enclosing script.
# The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
# is undefined. We then have to wrap this test inside "eval" because
# foo_bar is itself inside a variable ($name).
#eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
oldval="`eval echo \\$$name`";
# Work out whether we seem to be expecting a Boolean argument.
if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
was_bool=true;
else
was_bool=false;
fi
# Set the variable to the right value-- the escaped quotes make it work if
# the option had spaces, like --cmd "queue.pl -sync y"
# echo $name
eval $name=\"${value}\";
# Check that Boolean-valued arguments are really Boolean.
if $was_bool && [[ "${value}" != "true" && "${value}" != "false" ]]; then
echo "$0: expected \"true\" or \"false\": ${key} ${value}" 1>&2
exit 1;
fi
# shift 2;
i=$(($i + 2))
;;
*) break;
esac
done
# Check for an empty argument to the --cmd option, which can easily occur as a
# result of scripting errors.
[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
true; # so this script returns exit code 0.
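# get_devices <gpu_num> <use_cpu>: wait until <gpu_num> GPUs with less than
# 1000 MB of memory in use are free and echo their ids, comma-separated
# (-1, i.e. CPU, when <use_cpu> is 1 and no GPUs become free). Typical use:
#   source ./local/utils.sh
#   device=$(get_devices ${gpu_num} 0)
#   export CUDA_VISIBLE_DEVICES=${device}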
get_devices(){
gpu_num=$1
use_cpu=$2
device=()
while :
do
record=$(mktemp -t temp.record.XXXXXX)
gpustat > $record
all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
count=0
for dev in ${all_devices[@]}
do
line=$((dev + 2))
use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
if [[ $use -lt 1000 ]]; then
device[$count]=$dev
count=$((count + 1))
if [[ $count -eq $gpu_num ]]; then
break
fi
fi
done
if [[ ${#device[@]} -lt $gpu_num ]]; then
if [[ $use_cpu -eq 1 ]]; then
device=(-1)
else
sleep 60s
fi
else
break
fi
done
echo ${device[*]} | sed 's/ /,/g'
return $?
}
#!/usr/bin/env bash
# Processing CoVoST Datasets
# Copyright 2021 Chen Xu (xuchennlp@outlook.com)
# Set bash to 'debug' mode: it will exit on
# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline', -x 'print commands'.
set -e
#set -u
set -o pipefail
export PYTHONIOENCODING=UTF-8
eval=1
time=$(date "+%m%d_%H%M")
stage=1
stop_stage=2
######## Hardware ########
# Devices
device=(0)
gpu_num=8
update_freq=1
pwd_dir=$PWD
root_dir=${ST_ROOT}
data_root_dir=${root_dir}
code_dir=${root_dir}/S2T
# Dataset
src_lang=en
tgt_lang=multi
lang=en-multi
dataset=covost
data_tag=st
task=speech_to_text
vocab_type=unigram
asr_vocab_size=5000
vocab_size=32000
share_dict=1
speed_perturb=0
lcrm=0
tokenizer=0
use_raw_audio=0
. ./local/parse_options.sh || exit 1;
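# Options parsed here may override the dataset variables above (e.g.
# --dataset, --tgt_lang) before the derived paths below are computed.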
use_specific_dict=0
specific_prefix=valid
specific_dir=${data_root_dir}/data/${dataset}/${lang}/st
asr_vocab_prefix=spm_unigram10000_st_share
st_vocab_prefix=spm_unigram10000_st_share
data_model_subfix=${dataset}/${lang}/${data_tag}
org_data_dir=${data_root_dir}/data/${dataset}
data_dir=${data_root_dir}/data/${data_model_subfix}
lang_list=$(find "$org_data_dir" -maxdepth 1 -type d -name "en-[a-z][a-z]" -printf '%P,')
languages=$(echo "$lang_list" | sed 's/,$//')
train_split=train
valid_split=dev
test_split=test
test_list=$(find "$org_data_dir" -maxdepth 1 -type d -name "en-[a-z][a-z]" -exec basename {} \; | awk -v prefix="test_" '{print prefix $0}')
IFS=','
test_subset="${test_list[*]}"
IFS=$' \t\n'
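# ${languages} and ${test_subset} enumerate the en-xx language-pair
# directories found under ${org_data_dir}.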
# Exp
sub_tag=
exp_prefix=$(date "+%m%d")
extra_tag=
extra_parameter=
exp_tag=baseline
exp_name=
# Training Settings
train_config=base,ctc
fp16=1
max_tokens=20000
step_valid=0
bleu_valid=0
# Decoding Settings
batch_size=0
sacrebleu=1
dec_model=checkpoint_best.pt
ctc_infer=0
infer_ctc_weight=0
ctc_self_ensemble=0
ctc_inter_logit=0
n_average=10
beam_size=5
len_penalty=1.0
epoch_ensemble=0
best_ensemble=1
infer_debug=0
infer_score=0
infer_tag=
infer_parameter=
#infer_parameters="--cal-monotonic-cross-attn-weights --cal-localness --localness-window 0.1 --cal-topk-cross-attn-weights --topk-cross-attn-weights 15 --cal-entropy"
# Parsing Options
if [[ ${share_dict} -eq 1 ]]; then
data_config=config_share.yaml
else
data_config=config.yaml
fi
if [[ ${speed_perturb} -eq 1 ]]; then
data_dir=${data_dir}_sp
exp_prefix=${exp_prefix}_sp
fi
if [[ ${lcrm} -eq 1 ]]; then
data_dir=${data_dir}_lcrm
exp_prefix=${exp_prefix}_lcrm
fi
if [[ ${use_specific_dict} -eq 1 ]]; then
data_dir=${data_dir}_${specific_prefix}
exp_prefix=${exp_prefix}_${specific_prefix}
fi
if [[ ${tokenizer} -eq 1 ]]; then
data_dir=${data_dir}_tok
exp_prefix=${exp_prefix}_tok
fi
if [[ ${use_raw_audio} -eq 1 ]]; then
data_dir=${data_dir}_raw
exp_prefix=${exp_prefix}_raw
fi
export PATH=$PATH:${code_dir}/scripts
. ./local/parse_options.sh || exit 1;
if [[ -z ${exp_name} ]]; then
config_string=${train_config//,/_}
exp_name=${exp_prefix}_${config_string}_${exp_tag}
if [[ -n ${extra_tag} ]]; then
exp_name=${exp_name}_${extra_tag}
fi
if [[ -n ${exp_subfix} ]]; then
exp_name=${exp_name}_${exp_subfix}
fi
fi
ckpt_dir=${root_dir}/checkpoints/
model_dir=${root_dir}/checkpoints/${data_model_subfix}/${sub_tag}/${exp_name}
# Start
cd ${code_dir}
echo "Start Stage: $stage"
echo "Stop Stage: $stop_stage"
if [[ `pip list | grep fairseq | wc -l` -eq 0 ]]; then
echo "Default Stage: env configure"
pip3 install -e ${code_dir}
fi
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "Stage -1: Data Download"
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
### Task dependent. You have to prepare the data yourself in the following part.
echo "Stage 0: ASR Data Preparation"
if [[ ! -e ${data_dir} ]]; then
mkdir -p ${data_dir}
fi
# create ASR vocabulary if necessary
cmd="python3 ${code_dir}/examples/speech_to_text/prep_audio_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}/asr4st
--task asr
--raw
--src-lang ${src_lang}
--splits ${valid_split},${test_split},${train_split}
--vocab-type ${vocab_type}
--vocab-size ${asr_vocab_size}"
if [[ ${lcrm} -eq 1 ]]; then
cmd="$cmd
--lowercase-src
--rm-punc-src"
fi
if [[ ${tokenizer} -eq 1 ]]; then
cmd="$cmd
--tokenizer"
fi
if [[ $eval -eq 1 && ${share_dict} -ne 1 && ${use_specific_dict} -ne 1 ]]; then
echo -e "\033[34mRun command: \n${cmd} \033[0m"
mkdir -p ${data_dir}/asr4st
eval $cmd
asr_prefix=spm_${vocab_type}${asr_vocab_size}_asr
cp -f ${data_dir}/asr4st/${asr_prefix}* ${data_dir}
fi
echo "Stage 0: ST Data Preparation"
cmd="python3 ${code_dir}/examples/speech_to_text/prep_audio_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}
--task st
--joint
--add-src
--languages ${languages}
--splits ${valid_split},${test_split},${train_split}
--cmvn-type utterance
--vocab-type ${vocab_type}
--vocab-size ${vocab_size}"
if [[ ${use_raw_audio} -eq 1 ]]; then
cmd="$cmd
--raw"
fi
if [[ ${use_specific_dict} -eq 1 ]]; then
cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}
cp -r ${specific_dir}/${st_vocab_prefix}.* ${data_dir}
if [[ $share_dict -eq 1 ]]; then
cmd="$cmd
--share
--st-spm-prefix ${st_vocab_prefix}"
else
cmd="$cmd
--st-spm-prefix ${st_vocab_prefix}
--asr-prefix ${asr_vocab_prefix}"
fi
else
if [[ $share_dict -eq 1 ]]; then
cmd="$cmd
--share"
else
cmd="$cmd
--asr-prefix ${asr_prefix}"
fi
fi
if [[ ${speed_perturb} -eq 1 ]]; then
cmd="$cmd
--speed-perturb"
fi
if [[ ${lcrm} -eq 1 ]]; then
cmd="$cmd
--lowercase-src
--rm-punc-src"
fi
if [[ ${tokenizer} -eq 1 ]]; then
cmd="$cmd
--tokenizer"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "Stage 1: Network Training"
[[ ! -d ${data_dir} ]] && echo "The data dir ${data_dir} does not exist!" && exit 1;
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then
device=""
else
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
export CUDA_VISIBLE_DEVICES=${device}
fi
echo -e "data=${data_dir} model=${model_dir}"
if [[ ! -d ${model_dir} ]]; then
mkdir -p ${model_dir}
else
echo "${model_dir} exists."
fi
cp -f ${pwd_dir}/`basename ${BASH_SOURCE[0]}` ${model_dir}
cp -f ${pwd_dir}/train.sh ${model_dir}
train_config=basis,${train_config}
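# The shared basis config is always prepended; each yaml is then passed to
# fairseq as --train-config, --train-config1, ... in list order.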
config_list="${train_config//,/ }"
idx=0
for config in ${config_list[@]}
do
config_path=${pwd_dir}/conf/${config}.yaml
if [[ ! -f ${config_path} ]]; then
echo "No config file ${config_path}"
exit
fi
cp -f ${config_path} ${model_dir}
if [[ $idx -eq 0 ]]; then
extra_parameter="${extra_parameter}
--train-config ${config_path}"
else
extra_parameter="${extra_parameter}
--train-config${idx} ${config_path}"
fi
idx=$((idx + 1))
done
cmd="python3 -u ${code_dir}/fairseq_cli/train.py
${data_dir}
--source-lang ${src_lang}
--target-lang ${tgt_lang}
--config-yaml ${data_config}
--task ${task}
--max-tokens ${max_tokens}
--skip-invalid-size-inputs-valid-test
--update-freq ${update_freq}
--log-interval 100
--save-dir ${model_dir}
--tensorboard-logdir ${model_dir}"
if [[ -n ${extra_parameter} ]]; then
cmd="${cmd}
${extra_parameter}"
fi
if [[ ${gpu_num} -gt 0 ]]; then
cmd="${cmd}
--distributed-world-size $gpu_num
--ddp-backend no_c10d"
fi
if [[ $fp16 -eq 1 ]]; then
cmd="${cmd}
--fp16"
fi
if [[ $step_valid -eq 1 ]]; then
validate_interval=1
save_interval=1
no_epoch_checkpoints=0
save_interval_updates=500
keep_interval_updates=10
fi
if [[ $bleu_valid -eq 1 ]]; then
cmd="$cmd
--eval-bleu
--eval-bleu-args '{\"beam\": 1}'
--eval-tokenized-bleu
--eval-bleu-remove-bpe
--best-checkpoint-metric bleu
--maximize-best-checkpoint-metric"
fi
if [[ -n $no_epoch_checkpoints && $no_epoch_checkpoints -eq 1 ]]; then
cmd="$cmd
--no-epoch-checkpoints"
fi
if [[ -n $validate_interval ]]; then
cmd="${cmd}
--validate-interval $validate_interval "
fi
if [[ -n $save_interval ]]; then
cmd="${cmd}
--save-interval $save_interval "
fi
if [[ -n $save_interval_updates ]]; then
cmd="${cmd}
--save-interval-updates $save_interval_updates"
if [[ -n $keep_interval_updates ]]; then
cmd="${cmd}
--keep-interval-updates $keep_interval_updates"
fi
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
# save info
log=${ckpt_dir}/history.log
echo "${time} | ${data_dir} | ${exp_name} | ${model_dir} " >> $log
tail -n 50 ${log} > tmp.log
mv tmp.log $log
log=${model_dir}/train.log
cmd="${cmd} 2>&1 | tee -a ${log}"
#cmd="${cmd} >> ${log} 2>&1 "
if [[ $eval -eq 1 ]]; then
# tensorboard
port=6666
tensorboard --logdir ${model_dir} --port ${port} --bind_all &
echo "${cmd}" > ${model_dir}/cmd
eval $cmd
#sleep 2s
#tail -n "$(wc -l ${log} | awk '{print $1+1}')" -f ${log}
fi
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "Stage 2: Decoding"
dec_models=
if [[ ${n_average} -eq 1 ]]; then
dec_models=${dec_model}
fi
if [[ ${n_average} -ne 1 ]]; then
# Average models
if [[ ${epoch_ensemble} -eq 1 ]]; then
avg_model=avg_epoch${n_average}_checkpoint.pt
if [[ ! -f ${model_dir}/${avg_model} ]]; then
cmd="python3 ${code_dir}/scripts/average_checkpoints.py
--inputs ${model_dir}
--num-epoch-checkpoints ${n_average}
--output ${model_dir}/${avg_model}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval $cmd
fi
dec_models+=(${avg_model})
fi
if [[ ${best_ensemble} -eq 1 ]]; then
avg_model=avg_best${n_average}_checkpoint.pt
if [[ ! -f ${model_dir}/${avg_model} ]]; then
cmd="python3 ${code_dir}/scripts/average_checkpoints.py
--inputs ${model_dir}
--num-best-checkpoints ${n_average}
--output ${model_dir}/${avg_model}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval $cmd
fi
dec_models+=(${avg_model})
fi
fi
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then
device=""
else
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
export CUDA_VISIBLE_DEVICES=${device}
fi
for dec_model in ${dec_models[@]}; do
suffix=alpha${len_penalty}
model_str=`echo $dec_model | sed -e "s#checkpoint##" | sed "s#.pt##"`
suffix=${suffix}_${model_str}
if [[ ${sacrebleu} -eq 1 ]]; then
suffix=${suffix}_sacrebleu
else
suffix=${suffix}_multibleu
fi
suffix=${suffix}_beam${beam_size}
if [[ ${batch_size} -ne 0 ]]; then
suffix=${suffix}_batch${batch_size}
else
suffix=${suffix}_tokens${max_tokens}
fi
if [[ ${ctc_infer} -eq 1 ]]; then
suffix=${suffix}_ctc
fi
if [[ ${ctc_self_ensemble} -eq 1 ]]; then
suffix=${suffix}_ensemble
fi
if [[ ${ctc_inter_logit} -ne 0 ]]; then
suffix=${suffix}_logit${ctc_inter_logit}
fi
if (( $(echo "${infer_ctc_weight} > 0" | bc -l) )); then
suffix=${suffix}_ctc${infer_ctc_weight}
fi
if [[ ${infer_score} -eq 1 ]]; then
suffix=${suffix}_score
fi
if [[ -n ${infer_tag} ]]; then
suffix=${suffix}_${infer_tag}
fi
suffix=`echo $suffix | sed -e "s#__#_#"`
result_file=${model_dir}/decode_result_${suffix}
[[ -f ${result_file} ]] && rm ${result_file}
test_subset=${test_subset//,/ }
for subset in ${test_subset[@]}; do
subset=${subset}
if [[ ${infer_debug} -ne 0 ]]; then
cmd="python3 -m debugpy --listen 0.0.0.0:5678 --wait-for-client"
else
cmd="python3 "
fi
cmd="$cmd ${code_dir}/fairseq_cli/generate.py
${data_dir}
--config-yaml ${data_config}
--gen-subset ${subset}
--task speech_to_text
--path ${model_dir}/${dec_model}
--results-path ${model_dir}
--batch-size ${batch_size}
--max-tokens ${max_tokens}
--beam ${beam_size}
--prefix-size 1
--skip-invalid-size-inputs-valid-test
--infer-ctc-weight ${infer_ctc_weight}
--lenpen ${len_penalty}"
if [[ ${ctc_infer} -eq 1 ]]; then
cmd="${cmd}
--ctc-infer"
fi
if [[ ${sacrebleu} -eq 1 ]]; then
cmd="${cmd}
--scoring sacrebleu"
if [[ "${tgt_lang}" = "ja" ]]; then
cmd="${cmd}
--sacrebleu-tokenizer ja-mecab"
elif [[ "${tgt_lang}" == "zh" ]]; then
cmd="${cmd}
--sacrebleu-tokenizer zh"
fi
if [[ ${tokenizer} -eq 1 ]]; then
cmd="${cmd}
--tokenizer moses
--source-lang ${src_lang}
--target-lang ${tgt_lang}"
fi
fi
if [[ ${ctc_self_ensemble} -eq 1 ]]; then
cmd="${cmd}
--ctc-self-ensemble"
fi
if [[ ${ctc_inter_logit} -ne 0 ]]; then
cmd="${cmd}
--ctc-inter-logit ${ctc_inter_logit}"
fi
if [[ ${infer_score} -eq 1 ]]; then
cmd="${cmd}
--score-reference"
fi
if [[ -n ${infer_parameter} ]]; then
cmd="${cmd}
${infer_parameter}"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
cd ${code_dir}
if [[ $eval -eq 1 ]]; then
ctc_file=translation-${subset}.ctc
if [[ ${ctc_infer} -eq 1 && -f ${model_dir}/${ctc_file} ]]; then
rm ${model_dir}/${ctc_file}
fi
xctc_file=translation-${subset}.xctc
if [[ ${ctc_infer} -eq 1 && -f ${model_dir}/${xctc_file} ]]; then
rm ${model_dir}/${xctc_file}
fi
eval $cmd
echo "" >> ${result_file}
tail -n 2 ${model_dir}/generate-${subset}.txt >> ${result_file}
mv ${model_dir}/generate-${subset}.txt ${model_dir}/generate-${subset}-${suffix}.txt
mv ${model_dir}/translation-${subset}.txt ${model_dir}/translation-${subset}-${suffix}.txt
cd ${pwd_dir}
if [[ -f ${model_dir}/enc_dump ]]; then
mv ${model_dir}/enc_dump ${model_dir}/dump-${subset}-enc-${suffix}
fi
if [[ -f ${model_dir}/dec_dump ]]; then
mv ${model_dir}/dec_dump ${model_dir}/dump-${subset}-dec-${suffix}
fi
trans_file=translation-${subset}-${suffix}.txt
if [[ ${ctc_infer} -eq 1 && -f ${model_dir}/${ctc_file} ]]; then
ref_file=${model_dir}/${subset}.${src_lang}
if [[ ! -f ${ref_file} ]]; then
python3 ./local/extract_txt_from_tsv.py ${data_dir}/${subset}.tsv ${ref_file} "src_text"
fi
if [[ -f ${ref_file} ]]; then
ctc=$(mktemp -t temp.record.XXXXXX)
cd ./local
./cal_wer.sh ${model_dir} ${subset} ${trans_file} ${ctc_file} ${ref_file} > ${ctc}
cd ..
echo "CTC WER" >> ${result_file}
tail -n 2 ${ctc} >> ${result_file}
src_bleu=$(mktemp -t temp.record.XXXXXX)
cd local
./cal_ctc_bleu.sh ${model_dir} ${subset} ${trans_file} ${ctc_file} ${ref_file} ${tokenizer} ${src_lang} > ${src_bleu}
cd ..
cat ${src_bleu} >> ${result_file}
rm ${ctc} ${src_bleu}
else
echo "No reference for source language."
fi
fi
xctc_file=translation-${subset}.xctc
if [[ ${ctc_infer} -eq 1 && -f ${model_dir}/${xctc_file} ]]; then
ref_file=${model_dir}/${subset}.${tgt_lang}
if [[ ! -f ${ref_file} ]]; then
python3 ./local/extract_txt_from_tsv.py ${data_dir}/${subset}.tsv ${ref_file} "tgt_text"
fi
if [[ -f ${ref_file} ]]; then
xctc=$(mktemp -t temp.record.XXXXXX)
cd local
./cal_wer.sh ${model_dir} ${subset} ${trans_file} ${xctc_file} ${ref_file} > ${xctc}
cd ..
echo "XCTC WER" >> ${result_file}
tail -n 2 ${xctc} >> ${result_file}
tgt_bleu=$(mktemp -t temp.record.XXXXXX)
cd local
./cal_ctc_bleu.sh ${model_dir} ${subset} ${trans_file} ${xctc_file} ${ref_file} ${tokenizer} ${tgt_lang} > ${tgt_bleu}
cd ..
cat ${tgt_bleu} >> ${result_file}
rm ${xctc} ${tgt_bleu}
else
echo "No reference for target language."
fi
fi
fi
done
echo
echo "" >> ${result_file}
cat ${result_file}
done
fi
#!/usr/bin/env bash
# Launch training (and decoding) via run.sh with the selected config list.
gpu_num=8
update_freq=1
max_tokens=40000
extra_tag=
extra_parameter=
#extra_tag="${extra_tag}"
#extra_parameter="${extra_parameter} "
exp_tag=
# Base
#config_list=(base conformer ctc)
# SATE
config_list=(sate conformer ctc)
# PDS
#config_list=(pds_base_8 conformer ctc)
# exp full name
exp_name=
train_config=$(echo ${config_list[*]} | sed 's/ /,/g')
cmd="./run.sh
--stage 1
--stop_stage 2
--gpu_num ${gpu_num}
--update_freq ${update_freq}
--train_config ${train_config}
--max_tokens ${max_tokens}
"
if [[ -n ${exp_name} ]]; then
cmd="$cmd --exp_name ${exp_name}"
fi
if [[ -n ${exp_tag} ]]; then
cmd="$cmd --exp_tag ${exp_tag}"
fi
if [[ -n ${extra_tag} ]]; then
cmd="$cmd --extra_tag ${extra_tag}"
fi
if [[ -n ${extra_parameter} ]]; then
cmd="$cmd --extra_parameter \"${extra_parameter}\""
fi
echo ${cmd}
eval ${cmd}
set -e
eval=1
lcrm=1
tokenizer=0
vocab_type=unigram
vocab_size=5000
use_raw_audio=0
speed_perturb=0
dataset=mustc
root_dir=~/st
code_dir=${root_dir}/Fairseq-S2T
org_data_dir=${root_dir}/data/${dataset}
data_dir=${root_dir}/data/${dataset}/st
use_specific_dict=0
specific_prefix=st
specific_dir=${root_dir}/data/mustc/st
asr_vocab_prefix=spm_unigram10000_st_share
src_lang=en
tgt_lang=zh
subsets=(2019)
splits=$(echo ${subsets[*]} | sed 's/ /_/g')
cmd="python ${code_dir}/examples/speech_to_text/prep_audio_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}
--task asr
--src-lang ${src_lang}
--tgt-lang ${tgt_lang}
--splits ${splits}
--vocab-type ${vocab_type}
--vocab-size ${vocab_size}"
if [[ ${use_raw_audio} -eq 1 ]]; then
cmd="$cmd
--raw"
fi
if [[ ${use_specific_dict} -eq 1 ]]; then
cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}
cmd="$cmd
--asr-prefix ${asr_vocab_prefix}"
fi
if [[ ${speed_perturb} -eq 1 ]]; then
cmd="$cmd
--speed-perturb"
fi
if [[ ${lcrm} -eq 1 ]]; then
cmd="$cmd
--lowercase-src
--rm-punc-src"
fi
if [[ ${tokenizer} -eq 1 ]]; then
cmd="$cmd
--tokenizer"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
# Append-based Interpolation Augmentation
inter-mixup: True
inter-mixup-layer: -1
inter-mixup-decoder-layer: 0
inter-mixup-prob: 1.0
inter-mixup-ratio: 1.0
inter-mixup-beta: 0.2
inter-mixup-keep-org: True
inter-mixup-decoder-emb: True
ctc-mixup-consistent-weight: 0
inter-ctc-mixup-consistent-weight: 0
mixup-consistent-weight: 0
mixup-no-hard-loss: False
no-specaugment: False
layer-out-norm: False
inter-mixup-ratio-decay: False
inter-mixup-ratio-decay-params: 20000,40000,0
\ No newline at end of file
# Append-based Interpolation Augmentation
inter-mixup: True
inter-mixup-layer: -1
inter-mixup-decoder-layer: 0
inter-mixup-prob: 1.0
inter-mixup-ratio: 1.0
inter-mixup-beta: 0.2
inter-mixup-keep-org: True
inter-mixup-decoder-emb: True
ctc-mixup-consistent-weight: 0.15
inter-ctc-mixup-consistent-weight: 0.1
mixup-consistent-weight: 0.5
mixup-no-hard-loss: False
no-specaugment: False
layer-out-norm: False
inter-mixup-ratio-decay: False
inter-mixup-ratio-decay-params: 20000,40000,0
\ No newline at end of file
arch: s2t_transformer_s
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-embed-norm: True
encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
attention-dropout: 0.1
activation-dropout: 0.1
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
#train-subset: train_mix_1,train_mix_2,train_mix_3,train_mix_4,train_mix_5,train_mix_6,train_mix_7,train_mix_8
train-subset: train_all
valid-subset: dev_de-en
#ignore-prefix-size: 1
#sharded-data-load: True
max-epoch: 300
max-update: 300000
patience: 20
post-process: sentencepiece
#best_checkpoint_metric: loss
#maximize_best_checkpoint_metric: False
eval-bleu: True
#eval-bleu-args: {"beam": 5, "lenpen": 1.0, "prefix_size": 1}
eval-bleu-args: {"beam": 5, "lenpen": 1.0}
eval-bleu-detok: moses
eval-bleu-remove-bpe: sentencepiece
eval-bleu-print-samples: True
best_checkpoint_metric: bleu
maximize_best_checkpoint_metric: True
# no-epoch-checkpoints: True
validate-interval: 1
keep-last-epochs: 10
keep-best-checkpoints: 10
num-workers: 8
no-progress-bar: True
log-interval: 100
seed: 1
label-smoothing: 0.1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
arch: s2t_transformer_m
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-embed-norm: True
encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
layer-padding-mask: True
\ No newline at end of file
ctc-weight: 0.3
share-ctc-and-embed: True
\ No newline at end of file
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
text-no-pos-emb: True
textual-encoder-embed-norm: False
textual-encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
acoustic-encoder: transformer
adapter: none
# load-pretrained-encoder-from:
# load-pretrained-acoustic-encoder-from:
# load-pretrained-text-encoder-from:
# load-pretrained-decoder-from:
\ No newline at end of file
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
text-no-pos-emb: True
textual-encoder-embed-norm: False
textual-encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
acoustic-encoder: transformer
adapter: none
# load-pretrained-encoder-from:
# load-pretrained-acoustic-encoder-from:
# load-pretrained-text-encoder-from:
# load-pretrained-decoder-from:
\ No newline at end of file
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
text-no-pos-emb: True
textual-encoder-embed-norm: False
textual-encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
acoustic-encoder: pds
adapter: none
encoder-embed-dim: 512
pds-stages: 4
pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 512_512_512_512
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 4_4_4_4
pds-attn-heads: 8_8_8_8
# load-pretrained-encoder-from:
# load-pretrained-acoustic-encoder-from:
# load-pretrained-text-encoder-from:
# load-pretrained-decoder-from:
\ No newline at end of file
use-enc-dlcl: True
use-dec-dlcl: True
arch: s2t_dual
asr-encoder: transformer
mt-encoder-layers: 6
mt-encoder: transformer
encoder-collaboration-mode: parallel
decoder-collaboration-mode: parallel
encoder-league-s1-ratio: 0.5
encoder-league-s2-ratio: 0.5
encoder-league-drop-net: False
encoder-league-drop-net-prob: 0.2
encoder-league-drop-net-mix: False
decoder-league-s1-ratio: 0.5
decoder-league-s2-ratio: 0.5
decoder-league-drop-net: False
decoder-league-drop-net-prob: 0.0
decoder-league-drop-net-mix: False
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: join_speech_and_text_loss
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-asr-encoder-from:
#load-pretrained-mt-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
compression-metric: threshold
compression-mode: create
compression-layers: 6,9
compression-threshold: 0.99
compression-norm: True
compression-pos: True
\ No newline at end of file
inter-ctc-weight: 0.2
inter-ctc-layers: 6,9
share-inter-ctc: True
ctc-pae: none
# ctc-pae: inter_league
# ctc-pae-ground-truth-ratio: 0.1
# pae-gumbel: True
# pae-distribution-hard: True
# pae-drop-prob: 0.0
# pae-distribution-cutoff: 10
# share-pae-and-ctc: True
# pae-embed-norm: True
# pae-out-norm: True
# ctc-self-distill-weight: 1
# target-ctc-self-distill-weight: 1
# ctc-self-distill-prob: 0.1
# cal-all-ctc: True
\ No newline at end of file
inter-mixup: True
inter-mixup-layer: -1
inter-mixup-decoder-layer: 0
inter-mixup-prob: 1.0
inter-mixup-ratio: 1.0
inter-mixup-beta: 0.2
inter-mixup-keep-org: False
inter-mixup-decoder-emb: False
ctc-mixup-consistent-weight: 0
inter-ctc-mixup-consistent-weight: 0
mixup-consistent-weight: 0
mixup-no-hard-loss: False
no-specaugment: False
layer-out-norm: False
inter-mixup-ratio-decay: False
inter-mixup-ratio-decay-params: 20000,40000,0
\ No newline at end of file
inter-ctc-mlo: 1:2:3
\ No newline at end of file
arch: s2t_multibranch
junior-acoustic-encoder: transformer
acoustic-adapter: none
textual-adapter: none
junior-acoustic-encoder-layers: 12
senior-acoustic-encoder-layers: 6
textual-encoder-layers: 6
# collaboration-direction: none
# collaboration-direction: acoustic
# collaboration-direction: textual
collaboration-direction: both
collaboration-start: 0:0
collaboration-step: 1:1
# encoder-collaboration-mode: serial
# decoder-collaboration-mode: serial
encoder-collaboration-mode: parallel
decoder-collaboration-mode: parallel
use-raw-text: False
modality-switch: False
text-to-hidden-progress: 1:0:500
encoder-league-s1-ratio: 0.5
encoder-league-s2-ratio: 0.5
encoder-league-out-norm: False
encoder-league-gated: False
encoder-league-drop-net: False
encoder-league-drop-net-prob: 0.2
encoder-league-drop-net-mix: False
decoder-league-s1-ratio: 0.5
decoder-league-s2-ratio: 0.5
decoder-league-out-norm: False
decoder-league-gated: False
decoder-league-drop-net: False
decoder-league-drop-net-prob: 0.0
decoder-league-drop-net-mix: False
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
encoder-embed-norm: True
encoder-no-scale-embedding: True
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-attention-heads: 4
decoder-layers: 6
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
attention-dropout: 0.1
activation-dropout: 0.1
#adapter: inter_league
#adapter: none
#adapter-embed-norm: True
#adapter-out-norm: True
#share-adapter-and-ctc: True
#share-adapter-and-embed: True
#load-pretrained-junior-acoustic-encoder-from:
#load-pretrained-senior-acoustic-encoder-from:
#load-pretrained-textual-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: s2t_ctc
encoder-type: sate
criterion: ctc
zero_infinity: True
xctc-weight: 1.0
ctc-weight: 1.0
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
text-no-pos-emb: True
textual-encoder-embed-norm: False
textual-encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
acoustic-encoder: transformer
adapter: none
\ No newline at end of file
arch: s2t_ctc
encoder-type: sate
criterion: ctc
zero_infinity: True
xctc-weight: 1.0
ctc-weight: 1.0
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
text-no-pos-emb: True
textual-encoder-embed-norm: False
textual-encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 8
acoustic-encoder: transformer
adapter: none
\ No newline at end of file
arch: s2t_ctc
encoder-type: sate
criterion: ctc
zero_infinity: True
xctc-weight: 1.0
ctc-weight: 1.0
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
text-no-pos-emb: True
textual-encoder-embed-norm: False
textual-encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-layers: 12
text-encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 8
acoustic-encoder: pds
adapter: none
encoder-embed-dim: 512
pds-stages: 4
pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 512_512_512_512
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 4_4_4_4
pds-attn-heads: 8_8_8_8
\ No newline at end of file
arch: pdss2t_transformer_s_16
encoder-embed-dim: 256
pds-stages: 4
pds-layers: 2_2_6_2
pds-ratios: 2_2_2_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: pdss2t_transformer_s_32
encoder-embed-dim: 256
pds-stages: 5
pds-layers: 2_2_3_3_2
pds-ratios: 2_2_2_2_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1_1
pds-kernel-sizes: 5_5_5_5_5
pds-ffn-ratios: 8_8_8_8_8
pds-attn-heads: 4_4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: pdss2t_transformer_s_8
encoder-embed-dim: 256
pds-stages: 3
pds-layers: 4_4_4
pds-ratios: 2_2_1
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1
pds-kernel-sizes: 5_5_5
pds-ffn-ratios: 8_8_8
pds-attn-heads: 4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: pdss2t_transformer_s_8
encoder-embed-dim: 256
pds-stages: 4
pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: pdss2t_transformer_s_8
encoder-embed-dim: 256
pds-stages: 3
pds-layers: 4_4_4
pds-ratios: 2_2_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1
pds-kernel-sizes: 5_5_5
pds-ffn-ratios: 8_8_8
pds-attn-heads: 4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: pdss2t_transformer_m_16
encoder-embed-dim: 512
pds-stages: 4
pds-layers: 2_2_6_2
pds-ratios: 2_2_2_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 512_512_512_512
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 4_4_4_4
pds-attn-heads: 8_8_8_8
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
dropout: 0.15
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: pdss2t_transformer_m_32
encoder-embed-dim: 512
pds-stages: 5
pds-layers: 2_2_3_3_2
pds-ratios: 2_2_2_2_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 512_512_512_512_512
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1_1
pds-kernel-sizes: 5_5_5_5_5
pds-ffn-ratios: 4_4_4_4_4
pds-attn-heads: 8_8_8_8_8
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
dropout: 0.15
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
\ No newline at end of file
arch: pdss2t_transformer_m_8
encoder-embed-dim: 512
pds-stages: 4
pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 512_512_512_512
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 4_4_4_4
pds-attn-heads: 8_8_8_8
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
dropout: 0.15
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: pdss2t_transformer_s_16
encoder-embed-dim: 512
pds-stages: 4
pds-layers: 3_3_8_4
pds-ratios: 2_2_2_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 512_512_512_512
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 8_8_8_8
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
dropout: 0.15
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 18
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: pdss2t_transformer_s_16
encoder-embed-dim: 512
pds-stages: 5
pds-layers: 3_3_4_4_4
pds-ratios: 2_2_2_2_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 512_512_512_512_512
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1_1
pds-kernel-sizes: 5_5_5_5_5
pds-ffn-ratios: 8_8_8_8_8
pds-attn-heads: 8_8_8_8_8
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
dropout: 0.15
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 18
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: pdss2t_transformer_s_8
encoder-embed-dim: 512
pds-stages: 4
pds-layers: 4_5_5_4
pds-ratios: 2_2_1_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 512_512_512_512
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 8_8_8_8
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
dropout: 0.15
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 18
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: pdss2t_transformer_sd_16
encoder-embed-dim: 256
pds-stages: 4
pds-layers: 3_3_8_4
pds-ratios: 2_2_2_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 18
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: pdss2t_transformer_sd_32
encoder-embed-dim: 256
pds-stages: 5
pds-layers: 3_3_4_4_4
pds-ratios: 2_2_2_2_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1_1
pds-kernel-sizes: 5_5_5_5_5
pds-ffn-ratios: 8_8_8_8_8
pds-attn-heads: 4_4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 18
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: pdss2t_transformer_sd_8
encoder-embed-dim: 256
pds-stages: 4
pds-layers: 4_5_5_4
pds-ratios: 2_2_1_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 18
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
# Append-based Interpolation Augmentation
inter-mixup: True
inter-mixup-layer: -1
inter-mixup-decoder-layer: 0
inter-mixup-prob: 1.0
inter-mixup-ratio: 1.0
inter-mixup-beta: 0.2
inter-mixup-keep-org: True
inter-mixup-decoder-emb: True
mixup-no-hard-loss: False
no-specaugment: False
layer-out-norm: False
inter-mixup-ratio-decay: False
inter-mixup-ratio-decay-params: 20000,40000,0
# Bilingual CTC
share-ctc-and-embed: True
share-xctc-and-embed: True
ctc-weight: 0.2
xctc-weight: 0.1
# InterCTC
inter-ctc-weight: 0.1
inter-ctc-layers: 6,9
share-inter-ctc: True
inter-xctc-weight: 0.05
inter-xctc-layers: 6,9
# Prediction-aware encoding
ctc-pae: inter_league
xctc-pae: inter_league
pae-unnorm-input: True
ctc-mixup-consistent-weight: 0.1
inter-ctc-mixup-consistent-weight: 0.05
xctc-mixup-consistent-weight: 0.05
inter-xctc-mixup-consistent-weight: 0.25
mixup-consistent-weight: 0.5
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
layer-padding-mask: True
\ No newline at end of file
# Append-based Interpolation Augmentation
inter-mixup: True
inter-mixup-layer: -1
inter-mixup-decoder-layer: 0
inter-mixup-prob: 1.0
inter-mixup-ratio: 1.0
inter-mixup-beta: 0.2
inter-mixup-keep-org: True
inter-mixup-decoder-emb: True
mixup-no-hard-loss: True
no-specaugment: False
layer-out-norm: False
inter-mixup-ratio-decay: False
inter-mixup-ratio-decay-params: 20000,40000,0
# Bilingual CTC
share-ctc-and-embed: True
share-xctc-and-embed: True
ctc-weight: 0.2
xctc-weight: 0.1
# InterCTC
inter-ctc-weight: 0.1
inter-ctc-layers: 6,9
share-inter-ctc: True
inter-xctc-weight: 0.05
inter-xctc-layers: 6,9
# Prediction-aware encoding
ctc-pae: inter_league
xctc-pae: inter_league
pae-unnorm-input: True
ctc-mixup-consistent-weight: 0.1
inter-ctc-mixup-consistent-weight: 0.05
xctc-mixup-consistent-weight: 0.05
inter-xctc-mixup-consistent-weight: 0.25
mixup-consistent-weight: 0.5
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
layer-padding-mask: True
\ No newline at end of file
# Append-based Interpolation Augmentation
inter-mixup: True
inter-mixup-layer: -1
inter-mixup-decoder-layer: 0
inter-mixup-prob: 1.0
inter-mixup-ratio: 1.0
inter-mixup-beta: 0.2
inter-mixup-keep-org: True
inter-mixup-decoder-emb: True
mixup-no-hard-loss: False
no-specaugment: False
layer-out-norm: False
inter-mixup-ratio-decay: False
inter-mixup-ratio-decay-params: 20000,40000,0
# MTL
ctc-weight: 0.3
inter-ctc-weight: 0.2
inter-ctc-layers: 6,9
share-inter-ctc: True
share-ctc-and-embed: True
ctc-pae: inter_league
pae-unnorm-input: True
ctc-mixup-consistent-weight: 0.15
inter-ctc-mixup-consistent-weight: 0.1
mixup-consistent-weight: 0.5
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
layer-padding-mask: True
\ No newline at end of file
# Append-based Interpolation Augmentation
inter-mixup: True
inter-mixup-layer: -1
inter-mixup-decoder-layer: 0
inter-mixup-prob: 1.0
inter-mixup-ratio: 1.0
inter-mixup-beta: 0.2
inter-mixup-keep-org: True
inter-mixup-decoder-emb: True
mixup-no-hard-loss: True
no-specaugment: False
layer-out-norm: False
inter-mixup-ratio-decay: False
inter-mixup-ratio-decay-params: 20000,40000,0
# MTL
ctc-weight: 0.3
share-ctc-and-embed: True
inter-ctc-weight: 0.2
inter-ctc-layers: 6,9
share-inter-ctc: True
ctc-pae: inter_league
pae-unnorm-input: True
ctc-mixup-consistent-weight: 0.15
inter-ctc-mixup-consistent-weight: 0.1
mixup-consistent-weight: 0.5
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
layer-padding-mask: True
\ No newline at end of file
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
textual-encoder-embed-norm: True
textual-encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
acoustic-encoder: transformer
adapter: inter_league
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
# Bilingual CTC
share-ctc-and-embed: True
share-xctc-and-embed: True
ctc-weight: 0.2
xctc-weight: 0.1
# InterCTC
inter-ctc-weight: 0.1
inter-ctc-layers: 6,9
share-inter-ctc: True
inter-xctc-weight: 0.05
inter-xctc-layers: 4
# Prediction-aware encoding
ctc-pae: inter_league
xctc-pae: inter_league
pae-unnorm-input: True
# Curriculum learning mixing
xctc-pae-ground-truth-ratio: 0.1
xctc-pae-ground-truth-only-mistake: True
pae-oracle-smooth: True
arch: s2t_transformer_m
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-embed-norm: True
encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 18
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
# Bilingual CTC
share-ctc-and-embed: True
share-xctc-and-embed: True
ctc-weight: 0.2
ctc-layer: 12
xctc-weight: 0.1
# InterCTC
inter-ctc-weight: 0.1
inter-ctc-layers: 6,9
share-inter-ctc: True
inter-xctc-weight: 0.05
inter-xctc-layers: 16
# Prediction-aware encoding
ctc-pae: inter_league
xctc-pae: inter_league
pae-unnorm-input: True
# Curriculum learning mixing
xctc-pae-ground-truth-ratio: 0.1
xctc-pae-ground-truth-only-mistake: True
pae-oracle-smooth: True
arch: s2t_transformer_m
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-embed-norm: True
encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 18
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
# Bilingual CTC
share-ctc-and-embed: True
share-xctc-and-embed: True
ctc-weight: 0.2
ctc-layer: 12
#xctc-weight: 0.1
# InterCTC
inter-ctc-weight: 0.1
inter-ctc-layers: 6,9
share-inter-ctc: True
#inter-xctc-weight: 0.05
#inter-xctc-layers: 16
# Prediction-aware encoding
ctc-pae: inter_league
#xctc-pae: inter_league
pae-unnorm-input: True
# Curriculum learning mixing
#xctc-pae-ground-truth-ratio: 0.1
#xctc-pae-ground-truth-only-mistake: True
#pae-oracle-smooth: True
arch: s2t_transformer_m
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-embed-norm: True
encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 18
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
# Bilingual CTC
share-ctc-and-embed: True
share-xctc-and-embed: True
ctc-weight: 0.2
xctc-weight: 0.1
# InterCTC
inter-ctc-weight: 0.1
inter-ctc-layers: 6,9,12,15
share-inter-ctc: True
inter-xctc-weight: 0.05
inter-xctc-layers: 6,9,12,15
# Prediction-aware encoding
ctc-pae: inter_league
xctc-pae: inter_league
pae-unnorm-input: True
# Curriculum learning mixing
xctc-pae-ground-truth-ratio: 0.1
xctc-pae-ground-truth-only-mistake: True
pae-oracle-smooth: True
arch: s2t_transformer_m
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-embed-norm: True
encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 18
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
# Bilingual CTC
share-ctc-and-embed: True
share-xctc-and-embed: True
ctc-weight: 0.2
xctc-weight: 0.1
# InterCTC
inter-ctc-weight: 0.1
inter-ctc-layers: 6,9,12,15
share-inter-ctc: True
inter-xctc-weight: 0.05
inter-xctc-layers: 6,9,12,15
# Prediction-aware encoding
ctc-pae: inter_league
xctc-pae: inter_league
pae-unnorm-input: True
# Curriculum learning mixing
xctc-pae-ground-truth-ratio: 0.1
xctc-pae-ground-truth-only-mistake: True
pae-oracle-smooth: True
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
text-no-pos-emb: True
textual-encoder-embed-norm: False
textual-encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
acoustic-encoder: transformer
adapter: inter_league
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
# CTC & XCTC
share-ctc-and-embed: True
share-xctc-and-embed: True
ctc-weight: 0.2
xctc-weight: 0.2
# InterCTC
share-inter-ctc: True
inter-ctc-weight: 0.1
inter-ctc-layers: 6,9
inter-xctc-weight: 0.1
inter-xctc-layers: 4
# Prediction-aware encoding
ctc-pae: inter_league
xctc-pae: inter_league
pae-unnorm-input: True
# Cross-layer attn
xctc-cross-attn: True
cross-attn-start-layer: 3
cross-attn-layer: 2
cross-attn-collaboration-mode: serial
cross-attn-league-drop-net: True
cross-attn-league-drop-net-prob: 0.1
# Curriculum learning mixing
xctc-pae-ground-truth-ratio: 0.5
xctc-pae-ground-truth-only-mistake: True
pae-oracle-smooth: True
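# Note (assumption): xctc-pae-ground-truth-ratio mixes this fraction of
# ground-truth labels into the prediction-aware encoding input, restricted to
# wrongly predicted positions when xctc-pae-ground-truth-only-mistake is True.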
arch: s2t_ctc
encoder-type: sate
criterion: ctc
zero_infinity: True
xctc-weight: 1.0
ctc-weight: 1.0
share-ctc-and-embed: True
share-xctc-and-embed: True
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
text-no-pos-emb: True
textual-encoder-embed-norm: False
textual-encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 8
acoustic-encoder: transformer
adapter: inter_league
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
# InterCTC
share-inter-ctc: True
inter-ctc-weight: 1.0
inter-ctc-layers: 6,9
inter-xctc-weight: 1.0
inter-xctc-layers: 6,9
# Prediction-aware encoding
ctc-pae: inter_league
xctc-pae: inter_league
# Cross-layer attn
xctc-cross-attn: True
cross-attn-start-layer: 4
cross-attn-layer: 3
cross-attn-collaboration-mode: serial
cross-attn-league-drop-net: True
cross-attn-league-drop-net-prob: 0.1
# Curriculum learning mixing
xctc-pae-ground-truth-ratio: 0.8
xctc-pae-ground-truth-only-mistake: True
pae-oracle-smooth: True
arch: s2t_ctc
encoder-type: transformer
criterion: ctc
zero_infinity: True
xctc-weight: 1.0
ctc-weight: 1.0
share-ctc-and-embed: True
share-xctc-and-embed: True
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 18
encoder-attention-heads: 8
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
# InterCTC
share-inter-ctc: True
inter-ctc-weight: 1.0
inter-ctc-layers: 6,9,12,15
inter-xctc-weight: 1.0
inter-xctc-layers: 6,9,12,15
# Prediction-aware encoding
ctc-pae: inter_league
xctc-pae: inter_league
pae-unnorm-input: True
# Cross-layer attn
# xctc-cross-attn: True
# cross-attn-start-layer: 4
# cross-attn-layer: 3
# cross-attn-collaboration-mode: serial
# cross-attn-league-drop-net: True
# cross-attn-league-drop-net-prob: 0.1
# Curriculum learning mixing
xctc-pae-ground-truth-ratio: 0.8
xctc-pae-ground-truth-only-mistake: True
pae-oracle-smooth: True
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
ctc-weight: 0.3
share-ctc-and-embed: True
criterion: label_smoothed_cross_entropy_with_ctc
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
textual-encoder-embed-norm: True
textual-encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
acoustic-encoder: transformer
adapter: inter_league
# adapter: none
# adapter-embed-norm: True
# adapter-out-norm: True
# load-pretrained-encoder-from:
# load-pretrained-acoustic-encoder-from:
# load-pretrained-text-encoder-from:
# load-pretrained-decoder-from:
\ No newline at end of file
encoder-attention-type: rel_pos
# encoder-attention-type: relative
# decoder-attention-type: relative
# max-encoder-relative-length: 100
# max-decoder-relative-length: 20
\ No newline at end of file
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
textual-encoder-embed-norm: True
textual-encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
acoustic-encoder: transformer
adapter: inter_league
#adapter: none
#adapter-embed-norm: True
#adapter-out-norm: True
#share-adapter-and-ctc: True
#share-adapter-and-embed: True
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
textual-encoder-embed-norm: True
textual-encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
acoustic-encoder: transformer
adapter: inter_league
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-embed-norm: True
encoder-no-scale-embedding: True
textual-encoder-embed-norm: True
textual-encoder-no-scale-embedding: True
encoder-normalize-before: True
decoder-normalize-before: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
acoustic-encoder: pds
adapter: inter_league
encoder-embed-dim: 512
pds-stages: 4
pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 512_512_512_512
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 4_4_4_4
pds-attn-heads: 8_8_8_8
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
textual-encoder-embed-norm: True
textual-encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 18
text-encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
acoustic-encoder: transformer
adapter: inter_league
#adapter-embed-norm: True
#adapter-out-norm: True
#share-adapter-and-ctc: True
#share-adapter-and-embed: True
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-embed-norm: True
encoder-no-scale-embedding: True
textual-encoder-embed-norm: True
textual-encoder-no-scale-embedding: True
encoder-normalize-before: True
decoder-normalize-before: True
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
acoustic-encoder: pds
adapter: inter_league
encoder-embed-dim: 256
pds-stages: 4
pds-layers: 2_2_6_2
pds-ratios: 2_2_2_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-embed-norm: True
encoder-no-scale-embedding: True
textual-encoder-embed-norm: True
textual-encoder-no-scale-embedding: True
encoder-normalize-before: True
decoder-normalize-before: True
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
acoustic-encoder: pds
adapter: inter_league
encoder-embed-dim: 256
pds-stages: 4
pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-embed-norm: True
encoder-no-scale-embedding: True
textual-encoder-embed-norm: True
textual-encoder-no-scale-embedding: True
encoder-normalize-before: True
decoder-normalize-before: True
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
acoustic-encoder: pds
adapter: inter_league
encoder-embed-dim: 256
pds-stages: 3
pds-layers: 4_4_4
pds-ratios: 2_2_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1
pds-kernel-sizes: 5_5_5
pds-ffn-ratios: 8_8_8
pds-attn-heads: 4_4_4
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-embed-norm: True
encoder-no-scale-embedding: True
textual-encoder-embed-norm: True
textual-encoder-no-scale-embedding: True
encoder-normalize-before: True
decoder-normalize-before: True
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
acoustic-encoder: pds
adapter: inter_league
encoder-embed-dim: 256
pds-stages: 4
pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-embed-norm: True
encoder-no-scale-embedding: True
textual-encoder-embed-norm: True
textual-encoder-no-scale-embedding: True
encoder-normalize-before: True
decoder-normalize-before: True
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 18
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
acoustic-encoder: pds
adapter: inter_league
encoder-embed-dim: 256
pds-stages: 4
pds-layers: 4_5_5_4
pds-ratios: 2_2_1_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: s2t_w2v2_transformer
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-embed-norm: True
encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
attention-dropout: 0.1
activation-dropout: 0.1
w2v2-model-path: /home/xuchen/st/models/w2v2/wav2vec_small.pt
freeze-w2v: False
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
xctc-weight: 0.3
share-xctc-and-embed: True
\ No newline at end of file
inter-xctc-weight: 0.2
inter-xctc-layers: 6,9
xctc-pae: none
# xctc-pae: inter_league
xctc-cross-attn: False
cross-attn-start-layer: 7
cross-attn-layer: 6
cross-attn-collaboration-mode: parallel
cross-attn-league-s1-ratio: 0.5
cross-attn-league-s2-ratio: 0.5
cross-attn-league-out-norm: False
cross-attn-league-gated: False
cross-attn-league-drop-net: False
cross-attn-league-drop-net-prob: 0.2
cross-attn-league-drop-net-mix: False
# xctc-pae-ground-truth-ratio: 0.1
# xctc-pae-ground-truth-ratio-adaptive: True
# xctc-pae-ground-truth-only-mistake: True
# pae-oracle-smooth: True
# pae-gumbel: True
# pae-distribution-hard: True
# pae-drop-prob: 0.0
# pae-distribution-cutoff: 10
# share-pae-and-xctc: True
# pae-embed-norm: True
# pae-out-norm: True
# ctc-self-distill-weight: 1
# target-ctc-self-distill-weight: 1
# ctc-self-distill-prob: 0.1
# cal-all-ctc: True
\ No newline at end of file
#!/usr/bin/env bash
gpu_num=1
data_tag=st
test_subset=(tst-COMMON_en-de tst-COMMON_en-fr tst-COMMON_en-es tst-COMMON_en-it tst-COMMON_en-nl tst-COMMON_en-pt tst-COMMON_en-ro tst-COMMON_en-ru)
#test_subset=(tst-COMMON_en-de)
#test_subset=(test_en-fr_1k)
exp_name=
if [ "$#" -eq 1 ]; then
exp_name=$1
fi
sacrebleu=1
ctc_infer=1
n_average=10
beam_size=5
infer_ctc_weight=0.1
len_penalty=1.0
max_tokens=20000
batch_size=1
infer_debug=0
dec_model=checkpoint_best.pt
cmd="./run.sh
--stage 2
--stop_stage 2
--gpu_num ${gpu_num}
--exp_name ${exp_name}
--sacrebleu ${sacrebleu}
--n_average ${n_average}
--beam_size ${beam_size}
--len_penalty ${len_penalty}
--batch_size ${batch_size}
--max_tokens ${max_tokens}
--dec_model ${dec_model}
--ctc_infer ${ctc_infer}
--infer_ctc_weight ${infer_ctc_weight}
--infer_debug ${infer_debug}
"
if [[ -n ${data_tag} ]]; then
cmd="$cmd --data_tag ${data_tag}"
fi
if [[ ${#test_subset[@]} -ne 0 ]]; then
subsets=$(echo ${test_subset[*]} | sed 's/ /,/g')
cmd="$cmd --test_subset ${subsets}"
fi
echo $cmd
eval $cmd
set -e
gpu_num=1
root_dir=/home/xuchen/st/Fairseq-S2T
ckpt=/home/xuchen/st/checkpoints/mustc-v2/st
model_txt=$1
set=$2
test_subset=$3
#data_dir=/home/xuchen/st/data/mustc-v2/st_lcrm/en-de
#test_subset=(tst-COMMON)
data_dir=/media/data/tst/$set/en-de
#test_subset=(office)
#test_subset=(webrtc1)
#test_subset=(adap2)
data_config=config_st_share.yaml
result_file=./result
beam_size=5
lenpen=0.6
max_tokens=10000
models=()
i=0
for line in `cat $model_txt`; do
i=`expr $i + 1`
model_dir=$ckpt/$line
[[ ! -d $model_dir ]] && echo $model_dir && exit 1;
if [[ -f $model_dir/avg_10_checkpoint.pt ]]; then
model=$model_dir/avg_10_checkpoint.pt
else
model=$model_dir/checkpoint_best.pt
fi
[[ ! -f $model ]] && echo $model && exit 1;
models[$i]=$model
done
models=`echo ${models[*]} | sed 's/ /:/g'`
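# fairseq-generate treats a colon-separated --path as a checkpoint ensemble,
# so all models selected above are decoded jointly.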
res_dir=$ckpt/ensemble/$set
i=0
while :
do
if [[ -d $res_dir/$i ]]; then
i=`expr $i + 1`
else
res_dir=$res_dir/$i
break
fi
done
mkdir -p $res_dir
cp $model_txt $res_dir
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then
device=()
else
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
fi
export CUDA_VISIBLE_DEVICES=${device}
for subset in ${test_subset[@]}; do
subset=${subset}_st
cmd="python ${root_dir}/fairseq_cli/generate.py
${data_dir}
--config-yaml ${data_config}
--gen-subset ${subset}
--task speech_to_text
--path ${models}
--results-path ${res_dir}
--skip-invalid-size-inputs-valid-test
--max-tokens ${max_tokens}
--beam ${beam_size}
--lenpen ${lenpen}
--scoring sacrebleu"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
eval $cmd
tail -n 1 ${res_dir}/generate-${subset}.txt
cd $res_dir
evaluate.sh translation-${subset}.txt $set
cd -
done
#!/usr/bin/env bash
set -e
ref=$1
gen=$2
tokenizer=$3
lang=$4
lang_pair=en-${lang}
record=$(mktemp -t temp.record.XXXXXX)
if [[ ${tokenizer} -eq 1 ]]; then
echo "MultiBLEU" > ${record}
cmd="multi-bleu.perl ${ref} < ${gen}"
eval $cmd | head -n 1 >> ${record}
cmd="detokenizer.perl -q -l ${lang} --threads 32 < ${ref} > ${ref}.detok"
eval $cmd
cmd="detokenizer.perl -q -l ${lang} --threads 32 < ${gen} > ${gen}.detok"
eval $cmd
ref=${ref}.detok
gen=${gen}.detok
fi
echo "SacreBLEU" >> ${record}
cmd="cat ${gen} | sacrebleu ${ref} -m bleu -w 4 -l ${lang_pair} | jq -r .score"
eval $cmd >> ${record}
cat ${record}
rm ${record}
#!/usr/bin/env bash
set -e
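# Re-sort the CTC transcripts back into corpus order using the example indices
# from the beam-search output file, then score them against the reference.
# Positional args: infer_dir tag s2s_infer_file ctc_infer_file ref tokenizer lang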
infer_dir=$1
tag=$2
s2s_infer_file=${infer_dir}/$3
org_ctc_infer_file=${infer_dir}/$4
ref=$5
tokenizer=$6
lang=$7
idx=${infer_dir}/${tag}_idx
ctc_infer=${infer_dir}/${tag}_ctc_infer
ctc_infer_sort=${infer_dir}/${tag}_ctc_infer_sort
if [[ ! -f ${ctc_infer_sort} ]]; then
cut -f1 ${s2s_infer_file} > ${idx}
paste ${idx} ${org_ctc_infer_file} > ${ctc_infer}
sort -n -t $'\t' ${ctc_infer} | cut -f2 > ${ctc_infer_sort}
fi
gen=${ctc_infer_sort}
./cal_bleu.sh ${ref} ${gen} ${tokenizer} ${lang}
\ No newline at end of file
# Compute WER and CER between a reference file and a hypothesis file using jiwer.
import sys

import jiwer
import jiwer.transforms as tr

ref_file = sys.argv[1]
hyp_file = sys.argv[2]

# Normalization applied to both reference and hypothesis before scoring.
wer_standardize = tr.Compose(
    [
        tr.SubstituteRegexes({r"<<unk>>": r"@"}),
        tr.ToLowerCase(),
        tr.RemovePunctuation(),
        tr.Strip(),
        tr.ExpandCommonEnglishContractions(),
        tr.RemoveKaldiNonWords(),
        tr.RemoveWhiteSpace(replace_by_space=True),
        tr.ReduceToListOfListOfWords(),
    ]
)
cer_standardize = tr.Compose(
    [
        tr.SubstituteRegexes({r"<<unk>>": r"@"}),
        tr.ToLowerCase(),
        tr.RemovePunctuation(),
        tr.Strip(),
        tr.ReduceToListOfListOfChars(),
    ]
)

with open(ref_file, "r") as f:
    ref_lines = f.readlines()
with open(hyp_file, "r") as f:
    hyp_lines = f.readlines()

wer = jiwer.wer(
    ref_lines,
    hyp_lines,
    truth_transform=wer_standardize,
    hypothesis_transform=wer_standardize,
)
cer = jiwer.cer(
    ref_lines,
    hyp_lines,
    truth_transform=cer_standardize,
    hypothesis_transform=cer_standardize,
)
print("WER: %.4f" % wer)
print("CER: %.4f" % cer)
#!/usr/bin/env bash
set -e
infer_dir=$1
tag=$2
s2s_infer_file=${infer_dir}/$3
org_ctc_infer_file=${infer_dir}/$4
ref=$5
idx=${infer_dir}/${tag}_idx
ctc_infer=${infer_dir}/${tag}_ctc_infer
ctc_infer_sort=${infer_dir}/${tag}_ctc_infer_sort
cut -f1 ${s2s_infer_file} > ${idx}
paste ${idx} ${org_ctc_infer_file} > ${ctc_infer}
sort -n -t $'\t' ${ctc_infer} | cut -f2 > ${ctc_infer_sort}
python3 ./cal_wer.py ${ref} ${ctc_infer_sort}
\ No newline at end of file
# Extract one column (e.g. "src_text" or "tgt_text") from a Fairseq S2T manifest TSV.
import csv
import sys

tsv_file = sys.argv[1]
out_file = sys.argv[2]
extract_item = sys.argv[3]

with open(tsv_file) as f:
    reader = csv.DictReader(
        f,
        delimiter="\t",
        quotechar=None,
        doublequote=False,
        lineterminator="\n",
        quoting=csv.QUOTE_NONE,
    )
    samples = [dict(e) for e in reader]

with open(out_file, "w", encoding="utf-8") as fw:
    for s in samples:
        if extract_item in s:
            fw.write("%s\n" % s[extract_item])
        else:
            print("Error in sample: ")
            print(s)
            sys.exit(1)
#!/usr/bin/env bash
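# Poll gpustat until ${gpu_num} GPUs each report under 100 MB of memory in use,
# then launch the training command on them.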
gpu_num=4
cmd="sh train.sh"
while :
do
record=$(mktemp -t temp.record.XXXXXX)
gpustat > $record
all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
count=0
for dev in ${all_devices[@]}
do
line=$((dev + 2))
use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
if [[ $use -lt 100 ]]; then
device[$count]=$dev
count=$((count + 1))
if [[ $count -eq $gpu_num ]]; then
break
fi
fi
done
if [[ ${#device[@]} -lt $gpu_num ]]; then
sleep 60s
else
echo "Run $cmd"
eval $cmd
sleep 10s
exit
fi
done
#!/usr/bin/env bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey);
# Arnab Ghoshal, Karel Vesely
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Parse command-line options.
# To be sourced by another script (as in ". parse_options.sh").
# Option format is: --option-name arg
# and shell variable "option_name" gets set to value "arg."
# The exception is --help, which takes no arguments, but prints the
# $help_message variable (if defined).
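# Example (hypothetical): if the sourcing script sets the default "gpu_num=1",
# invoking it as "./run.sh --gpu-num 4" makes this parser set gpu_num=4
# (dashes in option names are mapped to underscores in variable names).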
###
### The --config file options have lower priority to command line
### options, so we need to import them first...
###
# Now import all the configs specified by command-line, in left-to-right order
for ((argpos=1; argpos<$#; argpos++)); do
if [ "${!argpos}" == "--config" ]; then
argpos_plus1=$((argpos+1))
config=${!argpos_plus1}
[ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
. $config # source the config file.
fi
done
###
### Now we process the command line options
###
i=1
argv="$@"
while true; do
key=${!i}
j=$(($i + 1))
value=${!j}
[ -z "${!i:-}" ] && break; # break if there are no arguments
case "${key}" in
# If the enclosing script is called with --help option, print the help
# message and exit. Scripts should put help messages in $help_message
--help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
else printf "$help_message\n" 1>&2 ; fi;
exit 0 ;;
--*=*) echo "$0: options to scripts must be of the form --name value, got '${key}}'"
exit 1 ;;
# If the first command-line argument begins with "--" (e.g. --foo-bar),
# then work out the variable name as $name, which will equal "foo_bar".
--*) name=`echo "${key}" | sed s/^--// | sed s/-/_/g`;
# Next we test whether the variable in question is undefined -- if so it's
# an invalid option and we die. Note: $0 evaluates to the name of the
# enclosing script.
# The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
# is undefined. We then have to wrap this test inside "eval" because
# foo_bar is itself inside a variable ($name).
#eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
oldval="`eval echo \\$$name`";
# Work out whether we seem to be expecting a Boolean argument.
if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
was_bool=true;
else
was_bool=false;
fi
# Set the variable to the right value-- the escaped quotes make it work if
# the option had spaces, like --cmd "queue.pl -sync y"
# echo $name
eval $name=\"${value}\";
# Check that Boolean-valued arguments are really Boolean.
if $was_bool && [[ "${value}" != "true" && "${value}" != "false" ]]; then
echo "$0: expected \"true\" or \"false\": ${key} ${value}" 1>&2
exit 1;
fi
# shift 2;
i=$(($i + 2))
;;
*) break;
esac
done
# Check for an empty argument to the --cmd option, which can easily occur as a
# result of scripting errors.
[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
true; # so this script returns exit code 0.
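# Block until $1 GPUs each show under 1000 MB of memory in use, then echo their
# indices as a comma-separated list (or -1 for CPU when $2 is 1).
# Typical call (from run.sh): device=$(get_devices ${gpu_num} 0)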
get_devices(){
gpu_num=$1
use_cpu=$2
device=()
while :
do
record=$(mktemp -t temp.record.XXXXXX)
gpustat > $record
all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
count=0
for dev in ${all_devices[@]}
do
line=$((dev + 2))
use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
if [[ $use -lt 1000 ]]; then
device[$count]=$dev
count=$((count + 1))
if [[ $count -eq $gpu_num ]]; then
break
fi
fi
done
if [[ ${#device[@]} -lt $gpu_num ]]; then
if [[ $use_cpu -eq 1 ]]; then
device=(-1)
else
sleep 60s
fi
else
break
fi
done
echo ${device[*]} | sed 's/ /,/g'
return $?
}
#!/usr/bin/env bash
# Processing CoVoST Datasets
# Copyright 2021 Chen Xu (xuchennlp@outlook.com)
# Set bash to 'debug' mode: it will exit on
# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline'; -x prints commands
set -e
#set -u
set -o pipefail
export PYTHONIOENCODING=UTF-8
eval=1
time=$(date "+%m%d_%H%M")
stage=1
stop_stage=2
######## Hardware ########
# Devices
device=(0)
gpu_num=8
update_freq=1
pwd_dir=$PWD
root_dir=${ST_ROOT}
data_root_dir=${root_dir}
code_dir=${root_dir}/S2T
# Dataset
src_lang=multi
tgt_lang=en
lang=multi-en
dataset=covost
data_tag=st
task=speech_to_text
vocab_type=unigram
asr_vocab_size=5000
vocab_size=32000
share_dict=1
speed_perturb=0
lcrm=0
tokenizer=0
use_raw_audio=0
. ./local/parse_options.sh || exit 1;
use_specific_dict=0
specific_prefix=valid
specific_dir=${data_root_dir}/data/${dataset}/${lang}/st
asr_vocab_prefix=spm_unigram10000_st_share
st_vocab_prefix=spm_unigram10000_st_share
data_model_subfix=${dataset}/${lang}/${data_tag}
org_data_dir=${data_root_dir}/data/${dataset}
data_dir=${data_root_dir}/data/${data_model_subfix}
lang_list=$(find "$org_data_dir" -maxdepth 1 -type d -name "[a-z][a-z]-en" -printf '%P,')
languages=$(echo "$lang_list" | sed 's/,$//')
train_split=train
valid_split=dev
test_split=test
test_list=$(find "$org_data_dir" -maxdepth 1 -type d -name "[a-z][a-z]-en" -exec basename {} \; | awk -v prefix="test_" '{print prefix $0}')
# join the newline-separated subset names into a comma-separated list
test_subset=$(echo "${test_list}" | paste -sd ',' -)
# Exp
sub_tag=
exp_prefix=$(date "+%m%d")
extra_tag=
extra_parameter=
exp_tag=baseline
exp_name=
# Training Settings
train_config=base,ctc
fp16=1
max_tokens=20000
step_valid=0
bleu_valid=0
# Decoding Settings
batch_size=0
sacrebleu=1
dec_model=checkpoint_best.pt
ctc_infer=0
infer_ctc_weight=0
# defaults for the optional CTC decoding variants referenced in stage 2
ctc_self_ensemble=0
ctc_inter_logit=0
n_average=10
beam_size=5
len_penalty=1.0
epoch_ensemble=0
best_ensemble=1
infer_debug=0
infer_score=0
infer_tag=
infer_parameter=
#infer_parameters="--cal-monotonic-cross-attn-weights --cal-localness --localness-window 0.1 --cal-topk-cross-attn-weights --topk-cross-attn-weights 15 --cal-entropy"
# Parsing Options
if [[ ${share_dict} -eq 1 ]]; then
data_config=config_share.yaml
else
data_config=config.yaml
fi
if [[ ${speed_perturb} -eq 1 ]]; then
data_dir=${data_dir}_sp
exp_prefix=${exp_prefix}_sp
fi
if [[ ${lcrm} -eq 1 ]]; then
data_dir=${data_dir}_lcrm
exp_prefix=${exp_prefix}_lcrm
fi
if [[ ${use_specific_dict} -eq 1 ]]; then
data_dir=${data_dir}_${specific_prefix}
exp_prefix=${exp_prefix}_${specific_prefix}
fi
if [[ ${tokenizer} -eq 1 ]]; then
data_dir=${data_dir}_tok
exp_prefix=${exp_prefix}_tok
fi
if [[ ${use_raw_audio} -eq 1 ]]; then
data_dir=${data_dir}_raw
exp_prefix=${exp_prefix}_raw
fi
export PATH=$PATH:${code_dir}/scripts
. ./local/parse_options.sh || exit 1;
if [[ -z ${exp_name} ]]; then
config_string=${train_config//,/_}
exp_name=${exp_prefix}_${config_string}_${exp_tag}
if [[ -n ${extra_tag} ]]; then
exp_name=${exp_name}_${extra_tag}
fi
if [[ -n ${exp_subfix} ]]; then
exp_name=${exp_name}_${exp_subfix}
fi
fi
ckpt_dir=${root_dir}/checkpoints/
model_dir=${root_dir}/checkpoints/${data_model_subfix}/${sub_tag}/${exp_name}
# Start
cd ${code_dir}
echo "Start Stage: $stage"
echo "Stop Stage: $stop_stage"
if [[ `pip list | grep fairseq | wc -l` -eq 0 ]]; then
echo "Default Stage: env configure"
pip3 install -e ${code_dir}
fi
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "Stage -1: Data Download"
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
### Task dependent. You have to prepare the data for the following part yourself.
echo "Stage 0: ASR Data Preparation"
if [[ ! -e ${data_dir} ]]; then
mkdir -p ${data_dir}
fi
# create ASR vocabulary if necessary
cmd="python3 ${code_dir}/examples/speech_to_text/prep_audio_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}/asr4st
--task asr
--raw
--src-lang ${src_lang}
--splits ${valid_split},${test_split},${train_split}
--vocab-type ${vocab_type}
--vocab-size ${asr_vocab_size}"
if [[ ${lcrm} -eq 1 ]]; then
cmd="$cmd
--lowercase-src
--rm-punc-src"
fi
if [[ ${tokenizer} -eq 1 ]]; then
cmd="$cmd
--tokenizer"
fi
if [[ $eval -eq 1 && ${share_dict} -ne 1 && ${use_specific_dict} -ne 1 ]]; then
echo -e "\033[34mRun command: \n${cmd} \033[0m"
mkdir -p ${data_dir}/asr4st
eval $cmd
asr_prefix=spm_${vocab_type}${asr_vocab_size}_asr
cp -f ${data_dir}/asr4st/${asr_prefix}* ${data_dir}
fi
echo "Stage 0: ST Data Preparation"
cmd="python3 ${code_dir}/examples/speech_to_text/prep_audio_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}
--task st
--joint
--add-src
--languages ${languages}
--splits ${valid_split},${test_split},${train_split}
--cmvn-type utterance
--vocab-type ${vocab_type}
--vocab-size ${vocab_size}"
if [[ ${use_raw_audio} -eq 1 ]]; then
cmd="$cmd
--raw"
fi
if [[ ${use_specific_dict} -eq 1 ]]; then
cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}
cp -r ${specific_dir}/${st_vocab_prefix}.* ${data_dir}
if [[ $share_dict -eq 1 ]]; then
cmd="$cmd
--share
--st-spm-prefix ${st_vocab_prefix}"
else
cmd="$cmd
--st-spm-prefix ${st_vocab_prefix}
--asr-prefix ${asr_vocab_prefix}"
fi
else
if [[ $share_dict -eq 1 ]]; then
cmd="$cmd
--share"
else
cmd="$cmd
--asr-prefix ${asr_prefix}"
fi
fi
if [[ ${speed_perturb} -eq 1 ]]; then
cmd="$cmd
--speed-perturb"
fi
if [[ ${lcrm} -eq 1 ]]; then
cmd="$cmd
--lowercase-src
--rm-punc-src"
fi
if [[ ${tokenizer} -eq 1 ]]; then
cmd="$cmd
--tokenizer"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "Stage 1: Network Training"
[[ ! -d ${data_dir} ]] && echo "The data dir ${data_dir} does not exist!" && exit 1;
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then
device=""
else
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
export CUDA_VISIBLE_DEVICES=${device}
fi
echo -e "data=${data_dir} model=${model_dir}"
if [[ ! -d ${model_dir} ]]; then
mkdir -p ${model_dir}
else
echo "${model_dir} exists."
fi
cp -f ${pwd_dir}/`basename ${BASH_SOURCE[0]}` ${model_dir}
cp -f ${pwd_dir}/train.sh ${model_dir}
train_config=basis,${train_config}
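# Prepend the shared "basis" defaults; each YAML in the comma-separated list is
# copied into the model dir and passed to fairseq as --train-config,
# --train-config1, --train-config2, ... in the loop below.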
config_list="${train_config//,/ }"
idx=0
for config in ${config_list[@]}
do
config_path=${pwd_dir}/conf/${config}.yaml
if [[ ! -f ${config_path} ]]; then
echo "No config file ${config_path}"
exit
fi
cp -f ${config_path} ${model_dir}
if [[ $idx -eq 0 ]]; then
extra_parameter="${extra_parameter}
--train-config ${config_path}"
else
extra_parameter="${extra_parameter}
--train-config${idx} ${config_path}"
fi
idx=$((idx + 1))
done
cmd="python3 -u ${code_dir}/fairseq_cli/train.py
${data_dir}
--source-lang ${src_lang}
--target-lang ${tgt_lang}
--config-yaml ${data_config}
--task ${task}
--max-tokens ${max_tokens}
--skip-invalid-size-inputs-valid-test
--update-freq ${update_freq}
--log-interval 100
--save-dir ${model_dir}
--tensorboard-logdir ${model_dir}"
if [[ -n ${extra_parameter} ]]; then
cmd="${cmd}
${extra_parameter}"
fi
if [[ ${gpu_num} -gt 0 ]]; then
cmd="${cmd}
--distributed-world-size $gpu_num
--ddp-backend no_c10d"
fi
if [[ $fp16 -eq 1 ]]; then
cmd="${cmd}
--fp16"
fi
if [[ $step_valid -eq 1 ]]; then
validate_interval=1
save_interval=1
no_epoch_checkpoints=0
save_interval_updates=500
keep_interval_updates=10
fi
if [[ $bleu_valid -eq 1 ]]; then
cmd="$cmd
--eval-bleu
--eval-bleu-args '{\"beam\": 1}'
--eval-tokenized-bleu
--eval-bleu-remove-bpe
--best-checkpoint-metric bleu
--maximize-best-checkpoint-metric"
fi
if [[ -n $no_epoch_checkpoints && $no_epoch_checkpoints -eq 1 ]]; then
cmd="$cmd
--no-epoch-checkpoints"
fi
if [[ -n $validate_interval ]]; then
cmd="${cmd}
--validate-interval $validate_interval "
fi
if [[ -n $save_interval ]]; then
cmd="${cmd}
--save-interval $save_interval "
fi
if [[ -n $save_interval_updates ]]; then
cmd="${cmd}
--save-interval-updates $save_interval_updates"
if [[ -n $keep_interval_updates ]]; then
cmd="${cmd}
--keep-interval-updates $keep_interval_updates"
fi
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
# save info
log=${ckpt_dir}/history.log
echo "${time} | ${data_dir} | ${exp_name} | ${model_dir} " >> $log
tail -n 50 ${log} > tmp.log
mv tmp.log $log
log=${model_dir}/train.log
cmd="${cmd} 2>&1 | tee -a ${log}"
#cmd="${cmd} >> ${log} 2>&1 "
if [[ $eval -eq 1 ]]; then
# tensorboard
port=6666
tensorboard --logdir ${model_dir} --port ${port} --bind_all &
echo "${cmd}" > ${model_dir}/cmd
eval $cmd
#sleep 2s
#tail -n "$(wc -l ${log} | awk '{print $1+1}')" -f ${log}
fi
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "Stage 2: Decoding"
dec_models=()
if [[ ${n_average} -eq 1 ]]; then
dec_models=(${dec_model})
fi
if [[ ${n_average} -ne 1 ]]; then
# Average models
if [[ ${epoch_ensemble} -eq 1 ]]; then
avg_model=avg_epoch${n_average}_checkpoint.pt
if [[ ! -f ${model_dir}/${avg_model} ]]; then
cmd="python3 ${code_dir}/scripts/average_checkpoints.py
--inputs ${model_dir}
--num-epoch-checkpoints ${n_average}
--output ${model_dir}/${avg_model}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval $cmd
fi
dec_models+=(${avg_model})
fi
if [[ ${best_ensemble} -eq 1 ]]; then
avg_model=avg_best${n_average}_checkpoint.pt
if [[ ! -f ${model_dir}/${avg_model} ]]; then
cmd="python3 ${code_dir}/scripts/average_checkpoints.py
--inputs ${model_dir}
--num-best-checkpoints ${n_average}
--output ${model_dir}/${avg_model}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval $cmd
fi
dec_models+=(${avg_model})
fi
fi
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then
device=""
else
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
export CUDA_VISIBLE_DEVICES=${device}
fi
for dec_model in ${dec_models[@]}; do
suffix=alpha${len_penalty}
model_str=`echo $dec_model | sed -e "s#checkpoint##" | sed "s#.pt##"`
suffix=${suffix}_${model_str}
if [[ ${sacrebleu} -eq 1 ]]; then
suffix=${suffix}_sacrebleu
else
suffix=${suffix}_multibleu
fi
suffix=${suffix}_beam${beam_size}
if [[ ${batch_size} -ne 0 ]]; then
suffix=${suffix}_batch${batch_size}
else
suffix=${suffix}_tokens${max_tokens}
fi
if [[ ${ctc_infer} -eq 1 ]]; then
suffix=${suffix}_ctc
fi
if [[ ${ctc_self_ensemble} -eq 1 ]]; then
suffix=${suffix}_ensemble
fi
if [[ ${ctc_inter_logit} -ne 0 ]]; then
suffix=${suffix}_logit${ctc_inter_logit}
fi
if (( $(echo "${infer_ctc_weight} > 0" | bc -l) )); then
suffix=${suffix}_ctc${infer_ctc_weight}
fi
if [[ ${infer_score} -eq 1 ]]; then
suffix=${suffix}_score
fi
if [[ -n ${infer_tag} ]]; then
suffix=${suffix}_${infer_tag}
fi
suffix=`echo $suffix | sed -e "s#__#_#"`
result_file=${model_dir}/decode_result_${suffix}
[[ -f ${result_file} ]] && rm ${result_file}
test_subset=${test_subset//,/ }
for subset in ${test_subset[@]}; do
subset=${subset}
if [[ ${infer_debug} -ne 0 ]]; then
cmd="python3 -m debugpy --listen 0.0.0.0:5678 --wait-for-client"
else
cmd="python3 "
fi
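# --prefix-size 1 forces the first target token as a decoding prefix
# (assumption: the language tag in this multilingual setup, matching the
# ignore-prefix-size: 1 used in the training configs).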
cmd="$cmd ${code_dir}/fairseq_cli/generate.py
${data_dir}
--config-yaml ${data_config}
--gen-subset ${subset}
--task speech_to_text
--path ${model_dir}/${dec_model}
--results-path ${model_dir}
--batch-size ${batch_size}
--max-tokens ${max_tokens}
--beam ${beam_size}
--prefix-size 1
--skip-invalid-size-inputs-valid-test
--infer-ctc-weight ${infer_ctc_weight}
--lenpen ${len_penalty}"
if [[ ${ctc_infer} -eq 1 ]]; then
cmd="${cmd}
--ctc-infer"
fi
if [[ ${sacrebleu} -eq 1 ]]; then
cmd="${cmd}
--scoring sacrebleu"
if [[ "${tgt_lang}" = "ja" ]]; then
cmd="${cmd}
--sacrebleu-tokenizer ja-mecab"
elif [[ "${tgt_lang}" == "zh" ]]; then
cmd="${cmd}
--sacrebleu-tokenizer zh"
fi
if [[ ${tokenizer} -eq 1 ]]; then
cmd="${cmd}
--tokenizer moses
--source-lang ${src_lang}
--target-lang ${tgt_lang}"
fi
fi
if [[ ${ctc_self_ensemble} -eq 1 ]]; then
cmd="${cmd}
--ctc-self-ensemble"
fi
if [[ ${ctc_inter_logit} -ne 0 ]]; then
cmd="${cmd}
--ctc-inter-logit ${ctc_inter_logit}"
fi
if [[ ${infer_score} -eq 1 ]]; then
cmd="${cmd}
--score-reference"
fi
if [[ -n ${infer_parameter} ]]; then
cmd="${cmd}
${infer_parameter}"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
cd ${code_dir}
if [[ $eval -eq 1 ]]; then
ctc_file=translation-${subset}.ctc
if [[ ${ctc_infer} -eq 1 && -f ${model_dir}/${ctc_file} ]]; then
rm ${model_dir}/${ctc_file}
fi
xctc_file=translation-${subset}.xctc
if [[ ${ctc_infer} -eq 1 && -f ${model_dir}/${xctc_file} ]]; then
rm ${model_dir}/${xctc_file}
fi
eval $cmd
echo "" >> ${result_file}
tail -n 2 ${model_dir}/generate-${subset}.txt >> ${result_file}
mv ${model_dir}/generate-${subset}.txt ${model_dir}/generate-${subset}-${suffix}.txt
mv ${model_dir}/translation-${subset}.txt ${model_dir}/translation-${subset}-${suffix}.txt
cd ${pwd_dir}
if [[ -f ${model_dir}/enc_dump ]]; then
mv ${model_dir}/enc_dump ${model_dir}/dump-${subset}-enc-${suffix}
fi
if [[ -f ${model_dir}/dec_dump ]]; then
mv ${model_dir}/dec_dump ${model_dir}/dump-${subset}-dec-${suffix}
fi
trans_file=translation-${subset}-${suffix}.txt
if [[ ${ctc_infer} -eq 1 && -f ${model_dir}/${ctc_file} ]]; then
ref_file=${model_dir}/${subset}.${src_lang}
if [[ ! -f ${ref_file} ]]; then
python3 ./local/extract_txt_from_tsv.py ${data_dir}/${subset}.tsv ${ref_file} "src_text"
fi
if [[ -f ${ref_file} ]]; then
ctc=$(mktemp -t temp.record.XXXXXX)
cd ./local
./cal_wer.sh ${model_dir} ${subset} ${trans_file} ${ctc_file} ${ref_file} > ${ctc}
cd ..
echo "CTC WER" >> ${result_file}
tail -n 2 ${ctc} >> ${result_file}
src_bleu=$(mktemp -t temp.record.XXXXXX)
cd local
./cal_ctc_bleu.sh ${model_dir} ${subset} ${trans_file} ${ctc_file} ${ref_file} ${tokenizer} ${src_lang} > ${src_bleu}
cd ..
cat ${src_bleu} >> ${result_file}
rm ${ctc} ${src_bleu}
else
echo "No reference for source language."
fi
fi
xctc_file=translation-${subset}.xctc
if [[ ${ctc_infer} -eq 1 && -f ${model_dir}/${xctc_file} ]]; then
ref_file=${model_dir}/${subset}.${tgt_lang}
if [[ ! -f ${ref_file} ]]; then
python3 ./local/extract_txt_from_tsv.py ${data_dir}/${subset}.tsv ${ref_file} "tgt_text"
fi
if [[ -f ${ref_file} ]]; then
xctc=$(mktemp -t temp.record.XXXXXX)
cd local
./cal_wer.sh ${model_dir} ${subset} ${trans_file} ${xctc_file} ${ref_file} > ${xctc}
cd ..
echo "XCTC WER" >> ${result_file}
tail -n 2 ${xctc} >> ${result_file}
tgt_bleu=$(mktemp -t temp.record.XXXXXX)
cd local
./cal_ctc_bleu.sh ${model_dir} ${subset} ${trans_file} ${xctc_file} ${ref_file} ${tokenizer} ${tgt_lang} > ${tgt_bleu}
cd ..
cat ${tgt_bleu} >> ${result_file}
rm ${xctc} ${tgt_bleu}
else
echo "No reference for target language."
fi
fi
fi
done
echo
echo "" >> ${result_file}
cat ${result_file}
done
fi
#!/usr/bin/env bash
# training the model
gpu_num=8
update_freq=1
max_tokens=40000
extra_tag=
extra_parameter=
#extra_tag="${extra_tag}"
#extra_parameter="${extra_parameter} "
exp_tag=
# Base
#config_list=(base conformer ctc)
# SATE
config_list=(sate conformer ctc)
# PDS
#config_list=(pds_base_8 conformer ctc)
# exp full name
exp_name=
train_config=$(echo ${config_list[*]} | sed 's/ /,/g')
cmd="./run.sh
--stage 1
--stop_stage 2
--gpu_num ${gpu_num}
--update_freq ${update_freq}
--train_config ${train_config}
--max_tokens ${max_tokens}
"
if [[ -n ${exp_name} ]]; then
cmd="$cmd --exp_name ${exp_name}"
fi
if [[ -n ${exp_tag} ]]; then
cmd="$cmd --exp_tag ${exp_tag}"
fi
if [[ -n ${extra_tag} ]]; then
cmd="$cmd --extra_tag ${extra_tag}"
fi
if [[ -n ${extra_parameter} ]]; then
cmd="$cmd --extra_parameter \"${extra_parameter}\""
fi
echo ${cmd}
eval ${cmd}
set -e
eval=1
lcrm=1
tokenizer=0
vocab_type=unigram
vocab_size=5000
use_raw_audio=0
speed_perturb=0
dataset=mustc
root_dir=~/st
code_dir=${root_dir}/Fairseq-S2T
org_data_dir=${root_dir}/data/${dataset}
data_dir=${root_dir}/data/${dataset}/st
use_specific_dict=0
specific_prefix=st
specific_dir=${root_dir}/data/mustc/st
asr_vocab_prefix=spm_unigram10000_st_share
src_lang=en
tgt_lang=zh
subsets=(2019)
splits=$(echo ${subsets[*]} | sed 's/ /_/g')
cmd="python ${code_dir}/examples/speech_to_text/prep_audio_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}
--task asr
--src-lang ${src_lang}
--tgt-lang ${tgt_lang}
--splits ${splits}
--vocab-type ${vocab_type}
--vocab-size ${vocab_size}"
if [[ ${use_raw_audio} -eq 1 ]]; then
cmd="$cmd
--raw"
fi
if [[ ${use_specific_dict} -eq 1 ]]; then
cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}
cmd="$cmd
--asr-prefix ${asr_vocab_prefix}"
fi
if [[ ${speed_perturb} -eq 1 ]]; then
cmd="$cmd
--speed-perturb"
fi
if [[ ${lcrm} -eq 1 ]]; then
cmd="$cmd
--lowercase-src
--rm-punc-src"
fi
if [[ ${tokenizer} -eq 1 ]]; then
cmd="$cmd
--tokenizer"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
# Append-based Interpolation Augmentation
inter-mixup: True
inter-mixup-layer: -1
inter-mixup-decoder-layer: 0
inter-mixup-prob: 1.0
inter-mixup-ratio: 1.0
inter-mixup-beta: 0.2
inter-mixup-keep-org: True
inter-mixup-decoder-emb: True
ctc-mixup-consistent-weight: 0
inter-ctc-mixup-consistent-weight: 0
mixup-consistent-weight: 0
mixup-no-hard-loss: False
no-specaugment: False
layer-out-norm: False
inter-mixup-ratio-decay: False
inter-mixup-ratio-decay-params: 20000,40000,0
\ No newline at end of file
# Append-based Interpolation Augmentation
inter-mixup: True
inter-mixup-layer: -1
inter-mixup-decoder-layer: 0
inter-mixup-prob: 1.0
inter-mixup-ratio: 1.0
inter-mixup-beta: 0.2
inter-mixup-keep-org: True
inter-mixup-decoder-emb: True
ctc-mixup-consistent-weight: 0.15
inter-ctc-mixup-consistent-weight: 0.1
mixup-consistent-weight: 0.5
mixup-no-hard-loss: False
no-specaugment: False
layer-out-norm: False
inter-mixup-ratio-decay: False
inter-mixup-ratio-decay-params: 20000,40000,0
\ No newline at end of file
arch: s2t_transformer_s
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-embed-norm: True
encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
attention-dropout: 0.1
activation-dropout: 0.1
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
train-subset: train_mix_1,train_mix_2,train_mix_3,train_mix_4,train_mix_5,train_mix_6,train_mix_7,train_mix_8,train_mix_9,train_mix_10,train_mix_11,train_mix_12,train_mix_13,train_mix_14,train_mix_15,train_mix_16
valid-subset: dev_de-en,dev_en-de
ignore-prefix-size: 1
#sharded-data-load: True
max-epoch: 300
max-update: 300000
patience: 20
post-process: sentencepiece
#best_checkpoint_metric: loss
#maximize_best_checkpoint_metric: False
eval-bleu: True
eval-bleu-args: {"beam": 5, "lenpen": 1.0, "prefix_size": 1}
eval-bleu-detok: moses
eval-bleu-remove-bpe: sentencepiece
eval-bleu-print-samples: True
best_checkpoint_metric: bleu
maximize_best_checkpoint_metric: True
# no-epoch-checkpoints: True
validate-interval: 4
keep-last-epochs: 10
keep-best-checkpoints: 10
num-workers: 8
no-progress-bar: True
log-interval: 100
seed: 1
label-smoothing: 0.1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
arch: s2t_transformer_m
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-embed-norm: True
encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
layer-padding-mask: True
\ No newline at end of file
ctc-weight: 0.3
share-ctc-and-embed: True
\ No newline at end of file
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
text-no-pos-emb: True
textual-encoder-embed-norm: False
textual-encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
acoustic-encoder: transformer
adapter: none
# load-pretrained-encoder-from:
# load-pretrained-acoustic-encoder-from:
# load-pretrained-text-encoder-from:
# load-pretrained-decoder-from:
\ No newline at end of file
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
text-no-pos-emb: True
textual-encoder-embed-norm: False
textual-encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
acoustic-encoder: transformer
adapter: none
# load-pretrained-encoder-from:
# load-pretrained-acoustic-encoder-from:
# load-pretrained-text-encoder-from:
# load-pretrained-decoder-from:
\ No newline at end of file
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
text-no-pos-emb: True
textual-encoder-embed-norm: False
textual-encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
acoustic-encoder: pds
adapter: none
encoder-embed-dim: 512
pds-stages: 4
pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 512_512_512_512
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 4_4_4_4
pds-attn-heads: 8_8_8_8
# load-pretrained-encoder-from:
# load-pretrained-acoustic-encoder-from:
# load-pretrained-text-encoder-from:
# load-pretrained-decoder-from:
\ No newline at end of file
use-enc-dlcl: True
use-dec-dlcl: True
arch: s2t_dual
asr-encoder: transformer
mt-encoder-layers: 6
mt-encoder: transformer
encoder-collaboration-mode: parallel
decoder-collaboration-mode: parallel
encoder-league-s1-ratio: 0.5
encoder-league-s2-ratio: 0.5
encoder-league-drop-net: False
encoder-league-drop-net-prob: 0.2
encoder-league-drop-net-mix: False
decoder-league-s1-ratio: 0.5
decoder-league-s2-ratio: 0.5
decoder-league-drop-net: False
decoder-league-drop-net-prob: 0.0
decoder-league-drop-net-mix: False
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: join_speech_and_text_loss
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-asr-encoder-from:
#load-pretrained-mt-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
compression-metric: threshold
compression-mode: create
compression-layers: 6,9
compression-threshold: 0.99
compression-norm: True
compression-pos: True
\ No newline at end of file
inter-ctc-weight: 0.2
inter-ctc-layers: 6,9
share-inter-ctc: True
ctc-pae: none
# ctc-pae: inter_league
# ctc-pae-ground-truth-ratio: 0.1
# pae-gumbel: True
# pae-distribution-hard: True
# pae-drop-prob: 0.0
# pae-distribution-cutoff: 10
# share-pae-and-ctc: True
# pae-embed-norm: True
# pae-out-norm: True
# ctc-self-distill-weight: 1
# target-ctc-self-distill-weight: 1
# ctc-self-distill-prob: 0.1
# cal-all-ctc: True
\ No newline at end of file
inter-mixup: True
inter-mixup-layer: -1
inter-mixup-decoder-layer: 0
inter-mixup-prob: 1.0
inter-mixup-ratio: 1.0
inter-mixup-beta: 0.2
inter-mixup-keep-org: False
inter-mixup-decoder-emb: False
ctc-mixup-consistent-weight: 0
inter-ctc-mixup-consistent-weight: 0
mixup-consistent-weight: 0
mixup-no-hard-loss: False
no-specaugment: False
layer-out-norm: False
inter-mixup-ratio-decay: False
inter-mixup-ratio-decay-params: 20000,40000,0
\ No newline at end of file
inter-ctc-mlo: 1:2:3
\ No newline at end of file
arch: s2t_multibranch
junior-acoustic-encoder: transformer
acoustic-adapter: none
textual-adapter: none
junior-acoustic-encoder-layers: 12
senior-acoustic-encoder-layers: 6
textual-encoder-layers: 6
# collaboration-direction: none
# collaboration-direction: acoustic
# collaboration-direction: textual
collaboration-direction: both
collaboration-start: 0:0
collaboration-step: 1:1
# encoder-collaboration-mode: serial
# decoder-collaboration-mode: serial
encoder-collaboration-mode: parallel
decoder-collaboration-mode: parallel
use-raw-text: False
modality-switch: False
text-to-hidden-progress: 1:0:500
encoder-league-s1-ratio: 0.5
encoder-league-s2-ratio: 0.5
encoder-league-out-norm: False
encoder-league-gated: False
encoder-league-drop-net: False
encoder-league-drop-net-prob: 0.2
encoder-league-drop-net-mix: False
decoder-league-s1-ratio: 0.5
decoder-league-s2-ratio: 0.5
decoder-league-out-norm: False
decoder-league-gated: False
decoder-league-drop-net: False
decoder-league-drop-net-prob: 0.0
decoder-league-drop-net-mix: False
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
encoder-embed-norm: True
encoder-no-scale-embedding: True
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-attention-heads: 4
decoder-layers: 6
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
attention-dropout: 0.1
activation-dropout: 0.1
#adapter: inter_league
#adapter: none
#adapter-embed-norm: True
#adapter-out-norm: True
#share-adapter-and-ctc: True
#share-adapter-and-embed: True
#load-pretrained-junior-acoustic-encoder-from:
#load-pretrained-senior-acoustic-encoder-from:
#load-pretrained-textual-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: s2t_ctc
encoder-type: sate
criterion: ctc
zero_infinity: True
xctc-weight: 1.0
ctc-weight: 1.0
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
text-no-pos-emb: True
textual-encoder-embed-norm: False
textual-encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
acoustic-encoder: transformer
adapter: none
\ No newline at end of file
arch: s2t_ctc
encoder-type: sate
criterion: ctc
zero_infinity: True
xctc-weight: 1.0
ctc-weight: 1.0
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
text-no-pos-emb: True
textual-encoder-embed-norm: False
textual-encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 8
acoustic-encoder: transformer
adapter: none
\ No newline at end of file
arch: s2t_ctc
encoder-type: sate
criterion: ctc
zero_infinity: True
xctc-weight: 1.0
ctc-weight: 1.0
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
text-no-pos-emb: True
textual-encoder-embed-norm: False
textual-encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-layers: 12
text-encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 8
acoustic-encoder: pds
adapter: none
encoder-embed-dim: 512
pds-stages: 4
pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 512_512_512_512
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 4_4_4_4
pds-attn-heads: 8_8_8_8
\ No newline at end of file
arch: pdss2t_transformer_s_16
encoder-embed-dim: 256
pds-stages: 4
pds-layers: 2_2_6_2
pds-ratios: 2_2_2_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: pdss2t_transformer_s_32
encoder-embed-dim: 256
pds-stages: 5
pds-layers: 2_2_3_3_2
pds-ratios: 2_2_2_2_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1_1
pds-kernel-sizes: 5_5_5_5_5
pds-ffn-ratios: 8_8_8_8_8
pds-attn-heads: 4_4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: pdss2t_transformer_s_8
encoder-embed-dim: 256
pds-stages: 3
pds-layers: 4_4_4
pds-ratios: 2_2_1
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1
pds-kernel-sizes: 5_5_5
pds-ffn-ratios: 8_8_8
pds-attn-heads: 4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: pdss2t_transformer_s_8
encoder-embed-dim: 256
pds-stages: 4
pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: pdss2t_transformer_s_8
encoder-embed-dim: 256
pds-stages: 3
pds-layers: 4_4_4
pds-ratios: 2_2_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1
pds-kernel-sizes: 5_5_5
pds-ffn-ratios: 8_8_8
pds-attn-heads: 4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: pdss2t_transformer_m_16
encoder-embed-dim: 512
pds-stages: 4
pds-layers: 2_2_6_2
pds-ratios: 2_2_2_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 512_512_512_512
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 4_4_4_4
pds-attn-heads: 8_8_8_8
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
dropout: 0.15
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: pdss2t_transformer_m_32
encoder-embed-dim: 512
pds-stages: 5
pds-layers: 2_2_3_3_2
pds-ratios: 2_2_2_2_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 512_512_512_512_512
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1_1
pds-kernel-sizes: 5_5_5_5_5
pds-ffn-ratios: 4_4_4_4_4
pds-attn-heads: 8_8_8_8_8
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
dropout: 0.15
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
\ No newline at end of file
arch: pdss2t_transformer_m_8
encoder-embed-dim: 512
pds-stages: 4
pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 512_512_512_512
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 4_4_4_4
pds-attn-heads: 8_8_8_8
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
dropout: 0.15
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: pdss2t_transformer_s_16
encoder-embed-dim: 512
pds-stages: 4
pds-layers: 3_3_8_4
pds-ratios: 2_2_2_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 512_512_512_512
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 8_8_8_8
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
dropout: 0.15
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 18
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: pdss2t_transformer_s_16
encoder-embed-dim: 512
pds-stages: 5
pds-layers: 3_3_4_4_4
pds-ratios: 2_2_2_2_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 512_512_512_512_512
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1_1
pds-kernel-sizes: 5_5_5_5_5
pds-ffn-ratios: 8_8_8_8_8
pds-attn-heads: 8_8_8_8_8
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
dropout: 0.15
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 18
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: pdss2t_transformer_s_8
encoder-embed-dim: 512
pds-stages: 4
pds-layers: 4_5_5_4
pds-ratios: 2_2_1_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 512_512_512_512
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 8_8_8_8
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
dropout: 0.15
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 18
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: pdss2t_transformer_sd_16
encoder-embed-dim: 256
pds-stages: 4
pds-layers: 3_3_8_4
pds-ratios: 2_2_2_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 18
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: pdss2t_transformer_sd_32
encoder-embed-dim: 256
pds-stages: 5
pds-layers: 3_3_4_4_4
pds-ratios: 2_2_2_2_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1_1
pds-kernel-sizes: 5_5_5_5_5
pds-ffn-ratios: 8_8_8_8_8
pds-attn-heads: 4_4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 18
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: pdss2t_transformer_sd_8
encoder-embed-dim: 256
pds-stages: 4
pds-layers: 4_5_5_4
pds-ratios: 2_2_1_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 18
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
# Append-based Interpolation Augmentation
inter-mixup: True
inter-mixup-layer: -1
inter-mixup-decoder-layer: 0
inter-mixup-prob: 1.0
inter-mixup-ratio: 1.0
inter-mixup-beta: 0.2
inter-mixup-keep-org: True
inter-mixup-decoder-emb: True
mixup-no-hard-loss: False
no-specaugment: False
layer-out-norm: False
inter-mixup-ratio-decay: False
inter-mixup-ratio-decay-params: 20000,40000,0
# Bilingual CTC
share-ctc-and-embed: True
share-xctc-and-embed: True
ctc-weight: 0.2
xctc-weight: 0.1
# InterCTC
inter-ctc-weight: 0.1
inter-ctc-layers: 6,9
share-inter-ctc: True
inter-xctc-weight: 0.05
inter-xctc-layers: 6,9
# Prediction-aware encoding
ctc-pae: inter_league
xctc-pae: inter_league
pae-unnorm-input: True
ctc-mixup-consistent-weight: 0.1
inter-ctc-mixup-consistent-weight: 0.05
xctc-mixup-consistent-weight: 0.05
inter-xctc-mixup-consistent-weight: 0.25
mixup-consistent-weight: 0.5
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
layer-padding-mask: True
\ No newline at end of file
# Append-based Interpolation Augmentation
inter-mixup: True
inter-mixup-layer: -1
inter-mixup-decoder-layer: 0
inter-mixup-prob: 1.0
inter-mixup-ratio: 1.0
inter-mixup-beta: 0.2
inter-mixup-keep-org: True
inter-mixup-decoder-emb: True
mixup-no-hard-loss: True
no-specaugment: False
layer-out-norm: False
inter-mixup-ratio-decay: False
inter-mixup-ratio-decay-params: 20000,40000,0
# Bilingual CTC
share-ctc-and-embed: True
share-xctc-and-embed: True
ctc-weight: 0.2
xctc-weight: 0.1
# InterCTC
inter-ctc-weight: 0.1
inter-ctc-layers: 6,9
share-inter-ctc: True
inter-xctc-weight: 0.05
inter-xctc-layers: 6,9
# Prediction-aware encoding
ctc-pae: inter_league
xctc-pae: inter_league
pae-unnorm-input: True
ctc-mixup-consistent-weight: 0.1
inter-ctc-mixup-consistent-weight: 0.05
xctc-mixup-consistent-weight: 0.05
inter-xctc-mixup-consistent-weight: 0.25
mixup-consistent-weight: 0.5
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
layer-padding-mask: True
\ No newline at end of file
# Append-based Interpolation Augmentation
inter-mixup: True
inter-mixup-layer: -1
inter-mixup-decoder-layer: 0
inter-mixup-prob: 1.0
inter-mixup-ratio: 1.0
inter-mixup-beta: 0.2
inter-mixup-keep-org: True
inter-mixup-decoder-emb: True
mixup-no-hard-loss: False
no-specaugment: False
layer-out-norm: False
inter-mixup-ratio-decay: False
inter-mixup-ratio-decay-params: 20000,40000,0
# MTL
ctc-weight: 0.3
inter-ctc-weight: 0.2
inter-ctc-layers: 6,9
share-inter-ctc: True
share-ctc-and-embed: True
ctc-pae: inter_league
pae-unnorm-input: True
ctc-mixup-consistent-weight: 0.15
inter-ctc-mixup-consistent-weight: 0.1
mixup-consistent-weight: 0.5
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
layer-padding-mask: True
\ No newline at end of file
# Append-based Interpolation Augmentation
inter-mixup: True
inter-mixup-layer: -1
inter-mixup-decoder-layer: 0
inter-mixup-prob: 1.0
inter-mixup-ratio: 1.0
inter-mixup-beta: 0.2
inter-mixup-keep-org: True
inter-mixup-decoder-emb: True
mixup-no-hard-loss: True
no-specaugment: False
layer-out-norm: False
inter-mixup-ratio-decay: False
inter-mixup-ratio-decay-params: 20000,40000,0
# MTL
ctc-weight: 0.3
share-ctc-and-embed: True
inter-ctc-weight: 0.2
inter-ctc-layers: 6,9
share-inter-ctc: True
ctc-pae: inter_league
pae-unnorm-input: True
ctc-mixup-consistent-weight: 0.15
inter-ctc-mixup-consistent-weight: 0.1
mixup-consistent-weight: 0.5
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
layer-padding-mask: True
\ No newline at end of file
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
textual-encoder-embed-norm: True
textual-encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
acoustic-encoder: transformer
adapter: inter_league
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
# Bilingual CTC
share-ctc-and-embed: True
share-xctc-and-embed: True
ctc-weight: 0.2
xctc-weight: 0.1
# InterCTC
inter-ctc-weight: 0.1
inter-ctc-layers: 6,9
share-inter-ctc: True
inter-xctc-weight: 0.05
inter-xctc-layers: 4
# Prediction-aware encoding
ctc-pae: inter_league
xctc-pae: inter_league
pae-unnorm-input: True
# Curriculum learning mixing
xctc-pae-ground-truth-ratio: 0.1
xctc-pae-ground-truth-only-mistake: True
pae-oracle-smooth: True
arch: s2t_transformer_m
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-embed-norm: True
encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 18
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
# Bilingual CTC
share-ctc-and-embed: True
share-xctc-and-embed: True
ctc-weight: 0.2
ctc-layer: 12
xctc-weight: 0.1
# InterCTC
inter-ctc-weight: 0.1
inter-ctc-layers: 6,9
share-inter-ctc: True
inter-xctc-weight: 0.05
inter-xctc-layers: 16
# Prediction-aware encoding
ctc-pae: inter_league
xctc-pae: inter_league
pae-unnorm-input: True
# Curriculum learning mixing
xctc-pae-ground-truth-ratio: 0.1
xctc-pae-ground-truth-only-mistake: True
pae-oracle-smooth: True
arch: s2t_transformer_m
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-embed-norm: True
encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 18
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
# Bilingual CTC
share-ctc-and-embed: True
share-xctc-and-embed: True
ctc-weight: 0.2
ctc-layer: 12
#xctc-weight: 0.1
# InterCTC
inter-ctc-weight: 0.1
inter-ctc-layers: 6,9
share-inter-ctc: True
#inter-xctc-weight: 0.05
#inter-xctc-layers: 16
# Prediction-aware encoding
ctc-pae: inter_league
#xctc-pae: inter_league
pae-unnorm-input: True
# Curriculum learning mixing
#xctc-pae-ground-truth-ratio: 0.1
#xctc-pae-ground-truth-only-mistake: True
#pae-oracle-smooth: True
arch: s2t_transformer_m
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-embed-norm: True
encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 18
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
# Bilingual CTC
share-ctc-and-embed: True
share-xctc-and-embed: True
ctc-weight: 0.2
xctc-weight: 0.1
# InterCTC
inter-ctc-weight: 0.1
inter-ctc-layers: 6,9,12,15
share-inter-ctc: True
inter-xctc-weight: 0.05
inter-xctc-layers: 6,9,12,15
# Prediction-aware encoding
ctc-pae: inter_league
xctc-pae: inter_league
pae-unnorm-input: True
# Curriculum learning mixing
xctc-pae-ground-truth-ratio: 0.1
xctc-pae-ground-truth-only-mistake: True
pae-oracle-smooth: True
arch: s2t_transformer_m
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-embed-norm: True
encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 18
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
# Bilingual CTC
share-ctc-and-embed: True
share-xctc-and-embed: True
ctc-weight: 0.2
xctc-weight: 0.1
# InterCTC
inter-ctc-weight: 0.1
inter-ctc-layers: 6,9,12,15
share-inter-ctc: True
inter-xctc-weight: 0.05
inter-xctc-layers: 6,9,12,15
# Prediction-aware encoding
ctc-pae: inter_league
xctc-pae: inter_league
pae-unnorm-input: True
# Curriculum learning mixing
xctc-pae-ground-truth-ratio: 0.1
xctc-pae-ground-truth-only-mistake: True
pae-oracle-smooth: True
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
text-no-pos-emb: True
textual-encoder-embed-norm: False
textual-encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
acoustic-encoder: transformer
adapter: inter_league
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
# CTC & XCTC
share-ctc-and-embed: True
share-xctc-and-embed: True
ctc-weight: 0.2
xctc-weight: 0.2
# InterCTC
share-inter-ctc: True
inter-ctc-weight: 0.1
inter-ctc-layers: 6,9
inter-xctc-weight: 0.1
inter-xctc-layers: 4
# Prediction-aware encoding
ctc-pae: inter_league
xctc-pae: inter_league
pae-unnorm-input: True
# Cross-layer attn
xctc-cross-attn: True
cross-attn-start-layer: 3
cross-attn-layer: 2
cross-attn-collaboration-mode: serial
cross-attn-league-drop-net: True
cross-attn-league-drop-net-prob: 0.1
# Curriculum learning mixing
xctc-pae-ground-truth-ratio: 0.5
xctc-pae-ground-truth-only-mistake: True
pae-oracle-smooth: True
arch: s2t_ctc
encoder-type: sate
criterion: ctc
zero_infinity: True
xctc-weight: 1.0
ctc-weight: 1.0
share-ctc-and-embed: True
share-xctc-and-embed: True
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
text-no-pos-emb: True
textual-encoder-embed-norm: False
textual-encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 8
acoustic-encoder: transformer
adapter: inter_league
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
# InterCTC
share-inter-ctc: True
inter-ctc-weight: 1.0
inter-ctc-layers: 6,9
inter-xctc-weight: 1.0
inter-xctc-layers: 6,9
# Prediction-aware encoding
ctc-pae: inter_league
xctc-pae: inter_league
# Cross-layer attn
xctc-cross-attn: True
cross-attn-start-layer: 4
cross-attn-layer: 3
cross-attn-collaboration-mode: serial
cross-attn-league-drop-net: True
cross-attn-league-drop-net-prob: 0.1
# Curriculum learning mixing
xctc-pae-ground-truth-ratio: 0.8
xctc-pae-ground-truth-only-mistake: True
pae-oracle-smooth: True
arch: s2t_ctc
encoder-type: transformer
criterion: ctc
zero_infinity: True
xctc-weight: 1.0
ctc-weight: 1.0
share-ctc-and-embed: True
share-xctc-and-embed: True
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 18
encoder-attention-heads: 8
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
# InterCTC
share-inter-ctc: True
inter-ctc-weight: 1.0
inter-ctc-layers: 6,9,12,15
inter-xctc-weight: 1.0
inter-xctc-layers: 6,9,12,15
# Prediction-aware encoding
ctc-pae: inter_league
xctc-pae: inter_league
pae-unnorm-input: True
# Cross-layer attn
# xctc-cross-attn: True
# cross-attn-start-layer: 4
# cross-attn-layer: 3
# cross-attn-collaboration-mode: serial
# cross-attn-league-drop-net: True
# cross-attn-league-drop-net-prob: 0.1
# Curriculum learning mixing
xctc-pae-ground-truth-ratio: 0.8
xctc-pae-ground-truth-only-mistake: True
pae-oracle-smooth: True
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
ctc-weight: 0.3
share-ctc-and-embed: True
criterion: label_smoothed_cross_entropy_with_ctc
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
textual-encoder-embed-norm: True
textual-encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
acoustic-encoder: transformer
adapter: inter_league
# adapter: none
# adapter-embed-norm: True
# adapter-out-norm: True
# load-pretrained-encoder-from:
# load-pretrained-acoustic-encoder-from:
# load-pretrained-text-encoder-from:
# load-pretrained-decoder-from:
\ No newline at end of file
encoder-attention-type: rel_pos
# encoder-attention-type: relative
# decoder-attention-type: relative
# max-encoder-relative-length: 100
# max-decoder-relative-length: 20
\ No newline at end of file
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
textual-encoder-embed-norm: True
textual-encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
acoustic-encoder: transformer
adapter: inter_league
#adapter: none
#adapter-embed-norm: True
#adapter-out-norm: True
#share-adapter-and-ctc: True
#share-adapter-and-embed: True
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
textual-encoder-embed-norm: True
textual-encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
acoustic-encoder: transformer
adapter: inter_league
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-embed-norm: True
encoder-no-scale-embedding: True
textual-encoder-embed-norm: True
textual-encoder-no-scale-embedding: True
encoder-normalize-before: True
decoder-normalize-before: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
acoustic-encoder: pds
adapter: inter_league
encoder-embed-dim: 512
pds-stages: 4
pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 512_512_512_512
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 4_4_4_4
pds-attn-heads: 8_8_8_8
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
textual-encoder-embed-norm: True
textual-encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 18
text-encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
acoustic-encoder: transformer
adapter: inter_league
#adapter-embed-norm: True
#adapter-out-norm: True
#share-adapter-and-ctc: True
#share-adapter-and-embed: True
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-embed-norm: True
encoder-no-scale-embedding: True
textual-encoder-embed-norm: True
textual-encoder-no-scale-embedding: True
encoder-normalize-before: True
decoder-normalize-before: True
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
acoustic-encoder: pds
adapter: inter_league
encoder-embed-dim: 256
pds-stages: 4
pds-layers: 2_2_6_2
pds-ratios: 2_2_2_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-embed-norm: True
encoder-no-scale-embedding: True
textual-encoder-embed-norm: True
textual-encoder-no-scale-embedding: True
encoder-normalize-before: True
decoder-normalize-before: True
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
acoustic-encoder: pds
adapter: inter_league
encoder-embed-dim: 256
pds-stages: 4
pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-embed-norm: True
encoder-no-scale-embedding: True
textual-encoder-embed-norm: True
textual-encoder-no-scale-embedding: True
encoder-normalize-before: True
decoder-normalize-before: True
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
acoustic-encoder: pds
adapter: inter_league
encoder-embed-dim: 256
pds-stages: 3
pds-layers: 4_4_4
pds-ratios: 2_2_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1
pds-kernel-sizes: 5_5_5
pds-ffn-ratios: 8_8_8
pds-attn-heads: 4_4_4
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-embed-norm: True
encoder-no-scale-embedding: True
textual-encoder-embed-norm: True
textual-encoder-no-scale-embedding: True
encoder-normalize-before: True
decoder-normalize-before: True
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
acoustic-encoder: pds
adapter: inter_league
encoder-embed-dim: 256
pds-stages: 4
pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-embed-norm: True
encoder-no-scale-embedding: True
textual-encoder-embed-norm: True
textual-encoder-no-scale-embedding: True
encoder-normalize-before: True
decoder-normalize-before: True
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 18
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
acoustic-encoder: pds
adapter: inter_league
encoder-embed-dim: 256
pds-stages: 4
pds-layers: 4_5_5_4
pds-ratios: 2_2_1_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: s2t_w2v2_transformer
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-embed-norm: True
encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
attention-dropout: 0.1
activation-dropout: 0.1
w2v2-model-path: /home/xuchen/st/models/w2v2/wav2vec_small.pt
freeze-w2v: False
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
xctc-weight: 0.3
share-xctc-and-embed: True
\ No newline at end of file
inter-xctc-weight: 0.2
inter-xctc-layers: 6,9
xctc-pae: none
# xctc-pae: inter_league
xctc-cross-attn: False
cross-attn-start-layer: 7
cross-attn-layer: 6
cross-attn-collaboration-mode: parallel
cross-attn-league-s1-ratio: 0.5
cross-attn-league-s2-ratio: 0.5
cross-attn-league-out-norm: False
cross-attn-league-gated: False
cross-attn-league-drop-net: False
cross-attn-league-drop-net-prob: 0.2
cross-attn-league-drop-net-mix: False
# xctc-pae-ground-truth-ratio: 0.1
# xctc-pae-ground-truth-ratio-adaptive: True
# xctc-pae-ground-truth-only-mistake: True
# pae-oracle-smooth: True
# pae-gumbel: True
# pae-distribution-hard: True
# pae-drop-prob: 0.0
# pae-distribution-cutoff: 10
# share-pae-and-xctc: True
# pae-embed-norm: True
# pae-out-norm: True
# ctc-self-distill-weight: 1
# target-ctc-self-distill-weight: 1
# ctc-self-distill-prob: 0.1
# cal-all-ctc: True
\ No newline at end of file
#!/usr/bin/env bash
gpu_num=1
data_tag=st
test_subset=(tst-COMMON_en-de tst-COMMON_en-fr tst-COMMON_en-es tst-COMMON_en-it tst-COMMON_en-nl tst-COMMON_en-pt tst-COMMON_en-ro tst-COMMON_en-ru)
#test_subset=(tst-COMMON_en-de)
#test_subset=(test_en-fr_1k)
exp_name=
if [ "$#" -eq 1 ]; then
exp_name=$1
fi
sacrebleu=1
ctc_infer=1
n_average=10
beam_size=5
infer_ctc_weight=0.1
len_penalty=1.0
max_tokens=20000
batch_size=1
infer_debug=0
dec_model=checkpoint_best.pt
cmd="./run.sh
--stage 2
--stop_stage 2
--gpu_num ${gpu_num}
--exp_name ${exp_name}
--sacrebleu ${sacrebleu}
--n_average ${n_average}
--beam_size ${beam_size}
--len_penalty ${len_penalty}
--batch_size ${batch_size}
--max_tokens ${max_tokens}
--dec_model ${dec_model}
--ctc_infer ${ctc_infer}
--infer_ctc_weight ${infer_ctc_weight}
--infer_debug ${infer_debug}
"
if [[ -n ${data_tag} ]]; then
    cmd="$cmd --data_tag ${data_tag}"
fi
if [[ ${#test_subset[@]} -ne 0 ]]; then
    subsets=$(echo ${test_subset[*]} | sed 's/ /,/g')
    cmd="$cmd --test_subset ${subsets}"
fi
echo $cmd
eval $cmd
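# Usage sketch: an experiment name may be passed as the single argument,
# e.g. `sh test.sh en-de_baseline` (script and experiment names here are
# illustrative); the decoding settings above are forwarded to stage 2 of run.sh.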
set -e
gpu_num=1
root_dir=/home/xuchen/st/Fairseq-S2T
ckpt=/home/xuchen/st/checkpoints/mustc-v2/st
model_txt=$1
set=$2
test_subset=$3
#data_dir=/home/xuchen/st/data/mustc-v2/st_lcrm/en-de
#test_subset=(tst-COMMON)
data_dir=/media/data/tst/$set/en-de
#test_subset=(office)
#test_subset=(webrtc1)
#test_subset=(adap2)
data_config=config_st_share.yaml
result_file=./result
beam_size=5
lenpen=0.6
max_tokens=10000
models=()
i=0
for line in `cat $model_txt`; do
    i=`expr $i + 1`
    model_dir=$ckpt/$line
    [[ ! -d $model_dir ]] && echo $model_dir && exit 1;
    if [[ -f $model_dir/avg_10_checkpoint.pt ]]; then
        model=$model_dir/avg_10_checkpoint.pt
    else
        model=$model_dir/checkpoint_best.pt
    fi
    [[ ! -f $model ]] && echo $model && exit 1;
    models[$i]=$model
done
models=`echo ${models[*]} | sed 's/ /:/g'`
res_dir=$ckpt/ensemble/$set
i=0
# Pick the first unused numbered result directory under ensemble/$set.
while :
do
    if [[ -d $res_dir/$i ]]; then
        i=`expr $i + 1`
    else
        res_dir=$res_dir/$i
        break
    fi
done
mkdir -p $res_dir
cp $model_txt $res_dir
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
    if [[ ${gpu_num} -eq 0 ]]; then
        device=()
    else
        source ./local/utils.sh
        device=$(get_devices $gpu_num 0)
    fi
fi
export CUDA_VISIBLE_DEVICES=${device}
for subset in ${test_subset[@]}; do
    subset=${subset}_st
    cmd="python ${root_dir}/fairseq_cli/generate.py
        ${data_dir}
        --config-yaml ${data_config}
        --gen-subset ${subset}
        --task speech_to_text
        --path ${models}
        --results-path ${res_dir}
        --skip-invalid-size-inputs-valid-test
        --max-tokens ${max_tokens}
        --beam ${beam_size}
        --lenpen ${lenpen}
        --scoring sacrebleu"
    echo -e "\033[34mRun command: \n${cmd} \033[0m"
    eval $cmd
    tail -n 1 ${res_dir}/generate-${subset}.txt
    cd $res_dir
    evaluate.sh translation-${subset}.txt $set
    cd -
done
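# Usage sketch (positional arguments as read above):
#   bash ensemble.sh model_list.txt tst-set tst-COMMON
# where model_list.txt names one checkpoint directory per line under ${ckpt};
# the collected checkpoints are joined with ':' for fairseq ensemble decoding.
# (script filename and argument values are illustrative)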
#!/usr/bin/env bash
set -e
ref=$1
gen=$2
tokenizer=$3
lang=$4
lang_pair=en-${lang}
record=$(mktemp -t temp.record.XXXXXX)
if [[ ${tokenizer} -eq 1 ]]; then
    echo "MultiBLEU" > ${record}
    cmd="multi-bleu.perl ${ref} < ${gen}"
    eval $cmd | head -n 1 >> ${record}
    cmd="detokenizer.perl -q -l ${lang} --threads 32 < ${ref} > ${ref}.detok"
    eval $cmd
    cmd="detokenizer.perl -q -l ${lang} --threads 32 < ${gen} > ${gen}.detok"
    eval $cmd
    ref=${ref}.detok
    gen=${gen}.detok
fi
echo "SacreBLEU" >> ${record}
cmd="cat ${gen} | sacrebleu ${ref} -m bleu -w 4 -l ${lang_pair} | jq -r .score"
eval $cmd >> ${record}
cat ${record}
rm ${record}
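# Example usage (hypothetical file names), assuming multi-bleu.perl and
# detokenizer.perl from the Moses scripts are on $PATH:
#   ./cal_bleu.sh ref.de gen.de 1 de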
#!/usr/bin/env bash
set -e
infer_dir=$1
tag=$2
s2s_infer_file=${infer_dir}/$3
org_ctc_infer_file=${infer_dir}/$4
ref=$5
tokenizer=$6
lang=$7
idx=${infer_dir}/${tag}_idx
ctc_infer=${infer_dir}/${tag}_ctc_infer
ctc_infer_sort=${infer_dir}/${tag}_ctc_infer_sort
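# Reattach the utterance indices from the s2s output, sort numerically, and
# drop the index column so the CTC hypotheses are restored to corpus order
# before scoring.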
if [[ ! -f ${ctc_infer_sort} ]]; then
cut -f1 ${s2s_infer_file} > ${idx}
paste ${idx} ${org_ctc_infer_file} > ${ctc_infer}
sort -n -t $'\t' ${ctc_infer} | cut -f2 > ${ctc_infer_sort}
fi
gen=${ctc_infer_sort}
./cal_bleu.sh ${ref} ${gen} ${tokenizer} ${lang}
\ No newline at end of file
import unicodedata
import jiwer
import jiwer.transforms as tr
import sys
ref_file = sys.argv[1]
hyp_file = sys.argv[2]
wer_standardize = tr.Compose(
[
tr.SubstituteRegexes({r"<<unk>>": r"@"}),
tr.ToLowerCase(),
tr.RemovePunctuation(),
tr.Strip(),
tr.ExpandCommonEnglishContractions(),
tr.RemoveKaldiNonWords(),
tr.RemoveWhiteSpace(replace_by_space=True),
tr.ReduceToListOfListOfWords(),
]
)
cer_standardize = tr.Compose(
[
tr.SubstituteRegexes({r"<<unk>>": r"@"}),
tr.ToLowerCase(),
tr.RemovePunctuation(),
tr.Strip(),
tr.ReduceToListOfListOfChars(),
]
)
ref_lines = open(ref_file, "r").readlines()
hyp_lines = open(hyp_file, "r").readlines()
wer = jiwer.wer(ref_lines, hyp_lines,
truth_transform=wer_standardize,
hypothesis_transform=wer_standardize,
)
cer = jiwer.cer(ref_lines, hyp_lines,
truth_transform=cer_standardize,
hypothesis_transform=cer_standardize,
)
print("WER: %.4f" % wer)
print("CER: %.4f" % cer)
#!/usr/bin/env bash
set -e
infer_dir=$1
tag=$2
s2s_infer_file=${infer_dir}/$3
org_ctc_infer_file=${infer_dir}/$4
ref=$5
idx=${infer_dir}/${tag}_idx
ctc_infer=${infer_dir}/${tag}_ctc_infer
ctc_infer_sort=${infer_dir}/${tag}_ctc_infer_sort
cut -f1 ${s2s_infer_file} > ${idx}
paste ${idx} ${org_ctc_infer_file} > ${ctc_infer}
sort -n -t $'\t' ${ctc_infer} | cut -f2 > ${ctc_infer_sort}
python3 ./cal_wer.py ${ref} ${ctc_infer_sort}
\ No newline at end of file
import sys
import csv
tsv_file = sys.argv[1]
out_file = sys.argv[2]
extract_item = sys.argv[3]
with open(tsv_file) as f:
reader = csv.DictReader(
f,
delimiter="\t",
quotechar=None,
doublequote=False,
lineterminator="\n",
quoting=csv.QUOTE_NONE,
)
samples = [dict(e) for e in reader]
fw = open(out_file, "w", encoding="utf-8")
for s in samples:
if extract_item in s:
fw.write("%s\n" % s[extract_item])
else:
print("Error in sample: ")
print(s)
exit(1)
#!/usr/bin/env bash
gpu_num=4
cmd="sh train.sh"
while :
do
device=()
record=$(mktemp -t temp.record.XXXXXX)
gpustat > $record
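# gpustat prints a header line followed by one line per GPU; the third
# '|'-separated field is "used / total" memory, so a device counts as idle
# here when its used memory is below 100 MB.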
all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
count=0
for dev in ${all_devices[@]}
do
line=$((dev + 2))
use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
if [[ $use -lt 100 ]]; then
device[$count]=$dev
count=$((count + 1))
if [[ $count -eq $gpu_num ]]; then
break
fi
fi
done
if [[ ${#device[@]} -lt $gpu_num ]]; then
sleep 60s
else
echo "Run $cmd"
eval $cmd
sleep 10s
exit
fi
done
#!/usr/bin/env bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey);
# Arnab Ghoshal, Karel Vesely
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Parse command-line options.
# To be sourced by another script (as in ". parse_options.sh").
# Option format is: --option-name arg
# and shell variable "option_name" gets set to value "arg."
# The exception is --help, which takes no arguments, but prints the
# $help_message variable (if defined).
###
### The --config file options have lower priority to command line
### options, so we need to import them first...
###
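### Example (hypothetical caller): define defaults first, then source this
### file so the command line can override them:
###   gpu_num=1
###   . ./local/parse_options.sh
###   # "./run.sh --gpu_num 4" now sets gpu_num=4 before the rest runs.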
# Now import all the configs specified by command-line, in left-to-right order
for ((argpos=1; argpos<$#; argpos++)); do
if [ "${!argpos}" == "--config" ]; then
argpos_plus1=$((argpos+1))
config=${!argpos_plus1}
[ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
. $config # source the config file.
fi
done
###
### Now we process the command line options
###
i=1
argv="$@"
while true; do
key=${!i}
j=$(($i + 1))
value=${!j}
[ -z "${!i:-}" ] && break; # break if there are no arguments
case "${key}" in
# If the enclosing script is called with --help option, print the help
# message and exit. Scripts should put help messages in $help_message
--help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
else printf "$help_message\n" 1>&2 ; fi;
exit 0 ;;
--*=*) echo "$0: options to scripts must be of the form --name value, got '${key}'"
exit 1 ;;
# If the first command-line argument begins with "--" (e.g. --foo-bar),
# then work out the variable name as $name, which will equal "foo_bar".
--*) name=`echo "${key}" | sed s/^--// | sed s/-/_/g`;
# Next we test whether the variable in question is undefined-- if so it's
# an invalid option and we die. Note: $0 evaluates to the name of the
# enclosing script.
# The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
# is undefined. We then have to wrap this test inside "eval" because
# foo_bar is itself inside a variable ($name).
#eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
oldval="`eval echo \\$$name`";
# Work out whether we seem to be expecting a Boolean argument.
if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
was_bool=true;
else
was_bool=false;
fi
# Set the variable to the right value-- the escaped quotes make it work if
# the option had spaces, like --cmd "queue.pl -sync y"
# echo $name
eval $name=\"${value}\";
# Check that Boolean-valued arguments are really Boolean.
if $was_bool && [[ "${value}" != "true" && "${value}" != "false" ]]; then
echo "$0: expected \"true\" or \"false\": ${key} ${value}" 1>&2
exit 1;
fi
# shift 2;
i=$(($i + 2))
;;
*) break;
esac
done
# Check for an empty argument to the --cmd option, which can easily occur as a
# result of scripting errors.
[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
true; # so this script returns exit code 0.
get_devices(){
gpu_num=$1
use_cpu=$2
device=()
while :
do
device=()
record=$(mktemp -t temp.record.XXXXXX)
gpustat > $record
all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
count=0
for dev in ${all_devices[@]}
do
line=$((dev + 2))
use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
if [[ $use -lt 1000 ]]; then
device[$count]=$dev
count=$((count + 1))
if [[ $count -eq $gpu_num ]]; then
break
fi
fi
done
if [[ ${#device[@]} -lt $gpu_num ]]; then
if [[ $use_cpu -eq 1 ]]; then
device=(-1)
else
sleep 60s
fi
else
break
fi
done
echo ${device[*]} | sed 's/ /,/g'
return $?
}
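# Example usage (as in the recipes above):
#   source ./local/utils.sh
#   device=$(get_devices 2 0)    # blocks until two idle GPUs are available
#   export CUDA_VISIBLE_DEVICES=${device}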
#!/usr/bin/env bash
# Processing CoVoST Datasets
# Copyright 2021 Chen Xu (xuchennlp@outlook.com)
# Set bash to 'debug' mode; it will exit on:
# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
set -e
#set -u
set -o pipefail
export PYTHONIOENCODING=UTF-8
eval=1
time=$(date "+%m%d_%H%M")
stage=1
stop_stage=2
######## Hardware ########
# Devices
device=(0)
gpu_num=8
update_freq=1
pwd_dir=$PWD
root_dir=${ST_ROOT}
data_root_dir=${root_dir}
code_dir=${root_dir}/S2T
# Dataset
src_lang=multi
tgt_lang=multi
lang=${src_lang}-${tgt_lang}
dataset=covost
data_tag=st
task=speech_to_text
vocab_type=unigram
asr_vocab_size=5000
vocab_size=32000
share_dict=1
speed_perturb=0
lcrm=0
tokenizer=0
use_raw_audio=0
. ./local/parse_options.sh || exit 1;
use_specific_dict=0
specific_prefix=valid
specific_dir=${data_root_dir}/data/${dataset}/${lang}/st
asr_vocab_prefix=spm_unigram10000_st_share
st_vocab_prefix=spm_unigram10000_st_share
data_model_subfix=${dataset}/${lang}/${data_tag}
org_data_dir=${data_root_dir}/data/${dataset}
data_dir=${data_root_dir}/data/${data_model_subfix}
lang_list=$(find "$org_data_dir" -maxdepth 1 -type d -name "[a-z][a-z]-[a-z][a-z]" -printf '%P,')
languages=$(echo "$lang_list" | sed 's/,$//')
train_split=train
valid_split=dev
test_split=test
test_list=($(find "$org_data_dir" -maxdepth 1 -type d -name "[a-z][a-z]-[a-z][a-z]" -exec basename {} \; | awk -v prefix="test_" '{print prefix $0}'))
IFS=','
test_subset="${test_list[*]}"
IFS=$' \t\n'
# Exp
sub_tag=
exp_prefix=$(date "+%m%d")
extra_tag=
extra_parameter=
exp_tag=baseline
exp_name=
# Training Settings
train_config=base,ctc
fp16=1
max_tokens=20000
step_valid=0
bleu_valid=0
# Decoding Settings
batch_size=0
sacrebleu=1
dec_model=checkpoint_best.pt
ctc_infer=0
infer_ctc_weight=0
ctc_self_ensemble=0
ctc_inter_logit=0
n_average=10
beam_size=5
len_penalty=1.0
epoch_ensemble=0
best_ensemble=1
infer_debug=0
infer_score=0
infer_tag=
infer_parameter=
#infer_parameters="--cal-monotonic-cross-attn-weights --cal-localness --localness-window 0.1 --cal-topk-cross-attn-weights --topk-cross-attn-weights 15 --cal-entropy"
# Parsing Options
if [[ ${share_dict} -eq 1 ]]; then
data_config=config_share.yaml
else
data_config=config.yaml
fi
if [[ ${speed_perturb} -eq 1 ]]; then
data_dir=${data_dir}_sp
exp_prefix=${exp_prefix}_sp
fi
if [[ ${lcrm} -eq 1 ]]; then
data_dir=${data_dir}_lcrm
exp_prefix=${exp_prefix}_lcrm
fi
if [[ ${use_specific_dict} -eq 1 ]]; then
data_dir=${data_dir}_${specific_prefix}
exp_prefix=${exp_prefix}_${specific_prefix}
fi
if [[ ${tokenizer} -eq 1 ]]; then
data_dir=${data_dir}_tok
exp_prefix=${exp_prefix}_tok
fi
if [[ ${use_raw_audio} -eq 1 ]]; then
data_dir=${data_dir}_raw
exp_prefix=${exp_prefix}_raw
fi
export PATH=$PATH:${code_dir}/scripts
. ./local/parse_options.sh || exit 1;
if [[ -z ${exp_name} ]]; then
config_string=${train_config//,/_}
exp_name=${exp_prefix}_${config_string}_${exp_tag}
if [[ -n ${extra_tag} ]]; then
exp_name=${exp_name}_${extra_tag}
fi
if [[ -n ${exp_subfix} ]]; then
exp_name=${exp_name}_${exp_subfix}
fi
fi
ckpt_dir=${root_dir}/checkpoints/
model_dir=${root_dir}/checkpoints/${data_model_subfix}/${sub_tag}/${exp_name}
# Start
cd ${code_dir}
echo "Start Stage: $stage"
echo "Stop Stage: $stop_stage"
if [[ `pip list | grep fairseq | wc -l` -eq 0 ]]; then
echo "Default Stage: env configure"
pip3 install -e ${code_dir}
fi
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "Stage -1: Data Download"
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
### Task dependent. You have to prepare the data in the following part by yourself.
echo "Stage 0: ASR Data Preparation"
if [[ ! -e ${data_dir} ]]; then
mkdir -p ${data_dir}
fi
# create ASR vocabulary if necessary
cmd="python3 ${code_dir}/examples/speech_to_text/prep_audio_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}/asr4st
--task asr
--raw
--src-lang ${src_lang}
--splits ${valid_split},${test_split},${train_split}
--vocab-type ${vocab_type}
--vocab-size ${asr_vocab_size}"
if [[ ${lcrm} -eq 1 ]]; then
cmd="$cmd
--lowercase-src
--rm-punc-src"
fi
if [[ ${tokenizer} -eq 1 ]]; then
cmd="$cmd
--tokenizer"
fi
if [[ $eval -eq 1 && ${share_dict} -ne 1 && ${use_specific_dict} -ne 1 ]]; then
echo -e "\033[34mRun command: \n${cmd} \033[0m"
mkdir -p ${data_dir}/asr4st
eval $cmd
asr_prefix=spm_${vocab_type}${asr_vocab_size}_asr
cp -f ${data_dir}/asr4st/${asr_prefix}* ${data_dir}
fi
echo "Stage 0: ST Data Preparation"
cmd="python3 ${code_dir}/examples/speech_to_text/prep_audio_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}
--task st
--joint
--add-src
--languages ${languages}
--splits ${valid_split},${test_split},${train_split}
--cmvn-type utterance
--vocab-type ${vocab_type}
--vocab-size ${vocab_size}"
if [[ ${use_raw_audio} -eq 1 ]]; then
cmd="$cmd
--raw"
fi
if [[ ${use_specific_dict} -eq 1 ]]; then
cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}
cp -r ${specific_dir}/${st_vocab_prefix}.* ${data_dir}
if [[ $share_dict -eq 1 ]]; then
cmd="$cmd
--share
--st-spm-prefix ${st_vocab_prefix}"
else
cmd="$cmd
--st-spm-prefix ${st_vocab_prefix}
--asr-prefix ${asr_vocab_prefix}"
fi
else
if [[ $share_dict -eq 1 ]]; then
cmd="$cmd
--share"
else
cmd="$cmd
--asr-prefix ${asr_prefix}"
fi
fi
if [[ ${speed_perturb} -eq 1 ]]; then
cmd="$cmd
--speed-perturb"
fi
if [[ ${lcrm} -eq 1 ]]; then
cmd="$cmd
--lowercase-src
--rm-punc-src"
fi
if [[ ${tokenizer} -eq 1 ]]; then
cmd="$cmd
--tokenizer"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "Stage 1: Network Training"
[[ ! -d ${data_dir} ]] && echo "The data dir ${data_dir} does not exist!" && exit 1;
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then
device=""
else
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
export CUDA_VISIBLE_DEVICES=${device}
fi
echo -e "data=${data_dir} model=${model_dir}"
if [[ ! -d ${model_dir} ]]; then
mkdir -p ${model_dir}
else
echo "${model_dir} exists."
fi
cp -f ${pwd_dir}/`basename ${BASH_SOURCE[0]}` ${model_dir}
cp -f ${pwd_dir}/train.sh ${model_dir}
train_config=basis,${train_config}
config_list="${train_config//,/ }"
idx=0
for config in ${config_list[@]}
do
config_path=${pwd_dir}/conf/${config}.yaml
if [[ ! -f ${config_path} ]]; then
echo "No config file ${config_path}"
exit
fi
cp -f ${config_path} ${model_dir}
if [[ $idx -eq 0 ]]; then
extra_parameter="${extra_parameter}
--train-config ${config_path}"
else
extra_parameter="${extra_parameter}
--train-config${idx} ${config_path}"
fi
idx=$((idx + 1))
done
cmd="python3 -u ${code_dir}/fairseq_cli/train.py
${data_dir}
--source-lang ${src_lang}
--target-lang ${tgt_lang}
--config-yaml ${data_config}
--task ${task}
--max-tokens ${max_tokens}
--skip-invalid-size-inputs-valid-test
--update-freq ${update_freq}
--log-interval 100
--save-dir ${model_dir}
--tensorboard-logdir ${model_dir}"
if [[ -n ${extra_parameter} ]]; then
cmd="${cmd}
${extra_parameter}"
fi
if [[ ${gpu_num} -gt 0 ]]; then
cmd="${cmd}
--distributed-world-size $gpu_num
--ddp-backend no_c10d"
fi
if [[ $fp16 -eq 1 ]]; then
cmd="${cmd}
--fp16"
fi
if [[ $step_valid -eq 1 ]]; then
validate_interval=1
save_interval=1
no_epoch_checkpoints=0
save_interval_updates=500
keep_interval_updates=10
fi
if [[ $bleu_valid -eq 1 ]]; then
cmd="$cmd
--eval-bleu
--eval-bleu-args '{\"beam\": 1}'
--eval-tokenized-bleu
--eval-bleu-remove-bpe
--best-checkpoint-metric bleu
--maximize-best-checkpoint-metric"
fi
if [[ -n $no_epoch_checkpoints && $no_epoch_checkpoints -eq 1 ]]; then
cmd="$cmd
--no-epoch-checkpoints"
fi
if [[ -n $validate_interval ]]; then
cmd="${cmd}
--validate-interval $validate_interval "
fi
if [[ -n $save_interval ]]; then
cmd="${cmd}
--save-interval $save_interval "
fi
if [[ -n $save_interval_updates ]]; then
cmd="${cmd}
--save-interval-updates $save_interval_updates"
if [[ -n $keep_interval_updates ]]; then
cmd="${cmd}
--keep-interval-updates $keep_interval_updates"
fi
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
# save info
log=${ckpt_dir}/history.log
echo "${time} | ${data_dir} | ${exp_name} | ${model_dir} " >> $log
tail -n 50 ${log} > tmp.log
mv tmp.log $log
log=${model_dir}/train.log
cmd="${cmd} 2>&1 | tee -a ${log}"
#cmd="${cmd} >> ${log} 2>&1 "
if [[ $eval -eq 1 ]]; then
# tensorboard
port=6666
tensorboard --logdir ${model_dir} --port ${port} --bind_all &
echo "${cmd}" > ${model_dir}/cmd
eval $cmd
#sleep 2s
#tail -n "$(wc -l ${log} | awk '{print $1+1}')" -f ${log}
fi
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "Stage 2: Decoding"
dec_models=
if [[ ${n_average} -eq 1 ]]; then
dec_models=${dec_model}
fi
if [[ ${n_average} -ne 1 ]]; then
# Average models
if [[ ${epoch_ensemble} -eq 1 ]]; then
avg_model=avg_epoch${n_average}_checkpoint.pt
if [[ ! -f ${model_dir}/${avg_model} ]]; then
cmd="python3 ${code_dir}/scripts/average_checkpoints.py
--inputs ${model_dir}
--num-epoch-checkpoints ${n_average}
--output ${model_dir}/${avg_model}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval $cmd
fi
dec_models+=(${avg_model})
fi
if [[ ${best_ensemble} -eq 1 ]]; then
avg_model=avg_best${n_average}_checkpoint.pt
if [[ ! -f ${model_dir}/${avg_model} ]]; then
cmd="python3 ${code_dir}/scripts/average_checkpoints.py
--inputs ${model_dir}
--num-best-checkpoints ${n_average}
--output ${model_dir}/${avg_model}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval $cmd
fi
dec_models+=(${avg_model})
fi
fi
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then
device=""
else
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
export CUDA_VISIBLE_DEVICES=${device}
fi
for dec_model in ${dec_models[@]}; do
suffix=alpha${len_penalty}
model_str=`echo $dec_model | sed -e "s#checkpoint##" | sed "s#.pt##"`
suffix=${suffix}_${model_str}
if [[ ${sacrebleu} -eq 1 ]]; then
suffix=${suffix}_sacrebleu
else
suffix=${suffix}_multibleu
fi
suffix=${suffix}_beam${beam_size}
if [[ ${batch_size} -ne 0 ]]; then
suffix=${suffix}_batch${batch_size}
else
suffix=${suffix}_tokens${max_tokens}
fi
if [[ ${ctc_infer} -eq 1 ]]; then
suffix=${suffix}_ctc
fi
if [[ ${ctc_self_ensemble} -eq 1 ]]; then
suffix=${suffix}_ensemble
fi
if [[ ${ctc_inter_logit} -ne 0 ]]; then
suffix=${suffix}_logit${ctc_inter_logit}
fi
if (( $(echo "${infer_ctc_weight} > 0" | bc -l) )); then
suffix=${suffix}_ctc${infer_ctc_weight}
fi
if [[ ${infer_score} -eq 1 ]]; then
suffix=${suffix}_score
fi
if [[ -n ${infer_tag} ]]; then
suffix=${suffix}_${infer_tag}
fi
suffix=`echo $suffix | sed -e "s#__#_#"`
result_file=${model_dir}/decode_result_${suffix}
[[ -f ${result_file} ]] && rm ${result_file}
test_subset=${test_subset//,/ }
for subset in ${test_subset[@]}; do
subset=${subset}
if [[ ${infer_debug} -ne 0 ]]; then
cmd="python3 -m debugpy --listen 0.0.0.0:5678 --wait-for-client"
else
cmd="python3 "
fi
cmd="$cmd ${code_dir}/fairseq_cli/generate.py
${data_dir}
--config-yaml ${data_config}
--gen-subset ${subset}
--task speech_to_text
--path ${model_dir}/${dec_model}
--results-path ${model_dir}
--batch-size ${batch_size}
--max-tokens ${max_tokens}
--beam ${beam_size}
--prefix-size 1
--skip-invalid-size-inputs-valid-test
--infer-ctc-weight ${infer_ctc_weight}
--lenpen ${len_penalty}"
if [[ ${ctc_infer} -eq 1 ]]; then
cmd="${cmd}
--ctc-infer"
fi
if [[ ${sacrebleu} -eq 1 ]]; then
cmd="${cmd}
--scoring sacrebleu"
if [[ "${tgt_lang}" = "ja" ]]; then
cmd="${cmd}
--sacrebleu-tokenizer ja-mecab"
elif [[ "${tgt_lang}" == "zh" ]]; then
cmd="${cmd}
--sacrebleu-tokenizer zh"
fi
if [[ ${tokenizer} -eq 1 ]]; then
cmd="${cmd}
--tokenizer moses
--source-lang ${src_lang}
--target-lang ${tgt_lang}"
fi
fi
if [[ ${ctc_self_ensemble} -eq 1 ]]; then
cmd="${cmd}
--ctc-self-ensemble"
fi
if [[ ${ctc_inter_logit} -ne 0 ]]; then
cmd="${cmd}
--ctc-inter-logit ${ctc_inter_logit}"
fi
if [[ ${infer_score} -eq 1 ]]; then
cmd="${cmd}
--score-reference"
fi
if [[ -n ${infer_parameter} ]]; then
cmd="${cmd}
${infer_parameter}"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
cd ${code_dir}
if [[ $eval -eq 1 ]]; then
ctc_file=translation-${subset}.ctc
if [[ ${ctc_infer} -eq 1 && -f ${model_dir}/${ctc_file} ]]; then
rm ${model_dir}/${ctc_file}
fi
xctc_file=translation-${subset}.xctc
if [[ ${ctc_infer} -eq 1 && -f ${model_dir}/${xctc_file} ]]; then
rm ${model_dir}/${xctc_file}
fi
eval $cmd
echo "" >> ${result_file}
tail -n 2 ${model_dir}/generate-${subset}.txt >> ${result_file}
mv ${model_dir}/generate-${subset}.txt ${model_dir}/generate-${subset}-${suffix}.txt
mv ${model_dir}/translation-${subset}.txt ${model_dir}/translation-${subset}-${suffix}.txt
cd ${pwd_dir}
if [[ -f ${model_dir}/enc_dump ]]; then
mv ${model_dir}/enc_dump ${model_dir}/dump-${subset}-enc-${suffix}
fi
if [[ -f ${model_dir}/dec_dump ]]; then
mv ${model_dir}/dec_dump ${model_dir}/dump-${subset}-dec-${suffix}
fi
trans_file=translation-${subset}-${suffix}.txt
if [[ ${ctc_infer} -eq 1 && -f ${model_dir}/${ctc_file} ]]; then
ref_file=${model_dir}/${subset}.${src_lang}
if [[ ! -f ${ref_file} ]]; then
python3 ./local/extract_txt_from_tsv.py ${data_dir}/${subset}.tsv ${ref_file} "src_text"
fi
if [[ -f ${ref_file} ]]; then
ctc=$(mktemp -t temp.record.XXXXXX)
cd ./local
./cal_wer.sh ${model_dir} ${subset} ${trans_file} ${ctc_file} ${ref_file} > ${ctc}
cd ..
echo "CTC WER" >> ${result_file}
tail -n 2 ${ctc} >> ${result_file}
src_bleu=$(mktemp -t temp.record.XXXXXX)
cd local
./cal_ctc_bleu.sh ${model_dir} ${subset} ${trans_file} ${ctc_file} ${ref_file} ${tokenizer} ${src_lang} > ${src_bleu}
cd ..
cat ${src_bleu} >> ${result_file}
rm ${ctc} ${src_bleu}
else
echo "No reference for source language."
fi
fi
xctc_file=translation-${subset}.xctc
if [[ ${ctc_infer} -eq 1 && -f ${model_dir}/${xctc_file} ]]; then
ref_file=${model_dir}/${subset}.${tgt_lang}
if [[ ! -f ${ref_file} ]]; then
python3 ./local/extract_txt_from_tsv.py ${data_dir}/${subset}.tsv ${ref_file} "tgt_text"
fi
if [[ -f ${ref_file} ]]; then
xctc=$(mktemp -t temp.record.XXXXXX)
cd local
./cal_wer.sh ${model_dir} ${subset} ${trans_file} ${xctc_file} ${ref_file} > ${xctc}
cd ..
echo "XCTC WER" >> ${result_file}
tail -n 2 ${xctc} >> ${result_file}
tgt_bleu=$(mktemp -t temp.record.XXXXXX)
cd local
./cal_ctc_bleu.sh ${model_dir} ${subset} ${trans_file} ${xctc_file} ${ref_file} ${tokenizer} ${tgt_lang} > ${tgt_bleu}
cd ..
cat ${tgt_bleu} >> ${result_file}
rm ${xctc} ${tgt_bleu}
else
echo "No reference for target language."
fi
fi
fi
done
echo
echo "" >> ${result_file}
cat ${result_file}
done
fi
#!/usr/bin/env bash
# training the model
gpu_num=8
update_freq=1
max_tokens=40000
extra_tag=
extra_parameter=
#extra_tag="${extra_tag}"
#extra_parameter="${extra_parameter} "
exp_tag=
# Base
#config_list=(base conformer ctc)
# SATE
config_list=(sate conformer ctc)
# PDS
#config_list=(pds_base_8 conformer ctc)
# exp full name
exp_name=
train_config=$(echo ${config_list[*]} | sed 's/ /,/g')
cmd="./run.sh
--stage 1
--stop_stage 2
--gpu_num ${gpu_num}
--update_freq ${update_freq}
--train_config ${train_config}
--max_tokens ${max_tokens}
"
if [[ -n ${exp_name} ]]; then
cmd="$cmd --exp_name ${exp_name}"
fi
if [[ -n ${exp_tag} ]]; then
cmd="$cmd --exp_tag ${exp_tag}"
fi
if [[ -n ${extra_tag} ]]; then
cmd="$cmd --extra_tag ${extra_tag}"
fi
if [[ -n ${extra_parameter} ]]; then
cmd="$cmd --extra_parameter \"${extra_parameter}\""
fi
echo ${cmd}
eval ${cmd}
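# Example: with the SATE config list above, the command expands to
#   ./run.sh --stage 1 --stop_stage 2 --gpu_num 8 --update_freq 1 --train_config sate,conformer,ctc --max_tokens 40000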
@@ -2,8 +2,7 @@
# Processing MuST-C Datasets
-# Copyright 2021 Natural Language Processing Laboratory
-# Xu Chen (xuchenneu@163.com)
+# Copyright 2021 Chen Xu (xuchennlp@outlook.com)
# Set bash to 'debug' mode, it will exit on :
# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
@@ -16,626 +15,21 @@ eval=1
time=$(date "+%m%d_%H%M")
stage=1
-stop_stage=4
+stop_stage=2
-######## hardware ########
+######## Hardware ########
-# devices
+# Devices
device=(0)
gpu_num=8
update_freq=1
hdfs_get=0
root_dir=/opt/tiger
data_root_dir=/mnt/bn/nas-xc-1
code_dir=${root_dir}/s2t
pwd_dir=$PWD
# dataset
src_lang=en
tgt_lang=de
dataset=covost
data_tag=st
task=speech_to_text
vocab_type=unigram
asr_vocab_size=5000
vocab_size=10000
share_dict=1
speed_perturb=0
lcrm=0
tokenizer=0
use_raw_audio=0
. ./local/parse_options.sh || exit 1;
lang=${src_lang}-${tgt_lang}
use_specific_dict=0
specific_prefix=valid
specific_dir=${data_root_dir}/data/${dataset}/${lang}/st
asr_vocab_prefix=spm_unigram10000_st_share
st_vocab_prefix=spm_unigram10000_st_share
data_model_subfix=${dataset}/${lang}/${data_tag}
org_data_dir=${data_root_dir}/data/${dataset}
data_dir=${data_root_dir}/data/${data_model_subfix}
train_split=train
valid_split=dev
test_split=test
test_subset=dev,test
# exp
sub_tag=
exp_prefix=$(date "+%m%d")
# exp_subfix=${ARNOLD_JOB_ID}_${ARNOLD_TASK_ID}_${ARNOLD_TRIAL_ID}
extra_tag=
extra_parameter=
exp_tag=baseline
exp_name=
# config
train_config=sate,ctc
# training setting
fp16=1
max_tokens=40000
step_valid=0
bleu_valid=0
# decoding setting
sacrebleu=1
dec_model=checkpoint_best.pt
ctc_infer=0
n_average=10
beam_size=5
len_penalty=1.0
infer_score=0
infer_parameters=
if [[ ${share_dict} -eq 1 ]]; then
data_config=config_st.yaml
else
data_config=config.yaml
fi
if [[ ${speed_perturb} -eq 1 ]]; then
data_dir=${data_dir}_sp
exp_prefix=${exp_prefix}_sp
fi
if [[ ${lcrm} -eq 1 ]]; then
data_dir=${data_dir}_lcrm
exp_prefix=${exp_prefix}_lcrm
fi
if [[ ${use_specific_dict} -eq 1 ]]; then
data_dir=${data_dir}_${specific_prefix}
exp_prefix=${exp_prefix}_${specific_prefix}
fi
if [[ ${tokenizer} -eq 1 ]]; then
data_dir=${data_dir}_tok
exp_prefix=${exp_prefix}_tok
fi
if [[ ${use_raw_audio} -eq 1 ]]; then
data_dir=${data_dir}_raw
exp_prefix=${exp_prefix}_raw
fi
if [[ ! -d /mnt/bd/data-model && -d /mnt/bd/data-model2 ]]; then
sudo ln -s /mnt/bd/data-model2/ /mnt/bd/data-model
fi
# setup nccl envs
export NCCL_IB_DISABLE=0
export NCCL_IB_HCA=$ARNOLD_RDMA_DEVICE:1
export NCCL_IB_GID_INDEX=3
export NCCL_SOCKET_IFNAME=eth0
HOSTS=$ARNOLD_WORKER_HOSTS
HOST=(${HOSTS//,/ })
HOST_SPLIT=(${HOST//:/ })
PORT=${HOST_SPLIT[1]}
INIT_METHOD="tcp://${ARNOLD_WORKER_0_HOST}:${ARNOLD_WORKER_0_PORT}"
DIST_RANK=$((ARNOLD_ID * ARNOLD_WORKER_GPU))
export PATH=$PATH:${code_dir}/scripts
. ./local/parse_options.sh || exit 1;
if [[ -z ${exp_name} ]]; then
config_string=${train_config//,/_}
exp_name=${exp_prefix}_${config_string}_${exp_tag}
if [[ -n ${extra_tag} ]]; then
exp_name=${exp_name}_${extra_tag}
fi
if [[ -n ${exp_subfix} ]]; then
exp_name=${exp_name}_${exp_subfix}
fi
fi
model_dir=${code_dir}/checkpoints/${data_model_subfix}/${sub_tag}/${exp_name}
echo "stage: $stage"
echo "stop_stage: $stop_stage"
cd ${code_dir}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download"
# pass
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
### Task dependent. You have to make data the following preparation part by yourself.
### But you can utilize Kaldi recipes in most cases
echo "stage 0: ASR Data Preparation"
if [[ ! -e ${data_dir} ]]; then
mkdir -p ${data_dir}
fi
# create ASR vocabulary if necessary
cmd="python3 ${code_dir}/examples/speech_to_text/prep_audio_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}/asr4st
--task asr
--raw
--src-lang ${src_lang}
--splits ${valid_split},${test_split},${train_split}
--vocab-type ${vocab_type}
--vocab-size ${asr_vocab_size}"
if [[ ${lcrm} -eq 1 ]]; then
cmd="$cmd
--lowercase-src
--rm-punc-src"
fi
if [[ ${tokenizer} -eq 1 ]]; then
cmd="$cmd
--tokenizer"
fi
if [[ $eval -eq 1 && ${share_dict} -ne 1 && ${use_specific_dict} -ne 1 ]]; then
echo -e "\033[34mRun command: \n${cmd} \033[0m"
mkdir -p ${data_dir}/asr4st
eval $cmd
asr_prefix=spm_${vocab_type}${asr_vocab_size}_asr
cp -f ${data_dir}/asr4st/${asr_prefix}* ${data_dir}
fi
echo "stage 0: ST Data Preparation"
cmd="python3 ${code_dir}/examples/speech_to_text/prep_covost_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}
--task st
--add-src
--src-lang ${src_lang}
--tgt-lang ${tgt_lang}
--cmvn-type utterance
--vocab-type ${vocab_type}
--vocab-size ${vocab_size}"
if [[ ${use_raw_audio} -eq 1 ]]; then
cmd="$cmd
--raw"
fi
if [[ ${use_specific_dict} -eq 1 ]]; then
cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}
cp -r ${specific_dir}/${st_vocab_prefix}.* ${data_dir}
if [[ $share_dict -eq 1 ]]; then
cmd="$cmd
--share
--st-spm-prefix ${st_vocab_prefix}"
else
cmd="$cmd
--st-spm-prefix ${st_vocab_prefix}
--asr-prefix ${asr_vocab_prefix}"
fi
else
if [[ $share_dict -eq 1 ]]; then
cmd="$cmd
--share"
else
cmd="$cmd
--asr-prefix ${asr_prefix}"
fi
fi
if [[ ${speed_perturb} -eq 1 ]]; then
cmd="$cmd
--speed-perturb"
fi
if [[ ${lcrm} -eq 1 ]]; then
cmd="$cmd
--lowercase-src
--rm-punc-src"
fi
if [[ ${tokenizer} -eq 1 ]]; then
cmd="$cmd
--tokenizer"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
fi
if [[ ! -d ${data_dir} ]]; then
echo "No feature dir ${data_dir}"
exit
fi
if [[ `pip list | grep fairseq | wc -l` -eq 0 ]]; then
echo "default stage: env configure"
pip3 install -e ${code_dir} -i https://bytedpypi.byted.org/simple --no-build-isolation --default-timeout=10000
fi
if [[ -d /mnt/bn/nas-xc-1/checkpoints && ! -d ${code_dir}/checkpoints ]]; then
ln -s /mnt/bn/nas-xc-1/checkpoints ${code_dir}
fi
# if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
if [ ${hdfs_get} -eq 1 ]; then
ln_data_dir=`echo ${data_dir} | sed -e "s#${data_root_dir}#${code_dir}#"`
echo ${ln_data_dir}
mkdir -p ${ln_data_dir}
ln -s ${data_dir}/../* ${ln_data_dir}
rm -r ${ln_data_dir}
hdfs_path=`echo ${data_dir} | sed -e "s#${data_root_dir}#hdfs://haruna/home/byte_arnold_lq_mlnlc/user/xuchen/#"`
hdfs dfs -get ${hdfs_path} ${ln_data_dir}
sed -i -e "s#${data_root_dir}#${code_dir}#" ${ln_data_dir}/config*
data_dir=${ln_data_dir}
fi
# fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "stage 2: ST Network Training"
[[ ! -d ${data_dir} ]] && echo "The data dir ${data_dir} is not existing!" && exit 1;
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then
device=""
else
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
fi
echo -e "data=${data_dir} model=${model_dir}"
if [[ ! -d ${model_dir} ]]; then
mkdir -p ${model_dir}
else
echo "${model_dir} exists."
fi
cp -f ${pwd_dir}/`basename ${BASH_SOURCE[0]}` ${model_dir}
cp -f ${pwd_dir}/train.sh ${model_dir}
extra_parameter="${extra_parameter}
--train-config ${pwd_dir}/conf/basis.yaml"
cp -f ${pwd_dir}/conf/basis.yaml ${model_dir}
config_list="${train_config//,/ }"
idx=1
for config in ${config_list[@]}
do
config_path=${pwd_dir}/conf/${config}.yaml
if [[ ! -f ${config_path} ]]; then
echo "No config file ${config_path}"
exit
fi
cp -f ${config_path} ${model_dir}
extra_parameter="${extra_parameter}
--train-config${idx} ${config_path}"
idx=$((idx + 1))
done
cmd="python3 -u ${code_dir}/fairseq_cli/train.py
${data_dir}
--source-lang ${src_lang}
--target-lang ${tgt_lang}
--config-yaml ${data_config}
--task ${task}
--max-tokens ${max_tokens}
--skip-invalid-size-inputs-valid-test
--update-freq ${update_freq}
--log-interval 100
--save-dir ${model_dir}
--tensorboard-logdir ${model_dir}"
if [[ -n ${extra_parameter} ]]; then
cmd="${cmd}
${extra_parameter}"
fi
if [[ ${gpu_num} -gt 0 ]]; then
cmd="${cmd}
--distributed-world-size $gpu_num
--ddp-backend no_c10d"
if [[ ${DIST_RANK} -ne 0 ]]; then
cmd="${cmd}
--distributed-init-method ${INIT_METHOD}
--distributed-rank ${DIST_RANK}"
fi
fi
if [[ $fp16 -eq 1 ]]; then
cmd="${cmd}
--fp16"
fi
if [[ $step_valid -eq 1 ]]; then
validate_interval=1
save_interval=1
no_epoch_checkpoints=0
save_interval_updates=500
keep_interval_updates=10
fi
if [[ $bleu_valid -eq 1 ]]; then
cmd="$cmd
--eval-bleu
--eval-bleu-args '{\"beam\": 1}'
--eval-tokenized-bleu
--eval-bleu-remove-bpe
--best-checkpoint-metric bleu
--maximize-best-checkpoint-metric"
fi
if [[ -n $no_epoch_checkpoints && $no_epoch_checkpoints -eq 1 ]]; then
cmd="$cmd
--no-epoch-checkpoints"
fi
if [[ -n $validate_interval ]]; then
cmd="${cmd}
--validate-interval $validate_interval "
fi
if [[ -n $save_interval ]]; then
cmd="${cmd}
--save-interval $save_interval "
fi
if [[ -n $save_interval_updates ]]; then
cmd="${cmd}
--save-interval-updates $save_interval_updates"
if [[ -n $keep_interval_updates ]]; then
cmd="${cmd}
--keep-interval-updates $keep_interval_updates"
fi
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
# save info
log=${model_dir}/history.log
echo "${time} | ${data_dir} | ${exp_name} | ${model_dir} " >> $log
tail -n 50 ${log} > tmp.log
mv tmp.log $log
# export CUDA_VISIBLE_DEVICES=${device}
log=${model_dir}/train.log
cmd="${cmd} 2>&1 | tee -a ${log}"
#cmd="nohup ${cmd} >> ${log} 2>&1 &"
if [[ $eval -eq 1 ]]; then
# tensorboard
if [[ -z ${ARNOLD_TENSORBOARD_CURRENT_PORT} ]]; then
port=6666
else
port=${ARNOLD_TENSORBOARD_CURRENT_PORT}
fi
tensorboard --logdir ${model_dir} --port ${port} --bind_all &
echo "${cmd}" > ${model_dir}/cmd
eval $cmd
#sleep 2s
#tail -n "$(wc -l ${log} | awk '{print $1+1}')" -f ${log}
fi
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "stage 3: ST Decoding"
if [[ ${n_average} -ne 1 ]]; then
# Average models
dec_model=avg_${n_average}_checkpoint.pt
if [[ ! -f ${model_dir}/${dec_model} ]]; then
cmd="python3 ${code_dir}/scripts/average_checkpoints.py
--inputs ${model_dir}
--num-best-checkpoints ${n_average}
--output ${model_dir}/${dec_model}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval $cmd
fi
else
dec_model=${dec_model}
fi
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then
device=""
else
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
fi
# export CUDA_VISIBLE_DEVICES=${device}
suffix=beam${beam_size}_alpha${len_penalty}_tokens${max_tokens}
if [[ ${n_average} -ne 1 ]]; then
suffix=${suffix}_${n_average}
fi
if [[ ${sacrebleu} -eq 1 ]]; then
suffix=${suffix}_sacrebleu
else
suffix=${suffix}_multibleu
fi
if [[ ${infer_score} -eq 1 ]]; then
suffix=${suffix}_score
fi
result_file=${model_dir}/decode_result_${suffix}
[[ -f ${result_file} ]] && rm ${result_file}
test_subset=${test_subset//,/ }
for subset in ${test_subset[@]}; do
subset=${subset}
cmd="python3 ${code_dir}/fairseq_cli/generate.py
${data_dir}
--config-yaml ${data_config}
--gen-subset ${subset}
--task speech_to_text
--path ${model_dir}/${dec_model}
--results-path ${model_dir}
--max-tokens ${max_tokens}
--beam ${beam_size}
--skip-invalid-size-inputs-valid-test
--lenpen ${len_penalty}"
if [[ ${ctc_infer} -eq 1 ]]; then
cmd="${cmd}
--ctc-infer"
fi
if [[ ${sacrebleu} -eq 1 ]]; then
cmd="${cmd}
--scoring sacrebleu"
if [[ "${tgt_lang}" = "ja" ]]; then
cmd="${cmd}
--sacrebleu-tokenizer ja-mecab"
elif [[ "${tgt_lang}" == "zh" ]]; then
cmd="${cmd}
--sacrebleu-tokenizer zh"
fi
if [[ ${tokenizer} -eq 1 ]]; then
cmd="${cmd}
--tokenizer moses
--source-lang ${src_lang}
--target-lang ${tgt_lang}"
fi
fi
if [[ ${infer_score} -eq 1 ]]; then
cmd="${cmd}
--score-reference"
fi
if [[ -n ${infer_parameters} ]]; then
cmd="${cmd}
${infer_parameters}"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
cd ${code_dir}
if [[ $eval -eq 1 ]]; then
src_ctc_file=translation-${subset}.txt.src_ctc
if [[ ${ctc_infer} -eq 1 && -f ${model_dir}/${src_ctc_file} ]]; then
rm ${model_dir}/${src_ctc_file}
elif [[ ${ctc_infer} -eq 1 && -f ${model_dir}/translation-${subset}.txt.ctc ]]; then
src_ctc_file=translation-${subset}.txt.ctc
rm ${model_dir}/${src_ctc_file}
fi
tgt_ctc_file=translation-${subset}.txt.tgt_ctc
if [[ ${ctc_infer} -eq 1 && -f ${model_dir}/${tgt_ctc_file} ]]; then
rm ${model_dir}/${tgt_ctc_file}
fi
eval $cmd
echo "" >> ${result_file}
tail -n 2 ${model_dir}/generate-${subset}.txt >> ${result_file}
mv ${model_dir}/generate-${subset}.txt ${model_dir}/generate-${subset}-${suffix}.txt
mv ${model_dir}/translation-${subset}.txt ${model_dir}/translation-${subset}-${suffix}.txt
cd ${pwd_dir}
if [[ -f ${model_dir}/enc_dump ]]; then
mv ${model_dir}/enc_dump ${model_dir}/${subset}-${suffix}-enc-dump
fi
if [[ -f ${model_dir}/dec_dump ]]; then
mv ${model_dir}/dec_dump ${model_dir}/${subset}-${suffix}-dec-dump
fi
trans_file=translation-${subset}-${suffix}.txt
if [[ ${ctc_infer} -eq 1 && -f ${model_dir}/${src_ctc_file} ]]; then
ref_file=${model_dir}/${subset}.${src_lang}
if [[ ! -f ${ref_file} ]]; then
python3 ./local/extract_txt_from_tsv.py ${data_dir}/${subset}.tsv ${ref_file} "src_text"
fi
if [[ -f ${ref_file} ]]; then
src_ctc=$(mktemp -t temp.record.XXXXXX)
cd ./local
./cal_wer.sh ${model_dir} ${subset} ${trans_file} ${src_ctc_file} ${ref_file} > ${src_ctc}
cd ..
echo "Source language" >> ${result_file}
echo "CTC WER" >> ${result_file}
tail -n 2 ${src_ctc} >> ${result_file}
src_bleu=$(mktemp -t temp.record.XXXXXX)
cd local
./cal_ctc_bleu.sh ${model_dir} ${subset} ${trans_file} ${src_ctc_file} ${ref_file} ${tokenizer} ${src_lang} > ${src_bleu}
cd ..
cat ${src_bleu} >> ${result_file}
rm ${src_ctc} ${src_bleu}
else
echo "No reference for source language."
fi
fi
tgt_ctc_file=translation-${subset}.txt.tgt_ctc
if [[ ${ctc_infer} -eq 1 && -f ${model_dir}/${tgt_ctc_file} ]]; then
ref_file=${model_dir}/${subset}.${tgt_lang}
if [[ ! -f ${ref_file} ]]; then
python3 ./local/extract_txt_from_tsv.py ${data_dir}/${subset}.tsv ${ref_file} "tgt_text"
fi
if [[ -f ${ref_file} ]]; then
tgt_ctc=$(mktemp -t temp.record.XXXXXX)
cd local
./cal_wer.sh ${model_dir} ${subset} ${trans_file} ${tgt_ctc_file} ${ref_file} > ${tgt_ctc}
cd ..
echo "Target language" >> ${result_file}
echo "CTC WER" >> ${result_file}
tail -n 2 ${tgt_ctc} >> ${result_file}
tgt_bleu=$(mktemp -t temp.record.XXXXXX)
cd local
./cal_ctc_bleu.sh ${model_dir} ${subset} ${trans_file} ${tgt_ctc_file} ${ref_file} ${tokenizer} ${tgt_lang} > ${tgt_bleu}
cd ..
cat ${tgt_bleu} >> ${result_file}
rm ${tgt_ctc} ${tgt_bleu}
else
echo "No reference for target language."
fi
fi
fi
done
echo
cat ${result_file}
fi
# if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# cd ${fairseq_dir}
# echo "Stage 4: Upload model and log"
# echo "Path: hdfs://haruna/home/byte_arnold_lq_mlnlc/user/xuchen/s2t/checkpoints/${data_model_subfix}/${exp_name}"
# hdfs dfs -mkdir -p hdfs://haruna/home/byte_arnold_lq_mlnlc/user/xuchen/s2t/checkpoints/${data_model_subfix}
# hdfs dfs -put -f ${model_dir} hdfs://haruna/home/byte_arnold_lq_mlnlc/user/xuchen/s2t/checkpoints/${data_model_subfix}
# fi
pwd_dir=$PWD
+root_dir=${ST_ROOT}
+data_root_dir=${root_dir}
-# dataset
+code_dir=${root_dir}/S2T
+# Dataset
src_lang=en
tgt_lang=de
dataset=covost
@@ -661,7 +55,7 @@ asr_vocab_prefix=spm_unigram10000_st_share
st_vocab_prefix=spm_unigram10000_st_share
data_model_subfix=${dataset}/${lang}/${data_tag}
-org_data_dir=${data_root_dir}/data/${dataset}
+org_data_dir=${data_root_dir}/data/${dataset}/${lang}
data_dir=${data_root_dir}/data/${data_model_subfix}
train_split=train
@@ -669,34 +63,36 @@ valid_split=dev
test_split=test
test_subset=dev,test
-# exp
+# Exp
sub_tag=
exp_prefix=$(date "+%m%d")
-# exp_subfix=${ARNOLD_JOB_ID}_${ARNOLD_TASK_ID}_${ARNOLD_TRIAL_ID}
extra_tag=
extra_parameter=
exp_tag=baseline
exp_name=
-# config
+# Training Settings
-train_config=sate,ctc
+train_config=base,ctc
-# training setting
fp16=1
max_tokens=40000
step_valid=0
bleu_valid=0
-# decoding setting
+# Decoding Settings
+batch_size=1
sacrebleu=1
dec_model=checkpoint_best.pt
ctc_infer=0
+infer_ctc_weight=0
n_average=10
beam_size=5
len_penalty=1.0
+infer_debug=0
infer_score=0
-infer_parameters=
+#infer_parameters=" --cal-topk-cross-attn-weights --topk-cross-attn-weights 15 --cal-entropy"
+#infer_parameters="--cal-monotonic-cross-attn-weights --cal-localness --localness-window 0.1 --cal-topk-cross-attn-weights --topk-cross-attn-weights 15 --cal-entropy"
+# Parsing Options
if [[ ${share_dict} -eq 1 ]]; then
data_config=config_share.yaml
else
@@ -722,23 +118,6 @@ if [[ ${use_raw_audio} -eq 1 ]]; then
data_dir=${data_dir}_raw
exp_prefix=${exp_prefix}_raw
fi
-if [[ ! -d /mnt/bd/data-model && -d /mnt/bd/data-model2 ]]; then
-sudo ln -s /mnt/bd/data-model2/ /mnt/bd/data-model
-fi
-# setup nccl envs
-export NCCL_IB_DISABLE=0
-export NCCL_IB_HCA=$ARNOLD_RDMA_DEVICE:1
-export NCCL_IB_GID_INDEX=3
-export NCCL_SOCKET_IFNAME=eth0
-HOSTS=$ARNOLD_WORKER_HOSTS
-HOST=(${HOSTS//,/ })
-HOST_SPLIT=(${HOST//:/ })
-PORT=${HOST_SPLIT[1]}
-INIT_METHOD="tcp://${ARNOLD_WORKER_0_HOST}:${ARNOLD_WORKER_0_PORT}"
-DIST_RANK=$((ARNOLD_ID * ARNOLD_WORKER_GPU))
export PATH=$PATH:${code_dir}/scripts
. ./local/parse_options.sh || exit 1;
@@ -752,21 +131,27 @@ if [[ -z ${exp_name} ]]; then
exp_name=${exp_name}_${exp_subfix}
fi
fi
-model_dir=${code_dir}/checkpoints/${data_model_subfix}/${sub_tag}/${exp_name}
-echo "stage: $stage"
+ckpt_dir=${root_dir}/checkpoints/
-echo "stop_stage: $stop_stage"
+model_dir=${root_dir}/checkpoints/${data_model_subfix}/${sub_tag}/${exp_name}
+# Start
cd ${code_dir}
+echo "Start Stage: $stage"
+echo "Stop Stage: $stop_stage"
+if [[ `pip list | grep fairseq | wc -l` -eq 0 ]]; then
+echo "Default Stage: env configure"
+pip3 install -e ${code_dir}
+fi
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
-echo "stage -1: Data Download"
+echo "Stage -1: Data Download"
-# pass
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
### Task dependent. You have to make data the following preparation part by yourself.
-### But you can utilize Kaldi recipes in most cases
+echo "Stage 0: ASR Data Preparation"
-echo "stage 0: ASR Data Preparation"
if [[ ! -e ${data_dir} ]]; then
mkdir -p ${data_dir}
fi
@@ -798,14 +183,15 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
cp -f ${data_dir}/asr4st/${asr_prefix}* ${data_dir}
fi
-echo "stage 0: ST Data Preparation"
+echo "Stage 0: ST Data Preparation"
-cmd="python3 ${code_dir}/examples/speech_to_text/prep_covost_data.py
+cmd="python3 ${code_dir}/examples/speech_to_text/prep_audio_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}
--task st
--add-src
--src-lang ${src_lang}
--tgt-lang ${tgt_lang}
+--splits ${valid_split},${test_split},${train_split}
--cmvn-type utterance
--vocab-type ${vocab_type}
--vocab-size ${vocab_size}"
@@ -853,37 +239,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
[[ $eval -eq 1 ]] && eval ${cmd}
fi
-if [[ ! -d ${data_dir} ]]; then
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-echo "No feature dir ${data_dir}"
+echo "Stage 1: Network Training"
-exit
-fi
-if [[ `pip list | grep fairseq | wc -l` -eq 0 ]]; then
-echo "default stage: env configure"
-pip3 install -e ${code_dir} -i https://bytedpypi.byted.org/simple --no-build-isolation --default-timeout=10000
-fi
-if [[ -d /mnt/bn/nas-xc-1/checkpoints && ! -d ${code_dir}/checkpoints ]]; then
-ln -s /mnt/bn/nas-xc-1/checkpoints ${code_dir}
-fi
-# if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-if [ ${hdfs_get} -eq 1 ]; then
-ln_data_dir=`echo ${data_dir} | sed -e "s#${data_root_dir}#${code_dir}#"`
-echo ${ln_data_dir}
-mkdir -p ${ln_data_dir}
-ln -s ${data_dir}/../* ${ln_data_dir}
-rm -r ${ln_data_dir}
-hdfs_path=`echo ${data_dir} | sed -e "s#${data_root_dir}#hdfs://haruna/home/byte_arnold_lq_mlnlc/user/xuchen/#"`
-hdfs dfs -get ${hdfs_path} ${ln_data_dir}
-sed -i -e "s#${data_root_dir}#${code_dir}#" ${ln_data_dir}/config*
-data_dir=${ln_data_dir}
-fi
-# fi
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-echo "stage 2: ST Network Training"
[[ ! -d ${data_dir} ]] && echo "The data dir ${data_dir} is not existing!" && exit 1;
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
@@ -893,6 +250,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
+export CUDA_VISIBLE_DEVICES=${device}
fi
echo -e "data=${data_dir} model=${model_dir}"
@@ -946,11 +304,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
cmd="${cmd}
--distributed-world-size $gpu_num
--ddp-backend no_c10d"
-if [[ ${DIST_RANK} -ne 0 ]]; then
-cmd="${cmd}
---distributed-init-method ${INIT_METHOD}
---distributed-rank ${DIST_RANK}"
-fi
fi
if [[ $fp16 -eq 1 ]]; then
cmd="${cmd}
@@ -996,22 +349,17 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo -e "\033[34mRun command: \n${cmd} \033[0m"
# save info
-log=${model_dir}/history.log
+log=${ckpt_dir}/history.log
echo "${time} | ${data_dir} | ${exp_name} | ${model_dir} " >> $log
tail -n 50 ${log} > tmp.log
mv tmp.log $log
-# export CUDA_VISIBLE_DEVICES=${device}
log=${model_dir}/train.log
cmd="${cmd} 2>&1 | tee -a ${log}"
-#cmd="nohup ${cmd} >> ${log} 2>&1 &"
+#cmd="${cmd} >> ${log} 2>&1 "
if [[ $eval -eq 1 ]]; then
# tensorboard
-if [[ -z ${ARNOLD_TENSORBOARD_CURRENT_PORT} ]]; then
port=6666
-else
-port=${ARNOLD_TENSORBOARD_CURRENT_PORT}
-fi
tensorboard --logdir ${model_dir} --port ${port} --bind_all &
echo "${cmd}" > ${model_dir}/cmd
@@ -1021,8 +369,8 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
fi
fi
-if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-echo "stage 3: ST Decoding"
+echo "Stage 2: Decoding"
if [[ ${n_average} -ne 1 ]]; then
# Average models
dec_model=avg_${n_average}_checkpoint.pt
@@ -1046,37 +394,63 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
+export CUDA_VISIBLE_DEVICES=${device}
fi
-# export CUDA_VISIBLE_DEVICES=${device}
-suffix=beam${beam_size}_alpha${len_penalty}_tokens${max_tokens}
+suffix=alpha${len_penalty}
-if [[ ${n_average} -ne 1 ]]; then
+model_str=`echo $dec_model | sed -e "s#checkpoint##" | sed "s#.pt##"`
-suffix=${suffix}_${n_average}
+suffix=${suffix}_${model_str}
-fi
if [[ ${sacrebleu} -eq 1 ]]; then
suffix=${suffix}_sacrebleu
else
suffix=${suffix}_multibleu
fi
+suffix=${suffix}_beam${beam_size}
+if [[ ${batch_size} -ne 0 ]]; then
+suffix=${suffix}_batch${batch_size}
+else
+suffix=${suffix}_tokens${max_tokens}
+fi
+if [[ ${ctc_infer} -eq 1 ]]; then
+suffix=${suffix}_ctc
+fi
+if [[ ${ctc_self_ensemble} -eq 1 ]]; then
+suffix=${suffix}_ensemble
+fi
+if [[ ${ctc_inter_logit} -ne 0 ]]; then
+suffix=${suffix}_logit${ctc_inter_logit}
+fi
+if (( $(echo "${infer_ctc_weight} > 0" | bc -l) )); then
+suffix=${suffix}_ctc${infer_ctc_weight}
+fi
if [[ ${infer_score} -eq 1 ]]; then
suffix=${suffix}_score
fi
+suffix=`echo $suffix | sed -e "s#__#_#"`
result_file=${model_dir}/decode_result_${suffix}
[[ -f ${result_file} ]] && rm ${result_file}
test_subset=${test_subset//,/ }
for subset in ${test_subset[@]}; do
subset=${subset}
-cmd="python3 ${code_dir}/fairseq_cli/generate.py
+if [[ ${infer_debug} -ne 0 ]]; then
+cmd="python3 -m debugpy --listen 0.0.0.0:5678 --wait-for-client"
+else
+cmd="python3 "
+fi
+cmd="$cmd ${code_dir}/fairseq_cli/generate.py
${data_dir}
--config-yaml ${data_config}
--gen-subset ${subset}
--task speech_to_text
--path ${model_dir}/${dec_model}
--results-path ${model_dir}
+--batch-size ${batch_size}
--max-tokens ${max_tokens}
--beam ${beam_size}
--skip-invalid-size-inputs-valid-test
+--infer-ctc-weight ${infer_ctc_weight}
--lenpen ${len_penalty}"
if [[ ${ctc_infer} -eq 1 ]]; then
@@ -1100,6 +474,14 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--target-lang ${tgt_lang}"
fi
fi
+if [[ ${ctc_self_ensemble} -eq 1 ]]; then
+cmd="${cmd}
+--ctc-self-ensemble"
+fi
+if [[ ${ctc_inter_logit} -ne 0 ]]; then
+cmd="${cmd}
+--ctc-inter-logit ${ctc_inter_logit}"
+fi
if [[ ${infer_score} -eq 1 ]]; then
cmd="${cmd}
--score-reference"
@@ -1113,16 +495,13 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
cd ${code_dir}
if [[ $eval -eq 1 ]]; then
-src_ctc_file=translation-${subset}.txt.src_ctc
+ctc_file=translation-${subset}.ctc
-if [[ ${ctc_infer} -eq 1 && -f ${model_dir}/${src_ctc_file} ]]; then
+if [[ ${ctc_infer} -eq 1 && -f ${model_dir}/${ctc_file} ]]; then
-rm ${model_dir}/${src_ctc_file}
+rm ${model_dir}/${ctc_file}
-elif [[ ${ctc_infer} -eq 1 && -f ${model_dir}/translation-${subset}.txt.ctc ]]; then
-src_ctc_file=translation-${subset}.txt.ctc
-rm ${model_dir}/${src_ctc_file}
fi
-tgt_ctc_file=translation-${subset}.txt.tgt_ctc
+xctc_file=translation-${subset}.xctc
-if [[ ${ctc_infer} -eq 1 && -f ${model_dir}/${tgt_ctc_file} ]]; then
+if [[ ${ctc_infer} -eq 1 && -f ${model_dir}/${xctc_file} ]]; then
-rm ${model_dir}/${tgt_ctc_file}
+rm ${model_dir}/${xctc_file}
fi
eval $cmd
@@ -1133,62 +512,61 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
cd ${pwd_dir}
if [[ -f ${model_dir}/enc_dump ]]; then
-mv ${model_dir}/enc_dump ${model_dir}/${subset}-${suffix}-enc-dump
+mv ${model_dir}/enc_dump ${model_dir}/dump-${subset}-enc-${suffix}
fi
if [[ -f ${model_dir}/dec_dump ]]; then
-mv ${model_dir}/dec_dump ${model_dir}/${subset}-${suffix}-dec-dump
+mv ${model_dir}/dec_dump ${model_dir}/dump-${subset}-dec-${suffix}
fi
trans_file=translation-${subset}-${suffix}.txt
-if [[ ${ctc_infer} -eq 1 && -f ${model_dir}/${src_ctc_file} ]]; then
+if [[ ${ctc_infer} -eq 1 && -f ${model_dir}/${ctc_file} ]]; then
ref_file=${model_dir}/${subset}.${src_lang}
if [[ ! -f ${ref_file} ]]; then
python3 ./local/extract_txt_from_tsv.py ${data_dir}/${subset}.tsv ${ref_file} "src_text"
fi
if [[ -f ${ref_file} ]]; then
-src_ctc=$(mktemp -t temp.record.XXXXXX)
+ctc=$(mktemp -t temp.record.XXXXXX)
cd ./local
-./cal_wer.sh ${model_dir} ${subset} ${trans_file} ${src_ctc_file} ${ref_file} > ${src_ctc}
+./cal_wer.sh ${model_dir} ${subset} ${trans_file} ${ctc_file} ${ref_file} > ${ctc}
cd ..
-echo "Source language" >> ${result_file}
echo "CTC WER" >> ${result_file}
-tail -n 2 ${src_ctc} >> ${result_file}
+tail -n 2 ${ctc} >> ${result_file}
src_bleu=$(mktemp -t temp.record.XXXXXX)
cd local
-./cal_ctc_bleu.sh ${model_dir} ${subset} ${trans_file} ${src_ctc_file} ${ref_file} ${tokenizer} ${src_lang} > ${src_bleu}
+./cal_ctc_bleu.sh ${model_dir} ${subset} ${trans_file} ${ctc_file} ${ref_file} ${tokenizer} ${src_lang} > ${src_bleu}
cd ..
cat ${src_bleu} >> ${result_file}
-rm ${src_ctc} ${src_bleu}
+rm ${ctc} ${src_bleu}
else
echo "No reference for source language."
fi
fi
-tgt_ctc_file=translation-${subset}.txt.tgt_ctc
+xctc_file=translation-${subset}.xctc
-if [[ ${ctc_infer} -eq 1 && -f ${model_dir}/${tgt_ctc_file} ]]; then
+if [[ ${ctc_infer} -eq 1 && -f ${model_dir}/${xctc_file} ]]; then
ref_file=${model_dir}/${subset}.${tgt_lang}
if [[ ! -f ${ref_file} ]]; then
python3 ./local/extract_txt_from_tsv.py ${data_dir}/${subset}.tsv ${ref_file} "tgt_text"
fi
if [[ -f ${ref_file} ]]; then
-tgt_ctc=$(mktemp -t temp.record.XXXXXX)
+xctc=$(mktemp -t temp.record.XXXXXX)
cd local
-./cal_wer.sh ${model_dir} ${subset} ${trans_file} ${tgt_ctc_file} ${ref_file} > ${tgt_ctc}
+./cal_wer.sh ${model_dir} ${subset} ${trans_file} ${xctc_file} ${ref_file} > ${xctc}
cd ..
-echo "Target language" >> ${result_file}
-echo "CTC WER" >> ${result_file}
-tail -n 2 ${tgt_ctc} >> ${result_file}
+echo "XCTC WER" >> ${result_file}
+tail -n 2 ${xctc} >> ${result_file}
tgt_bleu=$(mktemp -t temp.record.XXXXXX)
cd local
-./cal_ctc_bleu.sh ${model_dir} ${subset} ${trans_file} ${tgt_ctc_file} ${ref_file} ${tokenizer} ${tgt_lang} > ${tgt_bleu}
+./cal_ctc_bleu.sh ${model_dir} ${subset} ${trans_file} ${xctc_file} ${ref_file} ${tokenizer} ${tgt_lang} > ${tgt_bleu}
cd ..
cat ${tgt_bleu} >> ${result_file}
-rm ${tgt_ctc} ${tgt_bleu}
+rm ${xctc} ${tgt_bleu}
else
echo "No reference for target language."
fi
@@ -1198,11 +576,3 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo
cat ${result_file}
fi
-# if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
-#     cd ${fairseq_dir}
-#     echo "Stage 4: Upload model and log"
-#     echo "Path: hdfs://haruna/home/byte_arnold_lq_mlnlc/user/xuchen/s2t/checkpoints/${data_model_subfix}/${exp_name}"
-#     hdfs dfs -mkdir -p hdfs://haruna/home/byte_arnold_lq_mlnlc/user/xuchen/s2t/checkpoints/${data_model_subfix}
-#     hdfs dfs -put -f ${model_dir} hdfs://haruna/home/byte_arnold_lq_mlnlc/user/xuchen/s2t/checkpoints/${data_model_subfix}
-# fi
arch: s2t_ctc
encoder-type: pds
encoder-embed-dim: 256
pds-stages: 5
pds-layers: 2_2_6_3_3
pds-ratios: 2_2_2_2_0
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1_1
pds-kernel-sizes: 5_5_5_5_5
pds-ffn-ratios: 8_8_8_8_8
pds-attn-heads: 4_4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: ctc
zero_infinity: True
ctc-weight: 1.0
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 18
encoder-attention-heads: 4
#load-pretrained-encoder-from:
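The per-stage fields above are read positionally, one entry per pds stage. A quick hedged sanity check on pds-ratios, assuming each entry is a multiplicative downsampling factor and 0 means the stage keeps its length (an interpretation taken from the naming, not confirmed against the encoder code):
# Hedged helper: overall encoder subsampling from the pds-ratios string.
ratios="2_2_2_2_0"
total=1
for r in ${ratios//_/ }; do
    [[ $r -gt 0 ]] && total=$((total * r))
done
echo "overall subsampling: ${total}x"   # -> 16x for 2_2_2_2_0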
arch: s2t_ctc
encoder-type: pds
encoder-embed-dim: 256
pds-stages: 4
pds-layers: 2_4_8_4
pds-ratios: 2_2_2_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: ctc
zero_infinity: True
ctc-weight: 1.0
dropout: 0.1
activation-fn: relu
encoder-layers: 18
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_ctc
encoder-type: pds
encoder-embed-dim: 256
pds-stages: 5
pds-layers: 3_3_4_4_4
pds-ratios: 2_2_2_0_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1_1
pds-kernel-sizes: 5_5_5_5_5
pds-ffn-ratios: 8_8_8_8_8
pds-attn-heads: 4_4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: ctc
zero_infinity: True
ctc-weight: 1.0
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 18
encoder-attention-heads: 4
#load-pretrained-encoder-from:
arch: s2t_ctc
encoder-type: pds
encoder-embed-dim: 256
pds-stages: 4
pds-layers: 2_6_6_4
pds-ratios: 2_2_2_0
pds-fusion: False
pds-fusion-method: all_conv2
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: ctc
zero_infinity: True
ctc-weight: 1.0
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 18
encoder-attention-heads: 4
#load-pretrained-encoder-from:
dir=$1
vocab=$2
#dir=early_exit/purectc_pds_base_8_ee_conformer_layer36_interw1_x2_4k_ds0_pdrop0_334
#vocab=4k
data_tag=asr
data=config_${vocab}.yaml
./run.sh --stage 2 --data_tag ${data_tag} --data_config ${data} --exp_name $dir --max_tokens 40000
#./run.sh --stage 2 --data_config ${data} --exp_name $dir
#./run.sh --stage 2 --data_config ${data} --exp_name $dir --infer_tag ee4 --infer_parameter "--early-exit-count 4"
#./run.sh --stage 2 --data_config ${data} --exp_name $dir --infer_tag ee6 --infer_parameter "--early-exit-count 6"
#./run.sh --stage 2 --data_config ${data} --exp_name $dir --infer_tag ee8 --infer_parameter "--early-exit-count 8"
...
set -e
./pipe2.sh pds/enc_dec/base_conformer_256_conv15 256
./pipe2.sh pds/enc_dec/base_conformer_4k_conv15 4k
./pipe2.sh pds/enc_dec/base_conformer_32k_conv15 32k
#./pipe2.sh pds/purectc/purectc_conformer_256 256
#./pipe2.sh pds/purectc/purectc_conformer_4k 4k
#./pipe2.sh pds/purectc/purectc_conformer_32k 32k
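Each driver line hands pipe2.sh an experiment directory and a vocabulary tag; spelled out against the positional parameters above, the first call reduces to:
# ./pipe2.sh pds/enc_dec/base_conformer_256_conv15 256 is equivalent to:
dir=pds/enc_dec/base_conformer_256_conv15
vocab=256
./run.sh --stage 2 --data_tag asr --data_config config_256.yaml --exp_name ${dir} --max_tokens 40000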
@@ -54,6 +54,8 @@ data_model_subfix=${dataset}/${data_tag}
org_data_dir=${data_root_dir}/data/${dataset}
data_dir=${data_root_dir}/data/${data_model_subfix}
test_subset=dev-clean,dev-other,test-clean,test-other,all
#test_subset=test-other
#test_subset=flops
# exp
sub_tag=
exp_prefix=$(date "+%m%d")
@@ -75,20 +77,20 @@ infer_ctc_weight=0
ctc_self_ensemble=0
ctc_inter_logit=0
n_average=10
batch_size=0
beam_size=5
len_penalty=1.0
single=0
epoch_ensemble=1
best_ensemble=1
infer_debug=0
infer_score=0
infer_tag=
infer_parameter=
#infer_tag=ee3
#infer_parameter="--cal-flops True"
#infer_parameter="--early-exit-count 3"
#infer_parameter="--cal-monotonic-cross-attn-weights --cal-localness --localness-window 0.05 --cal-topk-cross-attn-weights --topk-cross-attn-weights 15 --cal-entropy"
#infer_parameter="--cal-monotonic-cross-attn-weights --cal-localness --localness-window 0.1 --cal-topk-cross-attn-weights --topk-cross-attn-weights 15 --cal-entropy"
data_config=config.yaml
...
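With batch_size set to 0, batching falls back to max_tokens, the usual fairseq behavior when no sentence-level batch size is given. A hedged sketch of the generate call these defaults feed, written against the stock fairseq_cli/generate.py interface (the repo's wrapper, and the ${max_tokens} variable here, are assumptions; it may assemble the command differently):
# Hedged sketch, not the wrapper's literal command line:
python3 ${code_dir}/fairseq_cli/generate.py ${data_dir} \
    --task speech_to_text \
    --config-yaml ${data_config} \
    --gen-subset ${subset} \
    --path ${model_dir}/${dec_model} \
    --max-tokens ${max_tokens} \
    --beam ${beam_size} \
    --lenpen ${len_penalty}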
@@ -14,7 +14,7 @@ fi
sacrebleu=1
ctc_infer=0
n_average=10
beam_size=5
infer_ctc_weight=0
len_penalty=1.0
...
#train-subset: train_en-de,train_en-fr,train_en-es,train_en-it,train_en-nl,train_en-pt,train_en-ro,train_en-ru
train-subset: train_mix_1,train_mix_2,train_mix_3,train_mix_4,train_mix_5,train_mix_6,train_mix_7,train_mix_8
#valid-subset: dev_en-de,dev_en-fr,dev_en-es,dev_en-it,dev_en-nl,dev_en-pt,dev_en-ro,dev_en-ru
valid-subset: dev_en-de
#train-subset: train_en-de,train_en-fr,train_en-es,train_en-it
#valid-subset: dev_en-de,dev_en-fr,dev_en-es,dev_en-it
ignore-prefix-size: 1
prefix-size: 1
sharded-data-load: True
max-epoch: 300
max-update: 300000
patience: 20
...
@@ -2,9 +2,9 @@
gpu_num=1
data_tag=st
test_subset=(tst-COMMON_en-de tst-COMMON_en-fr tst-COMMON_en-es tst-COMMON_en-it tst-COMMON_en-nl tst-COMMON_en-pt tst-COMMON_en-ro tst-COMMON_en-ru)
#test_subset=(tst-COMMON_en-de)
#test_subset=(test_en-fr_1k)
exp_name=
@@ -13,13 +13,13 @@ if [ "$#" -eq 1 ]; then
fi
sacrebleu=1
ctc_infer=1
n_average=10
beam_size=5
infer_ctc_weight=0.1
len_penalty=1.0
max_tokens=20000
batch_size=1
infer_debug=0
dec_model=checkpoint_best.pt
...
@@ -23,7 +23,7 @@ if [[ ${tokenizer} -eq 1 ]]; then
fi
echo "SacreBLEU" >> ${record}
cmd="cat ${gen} | sacrebleu ${ref} -m bleu -w 4 -l ${lang_pair} | jq -r .score"
eval $cmd >> ${record}
cat ${record}
rm ${record}
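The appended jq stage assumes sacrebleu is emitting its JSON report (the default output format in sacrebleu 2.x) and that jq is installed; jq -r .score then reduces the report to the bare number. For example:
# sacrebleu 2.x prints a JSON report by default, roughly:
#   {"name": "BLEU", "score": 27.1, "signature": "...", ...}
# so the added stage leaves only the score:
echo '{"name": "BLEU", "score": 27.1}' | jq -r .score   # -> 27.1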
@@ -88,8 +88,12 @@ infer_ctc_weight=0
n_average=10
beam_size=5
len_penalty=1.0
epoch_ensemble=0
best_ensemble=1
infer_debug=0
infer_score=0
infer_tag=
infer_parameter=
#infer_parameter="--cal-monotonic-cross-attn-weights --cal-localness --localness-window 0.1 --cal-topk-cross-attn-weights --topk-cross-attn-weights 15 --cal-entropy"
# Parsing Options
@@ -264,11 +268,9 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
cp -f ${pwd_dir}/`basename ${BASH_SOURCE[0]}` ${model_dir}
cp -f ${pwd_dir}/train.sh ${model_dir}
train_config=basis,${train_config}
config_list="${train_config//,/ }"
idx=0
for config in ${config_list[@]}
do
config_path=${pwd_dir}/conf/${config}.yaml
@@ -278,8 +280,13 @@ fi
fi
cp -f ${config_path} ${model_dir}
if [[ $idx -eq 0 ]]; then
extra_parameter="${extra_parameter}
--train-config ${config_path}"
else
extra_parameter="${extra_parameter}
--train-config${idx} ${config_path}"
fi
idx=$((idx + 1))
done
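With basis prepended to train_config and idx starting from 0, the loop now emits an unnumbered --train-config for the first file and numbered variants afterwards; an illustrative expansion (config names beyond basis are hypothetical):
# train_config=basis,base,ctc expands to:
#   --train-config  ${pwd_dir}/conf/basis.yaml
#   --train-config1 ${pwd_dir}/conf/base.yaml
#   --train-config2 ${pwd_dir}/conf/ctc.yaml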
@@ -371,20 +378,38 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "Stage 2: Decoding"
dec_models=
if [[ ${n_average} -eq 1 ]]; then
dec_models=${dec_model}
fi
if [[ ${n_average} -ne 1 ]]; then
# Average models
if [[ ${epoch_ensemble} -eq 1 ]]; then
avg_model=avg_epoch${n_average}_checkpoint.pt
if [[ ! -f ${model_dir}/${avg_model} ]]; then
cmd="python3 ${code_dir}/scripts/average_checkpoints.py
--inputs ${model_dir}
--num-epoch-checkpoints ${n_average}
--output ${model_dir}/${avg_model}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval $cmd
fi
dec_models+=(${avg_model})
fi
if [[ ${best_ensemble} -eq 1 ]]; then
avg_model=avg_best${n_average}_checkpoint.pt
if [[ ! -f ${model_dir}/${avg_model} ]]; then
cmd="python3 ${code_dir}/scripts/average_checkpoints.py
--inputs ${model_dir}
--num-best-checkpoints ${n_average}
--output ${model_dir}/${avg_model}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval $cmd
fi
dec_models+=(${avg_model})
fi
fi
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
@@ -397,6 +422,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
export CUDA_VISIBLE_DEVICES=${device}
fi
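A hedged summary of what the rewritten averaging logic queues for the decode loop below:
# With n_average=10, epoch_ensemble=1, best_ensemble=1 the loop decodes both:
#   avg_epoch10_checkpoint.pt  (last 10 epoch checkpoints, averaged)
#   avg_best10_checkpoint.pt   (10 best checkpoints by the validation metric)
# With n_average=1 it decodes only ${dec_model} (checkpoint_best.pt by default).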
for dec_model in ${dec_models[@]}; do
suffix=alpha${len_penalty}
model_str=`echo $dec_model | sed -e "s#checkpoint##" | sed "s#.pt##"`
suffix=${suffix}_${model_str}
@@ -426,6 +452,9 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
if [[ ${infer_score} -eq 1 ]]; then
suffix=${suffix}_score
fi
if [[ -n ${infer_tag} ]]; then
suffix=${suffix}_${infer_tag}
fi
suffix=`echo $suffix | sed -e "s#__#_#"`
result_file=${model_dir}/decode_result_${suffix}
@@ -487,9 +516,9 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
cmd="${cmd}
--score-reference"
fi
if [[ -n ${infer_parameter} ]]; then
cmd="${cmd}
${infer_parameter}"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
@@ -575,5 +604,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
fi
done
echo
echo "" >> ${result_file}
cat ${result_file}
done
fi
@@ -18,7 +18,8 @@ best_checkpoint_metric: bleu
maximize_best_checkpoint_metric: True
# no-epoch-checkpoints: True
validate-interval: 1
keep-last-epochs: 10
keep-best-checkpoints: 10
num-workers: 8
...
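Both averaging branches only work if enough checkpoints survive on disk, which is what the enlarged keep-last-epochs and keep-best-checkpoints settings guarantee. A hedged spot check (the best-checkpoint file pattern is assumed from fairseq's checkpoint.best_{metric}_{value}.pt convention):
# Both counts should be >= n_average before decoding:
ls ${model_dir}/checkpoint[0-9]*.pt 2>/dev/null | wc -l
ls ${model_dir}/checkpoint.best_* 2>/dev/null | wc -l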
arch: s2t_transformer_m
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-embed-norm: True
encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 18
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
# Bilingual CTC
share-ctc-and-embed: True
share-xctc-and-embed: True
ctc-weight: 0.2
ctc-layer: 12
#xctc-weight: 0.1
# InterCTC
inter-ctc-weight: 0.1
inter-ctc-layers: 6,9
share-inter-ctc: True
#inter-xctc-weight: 0.05
#inter-xctc-layers: 16
# Prediction-aware encoding
ctc-pae: inter_league
#xctc-pae: inter_league
pae-unnorm-input: True
# Curriculum learning mixing
#xctc-pae-ground-truth-ratio: 0.1
#xctc-pae-ground-truth-only-mistake: True
#pae-oracle-smooth: True
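Taken together, these weights sketch the multi-task objective; a hedged reading (the exact combination, and the reduction over inter-CTC layers, is defined by the criterion code and assumed here):
# loss ~= label_smoothed_CE
#       + 0.2 * CTC(encoder layer 12, source transcript)
#       + 0.1 * mean(interCTC at layers 6 and 9)
# XCTC terms stay disabled in this variant (commented out); the config below turns them on.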
arch: s2t_transformer_m
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam-betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
encoder-embed-norm: True
encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 18
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
# Bilingual CTC
share-ctc-and-embed: True
share-xctc-and-embed: True
ctc-weight: 0.2
xctc-weight: 0.1
# InterCTC
inter-ctc-weight: 0.1
inter-ctc-layers: 6,9,12,15
share-inter-ctc: True
inter-xctc-weight: 0.05
inter-xctc-layers: 6,9,12,15
# Prediction-aware encoding
ctc-pae: inter_league
xctc-pae: inter_league
pae-unnorm-input: True
# Curriculum learning mixing
xctc-pae-ground-truth-ratio: 0.1
xctc-pae-ground-truth-only-mistake: True
pae-oracle-smooth: True
@@ -23,7 +23,7 @@ if [[ ${tokenizer} -eq 1 ]]; then
fi
echo "SacreBLEU" >> ${record}
cmd="cat ${gen} | sacrebleu ${ref} -m bleu -w 4 -l ${lang_pair} | jq -r .score"
eval $cmd >> ${record}
cat ${record}
rm ${record}
exp_name=$1
#./run.sh --stage 2 --exp_name $exp_name --n_average 1 --test_subset tst-COMMON --dec_model checkpoint10.pt
./run.sh --stage 2 --exp_name $exp_name --n_average 1 --test_subset tst-COMMON --dec_model checkpoint30.pt
./run.sh --stage 2 --exp_name $exp_name --n_average 1 --test_subset tst-COMMON --dec_model checkpoint50.pt
./run.sh --stage 2 --exp_name $exp_name --n_average 1 --test_subset tst-COMMON --dec_model checkpoint70.pt
exit
./run.sh --stage 2 --exp_name $exp_name --n_average 1 --test_subset tst-COMMON --dec_model checkpoint100.pt
./run.sh --stage 2 --exp_name $exp_name --n_average 1 --test_subset tst-COMMON --dec_model checkpoint120.pt
./run.sh --stage 2 --exp_name $exp_name --n_average 1 --test_subset tst-COMMON --dec_model checkpoint140.pt
./run.sh --stage 2 --exp_name $exp_name --n_average 1 --test_subset tst-COMMON --dec_model checkpoint160.pt
./run.sh --stage 2 --exp_name $exp_name --n_average 1 --test_subset tst-COMMON --dec_model checkpoint180.pt
./run.sh --stage 2 --exp_name $exp_name --n_average 1 --test_subset tst-COMMON --dec_model checkpoint200.pt
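This driver scores individual epoch checkpoints without averaging; note the exit above cuts the sweep off after checkpoint70.pt, so the later checkpoints are currently skipped. A hypothetical invocation, with both names made up for illustration:
# ./eval_epochs.sh 0501_st_base_conformer   # script and experiment names are hypothetical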
@@ -87,9 +87,12 @@ infer_ctc_weight=0
n_average=10
beam_size=5
len_penalty=1.0
epoch_ensemble=0
best_ensemble=1
infer_debug=0
infer_score=0
infer_tag=
infer_parameter=
#infer_parameter="--cal-monotonic-cross-attn-weights --cal-localness --localness-window 0.1 --cal-topk-cross-attn-weights --topk-cross-attn-weights 15 --cal-entropy"
# Parsing Options
@@ -264,11 +267,9 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
cp -f ${pwd_dir}/`basename ${BASH_SOURCE[0]}` ${model_dir}
cp -f ${pwd_dir}/train.sh ${model_dir}
train_config=basis,${train_config}
config_list="${train_config//,/ }"
idx=0
for config in ${config_list[@]}
do
config_path=${pwd_dir}/conf/${config}.yaml
@@ -278,8 +279,13 @@ fi
fi
cp -f ${config_path} ${model_dir}
if [[ $idx -eq 0 ]]; then
extra_parameter="${extra_parameter}
--train-config ${config_path}"
else
extra_parameter="${extra_parameter}
--train-config${idx} ${config_path}"
fi
idx=$((idx + 1))
done
@@ -371,20 +377,38 @@ fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "Stage 2: Decoding"
dec_models=
if [[ ${n_average} -eq 1 ]]; then
dec_models=${dec_model}
fi
if [[ ${n_average} -ne 1 ]]; then
# Average models
if [[ ${epoch_ensemble} -eq 1 ]]; then
avg_model=avg_epoch${n_average}_checkpoint.pt
if [[ ! -f ${model_dir}/${avg_model} ]]; then
cmd="python3 ${code_dir}/scripts/average_checkpoints.py
--inputs ${model_dir}
--num-epoch-checkpoints ${n_average}
--output ${model_dir}/${avg_model}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval $cmd
fi
dec_models+=(${avg_model})
fi
if [[ ${best_ensemble} -eq 1 ]]; then
avg_model=avg_best${n_average}_checkpoint.pt
if [[ ! -f ${model_dir}/${avg_model} ]]; then
cmd="python3 ${code_dir}/scripts/average_checkpoints.py
--inputs ${model_dir}
--num-best-checkpoints ${n_average}
--output ${model_dir}/${avg_model}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval $cmd
fi
dec_models+=(${avg_model})
fi
fi
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
@@ -397,6 +421,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
export CUDA_VISIBLE_DEVICES=${device}
fi
for dec_model in ${dec_models[@]}; do
suffix=alpha${len_penalty}
model_str=`echo $dec_model | sed -e "s#checkpoint##" | sed "s#.pt##"`
suffix=${suffix}_${model_str}
@@ -426,6 +451,9 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
if [[ ${infer_score} -eq 1 ]]; then
suffix=${suffix}_score
fi
if [[ -n ${infer_tag} ]]; then
suffix=${suffix}_${infer_tag}
fi
suffix=`echo $suffix | sed -e "s#__#_#"`
result_file=${model_dir}/decode_result_${suffix}
@@ -486,9 +514,9 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
cmd="${cmd}
--score-reference"
fi
if [[ -n ${infer_parameter} ]]; then
cmd="${cmd}
${infer_parameter}"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
@@ -574,5 +602,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
fi
done
echo
echo "" >> ${result_file}
cat ${result_file}
done
fi