Commit 2215ade0 by xuchen

Cumulative updates. I mainly optimize the shell scripts and support the new…

Cumulative updates. I mainly optimize the shell scripts and support the new benchmarks. It is more friendly to MT researchers (also including me). I also improve the code. Of course, old problems still remain and new problems arise. Just keep coding.
parent a2353895
...@@ -26,3 +26,6 @@ decoder-ffn-embed-dim: 2048 ...@@ -26,3 +26,6 @@ decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4 decoder-attention-heads: 4
attention-dropout: 0.1 attention-dropout: 0.1
activation-dropout: 0.1 activation-dropout: 0.1
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: s2t_transformer_m
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
#adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
conv-kernel-sizes: 5,5
conv-channels: 1024
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
ctc-weight: 0.3 ctc-weight: 0.3
post-process: sentencepiece post-process: sentencepiece
\ No newline at end of file
...@@ -24,3 +24,6 @@ encoder-attention-heads: 4 ...@@ -24,3 +24,6 @@ encoder-attention-heads: 4
decoder-embed-dim: 256 decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4 decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
...@@ -37,3 +37,6 @@ encoder-attention-heads: 4 ...@@ -37,3 +37,6 @@ encoder-attention-heads: 4
decoder-embed-dim: 256 decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4 decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
...@@ -37,3 +37,6 @@ encoder-attention-heads: 4 ...@@ -37,3 +37,6 @@ encoder-attention-heads: 4
decoder-embed-dim: 256 decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4 decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
...@@ -37,3 +37,6 @@ encoder-attention-heads: 4 ...@@ -37,3 +37,6 @@ encoder-attention-heads: 4
decoder-embed-dim: 256 decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4 decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
#arch: pdss2t_transformer_s arch: pdss2t_transformer_m_8
#arch: s2t_transformer_s
arch: s2t_sate encoder-embed-dim: 512
encoder-embed-dim: 256
pds-stages: 4 pds-stages: 4
#pds-dropout: 0 ctc-layer: 12
pds-layers: 2_2_6_2 pds-layers: 3_3_3_3
pds-ratios: 2_2_2_2 pds-ratios: 2_2_1_2
pds-fusion: True pds-fusion: True
pds-fusion-method: all_conv pds-fusion-method: all_conv
pds-embed-dims: 256_256_256_256 pds-embed-dims: 512_512_512_512
pds-ds-method: conv pds-ds-method: conv
pds-embed-norm: True pds-embed-norm: True
pds-position-embed: 1_1_1_1 pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5 pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8 pds-ffn-ratios: 4_4_4_4
pds-attn-heads: 4_4_4_4 pds-attn-heads: 8_8_8_8
cl-dropout: True
cl-dropout-epoch: 50
train-subset: train-clean-100
valid-subset: dev-clean
max-epoch: 100
max-update: 300000
num-workers: 8
patience: 20
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
...@@ -45,19 +25,18 @@ lr: 2e-3 ...@@ -45,19 +25,18 @@ lr: 2e-3
#adam_betas: (0.9,0.98) #adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc criterion: label_smoothed_cross_entropy_with_ctc
ctc-weight: 0.3
label_smoothing: 0.1 label_smoothing: 0.1
conv-channels: 1024
dropout: 0.1 dropout: 0.1
activation-fn: relu activation-fn: relu
encoder-ffn-embed-dim: 2048 encoder-ffn-embed-dim: 2048
encoder-layers: 12 encoder-layers: 12
decoder-layers: 6 decoder-layers: 6
encoder-attention-heads: 4 encoder-attention-heads: 8
decoder-embed-dim: 256 decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4 decoder-attention-heads: 8
attention-dropout: 0.1
activation-dropout: 0.1 #load-pretrained-encoder-from:
#load-pretrained-decoder-from:
...@@ -44,7 +44,7 @@ use_raw_audio=0 ...@@ -44,7 +44,7 @@ use_raw_audio=0
use_specific_dict=0 use_specific_dict=0
specific_prefix=st specific_prefix=st
specific_dir=${root_dir}/data/mustc/st/en-de specific_dir=${root_dir}/data/mustc/st
asr_vocab_prefix=spm_unigram10000_st_share asr_vocab_prefix=spm_unigram10000_st_share
org_data_dir=${root_dir}/data/${dataset} org_data_dir=${root_dir}/data/${dataset}
...@@ -111,7 +111,7 @@ if [[ -z ${exp_name} ]]; then ...@@ -111,7 +111,7 @@ if [[ -z ${exp_name} ]]; then
exp_name=${exp_name}_${extra_tag} exp_name=${exp_name}_${extra_tag}
fi fi
fi fi
model_dir=$code_dir/../checkpoints/$dataset/asr/${exp_name} model_dir=${root_dir}/checkpoints/${dataset}/asr/${exp_name}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download" echo "stage -1: Data Download"
...@@ -125,11 +125,12 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then ...@@ -125,11 +125,12 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
if [[ ! -e ${data_dir} ]]; then if [[ ! -e ${data_dir} ]]; then
mkdir -p ${data_dir} mkdir -p ${data_dir}
fi fi
if [[ ! -f ${data_dir}/fbank80.zip && -f ${data_dir}/../fbank80.zip ]]; then feature_zip=fbank80.zip
ln -s ${data_dir}/../fbank80.zip ${data_dir} if [[ ${speed_perturb} -eq 1 ]]; then
feature_zip=fbank80_sp.zip
fi fi
if [[ ! -f ${data_dir}/fbank80_sp.zip && -f ${data_dir}/../fbank80_sp.zip ]]; then if [[ ! -f ${data_dir}/${feature_zip} && -f ${data_dir}/../feature_zip ]]; then
ln -s ${data_dir}/../fbank80_sp.zip ${data_dir} ln -s ${data_dir}/../feature_zip ${data_dir}
fi fi
cmd="python ${code_dir}/examples/speech_to_text/prep_audio_data.py cmd="python ${code_dir}/examples/speech_to_text/prep_audio_data.py
...@@ -167,13 +168,9 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then ...@@ -167,13 +168,9 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
echo -e "\033[34mRun command: \n${cmd} \033[0m" echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd} [[ $eval -eq 1 ]] && eval ${cmd}
if [[ ! -f ${data_dir}/../fbank80.zip ]]; then if [[ -f ${data_dir}/${feature_zip} && ! -f ${data_dir}/../${feature_zip} ]]; then
mv ${data_dir}/fbank80.zip ${data_dir}/.. mv ${data_dir}/${feature_zip} ${data_dir}/..
ln -s ${data_dir}/../fbank80.zip ${data_dir} ln -s ${data_dir}/../${feature_zip} ${data_dir}
fi
if [[ ! -f ${data_dir}/../fbank80_sp.zip ]]; then
mv ${data_dir}/fbank80_sp.zip ${data_dir}/..
ln -s ${data_dir}/../fbank80_sp.zip ${data_dir}
fi fi
fi fi
......
arch: transformer arch: transformer
share-decoder-input-output-embed: True share-all-embeddings: True
optimizer: adam optimizer: adam
clip-norm: 10.0 clip-norm: 10.0
lr-scheduler: inverse_sqrt lr-scheduler: inverse_sqrt
...@@ -28,5 +28,5 @@ decoder-embed-dim: 512 ...@@ -28,5 +28,5 @@ decoder-embed-dim: 512
decoder-ffn-embed-dim: 1024 decoder-ffn-embed-dim: 1024
decoder-attention-heads: 4 decoder-attention-heads: 4
load-pretrained-encoder-from: #load-pretrained-encoder-from:
load-pretrained-decoder-from: #load-pretrained-decoder-from:
\ No newline at end of file \ No newline at end of file
arch: transformer_iwslt_de_en arch: transformer_iwslt_de_en
share-decoder-input-output-embed: True share-all-embeddings: True
optimizer: adam optimizer: adam
#clip-norm: 10.0 #clip-norm: 10.0
lr-scheduler: inverse_sqrt lr-scheduler: inverse_sqrt
...@@ -27,5 +27,5 @@ decoder-embed-dim: 512 ...@@ -27,5 +27,5 @@ decoder-embed-dim: 512
decoder-ffn-embed-dim: 1024 decoder-ffn-embed-dim: 1024
decoder-attention-heads: 4 decoder-attention-heads: 4
load-pretrained-encoder-from: #load-pretrained-encoder-from:
load-pretrained-decoder-from: #load-pretrained-decoder-from:
\ No newline at end of file \ No newline at end of file
#! /bin/bash #! /bin/bash
# Processing MuST-C Datasets # Processing IWSLT2016 De-En Datasets
# Copyright 2021 Natural Language Processing Laboratory # Copyright 2021 Natural Language Processing Laboratory
# Xu Chen (xuchenneu@163.com) # Xu Chen (xuchenneu@163.com)
...@@ -43,7 +43,7 @@ tokenizer=1 ...@@ -43,7 +43,7 @@ tokenizer=1
use_specific_dict=0 use_specific_dict=0
specific_prefix=st specific_prefix=st
specific_dir=${root_dir}/data/mustc/st/en-de/ specific_dir=${root_dir}/data/mustc/st
src_vocab_prefix=spm_unigram10000_st_share src_vocab_prefix=spm_unigram10000_st_share
tgt_vocab_prefix=spm_unigram10000_st_share tgt_vocab_prefix=spm_unigram10000_st_share
...@@ -78,7 +78,7 @@ beam_size=5 ...@@ -78,7 +78,7 @@ beam_size=5
len_penalty=1.0 len_penalty=1.0
if [[ ${use_specific_dict} -eq 1 ]]; then if [[ ${use_specific_dict} -eq 1 ]]; then
exp_prefix=${specific_prefix}_${exp_prefix} exp_prefix=${exp_prefix}_${specific_prefix}
data_dir=${data_dir}/${specific_prefix} data_dir=${data_dir}/${specific_prefix}
mkdir -p ${data_dir} mkdir -p ${data_dir}
else else
...@@ -119,7 +119,7 @@ if [[ -z ${exp_name} ]]; then ...@@ -119,7 +119,7 @@ if [[ -z ${exp_name} ]]; then
exp_name=${exp_name}_${extra_tag} exp_name=${exp_name}_${extra_tag}
fi fi
fi fi
model_dir=$code_dir/../checkpoints/$dataset/mt/${exp_name} model_dir=${root_dir}/checkpoints/${dataset}/mt/${exp_name}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download" echo "stage -1: Data Download"
...@@ -332,12 +332,14 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then ...@@ -332,12 +332,14 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# Average models # Average models
dec_model=avg_${n_average}_checkpoint.pt dec_model=avg_${n_average}_checkpoint.pt
cmd="python ${code_dir}/scripts/average_checkpoints.py if [[ ! -f ${model_dir}/${dec_model} ]]; then
--inputs ${model_dir} cmd="python ${code_dir}/scripts/average_checkpoints.py
--num-best-checkpoints ${n_average} --inputs ${model_dir}
--output ${model_dir}/${dec_model}" --num-best-checkpoints ${n_average}
echo -e "\033[34mRun command: \n${cmd} \033[0m" --output ${model_dir}/${dec_model}"
[[ $eval -eq 1 ]] && eval $cmd echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval $cmd
fi
else else
dec_model=${dec_model} dec_model=${dec_model}
fi fi
......
arch: s2t_sate arch: multi_ctc_s2t_transformer_s
multi-ctc-layers: 6,8,10,12
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
clip-norm: 10.0 clip-norm: 10.0
...@@ -12,47 +13,18 @@ ctc-weight: 0.3 ...@@ -12,47 +13,18 @@ ctc-weight: 0.3
criterion: label_smoothed_cross_entropy_with_ctc criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1 label_smoothing: 0.1
encoder-normalize-before: True
decoder-normalize-before: True
conv-kernel-sizes: 5,5 conv-kernel-sizes: 5,5
conv-channels: 1024 conv-channels: 1024
dropout: 0.1 dropout: 0.1
activation-fn: relu activation-fn: relu
encoder-embed-dim: 256 encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048 encoder-ffn-embed-dim: 2048
encoder-layers: 2 encoder-layers: 12
text-encoder-layers: 2 decoder-layers: 6
decoder-layers: 2
encoder-attention-heads: 4 encoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
#macaron-style: True
#use-cnn-module: True
#cnn-module-kernel: 31
#acoustic-encoder: pds
acoustic-encoder: transformer
adapter: shrink
encoder-embed-dim: 256
pds-stages: 4
#pds-dropout: 0
pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
decoder-embed-dim: 256 decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4 decoder-attention-heads: 4
\ No newline at end of file attention-dropout: 0.1
activation-dropout: 0.1
...@@ -106,7 +106,7 @@ if [[ -z ${exp_name} ]]; then ...@@ -106,7 +106,7 @@ if [[ -z ${exp_name} ]]; then
exp_name=${exp_name}_${extra_tag} exp_name=${exp_name}_${extra_tag}
fi fi
fi fi
model_dir=${code_dir}/../checkpoints/${dataset}/asr/${exp_name} model_dir=${root_dir}/checkpoints/${dataset}/asr/${exp_name}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download" echo "stage -1: Data Download"
......
...@@ -78,7 +78,7 @@ beam_size=5 ...@@ -78,7 +78,7 @@ beam_size=5
len_penalty=1.0 len_penalty=1.0
if [[ ${use_specific_dict} -eq 1 ]]; then if [[ ${use_specific_dict} -eq 1 ]]; then
exp_prefix=${specific_prefix}_${exp_prefix} exp_prefix=${exp_prefix}_${specific_prefix}
data_dir=${data_dir}/${specific_prefix} data_dir=${data_dir}/${specific_prefix}
mkdir -p ${data_dir} mkdir -p ${data_dir}
else else
...@@ -113,7 +113,7 @@ if [[ -z ${exp_name} ]]; then ...@@ -113,7 +113,7 @@ if [[ -z ${exp_name} ]]; then
exp_name=${exp_name}_${extra_tag} exp_name=${exp_name}_${extra_tag}
fi fi
fi fi
model_dir=$root_dir/../checkpoints/$dataset/mt/${exp_name} model_dir=${root_dir}/../checkpoints/${dataset}/mt/${exp_name}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download" echo "stage -1: Data Download"
......
...@@ -115,7 +115,7 @@ if [[ -z ${exp_name} ]]; then ...@@ -115,7 +115,7 @@ if [[ -z ${exp_name} ]]; then
exp_name=${exp_name}_${extra_tag} exp_name=${exp_name}_${extra_tag}
fi fi
fi fi
model_dir=$code_dir/../checkpoints/$dataset/st/${exp_name} model_dir=${root_dir}/checkpoints/${dataset}/st/${exp_name}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download" echo "stage -1: Data Download"
......
ctc-weight: 0.3 ctc-weight: 0.3
post-process: sentencepiece
\ No newline at end of file
arch: pdss2t_transformer_s_8 arch: pdss2t_transformer_s_8
#arch: pdss2t_transformer_s_16
#arch: pdss2t_transformer_s_32
pds-fusion: True
ctc-layer: 12
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
clip-norm: 10.0 clip-norm: 10.0
......
...@@ -2,7 +2,7 @@ arch: pdss2t_transformer_s_16 ...@@ -2,7 +2,7 @@ arch: pdss2t_transformer_s_16
encoder-embed-dim: 256 encoder-embed-dim: 256
pds-stages: 4 pds-stages: 4
#pds-dropout: 0 ctc-layer: 12
pds-layers: 2_2_6_2 pds-layers: 2_2_6_2
pds-ratios: 2_2_2_2 pds-ratios: 2_2_2_2
pds-fusion: True pds-fusion: True
......
...@@ -2,7 +2,7 @@ arch: pdss2t_transformer_s_32 ...@@ -2,7 +2,7 @@ arch: pdss2t_transformer_s_32
encoder-embed-dim: 256 encoder-embed-dim: 256
pds-stages: 5 pds-stages: 5
#pds-dropout: 0 ctc-layer: 12
pds-layers: 2_2_3_3_2 pds-layers: 2_2_3_3_2
pds-ratios: 2_2_2_2_2 pds-ratios: 2_2_2_2_2
pds-fusion: True pds-fusion: True
......
...@@ -2,7 +2,7 @@ arch: pdss2t_transformer_s_8 ...@@ -2,7 +2,7 @@ arch: pdss2t_transformer_s_8
encoder-embed-dim: 256 encoder-embed-dim: 256
pds-stages: 4 pds-stages: 4
#pds-dropout: 0 ctc-layer: 12
pds-layers: 3_3_3_3 pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2 pds-ratios: 2_2_1_2
pds-fusion: True pds-fusion: True
......
...@@ -2,6 +2,9 @@ arch: pdss2t_transformer_m_8 ...@@ -2,6 +2,9 @@ arch: pdss2t_transformer_m_8
#arch: pdss2t_transformer_m_16 #arch: pdss2t_transformer_m_16
#arch: pdss2t_transformer_m_32 #arch: pdss2t_transformer_m_32
pds-fusion: True
ctc-layer: 12
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
clip-norm: 10.0 clip-norm: 10.0
...@@ -14,7 +17,7 @@ lr: 2e-3 ...@@ -14,7 +17,7 @@ lr: 2e-3
criterion: label_smoothed_cross_entropy_with_ctc criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1 label_smoothing: 0.1
dropout: 0.1 dropout: 0.15
activation-fn: relu activation-fn: relu
encoder-ffn-embed-dim: 2048 encoder-ffn-embed-dim: 2048
encoder-layers: 12 encoder-layers: 12
......
...@@ -27,7 +27,7 @@ lr: 2e-3 ...@@ -27,7 +27,7 @@ lr: 2e-3
criterion: label_smoothed_cross_entropy_with_ctc criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1 label_smoothing: 0.1
dropout: 0.1 dropout: 0.15
activation-fn: relu activation-fn: relu
encoder-ffn-embed-dim: 2048 encoder-ffn-embed-dim: 2048
encoder-layers: 12 encoder-layers: 12
......
...@@ -27,7 +27,7 @@ lr: 2e-3 ...@@ -27,7 +27,7 @@ lr: 2e-3
criterion: label_smoothed_cross_entropy_with_ctc criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1 label_smoothing: 0.1
dropout: 0.1 dropout: 0.15
activation-fn: relu activation-fn: relu
encoder-ffn-embed-dim: 2048 encoder-ffn-embed-dim: 2048
encoder-layers: 12 encoder-layers: 12
......
...@@ -2,7 +2,6 @@ arch: pdss2t_transformer_m_8 ...@@ -2,7 +2,6 @@ arch: pdss2t_transformer_m_8
encoder-embed-dim: 512 encoder-embed-dim: 512
pds-stages: 4 pds-stages: 4
#pds-dropout: 0
pds-layers: 3_3_3_3 pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2 pds-ratios: 2_2_1_2
pds-fusion: True pds-fusion: True
...@@ -27,7 +26,7 @@ lr: 2e-3 ...@@ -27,7 +26,7 @@ lr: 2e-3
criterion: label_smoothed_cross_entropy_with_ctc criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1 label_smoothing: 0.1
dropout: 0.1 dropout: 0.15
activation-fn: relu activation-fn: relu
encoder-ffn-embed-dim: 2048 encoder-ffn-embed-dim: 2048
encoder-layers: 12 encoder-layers: 12
......
...@@ -2,6 +2,9 @@ arch: pdss2t_transformer_sd_8 ...@@ -2,6 +2,9 @@ arch: pdss2t_transformer_sd_8
#arch: pdss2t_transformer_sd_16 #arch: pdss2t_transformer_sd_16
#arch: pdss2t_transformer_sd_32 #arch: pdss2t_transformer_sd_32
pds-fusion: True
ctc-layer: 12
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
clip-norm: 10.0 clip-norm: 10.0
......
...@@ -2,7 +2,7 @@ arch: pdss2t_transformer_sd_16 ...@@ -2,7 +2,7 @@ arch: pdss2t_transformer_sd_16
encoder-embed-dim: 256 encoder-embed-dim: 256
pds-stages: 4 pds-stages: 4
#pds-dropout: 0 ctc-layer: 12
pds-layers: 5_5_12_8 pds-layers: 5_5_12_8
pds-ratios: 2_2_2_2 pds-ratios: 2_2_2_2
pds-fusion: True pds-fusion: True
......
...@@ -2,7 +2,7 @@ arch: pdss2t_transformer_sd_32 ...@@ -2,7 +2,7 @@ arch: pdss2t_transformer_sd_32
encoder-embed-dim: 256 encoder-embed-dim: 256
pds-stages: 5 pds-stages: 5
#pds-dropout: 0 ctc-layer: 12
pds-layers: 5_5_7_7_6 pds-layers: 5_5_7_7_6
pds-ratios: 2_2_2_2_2 pds-ratios: 2_2_2_2_2
pds-fusion: True pds-fusion: True
......
...@@ -2,7 +2,7 @@ arch: pdss2t_transformer_sd_8 ...@@ -2,7 +2,7 @@ arch: pdss2t_transformer_sd_8
encoder-embed-dim: 256 encoder-embed-dim: 256
pds-stages: 4 pds-stages: 4
#pds-dropout: 0 ctc-layer: 12
pds-layers: 7_7_7_9 pds-layers: 7_7_7_9
pds-ratios: 2_2_1_2 pds-ratios: 2_2_1_2
pds-fusion: True pds-fusion: True
......
...@@ -44,7 +44,7 @@ specific_dir=/home/xuchen/st/data/mustc/st_lcrm/en-de ...@@ -44,7 +44,7 @@ specific_dir=/home/xuchen/st/data/mustc/st_lcrm/en-de
asr_vocab_prefix=spm_unigram10000_st_share asr_vocab_prefix=spm_unigram10000_st_share
org_data_dir=${root_dir}/data/${dataset} org_data_dir=${root_dir}/data/${dataset}
data_dir=${root_dir}/data/${dataset} data_dir=${root_dir}/data/${dataset}/asr
test_subset=dev-clean,dev-other,test-clean,test-other test_subset=dev-clean,dev-other,test-clean,test-other
# exp # exp
...@@ -87,7 +87,7 @@ if [[ -z ${exp_name} ]]; then ...@@ -87,7 +87,7 @@ if [[ -z ${exp_name} ]]; then
exp_name=${exp_name}_${extra_tag} exp_name=${exp_name}_${extra_tag}
fi fi
fi fi
model_dir=$code_dir/../checkpoints/$dataset/asr/${exp_name} model_dir=${root_dir}/checkpoints/${dataset}/asr/${exp_name}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download" echo "stage -1: Data Download"
...@@ -110,7 +110,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then ...@@ -110,7 +110,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--vocab-size ${vocab_size}" --vocab-size ${vocab_size}"
if [[ ${use_specific_dict} -eq 1 ]]; then if [[ ${use_specific_dict} -eq 1 ]]; then
cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}/${lang} cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}
cmd="$cmd cmd="$cmd
--asr-prefix ${asr_vocab_prefix}" --asr-prefix ${asr_vocab_prefix}"
fi fi
......
...@@ -26,3 +26,6 @@ decoder-ffn-embed-dim: 2048 ...@@ -26,3 +26,6 @@ decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4 decoder-attention-heads: 4
attention-dropout: 0.1 attention-dropout: 0.1
activation-dropout: 0.1 activation-dropout: 0.1
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: s2t_transformer_m
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
#adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
conv-kernel-sizes: 5,5
conv-channels: 1024
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
...@@ -24,3 +24,6 @@ encoder-attention-heads: 4 ...@@ -24,3 +24,6 @@ encoder-attention-heads: 4
decoder-embed-dim: 256 decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4 decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
...@@ -37,3 +37,6 @@ encoder-attention-heads: 4 ...@@ -37,3 +37,6 @@ encoder-attention-heads: 4
decoder-embed-dim: 256 decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4 decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
...@@ -37,3 +37,6 @@ encoder-attention-heads: 4 ...@@ -37,3 +37,6 @@ encoder-attention-heads: 4
decoder-embed-dim: 256 decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4 decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
...@@ -37,3 +37,6 @@ encoder-attention-heads: 4 ...@@ -37,3 +37,6 @@ encoder-attention-heads: 4
decoder-embed-dim: 256 decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4 decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: pdss2t_transformer_m_8
encoder-embed-dim: 512
pds-stages: 4
ctc-layer: 12
pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 512_512_512_512
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 4_4_4_4
pds-attn-heads: 8_8_8_8
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
#adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.15
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
...@@ -3,13 +3,14 @@ ...@@ -3,13 +3,14 @@
gpu_num=1 gpu_num=1
data_dir= data_dir=
test_subset=(tst-COMMON) test_subset=(dev tst-COMMON)
exp_name= exp_name=
if [ "$#" -eq 1 ]; then if [ "$#" -eq 1 ]; then
exp_name=$1 exp_name=$1
fi fi
cer=0
n_average=10 n_average=10
beam_size=5 beam_size=5
len_penalty=1.0 len_penalty=1.0
...@@ -22,6 +23,7 @@ cmd="./run.sh ...@@ -22,6 +23,7 @@ cmd="./run.sh
--gpu_num ${gpu_num} --gpu_num ${gpu_num}
--exp_name ${exp_name} --exp_name ${exp_name}
--n_average ${n_average} --n_average ${n_average}
--cer ${cer}
--beam_size ${beam_size} --beam_size ${beam_size}
--len_penalty ${len_penalty} --len_penalty ${len_penalty}
--max_tokens ${max_tokens} --max_tokens ${max_tokens}
......
...@@ -71,6 +71,7 @@ max_tokens=40000 ...@@ -71,6 +71,7 @@ max_tokens=40000
step_valid=0 step_valid=0
# decoding setting # decoding setting
cer=0
dec_model=checkpoint_best.pt dec_model=checkpoint_best.pt
n_average=10 n_average=10
beam_size=5 beam_size=5
...@@ -106,7 +107,7 @@ if [[ -z ${exp_name} ]]; then ...@@ -106,7 +107,7 @@ if [[ -z ${exp_name} ]]; then
exp_name=${exp_name}_${extra_tag} exp_name=${exp_name}_${extra_tag}
fi fi
fi fi
model_dir=${code_dir}/../checkpoints/${dataset}/asr/${exp_name} model_dir=${root_dir}/checkpoints/${dataset}/asr/${exp_name}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download" echo "stage -1: Data Download"
...@@ -339,6 +340,12 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then ...@@ -339,6 +340,12 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--wer-lowercase --wer-lowercase
--wer-remove-punct --wer-remove-punct
" "
if [[ ${cer} -eq 1 ]]; then
cmd="${cmd}
--wer-char-level"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m" echo -e "\033[34mRun command: \n${cmd} \033[0m"
if [[ $eval -eq 1 ]]; then if [[ $eval -eq 1 ]]; then
...@@ -346,5 +353,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then ...@@ -346,5 +353,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
tail -n 1 ${model_dir}/generate-${subset}.txt >> ${result_file} tail -n 1 ${model_dir}/generate-${subset}.txt >> ${result_file}
fi fi
done done
cat ${result_file} cat ${result_file}
fi fi
arch: transformer arch: transformer
share-decoder-input-output-embed: True share-all-embeddings: True
optimizer: adam optimizer: adam
clip-norm: 10.0 clip-norm: 10.0
lr-scheduler: inverse_sqrt lr-scheduler: inverse_sqrt
...@@ -27,3 +27,6 @@ encoder-attention-heads: 8 ...@@ -27,3 +27,6 @@ encoder-attention-heads: 8
decoder-embed-dim: 512 decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8 decoder-attention-heads: 8
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: transformer arch: transformer
share-decoder-input-output-embed: True share-all-embeddings: True
optimizer: adam optimizer: adam
clip-norm: 10.0 clip-norm: 10.0
lr-scheduler: inverse_sqrt lr-scheduler: inverse_sqrt
...@@ -27,3 +27,6 @@ encoder-attention-heads: 4 ...@@ -27,3 +27,6 @@ encoder-attention-heads: 4
decoder-embed-dim: 256 decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4 decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
...@@ -41,7 +41,7 @@ share_dict=1 ...@@ -41,7 +41,7 @@ share_dict=1
lcrm=0 lcrm=0
tokenizer=0 tokenizer=0
use_specific_dict=0 use_specific_dict=1
specific_prefix=st specific_prefix=st
specific_dir=${root_dir}/data/mustc/st specific_dir=${root_dir}/data/mustc/st
src_vocab_prefix=spm_unigram10000_st_share src_vocab_prefix=spm_unigram10000_st_share
...@@ -78,17 +78,23 @@ beam_size=5 ...@@ -78,17 +78,23 @@ beam_size=5
len_penalty=1.0 len_penalty=1.0
if [[ ${use_specific_dict} -eq 1 ]]; then if [[ ${use_specific_dict} -eq 1 ]]; then
exp_prefix=${specific_prefix}_${exp_prefix} exp_prefix=${exp_prefix}_${specific_prefix}
data_dir=${data_dir}/${specific_prefix} data_dir=${data_dir}/${specific_prefix}
mkdir -p ${data_dir} mkdir -p ${data_dir}
else else
data_dir=${data_dir}/${vocab_type}${vocab_size} if [[ "${vocab_type}" == "char" ]]; then
src_vocab_prefix=spm_${vocab_type}${vocab_size}_${src_lang} vocab_name=${vocab_type}
tgt_vocab_prefix=spm_${vocab_type}${vocab_size}_${tgt_lang} exp_prefix=${exp_prefix}_${vocab_type}
else
vocab_name=${vocab_type}${vocab_size}
fi
data_dir=${data_dir}/${vocab_name}
src_vocab_prefix=spm_${vocab_name}_${src_lang}
tgt_vocab_prefix=spm_${vocab_name}_${tgt_lang}
if [[ $share_dict -eq 1 ]]; then if [[ $share_dict -eq 1 ]]; then
data_dir=${data_dir}_share data_dir=${data_dir}_share
src_vocab_prefix=spm_${vocab_type}${vocab_size}_share src_vocab_prefix=spm_${vocab_name}_share
tgt_vocab_prefix=spm_${vocab_type}${vocab_size}_share tgt_vocab_prefix=spm_${vocab_name}_share
fi fi
fi fi
if [[ ${lcrm} -eq 1 ]]; then if [[ ${lcrm} -eq 1 ]]; then
...@@ -113,7 +119,7 @@ if [[ -z ${exp_name} ]]; then ...@@ -113,7 +119,7 @@ if [[ -z ${exp_name} ]]; then
exp_name=${exp_name}_${extra_tag} exp_name=${exp_name}_${extra_tag}
fi fi
fi fi
model_dir=$root_dir/../checkpoints/$dataset/mt/${exp_name} model_dir=${root_dir}/checkpoints/${dataset}/mt/${exp_name}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download" echo "stage -1: Data Download"
...@@ -152,7 +158,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then ...@@ -152,7 +158,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
mkdir -p ${data_dir}/data mkdir -p ${data_dir}/data
for split in ${train_subset} ${valid_subset} ${trans_subset}; do for split in ${train_subset} ${valid_subset} ${trans_subset}; do
{ {
txt_dir=${org_data_dir}/data/${split}/txt if [[ -d ${org_data_dir}/data/${split}/txt ]]; then
txt_dir=${org_data_dir}/data/${split}/txt
else
txt_dir=${org_data_dir}/data/${split}
fi
cmd="cat ${txt_dir}/${split}.${src_lang}" cmd="cat ${txt_dir}/${split}.${src_lang}"
if [[ ${lcrm} -eq 1 ]]; then if [[ ${lcrm} -eq 1 ]]; then
cmd="python local/lower_rm.py ${org_data_dir}/data/${split}.${src_lang}" cmd="python local/lower_rm.py ${org_data_dir}/data/${split}.${src_lang}"
...@@ -264,13 +274,9 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ...@@ -264,13 +274,9 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
if [[ $step_valid -eq 1 ]]; then if [[ $step_valid -eq 1 ]]; then
validate_interval=1 validate_interval=1
save_interval=1 save_interval=1
keep_last_epochs=10
no_epoch_checkpoints=0 no_epoch_checkpoints=0
save_interval_updates=500 save_interval_updates=500
keep_interval_updates=10 keep_interval_updates=10
else
validate_interval=1
keep_last_epochs=10
fi fi
if [[ $bleu_valid -eq 1 ]]; then if [[ $bleu_valid -eq 1 ]]; then
cmd="$cmd cmd="$cmd
...@@ -293,10 +299,6 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ...@@ -293,10 +299,6 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
cmd="${cmd} cmd="${cmd}
--save-interval $save_interval " --save-interval $save_interval "
fi fi
if [[ -n $keep_last_epochs ]]; then
cmd="${cmd}
--keep-last-epochs $keep_last_epochs "
fi
if [[ -n $save_interval_updates ]]; then if [[ -n $save_interval_updates ]]; then
cmd="${cmd} cmd="${cmd}
--save-interval-updates $save_interval_updates" --save-interval-updates $save_interval_updates"
...@@ -374,7 +376,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then ...@@ -374,7 +376,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
cmd="${cmd} cmd="${cmd}
--scoring sacrebleu" --scoring sacrebleu"
if [[ ${tokenizer} -eq 1 ]]; then if [[ ${tokenizer} -eq 1 ]]; then
cmd="${cmd} cmd="${cmd}
--tokenizer moses --tokenizer moses
--moses-source-lang ${src_lang} --moses-source-lang ${src_lang}
--moses-target-lang ${tgt_lang}" --moses-target-lang ${tgt_lang}"
......
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
#adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
encoder-normalize-before: True
decoder-normalize-before: True
conv-kernel-sizes: 5,5
conv-channels: 1024
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
acoustic-encoder: transformer
adapter: league
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
#adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
encoder-normalize-before: True
decoder-normalize-before: True
conv-kernel-sizes: 5,5
conv-channels: 1024
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
acoustic-encoder: transformer
adapter: league
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
#adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
encoder-normalize-before: True
decoder-normalize-before: True
conv-kernel-sizes: 5,5
conv-channels: 1024
dropout: 0.15
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
acoustic-encoder: pds
adapter: league
encoder-embed-dim: 512
ctc-layer: 12
pds-stages: 4
pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 512_512_512_512
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 4_4_4_4
pds-attn-heads: 8_8_8_8
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
...@@ -8,7 +8,6 @@ warmup-updates: 10000 ...@@ -8,7 +8,6 @@ warmup-updates: 10000
lr: 2e-3 lr: 2e-3
#adam_betas: (0.9,0.98) #adam_betas: (0.9,0.98)
ctc-weight: 0.3
criterion: label_smoothed_cross_entropy_with_ctc criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1 label_smoothing: 0.1
...@@ -18,29 +17,22 @@ conv-kernel-sizes: 5,5 ...@@ -18,29 +17,22 @@ conv-kernel-sizes: 5,5
conv-channels: 1024 conv-channels: 1024
dropout: 0.1 dropout: 0.1
activation-fn: relu activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048 encoder-ffn-embed-dim: 2048
encoder-layers: 12 encoder-layers: 12
text-encoder-layers: 6 text-encoder-layers: 6
decoder-layers: 6 decoder-layers: 6
encoder-attention-heads: 4 encoder-attention-heads: 4
#load-pretrained-encoder-from: decoder-embed-dim: 256
#load-pretrained-acoustic-encoder-from: decoder-ffn-embed-dim: 2048
#load-pretrained-text-encoder-from: decoder-attention-heads: 4
#load-pretrained-decoder-from:
#macaron-style: True
#use-cnn-module: True
#cnn-module-kernel: 31
#acoustic-encoder: pds acoustic-encoder: pds
acoustic-encoder: transformer
adapter: league adapter: league
encoder-embed-dim: 256 encoder-embed-dim: 256
ctc-layer: 12
pds-stages: 4 pds-stages: 4
#pds-dropout: 0
pds-layers: 3_3_3_3 pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2 pds-ratios: 2_2_1_2
pds-fusion: True pds-fusion: True
...@@ -53,6 +45,7 @@ pds-kernel-sizes: 5_5_5_5 ...@@ -53,6 +45,7 @@ pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8 pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4 pds-attn-heads: 4_4_4_4
decoder-embed-dim: 256 #load-pretrained-encoder-from:
decoder-ffn-embed-dim: 2048 #load-pretrained-acoustic-encoder-from:
decoder-attention-heads: 4 #load-pretrained-text-encoder-from:
\ No newline at end of file #load-pretrained-decoder-from:
\ No newline at end of file
...@@ -115,7 +115,7 @@ if [[ -z ${exp_name} ]]; then ...@@ -115,7 +115,7 @@ if [[ -z ${exp_name} ]]; then
exp_name=${exp_name}_${extra_tag} exp_name=${exp_name}_${extra_tag}
fi fi
fi fi
model_dir=$code_dir/../checkpoints/$dataset/st/${exp_name} model_dir=${root_dir}/checkpoints/${dataset}/st/${exp_name}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download" echo "stage -1: Data Download"
......
max-epoch: 100
max-update: 400000
best-checkpoint-metric: loss
maximize-best-checkpoint-metric: False
save-interval: 1
no-epoch-checkpoints: True
num-workers: 8
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
arch: wav2vec
min-lr: 1e-06
stop-min-lr: 1e-09
optimizer: adam
lr: 0.005
lr-scheduler: cosine
warmup-updates: 1000
warmup-init-lr: 1e-07
criterion: wav2vec
num-negatives: 10
cross-sample-negatives: 0
max-sample-size: 150000
max-tokens: 300000
update-freq: 1
conv-feature-layers: (512, 10, 5), (512, 8, 4), (512, 4, 2), (512, 4, 2), (512, 4, 2), (512, 1, 1), (512, 1, 1)
conv-aggregator-layers: (512, 2, 1), (512, 3, 1), (512, 4, 1), (512, 5, 1), (512, 6, 1), (512, 7, 1), (512, 8, 1), (512, 9, 1), (512, 10, 1), (512, 11, 1), (512, 12, 1), (512, 13, 1)
skip-connections-agg: True
residual-scale: 0.5
log-compression: True
activation: gelu
offset: auto
log-keys: "prob_perplexity","code_perplexity","temp"
vq-type: gumbel
#vq-type: kmeans
#loss-weights: 1
vq-groups: 2
vq-depth: 2
combine-groups: True
vq-vars: 320
vq-temp: (2,0.5,0.999995)
prediction-steps: 12
arch: wav2vec
min-lr: 1e-06
stop-min-lr: 1e-09
optimizer: adam
lr: 0.005
lr-scheduler: cosine
warmup-updates: 500
warmup-init-lr: 1e-07
criterion: wav2vec
num-negatives: 10
conv-feature-layers: (512, 10, 5), (512, 8, 4), (512, 4, 2), (512, 4, 2), (512, 4, 2), (512, 1, 1), (512, 1, 1)
conv-aggregator-layers: (512, 2, 1), (512, 3, 1), (512, 4, 1), (512, 5, 1), (512, 6, 1), (512, 7, 1), (512, 8, 1), (512, 9, 1), (512, 10, 1), (512, 11, 1), (512, 12, 1), (512, 13, 1)
skip-connections-agg: True
residual-scale: 0.5
log-compression: True
max-sample-size: 150000
max-tokens: 1500000
\ No newline at end of file
# @package _group_
common:
fp16: true
log_format: json
log_interval: 200
checkpoint:
save_interval_updates: 25000
keep_interval_updates: 1
no_epoch_checkpoints: true
task:
_name: audio_pretraining
data: ???
max_sample_size: 250000
min_sample_size: 32000
normalize: false
dataset:
num_workers: 6
max_tokens: 1400000
skip_invalid_size_inputs_valid_test: true
distributed_training:
distributed_world_size: 64
ddp_backend: legacy_ddp
criterion:
_name: wav2vec
infonce: true
log_keys: ["prob_perplexity","code_perplexity","temp"]
loss_weights: [0.1, 10]
optimization:
max_update: 400000
lr: [0.0005]
optimizer:
_name: adam
adam_betas: (0.9,0.98)
adam_eps: 1e-06
weight_decay: 0.01
lr_scheduler:
_name: polynomial_decay
warmup_updates: 32000
model:
_name: wav2vec2
quantize_targets: true
final_dim: 256
encoder_layerdrop: 0.05
dropout_input: 0.1
dropout_features: 0.1
feature_grad_mult: 0.1
encoder_embed_dim: 768
#! /bin/bash

# Decode a trained ASR model: forwards to run.sh stage 2, which averages
# checkpoints and runs fairseq generate on the subsets listed below.

gpu_num=1

data_dir=
test_subset=(dev-clean dev-other test-clean test-other)

# Optional single positional argument: name of the experiment to decode.
exp_name=
if [ "$#" -eq 1 ]; then
exp_name=$1
fi

n_average=10          # number of best checkpoints to average before decoding
beam_size=5
len_penalty=1.0
max_tokens=80000
dec_model=checkpoint_best.pt

cmd="./run.sh
--stage 2
--stop_stage 2
--gpu_num ${gpu_num}
--exp_name ${exp_name}
--n_average ${n_average}
--beam_size ${beam_size}
--len_penalty ${len_penalty}
--max_tokens ${max_tokens}
--dec_model ${dec_model}
"

# Only pass optional arguments through when they are actually set.
if [[ -n ${data_dir} ]]; then
cmd="$cmd --data_dir ${data_dir}"
fi
if [[ ${#test_subset[@]} -ne 0 ]]; then
subsets=$(echo ${test_subset[*]} | sed 's/ /,/g')
cmd="$cmd --test_subset ${subsets}"
fi

echo $cmd
eval $cmd
# Wait until ${gpu_num} GPUs are (almost) idle, then launch training once.
#
# Polls `gpustat` every minute; a GPU counts as free when the used-memory
# field (MiB) is below 100.  When enough free devices are found, ${cmd}
# is executed and the script exits.

gpu_num=4
cmd="sh train.sh"

while :
do
    record=$(mktemp -t temp.record.XXXXXX)
    gpustat > $record
    # gpustat prints a header line; GPU i is on line i+2 of the report.
    all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");

    device=()   # reset each poll so stale hits from a previous round cannot leak in
    count=0
    for dev in ${all_devices[@]}
    do
        line=$((dev + 2))
        use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)

        if [[ $use -lt 100 ]]; then
            device[$count]=$dev
            count=$((count + 1))
            if [[ $count -eq $gpu_num ]]; then
                break
            fi
        fi
    done
    rm -f ${record}   # previously leaked one temp file per iteration

    if [[ ${#device[@]} -lt $gpu_num ]]; then
        sleep 60s
    else
        echo "Run $cmd"
        eval $cmd
        sleep 10s
        exit
    fi
done
#!/usr/bin/env bash

# Copyright 2012 Johns Hopkins University (Author: Daniel Povey);
# Arnab Ghoshal, Karel Vesely

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# Parse command-line options.
# To be sourced by another script (as in ". parse_options.sh").
# Option format is: --option-name arg
# and shell variable "option_name" gets set to value "arg."
# The exception is --help, which takes no arguments, but prints the
# $help_message variable (if defined).

###
### The --config file options have lower priority to command line
### options, so we need to import them first...
###

# Now import all the configs specified by command-line, in left-to-right order
for ((argpos=1; argpos<$#; argpos++)); do
if [ "${!argpos}" == "--config" ]; then
argpos_plus1=$((argpos+1))
config=${!argpos_plus1}
[ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
. $config # source the config file.
fi
done

###
### Now we process the command line options
###
while true; do
[ -z "${1:-}" ] && break; # break if there are no arguments
case "$1" in
# If the enclosing script is called with --help option, print the help
# message and exit. Scripts should put help messages in $help_message
--help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
else printf "$help_message\n" 1>&2 ; fi;
exit 0 ;;
--*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
exit 1 ;;
# If the first command-line argument begins with "--" (e.g. --foo-bar),
# then work out the variable name as $name, which will equal "foo_bar".
--*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
# Next we test whether the variable in question is undefined-- if so it's
# an invalid option and we die. Note: $0 evaluates to the name of the
# enclosing script.
# The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
# is undefined. We then have to wrap this test inside "eval" because
# foo_bar is itself inside a variable ($name).
eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
oldval="`eval echo \\$$name`";
# Work out whether we seem to be expecting a Boolean argument.
if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
was_bool=true;
else
was_bool=false;
fi
# Set the variable to the right value-- the escaped quotes make it work if
# the option had spaces, like --cmd "queue.pl -sync y"
eval $name=\"$2\";
# Check that Boolean-valued arguments are really Boolean.
if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
exit 1;
fi
shift 2;
;;
*) break;
esac
done

# Check for an empty argument to the --cmd option, which can easily occur as a
# result of scripting errors.
[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;

true; # so this script returns exit code 0.
get_devices(){
    # Pick $1 idle GPUs (used memory < 100 MiB according to gpustat) and
    # echo them as a comma-separated list, e.g. "0,2".
    #   $1  number of GPUs wanted
    #   $2  use_cpu flag: 1 = fall back to "-1" (CPU) when not enough GPUs
    #       are free; 0 = poll once a minute until enough become free.
    gpu_num=$1
    use_cpu=$2
    device=()

    while :
    do
        record=$(mktemp -t temp.record.XXXXXX)
        gpustat > $record
        # gpustat prints a header line; GPU i is on line i+2 of the report.
        all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");

        device=()   # reset each poll so stale hits cannot satisfy the count
        count=0
        for dev in ${all_devices[@]}
        do
            line=$((dev + 2))
            use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)

            if [[ $use -lt 100 ]]; then
                device[$count]=$dev
                count=$((count + 1))
                if [[ $count -eq $gpu_num ]]; then
                    break
                fi
            fi
        done
        rm -f ${record}   # previously leaked one temp file per iteration

        if [[ ${#device[@]} -lt $gpu_num ]]; then
            if [[ $use_cpu -eq 1 ]]; then
                device=(-1)
                # Return immediately on CPU fallback; the original looped
                # forever here whenever gpu_num > 1.
                break
            else
                sleep 60s
            fi
        else
            break
        fi
    done

    echo ${device[*]} | sed 's/ /,/g'
    return $?
}
#! /bin/bash

# Pre-training wav2vec systems based on the LibriSpeech Datasets
# Copyright 2021 Natural Language Processing Laboratory
# Xu Chen (xuchenneu@163.com)

# Set bash to 'debug' mode, it will exit on :
# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
set -e
#set -u
set -o pipefail
export PYTHONIOENCODING=UTF-8

# eval=1 actually executes every printed command; set 0 for a dry run.
eval=1
time=$(date "+%m%d_%H%M")

# Stages: -1 download, 0 data preparation, 1 pre-training, 2 decoding.
stage=0
stop_stage=0

######## hardware ########
# devices
device=()
gpu_num=8
update_freq=1

root_dir=~/st
code_dir=${root_dir}/Fairseq-S2T
pwd_dir=$PWD

# dataset
src_lang=en
lang=${src_lang}
dataset=librispeech
task=audio_pretraining
org_data_dir=${root_dir}/data/${dataset}
data_dir=${root_dir}/data/${dataset}/wav2vec
test_subset=dev-clean,dev-other,test-clean,test-other

# exp
exp_prefix=$(date "+%m%d")
extra_tag=
extra_parameter=
exp_tag=baseline
exp_name=

# config
# NOTE(review): default is "ctc" although this recipe pre-trains wav2vec;
# train.sh overrides it with "wav2vec" — confirm the default is intended.
train_config=ctc
data_config=config.yaml

# training setting
fp16=1
max_tokens=40000
step_valid=0

# decoding setting
dec_model=checkpoint_best.pt
n_average=10
beam_size=5
len_penalty=1.0

# Every variable above can be overridden on the command line as --name value.
. ./local/parse_options.sh || exit 1;

# Derive the experiment name (date_configs_tag[_extra]) unless given explicitly.
if [[ -z ${exp_name} ]]; then
config_string=${train_config//,/_}
exp_name=${exp_prefix}_${config_string}_${exp_tag}
if [[ -n ${extra_tag} ]]; then
exp_name=${exp_name}_${extra_tag}
fi
fi

model_dir=${root_dir}/checkpoints/${dataset}/wav2vec/${exp_name}

if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download"
# pass
fi
# Stage 0: build the wav2vec tsv manifest (file list + frame counts)
# from the raw LibriSpeech flac files.
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
### Task dependent. You have to make data the following preparation part by yourself.
### But you can utilize fairseq recipes in most cases.
echo "stage 0: Data Preparation"
if [[ ! -e ${data_dir} ]]; then
mkdir -p ${data_dir}
fi
cmd="python ${code_dir}/examples/wav2vec/wav2vec_manifest.py
${org_data_dir}/LibriSpeech
--dest ${data_dir}
--ext flac"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
fi
# Stage 1: launch fairseq training in the background with the selected
# YAML configs, logging to ${model_dir}/train.log.
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "stage 1: Audio Pre-training"
[[ ! -d ${data_dir} ]] && echo "The data dir ${data_dir} is not existing!" && exit 1;

# Auto-select idle GPUs unless the caller already fixed ${device}.
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then
device=""
else
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
fi
echo -e "dev=${device} data=${data_dir} model=${model_dir}"

if [[ ! -d ${model_dir} ]]; then
mkdir -p ${model_dir}
else
echo "${model_dir} exists."
fi

# Snapshot the scripts and configs used into the model dir for reproducibility.
cp ${BASH_SOURCE[0]} ${model_dir}
cp ${PWD}/train.sh ${model_dir}

extra_parameter="${extra_parameter}
--train-config ${pwd_dir}/conf/basis.yaml"
cp ${pwd_dir}/conf/basis.yaml ${model_dir}

# Later configs are passed as --train-config2, --train-config3, ...
config_list="${train_config//,/ }"
idx=1
for config in ${config_list[@]}
do
config_path=${pwd_dir}/conf/${config}.yaml
if [[ ! -f ${config_path} ]]; then
echo "No config file ${config_path}"
exit
fi
cp ${config_path} ${model_dir}

extra_parameter="${extra_parameter}
--train-config${idx} ${config_path}"
idx=$((idx + 1))
done

cmd="python3 -u ${code_dir}/fairseq_cli/train.py
${data_dir}
--task ${task}
--max-tokens ${max_tokens}
--update-freq ${update_freq}
--log-interval 100
--save-dir ${model_dir}
--tensorboard-logdir ${model_dir}"

if [[ -n ${extra_parameter} ]]; then
cmd="${cmd}
${extra_parameter}"
fi
if [[ ${gpu_num} -gt 0 ]]; then
cmd="${cmd}
--distributed-world-size $gpu_num
--ddp-backend no_c10d"
fi
if [[ $fp16 -eq 1 ]]; then
cmd="${cmd}
--fp16"
fi

echo -e "\033[34mRun command: \n${cmd} \033[0m"

# save info: append a one-line run record, keeping only the last 50 entries.
log=./history.log
echo "${time} | ${device} | ${data_dir} | ${exp_name} | ${model_dir} " >> $log
tail -n 50 ${log} > tmp.log
mv tmp.log $log
export CUDA_VISIBLE_DEVICES=${device}

# Detach training and follow its log from just past the existing content.
cmd="nohup ${cmd} >> ${model_dir}/train.log 2>&1 &"
if [[ $eval -eq 1 ]]; then
eval $cmd
sleep 2s
tail -n "$(wc -l ${model_dir}/train.log | awk '{print $1+1}')" -f ${model_dir}/train.log
fi
fi
wait
# Stage 2: average the n best checkpoints and decode every test subset,
# collecting the last line of each generate log into decode_result.
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "stage 2: ASR Decoding"
if [[ ${n_average} -ne 1 ]]; then
# Average models
dec_model=avg_${n_average}_checkpoint.pt

if [[ ! -f ${model_dir}/${dec_model} ]]; then
cmd="python ${code_dir}/scripts/average_checkpoints.py
--inputs ${model_dir}
--num-best-checkpoints ${n_average}
--output ${model_dir}/${dec_model}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval $cmd
fi
else
dec_model=${dec_model}
fi

if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then
device=""
else
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
fi
export CUDA_VISIBLE_DEVICES=${device}

result_file=${model_dir}/decode_result
[[ -f ${result_file} ]] && rm ${result_file}

test_subset=(${test_subset//,/ })
for subset in ${test_subset[@]}; do
subset=${subset}
# NOTE(review): decoding uses --task speech_to_text and --config-yaml,
# while training used task=audio_pretraining and stage 0 only builds a
# wav2vec manifest — confirm this decode path applies to this recipe.
cmd="python ${code_dir}/fairseq_cli/generate.py
${data_dir}
--config-yaml ${data_config}
--gen-subset ${subset}
--task speech_to_text
--path ${model_dir}/${dec_model}
--results-path ${model_dir}
--max-tokens ${max_tokens}
--beam ${beam_size}
--lenpen ${len_penalty}
--scoring wer"
echo -e "\033[34mRun command: \n${cmd} \033[0m"

if [[ $eval -eq 1 ]]; then
eval $cmd
tail -n 1 ${model_dir}/generate-${subset}.txt >> ${result_file}
fi
done
cat ${result_file}
fi
#! /bin/bash

# training the model
# Assembles the YAML config list and forwards to run.sh stage 1 with the
# resource settings below.

gpu_num=1
update_freq=1
max_tokens=1500000

extra_tag=
extra_parameter=
#extra_tag="${extra_tag}"
#extra_parameter="${extra_parameter} "

# NOTE(review): exp_tag is only defined by the commented line below; when
# left commented it is unset and the -n test later treats it as empty.
#exp_tag=
config_list=(wav2vec)

# exp full name
exp_name=

train_config=$(echo ${config_list[*]} | sed 's/ /,/g')

cmd="./run.sh
--stage 1
--stop_stage 1
--gpu_num ${gpu_num}
--update_freq ${update_freq}
--train_config ${train_config}
--max_tokens ${max_tokens}
"

# Only pass optional arguments through when they are actually set.
if [[ -n ${exp_name} ]]; then
cmd="$cmd --exp_name ${exp_name}"
fi
if [[ -n ${exp_tag} ]]; then
cmd="$cmd --exp_tag ${exp_tag}"
fi
if [[ -n ${extra_tag} ]]; then
cmd="$cmd --extra_tag ${extra_tag}"
fi
if [[ -n ${extra_parameter} ]]; then
cmd="$cmd --extra_parameter \"${extra_parameter}\""
fi

echo ${cmd}
eval ${cmd}
set -e
eval=1
lcrm=0
root_dir=~/st/Fairseq-S2T
data_dir=/home/xuchen/st/data/wmt/test
vocab_dir=/home/xuchen/st/data/wmt/mt/en-de/unigram32000_share
src_vocab_prefix=spm_unigram32000_share
tgt_vocab_prefix=spm_unigram32000_share
src_lang=en
tgt_lang=de
tokenize=1
splits=(newstest2014 newstest2016)
for split in ${splits[@]}; do
src_file=${data_dir}/${split}.${src_lang}
tgt_file=${data_dir}/${split}.${tgt_lang}
if [[ ${tokenize} -eq 1 ]]; then
cmd="tokenizer.perl -l ${src_lang} --threads 8 -no-escape < ${src_file} > ${src_file}.tok"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
cmd="tokenizer.perl -l ${tgt_lang} --threads 8 -no-escape < ${tgt_file} > ${tgt_file}.tok"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
src_file=${src_file}.tok
tgt_file=${tgt_file}.tok
fi
cmd="cat ${src_file}"
if [[ ${lcrm} -eq 1 ]]; then
cmd="python local/lower_rm.py ${src_file}"
fi
cmd="${cmd}
| spm_encode --model ${vocab_dir}/${src_vocab_prefix}.model
--output_format=piece
> ${src_file}.spm"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
cmd="spm_encode
--model ${vocab_dir}/${tgt_vocab_prefix}.model
--output_format=piece
< ${tgt_file}
> ${tgt_file}.spm"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
src_file=${src_file}.spm
tgt_file=${tgt_file}.spm
mkdir -p ${data_dir}/final
cmd="cp ${src_file} ${data_dir}/final/${split}.${src_lang}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
cmd="cp ${tgt_file} ${data_dir}/final/${split}.${tgt_lang}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
done
n_set=${#splits[*]}
for ((i=0;i<$n_set;i++)); do
dataset[$i]=${data_dir}/final/${splits[$i]}
done
pref=`echo ${dataset[*]} | sed 's/ /,/g'`
cmd="python ${root_dir}/fairseq_cli/preprocess.py
--source-lang ${src_lang}
--target-lang ${tgt_lang}
--testpref ${pref}
--destdir ${data_dir}/data-bin
--srcdict ${vocab_dir}/${src_vocab_prefix}.txt
--tgtdict ${vocab_dir}/${tgt_vocab_prefix}.txt
--workers 64"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
\ No newline at end of file
arch: transformer arch: transformer
share-decoder-input-output-embed: True share-all-embeddings: True
optimizer: adam optimizer: adam
clip-norm: 10.0 clip-norm: 10.0
lr-scheduler: inverse_sqrt lr-scheduler: inverse_sqrt
...@@ -11,22 +11,22 @@ adam_betas: (0.9,0.997) ...@@ -11,22 +11,22 @@ adam_betas: (0.9,0.997)
criterion: label_smoothed_cross_entropy criterion: label_smoothed_cross_entropy
label_smoothing: 0.1 label_smoothing: 0.1
dropout: 0.3 dropout: 0.1
attention-dropout: 0.0 attention-dropout: 0.1
activation-dropout: 0.0 activation-dropout: 0.1
activation-fn: relu activation-fn: relu
encoder-normalize-before: True encoder-normalize-before: True
decoder-normalize-before: True decoder-normalize-before: True
encoder-embed-dim: 512 encoder-embed-dim: 512
encoder-ffn-embed-dim: 1024 encoder-ffn-embed-dim: 2048
encoder-layers: 6 encoder-layers: 6
decoder-layers: 6 decoder-layers: 6
encoder-attention-heads: 4 encoder-attention-heads: 8
decoder-embed-dim: 512 decoder-embed-dim: 512
decoder-ffn-embed-dim: 1024 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4 decoder-attention-heads: 8
load-pretrained-encoder-from: #load-pretrained-encoder-from:
load-pretrained-decoder-from: #load-pretrained-decoder-from:
\ No newline at end of file \ No newline at end of file
arch: transformer
share-all-embeddings: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 4000
lr: 7e-4
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy
label_smoothing: 0.1
dropout: 0.1
attention-dropout: 0.1
activation-dropout: 0.1
activation-fn: relu
encoder-normalize-before: False
decoder-normalize-before: False
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
train-subset: train
valid-subset: valid
max-epoch: 20
max-update: 100000
patience: 5
best_checkpoint_metric: loss
maximize_best_checkpoint_metric: False
no-epoch-checkpoints: True
#keep-last-epochs: 10
keep-best-checkpoints: 5
num-workers: 8
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
\ No newline at end of file
arch: transformer_wmt_en_de_big_t2t
share-all-embeddings: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 8000
lr: 7e-4
adam_betas: (0.9,0.997)
criterion: label_smoothed_cross_entropy
label_smoothing: 0.1
dropout: 0.3
attention-dropout: 0.1
activation-dropout: 0.1
activation-fn: relu
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-dim: 1024
encoder-ffn-embed-dim: 4096
encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 16
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: transformer_wmt_en_de_big
share-all-embeddings: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 4000
lr: 5e-4
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy
label_smoothing: 0.1
dropout: 0.3
attention-dropout: 0.1
activation-dropout: 0.1
activation-fn: relu
encoder-normalize-before: False
decoder-normalize-before: False
encoder-embed-dim: 1024
encoder-ffn-embed-dim: 4096
encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 16
decoder-embed-dim: 1024
decoder-ffn-embed-dim: 4096
decoder-attention-heads: 16
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: transformer
share-all-embeddings: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 16000
lr: 2e-3
adam_betas: (0.9,0.997)
criterion: label_smoothed_cross_entropy
label_smoothing: 0.1
dropout: 0.1
attention-dropout: 0.1
activation-dropout: 0.1
activation-fn: relu
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 30
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
use-enc-dlcl: True
use-dec-dlcl: True
#encoder-attention-type: rel_selfattn
encoder-attention-type: relative
decoder-attention-type: relative
max-encoder-relative-length: 20
max-decoder-relative-length: 20
#! /bin/bash

# Decode a trained MT model: forwards to run.sh stage 2, which averages
# checkpoints and runs fairseq generate on the subsets listed below.

gpu_num=1

data_dir=
test_subset=(test)

# Optional single positional argument: name of the experiment to decode.
exp_name=
if [ "$#" -eq 1 ]; then
    exp_name=$1
fi

sacrebleu=0
n_average=5           # number of best checkpoints to average before decoding
beam_size=4
len_penalty=0.6
max_tokens=80000
dec_model=checkpoint_best.pt

cmd="./run.sh
--stage 2
--stop_stage 2
--gpu_num ${gpu_num}
--exp_name ${exp_name}
--sacrebleu ${sacrebleu}
--n_average ${n_average}
--beam_size ${beam_size}
--len_penalty ${len_penalty}
--max_tokens ${max_tokens}
--dec_model ${dec_model}
"

# Only pass optional arguments through when they are actually set.
if [[ -n ${data_dir} ]]; then
    cmd="$cmd --data_dir ${data_dir}"
fi
# Test the array length (the old `-n ${test_subset}` only looked at the
# first element), and keep the joined list in a separate variable instead
# of clobbering the array — consistent with the sibling ASR decode script.
if [[ ${#test_subset[@]} -ne 0 ]]; then
    subsets=$(echo ${test_subset[*]} | sed 's/ /,/g')
    cmd="$cmd --test_subset ${subsets}"
fi

echo $cmd
eval $cmd
"""Lowercase text and strip punctuation (the "lcrm" preprocessing step).

Usage: python lower_rm.py FILE

Reads FILE line by line and prints each line lowercased, with all ASCII
punctuation removed and runs of spaces collapsed to a single space.
"""
import re
import string
import sys

# One C-level pass instead of len(string.punctuation) chained .replace() calls.
_PUNC_TABLE = str.maketrans("", "", string.punctuation)
_MULTISPACE = re.compile(r" {2,}")


def process_line(line: str) -> str:
    """Return *line* stripped, lowercased, punctuation-free, spaces collapsed."""
    cleaned = line.strip().lower().translate(_PUNC_TABLE)
    # The original replace("  ", " ") halved space runs only once, leaving
    # e.g. three spaces as two; the regex collapses any run completely.
    return _MULTISPACE.sub(" ", cleaned)


def main() -> None:
    """Stream the input file given as argv[1] and print normalized lines."""
    in_file = sys.argv[1]
    with open(in_file, "r", encoding="utf-8") as f:
        for line in f:  # stream; no need to materialize via readlines()
            print(process_line(line))


if __name__ == "__main__":
    main()
# Wait until ${gpu_num} GPUs are (almost) idle, then launch training once.
#
# Polls `gpustat` every minute; a GPU counts as free when the used-memory
# field (MiB) is below 100.  When enough free devices are found, ${cmd}
# is executed and the script exits.

gpu_num=4
cmd="sh train.sh"

while :
do
    record=$(mktemp -t temp.record.XXXXXX)
    gpustat > $record
    # gpustat prints a header line; GPU i is on line i+2 of the report.
    all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");

    device=()   # reset each poll so stale hits from a previous round cannot leak in
    count=0
    for dev in ${all_devices[@]}
    do
        line=$((dev + 2))
        use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)

        if [[ $use -lt 100 ]]; then
            device[$count]=$dev
            count=$((count + 1))
            if [[ $count -eq $gpu_num ]]; then
                break
            fi
        fi
    done
    rm -f ${record}   # previously leaked one temp file per iteration

    if [[ ${#device[@]} -lt $gpu_num ]]; then
        sleep 60s
    else
        echo "Run $cmd"
        eval $cmd
        sleep 10s
        exit
    fi
done
#!/usr/bin/env perl
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

# $Id$

# Compute corpus-level BLEU (up to 4-grams, with brevity penalty) for a
# hypothesis read from STDIN against one or more reference files.

use warnings;
use strict;

my $lowercase = 0;
if ($ARGV[0] eq "-lc") {
$lowercase = 1;
shift;
}

my $stem = $ARGV[0];
if (!defined $stem) {
print STDERR "usage: multi-bleu.pl [-lc] reference < hypothesis\n";
print STDERR "Reads the references from reference or reference0, reference1, ...\n";
exit(1);
}

$stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0";

# Load every numbered reference file stem0, stem1, ... plus the bare stem.
my @REF;
my $ref=0;
while(-e "$stem$ref") {
&add_to_ref("$stem$ref",\@REF);
$ref++;
}
&add_to_ref($stem,\@REF) if -e $stem;
die("ERROR: could not find reference file $stem") unless scalar @REF;

# add additional references explicitly specified on the command line
shift;
foreach my $stem (@ARGV) {
&add_to_ref($stem,\@REF) if -e $stem;
}
# Read one reference file (plain or gzip-compressed) and push its s-th
# line onto $REF->[s], so @$REF maps sentence index -> list of references.
sub add_to_ref {
my ($file,$REF) = @_;
my $s=0;
if ($file =~ /.gz$/) {
open(REF,"gzip -dc $file|") or die "Can't read $file";
} else {
open(REF,$file) or die "Can't read $file";
}
while(<REF>) {
chop;
push @{$$REF[$s++]}, $_;
}
close(REF);
}
# Per-order counters: $CORRECT[n]/$TOTAL[n] accumulate clipped matches and
# candidate n-gram counts; the two lengths feed the brevity penalty.
my(@CORRECT,@TOTAL,$length_translation,$length_reference);
my $s=0;
while(<STDIN>) {
chop;
$_ = lc if $lowercase;
my @WORD = split;
my %REF_NGRAM = ();
my $length_translation_this_sentence = scalar(@WORD);
my ($closest_diff,$closest_length) = (9999,9999);
# Build the per-sentence reference n-gram table, keeping for each n-gram
# the maximum count over all references (the usual BLEU clipping table),
# and track the reference length closest to the hypothesis length.
foreach my $reference (@{$REF[$s]}) {
# print "$s $_ <=> $reference\n";
$reference = lc($reference) if $lowercase;
my @WORD = split(' ',$reference);
my $length = scalar(@WORD);
my $diff = abs($length_translation_this_sentence-$length);
if ($diff < $closest_diff) {
$closest_diff = $diff;
$closest_length = $length;
# print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)." = abs($length_translation_this_sentence-$length), setting len: $closest_length\n";
} elsif ($diff == $closest_diff) {
$closest_length = $length if $length < $closest_length;
# from two references with the same closeness to me
# take the *shorter* into account, not the "first" one.
}
for(my $n=1;$n<=4;$n++) {
my %REF_NGRAM_N = ();
for(my $start=0;$start<=$#WORD-($n-1);$start++) {
# n-gram keys are "<n> w1 w2 ..." so one hash can hold all orders.
my $ngram = "$n";
for(my $w=0;$w<$n;$w++) {
$ngram .= " ".$WORD[$start+$w];
}
$REF_NGRAM_N{$ngram}++;
}
foreach my $ngram (keys %REF_NGRAM_N) {
if (!defined($REF_NGRAM{$ngram}) ||
$REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) {
$REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram};
# print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}<BR>\n";
}
}
}
}
$length_translation += $length_translation_this_sentence;
$length_reference += $closest_length;
# Count hypothesis n-grams and credit each one up to its clipped
# reference count.
for(my $n=1;$n<=4;$n++) {
my %T_NGRAM = ();
for(my $start=0;$start<=$#WORD-($n-1);$start++) {
my $ngram = "$n";
for(my $w=0;$w<$n;$w++) {
$ngram .= " ".$WORD[$start+$w];
}
$T_NGRAM{$ngram}++;
}
foreach my $ngram (keys %T_NGRAM) {
$ngram =~ /^(\d+) /;
my $n = $1;
# my $corr = 0;
# print "$i e $ngram $T_NGRAM{$ngram}<BR>\n";
$TOTAL[$n] += $T_NGRAM{$ngram};
if (defined($REF_NGRAM{$ngram})) {
if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) {
$CORRECT[$n] += $T_NGRAM{$ngram};
# $corr = $T_NGRAM{$ngram};
# print "$i e correct1 $T_NGRAM{$ngram}<BR>\n";
}
else {
$CORRECT[$n] += $REF_NGRAM{$ngram};
# $corr = $REF_NGRAM{$ngram};
# print "$i e correct2 $REF_NGRAM{$ngram}<BR>\n";
}
}
# $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram};
# print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n"
}
}
$s++;
}

# Per-order precisions, then BLEU = BP * geometric mean of p1..p4.
my $brevity_penalty = 1;
my $bleu = 0;

my @bleu=();

for(my $n=1;$n<=4;$n++) {
if (defined ($TOTAL[$n])){
$bleu[$n]=($TOTAL[$n])?$CORRECT[$n]/$TOTAL[$n]:0;
# print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n";
}else{
$bleu[$n]=0;
}
}

if ($length_reference==0){
printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n";
exit(1);
}

if ($length_translation<$length_reference) {
$brevity_penalty = exp(1-$length_reference/$length_translation);
}
$bleu = $brevity_penalty * exp((my_log( $bleu[1] ) +
my_log( $bleu[2] ) +
my_log( $bleu[3] ) +
my_log( $bleu[4] ) ) / 4) ;
printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n",
100*$bleu,
100*$bleu[1],
100*$bleu[2],
100*$bleu[3],
100*$bleu[4],
$brevity_penalty,
$length_translation / $length_reference,
$length_translation,
$length_reference;
# Natural log with a huge negative floor for zero (or otherwise falsy)
# precisions, so a missing n-gram order drives the geometric mean toward
# zero instead of crashing log().
sub my_log {
    my ($value) = @_;
    return $value ? log($value) : -9999999999;
}
#!/usr/bin/env bash

# Copyright 2012 Johns Hopkins University (Author: Daniel Povey);
# Arnab Ghoshal, Karel Vesely

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# Parse command-line options.
# To be sourced by another script (as in ". parse_options.sh").
# Option format is: --option-name arg
# and shell variable "option_name" gets set to value "arg."
# The exception is --help, which takes no arguments, but prints the
# $help_message variable (if defined).

###
### The --config file options have lower priority to command line
### options, so we need to import them first...
###

# Now import all the configs specified by command-line, in left-to-right order
for ((argpos=1; argpos<$#; argpos++)); do
if [ "${!argpos}" == "--config" ]; then
argpos_plus1=$((argpos+1))
config=${!argpos_plus1}
[ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
. $config # source the config file.
fi
done

###
### Now we process the command line options
###
while true; do
[ -z "${1:-}" ] && break; # break if there are no arguments
case "$1" in
# If the enclosing script is called with --help option, print the help
# message and exit. Scripts should put help messages in $help_message
--help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
else printf "$help_message\n" 1>&2 ; fi;
exit 0 ;;
--*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
exit 1 ;;
# If the first command-line argument begins with "--" (e.g. --foo-bar),
# then work out the variable name as $name, which will equal "foo_bar".
--*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
# Next we test whether the variable in question is undefined-- if so it's
# an invalid option and we die. Note: $0 evaluates to the name of the
# enclosing script.
# The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
# is undefined. We then have to wrap this test inside "eval" because
# foo_bar is itself inside a variable ($name).
eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
oldval="`eval echo \\$$name`";
# Work out whether we seem to be expecting a Boolean argument.
if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
was_bool=true;
else
was_bool=false;
fi
# Set the variable to the right value-- the escaped quotes make it work if
# the option had spaces, like --cmd "queue.pl -sync y"
eval $name=\"$2\";
# Check that Boolean-valued arguments are really Boolean.
if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
exit 1;
fi
shift 2;
;;
*) break;
esac
done

# Check for an empty argument to the --cmd option, which can easily occur as a
# result of scripting errors.
[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;

true; # so this script returns exit code 0.
#!/usr/bin/env perl
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

# STDIN -> STDOUT filter that normalizes full-width / CJK punctuation
# (and full-width digits) to their ASCII equivalents.
#
# NOTE(review): several substitutions below look like mojibake of the
# original full-width characters (e.g. the apparent identity rules on
# digits, and the bare "1" mapped to a quote) — verify the byte patterns
# against the upstream moses replace-unicode-punctuation.perl before
# changing anything here.

use warnings;
use strict;

#binmode(STDIN, ":utf8");
#binmode(STDOUT, ":utf8");

while(<STDIN>) {
s/,/,/g;
s/。 */. /g;
s/、/,/g;
s/”/"/g;
s/“/"/g;
s/∶/:/g;
s/:/:/g;
s/?/\?/g;
s/《/"/g;
s/》/"/g;
s/)/\)/g;
s/!/\!/g;
s/(/\(/g;
s/;/;/g;
s/1/"/g;
s/」/"/g;
s/「/"/g;
s/0/0/g;
s/3/3/g;
s/2/2/g;
s/5/5/g;
s/6/6/g;
s/9/9/g;
s/7/7/g;
s/8/8/g;
s/4/4/g;
s/. */. /g;
s/~/\~/g;
s/’/\'/g;
s/…/\.\.\./g;
s/━/\-/g;
s/〈/\</g;
s/〉/\>/g;
s/【/\[/g;
s/】/\]/g;
s/%/\%/g;
print $_;
}
get_devices(){
    # Echo a comma-separated list of the indices of $1 GPUs whose current
    # utilization (as reported by `gpustat`) is below 100%.
    #
    # $1: number of GPUs requested
    # $2: 1 -> fall back to CPU (echo "-1") when not enough free GPUs;
    #     0 -> keep polling every 60s until enough GPUs are free.
    #
    # NOTE(review): assumes `gpustat` prints one header line followed by one
    # line per GPU, with the utilization percentage in the third
    # '|'-separated field -- confirm against the installed gpustat version.
    gpu_num=$1
    use_cpu=$2
    device=()
    while :
    do
        # Fix: rescan from scratch each round; the original never reset the
        # array, so stale (now busy) indices from a previous insufficient
        # scan could leak into the result.
        device=()
        record=$(mktemp -t temp.record.XXXXXX)
        gpustat > $record
        all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
        count=0
        for dev in ${all_devices[@]}
        do
            # GPU $dev is on line $dev+2 of the gpustat output.
            line=$((dev + 2))
            use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
            if [[ $use -lt 100 ]]; then
                device[$count]=$dev
                count=$((count + 1))
                if [[ $count -eq $gpu_num ]]; then
                    break
                fi
            fi
        done
        # Fix: delete the temp file every round; the original leaked one
        # mktemp file per 60s polling iteration.
        rm -f ${record}
        if [[ ${#device[@]} -lt $gpu_num ]]; then
            if [[ $use_cpu -eq 1 ]]; then
                device=(-1)
                # Fix: stop once we decide to fall back to CPU; without this
                # break the original could loop forever when gpu_num > 1.
                break
            else
                sleep 60s
            fi
        else
            break
        fi
    done
    echo ${device[*]} | sed 's/ /,/g'
    return $?
}
#! /bin/bash

# Compute the WMT14 En-De multi-bleu score for a fairseq generate output file.
#
# usage: $0 GENERATE_PY_OUTPUT
#
# Pipeline (all tools come from moses and must be on $PATH):
#   detokenize -> replace unicode punctuation -> tokenize ->
#   compound-split ("rich-text" -> "rich ##AT##-##AT## text") -> multi-bleu.perl

if [ $# -ne 1 ]; then
    echo "usage: $0 GENERATE_PY_OUTPUT"
    exit 1
fi

echo -e "\n RUN >> "$0

requirement_scripts=(detokenizer.perl replace-unicode-punctuation.perl tokenizer.perl multi-bleu.perl)
for script in ${requirement_scripts[@]}; do
    if ! which ${script} > /dev/null; then
        echo "Error: it seems that moses is not installed or exported int the environment variables." >&2
        # Fix: `return` is only legal inside a function or a sourced file;
        # when this script is executed directly the original `return 1` was
        # itself an error and did not abort the run.
        exit 1
    fi
done

# Fix: resolve the moses scripts to absolute paths. They are located via
# $PATH above, but the original then ran e.g. `perl detokenizer.perl`,
# which only works when the script happens to be in the current directory
# (perl does not search $PATH for its program argument).
detokenizer=$(which detokenizer.perl)
replace_unicode_punctuation=$(which replace-unicode-punctuation.perl)
tokenizer=$(which tokenizer.perl)
multi_bleu=$(which multi-bleu.perl)

GEN=$1
SYS=$GEN.sys
REF=$GEN.ref

# Columns 3/4 of fairseq generate output hold reference and hypothesis text.
cut -f 3 $GEN > $REF
cut -f 4 $GEN > $SYS

# Detokenize first so both sides are re-tokenized identically below.
perl $detokenizer -l de < $SYS > $SYS.dtk
perl $detokenizer -l de < $REF > $REF.dtk

# Normalize unicode punctuation.
perl $replace_unicode_punctuation -l de < $SYS.dtk > $SYS.dtk.punc
perl $replace_unicode_punctuation -l de < $REF.dtk > $REF.dtk.punc

# Tokenize with the moses tokenizer.
perl $tokenizer -l de < $SYS.dtk.punc > $SYS.dtk.punc.tok
perl $tokenizer -l de < $REF.dtk.punc > $REF.dtk.punc.tok

# WMT14 compound splitting: "rich-text format" -> "rich ##AT##-##AT## text format".
perl -ple 's{(\S)-(\S)}{$1 ##AT##-##AT## $2}g' < $SYS.dtk.punc.tok > $SYS.dtk.punc.tok.atat
perl -ple 's{(\S)-(\S)}{$1 ##AT##-##AT## $2}g' < $REF.dtk.punc.tok > $REF.dtk.punc.tok.atat

perl $multi_bleu $REF.dtk.punc.tok.atat < $SYS.dtk.punc.tok.atat

# Clean up intermediates. The final *.atat files (and $SYS/$REF) are kept,
# presumably for inspection -- TODO confirm that is intentional.
rm -f $SYS.dtk $SYS.dtk.punc $SYS.dtk.punc.tok $REF.dtk $REF.dtk.punc $REF.dtk.punc.tok
\ No newline at end of file
#! /bin/bash

# Launch model training by delegating to ./run.sh (stage 1 only) with the
# settings configured below.

gpu_num=8
update_freq=1
max_tokens=8192

exp_tag=baseline
config_list=(base)

# exp full name
exp_name=

extra_tag=
extra_parameter=
#extra_tag="${extra_tag}"
#extra_parameter="${extra_parameter} "

# Join the config names with commas, e.g. (base ctc) -> "base,ctc".
train_config=$(IFS=,; echo "${config_list[*]}")

cmd="./run.sh
    --stage 1
    --stop_stage 1
    --gpu_num ${gpu_num}
    --update_freq ${update_freq}
    --train_config ${train_config}
    --max_tokens ${max_tokens}
    "

# Append the optional arguments only when they are non-empty; note that
# --extra_parameter is re-quoted so embedded spaces survive the eval below.
[[ -n ${exp_name} ]] && cmd+=" --exp_name ${exp_name}"
[[ -n ${exp_tag} ]] && cmd+=" --exp_tag ${exp_tag}"
[[ -n ${extra_tag} ]] && cmd+=" --extra_tag ${extra_tag}"
[[ -n ${extra_parameter} ]] && cmd+=" --extra_parameter \"${extra_parameter}\""

# Show the assembled command, then run it.
echo ${cmd}
eval ${cmd}
...@@ -95,7 +95,7 @@ class AudioDataset(Dataset): ...@@ -95,7 +95,7 @@ class AudioDataset(Dataset):
continue continue
txt_path = txt_root / f"{split}.{_lang}" txt_path = txt_root / f"{split}.{_lang}"
if tokenizer: if tokenizer:
txt_path = txt_root / f"{split}.{_lang}.tok" txt_path = txt_root / f"{split}.tok.{_lang}"
if Path.exists(txt_path): if Path.exists(txt_path):
if _lang == src_lang: if _lang == src_lang:
......
...@@ -32,14 +32,18 @@ class MTDataset(Dataset): ...@@ -32,14 +32,18 @@ class MTDataset(Dataset):
utterance_id utterance_id
""" """
def __init__(self, root: str, src_lang, tgt_lang: str, split: str) -> None: def __init__(self, root: str, src_lang, tgt_lang: str, split: str, tokenizer: bool = False) -> None:
_root = Path(root) / "data" / split _root = Path(root) / "data" / split
txt_root = _root / "txt" if (_root / "txt").is_dir() else _root txt_root = _root / "txt" if (_root / "txt").is_dir() else _root
assert _root.is_dir() and txt_root.is_dir(), (_root, txt_root) assert _root.is_dir() and txt_root.is_dir(), (_root, txt_root)
# Load source and target text # Load source and target text
self.data = [] self.data = []
for _lang in [src_lang, tgt_lang]: for _lang in [src_lang, tgt_lang]:
with open(txt_root / f"{split}.{_lang}") as f: txt_path = txt_root / f"{split}.{_lang}"
if tokenizer:
txt_path = txt_root / f"{split}.tok.{_lang}"
with open(txt_path) as f:
texts = [r.strip() for r in f] texts = [r.strip() for r in f]
self.data.append(texts) self.data.append(texts)
self.data = list(zip(self.data[0], self.data[1])) self.data = list(zip(self.data[0], self.data[1]))
...@@ -72,7 +76,7 @@ def process(args): ...@@ -72,7 +76,7 @@ def process(args):
is_train_split = split.startswith("train") is_train_split = split.startswith("train")
manifest = {c: [] for c in MANIFEST_COLUMNS} manifest = {c: [] for c in MANIFEST_COLUMNS}
dataset = MTDataset(args.data_root, src_lang, tgt_lang, split) dataset = MTDataset(args.data_root, src_lang, tgt_lang, split, args.tokenizer)
for src_text, tgt_text in tqdm(dataset): for src_text, tgt_text in tqdm(dataset):
if args.lowercase_src: if args.lowercase_src:
src_text = src_text.lower() src_text = src_text.lower()
...@@ -165,6 +169,7 @@ def main(): ...@@ -165,6 +169,7 @@ def main():
parser.add_argument("--src-lang", required=True, type=str) parser.add_argument("--src-lang", required=True, type=str)
parser.add_argument("--tgt-lang", required=True, type=str) parser.add_argument("--tgt-lang", required=True, type=str)
parser.add_argument("--share", action="store_true", help="share the source and target vocabulary") parser.add_argument("--share", action="store_true", help="share the source and target vocabulary")
parser.add_argument("--tokenizer", action="store_true", help="use tokenizer txt")
args = parser.parse_args() args = parser.parse_args()
process(args) process(args)
......
...@@ -101,12 +101,6 @@ class LabelSmoothedCrossEntropyCriterionWithCTC( ...@@ -101,12 +101,6 @@ class LabelSmoothedCrossEntropyCriterionWithCTC(
def compute_ctc_loss(self, model, sample, encoder_out, logging_output): def compute_ctc_loss(self, model, sample, encoder_out, logging_output):
transcript = sample["transcript"] transcript = sample["transcript"]
ctc_logit = encoder_out["ctc_logit"][0]
lprobs = model.get_normalized_probs(
[ctc_logit], log_probs=True
).contiguous() # (T, B, C) from the encoder
lprobs.batch_first = False
if "ctc_padding_mask" in encoder_out: if "ctc_padding_mask" in encoder_out:
non_padding_mask = ~encoder_out["ctc_padding_mask"][0] non_padding_mask = ~encoder_out["ctc_padding_mask"][0]
else: else:
...@@ -119,15 +113,27 @@ class LabelSmoothedCrossEntropyCriterionWithCTC( ...@@ -119,15 +113,27 @@ class LabelSmoothedCrossEntropyCriterionWithCTC(
targets_flat = transcript["tokens"].masked_select(pad_mask) targets_flat = transcript["tokens"].masked_select(pad_mask)
transcript_lengths = pad_mask.sum(-1) transcript_lengths = pad_mask.sum(-1)
with torch.backends.cudnn.flags(enabled=False): ctc_loss = 0
loss = self.ctc_loss( ctc_num = len(encoder_out["ctc_logit"])
lprobs, assert ctc_num != 0, "No ctc logit for loss!"
targets_flat, for i in range(ctc_num):
input_lengths,
transcript_lengths, ctc_logit = encoder_out["ctc_logit"][0]
) lprobs = model.get_normalized_probs(
[ctc_logit], log_probs=True
logging_output["ctc_loss"] = utils.item(loss.data) ).contiguous() # (T, B, C) from the encoder
lprobs.batch_first = False
with torch.backends.cudnn.flags(enabled=False):
loss = self.ctc_loss(
lprobs,
targets_flat,
input_lengths,
transcript_lengths,
)
ctc_loss += loss
ctc_loss /= ctc_num
logging_output["ctc_loss"] = utils.item(ctc_loss.data)
if not model.training: if not model.training:
import editdistance import editdistance
...@@ -142,7 +148,7 @@ class LabelSmoothedCrossEntropyCriterionWithCTC( ...@@ -142,7 +148,7 @@ class LabelSmoothedCrossEntropyCriterionWithCTC(
wv_errs = 0 wv_errs = 0
for lp, t, inp_l in zip( for lp, t, inp_l in zip(
lprobs_t, lprobs_t,
sample["target_label"] if "target_label" in sample else sample["target"], sample["transcript"]["tokens"] if "transcript" in sample else sample["target"],
input_lengths, input_lengths,
): ):
lp = lp[:inp_l].unsqueeze(0) lp = lp[:inp_l].unsqueeze(0)
...@@ -183,7 +189,7 @@ class LabelSmoothedCrossEntropyCriterionWithCTC( ...@@ -183,7 +189,7 @@ class LabelSmoothedCrossEntropyCriterionWithCTC(
logging_output["c_errors"] = c_err logging_output["c_errors"] = c_err
logging_output["c_total"] = c_len logging_output["c_total"] = c_len
return loss, logging_output return ctc_loss, logging_output
@staticmethod @staticmethod
def reduce_metrics(logging_outputs) -> None: def reduce_metrics(logging_outputs) -> None:
......
...@@ -7,6 +7,7 @@ from .berard import * # noqa ...@@ -7,6 +7,7 @@ from .berard import * # noqa
from .ctc import * # noqa from .ctc import * # noqa
from .convtransformer import * # noqa from .convtransformer import * # noqa
from .s2t_transformer import * # noqa from .s2t_transformer import * # noqa
from .multi_ctc_s2t_transformer import * # noqa
from .s2t_conformer import * # noqa from .s2t_conformer import * # noqa
from .pdss2t_transformer import * # noqa from .pdss2t_transformer import * # noqa
from .s2t_sate import * # noqa from .s2t_sate import * # noqa
...@@ -317,17 +317,22 @@ class PDSS2TTransformerModel(S2TTransformerModel): ...@@ -317,17 +317,22 @@ class PDSS2TTransformerModel(S2TTransformerModel):
action='store_true', action='store_true',
help="use dlcl encoder", help="use dlcl encoder",
) )
parser.add_argument( parser.add_argument('--init-value', type=str, default='avg', choices=['avg', 'one'],
'--encoder-history-type', help='how to init the learned weight matrix')
default="learnable_dense", parser.add_argument('--weight-type', type=str, default='scalar',
help='encoder layer history type' help='type of learned weight [scalar, scalar_n(n>1), vector]')
) parser.add_argument('--encoder-learnable', type=eval, default='True',
parser.add_argument( help='enable to learn weights for encoder')
'--decoder-history-type', parser.add_argument('--decoder-learnable', type=eval, default='True',
default="learnable_dense", help='enable to learn weights for decoder')
help='decoder layer history type' parser.add_argument('--normalize-learned-weight', type=eval, default='False',
) help='normalize learned weight by softmax')
parser.add_argument('--normalize-embedding', type=eval, default='False',
help='normalize the input of embedding')
parser.add_argument('--history-dropout', type=float, default=0.0, metavar='D',
help='dropout for history output')
parser.add_argument('--history-window-size', type=int, default='-1',
help='how many past layers are considered. -1 means all')
# local modeling # local modeling
parser.add_argument( parser.add_argument(
'--hard-mask-window', '--hard-mask-window',
...@@ -375,7 +380,14 @@ class PDSS2TTransformerModel(S2TTransformerModel): ...@@ -375,7 +380,14 @@ class PDSS2TTransformerModel(S2TTransformerModel):
"The legacy relative positional encoding will be deprecated in the future." "The legacy relative positional encoding will be deprecated in the future."
"More Details can be found in https://github.com/espnet/espnet/pull/2816.", "More Details can be found in https://github.com/espnet/espnet/pull/2816.",
) )
# CNN module # CTC
parser.add_argument(
"--ctc-layer",
default=0,
type=int,
help="the position of the ctc loss",
)
# Conformer module
parser.add_argument( parser.add_argument(
"--use-cnn-module", "--use-cnn-module",
default=False, default=False,
...@@ -463,11 +475,6 @@ class PDSS2TTransformerModel(S2TTransformerModel): ...@@ -463,11 +475,6 @@ class PDSS2TTransformerModel(S2TTransformerModel):
type=float, type=float,
help="dropout in each stage", help="dropout in each stage",
) )
parser.add_argument(
"--ctc-layer",
type=int,
help="the layer of ctc",
)
pass pass
@classmethod @classmethod
......
...@@ -299,7 +299,7 @@ class S2TConformerModel(S2TTransformerModel): ...@@ -299,7 +299,7 @@ class S2TConformerModel(S2TTransformerModel):
class S2TConformerEncoder(S2TTransformerEncoder): class S2TConformerEncoder(S2TTransformerEncoder):
"""Speech-to-text Conformer encoder that consists of input subsampler and """Speech-to-text Conformer encoder that consists of input subsampler and
Transformer encoder.""" Conformer encoder."""
def __init__(self, args, task=None, embed_tokens=None): def __init__(self, args, task=None, embed_tokens=None):
super().__init__(args, task, embed_tokens) super().__init__(args, task, embed_tokens)
......
...@@ -16,7 +16,6 @@ from fairseq.models.transformer import Embedding, TransformerDecoder ...@@ -16,7 +16,6 @@ from fairseq.models.transformer import Embedding, TransformerDecoder
from fairseq.models.speech_to_text import ( from fairseq.models.speech_to_text import (
S2TTransformerModel, S2TTransformerModel,
S2TTransformerEncoder, S2TTransformerEncoder,
S2TConformerEncoder,
PDSS2TTransformerModel, PDSS2TTransformerModel,
PDSS2TTransformerEncoder, PDSS2TTransformerEncoder,
CTCCompressStrategy CTCCompressStrategy
...@@ -27,7 +26,7 @@ from fairseq.modules import ( ...@@ -27,7 +26,7 @@ from fairseq.modules import (
LayerNorm, LayerNorm,
PositionalEmbedding, PositionalEmbedding,
TransformerEncoderLayer, TransformerEncoderLayer,
LearnableDenseLayerHistory DynamicLinearCombination
) )
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -287,7 +286,7 @@ class TextEncoder(FairseqEncoder): ...@@ -287,7 +286,7 @@ class TextEncoder(FairseqEncoder):
x = history.pop() x = history.pop()
x = layer(x, encoder_padding_mask, pos_emb=positions) x = layer(x, encoder_padding_mask, pos_emb=positions)
if history is not None: if history is not None:
history.add(x) history.push(x)
if history is not None: if history is not None:
x = history.pop() x = history.pop()
...@@ -331,9 +330,7 @@ class S2TSATEEncoder(FairseqEncoder): ...@@ -331,9 +330,7 @@ class S2TSATEEncoder(FairseqEncoder):
if getattr(args, "use_enc_dlcl", False): if getattr(args, "use_enc_dlcl", False):
layer_num = args.encoder_layers + args.text_encoder_layers + 1 layer_num = args.encoder_layers + args.text_encoder_layers + 1
self.history = LearnableDenseLayerHistory( self.history = DynamicLinearCombination(args, is_encoder=True, layer_num=layer_num)
args.encoder_normalize_before, layer_num, args.encoder_embed_dim, True
)
else: else:
self.history = None self.history = None
...@@ -496,8 +493,8 @@ def base_architecture(args): ...@@ -496,8 +493,8 @@ def base_architecture(args):
args.ctc_layer = getattr(args, "ctc_layer", 0) args.ctc_layer = getattr(args, "ctc_layer", 0)
args.pds_dropout = getattr(args, "pds_dropout", args.dropout) args.pds_dropout = getattr(args, "pds_dropout", args.dropout)
args.fusion = getattr(args, "fusion", False) args.pds_fusion = getattr(args, "pds_fusion", False)
args.fusion_method = getattr(args, "fusion_method", "all_conv") args.pds_fusion_method = getattr(args, "pds_fusion_method", "all_conv")
@register_model_architecture("s2t_sate", "s2t_sate_s") @register_model_architecture("s2t_sate", "s2t_sate_s")
......
...@@ -21,11 +21,10 @@ from fairseq.modules import ( ...@@ -21,11 +21,10 @@ from fairseq.modules import (
PositionalEmbedding, PositionalEmbedding,
TransformerEncoderLayer, TransformerEncoderLayer,
ConformerEncoderLayer, ConformerEncoderLayer,
CreateLayerHistory, DynamicLinearCombination,
) )
from torch import Tensor from torch import Tensor
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -42,11 +41,11 @@ class Conv1dSubsampler(nn.Module): ...@@ -42,11 +41,11 @@ class Conv1dSubsampler(nn.Module):
""" """
def __init__( def __init__(
self, self,
in_channels: int, in_channels: int,
mid_channels: int, mid_channels: int,
out_channels: int, out_channels: int,
kernel_sizes: List[int] = (3, 3), kernel_sizes: List[int] = (3, 3),
): ):
super(Conv1dSubsampler, self).__init__() super(Conv1dSubsampler, self).__init__()
self.n_layers = len(kernel_sizes) self.n_layers = len(kernel_sizes)
...@@ -277,16 +276,22 @@ class S2TTransformerModel(FairseqEncoderDecoderModel): ...@@ -277,16 +276,22 @@ class S2TTransformerModel(FairseqEncoderDecoderModel):
action='store_true', action='store_true',
help="use dlcl encoder", help="use dlcl encoder",
) )
parser.add_argument( parser.add_argument('--init-value', type=str, default='avg', choices=['avg', 'one'],
'--encoder-history-type', help='how to init the learned weight matrix')
default="learnable_dense", parser.add_argument('--weight-type', type=str, default='scalar',
help='encoder layer history type' help='type of learned weight [scalar, scalar_n(n>1), vector]')
) parser.add_argument('--encoder-learnable', type=eval, default='True',
parser.add_argument( help='enable to learn weights for encoder')
'--decoder-history-type', parser.add_argument('--decoder-learnable', type=eval, default='True',
default="learnable_dense", help='enable to learn weights for decoder')
help='decoder layer history type' parser.add_argument('--normalize-learned-weight', type=eval, default='False',
) help='normalize learned weight by softmax')
parser.add_argument('--normalize-embedding', type=eval, default='False',
help='normalize the input of embedding')
parser.add_argument('--history-dropout', type=float, default=0.0, metavar='D',
help='dropout for history output')
parser.add_argument('--history-window-size', type=int, default='-1',
help='how many past layers are considered. -1 means all')
# CTC # CTC
parser.add_argument( parser.add_argument(
"--ctc-layer", "--ctc-layer",
...@@ -444,10 +449,10 @@ class S2TTransformerModel(FairseqEncoderDecoderModel): ...@@ -444,10 +449,10 @@ class S2TTransformerModel(FairseqEncoderDecoderModel):
return cls(encoder, decoder) return cls(encoder, decoder)
def get_normalized_probs( def get_normalized_probs(
self, self,
net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]], net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]],
log_probs: bool, log_probs: bool,
sample: Optional[Dict[str, Tensor]] = None, sample: Optional[Dict[str, Tensor]] = None,
): ):
# net_output['encoder_out'] is a (B, T, D) tensor # net_output['encoder_out'] is a (B, T, D) tensor
lprobs = self.get_normalized_probs_scriptable(net_output, log_probs, sample) lprobs = self.get_normalized_probs_scriptable(net_output, log_probs, sample)
...@@ -503,8 +508,8 @@ class S2TTransformerEncoder(FairseqEncoder): ...@@ -503,8 +508,8 @@ class S2TTransformerEncoder(FairseqEncoder):
else: else:
self.layer_norm = None self.layer_norm = None
if getattr(args, "use_enc_dlcl", False): if args.use_enc_dlcl:
self.history = CreateLayerHistory(args, is_encoder=True) self.history = DynamicLinearCombination(args, is_encoder=True)
else: else:
self.history = None self.history = None
...@@ -588,7 +593,7 @@ class S2TTransformerEncoder(FairseqEncoder): ...@@ -588,7 +593,7 @@ class S2TTransformerEncoder(FairseqEncoder):
# add emb into history # add emb into history
if self.history is not None: if self.history is not None:
self.history.add(x) self.history.push(x)
# gather cosine similarity # gather cosine similarity
cos_sim_idx = (cos_sim_idx + 10) // 10 * 10 - 1 cos_sim_idx = (cos_sim_idx + 10) // 10 * 10 - 1
...@@ -618,7 +623,7 @@ class S2TTransformerEncoder(FairseqEncoder): ...@@ -618,7 +623,7 @@ class S2TTransformerEncoder(FairseqEncoder):
self.add_to_dict(x, dis, cos_sim_idx) self.add_to_dict(x, dis, cos_sim_idx)
if self.history is not None: if self.history is not None:
self.history.add(x) self.history.push(x)
if self.history is not None: if self.history is not None:
x = self.history.pop() x = self.history.pop()
...@@ -631,7 +636,7 @@ class S2TTransformerEncoder(FairseqEncoder): ...@@ -631,7 +636,7 @@ class S2TTransformerEncoder(FairseqEncoder):
return { return {
"encoder_out": [x], # T x B x C "encoder_out": [x], # T x B x C
"ctc_logit": [] if ctc_logit is None else [ctc_logit], # B x T x C "ctc_logit": [] if ctc_logit is None else [ctc_logit], # B x T x C
"encoder_padding_mask": [encoder_padding_mask], # B x T "encoder_padding_mask": [encoder_padding_mask], # B x T
"encoder_embedding": [], # B x T x C "encoder_embedding": [], # B x T x C
"encoder_states": [], # List[T x B x C] "encoder_states": [], # List[T x B x C]
...@@ -678,13 +683,13 @@ class S2TTransformerEncoder(FairseqEncoder): ...@@ -678,13 +683,13 @@ class S2TTransformerEncoder(FairseqEncoder):
class TransformerDecoderScriptable(TransformerDecoder): class TransformerDecoderScriptable(TransformerDecoder):
def extract_features( def extract_features(
self, self,
prev_output_tokens, prev_output_tokens,
encoder_out: Optional[Dict[str, List[Tensor]]] = None, encoder_out: Optional[Dict[str, List[Tensor]]] = None,
incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
full_context_alignment: bool = False, full_context_alignment: bool = False,
alignment_layer: Optional[int] = None, alignment_layer: Optional[int] = None,
alignment_heads: Optional[int] = None, alignment_heads: Optional[int] = None,
): ):
# call scriptable method from parent class # call scriptable method from parent class
x, _ = self.extract_features_scriptable( x, _ = self.extract_features_scriptable(
...@@ -698,10 +703,10 @@ class TransformerDecoderScriptable(TransformerDecoder): ...@@ -698,10 +703,10 @@ class TransformerDecoderScriptable(TransformerDecoder):
return x, None return x, None
def get_normalized_probs_scriptable( def get_normalized_probs_scriptable(
self, self,
net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]], net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]],
log_probs: bool, log_probs: bool,
sample: Optional[Dict[str, Tensor]] = None, sample: Optional[Dict[str, Tensor]] = None,
): ):
"""Get normalized probabilities (or log probs) from a net's output.""" """Get normalized probabilities (or log probs) from a net's output."""
...@@ -777,6 +782,17 @@ def base_architecture(args): ...@@ -777,6 +782,17 @@ def base_architecture(args):
args.use_cnn_module = getattr(args, "use_cnn_module", False) args.use_cnn_module = getattr(args, "use_cnn_module", False)
args.cnn_module_kernel = getattr(args, "cnn_module_kernel", 31) args.cnn_module_kernel = getattr(args, "cnn_module_kernel", 31)
# settings for DLCL
args.use_enc_dlcl = getattr(args, "use_enc_dlcl", False)
args.use_dec_dlcl = getattr(args, "use_dec_dlcl", False)
args.init_value = getattr(args, 'init_value', 'avg')
args.weight_type = getattr(args, 'weight_type', 'scalar')
args.encoder_learnable = getattr(args, 'encoder_learnable', True)
args.decoder_learnable = getattr(args, 'decoder_learnable', True)
args.normalize_embed = getattr(args, 'normalize_embed', False)
args.history_dropout = getattr(args, 'history_dropout', 0.0)
args.history_window_size = getattr(args, 'history_window_size', -1)
# Relative position encoding # Relative position encoding
args.max_encoder_relative_length = getattr(args, 'max_encoder_relative_length', -1) args.max_encoder_relative_length = getattr(args, 'max_encoder_relative_length', -1)
args.max_decoder_relative_length = getattr(args, 'max_decoder_relative_length', -1) args.max_decoder_relative_length = getattr(args, 'max_decoder_relative_length', -1)
......
...@@ -27,7 +27,7 @@ from fairseq.modules import ( ...@@ -27,7 +27,7 @@ from fairseq.modules import (
SinusoidalPositionalEmbedding, SinusoidalPositionalEmbedding,
TransformerDecoderLayer, TransformerDecoderLayer,
TransformerEncoderLayer, TransformerEncoderLayer,
CreateLayerHistory DynamicLinearCombination
) )
from fairseq.modules.checkpoint_activations import checkpoint_wrapper from fairseq.modules.checkpoint_activations import checkpoint_wrapper
from fairseq.modules.quant_noise import quant_noise as apply_quant_noise_ from fairseq.modules.quant_noise import quant_noise as apply_quant_noise_
...@@ -218,6 +218,7 @@ class TransformerModel(FairseqEncoderDecoderModel): ...@@ -218,6 +218,7 @@ class TransformerModel(FairseqEncoderDecoderModel):
], ],
help="transformer decoder self-attention layer type" help="transformer decoder self-attention layer type"
) )
# DLCL parameters
parser.add_argument( parser.add_argument(
"--use-enc-dlcl", "--use-enc-dlcl",
default=False, default=False,
...@@ -230,16 +231,23 @@ class TransformerModel(FairseqEncoderDecoderModel): ...@@ -230,16 +231,23 @@ class TransformerModel(FairseqEncoderDecoderModel):
action='store_true', action='store_true',
help="use dlcl encoder", help="use dlcl encoder",
) )
parser.add_argument( parser.add_argument('--init-value', type=str, default='avg', choices=['avg', 'one'],
'--encoder-history-type', help='how to init the learned weight matrix')
default="learnable_dense", parser.add_argument('--weight-type', type=str, default='scalar',
help='encoder layer history type' help='type of learned weight [scalar, scalar_n(n>1), vector]')
) parser.add_argument('--encoder-learnable', type=eval, default='True',
parser.add_argument( help='enable to learn weights for encoder')
'--decoder-history-type', parser.add_argument('--decoder-learnable', type=eval, default='True',
default="learnable_dense", help='enable to learn weights for decoder')
help='decoder layer history type' parser.add_argument('--normalize-learned-weight', type=eval, default='False',
) help='normalize learned weight by softmax')
parser.add_argument('--normalize-embedding', type=eval, default='False',
help='normalize the input of embedding')
parser.add_argument('--history-dropout', type=float, default=0.0, metavar='D',
help='dropout for history output')
parser.add_argument('--history-window-size', type=int, default='-1',
help='how many past layers are considered. -1 means all')
# relative position representation
parser.add_argument('--max-encoder-relative-length', type=int, default=-1, parser.add_argument('--max-encoder-relative-length', type=int, default=-1,
help='the max encoder relative length') help='the max encoder relative length')
parser.add_argument('--max-decoder-relative-length', type=int, default=-1, parser.add_argument('--max-decoder-relative-length', type=int, default=-1,
...@@ -271,8 +279,16 @@ class TransformerModel(FairseqEncoderDecoderModel): ...@@ -271,8 +279,16 @@ class TransformerModel(FairseqEncoderDecoderModel):
metavar="STR", metavar="STR",
help="freeze the module of the decoder", help="freeze the module of the decoder",
) )
parser.add_argument('--interleave-dropout', default=0, type=float, metavar='D', parser.add_argument('--interleave-dropout', default=0, type=float, metavar='D',
help='interleaved dropout probability') help='interleaved dropout probability')
parser.add_argument(
"--squeeze-excitation",
default=False,
action='store_true',
help="use squeeze and excitation method",
)
# fmt: on # fmt: on
@classmethod @classmethod
...@@ -496,8 +512,8 @@ class TransformerEncoder(FairseqEncoder): ...@@ -496,8 +512,8 @@ class TransformerEncoder(FairseqEncoder):
else: else:
self.layer_norm = None self.layer_norm = None
if getattr(args, "use_enc_dlcl", False): if args.use_enc_dlcl:
self.history = CreateLayerHistory(args, is_encoder=True) self.history = DynamicLinearCombination(args, is_encoder=True)
else: else:
self.history = None self.history = None
...@@ -617,7 +633,7 @@ class TransformerEncoder(FairseqEncoder): ...@@ -617,7 +633,7 @@ class TransformerEncoder(FairseqEncoder):
# add emb into history # add emb into history
if self.history is not None: if self.history is not None:
self.history.add(x) self.history.push(x)
# encoder layers # encoder layers
for layer in self.layers: for layer in self.layers:
...@@ -632,7 +648,7 @@ class TransformerEncoder(FairseqEncoder): ...@@ -632,7 +648,7 @@ class TransformerEncoder(FairseqEncoder):
encoder_states.append(x) encoder_states.append(x)
if self.history is not None: if self.history is not None:
self.history.add(x) self.history.push(x)
if self.history is not None: if self.history is not None:
x = self.history.pop() x = self.history.pop()
...@@ -826,8 +842,8 @@ class TransformerDecoder(FairseqIncrementalDecoder): ...@@ -826,8 +842,8 @@ class TransformerDecoder(FairseqIncrementalDecoder):
else: else:
self.layer_norm = None self.layer_norm = None
if getattr(args, "use_dec_dlcl", False): if args.use_dec_dlcl:
self.history = CreateLayerHistory(args, is_encoder=False) self.history = DynamicLinearCombination(args, is_encoder=False)
else: else:
self.history = None self.history = None
...@@ -1010,7 +1026,7 @@ class TransformerDecoder(FairseqIncrementalDecoder): ...@@ -1010,7 +1026,7 @@ class TransformerDecoder(FairseqIncrementalDecoder):
# add emb into history # add emb into history
if self.history is not None: if self.history is not None:
self.history.add(x) self.history.push(x)
self_attn_padding_mask: Optional[Tensor] = None self_attn_padding_mask: Optional[Tensor] = None
if self.cross_self_attention or prev_output_tokens.eq(self.padding_idx).any(): if self.cross_self_attention or prev_output_tokens.eq(self.padding_idx).any():
...@@ -1051,7 +1067,7 @@ class TransformerDecoder(FairseqIncrementalDecoder): ...@@ -1051,7 +1067,7 @@ class TransformerDecoder(FairseqIncrementalDecoder):
if layer_attn is not None and idx == alignment_layer: if layer_attn is not None and idx == alignment_layer:
attn = layer_attn.float().to(x) attn = layer_attn.float().to(x)
if self.history is not None: if self.history is not None:
self.history.add(x) self.history.push(x)
if self.gather_attn_weight: if self.gather_attn_weight:
if avg_attn is None: if avg_attn is None:
avg_attn = layer_attn avg_attn = layer_attn
...@@ -1265,6 +1281,18 @@ def base_architecture(args): ...@@ -1265,6 +1281,18 @@ def base_architecture(args):
args.encoder_attention_type = getattr(args, "encoder_attention_type", "selfattn") args.encoder_attention_type = getattr(args, "encoder_attention_type", "selfattn")
args.decoder_attention_type = getattr(args, "decoder_attention_type", "selfattn") args.decoder_attention_type = getattr(args, "decoder_attention_type", "selfattn")
# settings for DLCL
args.use_enc_dlcl = getattr(args, "use_enc_dlcl", False)
args.use_dec_dlcl = getattr(args, "use_dec_dlcl", False)
args.init_value = getattr(args, 'init_value', 'avg')
args.weight_type = getattr(args, 'weight_type', 'scalar')
args.encoder_learnable = getattr(args, 'encoder_learnable', True)
args.decoder_learnable = getattr(args, 'decoder_learnable', True)
args.normalize_embed = getattr(args, 'normalize_embed', False)
args.history_dropout = getattr(args, 'history_dropout', 0.0)
args.history_window_size = getattr(args, 'history_window_size', -1)
# settings for RPR
args.max_encoder_relative_length = getattr(args, 'max_encoder_relative_length', -1) args.max_encoder_relative_length = getattr(args, 'max_encoder_relative_length', -1)
args.max_decoder_relative_length = getattr(args, 'max_decoder_relative_length', -1) args.max_decoder_relative_length = getattr(args, 'max_decoder_relative_length', -1)
args.k_only = getattr(args, 'k_only', True) args.k_only = getattr(args, 'k_only', True)
......
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
# LICENSE file in the root directory of this source tree. # LICENSE file in the root directory of this source tree.
"""isort:skip_file""" """isort:skip_file"""
from .squeeze_excitation import SEAttention
from .adaptive_input import AdaptiveInput from .adaptive_input import AdaptiveInput
from .adaptive_softmax import AdaptiveSoftmax from .adaptive_softmax import AdaptiveSoftmax
from .beamable_mm import BeamableMM from .beamable_mm import BeamableMM
...@@ -13,6 +14,7 @@ from .downsample_convolution import DownSampleConvolutionModule ...@@ -13,6 +14,7 @@ from .downsample_convolution import DownSampleConvolutionModule
from .conv_tbc import ConvTBC from .conv_tbc import ConvTBC
from .cross_entropy import cross_entropy from .cross_entropy import cross_entropy
from .downsampled_multihead_attention import DownsampledMultiHeadAttention from .downsampled_multihead_attention import DownsampledMultiHeadAttention
from .dlcl import DynamicLinearCombination
from .dynamic_convolution import DynamicConv, DynamicConv1dTBC from .dynamic_convolution import DynamicConv, DynamicConv1dTBC
from .dynamic_crf_layer import DynamicCRF from .dynamic_crf_layer import DynamicCRF
from .fairseq_dropout import FairseqDropout from .fairseq_dropout import FairseqDropout
...@@ -22,7 +24,6 @@ from .grad_multiply import GradMultiply ...@@ -22,7 +24,6 @@ from .grad_multiply import GradMultiply
from .gumbel_vector_quantizer import GumbelVectorQuantizer from .gumbel_vector_quantizer import GumbelVectorQuantizer
from .kmeans_vector_quantizer import KmeansVectorQuantizer from .kmeans_vector_quantizer import KmeansVectorQuantizer
from .layer_drop import LayerDropModuleList from .layer_drop import LayerDropModuleList
from .layer_history import CreateLayerHistory, LearnableDenseLayerHistory
from .layer_norm import Fp32LayerNorm, LayerNorm from .layer_norm import Fp32LayerNorm, LayerNorm
from .learned_positional_embedding import LearnedPositionalEmbedding from .learned_positional_embedding import LearnedPositionalEmbedding
from .lightweight_convolution import LightweightConv, LightweightConv1dTBC from .lightweight_convolution import LightweightConv, LightweightConv1dTBC
...@@ -46,6 +47,7 @@ from .conformer_layer import ConformerEncoderLayer ...@@ -46,6 +47,7 @@ from .conformer_layer import ConformerEncoderLayer
from .pds_layer import PDSTransformerEncoderLayer from .pds_layer import PDSTransformerEncoderLayer
__all__ = [ __all__ = [
"DynamicLinearCombination",
"AdaptiveInput", "AdaptiveInput",
"AdaptiveSoftmax", "AdaptiveSoftmax",
"BeamableMM", "BeamableMM",
...@@ -53,7 +55,6 @@ __all__ = [ ...@@ -53,7 +55,6 @@ __all__ = [
"ConformerEncoderLayer", "ConformerEncoderLayer",
"ConvolutionModule", "ConvolutionModule",
"ConvTBC", "ConvTBC",
"CreateLayerHistory",
"cross_entropy", "cross_entropy",
"DownSampleConvolutionModule", "DownSampleConvolutionModule",
"DownsampledMultiHeadAttention", "DownsampledMultiHeadAttention",
...@@ -70,7 +71,6 @@ __all__ = [ ...@@ -70,7 +71,6 @@ __all__ = [
"KmeansVectorQuantizer", "KmeansVectorQuantizer",
"LayerDropModuleList", "LayerDropModuleList",
"LayerNorm", "LayerNorm",
"LearnableDenseLayerHistory",
"LearnedPositionalEmbedding", "LearnedPositionalEmbedding",
"LightweightConv1dTBC", "LightweightConv1dTBC",
"LightweightConv", "LightweightConv",
...@@ -84,6 +84,7 @@ __all__ = [ ...@@ -84,6 +84,7 @@ __all__ = [
"RelativeMultiheadAttention", "RelativeMultiheadAttention",
"SamePad", "SamePad",
"ScalarBias", "ScalarBias",
"SEAttention",
"SinusoidalPositionalEmbedding", "SinusoidalPositionalEmbedding",
"TransformerSentenceEncoderLayer", "TransformerSentenceEncoderLayer",
"TransformerSentenceEncoder", "TransformerSentenceEncoder",
......
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
class DynamicLinearCombination(nn.Module):
    """Dynamic Linear Combination of Layers (DLCL).

    Feeds each layer a learned, layer-specific weighted combination of all
    preceding layers' outputs (Wang et al., ACL 2019, "Learning Deep
    Transformer Models for Machine Translation"):

        for pre-norm,  x_{l+1} = \\sum_{k=0}^{l}{W_k^{l+1} LN(y_k)}
        for post-norm, x_{l+1} = LN(\\sum_{k=0}^{l}{W_k^{l+1} y_k})

    Protocol: ``push()`` each layer output, ``pop()`` the combined input for
    the next layer, and ``clean()`` to reset state before a new forward pass.
    """

    def __init__(self, args, is_encoder, include_sublayer=False, layer_num=None):
        """
        :param args: model args; reads the DLCL-related fields plus the
            encoder/decoder embed dims, layer counts and normalize_before flags
        :param is_encoder: True -> read encoder_* args, False -> decoder_*
        :param include_sublayer: count every sub-layer as a combination point
            (2 sub-layers per encoder layer, 3 per decoder layer)
        :param layer_num: explicit number of combination points; overrides the
            value derived from args when given
        """
        super(DynamicLinearCombination, self).__init__()
        self.normalize_learned_weight = args.normalize_learned_weight
        self.normalized_weight = None
        self.weight_type = args.weight_type
        self.out_dropout = args.history_dropout
        self.normalize_before = args.encoder_normalize_before if is_encoder else args.decoder_normalize_before
        self.dim = args.encoder_embed_dim if is_encoder else args.decoder_embed_dim

        # transformer encoder has 2 sub-layers, decoder has 3 sub-layers;
        # +1 accounts for the embedding output itself
        if layer_num is None:
            if include_sublayer:
                layer_num = 1 + (2 * args.encoder_layers if is_encoder else 3 * args.decoder_layers)
            else:
                layer_num = 1 + (args.encoder_layers if is_encoder else args.decoder_layers)

        # init weights and corresponding masks
        learnable = args.encoder_learnable if is_encoder else args.decoder_learnable
        self.weight, self.weight_mask = self._init(layer_num, args.init_value, args.weight_type,
                                                   args.history_window_size, learnable)

        # init triangular layer norm; when the embedding is not normalized the
        # first entry is an identity (empty Sequential)
        if args.normalize_embed:
            self.layer_norms = nn.ModuleList([nn.LayerNorm(self.dim) for _ in range(layer_num)])
        else:
            self.layer_norms = nn.ModuleList([nn.Sequential()] + [nn.LayerNorm(self.dim) for _ in range(layer_num-1)])

        # running state: number of layers pushed so far and their outputs
        self.count = 0
        self.layers = []

    @staticmethod
    def _init_mask(n_layer, window_size):
        """Build the L x L lower-triangular visibility mask (1 = usable layer).

        ``window_size == -1`` keeps all preceding layers; otherwise each row
        only sees the most recent ``window_size`` layers.
        """
        mask = np.zeros([n_layer, n_layer], dtype=np.float32)
        # all preceding layers
        if window_size == -1:
            for i in range(mask.shape[0]):
                mask[i, :(i+1)] = 1
        else:
            for i in range(mask.shape[0]):
                mask[i, max(0, i + 1 - window_size): (i+1)] = 1
        return torch.from_numpy(mask)

    @staticmethod
    def _init_weight(np_mask, dim=1, init_value='avg', learnable=True):
        """Create the (L, L, dim) combination-weight parameter from the mask.

        :param init_value: 'avg' -> uniform over the visible layers of each
            row, 'one' -> all ones
        :param learnable: whether the weights receive gradients
        :raises ValueError: on an unknown ``init_value``
        """
        np_weight = np.copy(np_mask)
        if init_value == 'avg':
            np_weight = np_weight / np.sum(np_weight, axis=1, keepdims=True)
        elif init_value == 'one':
            np_weight[:, :] = 1.
        else:
            raise ValueError('unknown init_value:{}'.format(init_value))
        weight_tensor = torch.from_numpy(np_weight).unsqueeze(2)
        if dim > 1:
            weight_tensor = weight_tensor.repeat(1, 1, dim)
        weight_tensor = torch.nn.Parameter(weight_tensor, requires_grad=learnable)
        return weight_tensor

    def _init(self, layer_num, init_value, weight_type, window_size=-1, learnable=True):
        """
        :param layer_num: total layers
        :param init_value: initial weight value ('avg' or 'one')
        :param weight_type: granularity of learned weights (scalar, scalar_X, vector)
        :param window_size: past window size of layers (-1 = unlimited)
        :param learnable: if allow to learn weights
        :raises ValueError: on an unknown ``weight_type``
        :return:
            weight_tensor:
                1. L x L x 1 if weight_type='scalar'
                2. L x L x X if weight_type='scalar_X'
                3. L x L x H if weight_type='vector'
            weight_mask: L x L, 0 means padding
        """
        # L x L
        mask_tensor = self._init_mask(layer_num, window_size)
        if weight_type == 'scalar':
            self.last_dim = 1
        elif weight_type == 'vector':
            self.last_dim = self.dim
        elif weight_type.startswith('scalar_'):
            # X grouped scalars, each shared by H/X channels
            n = int(weight_type.split('_')[1])
            assert self.dim % n == 0
            self.last_dim = n
        else:
            raise ValueError('unknown weight_type:{}'.format(weight_type))
        weight_tensor = self._init_weight(mask_tensor.numpy(), self.last_dim, init_value,
                                          learnable=learnable)
        return weight_tensor, mask_tensor

    def push(self, layer):
        """Record one layer output (T x B x H) for the running combination."""
        self.count += 1
        # first layer: also finalize device placement and (optionally) the
        # softmax-normalized weights for this forward pass
        if self.count == 1:
            self.layers.append(self.layer_norms[0](layer))
            # keep the (non-parameter) mask on the same device as the inputs;
            # device-aware transfer also works on multi-GPU / non-CUDA setups,
            # unlike a blanket .cuda()
            if self.weight_mask.device != layer.device:
                self.weight_mask = self.weight_mask.to(layer.device)
            if self.normalize_learned_weight:
                # masked softmax over the layer axis: padded slots get -inf
                weight = self.weight.masked_fill((self.weight_mask == 0).unsqueeze(2), float('-inf'))
                self.normalized_weight = F.softmax(weight, dim=1)
            return
        # following layers: pre-norm normalizes on the way in; post-norm
        # defers normalization to pop()
        if self.normalize_before:
            layer = self.layer_norms[self.count-1](layer)
        self.layers.append(layer)

    def _pick_weights(self):
        """Select row ``count-1`` of the weight matrix, reshaped for
        broadcasting against the stacked layers: D x 1 x 1 x last_dim."""
        weight = self.normalized_weight if self.normalize_learned_weight else self.weight
        weight = weight[self.count - 1, : self.count, :].view(-1, 1, 1, self.last_dim)
        return weight

    def pop(self):
        """Return the combined input (T x B x H) for the next layer."""
        assert len(self.layers) > 0
        # D x 1 x 1 x [1, H/G, H]
        weights = self._pick_weights()
        # D x T x B x H
        layers = torch.stack(self.layers, 0)
        # linear combination
        if self.weight_type in ['scalar', 'vector']:
            ret = (layers * weights).sum(0)
        else:
            # scalar_X: regroup the H channels so each of the X weights
            # broadcasts over its share of channels
            D, T, B, H = layers.size()
            layers = layers.view(D, T, B, -1, weights.size(-1))
            weights = weights.unsqueeze(3)
            ret = (layers * weights).sum(0).view(T, B, H)
        if self.normalize_before:
            if self.out_dropout > 0:
                return F.dropout(ret, p=self.out_dropout, training=self.training)
            else:
                return ret
        # post-norm: normalize the combined result on the way out
        if self.out_dropout > 0:
            return F.dropout(self.layer_norms[self.count-1](ret), p=self.out_dropout, training=self.training)
        else:
            return self.layer_norms[self.count-1](ret)

    def clean(self):
        """Reset the running state; call before each new forward pass."""
        self.count = 0
        self.layers = []

    def forward(self):
        # DLCL is driven through push()/pop(); forward is intentionally a no-op
        pass
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import torch.nn as nn
class SEAttention(nn.Module):
    """Squeeze-and-Excitation channel attention over (T, B, C) sequences.

    Pools each sequence over time, passes the pooled vector through a small
    bottleneck MLP with a sigmoid gate, and rescales every channel of the
    input by its gate value. Input and output shapes are identical.
    """

    def __init__(self, channel=512, reduction=16):
        super(SEAttention, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool1d(1)
        # bottleneck MLP: C -> C/r -> C, gated by a sigmoid
        hidden = channel // reduction
        self.fc = nn.Sequential(
            nn.Linear(channel, hidden, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(hidden, channel, bias=False),
            nn.Sigmoid()
        )
        self.init_weights()

    def init_weights(self):
        """Xavier-initialize every linear layer (zero its bias if present)."""
        for module in self.modules():
            if not isinstance(module, nn.Linear):
                continue
            nn.init.xavier_uniform_(module.weight)
            if module.bias is not None:
                nn.init.constant_(module.bias, 0)

    def forward(self, x):
        # (T, B, C) -> (B, C, T) for 1-d pooling over the time axis
        feats = x.permute(1, 2, 0)
        batch, channels = feats.size(0), feats.size(1)
        squeezed = self.avg_pool(feats).view(batch, channels)
        # per-channel gates in (0, 1), broadcast back over time
        gates = self.fc(squeezed).view(batch, channels, 1)
        scaled = feats * gates.expand_as(feats)
        # restore the original (T, B, C) layout
        return scaled.permute(2, 0, 1)
...@@ -14,6 +14,7 @@ from fairseq.modules import ( ...@@ -14,6 +14,7 @@ from fairseq.modules import (
RelPositionMultiheadAttention, RelPositionMultiheadAttention,
RelativeMultiheadAttention, RelativeMultiheadAttention,
LocalMultiheadAttention, LocalMultiheadAttention,
SEAttention,
) )
from fairseq.modules.fairseq_dropout import FairseqDropout from fairseq.modules.fairseq_dropout import FairseqDropout
from fairseq.modules.quant_noise import quant_noise from fairseq.modules.quant_noise import quant_noise
...@@ -73,6 +74,10 @@ class TransformerEncoderLayer(nn.Module): ...@@ -73,6 +74,10 @@ class TransformerEncoderLayer(nn.Module):
self.final_layer_norm = LayerNorm(self.embed_dim) self.final_layer_norm = LayerNorm(self.embed_dim)
self.use_se = getattr(args, "squeeze_excitation", False)
if self.use_se:
self.se_attn = SEAttention(self.embed_dim, 16)
def build_fc1(self, input_dim, output_dim, q_noise, qn_block_size): def build_fc1(self, input_dim, output_dim, q_noise, qn_block_size):
return quant_noise( return quant_noise(
nn.Linear(input_dim, output_dim), p=q_noise, block_size=qn_block_size nn.Linear(input_dim, output_dim), p=q_noise, block_size=qn_block_size
...@@ -211,6 +216,11 @@ class TransformerEncoderLayer(nn.Module): ...@@ -211,6 +216,11 @@ class TransformerEncoderLayer(nn.Module):
x = self.activation_fn(self.fc1(x)) x = self.activation_fn(self.fc1(x))
x = self.activation_dropout_module(x) x = self.activation_dropout_module(x)
x = self.fc2(x) x = self.fc2(x)
# use squeeze-and-excitation method
if self.use_se:
x = self.se_attn(x)
x = self.dropout_module(x) x = self.dropout_module(x)
x = self.residual_connection(x, residual) x = self.residual_connection(x, residual)
if not self.normalize_before: if not self.normalize_before:
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论