Commit da4e7dc3 by xuchen

fix the bugs

parent 093098e4
train-subset: train_st
valid-subset: dev_st
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_transformer_s arch: s2t_transformer_s
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
......
train-subset: train
valid-subset: dev
max-epoch: 100
max-update: 100000
patience: 20
best_checkpoint_metric: loss
maximize_best_checkpoint_metric: False
no-epoch-checkpoints: True
#keep-last-epochs: 10
keep-best-checkpoints: 10
num-workers: 8
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
\ No newline at end of file
ctc-weight: 0.3 ctc-weight: 0.3
post-process: sentencepiece
\ No newline at end of file
...@@ -15,6 +15,7 @@ report-accuracy: True ...@@ -15,6 +15,7 @@ report-accuracy: True
#load-pretrained-decoder-from: #load-pretrained-decoder-from:
arch: s2t_transformer_s arch: s2t_transformer_s
#arch: pdss2t_transformer_s
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
clip-norm: 10.0 clip-norm: 10.0
......
arch: pdss2t_transformer_s_8 arch: pdss2t_transformer_s_8
pds-fusion: True
train-subset: train_asr ctc-layer: 12
valid-subset: dev_asr
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
......
arch: pdss2t_transformer_s_8 arch: pdss2t_transformer_s_16
train-subset: train_st encoder-embed-dim: 256
valid-subset: dev_st pds-stages: 4
ctc-layer: 12
max-epoch: 100 pds-layers: 2_2_6_2
max-update: 100000 pds-ratios: 2_2_2_2
pds-fusion: True
num-workers: 8 pds-fusion-method: all_conv
patience: 10 pds-embed-dims: 256_256_256_256
no-progress-bar: True pds-ds-method: conv
log-interval: 100 pds-embed-norm: True
seed: 1 pds-position-embed: 1_1_1_1
report-accuracy: True pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
#load-pretrained-encoder-from: pds-attn-heads: 4_4_4_4
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
......
arch: pdss2t_transformer_s_8 arch: pdss2t_transformer_s_32
train-subset: train_asr encoder-embed-dim: 256
valid-subset: dev_asr pds-stages: 5
ctc-layer: 12
max-epoch: 100 pds-layers: 2_2_3_3_2
max-update: 100000 pds-ratios: 2_2_2_2_2
pds-fusion: True
num-workers: 8 pds-fusion-method: all_conv
patience: 10 pds-embed-dims: 256_256_256_256_256
no-progress-bar: True pds-ds-method: conv
log-interval: 100 pds-embed-norm: True
seed: 1 pds-position-embed: 1_1_1_1_1
report-accuracy: True pds-kernel-sizes: 5_5_5_5_5
pds-ffn-ratios: 8_8_8_8_8
pds-attn-heads: 4_4_4_4_4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
......
arch: pdss2t_transformer_s_8 arch: pdss2t_transformer_s_8
train-subset: train_st encoder-embed-dim: 256
valid-subset: dev_st pds-stages: 4
ctc-layer: 12
max-epoch: 100 pds-layers: 3_3_3_3
max-update: 100000 pds-ratios: 2_2_1_2
pds-fusion: True
num-workers: 8 pds-fusion-method: all_conv
patience: 10 pds-embed-dims: 256_256_256_256
no-progress-bar: True pds-ds-method: conv
log-interval: 100 pds-embed-norm: True
seed: 1 pds-position-embed: 1_1_1_1
report-accuracy: True pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
#load-pretrained-encoder-from: pds-attn-heads: 4_4_4_4
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
......
...@@ -10,6 +10,7 @@ if [ "$#" -eq 1 ]; then ...@@ -10,6 +10,7 @@ if [ "$#" -eq 1 ]; then
exp_name=$1 exp_name=$1
fi fi
cer=1
n_average=10 n_average=10
beam_size=5 beam_size=5
len_penalty=1.0 len_penalty=1.0
...@@ -21,6 +22,7 @@ cmd="./run.sh ...@@ -21,6 +22,7 @@ cmd="./run.sh
--stop_stage 2 --stop_stage 2
--gpu_num ${gpu_num} --gpu_num ${gpu_num}
--exp_name ${exp_name} --exp_name ${exp_name}
--cer ${cer}
--n_average ${n_average} --n_average ${n_average}
--beam_size ${beam_size} --beam_size ${beam_size}
--len_penalty ${len_penalty} --len_penalty ${len_penalty}
......
...@@ -24,31 +24,31 @@ stop_stage=0 ...@@ -24,31 +24,31 @@ stop_stage=0
gpu_num=0 gpu_num=0
update_freq=1 update_freq=1
s2t_dir=~/Code/st root_dir=~/st
root_dir=${s2t_dir}/Fairseq-S2T code_dir=${root_dir}/Fairseq-S2T
pwd_dir=$PWD pwd_dir=$PWD
# dataset # dataset
src_lang=en src_lang=zh
tgt_lang=fr lang=${src_lang}
lang=${src_lang}-${tgt_lang}
dataset=libri_trans dataset=aishell
task=speech_to_text task=speech_to_text
vocab_type=unigram vocab_type=unigram
vocab_size=1000 vocab_type=char
speed_perturb=0 vocab_size=5000
lcrm=1 speed_perturb=1
lcrm=0
tokenizer=0 tokenizer=0
use_raw_audio=1 use_raw_audio=0
use_specific_dict=0 use_specific_dict=0
specific_prefix=st specific_prefix=st
specific_dir=${s2t_dir}/data/mustc/st/en-de specific_dir=${root_dir}/data/mustc/st/en-de
asr_vocab_prefix=spm_unigram10000_st_share asr_vocab_prefix=spm_unigram10000_st_share
org_data_dir=${s2t_dir}/data/${dataset} org_data_dir=${root_dir}/data/${dataset}
data_dir=${s2t_dir}/data/${dataset}/asr data_dir=${root_dir}/data/${dataset}/asr
train_split=train train_split=train
valid_split=dev valid_split=dev
test_split=test test_split=test
...@@ -71,6 +71,7 @@ max_tokens=40000 ...@@ -71,6 +71,7 @@ max_tokens=40000
step_valid=0 step_valid=0
# decoding setting # decoding setting
cer=1
dec_model=checkpoint_best.pt dec_model=checkpoint_best.pt
n_average=10 n_average=10
beam_size=5 beam_size=5
...@@ -96,6 +97,10 @@ if [[ ${use_raw_audio} -eq 1 ]]; then ...@@ -96,6 +97,10 @@ if [[ ${use_raw_audio} -eq 1 ]]; then
data_dir=${data_dir}_raw data_dir=${data_dir}_raw
exp_prefix=${exp_prefix}_raw exp_prefix=${exp_prefix}_raw
fi fi
if [[ "${vocab_type}" == "char" ]]; then
data_dir=${data_dir}_char
exp_prefix=${exp_prefix}_char
fi
. ./local/parse_options.sh || exit 1; . ./local/parse_options.sh || exit 1;
...@@ -106,7 +111,7 @@ if [[ -z ${exp_name} ]]; then ...@@ -106,7 +111,7 @@ if [[ -z ${exp_name} ]]; then
exp_name=${exp_name}_${extra_tag} exp_name=${exp_name}_${extra_tag}
fi fi
fi fi
model_dir=$root_dir/../checkpoints/$dataset/asr/${exp_name} model_dir=$code_dir/../checkpoints/$dataset/asr/${exp_name}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download" echo "stage -1: Data Download"
...@@ -120,13 +125,19 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then ...@@ -120,13 +125,19 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
if [[ ! -e ${data_dir} ]]; then if [[ ! -e ${data_dir} ]]; then
mkdir -p ${data_dir} mkdir -p ${data_dir}
fi fi
if [[ ! -f ${data_dir}/fbank80.zip && -f ${data_dir}/../fbank80.zip ]]; then
ln -s ${data_dir}/../fbank80.zip ${data_dir}
fi
if [[ ! -f ${data_dir}/fbank80_sp.zip && -f ${data_dir}/../fbank80_sp.zip ]]; then
ln -s ${data_dir}/../fbank80_sp.zip ${data_dir}
fi
cmd="python ${root_dir}/examples/speech_to_text/prep_audio_data.py cmd="python ${code_dir}/examples/speech_to_text/prep_audio_data.py
--data-root ${org_data_dir} --data-root ${org_data_dir}
--output-root ${data_dir} --output-root ${data_dir}
--task asr --task asr
--splits ${train_split},${valid_split},${test_split}
--src-lang ${src_lang} --src-lang ${src_lang}
--splits ${valid_split},${test_split},${train_split}
--vocab-type ${vocab_type} --vocab-type ${vocab_type}
--vocab-size ${vocab_size}" --vocab-size ${vocab_size}"
...@@ -135,7 +146,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then ...@@ -135,7 +146,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--raw" --raw"
fi fi
if [[ ${use_specific_dict} -eq 1 ]]; then if [[ ${use_specific_dict} -eq 1 ]]; then
cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}/${lang} cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}
cmd="$cmd cmd="$cmd
--asr-prefix ${asr_vocab_prefix}" --asr-prefix ${asr_vocab_prefix}"
fi fi
...@@ -155,6 +166,15 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then ...@@ -155,6 +166,15 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
echo -e "\033[34mRun command: \n${cmd} \033[0m" echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd} [[ $eval -eq 1 ]] && eval ${cmd}
if [[ ! -f ${data_dir}/../fbank80.zip ]]; then
mv ${data_dir}/fbank80.zip ${data_dir}/..
ln -s ${data_dir}/../fbank80.zip ${data_dir}
fi
if [[ ! -f ${data_dir}/../fbank80_sp.zip ]]; then
mv ${data_dir}/fbank80_sp.zip ${data_dir}/..
ln -s ${data_dir}/../fbank80_sp.zip ${data_dir}
fi
fi fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
...@@ -181,28 +201,26 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ...@@ -181,28 +201,26 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
cp ${BASH_SOURCE[0]} ${model_dir} cp ${BASH_SOURCE[0]} ${model_dir}
cp ${PWD}/train.sh ${model_dir} cp ${PWD}/train.sh ${model_dir}
extra_parameter="${extra_parameter}
--train-config ${pwd_dir}/conf/basis.yaml"
cp ${pwd_dir}/conf/basis.yaml ${model_dir}
config_list="${train_config//,/ }" config_list="${train_config//,/ }"
idx=0 idx=1
for config in ${config_list[@]} for config in ${config_list[@]}
do do
config_path=$pwd_dir/conf/${config}.yaml config_path=${pwd_dir}/conf/${config}.yaml
if [[ ! -f ${config_path} ]]; then if [[ ! -f ${config_path} ]]; then
echo "No config file ${config_path}" echo "No config file ${config_path}"
exit exit
fi fi
cp ${config_path} ${model_dir} cp ${config_path} ${model_dir}
if [[ idx -eq 0 ]]; then extra_parameter="${extra_parameter}
extra_parameter="${extra_parameter}
--train-config ${config_path}"
else
extra_parameter="${extra_parameter}
--train-config${idx} ${config_path}" --train-config${idx} ${config_path}"
fi
idx=$((idx + 1)) idx=$((idx + 1))
done done
cmd="python3 -u ${root_dir}/fairseq_cli/train.py cmd="python3 -u ${code_dir}/fairseq_cli/train.py
${data_dir} ${data_dir}
--config-yaml ${data_config} --config-yaml ${data_config}
--task ${task} --task ${task}
...@@ -286,12 +304,14 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then ...@@ -286,12 +304,14 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# Average models # Average models
dec_model=avg_${n_average}_checkpoint.pt dec_model=avg_${n_average}_checkpoint.pt
cmd="python ${root_dir}/scripts/average_checkpoints.py if [[ ! -f ${model_dir}/${dec_model} ]]; then
--inputs ${model_dir} cmd="python ${code_dir}/scripts/average_checkpoints.py
--num-epoch-checkpoints ${n_average} --inputs ${model_dir}
--output ${model_dir}/${dec_model}" --num-best-checkpoints ${n_average}
echo -e "\033[34mRun command: \n${cmd} \033[0m" --output ${model_dir}/${dec_model}"
[[ $eval -eq 1 ]] && eval $cmd echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval $cmd
fi
else else
dec_model=${dec_model} dec_model=${dec_model}
fi fi
...@@ -311,8 +331,8 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then ...@@ -311,8 +331,8 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
test_subset=${test_subset//,/ } test_subset=${test_subset//,/ }
for subset in ${test_subset[@]}; do for subset in ${test_subset[@]}; do
subset=${subset}_asr subset=${subset}
cmd="python ${root_dir}/fairseq_cli/generate.py cmd="python ${code_dir}/fairseq_cli/generate.py
${data_dir} ${data_dir}
--config-yaml ${data_config} --config-yaml ${data_config}
--gen-subset ${subset} --gen-subset ${subset}
...@@ -323,10 +343,13 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then ...@@ -323,10 +343,13 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--beam ${beam_size} --beam ${beam_size}
--lenpen ${len_penalty} --lenpen ${len_penalty}
--scoring wer --scoring wer
--wer-tokenizer 13a
--wer-lowercase
--wer-remove-punct
" "
if [[ ${cer} -eq 1 ]]; then
cmd="${cmd}
--wer-char-level"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m" echo -e "\033[34mRun command: \n${cmd} \033[0m"
if [[ $eval -eq 1 ]]; then if [[ $eval -eq 1 ]]; then
......
set -e set -e
eval=1 eval=1
lcrm=0
root_dir=~/st/Fairseq-S2T root_dir=~/st/Fairseq-S2T
data_dir=/home/xuchen/st/data/wmt/test data_dir=/home/xuchen/st/data/wmt/test
......
train-subset: train
valid-subset: valid
max-epoch: 50
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: transformer arch: transformer
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
...@@ -28,19 +11,22 @@ adam_betas: (0.9,0.997) ...@@ -28,19 +11,22 @@ adam_betas: (0.9,0.997)
criterion: label_smoothed_cross_entropy criterion: label_smoothed_cross_entropy
label_smoothing: 0.1 label_smoothing: 0.1
dropout: 0.1 dropout: 0.3
attention-dropout: 0.1 attention-dropout: 0.0
activation-dropout: 0.1 activation-dropout: 0.0
activation-fn: relu activation-fn: relu
encoder-normalize-before: True encoder-normalize-before: True
decoder-normalize-before: True decoder-normalize-before: True
encoder-embed-dim: 512 encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048 encoder-ffn-embed-dim: 1024
encoder-layers: 6 encoder-layers: 6
decoder-layers: 6 decoder-layers: 6
encoder-attention-heads: 8 encoder-attention-heads: 4
decoder-embed-dim: 512 decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 1024
decoder-attention-heads: 8 decoder-attention-heads: 4
load-pretrained-encoder-from:
load-pretrained-decoder-from:
\ No newline at end of file
arch: transformer_iwslt_de_en
share-decoder-input-output-embed: True
optimizer: adam
#clip-norm: 10.0
lr-scheduler: inverse_sqrt
weight-decay: 0.0001
warmup-init-lr: 1e-7
warmup-updates: 4000
lr: 5e-4
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy
label_smoothing: 0.1
dropout: 0.3
activation-fn: relu
encoder-normalize-before: False
decoder-normalize-before: False
encoder-embed-dim: 512
encoder-ffn-embed-dim: 1024
encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 512
decoder-ffn-embed-dim: 1024
decoder-attention-heads: 4
load-pretrained-encoder-from:
load-pretrained-decoder-from:
\ No newline at end of file
train-subset: train
valid-subset: valid
max-epoch: 50
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: transformer arch: transformer
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
...@@ -28,19 +11,22 @@ adam_betas: (0.9,0.997) ...@@ -28,19 +11,22 @@ adam_betas: (0.9,0.997)
criterion: label_smoothed_cross_entropy criterion: label_smoothed_cross_entropy
label_smoothing: 0.1 label_smoothing: 0.1
dropout: 0.1 dropout: 0.3
attention-dropout: 0.1 attention-dropout: 0.0
activation-dropout: 0.1 activation-dropout: 0.0
activation-fn: relu activation-fn: relu
encoder-normalize-before: True encoder-normalize-before: True
decoder-normalize-before: True decoder-normalize-before: True
encoder-embed-dim: 256 encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048 encoder-ffn-embed-dim: 1024
encoder-layers: 6 encoder-layers: 6
decoder-layers: 6 decoder-layers: 6
encoder-attention-heads: 4 encoder-attention-heads: 4
decoder-embed-dim: 256 decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 1024
decoder-attention-heads: 4 decoder-attention-heads: 4
load-pretrained-encoder-from:
load-pretrained-decoder-from:
\ No newline at end of file
train-subset: train
valid-subset: valid
max-epoch: 50
max-update: 50000
patience: 20
best_checkpoint_metric: loss
maximize_best_checkpoint_metric: False
no-epoch-checkpoints: True
#keep-last-epochs: 10
keep-best-checkpoints: 10
num-workers: 8
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
\ No newline at end of file
...@@ -2,4 +2,4 @@ ...@@ -2,4 +2,4 @@
encoder-attention-type: relative encoder-attention-type: relative
decoder-attention-type: relative decoder-attention-type: relative
max-encoder-relative-length: 20 max-encoder-relative-length: 20
max-decoder-relative-length: 20 max-decoder-relative-length: 20
\ No newline at end of file
...@@ -10,6 +10,7 @@ if [ "$#" -eq 1 ]; then ...@@ -10,6 +10,7 @@ if [ "$#" -eq 1 ]; then
exp_name=$1 exp_name=$1
fi fi
sacrebleu=0
n_average=10 n_average=10
beam_size=5 beam_size=5
len_penalty=1.0 len_penalty=1.0
...@@ -21,6 +22,7 @@ cmd="./run.sh ...@@ -21,6 +22,7 @@ cmd="./run.sh
--stop_stage 2 --stop_stage 2
--gpu_num ${gpu_num} --gpu_num ${gpu_num}
--exp_name ${exp_name} --exp_name ${exp_name}
--sacrebleu ${sacrebleu}
--n_average ${n_average} --n_average ${n_average}
--beam_size ${beam_size} --beam_size ${beam_size}
--len_penalty ${len_penalty} --len_penalty ${len_penalty}
......
...@@ -13,7 +13,7 @@ set -o pipefail ...@@ -13,7 +13,7 @@ set -o pipefail
export PYTHONIOENCODING=UTF-8 export PYTHONIOENCODING=UTF-8
eval=1 eval=1
time=$(date "+%m%d_%H%M") time=$(date "+%m%d")
stage=0 stage=0
stop_stage=0 stop_stage=0
...@@ -24,33 +24,34 @@ device=() ...@@ -24,33 +24,34 @@ device=()
gpu_num=8 gpu_num=8
update_freq=1 update_freq=1
root_dir=~/st/Fairseq-S2T root_dir=~/st
code_dir=${root_dir}/Fairseq-S2T
pwd_dir=$PWD pwd_dir=$PWD
# dataset # dataset
src_lang=en src_lang=de
tgt_lang=de tgt_lang=en
lang=${src_lang}-${tgt_lang} lang=${src_lang}-${tgt_lang}
dataset=mt dataset=iwslt14
task=translation task=translation
vocab_type=unigram vocab_type=unigram
vocab_size=10000 vocab_size=10000
share_dict=1 share_dict=1
lcrm=0 lcrm=0
tokenizer=0 tokenizer=1
use_specific_dict=0 use_specific_dict=0
specific_prefix=st specific_prefix=st
specific_dir=/home/xuchen/st/data/mustc/st/en-de/ specific_dir=${root_dir}/data/mustc/st/en-de/
src_vocab_prefix=spm_unigram10000_st_share src_vocab_prefix=spm_unigram10000_st_share
tgt_vocab_prefix=spm_unigram10000_st_share tgt_vocab_prefix=spm_unigram10000_st_share
org_data_dir=~/st/data/${dataset} org_data_dir=${root_dir}/data/${dataset}
data_dir=~/st/data/${dataset}/mt/${lang} data_dir=${root_dir}/data/${dataset}/mt
train_subset=train train_subset=train
valid_subset=dev valid_subset=dev
trans_subset=tst-COMMON trans_subset=test
test_subset=test test_subset=test
# exp # exp
...@@ -70,6 +71,7 @@ step_valid=0 ...@@ -70,6 +71,7 @@ step_valid=0
bleu_valid=0 bleu_valid=0
# decoding setting # decoding setting
sacrebleu=0
dec_model=checkpoint_best.pt dec_model=checkpoint_best.pt
n_average=10 n_average=10
beam_size=5 beam_size=5
...@@ -80,13 +82,19 @@ if [[ ${use_specific_dict} -eq 1 ]]; then ...@@ -80,13 +82,19 @@ if [[ ${use_specific_dict} -eq 1 ]]; then
data_dir=${data_dir}/${specific_prefix} data_dir=${data_dir}/${specific_prefix}
mkdir -p ${data_dir} mkdir -p ${data_dir}
else else
data_dir=${data_dir}/${vocab_type}${vocab_size} if [[ "${vocab_type}" == "char" ]]; then
src_vocab_prefix=spm_${vocab_type}${vocab_size}_${src_lang} vocab_name=${vocab_type}
tgt_vocab_prefix=spm_${vocab_type}${vocab_size}_${tgt_lang} exp_prefix=${exp_prefix}_${vocab_type}
else
vocab_name=${vocab_type}${vocab_size}
fi
data_dir=${data_dir}/${vocab_name}
src_vocab_prefix=spm_${vocab_name}_${src_lang}
tgt_vocab_prefix=spm_${vocab_name}_${tgt_lang}
if [[ $share_dict -eq 1 ]]; then if [[ $share_dict -eq 1 ]]; then
data_dir=${data_dir}_share data_dir=${data_dir}_share
src_vocab_prefix=spm_${vocab_type}${vocab_size}_share src_vocab_prefix=spm_${vocab_name}_share
tgt_vocab_prefix=spm_${vocab_type}${vocab_size}_share tgt_vocab_prefix=spm_${vocab_name}_share
fi fi
fi fi
if [[ ${lcrm} -eq 1 ]]; then if [[ ${lcrm} -eq 1 ]]; then
...@@ -111,7 +119,7 @@ if [[ -z ${exp_name} ]]; then ...@@ -111,7 +119,7 @@ if [[ -z ${exp_name} ]]; then
exp_name=${exp_name}_${extra_tag} exp_name=${exp_name}_${extra_tag}
fi fi
fi fi
model_dir=$root_dir/../checkpoints/$dataset/mt/${exp_name} model_dir=$code_dir/../checkpoints/$dataset/mt/${exp_name}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download" echo "stage -1: Data Download"
...@@ -127,7 +135,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then ...@@ -127,7 +135,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
if [[ ! -f ${data_dir}/${src_vocab_prefix}.txt || ! -f ${data_dir}/${tgt_vocab_prefix}.txt ]]; then if [[ ! -f ${data_dir}/${src_vocab_prefix}.txt || ! -f ${data_dir}/${tgt_vocab_prefix}.txt ]]; then
if [[ ${use_specific_dict} -eq 0 ]]; then if [[ ${use_specific_dict} -eq 0 ]]; then
cmd="python ${root_dir}/examples/speech_to_text/prep_mt_data.py cmd="python ${code_dir}/examples/speech_to_text/prep_mt_data.py
--data-root ${org_data_dir} --data-root ${org_data_dir}
--output-root ${data_dir} --output-root ${data_dir}
--splits ${train_subset},${valid_subset},${trans_subset} --splits ${train_subset},${valid_subset},${trans_subset}
...@@ -150,9 +158,14 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then ...@@ -150,9 +158,14 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
mkdir -p ${data_dir}/data mkdir -p ${data_dir}/data
for split in ${train_subset} ${valid_subset} ${trans_subset}; do for split in ${train_subset} ${valid_subset} ${trans_subset}; do
{ {
cmd="cat ${org_data_dir}/${lang}/data/${split}/txt/${split}.${src_lang}" if [[ -d ${org_data_dir}/data/${split}/txt ]]; then
txt_dir=${org_data_dir}/data/${split}/txt
else
txt_dir=${org_data_dir}/data/${split}
fi
cmd="cat ${txt_dir}/${split}.${src_lang}"
if [[ ${lcrm} -eq 1 ]]; then if [[ ${lcrm} -eq 1 ]]; then
cmd="python local/lower_rm.py ${org_data_dir}/${lang}/data/${split}.${src_lang}" cmd="python local/lower_rm.py ${org_data_dir}/data/${split}.${src_lang}"
fi fi
cmd="${cmd} cmd="${cmd}
| spm_encode --model ${data_dir}/${src_vocab_prefix}.model | spm_encode --model ${data_dir}/${src_vocab_prefix}.model
...@@ -165,7 +178,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then ...@@ -165,7 +178,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
cmd="spm_encode cmd="spm_encode
--model ${data_dir}/${tgt_vocab_prefix}.model --model ${data_dir}/${tgt_vocab_prefix}.model
--output_format=piece --output_format=piece
< ${org_data_dir}/${lang}/data/${split}.${tgt_lang} < ${txt_dir}/${split}.${tgt_lang}
> ${data_dir}/data/${split}.${tgt_lang}" > ${data_dir}/data/${split}.${tgt_lang}"
echo -e "\033[34mRun command: \n${cmd} \033[0m" echo -e "\033[34mRun command: \n${cmd} \033[0m"
...@@ -174,7 +187,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then ...@@ -174,7 +187,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
done done
wait wait
cmd="python ${root_dir}/fairseq_cli/preprocess.py cmd="python ${code_dir}/fairseq_cli/preprocess.py
--source-lang ${src_lang} --target-lang ${tgt_lang} --source-lang ${src_lang} --target-lang ${tgt_lang}
--trainpref ${data_dir}/data/${train_subset} --trainpref ${data_dir}/data/${train_subset}
--validpref ${data_dir}/data/${valid_subset} --validpref ${data_dir}/data/${valid_subset}
...@@ -214,28 +227,26 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ...@@ -214,28 +227,26 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
cp ${BASH_SOURCE[0]} ${model_dir} cp ${BASH_SOURCE[0]} ${model_dir}
cp ${PWD}/train.sh ${model_dir} cp ${PWD}/train.sh ${model_dir}
extra_parameter="${extra_parameter}
--train-config ${pwd_dir}/conf/basis.yaml"
cp ${pwd_dir}/conf/basis.yaml ${model_dir}
config_list="${train_config//,/ }" config_list="${train_config//,/ }"
idx=0 idx=1
for config in ${config_list[@]} for config in ${config_list[@]}
do do
config_path=$pwd_dir/conf/${config}.yaml config_path=${pwd_dir}/conf/${config}.yaml
if [[ ! -f ${config_path} ]]; then if [[ ! -f ${config_path} ]]; then
echo "No config file ${config_path}" echo "No config file ${config_path}"
exit exit
fi fi
cp ${config_path} ${model_dir} cp ${config_path} ${model_dir}
if [[ idx -eq 0 ]]; then extra_parameter="${extra_parameter}
extra_parameter="${extra_parameter}
--train-config ${config_path}"
else
extra_parameter="${extra_parameter}
--train-config${idx} ${config_path}" --train-config${idx} ${config_path}"
fi
idx=$((idx + 1)) idx=$((idx + 1))
done done
cmd="python3 -u ${root_dir}/fairseq_cli/train.py cmd="python3 -u ${code_dir}/fairseq_cli/train.py
${data_dir} ${data_dir}
--source-lang ${src_lang} --source-lang ${src_lang}
--target-lang ${tgt_lang} --target-lang ${tgt_lang}
...@@ -263,13 +274,9 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ...@@ -263,13 +274,9 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
if [[ $step_valid -eq 1 ]]; then if [[ $step_valid -eq 1 ]]; then
validate_interval=1 validate_interval=1
save_interval=1 save_interval=1
keep_last_epochs=10
no_epoch_checkpoints=0 no_epoch_checkpoints=0
save_interval_updates=500 save_interval_updates=500
keep_interval_updates=10 keep_interval_updates=10
else
validate_interval=1
keep_last_epochs=10
fi fi
if [[ $bleu_valid -eq 1 ]]; then if [[ $bleu_valid -eq 1 ]]; then
cmd="$cmd cmd="$cmd
...@@ -292,10 +299,6 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ...@@ -292,10 +299,6 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
cmd="${cmd} cmd="${cmd}
--save-interval $save_interval " --save-interval $save_interval "
fi fi
if [[ -n $keep_last_epochs ]]; then
cmd="${cmd}
--keep-last-epochs $keep_last_epochs "
fi
if [[ -n $save_interval_updates ]]; then if [[ -n $save_interval_updates ]]; then
cmd="${cmd} cmd="${cmd}
--save-interval-updates $save_interval_updates" --save-interval-updates $save_interval_updates"
...@@ -329,9 +332,9 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then ...@@ -329,9 +332,9 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# Average models # Average models
dec_model=avg_${n_average}_checkpoint.pt dec_model=avg_${n_average}_checkpoint.pt
cmd="python ${root_dir}/scripts/average_checkpoints.py cmd="python ${code_dir}/scripts/average_checkpoints.py
--inputs ${model_dir} --inputs ${model_dir}
--num-epoch-checkpoints ${n_average} --num-best-checkpoints ${n_average}
--output ${model_dir}/${dec_model}" --output ${model_dir}/${dec_model}"
echo -e "\033[34mRun command: \n${cmd} \033[0m" echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval $cmd [[ $eval -eq 1 ]] && eval $cmd
...@@ -354,7 +357,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then ...@@ -354,7 +357,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
test_subset=(${test_subset//,/ }) test_subset=(${test_subset//,/ })
for subset in ${test_subset[@]}; do for subset in ${test_subset[@]}; do
cmd="python ${root_dir}/fairseq_cli/generate.py cmd="python ${code_dir}/fairseq_cli/generate.py
${data_dir} ${data_dir}
--source-lang ${src_lang} --source-lang ${src_lang}
--target-lang ${tgt_lang} --target-lang ${tgt_lang}
...@@ -365,14 +368,17 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then ...@@ -365,14 +368,17 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--max-tokens ${max_tokens} --max-tokens ${max_tokens}
--beam ${beam_size} --beam ${beam_size}
--lenpen ${len_penalty} --lenpen ${len_penalty}
--post-process sentencepiece --post-process sentencepiece"
--scoring sacrebleu"
if [[ ${tokenizer} -eq 1 ]]; then if [[ ${sacrebleu} -eq 1 ]]; then
cmd="${cmd} cmd="${cmd}
--scoring sacrebleu"
if [[ ${tokenizer} -eq 1 ]]; then
cmd="${cmd}
--tokenizer moses --tokenizer moses
--moses-source-lang ${src_lang} --moses-source-lang ${src_lang}
--moses-target-lang ${tgt_lang}" --moses-target-lang ${tgt_lang}"
fi
fi fi
echo -e "\033[34mRun command: \n${cmd} \033[0m" echo -e "\033[34mRun command: \n${cmd} \033[0m"
......
# Training configuration for arch pdss2t_transformer_s_16 on the train_asr / dev_asr subsets.
arch: pdss2t_transformer_s_16
encoder-embed-dim: 256
# pyramid-* options. Every underscore-separated list below has 4 entries,
# matching pyramid-stages (presumably one value per stage -- confirm in the model code).
pyramid-stages: 4
#pyramid-dropout: 0
# The entries sum to 12, the same value as encoder-layers further down.
pyramid-layers: 2_2_6_2
# The product of the ratios is 16, matching the numeric suffix of the arch name
# (presumably the overall down-sampling factor -- confirm).
pyramid-ratios: 2_2_2_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4
# Data subsets, stopping limits, logging and seed.
train-subset: train_asr
valid-subset: dev_asr
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
# Pretrained checkpoint options are commented out, so this file sets neither.
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True
# Optimizer and learning-rate schedule.
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
# adam_betas is commented out, so this file does not set it.
#adam_betas: (0.9,0.98)
# Training criterion and regularisation.
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
# Encoder / decoder sizes.
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
# Training configuration for arch pdss2t_transformer_s_32 on the train_asr / dev_asr subsets.
arch: pdss2t_transformer_s_32
encoder-embed-dim: 256
# pyramid-* options. Every underscore-separated list below has 5 entries,
# matching pyramid-stages (presumably one value per stage -- confirm in the model code).
pyramid-stages: 5
#pyramid-dropout: 0
# The entries sum to 12, the same value as encoder-layers further down.
pyramid-layers: 2_2_3_3_2
# The product of the ratios is 32, matching the numeric suffix of the arch name
# (presumably the overall down-sampling factor -- confirm).
pyramid-ratios: 2_2_2_2_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1_1
pyramid-kernel-sizes: 5_5_5_5_5
pyramid-ffn-ratios: 8_8_8_8_8
pyramid-attn-heads: 4_4_4_4_4
# Data subsets, stopping limits, logging and seed.
train-subset: train_asr
valid-subset: dev_asr
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
# Pretrained checkpoint options are commented out, so this file sets neither.
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True
# Optimizer and learning-rate schedule.
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
# adam_betas is commented out, so this file does not set it.
#adam_betas: (0.9,0.98)
# Training criterion and regularisation.
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
# Encoder / decoder sizes.
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
# Training configuration for arch pdss2t_transformer_s_8 on the train_asr / dev_asr subsets.
arch: pdss2t_transformer_s_8
encoder-embed-dim: 256
# pyramid-* options. Every underscore-separated list below has 4 entries,
# matching pyramid-stages (presumably one value per stage -- confirm in the model code).
pyramid-stages: 4
#pyramid-dropout: 0
# The entries sum to 12, the same value as encoder-layers further down.
pyramid-layers: 3_3_3_3
# The third ratio is 1, so the product of the ratios is 8, matching the numeric
# suffix of the arch name (presumably the overall down-sampling factor -- confirm).
pyramid-ratios: 2_2_1_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4
# Data subsets, stopping limits, logging and seed.
train-subset: train_asr
valid-subset: dev_asr
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
# Pretrained checkpoint options are commented out, so this file sets neither.
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True
# Optimizer and learning-rate schedule.
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
# adam_betas is commented out, so this file does not set it.
#adam_betas: (0.9,0.98)
# Training criterion and regularisation.
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
# Encoder / decoder sizes.
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
# Prepare speech-translation test data: copy the vocabulary files (spm_* prefix)
# into the language-pair directory, drop any stale fbank80.zip, then build and
# optionally run the prep_st_data.py command for the selected splits.
set -e

# eval=1 runs the generated command; any other value only prints it.
eval=1
# lcrm=1 adds --lowercase-src and --rm-punc-src to the command.
lcrm=1
# tokenizer=1 adds --tokenizer to the command.
tokenizer=0

root_dir=~/st/Fairseq-S2T
data_dir=/home/xuchen/st/data/test
vocab_dir=/home/xuchen/st/data/mustc/st_lcrm/en-de
asr_vocab_prefix=spm_unigram10000_st_share
st_vocab_prefix=spm_unigram10000_st_share

src_lang=en
tgt_lang=de

# Split names are joined with "_" into the single value passed to --splits.
split_list=(2019)
splits=$(IFS=_; echo "${split_list[*]}")

lang_dir="${data_dir}/${src_lang}-${tgt_lang}"

# Fail early with a clear message. Without this check, a missing directory makes
# cp either abort with an obscure error or, when the glob matches a single file,
# silently create a regular file named like the directory.
if [[ ! -d "${lang_dir}" ]]; then
echo "No data directory ${lang_dir}" >&2
exit 1
fi

# The glob suffix stays outside the quotes so it is still expanded.
cp -r "${vocab_dir}/${asr_vocab_prefix}".* "${lang_dir}"
cp -r "${vocab_dir}/${st_vocab_prefix}".* "${lang_dir}"

# Remove fbank80.zip from the language-pair directory before running the
# preparation command.
rm -rf "${lang_dir}/fbank80.zip"

cmd="python ${root_dir}/examples/speech_to_text/prep_st_data.py
--data-root ${data_dir}
--output-root ${data_dir}
--splits ${splits}
--task st
--src-lang ${src_lang}
--tgt-lang ${tgt_lang}
--add-src
--share
--asr-prefix ${asr_vocab_prefix}
--st-spm-prefix ${st_vocab_prefix}
--cmvn-type utterance"

if [[ ${lcrm} -eq 1 ]]; then
cmd="$cmd
--lowercase-src
--rm-punc-src"
fi
if [[ ${tokenizer} -eq 1 ]]; then
cmd="$cmd
--tokenizer"
fi

echo -e "\033[34mRun command: \n${cmd} \033[0m"
# Use an explicit if: with "[[ ... ]] && eval" as the last command, the script
# would exit with status 1 whenever eval is not 1, although nothing failed.
# ${cmd} is intentionally left unquoted so that word splitting folds the
# multi-line string into one command line before eval parses it.
if [[ $eval -eq 1 ]]; then
eval ${cmd}
fi
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 31
ctc-weight: 0.3
\ No newline at end of file
use-enc-dlcl: True
use-dec-dlcl: True
encoder-attention-type: local
hard-mask-window: 0
gauss-mask-sigma: 3
init-mask-weight: 0
\ No newline at end of file
# Training configuration for arch pdss2t_transformer_s_16 on the train_st / dev_st subsets.
arch: pdss2t_transformer_s_16
encoder-embed-dim: 256
# pyramid-* options. Every underscore-separated list below has 4 entries,
# matching pyramid-stages (presumably one value per stage -- confirm in the model code).
pyramid-stages: 4
#pyramid-dropout: 0
# The entries sum to 12, the same value as encoder-layers further down.
pyramid-layers: 2_2_6_2
# The product of the ratios is 16, matching the numeric suffix of the arch name
# (presumably the overall down-sampling factor -- confirm).
pyramid-ratios: 2_2_2_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4
# Data subsets, stopping limits, logging and seed.
train-subset: train_st
valid-subset: dev_st
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
# Pretrained checkpoint options are commented out, so this file sets neither.
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True
# Optimizer and learning-rate schedule.
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
# adam_betas is commented out, so this file does not set it.
#adam_betas: (0.9,0.98)
# Training criterion and regularisation.
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
# Encoder / decoder sizes.
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
# Training configuration for the pdss2t_transformer_s_32 architecture.
# NOTE(review): other configs touched by the same change spell these options
# `pds-*` instead of `pyramid-*` -- confirm which prefix this file should use.
arch: pdss2t_transformer_s_32
encoder-embed-dim: 256
# Five stages; every per-stage option below carries one "_"-separated value
# per stage.
pyramid-stages: 5
#pyramid-dropout: 0
# 2+2+3+3+2 = 12, the same total as encoder-layers below.
pyramid-layers: 2_2_3_3_2
# 2*2*2*2*2 = 32, matching the "_32" suffix of arch (presumably the overall
# down-sampling factor -- confirm).
pyramid-ratios: 2_2_2_2_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1_1
pyramid-kernel-sizes: 5_5_5_5_5
pyramid-ffn-ratios: 8_8_8_8_8
pyramid-attn-heads: 4_4_4_4_4
# Data subsets and run control.
train-subset: train_st
valid-subset: dev_st
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
# Optional pretrained components (disabled).
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True
# Optimizer and learning-rate schedule.
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
#adam_betas: (0.9,0.98)
# Training criterion.
# NOTE(review): no ctc-weight is set in this file although the criterion name
# ends in _with_ctc -- confirm it is supplied by another config.
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
# Regularisation and layer sizes.
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
# Training configuration for the pdss2t_transformer_s_8 architecture.
# NOTE(review): other configs touched by the same change spell these options
# `pds-*` instead of `pyramid-*` -- confirm which prefix this file should use.
arch: pdss2t_transformer_s_8
encoder-embed-dim: 256
# Four stages; every per-stage option below carries one "_"-separated value
# per stage.
pyramid-stages: 4
#pyramid-dropout: 0
# 3+3+3+3 = 12, the same total as encoder-layers below.
pyramid-layers: 3_3_3_3
# 2*2*1*2 = 8, matching the "_8" suffix of arch (presumably the overall
# down-sampling factor -- confirm); the third stage uses ratio 1.
pyramid-ratios: 2_2_1_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4
# Data subsets and run control.
train-subset: train_st
valid-subset: dev_st
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
# Optional pretrained components (disabled).
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True
# Optimizer and learning-rate schedule.
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
#adam_betas: (0.9,0.98)
# Training criterion.
# NOTE(review): no ctc-weight is set in this file although the criterion name
# ends in _with_ctc -- confirm it is supplied by another config.
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
# Regularisation and layer sizes.
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
# Active setting: the "rel_selfattn" encoder attention type.
encoder-attention-type: rel_selfattn
# Disabled alternative: "relative" attention on encoder and decoder with
# explicit maximum relative lengths (100 / 20).
#encoder-attention-type: relative
#decoder-attention-type: relative
#max-encoder-relative-length: 100
#max-decoder-relative-length: 20
# Training configuration for the s2t_sate architecture.

# Data subsets and run control.
train-subset: train_st
valid-subset: dev_st
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
# Optional pretrained components (disabled).
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_sate
share-decoder-input-output-embed: True
# Optimizer and learning-rate schedule.
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
#adam_betas: (0.9,0.98)
# Training criterion and the weight of its CTC term.
ctc-weight: 0.3
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
encoder-normalize-before: True
decoder-normalize-before: True
conv-kernel-sizes: 5,5
conv-channels: 1024
dropout: 0.1
activation-fn: relu
# encoder-embed-dim is defined once here; the second, identical definition
# that used to sit in front of the pyramid-* options was a duplicate mapping
# key and has been removed.
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
#macaron-style: True
#use-cnn-module: True
#cnn-module-kernel: 31
# NOTE(review): the acoustic encoder is set to "transformer", so the
# pyramid-* options below are presumably only read when it is switched to the
# commented-out "pds" value -- confirm against the model code.
#acoustic-encoder: pds
acoustic-encoder: transformer
adapter: league
# Per-stage options: one "_"-separated value for each of the four stages.
pyramid-stages: 4
#pyramid-dropout: 0
pyramid-layers: 3_3_3_3
pyramid-ratios: 2_2_1_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#! /bin/bash

# Decode a trained model by running stage 2 of run.sh.
# Usage: decode.sh [exp_name]
#   exp_name  optional experiment name; when omitted, --exp_name is not passed
#             and run.sh falls back to its own value.

gpu_num=1
data_dir=
test_subset=(tst-COMMON)

exp_name=
if [ "$#" -eq 1 ]; then
    exp_name=$1
fi

n_average=10
beam_size=5
len_penalty=1.0
max_tokens=80000
dec_model=checkpoint_best.pt

cmd="./run.sh
    --stage 2
    --stop_stage 2
    --gpu_num ${gpu_num}
    --n_average ${n_average}
    --beam_size ${beam_size}
    --len_penalty ${len_penalty}
    --max_tokens ${max_tokens}
    --dec_model ${dec_model}
    "

# Only pass --exp_name when it has a value. The option parser sourced by
# run.sh consumes exactly two words per option, so an empty value would make
# it take the following option name as the experiment name and stop parsing.
if [[ -n ${exp_name} ]]; then
    cmd="$cmd --exp_name ${exp_name}"
fi
if [[ -n ${data_dir} ]]; then
    cmd="$cmd --data_dir ${data_dir}"
fi
# Pass the test subsets as one comma-separated value, but only when the list
# is non-empty (the previous "-eq 0" test was inverted and never passed them).
if [[ ${#test_subset[@]} -ne 0 ]]; then
    subsets=$(echo ${test_subset[*]} | sed 's/ /,/g')
    cmd="$cmd --test_subset ${subsets}"
fi

echo $cmd
eval $cmd
# Ensemble decoding: combine the checkpoints of several experiments and run
# fairseq generate.py with all of them on the requested test subsets.
#
# Usage: <this script> MODEL_LIST SET TEST_SUBSET
#   MODEL_LIST   text file listing experiment directory names (relative to
#                $ckpt), separated by whitespace
#   SET          name of the test-set directory under /media/data/tst
#   TEST_SUBSET  subset name(s); "_st" is appended to each before decoding
set -e

# Fail early with a usage message: without a model list, `cat` would wait on
# stdin, and without the other two arguments nothing would be decoded.
if [[ $# -lt 3 ]]; then
    echo "Usage: $0 model_list_file set test_subset" >&2
    exit 1
fi

gpu_num=1

root_dir=/home/xuchen/st/Fairseq-S2T
ckpt=/home/xuchen/st/checkpoints/mustc-v2/st

model_txt=$1
set=$2
test_subset=$3

if [[ ! -r $model_txt ]]; then
    echo "$0: cannot read model list '$model_txt'" >&2
    exit 1
fi

#data_dir=/home/xuchen/st/data/mustc-v2/st_lcrm/en-de
#test_subset=(tst-COMMON)

data_dir=/media/data/tst/$set/en-de
#test_subset=(office)
#test_subset=(webrtc1)
#test_subset=(adap2)

data_config=config_st_share.yaml
result_file=./result

beam_size=5
lenpen=0.6
max_tokens=10000

# Resolve one checkpoint per listed experiment: prefer the 10-checkpoint
# average and fall back to checkpoint_best.pt.
models=()
for line in `cat $model_txt`; do
    model_dir=$ckpt/$line
    [[ ! -d $model_dir ]] && echo $model_dir && exit 1;

    if [[ -f $model_dir/avg_10_checkpoint.pt ]]; then
        model=$model_dir/avg_10_checkpoint.pt
    else
        model=$model_dir/checkpoint_best.pt
    fi
    [[ ! -f $model ]] && echo $model && exit 1;

    models+=($model)
done
# generate.py receives the ensemble as a single ":"-separated --path value.
models=`echo ${models[*]} | sed 's/ /:/g'`

# Use the first unused numbered directory under $ckpt/ensemble/$set.
res_dir=$ckpt/ensemble/$set
i=0
while :
do
    if [[ -d $res_dir/$i ]]; then
        i=$((i + 1))
    else
        res_dir=$res_dir/$i
        break
    fi
done

mkdir -p $res_dir
# Keep a copy of the model list next to the results.
cp $model_txt $res_dir

# Pick devices unless the caller already provided $device.
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
    if [[ ${gpu_num} -eq 0 ]]; then
        device=()
    else
        source ./local/utils.sh
        device=$(get_devices $gpu_num 0)
    fi
fi
export CUDA_VISIBLE_DEVICES=${device}

for subset in ${test_subset[@]}; do
    subset=${subset}_st
    cmd="python ${root_dir}/fairseq_cli/generate.py
    ${data_dir}
    --config-yaml ${data_config}
    --gen-subset ${subset}
    --task speech_to_text
    --path ${models}
    --results-path ${res_dir}
    --skip-invalid-size-inputs-valid-test
    --max-tokens ${max_tokens}
    --beam ${beam_size}
    --lenpen ${lenpen}
    --scoring sacrebleu"
    echo -e "\033[34mRun command: \n${cmd} \033[0m"

    eval $cmd
    # The last line of the generate log is printed as the summary.
    tail -n 1 ${res_dir}/generate-${subset}.txt

    # evaluate.sh is run from inside the result directory.
    cd $res_dir
    evaluate.sh translation-${subset}.txt $set
    cd -
done
# Wait until at least $gpu_num GPUs report less than 100 in the memory-usage
# column of gpustat, then run $cmd once and exit.
gpu_num=4
cmd="sh train.sh"

while :
do
    # Snapshot the gpustat output; lines 2.. are parsed as one GPU per line.
    record=$(mktemp -t temp.record.XXXXXX)
    gpustat > $record
    all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");

    # Start every poll from an empty list so entries from an earlier poll
    # cannot be counted again.
    device=()
    count=0
    for dev in ${all_devices[@]}
    do
        line=$((dev + 2))
        # Third "|"-separated field, part before "/": the used-memory figure.
        use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)

        if [[ $use -lt 100 ]]; then
            device[$count]=$dev
            count=$((count + 1))
            if [[ $count -eq $gpu_num ]]; then
                break
            fi
        fi
    done
    # The snapshot is no longer needed; removing it avoids leaving one
    # temporary file behind per poll.
    rm -f ${record}

    if [[ ${#device[@]} -lt $gpu_num ]]; then
        sleep 60s
    else
        echo "Run $cmd"
        eval $cmd
        sleep 10s
        exit
    fi
done
#!/usr/bin/env bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey);
# Arnab Ghoshal, Karel Vesely
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Parse command-line options.
# To be sourced by another script (as in ". parse_options.sh").
# Option format is: --option-name arg
# and shell variable "option_name" gets set to value "arg."
# The exception is --help, which takes no arguments, but prints the
# $help_message variable (if defined).
# Only options whose variable is already defined in the sourcing script are
# accepted; parsing stops at the first argument that does not start with "--".
###
### The --config file options have lower priority to command line
### options, so we need to import them first...
###
# Now import all the configs specified by command-line, in left-to-right order
# (the scan stops one before the last argument because --config needs a value
# after it).
for ((argpos=1; argpos<$#; argpos++)); do
if [ "${!argpos}" == "--config" ]; then
argpos_plus1=$((argpos+1))
config=${!argpos_plus1}
[ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
. $config # source the config file.
fi
done
###
### Now we process the command line options
###
while true; do
[ -z "${1:-}" ] && break; # break if there are no arguments
case "$1" in
# If the enclosing script is called with --help option, print the help
# message and exit. Scripts should put help messages in $help_message
--help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
else printf "$help_message\n" 1>&2 ; fi;
exit 0 ;;
# The "--name=value" form is rejected explicitly.
--*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
exit 1 ;;
# If the first command-line argument begins with "--" (e.g. --foo-bar),
# then work out the variable name as $name, which will equal "foo_bar".
--*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
# Next we test whether the variable in question is undefined-- if so it's
# an invalid option and we die. Note: $0 evaluates to the name of the
# enclosing script.
# The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
# is undefined. We then have to wrap this test inside "eval" because
# foo_bar is itself inside a variable ($name).
eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
# Remember the current value so its type (Boolean or not) can be checked.
oldval="`eval echo \\$$name`";
# Work out whether we seem to be expecting a Boolean argument.
if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
was_bool=true;
else
was_bool=false;
fi
# Set the variable to the right value-- the escaped quotes make it work if
# the option had spaces, like --cmd "queue.pl -sync y"
eval $name=\"$2\";
# Check that Boolean-valued arguments are really Boolean.
if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
exit 1;
fi
# Consume the option name and its value.
shift 2;
;;
# First argument that is not an option: stop parsing and leave the remaining
# arguments untouched.
*) break;
esac
done
# Check for an empty argument to the --cmd option, which can easily occur as a
# result of scripting errors.
[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
true; # so this script returns exit code 0.
# get_devices GPU_NUM USE_CPU
#
# Print a comma-separated list of GPU indices whose used-memory figure in the
# gpustat output is below 100, stopping as soon as GPU_NUM of them are found.
#   GPU_NUM  number of devices wanted
#   USE_CPU  1: print "-1" immediately when not enough devices are free;
#            otherwise poll gpustat every 60 seconds until enough are free
# The function sets the (non-local) variables gpu_num, use_cpu and device; the
# callers in this recipe invoke it through command substitution.
get_devices(){
    gpu_num=$1
    use_cpu=$2
    device=()
    while :
    do
        # Snapshot the gpustat output; lines 2.. are parsed as one GPU per line.
        record=$(mktemp -t temp.record.XXXXXX)
        gpustat > $record
        all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");

        # Start every poll from an empty list so entries from an earlier poll
        # cannot be reported again.
        device=()
        count=0
        for dev in ${all_devices[@]}
        do
            line=$((dev + 2))
            # Third "|"-separated field, part before "/": the used-memory figure.
            use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)

            if [[ $use -lt 100 ]]; then
                device[$count]=$dev
                count=$((count + 1))
                if [[ $count -eq $gpu_num ]]; then
                    break
                fi
            fi
        done
        # Remove the snapshot so polling does not leave temporary files behind.
        rm -f ${record}

        if [[ ${#device[@]} -lt $gpu_num ]]; then
            if [[ $use_cpu -eq 1 ]]; then
                # CPU fallback: leave the loop right away. Without this break
                # the loop kept polling without any sleep, and never ended
                # when more than one device was requested.
                device=(-1)
                break
            else
                sleep 60s
            fi
        else
            break
        fi
    done

    echo ${device[*]} | sed 's/ /,/g'
    return $?
}
#! /bin/bash
# Processing MuST-C Datasets
# Copyright 2021 Natural Language Processing Laboratory
# Xu Chen (xuchenneu@163.com)
# Recipe driver with three stages: 0 = data preparation, 1 = training,
# 2 = decoding. Stages in the range [stage, stop_stage] are run.
# Every variable defined before parse_options.sh is sourced can be overridden
# on the command line as "--variable_name value".
# Shell options: exit on error (-e) and on a failure anywhere in a pipeline
# (-o pipefail); the -u check for undefined variables is left commented out.
set -e
#set -u
set -o pipefail
export PYTHONIOENCODING=UTF-8
# eval=1 executes the generated commands; any other value only prints them.
eval=1
# Timestamp written to history.log when training is launched.
time=$(date "+%m%d_%H%M")
stage=0
stop_stage=0
######## hardware ########
# devices
# When $device is empty, stages 1 and 2 pick $gpu_num devices via get_devices.
#device=()
gpu_num=8
update_freq=1
root_dir=~/st/Fairseq-S2T
pwd_dir=$PWD
# dataset
src_lang=en
tgt_lang=de
lang=${src_lang}-${tgt_lang}
dataset=st
task=speech_to_text
vocab_type=unigram
asr_vocab_size=5000
vocab_size=10000
# The 0/1 switches below select the data config, the directory suffixes and
# the extra flags given to the data preparation script.
share_dict=1
speed_perturb=0
lcrm=0
tokenizer=0
use_specific_dict=0
specific_prefix=valid
specific_dir=/home/xuchen/st/data/mustc/st_lcrm/en-de
asr_vocab_prefix=spm_unigram10000_st_share
st_vocab_prefix=spm_unigram10000_st_share
org_data_dir=~/st/data/${dataset}
data_dir=~/st/data/${dataset}/st
test_subset=tst-COMMON
# exp
exp_prefix=$(date "+%m%d")
extra_tag=
extra_parameter=
exp_tag=baseline
exp_name=
# config
# Comma-separated names of files under conf/ (without the .yaml extension).
train_config=ctc
# training setting
fp16=1
max_tokens=40000
step_valid=0
bleu_valid=0
# decoding setting
dec_model=checkpoint_best.pt
n_average=10
beam_size=5
len_penalty=1.0
# NOTE: the data config and the suffixes below are derived before
# parse_options.sh runs, so they follow the defaults set above and are not
# affected by values given on the command line.
if [[ ${share_dict} -eq 1 ]]; then
data_config=config_st_share.yaml
else
data_config=config_st.yaml
fi
if [[ ${speed_perturb} -eq 1 ]]; then
data_dir=${data_dir}_sp
exp_prefix=${exp_prefix}_sp
fi
if [[ ${lcrm} -eq 1 ]]; then
data_dir=${data_dir}_lcrm
exp_prefix=${exp_prefix}_lcrm
fi
if [[ ${use_specific_dict} -eq 1 ]]; then
data_dir=${data_dir}_${specific_prefix}
exp_prefix=${exp_prefix}_${specific_prefix}
fi
if [[ ${tokenizer} -eq 1 ]]; then
data_dir=${data_dir}_tok
exp_prefix=${exp_prefix}_tok
fi
# Apply the command-line overrides.
. ./local/parse_options.sh || exit 1;
# Default experiment name: <prefix>_<configs joined by "_">_<tag>[_<extra_tag>].
if [[ -z ${exp_name} ]]; then
config_string=${train_config//,/_}
exp_name=${exp_prefix}_${config_string}_${exp_tag}
if [[ -n ${extra_tag} ]]; then
exp_name=${exp_name}_${extra_tag}
fi
fi
model_dir=$root_dir/../checkpoints/$dataset/st/${exp_name}
# Stage -1 is a placeholder: it only prints its banner.
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download"
# pass
fi
# Stage 0: build the prep_asr_data.py and prep_st_data.py command lines and
# run them when $eval is 1.
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
### Task dependent: you have to write the following data preparation part yourself.
### But you can utilize Kaldi recipes in most cases
echo "stage 0: ASR Data Preparation"
if [[ ! -e ${data_dir}/${lang} ]]; then
mkdir -p ${data_dir}/${lang}
fi
cmd="python ${root_dir}/examples/speech_to_text/prep_asr_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}
--task asr
--vocab-type ${vocab_type}
--vocab-size ${asr_vocab_size}"
if [[ ${speed_perturb} -eq 1 ]]; then
cmd="$cmd
--speed-perturb"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
# The ASR preparation is only executed when neither a shared dictionary nor a
# specific (pre-built) dictionary is used; the command is printed either way.
[[ $eval -eq 1 && ${share_dict} -ne 1 && ${use_specific_dict} -ne 1 ]] && eval $cmd
# Prefix passed as --asr-prefix in the non-shared, non-specific case below.
asr_prefix=spm_${vocab_type}${asr_vocab_size}_asr
echo "stage 0: ST Data Preparation"
cmd="python ${root_dir}/examples/speech_to_text/prep_st_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}
--task st
--add-src
--cmvn-type utterance
--vocab-type ${vocab_type}
--vocab-size ${vocab_size}"
# With a specific dictionary, the vocabulary files are first copied from
# $specific_dir into the data directory and referenced by prefix.
if [[ ${use_specific_dict} -eq 1 ]]; then
cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}/${lang}
cp -r ${specific_dir}/${st_vocab_prefix}.* ${data_dir}/${lang}
if [[ $share_dict -eq 1 ]]; then
cmd="$cmd
--share
--st-spm-prefix ${st_vocab_prefix}"
else
cmd="$cmd
--st-spm-prefix ${st_vocab_prefix}
--asr-prefix ${asr_vocab_prefix}"
fi
else
if [[ $share_dict -eq 1 ]]; then
cmd="$cmd
--share"
else
cmd="$cmd
--asr-prefix ${asr_prefix}"
fi
fi
# Optional flags mirroring the switches defined at the top of the script.
if [[ ${speed_perturb} -eq 1 ]]; then
cmd="$cmd
--speed-perturb"
fi
if [[ ${lcrm} -eq 1 ]]; then
cmd="$cmd
--lowercase-src
--rm-punc-src"
fi
if [[ ${tokenizer} -eq 1 ]]; then
cmd="$cmd
--tokenizer"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
fi
# From here on, data_dir points at the language-pair sub-directory.
data_dir=${data_dir}/${lang}

# Stage 1: assemble the train.py command line and launch it in the background
# with nohup, then follow the training log.
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    echo "stage 1: ST Network Training"
    [[ ! -d ${data_dir} ]] && echo "The data dir ${data_dir} is not existing!" && exit 1;

    # Pick devices unless the caller already provided $device.
    if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
        if [[ ${gpu_num} -eq 0 ]]; then
            device=""
        else
            source ./local/utils.sh
            device=$(get_devices $gpu_num 0)
        fi
    fi

    echo -e "dev=${device} data=${data_dir} model=${model_dir}"

    if [[ ! -d ${model_dir} ]]; then
        mkdir -p ${model_dir}
    else
        echo "${model_dir} exists."
    fi

    # Keep copies of this script and of train.sh next to the checkpoints.
    cp ${BASH_SOURCE[0]} ${model_dir}
    cp ${PWD}/train.sh ${model_dir}

    # Each name in train_config maps to conf/<name>.yaml. The first file is
    # passed as --train-config, the following ones as --train-config1,
    # --train-config2, ...
    config_list="${train_config//,/ }"
    idx=0
    for config in ${config_list[@]}
    do
        config_path=$pwd_dir/conf/${config}.yaml
        if [[ ! -f ${config_path} ]]; then
            echo "No config file ${config_path}"
            # Report the failure to the caller; a bare "exit" returned the
            # status of the echo above, i.e. success.
            exit 1
        fi
        cp ${config_path} ${model_dir}

        if [[ $idx -eq 0 ]]; then
            extra_parameter="${extra_parameter}
        --train-config ${config_path}"
        else
            extra_parameter="${extra_parameter}
        --train-config${idx} ${config_path}"
        fi
        idx=$((idx + 1))
    done

    cmd="python3 -u ${root_dir}/fairseq_cli/train.py
        ${data_dir}
        --config-yaml ${data_config}
        --task ${task}
        --max-tokens ${max_tokens}
        --skip-invalid-size-inputs-valid-test
        --update-freq ${update_freq}
        --log-interval 100
        --save-dir ${model_dir}
        --tensorboard-logdir ${model_dir}"

    if [[ -n ${extra_parameter} ]]; then
        cmd="${cmd}
        ${extra_parameter}"
    fi
    if [[ ${gpu_num} -gt 0 ]]; then
        cmd="${cmd}
        --distributed-world-size $gpu_num
        --ddp-backend no_c10d"
    fi
    if [[ $fp16 -eq 1 ]]; then
        cmd="${cmd}
        --fp16"
    fi

    # step_valid=1 additionally saves checkpoints every 500 updates and keeps
    # the last 10 of them; otherwise only the epoch-level settings are used.
    if [[ $step_valid -eq 1 ]]; then
        validate_interval=1
        save_interval=1
        keep_last_epochs=10
        no_epoch_checkpoints=0
        save_interval_updates=500
        keep_interval_updates=10
    else
        validate_interval=1
        keep_last_epochs=10
    fi

    # bleu_valid=1 selects the best checkpoint by validation BLEU.
    if [[ $bleu_valid -eq 1 ]]; then
        cmd="$cmd
        --eval-bleu
        --eval-bleu-args '{\"beam\": 1}'
        --eval-tokenized-bleu
        --eval-bleu-remove-bpe
        --best-checkpoint-metric bleu
        --maximize-best-checkpoint-metric"
    fi

    # Each of the following options is only added when its variable is set.
    if [[ -n $no_epoch_checkpoints && $no_epoch_checkpoints -eq 1 ]]; then
        cmd="$cmd
        --no-epoch-checkpoints"
    fi
    if [[ -n $validate_interval ]]; then
        cmd="${cmd}
        --validate-interval $validate_interval "
    fi
    if [[ -n $save_interval ]]; then
        cmd="${cmd}
        --save-interval $save_interval "
    fi
    if [[ -n $keep_last_epochs ]]; then
        cmd="${cmd}
        --keep-last-epochs $keep_last_epochs "
    fi
    if [[ -n $save_interval_updates ]]; then
        cmd="${cmd}
        --save-interval-updates $save_interval_updates"
        if [[ -n $keep_interval_updates ]]; then
            cmd="${cmd}
        --keep-interval-updates $keep_interval_updates"
        fi
    fi

    echo -e "\033[34mRun command: \n${cmd} \033[0m"

    # save info: append this run to history.log and keep its last 50 lines.
    log=./history.log
    echo "${time} | ${device} | ${data_dir} | ${exp_name} | ${model_dir} " >> $log
    tail -n 50 ${log} > tmp.log
    mv tmp.log $log
    export CUDA_VISIBLE_DEVICES=${device}

    # Training runs in the background; its output goes to train.log, which is
    # then followed with "tail -f" starting from the beginning of the file.
    cmd="nohup ${cmd} >> ${model_dir}/train.log 2>&1 &"
    if [[ $eval -eq 1 ]]; then
        eval $cmd
        sleep 2s
        tail -n "$(wc -l ${model_dir}/train.log | awk '{print $1+1}')" -f ${model_dir}/train.log
    fi
fi
# Wait for background jobs started above before moving on to decoding.
wait
# Stage 2: optionally average checkpoints, then decode every test subset with
# generate.py and collect the last line of each generate log in decode_result.
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    echo "stage 2: ST Decoding"
    # With n_average == 1 the configured $dec_model is decoded unchanged.
    if [[ ${n_average} -ne 1 ]]; then
        # Average models
        dec_model=avg_${n_average}_checkpoint.pt

        cmd="python ${root_dir}/scripts/average_checkpoints.py
        --inputs ${model_dir}
        --num-epoch-checkpoints ${n_average}
        --output ${model_dir}/${dec_model}"
        echo -e "\033[34mRun command: \n${cmd} \033[0m"
        [[ $eval -eq 1 ]] && eval $cmd
    fi

    # Pick devices unless the caller already provided $device.
    if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
        if [[ ${gpu_num} -eq 0 ]]; then
            device=""
        else
            source ./local/utils.sh
            device=$(get_devices $gpu_num 0)
        fi
    fi
    export CUDA_VISIBLE_DEVICES=${device}

    result_file=${model_dir}/decode_result
    [[ -f ${result_file} ]] && rm ${result_file}

    # test_subset is a comma-separated string. Split it into an array so that
    # each subset is decoded on its own; iterating over the quoted string
    # treated "a b" as one subset name.
    test_subset=${test_subset//,/ }
    read -r -a subsets <<< "${test_subset}"
    for subset in "${subsets[@]}"; do
        subset=${subset}_st
        cmd="python ${root_dir}/fairseq_cli/generate.py
        ${data_dir}
        --config-yaml ${data_config}
        --gen-subset ${subset}
        --task speech_to_text
        --path ${model_dir}/${dec_model}
        --results-path ${model_dir}
        --max-tokens ${max_tokens}
        --beam ${beam_size}
        --lenpen ${len_penalty}
        --scoring sacrebleu"
        echo -e "\033[34mRun command: \n${cmd} \033[0m"

        if [[ $eval -eq 1 ]]; then
            eval $cmd
            tail -n 1 ${model_dir}/generate-${subset}.txt >> ${result_file}
        fi
    done
    # Nothing is written in a dry run (eval != 1); only print the summary when
    # it exists so that "set -e" does not abort on a missing file.
    if [[ -f ${result_file} ]]; then
        cat ${result_file}
    fi
fi
#! /bin/bash

# Launch training: collect the settings below into a "./run.sh --stage 1"
# command line, print it and execute it.

gpu_num=8
update_freq=1
max_tokens=40000

extra_tag=
extra_parameter=
#extra_tag="${extra_tag}"
#extra_parameter="${extra_parameter} "

exp_tag=

# Names of the training configs to combine; exactly one line is active.
#config_list=(base)
config_list=(ctc)
#config_list=(sate_ctc)
#config_list=(ctc conformer rpr)
#config_list=(base sate)
#config_list=(pds_base)
#config_list=(pds_base conformer)

# exp full name
exp_name=

# run.sh takes the config names as a single comma-separated value.
train_config=$(echo ${config_list[*]} | tr ' ' ',')

# Mandatory options.
cmd="./run.sh --stage 1 --stop_stage 1"
cmd="${cmd} --gpu_num ${gpu_num} --update_freq ${update_freq}"
cmd="${cmd} --train_config ${train_config} --max_tokens ${max_tokens}"

# Options that are only forwarded when they have a value.
for opt in exp_name exp_tag extra_tag; do
    if [[ -n ${!opt} ]]; then
        cmd="${cmd} --${opt} ${!opt}"
    fi
done
# extra_parameter may hold several words, so it is forwarded inside quotes.
if [[ -n ${extra_parameter} ]]; then
    cmd="${cmd} --extra_parameter \"${extra_parameter}\""
fi

echo ${cmd}
eval ${cmd}
train-subset: train-clean-100,train-clean-360,train-other-500
valid-subset: dev-clean
max-epoch: 100
max-update: 300000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_transformer_s arch: s2t_transformer_s
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
...@@ -39,4 +23,4 @@ encoder-attention-heads: 4 ...@@ -39,4 +23,4 @@ encoder-attention-heads: 4
decoder-embed-dim: 256 decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4 decoder-attention-heads: 4
\ No newline at end of file
# Shared training settings: data subsets, stopping criteria, checkpointing
# and logging.
train-subset: train-clean-100,train-clean-360,train-other-500
valid-subset: dev-clean
# Stopping criteria.
max-epoch: 100
max-update: 300000
patience: 20
# Checkpoint selection: the tracked metric is the loss and it is not maximized.
best-checkpoint-metric: loss
maximize-best-checkpoint-metric: False
# No per-epoch checkpoints are kept; the 10 best checkpoints are kept instead.
# NOTE(review): the run scripts in this recipe pass --keep-last-epochs and
# average with --num-epoch-checkpoints -- confirm these settings are compatible.
no-epoch-checkpoints: True
#keep-last-epochs: 10
keep-best-checkpoints: 10
# Data loading, logging and reproducibility.
num-workers: 8
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
\ No newline at end of file
train-subset: train-clean-100,train-clean-360,train-other-500
valid-subset: dev-clean
max-epoch: 100
max-update: 300000
num-workers: 8
patience: 20
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_transformer_m arch: s2t_transformer_m
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
......
...@@ -2,19 +2,19 @@ ...@@ -2,19 +2,19 @@
#arch: s2t_transformer_s #arch: s2t_transformer_s
arch: s2t_sate arch: s2t_sate
encoder-embed-dim: 256 encoder-embed-dim: 256
pyramid-stages: 4 pds-stages: 4
#pyramid-dropout: 0 #pds-dropout: 0
pyramid-layers: 2_2_6_2 pds-layers: 2_2_6_2
pyramid-ratios: 2_2_2_2 pds-ratios: 2_2_2_2
pyramid-fusion: True pds-fusion: True
pyramid-fusion-method: all_conv pds-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256 pds-embed-dims: 256_256_256_256
pyramid-ds-method: conv pds-ds-method: conv
pyramid-embed-norm: True pds-embed-norm: True
pyramid-position-embed: 1_1_1_1 pds-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5 pds-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8 pds-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4 pds-attn-heads: 4_4_4_4
cl-dropout: True cl-dropout: True
cl-dropout-epoch: 50 cl-dropout-epoch: 50
......
# Training configuration for the s2t_sate architecture on the
# train-clean-100 subset.
# Data subsets and run control.
train-subset: train-clean-100
valid-subset: dev-clean
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
# Optional pretrained components (all disabled). The absolute paths below are
# earlier experiment checkpoints kept for reference.
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
#load-pretrained-acoustic-encoder-from: /home/xuchen/st/checkpoints/mustc/asr/1007_st_ctc_baseline/avg_10_checkpoint.pt
#load-pretrained-acoustic-encoder-from: /home/xuchen/st/checkpoints/mustc/asr/1111_st_ctc_conformer_lr0.001/avg_10_checkpoint.pt
#load-pretrained-acoustic-encoder-from: /home/xuchen/st/checkpoints/mustc/asr/1007_st_pyramid4_all256_3333_sr8_ctc/avg_10_checkpoint.pt
#load-pretrained-acoustic-encoder-from: /home/xuchen/st/checkpoints/mustc/asr/1114_st_pyramid4_all256_ctc_fix/avg_10_checkpoint.pt
#load-pretrained-acoustic-encoder-from: /home/xuchen/st/checkpoints/mustc/asr/1015_st_pyramid4_all256_conformer_baseline/avg_10_checkpoint.pt
#load-pretrained-acoustic-encoder-from: /home/xuchen/st/checkpoints/mustc/asr/1111_st_pyramid4_all256_conformer_ctc/avg_10_checkpoint.pt
arch: s2t_sate
share-decoder-input-output-embed: True
# Optimizer and learning-rate schedule.
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
# Regularisation and layer sizes.
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
#macaron-style: True
#use-cnn-module: True
#cnn-module-kernel: 31
# Acoustic encoder and adapter selection; commented lines are alternatives.
#acoustic-encoder: transformer
#acoustic-encoder: conformer
acoustic-encoder: pyramid
adapter: league
#adapter: none
#adapter: context
encoder-embed-dim: 256
# Per-stage options: one "_"-separated value for each of the four stages.
# NOTE(review): several key names here (pyramid-sr-ratios, pyramid-fuse,
# pyramid-reduced-embed, pyramid-heads) differ from the pyramid-ratios /
# pyramid-fusion / pyramid-ds-method / pyramid-attn-heads spelling used by the
# other configs, and from the newer pds-* names -- confirm this file is still
# accepted by the current option parser.
pyramid-stages: 4
#pyramid-dropout: 0
pyramid-layers: 3_3_3_3
pyramid-sr-ratios: 2_2_1_2
pyramid-embed-dims: 256_256_256_256
pyramid-fuse: True
pyramid-reduced-embed: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8
pyramid-heads: 4_4_4_4
\ No newline at end of file
arch: pdss2t_transformer_s_8 arch: pdss2t_transformer_s_8
train-subset: train-clean-100,train-clean-360,train-other-500
valid-subset: dev-clean
max-epoch: 100
max-update: 300000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
clip-norm: 10.0 clip-norm: 10.0
...@@ -37,4 +20,4 @@ encoder-attention-heads: 4 ...@@ -37,4 +20,4 @@ encoder-attention-heads: 4
decoder-embed-dim: 256 decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4 decoder-attention-heads: 4
\ No newline at end of file
arch: pdss2t_transformer_s_16 arch: pdss2t_transformer_s_16
encoder-embed-dim: 256 encoder-embed-dim: 256
pyramid-stages: 4 pds-stages: 4
#pyramid-dropout: 0 #pds-dropout: 0
pyramid-layers: 2_2_6_2 pds-layers: 2_2_6_2
pyramid-ratios: 2_2_2_2 pds-ratios: 2_2_2_2
pyramid-fusion: True pds-fusion: True
pyramid-fusion-method: all_conv pds-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256 pds-embed-dims: 256_256_256_256
pyramid-ds-method: conv pds-ds-method: conv
pyramid-embed-norm: True pds-embed-norm: True
pyramid-position-embed: 1_1_1_1 pds-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5 pds-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8 pds-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4 pds-attn-heads: 4_4_4_4
train-subset: train-clean-100,train-clean-360,train-other-500
valid-subset: dev-clean
max-epoch: 100
max-update: 300000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
...@@ -52,4 +36,4 @@ encoder-attention-heads: 4 ...@@ -52,4 +36,4 @@ encoder-attention-heads: 4
decoder-embed-dim: 256 decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4 decoder-attention-heads: 4
\ No newline at end of file
arch: pdss2t_transformer_s_32 arch: pdss2t_transformer_s_32
encoder-embed-dim: 256 encoder-embed-dim: 256
pyramid-stages: 5 pds-stages: 5
#pyramid-dropout: 0 #pds-dropout: 0
pyramid-layers: 2_2_3_3_2 pds-layers: 2_2_3_3_2
pyramid-ratios: 2_2_2_2_2 pds-ratios: 2_2_2_2_2
pyramid-fusion: True pds-fusion: True
pyramid-fusion-method: all_conv pds-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256_256 pds-embed-dims: 256_256_256_256_256
pyramid-ds-method: conv pds-ds-method: conv
pyramid-embed-norm: True pds-embed-norm: True
pyramid-position-embed: 1_1_1_1_1 pds-position-embed: 1_1_1_1_1
pyramid-kernel-sizes: 5_5_5_5_5 pds-kernel-sizes: 5_5_5_5_5
pyramid-ffn-ratios: 8_8_8_8_8 pds-ffn-ratios: 8_8_8_8_8
pyramid-attn-heads: 4_4_4_4_4 pds-attn-heads: 4_4_4_4_4
train-subset: train-clean-100,train-clean-360,train-other-500
valid-subset: dev-clean
max-epoch: 100
max-update: 300000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
...@@ -52,4 +36,4 @@ encoder-attention-heads: 4 ...@@ -52,4 +36,4 @@ encoder-attention-heads: 4
decoder-embed-dim: 256 decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4 decoder-attention-heads: 4
\ No newline at end of file
arch: pdss2t_transformer_s_8 arch: pdss2t_transformer_s_8
encoder-embed-dim: 256 encoder-embed-dim: 256
pyramid-stages: 4 pds-stages: 4
#pyramid-dropout: 0 #pds-dropout: 0
pyramid-layers: 3_3_3_3 pds-layers: 3_3_3_3
pyramid-ratios: 2_2_1_2 pds-ratios: 2_2_1_2
pyramid-fusion: True pds-fusion: True
pyramid-fusion-method: all_conv pds-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256 pds-embed-dims: 256_256_256_256
pyramid-ds-method: conv pds-ds-method: conv
pyramid-embed-norm: True pds-embed-norm: True
pyramid-position-embed: 1_1_1_1 pds-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5 pds-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8 pds-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4 pds-attn-heads: 4_4_4_4
train-subset: train-clean-100,train-clean-360,train-other-500
valid-subset: dev-clean
max-epoch: 100
max-update: 300000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
...@@ -52,4 +36,4 @@ encoder-attention-heads: 4 ...@@ -52,4 +36,4 @@ encoder-attention-heads: 4
decoder-embed-dim: 256 decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4 decoder-attention-heads: 4
\ No newline at end of file
...@@ -2,22 +2,6 @@ arch: pdss2t_transformer_m_8 ...@@ -2,22 +2,6 @@ arch: pdss2t_transformer_m_8
#arch: pdss2t_transformer_m_16 #arch: pdss2t_transformer_m_16
#arch: pdss2t_transformer_m_32 #arch: pdss2t_transformer_m_32
train-subset: train-clean-100,train-clean-360,train-other-500
valid-subset: dev-clean
max-epoch: 100
max-update: 300000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
clip-norm: 10.0 clip-norm: 10.0
......
arch: pdss2t_transformer_m_16 arch: pdss2t_transformer_m_16
encoder-embed-dim: 512 encoder-embed-dim: 512
pyramid-stages: 4 pds-stages: 4
#pyramid-dropout: 0 #pds-dropout: 0
pyramid-layers: 2_2_6_2 pds-layers: 2_2_6_2
pyramid-ratios: 2_2_2_2 pds-ratios: 2_2_2_2
pyramid-fusion: True pds-fusion: True
pyramid-fusion-method: all_conv pds-fusion-method: all_conv
pyramid-embed-dims: 512_512_512_512 pds-embed-dims: 512_512_512_512
pyramid-ds-method: conv pds-ds-method: conv
pyramid-embed-norm: True pds-embed-norm: True
pyramid-position-embed: 1_1_1_1 pds-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5 pds-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 4_4_4_4 pds-ffn-ratios: 4_4_4_4
pyramid-attn-heads: 8_8_8_8 pds-attn-heads: 8_8_8_8
train-subset: train-clean-100,train-clean-360,train-other-500
valid-subset: dev-clean
max-epoch: 100
max-update: 300000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
......
arch: pdss2t_transformer_m_32 arch: pdss2t_transformer_m_32
encoder-embed-dim: 512 encoder-embed-dim: 512
pyramid-stages: 5 pds-stages: 5
#pyramid-dropout: 0 #pds-dropout: 0
pyramid-layers: 2_2_3_3_2 pds-layers: 2_2_3_3_2
pyramid-ratios: 2_2_2_2_2 pds-ratios: 2_2_2_2_2
pyramid-fusion: True pds-fusion: True
pyramid-fusion-method: all_conv pds-fusion-method: all_conv
pyramid-embed-dims: 512_512_512_512_512 pds-embed-dims: 512_512_512_512_512
pyramid-ds-method: conv pds-ds-method: conv
pyramid-embed-norm: True pds-embed-norm: True
pyramid-position-embed: 1_1_1_1_1 pds-position-embed: 1_1_1_1_1
pyramid-kernel-sizes: 5_5_5_5_5 pds-kernel-sizes: 5_5_5_5_5
pyramid-ffn-ratios: 4_4_4_4_4 pds-ffn-ratios: 4_4_4_4_4
pyramid-attn-heads: 8_8_8_8_8 pds-attn-heads: 8_8_8_8_8
train-subset: train-clean-100,train-clean-360,train-other-500
valid-subset: dev-clean
max-epoch: 100
max-update: 300000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
......
arch: pdss2t_transformer_m_8 arch: pdss2t_transformer_m_8
encoder-embed-dim: 512 encoder-embed-dim: 512
pyramid-stages: 4 pds-stages: 4
#pyramid-dropout: 0 #pds-dropout: 0
pyramid-layers: 3_3_3_3 pds-layers: 3_3_3_3
pyramid-ratios: 2_2_1_2 pds-ratios: 2_2_1_2
pyramid-fusion: True pds-fusion: True
pyramid-fusion-method: all_conv pds-fusion-method: all_conv
pyramid-embed-dims: 512_512_512_512 pds-embed-dims: 512_512_512_512
pyramid-ds-method: conv pds-ds-method: conv
pyramid-embed-norm: True pds-embed-norm: True
pyramid-position-embed: 1_1_1_1 pds-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5 pds-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 4_4_4_4 pds-ffn-ratios: 4_4_4_4
pyramid-attn-heads: 8_8_8_8 pds-attn-heads: 8_8_8_8
train-subset: train-clean-100,train-clean-360,train-other-500
valid-subset: dev-clean
max-epoch: 100
max-update: 300000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
......
...@@ -2,22 +2,6 @@ arch: pdss2t_transformer_sd_8 ...@@ -2,22 +2,6 @@ arch: pdss2t_transformer_sd_8
#arch: pdss2t_transformer_sd_16 #arch: pdss2t_transformer_sd_16
#arch: pdss2t_transformer_sd_32 #arch: pdss2t_transformer_sd_32
train-subset: train-clean-100,train-clean-360,train-other-500
valid-subset: dev-clean
max-epoch: 100
max-update: 300000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
clip-norm: 10.0 clip-norm: 10.0
......
arch: pdss2t_transformer_sd_16 arch: pdss2t_transformer_sd_16
encoder-embed-dim: 256 encoder-embed-dim: 256
pyramid-stages: 4 pds-stages: 4
#pyramid-dropout: 0 #pds-dropout: 0
pyramid-layers: 5_5_12_8 pds-layers: 5_5_12_8
pyramid-ratios: 2_2_2_2 pds-ratios: 2_2_2_2
pyramid-fusion: True pds-fusion: True
pyramid-fusion-method: all_conv pds-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256 pds-embed-dims: 256_256_256_256
pyramid-ds-method: conv pds-ds-method: conv
pyramid-embed-norm: True pds-embed-norm: True
pyramid-position-embed: 1_1_1_1 pds-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5 pds-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8 pds-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4 pds-attn-heads: 4_4_4_4
train-subset: train-clean-100,train-clean-360,train-other-500
valid-subset: dev-clean
max-epoch: 100
max-update: 300000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
......
arch: pdss2t_transformer_sd_32 arch: pdss2t_transformer_sd_32
encoder-embed-dim: 256 encoder-embed-dim: 256
pyramid-stages: 5 pds-stages: 5
#pyramid-dropout: 0 #pds-dropout: 0
pyramid-layers: 5_5_7_7_6 pds-layers: 5_5_7_7_6
pyramid-ratios: 2_2_2_2_2 pds-ratios: 2_2_2_2_2
pyramid-fusion: True pds-fusion: True
pyramid-fusion-method: all_conv pds-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256_256 pds-embed-dims: 256_256_256_256_256
pyramid-ds-method: conv pds-ds-method: conv
pyramid-embed-norm: True pds-embed-norm: True
pyramid-position-embed: 1_1_1_1_1 pds-position-embed: 1_1_1_1_1
pyramid-kernel-sizes: 5_5_5_5_5 pds-kernel-sizes: 5_5_5_5_5
pyramid-ffn-ratios: 8_8_8_8_8 pds-ffn-ratios: 8_8_8_8_8
pyramid-attn-heads: 4_4_4_4_4 pds-attn-heads: 4_4_4_4_4
train-subset: train-clean-100,train-clean-360,train-other-500
valid-subset: dev-clean
max-epoch: 100
max-update: 300000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
......
arch: pdss2t_transformer_sd_8 arch: pdss2t_transformer_sd_8
encoder-embed-dim: 256 encoder-embed-dim: 256
pyramid-stages: 4 pds-stages: 4
#pyramid-dropout: 0 #pds-dropout: 0
pyramid-layers: 7_7_7_9 pds-layers: 7_7_7_9
pyramid-ratios: 2_2_1_2 pds-ratios: 2_2_1_2
pyramid-fusion: True pds-fusion: True
pyramid-fusion-method: all_conv pds-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256 pds-embed-dims: 256_256_256_256
pyramid-ds-method: conv pds-ds-method: conv
pyramid-embed-norm: True pds-embed-norm: True
pyramid-position-embed: 1_1_1_1 pds-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5 pds-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8 pds-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4 pds-attn-heads: 4_4_4_4
train-subset: train-clean-100,train-clean-360,train-other-500
valid-subset: dev-clean
max-epoch: 100
max-update: 300000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
......
...@@ -24,7 +24,8 @@ device=() ...@@ -24,7 +24,8 @@ device=()
gpu_num=8 gpu_num=8
update_freq=1 update_freq=1
root_dir=~/st/Fairseq-S2T root_dir=~/st
code_dir=${root_dir}/Fairseq-S2T
pwd_dir=$PWD pwd_dir=$PWD
# dataset # dataset
...@@ -42,8 +43,8 @@ specific_prefix=valid ...@@ -42,8 +43,8 @@ specific_prefix=valid
specific_dir=/home/xuchen/st/data/mustc/st_lcrm/en-de specific_dir=/home/xuchen/st/data/mustc/st_lcrm/en-de
asr_vocab_prefix=spm_unigram10000_st_share asr_vocab_prefix=spm_unigram10000_st_share
org_data_dir=~/st/data/${dataset} org_data_dir=${root_dir}/data/${dataset}
data_dir=~/st/data/${dataset} data_dir=${root_dir}/data/${dataset}
test_subset=dev-clean,dev-other,test-clean,test-other test_subset=dev-clean,dev-other,test-clean,test-other
# exp # exp
...@@ -81,13 +82,12 @@ fi ...@@ -81,13 +82,12 @@ fi
if [[ -z ${exp_name} ]]; then if [[ -z ${exp_name} ]]; then
config_string=${train_config//,/_} config_string=${train_config//,/_}
# exp_name=${exp_prefix}_$(basename ${train_config%.*})_${exp_tag}
exp_name=${exp_prefix}_${config_string}_${exp_tag} exp_name=${exp_prefix}_${config_string}_${exp_tag}
if [[ -n ${extra_tag} ]]; then if [[ -n ${extra_tag} ]]; then
exp_name=${exp_name}_${extra_tag} exp_name=${exp_name}_${extra_tag}
fi fi
fi fi
model_dir=$root_dir/../checkpoints/$dataset/asr/${exp_name} model_dir=$code_dir/../checkpoints/$dataset/asr/${exp_name}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download" echo "stage -1: Data Download"
...@@ -103,7 +103,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then ...@@ -103,7 +103,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
mkdir -p ${data_dir} mkdir -p ${data_dir}
fi fi
cmd="python ${root_dir}/examples/speech_to_text/prep_librispeech_data.py cmd="python ${code_dir}/examples/speech_to_text/prep_librispeech_data.py
--data-root ${org_data_dir} --data-root ${org_data_dir}
--output-root ${data_dir} --output-root ${data_dir}
--vocab-type ${vocab_type} --vocab-type ${vocab_type}
...@@ -146,28 +146,26 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ...@@ -146,28 +146,26 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
cp ${BASH_SOURCE[0]} ${model_dir} cp ${BASH_SOURCE[0]} ${model_dir}
cp ${PWD}/train.sh ${model_dir} cp ${PWD}/train.sh ${model_dir}
extra_parameter="${extra_parameter}
--train-config ${pwd_dir}/conf/basis.yaml"
cp ${pwd_dir}/conf/basis.yaml ${model_dir}
config_list="${train_config//,/ }" config_list="${train_config//,/ }"
idx=0 idx=1
for config in ${config_list[@]} for config in ${config_list[@]}
do do
config_path=$pwd_dir/conf/${config}.yaml config_path=${pwd_dir}/conf/${config}.yaml
if [[ ! -f ${config_path} ]]; then if [[ ! -f ${config_path} ]]; then
echo "No config file ${config_path}" echo "No config file ${config_path}"
exit exit
fi fi
cp ${config_path} ${model_dir} cp ${config_path} ${model_dir}
if [[ idx -eq 0 ]]; then extra_parameter="${extra_parameter}
extra_parameter="${extra_parameter}
--train-config ${config_path}"
else
extra_parameter="${extra_parameter}
--train-config${idx} ${config_path}" --train-config${idx} ${config_path}"
fi
idx=$((idx + 1)) idx=$((idx + 1))
done done
cmd="python3 -u ${root_dir}/fairseq_cli/train.py cmd="python3 -u ${code_dir}/fairseq_cli/train.py
${data_dir} ${data_dir}
--config-yaml ${data_config} --config-yaml ${data_config}
--task ${task} --task ${task}
...@@ -252,9 +250,9 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then ...@@ -252,9 +250,9 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
dec_model=avg_${n_average}_checkpoint.pt dec_model=avg_${n_average}_checkpoint.pt
if [[ ! -f ${model_dir}/${dec_model} ]]; then if [[ ! -f ${model_dir}/${dec_model} ]]; then
cmd="python ${root_dir}/scripts/average_checkpoints.py cmd="python ${code_dir}/scripts/average_checkpoints.py
--inputs ${model_dir} --inputs ${model_dir}
--num-epoch-checkpoints ${n_average} --num-best-checkpoints ${n_average}
--output ${model_dir}/${dec_model}" --output ${model_dir}/${dec_model}"
echo -e "\033[34mRun command: \n${cmd} \033[0m" echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval $cmd [[ $eval -eq 1 ]] && eval $cmd
...@@ -279,7 +277,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then ...@@ -279,7 +277,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
test_subset=(${test_subset//,/ }) test_subset=(${test_subset//,/ })
for subset in ${test_subset[@]}; do for subset in ${test_subset[@]}; do
subset=${subset} subset=${subset}
cmd="python ${root_dir}/fairseq_cli/generate.py cmd="python ${code_dir}/fairseq_cli/generate.py
${data_dir} ${data_dir}
--config-yaml ${data_config} --config-yaml ${data_config}
--gen-subset ${subset} --gen-subset ${subset}
......
train-subset: train_asr
valid-subset: dev_asr
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_transformer_s arch: s2t_transformer_s
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
......
train-subset: train
valid-subset: dev
max-epoch: 100
max-update: 100000
patience: 20
best_checkpoint_metric: loss
maximize_best_checkpoint_metric: False
no-epoch-checkpoints: True
#keep-last-epochs: 10
keep-best-checkpoints: 10
num-workers: 8
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
\ No newline at end of file
ctc-weight: 0.3 ctc-weight: 0.3
post-process: sentencepiece
arch: pdss2t_transformer_s_8 arch: pdss2t_transformer_s_8
pds-fusion: True
train-subset: train_asr ctc-layer: 12
valid-subset: dev_asr
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
......
arch: pdss2t_transformer_s_16 arch: pdss2t_transformer_s_16
encoder-embed-dim: 256 encoder-embed-dim: 256
pyramid-stages: 4 pds-stages: 4
#pyramid-dropout: 0 ctc-layer: 12
pyramid-layers: 2_2_6_2 pds-layers: 2_2_6_2
pyramid-ratios: 2_2_2_2 pds-ratios: 2_2_2_2
pyramid-fusion: True pds-fusion: True
pyramid-fusion-method: all_conv pds-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256 pds-embed-dims: 256_256_256_256
pyramid-ds-method: conv pds-ds-method: conv
pyramid-embed-norm: True pds-embed-norm: True
pyramid-position-embed: 1_1_1_1 pds-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5 pds-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8 pds-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4 pds-attn-heads: 4_4_4_4
train-subset: train_asr
valid-subset: dev_asr
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
......
arch: pdss2t_transformer_s_32 arch: pdss2t_transformer_s_32
encoder-embed-dim: 256 encoder-embed-dim: 256
pyramid-stages: 5 pds-stages: 5
#pyramid-dropout: 0 ctc-layer: 12
pyramid-layers: 2_2_3_3_2 pds-layers: 2_2_3_3_2
pyramid-ratios: 2_2_2_2_2 pds-ratios: 2_2_2_2_2
pyramid-fusion: True pds-fusion: True
pyramid-fusion-method: all_conv pds-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256_256 pds-embed-dims: 256_256_256_256_256
pyramid-ds-method: conv pds-ds-method: conv
pyramid-embed-norm: True pds-embed-norm: True
pyramid-position-embed: 1_1_1_1_1 pds-position-embed: 1_1_1_1_1
pyramid-kernel-sizes: 5_5_5_5_5 pds-kernel-sizes: 5_5_5_5_5
pyramid-ffn-ratios: 8_8_8_8_8 pds-ffn-ratios: 8_8_8_8_8
pyramid-attn-heads: 4_4_4_4_4 pds-attn-heads: 4_4_4_4_4
train-subset: train_asr
valid-subset: dev_asr
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
......
arch: pdss2t_transformer_s_8 arch: pdss2t_transformer_s_8
encoder-embed-dim: 256 encoder-embed-dim: 256
pyramid-stages: 4 pds-stages: 4
#pyramid-dropout: 0 ctc-layer: 12
pyramid-layers: 3_3_3_3 pds-layers: 3_3_3_3
pyramid-ratios: 2_2_1_2 pds-ratios: 2_2_1_2
pyramid-fusion: True pds-fusion: True
pyramid-fusion-method: all_conv pds-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256 pds-embed-dims: 256_256_256_256
pyramid-ds-method: conv pds-ds-method: conv
pyramid-embed-norm: True pds-embed-norm: True
pyramid-position-embed: 1_1_1_1 pds-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5 pds-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8 pds-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4 pds-attn-heads: 4_4_4_4
train-subset: train_asr
valid-subset: dev_asr
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
......
...@@ -24,7 +24,8 @@ stop_stage=0 ...@@ -24,7 +24,8 @@ stop_stage=0
gpu_num=8 gpu_num=8
update_freq=1 update_freq=1
root_dir=~/st/Fairseq-S2T root_dir=~/st
code_dir=${root_dir}/Fairseq-S2T
pwd_dir=$PWD pwd_dir=$PWD
# dataset # dataset
...@@ -41,13 +42,16 @@ lcrm=0 ...@@ -41,13 +42,16 @@ lcrm=0
tokenizer=0 tokenizer=0
use_raw_audio=0 use_raw_audio=0
use_specific_dict=0 use_specific_dict=1
specific_prefix=st specific_prefix=st
specific_dir=/home/xuchen/st/data/mustc/st/en-de specific_dir=${root_dir}/data/mustc/st
asr_vocab_prefix=spm_unigram10000_st_share asr_vocab_prefix=spm_unigram10000_st_share
org_data_dir=~/st/data/${dataset} org_data_dir=${root_dir}/data/${dataset}
data_dir=~/st/data/${dataset}/asr data_dir=${root_dir}/data/${dataset}/asr
train_split=train
valid_split=dev
test_split=tst-COMMON
test_subset=tst-COMMON test_subset=tst-COMMON
# exp # exp
...@@ -59,7 +63,7 @@ exp_name= ...@@ -59,7 +63,7 @@ exp_name=
# config # config
train_config=ctc train_config=ctc
data_config=config_asr.yaml data_config=config.yaml
# training setting # training setting
fp16=1 fp16=1
...@@ -97,13 +101,12 @@ fi ...@@ -97,13 +101,12 @@ fi
if [[ -z ${exp_name} ]]; then if [[ -z ${exp_name} ]]; then
config_string=${train_config//,/_} config_string=${train_config//,/_}
# exp_name=${exp_prefix}_$(basename ${train_config%.*})_${exp_tag}
exp_name=${exp_prefix}_${config_string}_${exp_tag} exp_name=${exp_prefix}_${config_string}_${exp_tag}
if [[ -n ${extra_tag} ]]; then if [[ -n ${extra_tag} ]]; then
exp_name=${exp_name}_${extra_tag} exp_name=${exp_name}_${extra_tag}
fi fi
fi fi
model_dir=$root_dir/../checkpoints/$dataset/asr/${exp_name} model_dir=${code_dir}/../checkpoints/${dataset}/asr/${exp_name}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download" echo "stage -1: Data Download"
...@@ -114,11 +117,23 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then ...@@ -114,11 +117,23 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
### Task dependent. You have to make data the following preparation part by yourself. ### Task dependent. You have to make data the following preparation part by yourself.
### But you can utilize Kaldi recipes in most cases ### But you can utilize Kaldi recipes in most cases
echo "stage 0: ASR Data Preparation" echo "stage 0: ASR Data Preparation"
if [[ ! -e ${data_dir} ]]; then
mkdir -p ${data_dir}
fi
feature_zip=fbank80.zip
if [[ ${speed_perturb} -eq 1 ]]; then
feature_zip=fbank80_sp.zip
fi
# Link the feature archive into ${data_dir} when it is missing there but
# already present in the parent directory. The parent path must expand
# ${feature_zip}; a literal "feature_zip" file never exists, so the link
# would never be created.
if [[ ! -f ${data_dir}/${feature_zip} && -f ${data_dir}/../${feature_zip} ]]; then
    ln -s ${data_dir}/../${feature_zip} ${data_dir}
fi
cmd="python ${root_dir}/examples/speech_to_text/prep_mustc_data.py cmd="python ${code_dir}/examples/speech_to_text/prep_audio_data.py
--data-root ${org_data_dir} --data-root ${org_data_dir}
--output-root ${data_dir} --output-root ${data_dir}
--task asr --task asr
--src-lang ${src_lang}
--splits ${valid_split},${test_split},${train_split}
--vocab-type ${vocab_type} --vocab-type ${vocab_type}
--vocab-size ${vocab_size}" --vocab-size ${vocab_size}"
...@@ -127,7 +142,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then ...@@ -127,7 +142,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--raw" --raw"
fi fi
if [[ ${use_specific_dict} -eq 1 ]]; then if [[ ${use_specific_dict} -eq 1 ]]; then
cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}/${lang} cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}
cmd="$cmd cmd="$cmd
--asr-prefix ${asr_vocab_prefix}" --asr-prefix ${asr_vocab_prefix}"
fi fi
...@@ -147,6 +162,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then ...@@ -147,6 +162,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
echo -e "\033[34mRun command: \n${cmd} \033[0m" echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd} [[ $eval -eq 1 ]] && eval ${cmd}
if [[ -f ${data_dir}/${feature_zip} && ! -f ${data_dir}/../${feature_zip} ]]; then
mv ${data_dir}/${feature_zip} ${data_dir}/..
ln -s ${data_dir}/../${feature_zip} ${data_dir}
fi
fi fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
...@@ -173,28 +193,26 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ...@@ -173,28 +193,26 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
cp ${BASH_SOURCE[0]} ${model_dir} cp ${BASH_SOURCE[0]} ${model_dir}
cp ${PWD}/train.sh ${model_dir} cp ${PWD}/train.sh ${model_dir}
extra_parameter="${extra_parameter}
--train-config ${pwd_dir}/conf/basis.yaml"
cp ${pwd_dir}/conf/basis.yaml ${model_dir}
config_list="${train_config//,/ }" config_list="${train_config//,/ }"
idx=0 idx=1
for config in ${config_list[@]} for config in ${config_list[@]}
do do
config_path=$pwd_dir/conf/${config}.yaml config_path=${pwd_dir}/conf/${config}.yaml
if [[ ! -f ${config_path} ]]; then if [[ ! -f ${config_path} ]]; then
echo "No config file ${config_path}" echo "No config file ${config_path}"
exit exit
fi fi
cp ${config_path} ${model_dir} cp ${config_path} ${model_dir}
if [[ idx -eq 0 ]]; then extra_parameter="${extra_parameter}
extra_parameter="${extra_parameter}
--train-config ${config_path}"
else
extra_parameter="${extra_parameter}
--train-config${idx} ${config_path}" --train-config${idx} ${config_path}"
fi
idx=$((idx + 1)) idx=$((idx + 1))
done done
cmd="python3 -u ${root_dir}/fairseq_cli/train.py cmd="python3 -u ${code_dir}/fairseq_cli/train.py
${data_dir} ${data_dir}
--config-yaml ${data_config} --config-yaml ${data_config}
--task ${task} --task ${task}
...@@ -278,12 +296,14 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then ...@@ -278,12 +296,14 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# Average models # Average models
dec_model=avg_${n_average}_checkpoint.pt dec_model=avg_${n_average}_checkpoint.pt
cmd="python ${root_dir}/scripts/average_checkpoints.py if [[ ! -f ${model_dir}/${dec_model} ]]; then
--inputs ${model_dir} cmd="python ${code_dir}/scripts/average_checkpoints.py
--num-epoch-checkpoints ${n_average} --inputs ${model_dir}
--output ${model_dir}/${dec_model}" --num-best-checkpoints ${n_average}
echo -e "\033[34mRun command: \n${cmd} \033[0m" --output ${model_dir}/${dec_model}"
[[ $eval -eq 1 ]] && eval $cmd echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval $cmd
fi
else else
dec_model=${dec_model} dec_model=${dec_model}
fi fi
...@@ -303,8 +323,8 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then ...@@ -303,8 +323,8 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
test_subset=${test_subset//,/ } test_subset=${test_subset//,/ }
for subset in ${test_subset[@]}; do for subset in ${test_subset[@]}; do
subset=${subset}_asr subset=${subset}
cmd="python ${root_dir}/fairseq_cli/generate.py cmd="python ${code_dir}/fairseq_cli/generate.py
${data_dir} ${data_dir}
--config-yaml ${data_config} --config-yaml ${data_config}
--gen-subset ${subset} --gen-subset ${subset}
......
train-subset: train
valid-subset: valid
max-epoch: 50
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: transformer arch: transformer
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
......
train-subset: train
valid-subset: valid
max-epoch: 50
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: transformer arch: transformer
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
......
train-subset: train
valid-subset: valid
max-epoch: 50
max-update: 100000
patience: 20
best_checkpoint_metric: loss
maximize_best_checkpoint_metric: False
no-epoch-checkpoints: True
#keep-last-epochs: 10
keep-best-checkpoints: 10
num-workers: 8
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
\ No newline at end of file
...@@ -10,6 +10,7 @@ if [ "$#" -eq 1 ]; then ...@@ -10,6 +10,7 @@ if [ "$#" -eq 1 ]; then
exp_name=$1 exp_name=$1
fi fi
sacrebleu=1
n_average=10 n_average=10
beam_size=5 beam_size=5
len_penalty=1.0 len_penalty=1.0
...@@ -21,6 +22,7 @@ cmd="./run.sh ...@@ -21,6 +22,7 @@ cmd="./run.sh
--stop_stage 2 --stop_stage 2
--gpu_num ${gpu_num} --gpu_num ${gpu_num}
--exp_name ${exp_name} --exp_name ${exp_name}
--sacrebleu ${sacrebleu}
--n_average ${n_average} --n_average ${n_average}
--beam_size ${beam_size} --beam_size ${beam_size}
--len_penalty ${len_penalty} --len_penalty ${len_penalty}
......
...@@ -13,7 +13,7 @@ set -o pipefail ...@@ -13,7 +13,7 @@ set -o pipefail
export PYTHONIOENCODING=UTF-8 export PYTHONIOENCODING=UTF-8
eval=1 eval=1
time=$(date "+%m%d_%H%M") time=$(date "+%m%d")
stage=0 stage=0
stop_stage=0 stop_stage=0
...@@ -24,7 +24,8 @@ device=() ...@@ -24,7 +24,8 @@ device=()
gpu_num=8 gpu_num=8
update_freq=1 update_freq=1
root_dir=~/st/Fairseq-S2T root_dir=~/st
code_dir=${root_dir}/Fairseq-S2T
pwd_dir=$PWD pwd_dir=$PWD
# dataset # dataset
...@@ -42,12 +43,12 @@ tokenizer=0 ...@@ -42,12 +43,12 @@ tokenizer=0
use_specific_dict=0 use_specific_dict=0
specific_prefix=st specific_prefix=st
specific_dir=/home/xuchen/st/data/mustc/st/en-de/ specific_dir=${root_dir}/data/mustc/st
src_vocab_prefix=spm_unigram10000_st_share src_vocab_prefix=spm_unigram10000_st_share
tgt_vocab_prefix=spm_unigram10000_st_share tgt_vocab_prefix=spm_unigram10000_st_share
org_data_dir=~/st/data/${dataset} org_data_dir=${root_dir}/data/${dataset}
data_dir=~/st/data/${dataset}/mt/${lang} data_dir=${root_dir}/data/${dataset}/mt
train_subset=train train_subset=train
valid_subset=dev valid_subset=dev
trans_subset=tst-COMMON trans_subset=tst-COMMON
...@@ -70,6 +71,7 @@ step_valid=0 ...@@ -70,6 +71,7 @@ step_valid=0
bleu_valid=0 bleu_valid=0
# decoding setting # decoding setting
sacrebleu=1
dec_model=checkpoint_best.pt dec_model=checkpoint_best.pt
n_average=10 n_average=10
beam_size=5 beam_size=5
...@@ -106,7 +108,6 @@ fi ...@@ -106,7 +108,6 @@ fi
# full path # full path
if [[ -z ${exp_name} ]]; then if [[ -z ${exp_name} ]]; then
config_string=${train_config//,/_} config_string=${train_config//,/_}
# exp_name=${exp_prefix}_$(basename ${train_config%.*})_${exp_tag}
exp_name=${exp_prefix}_${config_string}_${exp_tag} exp_name=${exp_prefix}_${config_string}_${exp_tag}
if [[ -n ${extra_tag} ]]; then if [[ -n ${extra_tag} ]]; then
exp_name=${exp_name}_${extra_tag} exp_name=${exp_name}_${extra_tag}
...@@ -128,7 +129,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then ...@@ -128,7 +129,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
if [[ ! -f ${data_dir}/${src_vocab_prefix}.txt || ! -f ${data_dir}/${tgt_vocab_prefix}.txt ]]; then if [[ ! -f ${data_dir}/${src_vocab_prefix}.txt || ! -f ${data_dir}/${tgt_vocab_prefix}.txt ]]; then
if [[ ${use_specific_dict} -eq 0 ]]; then if [[ ${use_specific_dict} -eq 0 ]]; then
cmd="python ${root_dir}/examples/speech_to_text/prep_mt_data.py cmd="python ${code_dir}/examples/speech_to_text/prep_mt_data.py
--data-root ${org_data_dir} --data-root ${org_data_dir}
--output-root ${data_dir} --output-root ${data_dir}
--splits ${train_subset},${valid_subset},${trans_subset} --splits ${train_subset},${valid_subset},${trans_subset}
...@@ -151,9 +152,10 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then ...@@ -151,9 +152,10 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
mkdir -p ${data_dir}/data mkdir -p ${data_dir}/data
for split in ${train_subset} ${valid_subset} ${trans_subset}; do for split in ${train_subset} ${valid_subset} ${trans_subset}; do
{ {
cmd="cat ${org_data_dir}/${lang}/data/${split}/txt/${split}.${src_lang}" txt_dir=${org_data_dir}/data/${split}/txt
cmd="cat ${txt_dir}/${split}.${src_lang}"
if [[ ${lcrm} -eq 1 ]]; then if [[ ${lcrm} -eq 1 ]]; then
cmd="python local/lower_rm.py ${org_data_dir}/${lang}/data/${split}.${src_lang}" cmd="python local/lower_rm.py ${org_data_dir}/data/${split}.${src_lang}"
fi fi
cmd="${cmd} cmd="${cmd}
| spm_encode --model ${data_dir}/${src_vocab_prefix}.model | spm_encode --model ${data_dir}/${src_vocab_prefix}.model
...@@ -166,7 +168,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then ...@@ -166,7 +168,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
cmd="spm_encode cmd="spm_encode
--model ${data_dir}/${tgt_vocab_prefix}.model --model ${data_dir}/${tgt_vocab_prefix}.model
--output_format=piece --output_format=piece
< ${org_data_dir}/${lang}/data/${split}/txt/${split}.${tgt_lang} < ${txt_dir}/${split}.${tgt_lang}
> ${data_dir}/data/${split}.${tgt_lang}" > ${data_dir}/data/${split}.${tgt_lang}"
echo -e "\033[34mRun command: \n${cmd} \033[0m" echo -e "\033[34mRun command: \n${cmd} \033[0m"
...@@ -175,7 +177,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then ...@@ -175,7 +177,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
done done
wait wait
cmd="python ${root_dir}/fairseq_cli/preprocess.py cmd="python ${code_dir}/fairseq_cli/preprocess.py
--source-lang ${src_lang} --target-lang ${tgt_lang} --source-lang ${src_lang} --target-lang ${tgt_lang}
--trainpref ${data_dir}/data/${train_subset} --trainpref ${data_dir}/data/${train_subset}
--validpref ${data_dir}/data/${valid_subset} --validpref ${data_dir}/data/${valid_subset}
...@@ -215,28 +217,26 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ...@@ -215,28 +217,26 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
cp ${BASH_SOURCE[0]} ${model_dir} cp ${BASH_SOURCE[0]} ${model_dir}
cp ${PWD}/train.sh ${model_dir} cp ${PWD}/train.sh ${model_dir}
extra_parameter="${extra_parameter}
--train-config ${pwd_dir}/conf/basis.yaml"
cp ${pwd_dir}/conf/basis.yaml ${model_dir}
config_list="${train_config//,/ }" config_list="${train_config//,/ }"
idx=0 idx=1
for config in ${config_list[@]} for config in ${config_list[@]}
do do
config_path=$pwd_dir/conf/${config}.yaml config_path=${pwd_dir}/conf/${config}.yaml
if [[ ! -f ${config_path} ]]; then if [[ ! -f ${config_path} ]]; then
echo "No config file ${config_path}" echo "No config file ${config_path}"
exit exit
fi fi
cp ${config_path} ${model_dir} cp ${config_path} ${model_dir}
if [[ idx -eq 0 ]]; then extra_parameter="${extra_parameter}
extra_parameter="${extra_parameter}
--train-config ${config_path}"
else
extra_parameter="${extra_parameter}
--train-config${idx} ${config_path}" --train-config${idx} ${config_path}"
fi
idx=$((idx + 1)) idx=$((idx + 1))
done done
cmd="python3 -u ${root_dir}/fairseq_cli/train.py cmd="python3 -u ${code_dir}/fairseq_cli/train.py
${data_dir} ${data_dir}
--source-lang ${src_lang} --source-lang ${src_lang}
--target-lang ${tgt_lang} --target-lang ${tgt_lang}
...@@ -330,12 +330,14 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then ...@@ -330,12 +330,14 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# Average models # Average models
dec_model=avg_${n_average}_checkpoint.pt dec_model=avg_${n_average}_checkpoint.pt
cmd="python ${root_dir}/scripts/average_checkpoints.py if [[ ! -f ${model_dir}/${dec_model} ]]; then
--inputs ${model_dir} cmd="python ${code_dir}/scripts/average_checkpoints.py
--num-epoch-checkpoints ${n_average} --inputs ${model_dir}
--output ${model_dir}/${dec_model}" --num-best-checkpoints ${n_average}
echo -e "\033[34mRun command: \n${cmd} \033[0m" --output ${model_dir}/${dec_model}"
[[ $eval -eq 1 ]] && eval $cmd echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval $cmd
fi
else else
dec_model=${dec_model} dec_model=${dec_model}
fi fi
...@@ -355,7 +357,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then ...@@ -355,7 +357,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
test_subset=(${test_subset//,/ }) test_subset=(${test_subset//,/ })
for subset in ${test_subset[@]}; do for subset in ${test_subset[@]}; do
cmd="python ${root_dir}/fairseq_cli/generate.py cmd="python ${code_dir}/fairseq_cli/generate.py
${data_dir} ${data_dir}
--source-lang ${src_lang} --source-lang ${src_lang}
--target-lang ${tgt_lang} --target-lang ${tgt_lang}
...@@ -366,14 +368,17 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then ...@@ -366,14 +368,17 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--max-tokens ${max_tokens} --max-tokens ${max_tokens}
--beam ${beam_size} --beam ${beam_size}
--lenpen ${len_penalty} --lenpen ${len_penalty}
--post-process sentencepiece --post-process sentencepiece"
--scoring sacrebleu"
if [[ ${tokenizer} -eq 1 ]]; then if [[ ${sacrebleu} -eq 1 ]]; then
cmd="${cmd}
--scoring sacrebleu"
if [[ ${tokenizer} -eq 1 ]]; then
cmd="${cmd} cmd="${cmd}
--tokenizer moses --tokenizer moses
--moses-source-lang ${src_lang} --moses-source-lang ${src_lang}
--moses-target-lang ${tgt_lang}" --moses-target-lang ${tgt_lang}"
fi
fi fi
echo -e "\033[34mRun command: \n${cmd} \033[0m" echo -e "\033[34mRun command: \n${cmd} \033[0m"
......
train-subset: train_st
valid-subset: dev_st
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_transformer_s arch: s2t_transformer_s
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
...@@ -42,3 +26,6 @@ decoder-ffn-embed-dim: 2048 ...@@ -42,3 +26,6 @@ decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4 decoder-attention-heads: 4
attention-dropout: 0.1 attention-dropout: 0.1
activation-dropout: 0.1 activation-dropout: 0.1
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
train-subset: train
valid-subset: dev
max-epoch: 100
max-update: 100000
patience: 20
best_checkpoint_metric: loss
maximize_best_checkpoint_metric: False
no-epoch-checkpoints: True
#keep-last-epochs: 10
keep-best-checkpoints: 10
num-workers: 8
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
\ No newline at end of file
ctc-weight: 0.3 ctc-weight: 0.3
\ No newline at end of file post-process: sentencepiece
\ No newline at end of file
arch: pdss2t_transformer_s_8 arch: pdss2t_transformer_s_8
pds-fusion: True
train-subset: train_st ctc-layer: 12
valid-subset: dev_st
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
...@@ -38,3 +24,6 @@ encoder-attention-heads: 4 ...@@ -38,3 +24,6 @@ encoder-attention-heads: 4
decoder-embed-dim: 256 decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4 decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: pdss2t_transformer_s_16 arch: pdss2t_transformer_s_16
encoder-embed-dim: 256 encoder-embed-dim: 256
pyramid-stages: 4 pds-stages: 4
#pyramid-dropout: 0 ctc-layer: 12
pyramid-layers: 2_2_6_2 pds-layers: 2_2_6_2
pyramid-ratios: 2_2_2_2 pds-ratios: 2_2_2_2
pyramid-fusion: True pds-fusion: True
pyramid-fusion-method: all_conv pds-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256 pds-embed-dims: 256_256_256_256
pyramid-ds-method: conv pds-ds-method: conv
pyramid-embed-norm: True pds-embed-norm: True
pyramid-position-embed: 1_1_1_1 pds-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5 pds-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8 pds-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4 pds-attn-heads: 4_4_4_4
train-subset: train_st
valid-subset: dev_st
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
...@@ -53,3 +37,6 @@ encoder-attention-heads: 4 ...@@ -53,3 +37,6 @@ encoder-attention-heads: 4
decoder-embed-dim: 256 decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4 decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: pdss2t_transformer_s_32 arch: pdss2t_transformer_s_32
encoder-embed-dim: 256 encoder-embed-dim: 256
pyramid-stages: 5 pds-stages: 5
#pyramid-dropout: 0 ctc-layer: 12
pyramid-layers: 2_2_3_3_2 pds-layers: 2_2_3_3_2
pyramid-ratios: 2_2_2_2_2 pds-ratios: 2_2_2_2_2
pyramid-fusion: True pds-fusion: True
pyramid-fusion-method: all_conv pds-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256_256 pds-embed-dims: 256_256_256_256_256
pyramid-ds-method: conv pds-ds-method: conv
pyramid-embed-norm: True pds-embed-norm: True
pyramid-position-embed: 1_1_1_1_1 pds-position-embed: 1_1_1_1_1
pyramid-kernel-sizes: 5_5_5_5_5 pds-kernel-sizes: 5_5_5_5_5
pyramid-ffn-ratios: 8_8_8_8_8 pds-ffn-ratios: 8_8_8_8_8
pyramid-attn-heads: 4_4_4_4_4 pds-attn-heads: 4_4_4_4_4
train-subset: train_st
valid-subset: dev_st
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
...@@ -53,3 +37,6 @@ encoder-attention-heads: 4 ...@@ -53,3 +37,6 @@ encoder-attention-heads: 4
decoder-embed-dim: 256 decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4 decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: pdss2t_transformer_s_8 arch: pdss2t_transformer_s_8
encoder-embed-dim: 256 encoder-embed-dim: 256
pyramid-stages: 4 pds-stages: 4
#pyramid-dropout: 0 ctc-layer: 12
pyramid-layers: 3_3_3_3 pds-layers: 3_3_3_3
pyramid-ratios: 2_2_1_2 pds-ratios: 2_2_1_2
pyramid-fusion: True pds-fusion: True
pyramid-fusion-method: all_conv pds-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256 pds-embed-dims: 256_256_256_256
pyramid-ds-method: conv pds-ds-method: conv
pyramid-embed-norm: True pds-embed-norm: True
pyramid-position-embed: 1_1_1_1 pds-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5 pds-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8 pds-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4 pds-attn-heads: 4_4_4_4
train-subset: train_st
valid-subset: dev_st
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
...@@ -53,3 +37,6 @@ encoder-attention-heads: 4 ...@@ -53,3 +37,6 @@ encoder-attention-heads: 4
decoder-embed-dim: 256 decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4 decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
train-subset: train_st
valid-subset: dev_st
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_sate arch: s2t_sate
share-decoder-input-output-embed: True share-decoder-input-output-embed: True
optimizer: adam optimizer: adam
...@@ -43,6 +25,11 @@ text-encoder-layers: 6 ...@@ -43,6 +25,11 @@ text-encoder-layers: 6
decoder-layers: 6 decoder-layers: 6
encoder-attention-heads: 4 encoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
#macaron-style: True #macaron-style: True
#use-cnn-module: True #use-cnn-module: True
#cnn-module-kernel: 31 #cnn-module-kernel: 31
...@@ -52,20 +39,20 @@ acoustic-encoder: transformer ...@@ -52,20 +39,20 @@ acoustic-encoder: transformer
adapter: league adapter: league
encoder-embed-dim: 256 encoder-embed-dim: 256
pyramid-stages: 4 pds-stages: 4
#pyramid-dropout: 0 #pds-dropout: 0
pyramid-layers: 3_3_3_3 pds-layers: 3_3_3_3
pyramid-ratios: 2_2_1_2 pds-ratios: 2_2_1_2
pyramid-fusion: True pds-fusion: True
pyramid-fusion-method: all_conv pds-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256 pds-embed-dims: 256_256_256_256
pyramid-ds-method: conv pds-ds-method: conv
pyramid-embed-norm: True pds-embed-norm: True
pyramid-position-embed: 1_1_1_1 pds-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5 pds-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8 pds-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4 pds-attn-heads: 4_4_4_4
decoder-embed-dim: 256 decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048 decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4 decoder-attention-heads: 4
\ No newline at end of file
...@@ -3,13 +3,14 @@ ...@@ -3,13 +3,14 @@
gpu_num=1 gpu_num=1
data_dir= data_dir=
test_subset=(tst-COMMON) test_subset=(dev tst-COMMON)
exp_name= exp_name=
if [ "$#" -eq 1 ]; then if [ "$#" -eq 1 ]; then
exp_name=$1 exp_name=$1
fi fi
sacrebleu=1
n_average=10 n_average=10
beam_size=5 beam_size=5
len_penalty=1.0 len_penalty=1.0
...@@ -21,6 +22,7 @@ cmd="./run.sh ...@@ -21,6 +22,7 @@ cmd="./run.sh
--stop_stage 2 --stop_stage 2
--gpu_num ${gpu_num} --gpu_num ${gpu_num}
--exp_name ${exp_name} --exp_name ${exp_name}
--sacrebleu ${sacrebleu}
--n_average ${n_average} --n_average ${n_average}
--beam_size ${beam_size} --beam_size ${beam_size}
--len_penalty ${len_penalty} --len_penalty ${len_penalty}
...@@ -31,7 +33,7 @@ cmd="./run.sh ...@@ -31,7 +33,7 @@ cmd="./run.sh
if [[ -n ${data_dir} ]]; then if [[ -n ${data_dir} ]]; then
cmd="$cmd --data_dir ${data_dir}" cmd="$cmd --data_dir ${data_dir}"
fi fi
if [[ ${#test_subset[@]} -eq 0 ]]; then if [[ ${#test_subset[@]} -ne 0 ]]; then
subsets=$(echo ${test_subset[*]} | sed 's/ /,/g') subsets=$(echo ${test_subset[*]} | sed 's/ /,/g')
cmd="$cmd --test_subset ${subsets}" cmd="$cmd --test_subset ${subsets}"
fi fi
......
...@@ -24,7 +24,8 @@ stop_stage=0 ...@@ -24,7 +24,8 @@ stop_stage=0
gpu_num=8 gpu_num=8
update_freq=1 update_freq=1
root_dir=~/st/Fairseq-S2T root_dir=~/st
code_dir=${root_dir}/Fairseq-S2T
pwd_dir=$PWD pwd_dir=$PWD
# dataset # dataset
...@@ -41,15 +42,19 @@ share_dict=1 ...@@ -41,15 +42,19 @@ share_dict=1
speed_perturb=0 speed_perturb=0
lcrm=0 lcrm=0
tokenizer=0 tokenizer=0
use_raw_audio=0
use_specific_dict=0 use_specific_dict=0
specific_prefix=valid specific_prefix=valid
specific_dir=/home/xuchen/st/data/mustc/st_lcrm/en-de specific_dir=${root_dir}/data/mustc/st
asr_vocab_prefix=spm_unigram10000_st_share asr_vocab_prefix=spm_unigram10000_st_share
st_vocab_prefix=spm_unigram10000_st_share st_vocab_prefix=spm_unigram10000_st_share
org_data_dir=~/st/data/${dataset} org_data_dir=${root_dir}/data/${dataset}
data_dir=~/st/data/${dataset}/st data_dir=${root_dir}/data/${dataset}/st
train_split=train
valid_split=dev
test_split=tst-COMMON
test_subset=tst-COMMON test_subset=tst-COMMON
# exp # exp
...@@ -60,7 +65,7 @@ exp_tag=baseline ...@@ -60,7 +65,7 @@ exp_tag=baseline
exp_name= exp_name=
# config # config
train_config=ctc train_config=base,ctc
# training setting # training setting
fp16=1 fp16=1
...@@ -69,15 +74,16 @@ step_valid=0 ...@@ -69,15 +74,16 @@ step_valid=0
bleu_valid=0 bleu_valid=0
# decoding setting # decoding setting
sacrebleu=1
dec_model=checkpoint_best.pt dec_model=checkpoint_best.pt
n_average=10 n_average=10
beam_size=5 beam_size=5
len_penalty=1.0 len_penalty=1.0
if [[ ${share_dict} -eq 1 ]]; then if [[ ${share_dict} -eq 1 ]]; then
data_config=config_st_share.yaml data_config=config_share.yaml
else else
data_config=config_st.yaml data_config=config.yaml
fi fi
if [[ ${speed_perturb} -eq 1 ]]; then if [[ ${speed_perturb} -eq 1 ]]; then
data_dir=${data_dir}_sp data_dir=${data_dir}_sp
...@@ -95,18 +101,21 @@ if [[ ${tokenizer} -eq 1 ]]; then ...@@ -95,18 +101,21 @@ if [[ ${tokenizer} -eq 1 ]]; then
data_dir=${data_dir}_tok data_dir=${data_dir}_tok
exp_prefix=${exp_prefix}_tok exp_prefix=${exp_prefix}_tok
fi fi
if [[ ${use_raw_audio} -eq 1 ]]; then
data_dir=${data_dir}_raw
exp_prefix=${exp_prefix}_raw
fi
. ./local/parse_options.sh || exit 1; . ./local/parse_options.sh || exit 1;
if [[ -z ${exp_name} ]]; then if [[ -z ${exp_name} ]]; then
config_string=${train_config//,/_} config_string=${train_config//,/_}
# exp_name=${exp_prefix}_$(basename ${train_config%.*})_${exp_tag}
exp_name=${exp_prefix}_${config_string}_${exp_tag} exp_name=${exp_prefix}_${config_string}_${exp_tag}
if [[ -n ${extra_tag} ]]; then if [[ -n ${extra_tag} ]]; then
exp_name=${exp_name}_${extra_tag} exp_name=${exp_name}_${extra_tag}
fi fi
fi fi
model_dir=$root_dir/../checkpoints/$dataset/st/${exp_name} model_dir=$code_dir/../checkpoints/$dataset/st/${exp_name}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download" echo "stage -1: Data Download"
...@@ -117,37 +126,49 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then ...@@ -117,37 +126,49 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
### Task dependent. You have to make data the following preparation part by yourself. ### Task dependent. You have to make data the following preparation part by yourself.
### But you can utilize Kaldi recipes in most cases ### But you can utilize Kaldi recipes in most cases
echo "stage 0: ASR Data Preparation" echo "stage 0: ASR Data Preparation"
if [[ ! -e ${data_dir}/${lang} ]]; then if [[ ! -e ${data_dir} ]]; then
mkdir -p ${data_dir}/${lang} mkdir -p ${data_dir}
fi
if [[ ! -f ${data_dir}/fbank80.zip && -f ${data_dir}/../fbank80.zip ]]; then
ln -s ${data_dir}/../fbank80.zip ${data_dir}
fi
if [[ ! -f ${data_dir}/fbank80_sp.zip && -f ${data_dir}/../fbank80_sp.zip ]]; then
ln -s ${data_dir}/../fbank80_sp.zip ${data_dir}
fi fi
cmd="python ${root_dir}/examples/speech_to_text/prep_mustc_data.py # create ASR vocabulary if necessary
cmd="python ${code_dir}/examples/speech_to_text/prep_audio_data.py
--data-root ${org_data_dir} --data-root ${org_data_dir}
--output-root ${data_dir} --output-root ${data_dir}/asr4st
--task asr --task asr
--raw
--src-lang ${src_lang}
--splits ${valid_split},${test_split},${train_split}
--vocab-type ${vocab_type} --vocab-type ${vocab_type}
--vocab-size ${asr_vocab_size}" --vocab-size ${asr_vocab_size}"
if [[ ${speed_perturb} -eq 1 ]]; then [[ $eval -eq 1 && ${share_dict} -ne 1 && ${use_specific_dict} -ne 1 ]] && (echo -e "\033[34mRun command: \n${cmd} \033[0m" && eval $cmd)
cmd="$cmd
--speed-perturb"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 && ${share_dict} -ne 1 && ${use_specific_dict} -ne 1 ]] && eval $cmd
asr_prefix=spm_${vocab_type}${asr_vocab_size}_asr asr_prefix=spm_${vocab_type}${asr_vocab_size}_asr
echo "stage 0: ST Data Preparation" echo "stage 0: ST Data Preparation"
cmd="python ${root_dir}/examples/speech_to_text/prep_mustc_data.py cmd="python ${code_dir}/examples/speech_to_text/prep_audio_data.py
--data-root ${org_data_dir} --data-root ${org_data_dir}
--output-root ${data_dir} --output-root ${data_dir}
--task st --task st
--add-src --add-src
--src-lang ${src_lang}
--tgt-lang ${tgt_lang}
--splits ${valid_split},${test_split},${train_split}
--cmvn-type utterance --cmvn-type utterance
--vocab-type ${vocab_type} --vocab-type ${vocab_type}
--vocab-size ${vocab_size}" --vocab-size ${vocab_size}"
if [[ ${use_raw_audio} -eq 1 ]]; then
cmd="$cmd
--raw"
fi
if [[ ${use_specific_dict} -eq 1 ]]; then if [[ ${use_specific_dict} -eq 1 ]]; then
cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}/${lang} cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}
cp -r ${specific_dir}/${st_vocab_prefix}.* ${data_dir}/${lang} cp -r ${specific_dir}/${st_vocab_prefix}.* ${data_dir}
if [[ $share_dict -eq 1 ]]; then if [[ $share_dict -eq 1 ]]; then
cmd="$cmd cmd="$cmd
--share --share
...@@ -182,9 +203,16 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then ...@@ -182,9 +203,16 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
echo -e "\033[34mRun command: \n${cmd} \033[0m" echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd} [[ $eval -eq 1 ]] && eval ${cmd}
fi
data_dir=${data_dir}/${lang} if [[ ! -f ${data_dir}/../fbank80.zip ]]; then
mv ${data_dir}/fbank80.zip ${data_dir}/..
ln -s ${data_dir}/../fbank80.zip ${data_dir}
fi
if [[ ! -f ${data_dir}/../fbank80_sp.zip ]]; then
mv ${data_dir}/fbank80_sp.zip ${data_dir}/..
ln -s ${data_dir}/../fbank80_sp.zip ${data_dir}
fi
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "stage 1: ST Network Training" echo "stage 1: ST Network Training"
...@@ -210,28 +238,26 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ...@@ -210,28 +238,26 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
cp ${BASH_SOURCE[0]} ${model_dir} cp ${BASH_SOURCE[0]} ${model_dir}
cp ${PWD}/train.sh ${model_dir} cp ${PWD}/train.sh ${model_dir}
extra_parameter="${extra_parameter}
--train-config ${pwd_dir}/conf/basis.yaml"
cp ${pwd_dir}/conf/basis.yaml ${model_dir}
config_list="${train_config//,/ }" config_list="${train_config//,/ }"
idx=0 idx=1
for config in ${config_list[@]} for config in ${config_list[@]}
do do
config_path=$pwd_dir/conf/${config}.yaml config_path=${pwd_dir}/conf/${config}.yaml
if [[ ! -f ${config_path} ]]; then if [[ ! -f ${config_path} ]]; then
echo "No config file ${config_path}" echo "No config file ${config_path}"
exit exit
fi fi
cp ${config_path} ${model_dir} cp ${config_path} ${model_dir}
if [[ idx -eq 0 ]]; then extra_parameter="${extra_parameter}
extra_parameter="${extra_parameter}
--train-config ${config_path}"
else
extra_parameter="${extra_parameter}
--train-config${idx} ${config_path}" --train-config${idx} ${config_path}"
fi
idx=$((idx + 1)) idx=$((idx + 1))
done done
cmd="python3 -u ${root_dir}/fairseq_cli/train.py cmd="python3 -u ${code_dir}/fairseq_cli/train.py
${data_dir} ${data_dir}
--config-yaml ${data_config} --config-yaml ${data_config}
--task ${task} --task ${task}
...@@ -324,12 +350,14 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then ...@@ -324,12 +350,14 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# Average models # Average models
dec_model=avg_${n_average}_checkpoint.pt dec_model=avg_${n_average}_checkpoint.pt
cmd="python ${root_dir}/scripts/average_checkpoints.py if [[ ! -f ${model_dir}/${dec_model} ]]; then
--inputs ${model_dir} cmd="python ${code_dir}/scripts/average_checkpoints.py
--num-epoch-checkpoints ${n_average} --inputs ${model_dir}
--output ${model_dir}/${dec_model}" --num-best-checkpoints ${n_average}
echo -e "\033[34mRun command: \n${cmd} \033[0m" --output ${model_dir}/${dec_model}"
[[ $eval -eq 1 ]] && eval $cmd echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval $cmd
fi
else else
dec_model=${dec_model} dec_model=${dec_model}
fi fi
...@@ -348,9 +376,9 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then ...@@ -348,9 +376,9 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
[[ -f ${result_file} ]] && rm ${result_file} [[ -f ${result_file} ]] && rm ${result_file}
test_subset=${test_subset//,/ } test_subset=${test_subset//,/ }
for subset in "${test_subset[@]}"; do for subset in ${test_subset[@]}; do
subset=${subset}_st subset=${subset}
cmd="python ${root_dir}/fairseq_cli/generate.py cmd="python ${code_dir}/fairseq_cli/generate.py
${data_dir} ${data_dir}
--config-yaml ${data_config} --config-yaml ${data_config}
--gen-subset ${subset} --gen-subset ${subset}
...@@ -359,8 +387,19 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then ...@@ -359,8 +387,19 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--results-path ${model_dir} --results-path ${model_dir}
--max-tokens ${max_tokens} --max-tokens ${max_tokens}
--beam ${beam_size} --beam ${beam_size}
--lenpen ${len_penalty} --lenpen ${len_penalty}"
if [[ ${sacrebleu} -eq 1 ]]; then
cmd="${cmd}
--scoring sacrebleu" --scoring sacrebleu"
if [[ ${tokenizer} -eq 1 ]]; then
cmd="${cmd}
--tokenizer moses
--moses-source-lang ${src_lang}
--moses-target-lang ${tgt_lang}"
fi
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m" echo -e "\033[34mRun command: \n${cmd} \033[0m"
if [[ $eval -eq 1 ]]; then if [[ $eval -eq 1 ]]; then
......
# Prepare the dataset under data_dir for the src_lang-tgt_lang pair:
# copy the existing vocabulary files next to the data, remove the old
# fbank80.zip archive, then run prep_st_data.py on the listed subsets.
set -e

# 1 = run the preparation command; any other value only prints it.
eval=1
# Optional switches; each one appends extra flags to the command below.
lcrm=0
tokenizer=0

root_dir=~/st/Fairseq-S2T
data_dir=~/st/data/test
vocab_dir=~/st/data/mustc/st/en-de
asr_vocab_prefix=spm_unigram10000_st_share

src_lang=en
tgt_lang=de

subsets=(2019)

# Language-pair directory that receives the vocabulary files.
pair_dir=${data_dir}/${src_lang}-${tgt_lang}

# The glob is kept outside the quotes so that it still expands.
cp -r "${vocab_dir}/${asr_vocab_prefix}".* "${pair_dir}"
rm -rf "${pair_dir}/fbank80.zip"

# Join the subset names with commas for --splits.
splits=$(IFS=,; echo "${subsets[*]}")

cmd="python ${root_dir}/examples/speech_to_text/prep_st_data.py
--data-root ${data_dir}
--output-root ${data_dir}
--splits ${splits}
--task asr
--src-lang ${src_lang}
--tgt-lang ${tgt_lang}
--add-src
--share
--asr-prefix ${asr_vocab_prefix}
--cmvn-type utterance"

if [[ ${lcrm} -eq 1 ]]; then
    cmd="$cmd
--lowercase-src
--rm-punc-src"
fi
if [[ ${tokenizer} -eq 1 ]]; then
    cmd="$cmd
--tokenizer"
fi

echo -e "\033[34mRun command: \n${cmd} \033[0m"
# Use an explicit if: a trailing "[[ ... ]] && eval" would make a dry run
# (eval != 1) finish with a non-zero exit status.
if [[ $eval -eq 1 ]]; then
    eval ${cmd}
fi
# Training options for arch s2t_transformer_s on the ASR subsets,
# using the label-smoothed cross-entropy criterion with CTC.
# Data subsets.
train-subset: train_asr
valid-subset: dev_asr
# Run-length limits.
max-epoch: 100
max-update: 100000
# Runtime, early-stopping and logging options.
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
# Optional pretrained checkpoints; disabled because no path is given.
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
# Model family and embedding sharing.
arch: s2t_transformer_s
share-decoder-input-output-embed: True
# Optimizer and learning-rate schedule.
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
#adam_betas: (0.9,0.98)
# Training criterion.
# NOTE(review): label_smoothing uses an underscore while the neighbouring keys
# are hyphenated; confirm the config loader accepts this spelling.
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
# Convolution settings.
conv-kernel-sizes: 5,5
conv-channels: 1024
# Dropout and activation.
dropout: 0.1
activation-fn: relu
# Encoder and decoder sizes.
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
attention-dropout: 0.1
activation-dropout: 0.1
# Extra architecture switches; the names suggest Conformer-style modules and
# DLCL layer connections. TODO confirm against the model code.
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 31
use-enc-dlcl: True
use-dec-dlcl: True
# Encoder attention variant and its mask parameters.
encoder-attention-type: local
hard-mask-window: 0
gauss-mask-sigma: 3
init-mask-weight: 0
\ No newline at end of file
# Training options for arch pdss2t_transformer_s_16 on the ASR subsets.
arch: pdss2t_transformer_s_16
encoder-embed-dim: 256
# Stage-wise encoder settings: 4 stages, and every underscore-separated list
# below carries 4 entries (presumably one value per stage; TODO confirm).
# NOTE(review): other configs in this repository spell these options `pds-*`;
# confirm that the `pyramid-*` names are still accepted by this arch.
pyramid-stages: 4
#pyramid-dropout: 0
pyramid-layers: 2_2_6_2
pyramid-ratios: 2_2_2_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4
# Data subsets.
train-subset: train_asr
valid-subset: dev_asr
# Run-length limits.
max-epoch: 100
max-update: 100000
# Runtime, early-stopping and logging options.
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
# Optional pretrained checkpoints; disabled because no path is given.
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True
# Optimizer and learning-rate schedule.
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
#adam_betas: (0.9,0.98)
# Training criterion.
# NOTE(review): label_smoothing uses an underscore while the neighbouring keys
# are hyphenated; confirm the config loader accepts this spelling.
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
# Dropout and activation.
dropout: 0.1
activation-fn: relu
# Encoder and decoder sizes.
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
# Training options for arch pdss2t_transformer_s_32 on the ASR subsets.
arch: pdss2t_transformer_s_32
encoder-embed-dim: 256
# Stage-wise encoder settings: 5 stages, and every underscore-separated list
# below carries 5 entries (presumably one value per stage; TODO confirm).
# NOTE(review): other configs in this repository spell these options `pds-*`;
# confirm that the `pyramid-*` names are still accepted by this arch.
pyramid-stages: 5
#pyramid-dropout: 0
pyramid-layers: 2_2_3_3_2
pyramid-ratios: 2_2_2_2_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1_1
pyramid-kernel-sizes: 5_5_5_5_5
pyramid-ffn-ratios: 8_8_8_8_8
pyramid-attn-heads: 4_4_4_4_4
# Data subsets.
train-subset: train_asr
valid-subset: dev_asr
# Run-length limits.
max-epoch: 100
max-update: 100000
# Runtime, early-stopping and logging options.
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
# Optional pretrained checkpoints; disabled because no path is given.
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True
# Optimizer and learning-rate schedule.
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
#adam_betas: (0.9,0.98)
# Training criterion.
# NOTE(review): label_smoothing uses an underscore while the neighbouring keys
# are hyphenated; confirm the config loader accepts this spelling.
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
# Dropout and activation.
dropout: 0.1
activation-fn: relu
# Encoder and decoder sizes.
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
# Training options for arch pdss2t_transformer_s_8 on the ASR subsets.
arch: pdss2t_transformer_s_8
encoder-embed-dim: 256
# Stage-wise encoder settings: 4 stages, and every underscore-separated list
# below carries 4 entries (presumably one value per stage; TODO confirm).
# NOTE(review): other configs in this repository spell these options `pds-*`;
# confirm that the `pyramid-*` names are still accepted by this arch.
pyramid-stages: 4
#pyramid-dropout: 0
pyramid-layers: 3_3_3_3
pyramid-ratios: 2_2_1_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4
# Data subsets.
train-subset: train_asr
valid-subset: dev_asr
# Run-length limits.
max-epoch: 100
max-update: 100000
# Runtime, early-stopping and logging options.
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
# Optional pretrained checkpoints; disabled because no path is given.
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True
# Optimizer and learning-rate schedule.
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
#adam_betas: (0.9,0.98)
# Training criterion.
# NOTE(review): label_smoothing uses an underscore while the neighbouring keys
# are hyphenated; confirm the config loader accepts this spelling.
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
# Dropout and activation.
dropout: 0.1
activation-fn: relu
# Encoder and decoder sizes.
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
# Encoder attention variant. The commented-out lines keep an alternative
# setting ("relative" with a maximum relative length) for reference.
encoder-attention-type: rel_selfattn
#encoder-attention-type: relative
#max-encoder-relative-length: 100
#! /bin/bash

# Decode with a trained model by running ./run.sh at stage 2 only.
# Usage: pass the experiment name as the single optional argument.

gpu_num=1
# Left empty so that run.sh keeps its own default data directory.
data_dir=
test_subset=(test)

exp_name=
if [ "$#" -eq 1 ]; then
    exp_name=$1
fi

# Decoding settings forwarded to run.sh.
n_average=10
beam_size=5
len_penalty=1.0
max_tokens=80000
dec_model=checkpoint_best.pt

cmd="./run.sh
--stage 2
--stop_stage 2
--gpu_num ${gpu_num}
--n_average ${n_average}
--beam_size ${beam_size}
--len_penalty ${len_penalty}
--max_tokens ${max_tokens}
--dec_model ${dec_model}
"

# Options are parsed as "--name value" pairs, so an empty value would make the
# parser take the following flag as the value. Pass --exp_name only when set.
if [[ -n ${exp_name} ]]; then
    cmd="$cmd --exp_name ${exp_name}"
fi
if [[ -n ${data_dir} ]]; then
    cmd="$cmd --data_dir ${data_dir}"
fi
if [[ ${#test_subset[@]} -ne 0 ]]; then
    # run.sh expects the subsets as one comma-separated value.
    subsets=$(echo ${test_subset[*]} | sed 's/ /,/g')
    cmd="$cmd --test_subset ${subsets}"
fi

echo $cmd
eval $cmd
# Poll gpustat until gpu_num devices look idle, then launch the training
# command once and exit.
gpu_num=4
cmd="sh train.sh"

while :
do
    # Snapshot the gpustat output so every device is read from the same sample.
    record=$(mktemp -t temp.record.XXXXXX)
    gpustat > $record
    # Device indices 0..N, where N is the line count after dropping two lines.
    all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");

    # Start every poll from an empty list so stale entries cannot linger.
    device=()
    count=0
    for dev in ${all_devices[@]}
    do
        # Line dev+2 of the snapshot describes device dev.
        line=$((dev + 2))
        # Third '|' field, text before '/': presumably the used memory in MB.
        use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
        if [[ $use -lt 100 ]]; then
            device[$count]=$dev
            count=$((count + 1))
            if [[ $count -eq $gpu_num ]]; then
                break
            fi
        fi
    done
    # The snapshot is no longer needed; remove it so polls do not pile up files.
    rm -f "${record}"

    if [[ ${#device[@]} -lt $gpu_num ]]; then
        sleep 60s
    else
        echo "Run $cmd"
        eval $cmd
        sleep 10s
        exit
    fi
done
#!/usr/bin/env bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey);
# Arnab Ghoshal, Karel Vesely
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Parse command-line options.
# To be sourced by another script (as in ". parse_options.sh").
# Option format is: --option-name arg
# and shell variable "option_name" gets set to value "arg."
# The exception is --help, which takes no arguments, but prints the
# $help_message variable (if defined).
# Only variables that the sourcing script has already defined can be set;
# any other option name is rejected as invalid.
###
### The --config file options have lower priority to command line
### options, so we need to import them first...
###
# Now import all the configs specified by command-line, in left-to-right order
# The loop stops one position before the last argument because --config
# needs a following argument that holds the file name.
for ((argpos=1; argpos<$#; argpos++)); do
if [ "${!argpos}" == "--config" ]; then
argpos_plus1=$((argpos+1))
config=${!argpos_plus1}
[ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
. $config # source the config file.
fi
done
###
### Now we process the command line options
###
while true; do
[ -z "${1:-}" ] && break; # break if there are no arguments
case "$1" in
# If the enclosing script is called with --help option, print the help
# message and exit. Scripts should put help messages in $help_message
--help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
else printf "$help_message\n" 1>&2 ; fi;
exit 0 ;;
# The "--name=value" spelling is not supported; report it and stop.
--*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
exit 1 ;;
# If the first command-line argument begins with "--" (e.g. --foo-bar),
# then work out the variable name as $name, which will equal "foo_bar".
--*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
# Next we test whether the variable in question is undefined-- if so it's
# an invalid option and we die. Note: $0 evaluates to the name of the
# enclosing script.
# The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
# is undefined. We then have to wrap this test inside "eval" because
# foo_bar is itself inside a variable ($name).
eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
# Remember the current value so we can tell whether it was Boolean.
oldval="`eval echo \\$$name`";
# Work out whether we seem to be expecting a Boolean argument.
if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
was_bool=true;
else
was_bool=false;
fi
# Set the variable to the right value-- the escaped quotes make it work if
# the option had spaces, like --cmd "queue.pl -sync y"
eval $name=\"$2\";
# Check that Boolean-valued arguments are really Boolean.
# Note that the assignment above has already happened when this check fails.
if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
exit 1;
fi
# Consume the option name and its value.
shift 2;
;;
# The first argument that does not start with "--" ends option parsing.
*) break;
esac
done
# Check for an empty argument to the --cmd option, which can easily occur as a
# result of scripting errors.
[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
true; # so this script returns exit code 0.
# Print a comma-separated list of idle GPU indices on stdout.
#
# Arguments:
#   $1  gpu_num  number of idle GPUs required
#   $2  use_cpu  1 = print -1 at once when too few GPUs are idle;
#                any other value = poll every 60s until enough are idle
#
# Sets the variables gpu_num, use_cpu and device in the calling shell,
# exactly as before. Requires the gpustat command.
get_devices(){
    gpu_num=$1
    use_cpu=$2
    device=()
    while :
    do
        # Snapshot the gpustat output so every device is read from one sample.
        record=$(mktemp -t temp.record.XXXXXX)
        gpustat > $record
        # Device indices 0..N, where N is the line count after dropping two lines.
        all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");

        # Start every poll from an empty list so stale entries cannot linger.
        device=()
        count=0
        for dev in ${all_devices[@]}
        do
            # Line dev+2 of the snapshot describes device dev.
            line=$((dev + 2))
            # Third '|' field, text before '/': presumably the used memory in MB.
            use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
            if [[ $use -lt 100 ]]; then
                device[$count]=$dev
                count=$((count + 1))
                if [[ $count -eq $gpu_num ]]; then
                    break
                fi
            fi
        done
        # The snapshot is no longer needed; remove it so polls do not pile up files.
        rm -f "${record}"

        if [[ ${#device[@]} -ge $gpu_num ]]; then
            break
        fi
        if [[ $use_cpu -eq 1 ]]; then
            # CPU fallback: leave the loop right away. Without this break the
            # loop polled again immediately, and for gpu_num > 1 it never
            # returned -1.
            device=(-1)
            break
        fi
        sleep 60s
    done

    echo ${device[*]} | sed 's/ /,/g'
    return $?
}
#! /bin/bash
# Processing ASR Datasets
# Copyright 2021 Natural Language Processing Laboratory
# Xu Chen (xuchenneu@163.com)
# Set bash to 'debug' mode, it will exit on :
# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
set -e
#set -u
set -o pipefail
export PYTHONIOENCODING=UTF-8
eval=1
time=$(date "+%m%d_%H%M")
stage=0
stop_stage=0
######## hardware ########
# devices
#device=()
gpu_num=8
update_freq=1
root_dir=~/st/Fairseq-S2T
pwd_dir=$PWD
# dataset
src_lang=en
lang=${src_lang}
dataset=asr
task=speech_to_text
vocab_type=unigram
vocab_size=5000
speed_perturb=0
lcrm=1
tokenizer=0
use_specific_dict=0
specific_prefix=st
specific_dir=/home/xuchen/st/data/mustc/st/en-de
asr_vocab_prefix=spm_unigram10000_st_share
org_data_dir=~/st/data/${dataset}
data_dir=~/st/data/${dataset}/asr
train_split=train
valid_split=dev
test_split=test
test_subset=test
# exp
exp_prefix=$(date "+%m%d")
extra_tag=
extra_parameter=
exp_tag=baseline
exp_name=
# config
train_config=ctc
data_config=config_asr.yaml
# training setting
fp16=1
max_tokens=40000
step_valid=0
# decoding setting
dec_model=checkpoint_best.pt
n_average=10
beam_size=5
len_penalty=1.0
if [[ ${speed_perturb} -eq 1 ]]; then
data_dir=${data_dir}_sp
exp_prefix=${exp_prefix}_sp
fi
if [[ ${lcrm} -eq 1 ]]; then
data_dir=${data_dir}_lcrm
exp_prefix=${exp_prefix}_lcrm
fi
if [[ ${use_specific_dict} -eq 1 ]]; then
data_dir=${data_dir}_${specific_prefix}
exp_prefix=${exp_prefix}_${specific_prefix}
fi
if [[ ${tokenizer} -eq 1 ]]; then
data_dir=${data_dir}_tok
exp_prefix=${exp_prefix}_tok
fi
. ./local/parse_options.sh || exit 1;
if [[ -z ${exp_name} ]]; then
config_string=${train_config//,/_}
exp_name=${exp_prefix}_${config_string}_${exp_tag}
if [[ -n ${extra_tag} ]]; then
exp_name=${exp_name}_${extra_tag}
fi
fi
model_dir=$root_dir/../checkpoints/$dataset/asr/${exp_name}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download"
# pass
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
### Task dependent. You have to write the following data preparation part yourself.
### But you can utilize Kaldi recipes in most cases.
echo "stage 0: ASR Data Preparation"
if [[ ! -e ${data_dir} ]]; then
mkdir -p ${data_dir}
fi
cmd="python ${root_dir}/examples/speech_to_text/prep_asr_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}
--task asr
--splits ${train_split},${valid_split},${test_split}
--lang ${lang}
--vocab-type ${vocab_type}
--vocab-size ${vocab_size}"
if [[ ${use_specific_dict} -eq 1 ]]; then
cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}/${lang}
cmd="$cmd
--asr-prefix ${asr_vocab_prefix}"
fi
if [[ ${speed_perturb} -eq 1 ]]; then
cmd="$cmd
--speed-perturb"
fi
if [[ ${lcrm} -eq 1 ]]; then
cmd="$cmd
--lowercase-src
--rm-punc-src"
fi
if [[ ${tokenizer} -eq 1 ]]; then
cmd="$cmd
--tokenizer"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
fi
data_dir=${data_dir}/${lang}
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "stage 1: ASR Network Training"
[[ ! -d ${data_dir} ]] && echo "The data dir ${data_dir} is not existing!" && exit 1;
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then
device=""
else
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
fi
echo -e "dev=${device} data=${data_dir} model=${model_dir}"
if [[ ! -d ${model_dir} ]]; then
mkdir -p ${model_dir}
else
echo "${model_dir} exists."
fi
cp ${BASH_SOURCE[0]} ${model_dir}
cp ${PWD}/train.sh ${model_dir}
# Turn the comma-separated train_config into one --train-config* option per file.
# Every config is copied next to the checkpoints so the run can be reproduced later.
config_list="${train_config//,/ }"
idx=0
for config in ${config_list}
do
    config_path=$pwd_dir/conf/${config}.yaml
    if [[ ! -f ${config_path} ]]; then
        echo "No config file ${config_path}"
        # Exit non-zero: a bare "exit" would reuse the status of the echo above (0)
        # and report the missing config as success to the calling script.
        exit 1
    fi
    cp "${config_path}" "${model_dir}"

    # The first config uses the plain option name; later ones carry their index.
    if [[ ${idx} -eq 0 ]]; then
        config_flag="--train-config"
    else
        config_flag="--train-config${idx}"
    fi
    extra_parameter+=$'\n'"${config_flag} ${config_path}"
    idx=$((idx + 1))
done
cmd="python3 -u ${root_dir}/fairseq_cli/train.py
${data_dir}
--config-yaml ${data_config}
--task ${task}
--max-tokens ${max_tokens}
--skip-invalid-size-inputs-valid-test
--update-freq ${update_freq}
--log-interval 100
--save-dir ${model_dir}
--tensorboard-logdir ${model_dir}"
if [[ -n ${extra_parameter} ]]; then
cmd="${cmd}
${extra_parameter}"
fi
if [[ ${gpu_num} -gt 0 ]]; then
cmd="${cmd}
--distributed-world-size $gpu_num
--ddp-backend no_c10d"
fi
if [[ $fp16 -eq 1 ]]; then
cmd="${cmd}
--fp16"
fi
if [[ $step_valid -eq 1 ]]; then
validate_interval=1
save_interval=1
keep_last_epochs=10
no_epoch_checkpoints=0
save_interval_updates=500
keep_interval_updates=10
else
validate_interval=1
keep_last_epochs=10
fi
if [[ -n $no_epoch_checkpoints && $no_epoch_checkpoints -eq 1 ]]; then
cmd="$cmd
--no-epoch-checkpoints"
fi
if [[ -n $validate_interval ]]; then
cmd="${cmd}
--validate-interval $validate_interval "
fi
if [[ -n $save_interval ]]; then
cmd="${cmd}
--save-interval $save_interval "
fi
if [[ -n $keep_last_epochs ]]; then
cmd="${cmd}
--keep-last-epochs $keep_last_epochs "
fi
if [[ -n $save_interval_updates ]]; then
cmd="${cmd}
--save-interval-updates $save_interval_updates"
if [[ -n $keep_interval_updates ]]; then
cmd="${cmd}
--keep-interval-updates $keep_interval_updates"
fi
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
# save info
log=./history.log
echo "${time} | ${device} | ${data_dir} | ${exp_name} | ${model_dir} " >> $log
tail -n 50 ${log} > tmp.log
mv tmp.log $log
export CUDA_VISIBLE_DEVICES=${device}
cmd="nohup ${cmd} >> ${model_dir}/train.log 2>&1 &"
if [[ $eval -eq 1 ]]; then
eval $cmd
sleep 2s
tail -n "$(wc -l ${model_dir}/train.log | awk '{print $1+1}')" -f ${model_dir}/train.log
fi
fi
wait
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "stage 2: ASR Decoding"
if [[ ${n_average} -ne 1 ]]; then
# Average models
dec_model=avg_${n_average}_checkpoint.pt
cmd="python ${root_dir}/scripts/average_checkpoints.py
--inputs ${model_dir}
--num-epoch-checkpoints ${n_average}
--output ${model_dir}/${dec_model}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval $cmd
else
dec_model=${dec_model}
fi
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then
device=""
else
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
fi
export CUDA_VISIBLE_DEVICES=${device}
result_file=${model_dir}/decode_result
[[ -f ${result_file} ]] && rm ${result_file}
test_subset=${test_subset//,/ }
for subset in ${test_subset[@]}; do
subset=${subset}_asr
cmd="python ${root_dir}/fairseq_cli/generate.py
${data_dir}
--config-yaml ${data_config}
--gen-subset ${subset}
--task speech_to_text
--path ${model_dir}/${dec_model}
--results-path ${model_dir}
--max-tokens ${max_tokens}
--beam ${beam_size}
--lenpen ${len_penalty}
--scoring wer
--wer-tokenizer 13a
--wer-lowercase
--wer-remove-punct
"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
if [[ $eval -eq 1 ]]; then
eval $cmd
tail -n 1 ${model_dir}/generate-${subset}.txt >> ${result_file}
fi
done
cat ${result_file}
fi
#! /bin/bash
# training the model
gpu_num=8
update_freq=1
max_tokens=40000
extra_tag=
extra_parameter=
#extra_tag="${extra_tag}"
#extra_parameter="${extra_parameter} "
exp_tag=
#config_list=(base)
#config_list=(ctc)
#config_list=(base conformer)
#config_list=(pds_base_16)
config_list=(pds_base_16 conformer rpr)
# exp full name
exp_name=
train_config=$(echo ${config_list[*]} | sed 's/ /,/g')
cmd="./run.sh
--stage 1
--stop_stage 1
--gpu_num ${gpu_num}
--update_freq ${update_freq}
--train_config ${train_config}
--max_tokens ${max_tokens}
"
if [[ -n ${exp_name} ]]; then
cmd="$cmd --exp_name ${exp_name}"
fi
if [[ -n ${exp_tag} ]]; then
cmd="$cmd --exp_tag ${exp_tag}"
fi
if [[ -n ${extra_tag} ]]; then
cmd="$cmd --extra_tag ${extra_tag}"
fi
if [[ -n ${extra_parameter} ]]; then
cmd="$cmd --extra_parameter \"${extra_parameter}\""
fi
echo ${cmd}
eval ${cmd}
set -e

# Set to 0 to only print the commands instead of running them.
eval=1
# 1 = lowercase the source side and strip its punctuation (local/lower_rm.py) before
# sentencepiece encoding. The loop below tests this flag, so give it an explicit
# default; a value already present in the environment is kept.
lcrm=${lcrm:-0}

root_dir=~/st/Fairseq-S2T
data_dir=/home/xuchen/st/data/wmt/test
vocab_dir=/home/xuchen/st/data/wmt/mt/en-de/unigram32000_share
src_vocab_prefix=spm_unigram32000_share
tgt_vocab_prefix=spm_unigram32000_share

src_lang=en
tgt_lang=de
# 1 = run tokenizer.perl on both sides before sentencepiece encoding.
tokenize=1

splits=(newstest2014 newstest2016)
for split in ${splits[@]}; do
src_file=${data_dir}/${split}.${src_lang}
tgt_file=${data_dir}/${split}.${tgt_lang}
if [[ ${tokenize} -eq 1 ]]; then
cmd="tokenizer.perl -l ${src_lang} --threads 8 -no-escape < ${src_file} > ${src_file}.tok"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
cmd="tokenizer.perl -l ${tgt_lang} --threads 8 -no-escape < ${tgt_file} > ${tgt_file}.tok"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
src_file=${src_file}.tok
tgt_file=${tgt_file}.tok
fi
cmd="cat ${src_file}"
if [[ ${lcrm} -eq 1 ]]; then
cmd="python local/lower_rm.py ${src_file}"
fi
cmd="${cmd}
| spm_encode --model ${vocab_dir}/${src_vocab_prefix}.model
--output_format=piece
> ${src_file}.spm"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
cmd="spm_encode
--model ${vocab_dir}/${tgt_vocab_prefix}.model
--output_format=piece
< ${tgt_file}
> ${tgt_file}.spm"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
src_file=${src_file}.spm
tgt_file=${tgt_file}.spm
mkdir -p ${data_dir}/final
cmd="cp ${src_file} ${data_dir}/final/${split}.${src_lang}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
cmd="cp ${tgt_file} ${data_dir}/final/${split}.${tgt_lang}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
done
n_set=${#splits[*]}
for ((i=0;i<$n_set;i++)); do
dataset[$i]=${data_dir}/final/${splits[$i]}
done
pref=`echo ${dataset[*]} | sed 's/ /,/g'`
cmd="python ${root_dir}/fairseq_cli/preprocess.py
--source-lang ${src_lang}
--target-lang ${tgt_lang}
--testpref ${pref}
--destdir ${data_dir}/data-bin
--srcdict ${vocab_dir}/${src_vocab_prefix}.txt
--tgtdict ${vocab_dir}/${tgt_vocab_prefix}.txt
--workers 64"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
\ No newline at end of file
train-subset: train
valid-subset: valid
max-epoch: 50
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: transformer
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 8000
lr: 1e-3
adam_betas: (0.9,0.997)
criterion: label_smoothed_cross_entropy
label_smoothing: 0.1
dropout: 0.1
attention-dropout: 0.1
activation-dropout: 0.1
activation-fn: relu
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
train-subset: train
valid-subset: valid
max-epoch: 50
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: transformer
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 8000
lr: 1e-3
adam_betas: (0.9,0.997)
criterion: label_smoothed_cross_entropy
label_smoothing: 0.1
dropout: 0.1
attention-dropout: 0.1
activation-dropout: 0.1
activation-fn: relu
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
use-enc-dlcl: True
use-dec-dlcl: True
#encoder-attention-type: rel_selfattn
encoder-attention-type: relative
decoder-attention-type: relative
max-encoder-relative-length: 20
max-decoder-relative-length: 20
\ No newline at end of file
#! /bin/bash

# Decode with a trained model by running stage 2 of ./run.sh.
# Usage: <this script> [exp_name]
#   exp_name  optional experiment name; taken only when exactly one argument is given

gpu_num=1
data_dir=
test_subset=(test)

exp_name=
if [ "$#" -eq 1 ]; then
    exp_name=$1
fi

n_average=10
beam_size=5
len_penalty=1.0
max_tokens=80000
dec_model=checkpoint_best.pt

cmd="./run.sh
    --stage 2
    --stop_stage 2
    --gpu_num ${gpu_num}
    --n_average ${n_average}
    --beam_size ${beam_size}
    --len_penalty ${len_penalty}
    --max_tokens ${max_tokens}
    --dec_model ${dec_model}
    "

# Pass --exp_name only when a name was given. Options are consumed as "--name value"
# pairs, so an empty value would make the following flag the experiment name and
# stop option parsing at that flag's own value.
if [[ -n ${exp_name} ]]; then
    cmd="$cmd --exp_name ${exp_name}"
fi
if [[ -n ${data_dir} ]]; then
    cmd="$cmd --data_dir ${data_dir}"
fi
if [[ -n ${test_subset} ]]; then
    # run.sh expects the subsets as one comma-separated value.
    test_subset=`echo ${test_subset[*]} | sed 's/ /,/g'`
    cmd="$cmd --test_subset ${test_subset}"
fi

echo $cmd
eval $cmd
"""Lowercase a text file and strip ASCII punctuation from it.

Usage: python lower_rm.py <in_file>

One cleaned line is printed to stdout for every line of the input file.
"""
import string
import sys

# Translation table that deletes every ASCII punctuation character in one pass.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def lower_rm(line):
    """Return *line* lowercased, without punctuation, with whitespace runs collapsed."""
    text = line.strip().lower().translate(_PUNCT_TABLE)
    # Removing punctuation leaves doubled spaces ("a , b" -> "a  b"). Squeeze them
    # to a single space instead of deleting them, so words stay separated.
    return " ".join(text.split())


def main():
    """Clean the file named by the first command-line argument and print the result."""
    in_file = sys.argv[1]
    with open(in_file, "r", encoding="utf-8") as f:
        for line in f:
            print(lower_rm(line))


if __name__ == "__main__":
    main()
# Poll gpustat until ${gpu_num} idle GPUs are available, then launch the training script.
gpu_num=4
# Launch through bash, not sh: this project's train.sh scripts declare #!/bin/bash and
# use arrays and [[ ]], which a plain POSIX sh cannot parse.
cmd="bash train.sh"

while :
do
    # Take one gpustat snapshot per poll so all GPUs are judged on the same sample.
    record=$(mktemp -t temp.record.XXXXXX)
    gpustat > "${record}"
    all_devices=$(seq 0 "$(sed '1,2d' "${record}" | wc -l)")

    # Restart the selection on every poll so picks from an earlier pass never linger.
    device=()
    count=0
    for dev in ${all_devices}
    do
        line=$((dev + 2))
        # Third '|' column, text before '/': looks like the used-memory figure -- confirm.
        use=$(head -n "${line}" "${record}" | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
        if [[ $use -lt 100 ]]; then
            device[$count]=$dev
            count=$((count + 1))
            if [[ $count -eq $gpu_num ]]; then
                break
            fi
        fi
    done
    # The snapshot is no longer needed; remove it so polls do not pile up temp files.
    rm -f "${record}"

    if [[ ${#device[@]} -lt $gpu_num ]]; then
        sleep 60s
    else
        echo "Run $cmd"
        eval $cmd
        sleep 10s
        exit
    fi
done
#!/usr/bin/env bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey);
# Arnab Ghoshal, Karel Vesely
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Parse command-line options.
# To be sourced by another script (as in ". parse_options.sh").
# Option format is: --option-name arg
# and shell variable "option_name" gets set to value "arg."
# The exception is --help, which takes no arguments, but prints the
# $help_message variable (if defined).
###
### The --config file options have lower priority to command line
### options, so we need to import them first...
###
# Now import all the configs specified by command-line, in left-to-right order
for ((argpos=1; argpos<$#; argpos++)); do
if [ "${!argpos}" == "--config" ]; then
argpos_plus1=$((argpos+1))
config=${!argpos_plus1}
[ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
. $config # source the config file.
fi
done
###
### Now we process the command line options
###
while true; do
[ -z "${1:-}" ] && break; # break if there are no arguments
case "$1" in
# If the enclosing script is called with --help option, print the help
# message and exit. Scripts should put help messages in $help_message
--help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
else printf "$help_message\n" 1>&2 ; fi;
exit 0 ;;
--*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
exit 1 ;;
# If the first command-line argument begins with "--" (e.g. --foo-bar),
# then work out the variable name as $name, which will equal "foo_bar".
--*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
# Next we test whether the variable in question is undefined-- if so it's
# an invalid option and we die. Note: $0 evaluates to the name of the
# enclosing script.
# The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
# is undefined. We then have to wrap this test inside "eval" because
# foo_bar is itself inside a variable ($name).
eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
oldval="`eval echo \\$$name`";
# Work out whether we seem to be expecting a Boolean argument.
if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
was_bool=true;
else
was_bool=false;
fi
# Set the variable to the right value-- the escaped quotes make it work if
# the option had spaces, like --cmd "queue.pl -sync y"
eval $name=\"$2\";
# Check that Boolean-valued arguments are really Boolean.
if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
exit 1;
fi
shift 2;
;;
*) break;
esac
done
# Check for an empty argument to the --cmd option, which can easily occur as a
# result of scripting errors.
[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
true; # so this script returns exit code 0.
# Print a comma-separated list of idle GPU ids, polling gpustat until enough are free.
#
# Arguments:
#   $1  number of GPUs required
#   $2  when 1, give up at once and print "-1" if too few GPUs are idle;
#       otherwise sleep 60s and poll again
# Outputs:
#   the selected ids joined with "," on stdout (meant to be captured with $(...))
# Returns:
#   the exit status of the final echo | sed pipeline
get_devices(){
    local gpu_num=$1
    local use_cpu=$2
    local device=()
    local record all_devices dev line use count

    while :
    do
        # Take one gpustat snapshot per poll so all GPUs are judged on the same sample.
        record=$(mktemp -t temp.record.XXXXXX)
        gpustat > "${record}"
        # Candidate ids run from 0 to (snapshot lines - 2); presumably line 1 is the
        # gpustat header and every following line describes one GPU -- TODO confirm.
        all_devices=$(seq 0 "$(sed '1,2d' "${record}" | wc -l)")

        # Restart the selection on every poll so picks from an earlier pass never linger.
        device=()
        count=0
        for dev in ${all_devices}
        do
            line=$((dev + 2))
            # Third '|' column, text before '/': looks like the used-memory figure -- confirm.
            use=$(head -n "${line}" "${record}" | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
            if [[ $use -lt 100 ]]; then
                device[$count]=$dev
                count=$((count + 1))
                if [[ $count -eq $gpu_num ]]; then
                    break
                fi
            fi
        done
        # The snapshot is no longer needed; remove it so polls do not pile up temp files.
        rm -f "${record}"

        if [[ ${#device[@]} -lt $gpu_num ]]; then
            if [[ $use_cpu -eq 1 ]]; then
                # CPU fallback: report -1 and stop polling, otherwise the loop never ends.
                device=(-1)
                break
            else
                sleep 60s
            fi
        else
            break
        fi
    done

    echo ${device[*]} | sed 's/ /,/g'
    return $?
}
#! /bin/bash
# Processing MuST-C Datasets
# Copyright 2021 Natural Language Processing Laboratory
# Xu Chen (xuchenneu@163.com)
# Set bash to 'debug' mode, it will exit on :
# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
set -e
#set -u
set -o pipefail
export PYTHONIOENCODING=UTF-8
eval=1
time=$(date "+%m%d_%H%M")
stage=0
stop_stage=0
######## hardware ########
# devices
device=()
gpu_num=8
update_freq=1
root_dir=~/st/Fairseq-S2T
pwd_dir=$PWD
# dataset
src_lang=en
tgt_lang=de
lang=${src_lang}-${tgt_lang}
dataset=mt
task=translation
vocab_type=unigram
vocab_size=10000
share_dict=1
lcrm=0
tokenizer=0
use_specific_dict=0
specific_prefix=st
specific_dir=/home/xuchen/st/data/mustc/st/en-de/
src_vocab_prefix=spm_unigram10000_st_share
tgt_vocab_prefix=spm_unigram10000_st_share
org_data_dir=~/st/data/${dataset}
data_dir=~/st/data/${dataset}/mt/${lang}
train_subset=train
valid_subset=dev
trans_subset=tst-COMMON
test_subset=test
# exp
exp_prefix=${time}
extra_tag=
extra_parameter=
exp_tag=baseline
exp_name=
# config
train_config=base_s
# training setting
fp16=1
max_tokens=4096
step_valid=0
bleu_valid=0
# decoding setting
dec_model=checkpoint_best.pt
n_average=10
beam_size=5
len_penalty=1.0
if [[ ${use_specific_dict} -eq 1 ]]; then
exp_prefix=${specific_prefix}_${exp_prefix}
data_dir=${data_dir}/${specific_prefix}
mkdir -p ${data_dir}
else
data_dir=${data_dir}/${vocab_type}${vocab_size}
src_vocab_prefix=spm_${vocab_type}${vocab_size}_${src_lang}
tgt_vocab_prefix=spm_${vocab_type}${vocab_size}_${tgt_lang}
if [[ $share_dict -eq 1 ]]; then
data_dir=${data_dir}_share
src_vocab_prefix=spm_${vocab_type}${vocab_size}_share
tgt_vocab_prefix=spm_${vocab_type}${vocab_size}_share
fi
fi
if [[ ${lcrm} -eq 1 ]]; then
data_dir=${data_dir}_lcrm
exp_prefix=${exp_prefix}_lcrm
fi
if [[ ${tokenizer} -eq 1 ]]; then
train_subset=${train_subset}.tok
valid_subset=${valid_subset}.tok
trans_subset=${trans_subset}.tok
data_dir=${data_dir}_tok
exp_prefix=${exp_prefix}_tok
fi
. ./local/parse_options.sh || exit 1;
# full path
if [[ -z ${exp_name} ]]; then
config_string=${train_config//,/_}
exp_name=${exp_prefix}_${config_string}_${exp_tag}
if [[ -n ${extra_tag} ]]; then
exp_name=${exp_name}_${extra_tag}
fi
fi
model_dir=$root_dir/../checkpoints/$dataset/mt/${exp_name}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download"
# pass
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
### Task dependent. You have to write the following data preparation part yourself.
echo "stage 0: MT Data Preparation"
if [[ ! -e ${data_dir} ]]; then
mkdir -p ${data_dir}
fi
if [[ ! -f ${data_dir}/${src_vocab_prefix}.txt || ! -f ${data_dir}/${tgt_vocab_prefix}.txt ]]; then
if [[ ${use_specific_dict} -eq 0 ]]; then
cmd="python ${root_dir}/examples/speech_to_text/prep_mt_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}
--splits ${train_subset},${valid_subset},${trans_subset}
--src-lang ${src_lang}
--tgt-lang ${tgt_lang}
--vocab-type ${vocab_type}
--vocab-size ${vocab_size}"
if [[ $share_dict -eq 1 ]]; then
cmd="$cmd
--share"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
else
cp -r ${specific_dir}/${src_vocab_prefix}.* ${data_dir}
cp ${specific_dir}/${tgt_vocab_prefix}.* ${data_dir}
fi
fi
mkdir -p ${data_dir}/data
for split in ${train_subset} ${valid_subset} ${trans_subset}; do
{
cmd="cat ${org_data_dir}/${lang}/data/${split}/txt/${split}.${src_lang}"
if [[ ${lcrm} -eq 1 ]]; then
cmd="python local/lower_rm.py ${org_data_dir}/${lang}/data/${split}.${src_lang}"
fi
cmd="${cmd}
| spm_encode --model ${data_dir}/${src_vocab_prefix}.model
--output_format=piece
> ${data_dir}/data/${split}.${src_lang}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
cmd="spm_encode
--model ${data_dir}/${tgt_vocab_prefix}.model
--output_format=piece
< ${org_data_dir}/${lang}/data/${split}.${tgt_lang}
> ${data_dir}/data/${split}.${tgt_lang}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
}&
done
wait
cmd="python ${root_dir}/fairseq_cli/preprocess.py
--source-lang ${src_lang} --target-lang ${tgt_lang}
--trainpref ${data_dir}/data/${train_subset}
--validpref ${data_dir}/data/${valid_subset}
--testpref ${data_dir}/data/${trans_subset}
--destdir ${data_dir}/data-bin
--srcdict ${data_dir}/${src_vocab_prefix}.txt
--tgtdict ${data_dir}/${tgt_vocab_prefix}.txt
--workers 64"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
fi
data_dir=${data_dir}/data-bin
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "stage 1: MT Network Training"
[[ ! -d ${data_dir} ]] && echo "The data dir ${data_dir} is not existing!" && exit 1;
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then
device=""
else
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
fi
echo -e "dev=${device} data=${data_dir} model=${model_dir}"
if [[ ! -d ${model_dir} ]]; then
mkdir -p ${model_dir}
else
echo "${model_dir} exists."
fi
cp ${BASH_SOURCE[0]} ${model_dir}
cp ${PWD}/train.sh ${model_dir}
# Turn the comma-separated train_config into one --train-config* option per file.
# Every config is copied next to the checkpoints so the run can be reproduced later.
config_list="${train_config//,/ }"
idx=0
for config in ${config_list}
do
    config_path=$pwd_dir/conf/${config}.yaml
    if [[ ! -f ${config_path} ]]; then
        echo "No config file ${config_path}"
        # Exit non-zero: a bare "exit" would reuse the status of the echo above (0)
        # and report the missing config as success to the calling script.
        exit 1
    fi
    cp "${config_path}" "${model_dir}"

    # The first config uses the plain option name; later ones carry their index.
    if [[ ${idx} -eq 0 ]]; then
        config_flag="--train-config"
    else
        config_flag="--train-config${idx}"
    fi
    extra_parameter+=$'\n'"${config_flag} ${config_path}"
    idx=$((idx + 1))
done
cmd="python3 -u ${root_dir}/fairseq_cli/train.py
${data_dir}
--source-lang ${src_lang}
--target-lang ${tgt_lang}
--task ${task}
--max-tokens ${max_tokens}
--skip-invalid-size-inputs-valid-test
--update-freq ${update_freq}
--log-interval 100
--save-dir ${model_dir}
--tensorboard-logdir ${model_dir}"
if [[ -n ${extra_parameter} ]]; then
cmd="${cmd}
${extra_parameter}"
fi
if [[ ${gpu_num} -gt 0 ]]; then
cmd="${cmd}
--distributed-world-size $gpu_num
--ddp-backend no_c10d"
fi
if [[ $fp16 -eq 1 ]]; then
cmd="${cmd}
--fp16"
fi
if [[ $step_valid -eq 1 ]]; then
validate_interval=1
save_interval=1
keep_last_epochs=10
no_epoch_checkpoints=0
save_interval_updates=500
keep_interval_updates=10
else
validate_interval=1
keep_last_epochs=10
fi
if [[ $bleu_valid -eq 1 ]]; then
cmd="$cmd
--eval-bleu
--eval-bleu-args '{\"beam\": 1}'
--eval-tokenized-bleu
--eval-bleu-remove-bpe
--best-checkpoint-metric bleu
--maximize-best-checkpoint-metric"
fi
if [[ -n $no_epoch_checkpoints && $no_epoch_checkpoints -eq 1 ]]; then
cmd="$cmd
--no-epoch-checkpoints"
fi
if [[ -n $validate_interval ]]; then
cmd="${cmd}
--validate-interval $validate_interval "
fi
if [[ -n $save_interval ]]; then
cmd="${cmd}
--save-interval $save_interval "
fi
if [[ -n $keep_last_epochs ]]; then
cmd="${cmd}
--keep-last-epochs $keep_last_epochs "
fi
if [[ -n $save_interval_updates ]]; then
cmd="${cmd}
--save-interval-updates $save_interval_updates"
if [[ -n $keep_interval_updates ]]; then
cmd="${cmd}
--keep-interval-updates $keep_interval_updates"
fi
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
# save info
log=./history.log
echo "${time} | ${device} | ${data_dir} | ${exp_name} | ${model_dir} " >> $log
tail -n 50 ${log} > tmp.log
mv tmp.log $log
export CUDA_VISIBLE_DEVICES=${device}
cmd="nohup ${cmd} >> ${model_dir}/train.log 2>&1 &"
if [[ $eval -eq 1 ]]; then
eval $cmd
sleep 2s
tail -n "$(wc -l ${model_dir}/train.log | awk '{print $1+1}')" -f ${model_dir}/train.log
fi
fi
wait
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "stage 2: MT Decoding"
if [[ ${n_average} -ne 1 ]]; then
# Average models
dec_model=avg_${n_average}_checkpoint.pt
cmd="python ${root_dir}/scripts/average_checkpoints.py
--inputs ${model_dir}
--num-epoch-checkpoints ${n_average}
--output ${model_dir}/${dec_model}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval $cmd
else
dec_model=${dec_model}
fi
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then
device=""
else
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
fi
export CUDA_VISIBLE_DEVICES=${device}
result_file=${model_dir}/decode_result
[[ -f ${result_file} ]] && rm ${result_file}
test_subset=(${test_subset//,/ })
for subset in ${test_subset[@]}; do
cmd="python ${root_dir}/fairseq_cli/generate.py
${data_dir}
--source-lang ${src_lang}
--target-lang ${tgt_lang}
--gen-subset ${subset}
--task ${task}
--path ${model_dir}/${dec_model}
--results-path ${model_dir}
--max-tokens ${max_tokens}
--beam ${beam_size}
--lenpen ${len_penalty}
--post-process sentencepiece
--scoring sacrebleu"
if [[ ${tokenizer} -eq 1 ]]; then
cmd="${cmd}
--tokenizer moses
--moses-source-lang ${src_lang}
--moses-target-lang ${tgt_lang}"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
if [[ $eval -eq 1 ]]; then
eval $cmd
tail -n 1 ${model_dir}/generate-${subset}.txt >> ${result_file}
fi
done
cat ${result_file}
fi
#! /bin/bash
# training the model
gpu_num=1
update_freq=1
max_tokens=8192
exp_tag=baseline
config_list=(base)
# exp full name
exp_name=
extra_tag=
extra_parameter=
#extra_tag="${extra_tag}"
#extra_parameter="${extra_parameter} "
train_config=$(echo ${config_list[*]} | sed 's/ /,/g')
cmd="./run.sh
--stage 1
--stop_stage 1
--gpu_num ${gpu_num}
--update_freq ${update_freq}
--train_config ${train_config}
--max_tokens ${max_tokens}
"
if [[ -n ${exp_name} ]]; then
cmd="$cmd --exp_name ${exp_name}"
fi
if [[ -n ${exp_tag} ]]; then
cmd="$cmd --exp_tag ${exp_tag}"
fi
if [[ -n ${extra_tag} ]]; then
cmd="$cmd --extra_tag ${extra_tag}"
fi
if [[ -n ${extra_parameter} ]]; then
cmd="$cmd --extra_parameter \"${extra_parameter}\""
fi
echo ${cmd}
eval ${cmd}
set -e
eval=1
lcrm=1
tokenizer=0
root_dir=~/st/Fairseq-S2T
data_dir=/home/xuchen/st/data/test
vocab_dir=/home/xuchen/st/data/mustc/st_lcrm/en-de
asr_vocab_prefix=spm_unigram10000_st_share
st_vocab_prefix=spm_unigram10000_st_share
src_lang=en
tgt_lang=de
splits=(2019)
splits=$(echo ${splits[*]} | sed 's/ /_/g')
cp -r ${vocab_dir}/${asr_vocab_prefix}.* ${data_dir}/${src_lang}-${tgt_lang}
cp -r ${vocab_dir}/${st_vocab_prefix}.* ${data_dir}/${src_lang}-${tgt_lang}
rm -rf ${data_dir}/${src_lang}-${tgt_lang}/fbank80.zip
cmd="python ${root_dir}/examples/speech_to_text/prep_st_data.py
--data-root ${data_dir}
--output-root ${data_dir}
--splits ${splits}
--task st
--src-lang ${src_lang}
--tgt-lang ${tgt_lang}
--add-src
--share
--asr-prefix ${asr_vocab_prefix}
--st-spm-prefix ${st_vocab_prefix}
--cmvn-type utterance"
if [[ ${lcrm} -eq 1 ]]; then
cmd="$cmd
--lowercase-src
--rm-punc-src"
fi
if [[ ${tokenizer} -eq 1 ]]; then
cmd="$cmd
--tokenizer"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
train-subset: train_st
valid-subset: dev_st
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_transformer_s
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
#adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
conv-kernel-sizes: 5,5
conv-channels: 1024
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
attention-dropout: 0.1
activation-dropout: 0.1
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 31
ctc-weight: 0.3
\ No newline at end of file
use-enc-dlcl: True
use-dec-dlcl: True
encoder-attention-type: local
hard-mask-window: 0
gauss-mask-sigma: 3
init-mask-weight: 0
\ No newline at end of file
arch: pdss2t_transformer_s_16
encoder-embed-dim: 256
pyramid-stages: 4
#pyramid-dropout: 0
pyramid-layers: 2_2_6_2
pyramid-ratios: 2_2_2_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4
train-subset: train_st
valid-subset: dev_st
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
#adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
arch: pdss2t_transformer_s_32
encoder-embed-dim: 256
pyramid-stages: 5
#pyramid-dropout: 0
pyramid-layers: 2_2_3_3_2
pyramid-ratios: 2_2_2_2_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1_1
pyramid-kernel-sizes: 5_5_5_5_5
pyramid-ffn-ratios: 8_8_8_8_8
pyramid-attn-heads: 4_4_4_4_4
train-subset: train_st
valid-subset: dev_st
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
#adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
arch: pdss2t_transformer_s_8
encoder-embed-dim: 256
pyramid-stages: 4
#pyramid-dropout: 0
pyramid-layers: 3_3_3_3
pyramid-ratios: 2_2_1_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4
train-subset: train_st
valid-subset: dev_st
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
#adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
# Attention-type override: only the encoder setting below is active;
# the remaining alternatives are kept commented out.
encoder-attention-type: rel_selfattn
#encoder-attention-type: relative
#decoder-attention-type: relative
#max-encoder-relative-length: 100
#max-decoder-relative-length: 20
# Training configuration for the s2t_sate architecture.
# --- Data subsets and training schedule ---
train-subset: train_st
valid-subset: dev_st
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
# Optional pretrained initialisation (all disabled).
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
# --- Model architecture ---
arch: s2t_sate
share-decoder-input-output-embed: True
# --- Optimisation ---
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
#adam_betas: (0.9,0.98)
# --- Loss: label-smoothed cross entropy combined with CTC ---
ctc-weight: 0.3
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
# --- Transformer settings ---
encoder-normalize-before: True
decoder-normalize-before: True
conv-kernel-sizes: 5,5
conv-channels: 1024
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
# Optional encoder extensions (disabled).
#macaron-style: True
#use-cnn-module: True
#cnn-module-kernel: 31
# --- Acoustic encoder selection ---
# "transformer" is active; "pds" is the commented-out alternative.
# The pyramid-* keys below are presumably only read when the pds encoder is selected (verify).
#acoustic-encoder: pds
acoustic-encoder: transformer
adapter: league
# NOTE(review): encoder-embed-dim is already set above with the same value;
# this is a duplicate key within the same mapping - confirm whether it is intentional.
encoder-embed-dim: 256
pyramid-stages: 4
#pyramid-dropout: 0
pyramid-layers: 3_3_3_3
pyramid-ratios: 2_2_1_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4
# --- Decoder dimensions ---
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#! /bin/bash
# Decode a trained model by running only stage 2 of ./run.sh.
#
# Usage: <this script> [exp_name]
#   exp_name  optional experiment name; when omitted, run.sh derives its own.

gpu_num=1
data_dir=
test_subset=(tst-COMMON)

exp_name=
if [ "$#" -eq 1 ]; then
    exp_name=$1
fi

n_average=10
beam_size=5
len_penalty=1.0
max_tokens=80000
dec_model=checkpoint_best.pt

cmd="./run.sh
--stage 2
--stop_stage 2
--gpu_num ${gpu_num}
--n_average ${n_average}
--beam_size ${beam_size}
--len_penalty ${len_penalty}
--max_tokens ${max_tokens}
--dec_model ${dec_model}
"

# Options are parsed as "--name value" pairs, so an option with an empty
# value would swallow the next flag as its value. Only pass the optional
# settings when they are actually set.
if [[ -n ${exp_name} ]]; then
    cmd="$cmd --exp_name ${exp_name}"
fi
if [[ -n ${data_dir} ]]; then
    cmd="$cmd --data_dir ${data_dir}"
fi
# Forward the subsets as a comma-separated list when at least one is
# configured (the previous "-eq 0" test only fired for an empty list).
if [[ ${#test_subset[@]} -ne 0 ]]; then
    subsets=$(echo ${test_subset[*]} | sed 's/ /,/g')
    cmd="$cmd --test_subset ${subsets}"
fi

echo $cmd
eval $cmd
# Decode test subsets with an ensemble of ST checkpoints.
# Positional arguments:
#   $1  text file with whitespace-separated model directory names (relative to $ckpt)
#   $2  test-set name; selects /media/data/tst/<set>/en-de and the results sub-directory
#   $3  subset name(s) to decode; "_st" is appended to each before decoding
set -e
gpu_num=1
# NOTE(review): the paths below are absolute, machine-specific locations.
root_dir=/home/xuchen/st/Fairseq-S2T
ckpt=/home/xuchen/st/checkpoints/mustc-v2/st
model_txt=$1
set=$2
test_subset=$3
#data_dir=/home/xuchen/st/data/mustc-v2/st_lcrm/en-de
#test_subset=(tst-COMMON)
data_dir=/media/data/tst/$set/en-de
#test_subset=(office)
#test_subset=(webrtc1)
#test_subset=(adap2)
data_config=config_st_share.yaml
# NOTE(review): result_file is assigned but not used anywhere in this script.
result_file=./result
beam_size=5
lenpen=0.6
max_tokens=10000
# Resolve one checkpoint per listed model directory: prefer the 10-checkpoint
# average when it exists, otherwise fall back to checkpoint_best.pt.
# A missing directory or checkpoint prints the offending path and aborts.
models=()
i=0
for line in `cat $model_txt`; do
i=`expr $i + 1`
model_dir=$ckpt/$line
[[ ! -d $model_dir ]] && echo $model_dir && exit 1;
if [[ -f $model_dir/avg_10_checkpoint.pt ]]; then
model=$model_dir/avg_10_checkpoint.pt
else
model=$model_dir/checkpoint_best.pt
fi
[[ ! -f $model ]] && echo $model && exit 1;
models[$i]=$model
done
# Join the checkpoint paths with ':' for the --path argument below.
models=`echo ${models[*]} | sed 's/ /:/g'`
# Pick the first unused numeric sub-directory of $ckpt/ensemble/$set as the
# results directory, so earlier runs are not overwritten.
res_dir=$ckpt/ensemble/$set
i=0
while :
do
if [[ -d $res_dir/$i ]]; then
i=`expr $i + 1`
else
res_dir=$res_dir/$i
break
fi
done
mkdir -p $res_dir
# Keep a copy of the model list next to the results.
cp $model_txt $res_dir
# Select devices unless $device was already provided by the environment.
# With gpu_num=0 the device list stays empty, so CUDA_VISIBLE_DEVICES is exported empty.
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then
device=()
else
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
fi
export CUDA_VISIBLE_DEVICES=${device}
for subset in ${test_subset[@]}; do
subset=${subset}_st
cmd="python ${root_dir}/fairseq_cli/generate.py
${data_dir}
--config-yaml ${data_config}
--gen-subset ${subset}
--task speech_to_text
--path ${models}
--results-path ${res_dir}
--skip-invalid-size-inputs-valid-test
--max-tokens ${max_tokens}
--beam ${beam_size}
--lenpen ${lenpen}
--scoring sacrebleu"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
eval $cmd
# Show the last line of the generation log for this subset.
tail -n 1 ${res_dir}/generate-${subset}.txt
# evaluate.sh is invoked without a path, so it must be resolvable via PATH;
# it is run from inside the results directory and is not defined in this script.
cd $res_dir
evaluate.sh translation-${subset}.txt $set
cd -
done
# Poll gpustat until at least $gpu_num GPUs look idle, then run $cmd once.
gpu_num=4
cmd="sh train.sh"

while :
do
    # Snapshot the current GPU status into a temporary file.
    record=$(mktemp -t temp.record.XXXXXX)
    gpustat > $record
    # Candidate indices run from 0 to (number of snapshot lines - 2).
    all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");

    # Start every poll from an empty list so indices from an earlier poll
    # cannot linger in the array.
    device=()
    count=0
    for dev in ${all_devices[@]}
    do
        # Device N is described on snapshot line N + 2.
        line=$((dev + 2))
        # Third '|'-separated field, text before '/': presumably the used
        # memory in MB as printed by gpustat (verify against its output).
        use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)

        # A device counts as idle when that value is below 100.
        if [[ $use -lt 100 ]]; then
            device[$count]=$dev
            count=$((count + 1))
            if [[ $count -eq $gpu_num ]]; then
                break
            fi
        fi
    done
    # Remove the snapshot; otherwise one temp file is leaked per poll.
    rm -f ${record}

    if [[ ${#device[@]} -lt $gpu_num ]]; then
        sleep 60s
    else
        echo "Run $cmd"
        eval $cmd
        sleep 10s
        exit
    fi
done
#!/usr/bin/env bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey);
# Arnab Ghoshal, Karel Vesely
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Parse command-line options.
# To be sourced by another script (as in ". parse_options.sh").
# Option format is: --option-name arg
# and shell variable "option_name" gets set to value "arg."
# The exception is --help, which takes no arguments, but prints the
# $help_message variable (if defined).
###
### The --config file options have lower priority to command line
### options, so we need to import them first...
###
# Now import all the configs specified by command-line, in left-to-right order
# (the loop stops before the last argument because --config needs a value after it).
for ((argpos=1; argpos<$#; argpos++)); do
if [ "${!argpos}" == "--config" ]; then
argpos_plus1=$((argpos+1))
config=${!argpos_plus1}
[ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
. $config # source the config file.
fi
done
###
### Now we process the command line options
###
while true; do
[ -z "${1:-}" ] && break; # break if there are no arguments
case "$1" in
# If the enclosing script is called with --help option, print the help
# message and exit. Scripts should put help messages in $help_message
--help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
else printf "$help_message\n" 1>&2 ; fi;
exit 0 ;;
# The "--name=value" form is rejected; only "--name value" is accepted.
--*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
exit 1 ;;
# If the first command-line argument begins with "--" (e.g. --foo-bar),
# then work out the variable name as $name, which will equal "foo_bar".
--*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
# Next we test whether the variable in question is undefined-- if so it's
# an invalid option and we die. Note: $0 evaluates to the name of the
# enclosing script.
# The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
# is undefined. We then have to wrap this test inside "eval" because
# foo_bar is itself inside a variable ($name).
eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
oldval="`eval echo \\$$name`";
# Work out whether we seem to be expecting a Boolean argument.
if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
was_bool=true;
else
was_bool=false;
fi
# Set the variable to the right value-- the escaped quotes make it work if
# the option had spaces, like --cmd "queue.pl -sync y"
eval $name=\"$2\";
# Check that Boolean-valued arguments are really Boolean.
if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
exit 1;
fi
# Consume both the option name and its value.
shift 2;
;;
# The first argument that does not start with "--" ends option parsing.
*) break;
esac
done
# Check for an empty argument to the --cmd option, which can easily occur as a
# result of scripting errors.
[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
true; # so this script returns exit code 0.
# Print a comma-separated list of GPU indices that currently look idle.
#
# Arguments:
#   $1  gpu_num  number of devices required
#   $2  use_cpu  when 1, print "-1" immediately if too few devices are idle
#                instead of waiting for them
#
# Output: the selected indices joined by ',' on stdout (or "-1" for the
# CPU fallback). Without the fallback the function polls every 60 seconds
# until enough devices are idle.
# Note: gpu_num, use_cpu and device are plain (global) shell variables.
get_devices(){
    gpu_num=$1
    use_cpu=$2
    device=()
    while :
    do
        # Snapshot the current GPU status into a temporary file.
        record=$(mktemp -t temp.record.XXXXXX)
        gpustat > $record
        # Candidate indices run from 0 to (number of snapshot lines - 2).
        all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");

        # Start every poll from an empty list so indices from an earlier
        # poll cannot linger in the result.
        device=()
        count=0
        for dev in ${all_devices[@]}
        do
            # Device N is described on snapshot line N + 2.
            line=$((dev + 2))
            # Third '|'-separated field, text before '/': presumably the
            # used memory in MB as printed by gpustat (verify).
            use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)

            # A device counts as idle when that value is below 100.
            if [[ $use -lt 100 ]]; then
                device[$count]=$dev
                count=$((count + 1))
                if [[ $count -eq $gpu_num ]]; then
                    break
                fi
            fi
        done
        # Remove the snapshot; otherwise one temp file is leaked per poll.
        rm -f ${record}

        if [[ ${#device[@]} -lt $gpu_num ]]; then
            if [[ $use_cpu -eq 1 ]]; then
                # CPU fallback: stop polling right away. Without this break
                # the loop spun forever (with no sleep) whenever more than
                # one device was requested.
                device=(-1)
                break
            else
                sleep 60s
            fi
        else
            break
        fi
    done

    echo ${device[*]} | sed 's/ /,/g'
    return $?
}
#! /bin/bash
# Processing MuST-C Datasets
# Copyright 2021 Natural Language Processing Laboratory
# Xu Chen (xuchenneu@163.com)
# Set bash to 'debug' mode, it will exit on :
# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
set -e
#set -u
set -o pipefail
export PYTHONIOENCODING=UTF-8
# eval=1 executes the assembled commands; any other value only prints them.
eval=1
time=$(date "+%m%d_%H%M")
# Every stage N with stage <= N <= stop_stage is executed:
# -1 data download (placeholder), 0 data preparation, 1 training, 2 decoding.
stage=0
stop_stage=0
######## hardware ########
# devices
#device=()
gpu_num=8
update_freq=1
root_dir=~/st/Fairseq-S2T
pwd_dir=$PWD
# dataset
src_lang=en
tgt_lang=de
lang=${src_lang}-${tgt_lang}
dataset=st
task=speech_to_text
vocab_type=unigram
asr_vocab_size=5000
vocab_size=10000
# Data-preparation switches (1 = enabled).
share_dict=1
speed_perturb=0
lcrm=0
tokenizer=0
# Settings for reusing existing vocabulary files (see stage 0).
use_specific_dict=0
specific_prefix=valid
specific_dir=/home/xuchen/st/data/mustc/st_lcrm/en-de
asr_vocab_prefix=spm_unigram10000_st_share
st_vocab_prefix=spm_unigram10000_st_share
org_data_dir=~/st/data/${dataset}
data_dir=~/st/data/${dataset}/st
test_subset=tst-COMMON
# exp
exp_prefix=$(date "+%m%d")
extra_tag=
extra_parameter=
exp_tag=baseline
exp_name=
# config
# Comma-separated list of config names; each maps to conf/<name>.yaml in stage 1.
train_config=ctc
# training setting
fp16=1
max_tokens=40000
step_valid=0
bleu_valid=0
# decoding setting
dec_model=checkpoint_best.pt
n_average=10
beam_size=5
len_penalty=1.0
# NOTE(review): data_config and the data_dir/exp_prefix suffixes below are
# derived BEFORE parse_options.sh runs, so overriding share_dict,
# speed_perturb, lcrm, use_specific_dict or tokenizer on the command line
# does not change them - confirm this ordering is intended.
if [[ ${share_dict} -eq 1 ]]; then
data_config=config_st_share.yaml
else
data_config=config_st.yaml
fi
# Each enabled switch appends its own suffix to both data_dir and exp_prefix.
if [[ ${speed_perturb} -eq 1 ]]; then
data_dir=${data_dir}_sp
exp_prefix=${exp_prefix}_sp
fi
if [[ ${lcrm} -eq 1 ]]; then
data_dir=${data_dir}_lcrm
exp_prefix=${exp_prefix}_lcrm
fi
if [[ ${use_specific_dict} -eq 1 ]]; then
data_dir=${data_dir}_${specific_prefix}
exp_prefix=${exp_prefix}_${specific_prefix}
fi
if [[ ${tokenizer} -eq 1 ]]; then
data_dir=${data_dir}_tok
exp_prefix=${exp_prefix}_tok
fi
# Apply "--name value" command-line overrides to the variables defined above.
. ./local/parse_options.sh || exit 1;
# Default experiment name: <exp_prefix>_<configs joined by '_'>_<exp_tag>[_<extra_tag>].
if [[ -z ${exp_name} ]]; then
config_string=${train_config//,/_}
exp_name=${exp_prefix}_${config_string}_${exp_tag}
if [[ -n ${extra_tag} ]]; then
exp_name=${exp_name}_${extra_tag}
fi
fi
model_dir=$root_dir/../checkpoints/$dataset/st/${exp_name}
# Stage -1 is a placeholder: it only prints a message.
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download"
# pass
fi
# Stage 0: build the ASR and ST data preparation commands and run them.
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
### Task dependent. You have to make data the following preparation part by yourself.
### But you can utilize Kaldi recipes in most cases
echo "stage 0: ASR Data Preparation"
if [[ ! -e ${data_dir}/${lang} ]]; then
mkdir -p ${data_dir}/${lang}
fi
cmd="python ${root_dir}/examples/speech_to_text/prep_asr_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}
--task asr
--vocab-type ${vocab_type}
--vocab-size ${asr_vocab_size}"
if [[ ${speed_perturb} -eq 1 ]]; then
cmd="$cmd
--speed-perturb"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
# The ASR preparation is printed always but executed only when neither a
# shared dictionary nor a pre-existing (specific) dictionary is used.
[[ $eval -eq 1 && ${share_dict} -ne 1 && ${use_specific_dict} -ne 1 ]] && eval $cmd
asr_prefix=spm_${vocab_type}${asr_vocab_size}_asr
echo "stage 0: ST Data Preparation"
cmd="python ${root_dir}/examples/speech_to_text/prep_st_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}
--task st
--add-src
--cmvn-type utterance
--vocab-type ${vocab_type}
--vocab-size ${vocab_size}"
# Vocabulary options: either copy existing vocabulary files from
# specific_dir and point the script at them, or use the defaults.
if [[ ${use_specific_dict} -eq 1 ]]; then
cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}/${lang}
cp -r ${specific_dir}/${st_vocab_prefix}.* ${data_dir}/${lang}
if [[ $share_dict -eq 1 ]]; then
cmd="$cmd
--share
--st-spm-prefix ${st_vocab_prefix}"
else
cmd="$cmd
--st-spm-prefix ${st_vocab_prefix}
--asr-prefix ${asr_vocab_prefix}"
fi
else
if [[ $share_dict -eq 1 ]]; then
cmd="$cmd
--share"
else
cmd="$cmd
--asr-prefix ${asr_prefix}"
fi
fi
# Optional preprocessing flags controlled by the switches set earlier in the script.
if [[ ${speed_perturb} -eq 1 ]]; then
cmd="$cmd
--speed-perturb"
fi
if [[ ${lcrm} -eq 1 ]]; then
cmd="$cmd
--lowercase-src
--rm-punc-src"
fi
if [[ ${tokenizer} -eq 1 ]]; then
cmd="$cmd
--tokenizer"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
fi
# From here on data_dir points at the language-pair sub-directory.
data_dir=${data_dir}/${lang}
# Stage 1: assemble the fairseq training command, launch it in the
# background with nohup and follow its log.
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    echo "stage 1: ST Network Training"
    [[ ! -d ${data_dir} ]] && echo "The data dir ${data_dir} is not existing!" && exit 1;

    # Select devices unless $device was already provided.
    if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
        if [[ ${gpu_num} -eq 0 ]]; then
            device=""
        else
            source ./local/utils.sh
            device=$(get_devices $gpu_num 0)
        fi
    fi

    echo -e "dev=${device} data=${data_dir} model=${model_dir}"

    if [[ ! -d ${model_dir} ]]; then
        mkdir -p ${model_dir}
    else
        echo "${model_dir} exists."
    fi

    # Keep a copy of this script and of train.sh next to the checkpoints.
    cp ${BASH_SOURCE[0]} ${model_dir}
    cp ${PWD}/train.sh ${model_dir}

    # Each config name maps to conf/<name>.yaml. The first one is passed as
    # --train-config, later ones as --train-config1, --train-config2, ...
    config_list="${train_config//,/ }"
    idx=0
    for config in ${config_list[@]}
    do
        config_path=$pwd_dir/conf/${config}.yaml
        if [[ ! -f ${config_path} ]]; then
            echo "No config file ${config_path}"
            # A missing config is an error: exit non-zero so callers can
            # detect it (a bare "exit" returned the status of the echo, 0).
            exit 1
        fi
        cp ${config_path} ${model_dir}

        if [[ ${idx} -eq 0 ]]; then
            extra_parameter="${extra_parameter}
--train-config ${config_path}"
        else
            extra_parameter="${extra_parameter}
--train-config${idx} ${config_path}"
        fi
        idx=$((idx + 1))
    done

    cmd="python3 -u ${root_dir}/fairseq_cli/train.py
${data_dir}
--config-yaml ${data_config}
--task ${task}
--max-tokens ${max_tokens}
--skip-invalid-size-inputs-valid-test
--update-freq ${update_freq}
--log-interval 100
--save-dir ${model_dir}
--tensorboard-logdir ${model_dir}"

    if [[ -n ${extra_parameter} ]]; then
        cmd="${cmd}
${extra_parameter}"
    fi
    if [[ ${gpu_num} -gt 0 ]]; then
        cmd="${cmd}
--distributed-world-size $gpu_num
--ddp-backend no_c10d"
    fi
    if [[ $fp16 -eq 1 ]]; then
        cmd="${cmd}
--fp16"
    fi

    # Checkpointing / validation schedule. With step_valid=1 checkpoints are
    # additionally saved every 500 updates.
    if [[ $step_valid -eq 1 ]]; then
        validate_interval=1
        save_interval=1
        keep_last_epochs=10
        no_epoch_checkpoints=0
        save_interval_updates=500
        keep_interval_updates=10
    else
        validate_interval=1
        keep_last_epochs=10
    fi

    # Optional BLEU-based validation and checkpoint selection.
    if [[ $bleu_valid -eq 1 ]]; then
        cmd="$cmd
--eval-bleu
--eval-bleu-args '{\"beam\": 1}'
--eval-tokenized-bleu
--eval-bleu-remove-bpe
--best-checkpoint-metric bleu
--maximize-best-checkpoint-metric"
    fi

    # Only the schedule variables that are set are forwarded.
    if [[ -n $no_epoch_checkpoints && $no_epoch_checkpoints -eq 1 ]]; then
        cmd="$cmd
--no-epoch-checkpoints"
    fi
    if [[ -n $validate_interval ]]; then
        cmd="${cmd}
--validate-interval $validate_interval "
    fi
    if [[ -n $save_interval ]]; then
        cmd="${cmd}
--save-interval $save_interval "
    fi
    if [[ -n $keep_last_epochs ]]; then
        cmd="${cmd}
--keep-last-epochs $keep_last_epochs "
    fi
    if [[ -n $save_interval_updates ]]; then
        cmd="${cmd}
--save-interval-updates $save_interval_updates"
        if [[ -n $keep_interval_updates ]]; then
            cmd="${cmd}
--keep-interval-updates $keep_interval_updates"
        fi
    fi

    echo -e "\033[34mRun command: \n${cmd} \033[0m"

    # save info
    # Append this run to ./history.log and keep only its last 50 lines.
    log=./history.log
    echo "${time} | ${device} | ${data_dir} | ${exp_name} | ${model_dir} " >> $log
    tail -n 50 ${log} > tmp.log
    mv tmp.log $log

    export CUDA_VISIBLE_DEVICES=${device}
    # Run training in the background, appending all output to train.log.
    cmd="nohup ${cmd} >> ${model_dir}/train.log 2>&1 &"
    if [[ $eval -eq 1 ]]; then
        eval $cmd
        sleep 2s
        # Print the existing log and keep following it.
        tail -n "$(wc -l ${model_dir}/train.log | awk '{print $1+1}')" -f ${model_dir}/train.log
    fi
fi
# Wait for any background job started above before moving on.
wait
# Stage 2: optionally average checkpoints, then decode every requested
# subset and collect the last line of each generation log.
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    echo "stage 2: ST Decoding"
    # With n_average == 1 the configured dec_model is decoded as-is.
    if [[ ${n_average} -ne 1 ]]; then
        # Average models
        dec_model=avg_${n_average}_checkpoint.pt

        cmd="python ${root_dir}/scripts/average_checkpoints.py
--inputs ${model_dir}
--num-epoch-checkpoints ${n_average}
--output ${model_dir}/${dec_model}"
        echo -e "\033[34mRun command: \n${cmd} \033[0m"
        [[ $eval -eq 1 ]] && eval $cmd
    fi

    # Select devices unless $device was already provided.
    if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
        if [[ ${gpu_num} -eq 0 ]]; then
            device=""
        else
            source ./local/utils.sh
            device=$(get_devices $gpu_num 0)
        fi
    fi
    export CUDA_VISIBLE_DEVICES=${device}

    result_file=${model_dir}/decode_result
    [[ -f ${result_file} ]] && rm ${result_file}

    # test_subset is a comma-separated string. Split it into a real array:
    # quoting "${test_subset[@]}" on the scalar yielded a single word, so
    # several subsets were decoded as one bogus subset name.
    test_subset=${test_subset//,/ }
    read -r -a subsets <<< "${test_subset}"
    for subset in "${subsets[@]}"; do
        subset=${subset}_st
        cmd="python ${root_dir}/fairseq_cli/generate.py
${data_dir}
--config-yaml ${data_config}
--gen-subset ${subset}
--task speech_to_text
--path ${model_dir}/${dec_model}
--results-path ${model_dir}
--max-tokens ${max_tokens}
--beam ${beam_size}
--lenpen ${len_penalty}
--scoring sacrebleu"
        echo -e "\033[34mRun command: \n${cmd} \033[0m"

        if [[ $eval -eq 1 ]]; then
            eval $cmd
            tail -n 1 ${model_dir}/generate-${subset}.txt >> ${result_file}
        fi
    done

    # The result file only exists when something was decoded (eval=1);
    # an unconditional cat aborted dry runs under "set -e".
    if [[ -f ${result_file} ]]; then
        cat ${result_file}
    fi
fi
#! /bin/bash
# training the model
# Builds a "./run.sh --stage 1 --stop_stage 1 ..." command from the settings
# below, prints it and runs it.
gpu_num=8
update_freq=1
max_tokens=40000
# Optional extras; each is forwarded to run.sh only when non-empty.
extra_tag=
extra_parameter=
#extra_tag="${extra_tag}"
#extra_parameter="${extra_parameter} "
exp_tag=
# Exactly one config_list line is active; the others are kept as alternatives.
#config_list=(base)
config_list=(ctc)
#config_list=(sate_ctc)
#config_list=(ctc conformer rpr)
#config_list=(base sate)
#config_list=(pds_base)
#config_list=(pds_base conformer)
# exp full name
exp_name=
# Join the config names with commas for the --train_config option.
train_config=$(echo ${config_list[*]} | sed 's/ /,/g')
cmd="./run.sh
--stage 1
--stop_stage 1
--gpu_num ${gpu_num}
--update_freq ${update_freq}
--train_config ${train_config}
--max_tokens ${max_tokens}
"
if [[ -n ${exp_name} ]]; then
cmd="$cmd --exp_name ${exp_name}"
fi
if [[ -n ${exp_tag} ]]; then
cmd="$cmd --exp_tag ${exp_tag}"
fi
if [[ -n ${extra_tag} ]]; then
cmd="$cmd --extra_tag ${extra_tag}"
fi
# extra_parameter may contain spaces, so it is wrapped in escaped quotes
# to survive the eval below as a single option value.
if [[ -n ${extra_parameter} ]]; then
cmd="$cmd --extra_parameter \"${extra_parameter}\""
fi
echo ${cmd}
eval ${cmd}
...@@ -13,6 +13,8 @@ from itertools import groupby ...@@ -13,6 +13,8 @@ from itertools import groupby
from tempfile import NamedTemporaryFile from tempfile import NamedTemporaryFile
import string import string
import csv import csv
import yaml
import copy
import numpy as np import numpy as np
import pandas as pd import pandas as pd
...@@ -50,8 +52,6 @@ class AudioDataset(Dataset): ...@@ -50,8 +52,6 @@ class AudioDataset(Dataset):
tokenizer: bool = False) -> None: tokenizer: bool = False) -> None:
_root = Path(root) / "data" / split _root = Path(root) / "data" / split
wav_root, txt_root = _root / "wav", _root / "txt" wav_root, txt_root = _root / "wav", _root / "txt"
if tokenizer:
txt_root = _root / "txt.tok"
assert wav_root.is_dir() and txt_root.is_dir(), (_root, wav_root, txt_root) assert wav_root.is_dir() and txt_root.is_dir(), (_root, wav_root, txt_root)
self.use_raw = use_raw self.use_raw = use_raw
...@@ -62,25 +62,30 @@ class AudioDataset(Dataset): ...@@ -62,25 +62,30 @@ class AudioDataset(Dataset):
yaml_file = txt_root / f"{split}.yaml" yaml_file = txt_root / f"{split}.yaml"
if yaml_file.is_file(): if yaml_file.is_file():
self.mode = "yaml" self.mode = "yaml"
try:
import yaml
except ImportError:
print("Please install PyYAML to load the MuST-C YAML files")
with open(yaml_file) as f: with open(yaml_file) as f:
segments = yaml.load(f, Loader=yaml.BaseLoader) segments = yaml.load(f, Loader=yaml.BaseLoader)
total_length = len(segments)
if 0 < self.size < total_length:
segments = segments[:self.size]
# for idx, seg in enumerate(content):
# segments[idx] = seg
# if 0 < self.size < idx:
# break
else: else:
self.mode = "easy" self.mode = "easy"
segments = dict()
audio_file = txt_root / f"{split}.audio" audio_file = txt_root / f"{split}.audio"
assert audio_file.is_file(), audio_file assert audio_file.is_file(), audio_file
with open(audio_file) as f: with open(audio_file) as f:
audios = [line.strip() for line in f.readlines()] audios = [line.strip() for line in f.readlines()]
total_length = len(audios)
segments = dict() if 0 < self.size < total_length:
audios = audios[:self.size]
for idx, audio in enumerate(audios): for idx, audio in enumerate(audios):
segments[idx] = {"audio": audio} segments[idx] = {"audio": audio}
if 0 < self.size < idx:
break
# Load source and target utterances # Load source and target utterances
self.have_src_utt = False self.have_src_utt = False
...@@ -88,31 +93,44 @@ class AudioDataset(Dataset): ...@@ -88,31 +93,44 @@ class AudioDataset(Dataset):
for _lang in [src_lang, tgt_lang]: for _lang in [src_lang, tgt_lang]:
if _lang is None: if _lang is None:
continue continue
if Path.exists(txt_root / f"{split}.{_lang}"): txt_path = txt_root / f"{split}.{_lang}"
if tokenizer:
txt_path = txt_root / f"{split}.{_lang}.tok"
if Path.exists(txt_path):
if _lang == src_lang: if _lang == src_lang:
self.have_src_utt = True self.have_src_utt = True
else: else:
self.have_tgt_utt = True self.have_tgt_utt = True
with open(txt_root / f"{split}.{_lang}") as f: with open(txt_path) as f:
utterances = [r.strip() for r in f] utterances = [r.strip() for r in f]
assert len(audios) == len(utterances) assert total_length == len(utterances), (total_length, len(utterances))
if 0 < self.size < total_length:
utterances = utterances[:self.size]
for idx, u in enumerate(utterances): for idx, u in enumerate(utterances):
segments[idx][_lang] = u segments[idx][_lang] = u
if 0 < self.size < idx:
break
# Gather info # Gather info
self.data = dict() self.data = dict()
if self.mode == "easy": if self.mode == "easy":
real_idx = 0
for idx, v in segments.items(): for idx, v in segments.items():
audio_name = v["audio"]
v["audio"] = (wav_root / v["audio"].strip()).as_posix() + ".wav"
if self.speed_perturb is not None: if self.speed_perturb is not None:
for perturb in self.speed_perturb: for perturb in self.speed_perturb:
v["sp"] = perturb sp_item = copy.deepcopy(v)
v["id"] = f"{v['audio']}_sp{perturb}" sp_item["perturb"] = perturb
sp_item["id"] = f"{audio_name}_sp{perturb}"
self.data[real_idx] = sp_item
real_idx += 1
else: else:
v["id"] = v['audio'] v["id"] = audio_name
v["audio"] = (wav_root / v["audio"].strip()).as_posix() + ".wav" self.data[real_idx] = v
self.data[idx] = v real_idx += 1
if 0 < self.size <= real_idx:
break
elif self.mode == "yaml": elif self.mode == "yaml":
idx = 0 idx = 0
...@@ -129,14 +147,14 @@ class AudioDataset(Dataset): ...@@ -129,14 +147,14 @@ class AudioDataset(Dataset):
item["audio"] = wav_path.as_posix() item["audio"] = wav_path.as_posix()
item["offset"] = offset item["offset"] = offset
item["n_frames"] = n_frames item["n_frames"] = n_frames
item["sample_rate"] = sample_rate, item["sample_rate"] = sample_rate
item[src_lang] = segment[src_lang] item[src_lang] = segment[src_lang]
if tgt_lang is not None: if tgt_lang is not None:
item[tgt_lang] = segment[tgt_lang] item[tgt_lang] = segment[tgt_lang]
if self.speed_perturb is not None: if self.speed_perturb is not None:
for perturb in self.speed_perturb: for perturb in self.speed_perturb:
sp_item = item sp_item = copy.deepcopy(item)
sp_item["id"] = f"{_id}_sp{perturb}" sp_item["id"] = f"{_id}_sp{perturb}"
sp_item["perturb"] = perturb sp_item["perturb"] = perturb
self.data[idx] = sp_item self.data[idx] = sp_item
...@@ -145,7 +163,7 @@ class AudioDataset(Dataset): ...@@ -145,7 +163,7 @@ class AudioDataset(Dataset):
item["id"] = _id item["id"] = _id
self.data[idx] = item self.data[idx] = item
idx += 1 idx += 1
if 0 < self.size < idx: if 0 < self.size <= idx:
break break
def __getitem__(self, n: int): def __getitem__(self, n: int):
...@@ -155,7 +173,7 @@ class AudioDataset(Dataset): ...@@ -155,7 +173,7 @@ class AudioDataset(Dataset):
item = self.data[n] item = self.data[n]
audio = item["audio"] audio = item["audio"]
if getattr(item, "n_frames", False) and getattr(item, "sample_rate", False): if item.get("n_frames", False) and item.get("sample_rate", False):
n_frames = item["n_frames"] n_frames = item["n_frames"]
sample_rate = item["sample_rate"] sample_rate = item["sample_rate"]
else: else:
...@@ -164,16 +182,19 @@ class AudioDataset(Dataset): ...@@ -164,16 +182,19 @@ class AudioDataset(Dataset):
n_frames = info.num_frames n_frames = info.num_frames
waveform = None waveform = None
if item.get("perturb", False):
n_frames = n_frames / item['perturb']
if need_waveform: if need_waveform:
if getattr(item, "offset", False): offset = item.get('offset', False)
if offset:
waveform, sample_rate = torchaudio.load(audio, waveform, sample_rate = torchaudio.load(audio,
frame_offset=item["sample_rate"], frame_offset=offset,
num_frames=item["n_frames"]) num_frames=item["n_frames"])
else: else:
waveform, sample_rate = torchaudio.load(audio) waveform, sample_rate = torchaudio.load(audio)
if getattr(item, "perturb", False): if item.get("perturb", False):
n_frames = n_frames / item['perturb']
effects = [ effects = [
["speed", f"{item['perturb']}"], ["speed", f"{item['perturb']}"],
["rate", f"{sample_rate}"] ["rate", f"{sample_rate}"]
...@@ -204,6 +225,7 @@ def process(args): ...@@ -204,6 +225,7 @@ def process(args):
output_root = Path(args.output_root).absolute() output_root = Path(args.output_root).absolute()
# Extract features # Extract features
datasets = dict()
use_raw = args.raw use_raw = args.raw
size = args.size size = args.size
if args.speed_perturb: if args.speed_perturb:
...@@ -233,6 +255,8 @@ def process(args): ...@@ -233,6 +255,8 @@ def process(args):
src_lang, tgt_lang, split, src_lang, tgt_lang, split,
args.speed_perturb, size, use_raw, args.speed_perturb, size, use_raw,
args.tokenizer) args.tokenizer)
if split not in datasets:
datasets[split] = dataset
if is_train_split and args.cmvn_type == "global": if is_train_split and args.cmvn_type == "global":
print("And estimating cepstral mean and variance stats...") print("And estimating cepstral mean and variance stats...")
...@@ -287,10 +311,13 @@ def process(args): ...@@ -287,10 +311,13 @@ def process(args):
is_train_split = split.startswith("train") is_train_split = split.startswith("train")
manifest = {c: [] for c in MANIFEST_COLUMNS} manifest = {c: [] for c in MANIFEST_COLUMNS}
dataset = AudioDataset(root.as_posix(), if split in datasets:
src_lang, tgt_lang, split, dataset = datasets[split]
args.speed_perturb, size, use_raw, else:
args.tokenizer) dataset = AudioDataset(root.as_posix(),
src_lang, tgt_lang, split,
args.speed_perturb, size, use_raw,
args.tokenizer)
if args.task == "st" and args.add_src and dataset.have_src_utt: if args.task == "st" and args.add_src and dataset.have_src_utt:
manifest["src_text"] = [] manifest["src_text"] = []
for idx in tqdm(range(len(dataset))): for idx in tqdm(range(len(dataset))):
...@@ -303,7 +330,7 @@ def process(args): ...@@ -303,7 +330,7 @@ def process(args):
audio_path = item["audio"] audio_path = item["audio"]
# add offset and frames info # add offset and frames info
if getattr(item, "offset", False): if item.get("offset", False):
audio_path = f"{audio_path}:{item['offset']}:{n_frames}" audio_path = f"{audio_path}:{item['offset']}:{n_frames}"
manifest["audio"].append(audio_path) manifest["audio"].append(audio_path)
else: else:
...@@ -371,7 +398,7 @@ def process(args): ...@@ -371,7 +398,7 @@ def process(args):
if len(train_text) == 0: if len(train_text) == 0:
print("Loading the training text to build dictionary...") print("Loading the training text to build dictionary...")
for split in args.SPLITS: for split in splits:
if split.startswith("train"): if split.startswith("train"):
csv_path = output_root / f"{split}.tsv" csv_path = output_root / f"{split}.tsv"
with open(csv_path) as f: with open(csv_path) as f:
...@@ -384,18 +411,18 @@ def process(args): ...@@ -384,18 +411,18 @@ def process(args):
quoting=csv.QUOTE_NONE, quoting=csv.QUOTE_NONE,
) )
if task == "st" and args.add_src and args.share: if task == "st" and args.add_src and args.share:
for e in reader: for e in reader:
src_utt = dict(e)["src_text"] src_utt = dict(e)["src_text"]
if args.lowercase_src: if args.lowercase_src:
src_utt = src_utt.lower() src_utt = src_utt.lower()
if args.rm_punc_src: if args.rm_punc_src:
for w in string.punctuation: for w in string.punctuation:
src_utt = src_utt.replace(w, "") src_utt = src_utt.replace(w, "")
src_utt = " ".join(src_utt.split(" ")) src_utt = " ".join(src_utt.split(" "))
train_text.append(src_utt) train_text.append(src_utt)
tgt_text = [dict(e)["tgt_text"] for e in reader] tgt_text = [dict(e)["tgt_text"] for e in reader]
train_text.extend(tgt_text) train_text.extend(tgt_text)
with NamedTemporaryFile(mode="w") as f: with NamedTemporaryFile(mode="w") as f:
for t in train_text: for t in train_text:
......
...@@ -33,8 +33,8 @@ class MTDataset(Dataset): ...@@ -33,8 +33,8 @@ class MTDataset(Dataset):
""" """
def __init__(self, root: str, src_lang, tgt_lang: str, split: str) -> None: def __init__(self, root: str, src_lang, tgt_lang: str, split: str) -> None:
_root = Path(root) / "data" _root = Path(root) / "data" / split
txt_root = _root txt_root = _root / "txt" if (_root / "txt").is_dir() else _root
assert _root.is_dir() and txt_root.is_dir(), (_root, txt_root) assert _root.is_dir() and txt_root.is_dir(), (_root, txt_root)
# Load source and target text # Load source and target text
self.data = [] self.data = []
......
...@@ -23,7 +23,6 @@ from fairseq.file_io import PathManager ...@@ -23,7 +23,6 @@ from fairseq.file_io import PathManager
from fairseq.models import FairseqDecoder, FairseqEncoder from fairseq.models import FairseqDecoder, FairseqEncoder
from omegaconf import Container, DictConfig, open_dict, OmegaConf from omegaconf import Container, DictConfig, open_dict, OmegaConf
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -62,23 +61,28 @@ def save_checkpoint(cfg: CheckpointConfig, trainer, epoch_itr, val_loss): ...@@ -62,23 +61,28 @@ def save_checkpoint(cfg: CheckpointConfig, trainer, epoch_itr, val_loss):
suffix = trainer.checkpoint_suffix suffix = trainer.checkpoint_suffix
checkpoint_conds = collections.OrderedDict() checkpoint_conds = collections.OrderedDict()
checkpoint_conds["checkpoint{}{}.pt".format(epoch, suffix)] = ( checkpoint_conds["checkpoint{}{}.pt".format(epoch, suffix)] = (
end_of_epoch and not cfg.no_epoch_checkpoints and epoch % cfg.save_interval == 0 end_of_epoch and not cfg.no_epoch_checkpoints and epoch % cfg.save_interval == 0
) )
checkpoint_conds["checkpoint_{}_{}{}.pt".format(epoch, updates, suffix)] = ( checkpoint_conds["checkpoint_{}_{}{}.pt".format(epoch, updates, suffix)] = (
not end_of_epoch not end_of_epoch
and cfg.save_interval_updates > 0 and cfg.save_interval_updates > 0
and updates % cfg.save_interval_updates == 0 and updates % cfg.save_interval_updates == 0
) )
checkpoint_conds["checkpoint_best{}.pt".format(suffix)] = val_loss is not None and ( checkpoint_conds["checkpoint_best{}.pt".format(suffix)] = val_loss is not None and (
not hasattr(save_checkpoint, "best") not hasattr(save_checkpoint, "best")
or is_better(val_loss, save_checkpoint.best) or is_better(val_loss, save_checkpoint.best)
) )
if val_loss is not None and cfg.keep_best_checkpoints > 0: if val_loss is not None and cfg.keep_best_checkpoints > 0:
if not end_of_epoch and cfg.save_interval_updates > 0 and updates % cfg.save_interval_updates == 0:
epoch_or_step = updates
else:
epoch_or_step = epoch
checkpoint_conds[ checkpoint_conds[
"checkpoint.best_{}_{:.2f}.pt".format(cfg.best_checkpoint_metric, val_loss) "checkpoint.best_{}_{}_{:.3f}.pt".format(cfg.best_checkpoint_metric, epoch_or_step, val_loss)
] = not hasattr(save_checkpoint, "best") or is_better( ] = True
val_loss, save_checkpoint.best # not hasattr(save_checkpoint, "best") or is_better(
) # val_loss, save_checkpoint.best
# )
checkpoint_conds[ checkpoint_conds[
"checkpoint_last{}.pt".format(suffix) "checkpoint_last{}.pt".format(suffix)
] = not cfg.no_last_checkpoints ] = not cfg.no_last_checkpoints
...@@ -117,14 +121,14 @@ def save_checkpoint(cfg: CheckpointConfig, trainer, epoch_itr, val_loss): ...@@ -117,14 +121,14 @@ def save_checkpoint(cfg: CheckpointConfig, trainer, epoch_itr, val_loss):
checkpoints = checkpoint_paths( checkpoints = checkpoint_paths(
cfg.save_dir, pattern=r"checkpoint_\d+_(\d+)\.pt" cfg.save_dir, pattern=r"checkpoint_\d+_(\d+)\.pt"
) )
for old_chk in checkpoints[cfg.keep_interval_updates :]: for old_chk in checkpoints[cfg.keep_interval_updates:]:
if os.path.lexists(old_chk): if os.path.lexists(old_chk):
os.remove(old_chk) os.remove(old_chk)
if cfg.keep_last_epochs > 0: if cfg.keep_last_epochs > 0:
# remove old epoch checkpoints; checkpoints are sorted in descending order # remove old epoch checkpoints; checkpoints are sorted in descending order
checkpoints = checkpoint_paths(cfg.save_dir, pattern=r"checkpoint(\d+)\.pt") checkpoints = checkpoint_paths(cfg.save_dir, pattern=r"checkpoint(\d+)\.pt")
for old_chk in checkpoints[cfg.keep_last_epochs :]: for old_chk in checkpoints[cfg.keep_last_epochs:]:
if os.path.lexists(old_chk): if os.path.lexists(old_chk):
os.remove(old_chk) os.remove(old_chk)
...@@ -132,13 +136,13 @@ def save_checkpoint(cfg: CheckpointConfig, trainer, epoch_itr, val_loss): ...@@ -132,13 +136,13 @@ def save_checkpoint(cfg: CheckpointConfig, trainer, epoch_itr, val_loss):
# only keep the best N checkpoints according to validation metric # only keep the best N checkpoints according to validation metric
checkpoints = checkpoint_paths( checkpoints = checkpoint_paths(
cfg.save_dir, cfg.save_dir,
pattern=r"checkpoint\.best_{}_(\d+\.?\d*)\.pt".format( pattern=r"checkpoint\.best_{}_\d+_(\d+\.?\d*)\.pt".format(
cfg.best_checkpoint_metric cfg.best_checkpoint_metric
), ),
) )
if not cfg.maximize_best_checkpoint_metric: if not cfg.maximize_best_checkpoint_metric:
checkpoints = checkpoints[::-1] checkpoints = checkpoints[::-1]
for old_chk in checkpoints[cfg.keep_best_checkpoints :]: for old_chk in checkpoints[cfg.keep_best_checkpoints:]:
if os.path.lexists(old_chk): if os.path.lexists(old_chk):
os.remove(old_chk) os.remove(old_chk)
...@@ -158,7 +162,7 @@ def load_checkpoint(cfg: CheckpointConfig, trainer, **passthrough_args): ...@@ -158,7 +162,7 @@ def load_checkpoint(cfg: CheckpointConfig, trainer, **passthrough_args):
reset_dataloader = cfg.reset_dataloader reset_dataloader = cfg.reset_dataloader
if cfg.finetune_from_model is not None and ( if cfg.finetune_from_model is not None and (
reset_optimizer or reset_lr_scheduler or reset_meters or reset_dataloader reset_optimizer or reset_lr_scheduler or reset_meters or reset_dataloader
): ):
raise ValueError( raise ValueError(
"--finetune-from-model can not be set together with either --reset-optimizer" "--finetune-from-model can not be set together with either --reset-optimizer"
...@@ -167,7 +171,7 @@ def load_checkpoint(cfg: CheckpointConfig, trainer, **passthrough_args): ...@@ -167,7 +171,7 @@ def load_checkpoint(cfg: CheckpointConfig, trainer, **passthrough_args):
suffix = trainer.checkpoint_suffix suffix = trainer.checkpoint_suffix
if ( if (
cfg.restore_file == "checkpoint_last.pt" cfg.restore_file == "checkpoint_last.pt"
): # default value of restore_file is 'checkpoint_last.pt' ): # default value of restore_file is 'checkpoint_last.pt'
checkpoint_path = os.path.join( checkpoint_path = os.path.join(
cfg.save_dir, "checkpoint_last{}.pt".format(suffix) cfg.save_dir, "checkpoint_last{}.pt".format(suffix)
...@@ -210,10 +214,10 @@ def load_checkpoint(cfg: CheckpointConfig, trainer, **passthrough_args): ...@@ -210,10 +214,10 @@ def load_checkpoint(cfg: CheckpointConfig, trainer, **passthrough_args):
) )
if ( if (
extra_state is not None extra_state is not None
and "best" in extra_state and "best" in extra_state
and not reset_optimizer and not reset_optimizer
and not reset_meters and not reset_meters
): ):
save_checkpoint.best = extra_state["best"] save_checkpoint.best = extra_state["best"]
...@@ -297,13 +301,13 @@ def load_checkpoint_to_cpu(path, arg_overrides=None, load_on_all_ranks=False): ...@@ -297,13 +301,13 @@ def load_checkpoint_to_cpu(path, arg_overrides=None, load_on_all_ranks=False):
def load_model_ensemble( def load_model_ensemble(
filenames, filenames,
arg_overrides: Optional[Dict[str, Any]] = None, arg_overrides: Optional[Dict[str, Any]] = None,
task=None, task=None,
strict=True, strict=True,
suffix="", suffix="",
num_shards=1, num_shards=1,
state=None, state=None,
): ):
"""Loads an ensemble of models. """Loads an ensemble of models.
...@@ -314,7 +318,7 @@ def load_model_ensemble( ...@@ -314,7 +318,7 @@ def load_model_ensemble(
task (fairseq.tasks.FairseqTask, optional): task to use for loading task (fairseq.tasks.FairseqTask, optional): task to use for loading
""" """
assert not ( assert not (
strict and num_shards > 1 strict and num_shards > 1
), "Cannot load state dict with strict=True and checkpoint shards > 1" ), "Cannot load state dict with strict=True and checkpoint shards > 1"
ensemble, args, _task = load_model_ensemble_and_task( ensemble, args, _task = load_model_ensemble_and_task(
filenames, filenames,
...@@ -329,20 +333,20 @@ def load_model_ensemble( ...@@ -329,20 +333,20 @@ def load_model_ensemble(
def load_model_ensemble_and_task( def load_model_ensemble_and_task(
filenames, filenames,
arg_overrides: Optional[Dict[str, Any]] = None, arg_overrides: Optional[Dict[str, Any]] = None,
task=None, task=None,
strict=True, strict=True,
suffix="", suffix="",
num_shards=1, num_shards=1,
state=None, state=None,
): ):
assert state is None or len(filenames) == 1 assert state is None or len(filenames) == 1
from fairseq import tasks from fairseq import tasks
assert not ( assert not (
strict and num_shards > 1 strict and num_shards > 1
), "Cannot load state dict with strict=True and checkpoint shards > 1" ), "Cannot load state dict with strict=True and checkpoint shards > 1"
ensemble = [] ensemble = []
cfg = None cfg = None
...@@ -483,7 +487,7 @@ def _upgrade_state_dict(state): ...@@ -483,7 +487,7 @@ def _upgrade_state_dict(state):
state["optimizer_history"][-1]["num_updates"] = 0 state["optimizer_history"][-1]["num_updates"] = 0
# old model checkpoints may not have separate source/target positions # old model checkpoints may not have separate source/target positions
if "args" in state and hasattr(state["args"], "max_positions") and not hasattr( if "args" in state and hasattr(state["args"], "max_positions") and not hasattr(
state["args"], "max_source_positions" state["args"], "max_source_positions"
): ):
state["args"].max_source_positions = state["args"].max_positions state["args"].max_source_positions = state["args"].max_positions
state["args"].max_target_positions = state["args"].max_positions state["args"].max_target_positions = state["args"].max_positions
...@@ -518,14 +522,14 @@ def _upgrade_state_dict(state): ...@@ -518,14 +522,14 @@ def _upgrade_state_dict(state):
del state["args"].min_lr del state["args"].min_lr
# binary_cross_entropy => wav2vec criterion # binary_cross_entropy => wav2vec criterion
if ( if (
hasattr(state["args"], "criterion") hasattr(state["args"], "criterion")
and state["args"].criterion == "binary_cross_entropy" and state["args"].criterion == "binary_cross_entropy"
): ):
state["args"].criterion = "wav2vec" state["args"].criterion = "wav2vec"
# speech_pretraining => audio pretraining # speech_pretraining => audio pretraining
if ( if (
hasattr(state["args"], "task") hasattr(state["args"], "task")
and state["args"].task == "speech_pretraining" and state["args"].task == "speech_pretraining"
): ):
state["args"].task = "audio_pretraining" state["args"].task = "audio_pretraining"
# audio_cpc => wav2vec # audio_cpc => wav2vec
...@@ -536,9 +540,9 @@ def _upgrade_state_dict(state): ...@@ -536,9 +540,9 @@ def _upgrade_state_dict(state):
state["args"].lr = [state["args"].lr] state["args"].lr = [state["args"].lr]
# convert task data arg to a string instead of List[string] # convert task data arg to a string instead of List[string]
if ( if (
hasattr(state["args"], "data") hasattr(state["args"], "data")
and isinstance(state["args"].data, list) and isinstance(state["args"].data, list)
and len(state["args"].data) > 0 and len(state["args"].data) > 0
): ):
state["args"].data = state["args"].data[0] state["args"].data = state["args"].data[0]
...@@ -549,23 +553,23 @@ def _upgrade_state_dict(state): ...@@ -549,23 +553,23 @@ def _upgrade_state_dict(state):
with open_dict(cfg): with open_dict(cfg):
# any upgrades for Hydra-based configs # any upgrades for Hydra-based configs
if ( if (
"task" in cfg "task" in cfg
and "eval_wer_config" in cfg.task and "eval_wer_config" in cfg.task
and isinstance(cfg.task.eval_wer_config.print_alignment, bool) and isinstance(cfg.task.eval_wer_config.print_alignment, bool)
): ):
cfg.task.eval_wer_config.print_alignment = "hard" cfg.task.eval_wer_config.print_alignment = "hard"
if "generation" in cfg and isinstance(cfg.generation.print_alignment, bool): if "generation" in cfg and isinstance(cfg.generation.print_alignment, bool):
cfg.generation.print_alignment = "hard" cfg.generation.print_alignment = "hard"
if ( if (
"model" in cfg "model" in cfg
and "w2v_args" in cfg.model and "w2v_args" in cfg.model
and cfg.model.w2v_args is not None and cfg.model.w2v_args is not None
and ( and (
hasattr(cfg.model.w2v_args, "task") or "task" in cfg.model.w2v_args hasattr(cfg.model.w2v_args, "task") or "task" in cfg.model.w2v_args
) )
and isinstance( and isinstance(
cfg.model.w2v_args.task.eval_wer_config.print_alignment, bool cfg.model.w2v_args.task.eval_wer_config.print_alignment, bool
) )
): ):
cfg.model.w2v_args.task.eval_wer_config.print_alignment = "hard" cfg.model.w2v_args.task.eval_wer_config.print_alignment = "hard"
...@@ -644,9 +648,9 @@ def prune_state_dict(state_dict, model_cfg: Optional[DictConfig]): ...@@ -644,9 +648,9 @@ def prune_state_dict(state_dict, model_cfg: Optional[DictConfig]):
layer_name layer_name
) )
new_state_key = ( new_state_key = (
layer_name[: substitution_match.start(1)] layer_name[: substitution_match.start(1)]
+ new_layer_number + new_layer_number
+ layer_name[substitution_match.end(1) :] + layer_name[substitution_match.end(1):]
) )
new_state_dict[new_state_key] = state_dict[layer_name] new_state_dict[new_state_key] = state_dict[layer_name]
...@@ -666,7 +670,7 @@ def prune_state_dict(state_dict, model_cfg: Optional[DictConfig]): ...@@ -666,7 +670,7 @@ def prune_state_dict(state_dict, model_cfg: Optional[DictConfig]):
def load_pretrained_component_from_model( def load_pretrained_component_from_model(
component: Union[FairseqEncoder, FairseqDecoder], checkpoint: str, strict: bool = True, component: Union[FairseqEncoder, FairseqDecoder], checkpoint: str, strict: bool = True,
): ):
""" """
Load a pretrained FairseqEncoder or FairseqDecoder from checkpoint into the Load a pretrained FairseqEncoder or FairseqDecoder from checkpoint into the
...@@ -699,7 +703,7 @@ def load_pretrained_component_from_model( ...@@ -699,7 +703,7 @@ def load_pretrained_component_from_model(
for key in state["model"].keys(): for key in state["model"].keys():
if key.startswith(component_type): if key.startswith(component_type):
# encoder.input_layers.0.0.weight --> input_layers.0.0.weight # encoder.input_layers.0.0.weight --> input_layers.0.0.weight
component_subkey = key[len(component_type) + 1 :] component_subkey = key[len(component_type) + 1:]
component_state_dict[component_subkey] = state["model"][key] component_state_dict[component_subkey] = state["model"][key]
mismatch_keys = [] mismatch_keys = []
......
...@@ -251,7 +251,7 @@ class CtcCriterion(FairseqCriterion): ...@@ -251,7 +251,7 @@ class CtcCriterion(FairseqCriterion):
if c_total > 0: if c_total > 0:
metrics.log_derived( metrics.log_derived(
"uer", "cer",
lambda meters: safe_round( lambda meters: safe_round(
meters["_c_errors"].sum * 100.0 / meters["_c_total"].sum, 3 meters["_c_errors"].sum * 100.0 / meters["_c_total"].sum, 3
) )
......
...@@ -92,14 +92,14 @@ class LabelSmoothedCrossEntropyCriterionWithCTC( ...@@ -92,14 +92,14 @@ class LabelSmoothedCrossEntropyCriterionWithCTC(
logging_output["total"] = utils.item(total.data) logging_output["total"] = utils.item(total.data)
if self.ctc_weight > 0: if self.ctc_weight > 0:
ctc_loss = self.compute_ctc_loss(model, sample, encoder_out) ctc_loss, logging_output = self.compute_ctc_loss(model, sample, encoder_out, logging_output)
logging_output["ctc_loss"] = utils.item(ctc_loss.data) logging_output["ctc_loss"] = utils.item(ctc_loss.data)
loss = (1 - self.ctc_weight) * loss + self.ctc_weight * ctc_loss loss = (1 - self.ctc_weight) * loss + self.ctc_weight * ctc_loss
logging_output["loss"] = utils.item(loss.data) if reduce else loss.data logging_output["loss"] = utils.item(loss.data) if reduce else loss.data
return loss, sample_size, logging_output return loss, sample_size, logging_output
def compute_ctc_loss(self, model, sample, encoder_out): def compute_ctc_loss(self, model, sample, encoder_out, logging_output):
transcript = sample["transcript"] transcript = sample["transcript"]
ctc_logit = encoder_out["ctc_logit"][0] ctc_logit = encoder_out["ctc_logit"][0]
lprobs = model.get_normalized_probs( lprobs = model.get_normalized_probs(
...@@ -124,9 +124,7 @@ class LabelSmoothedCrossEntropyCriterionWithCTC( ...@@ -124,9 +124,7 @@ class LabelSmoothedCrossEntropyCriterionWithCTC(
transcript_lengths, transcript_lengths,
) )
logging_output = { logging_output["ctc_loss"] = utils.item(loss.data)
"ctc_loss": utils.item(loss.data), # * sample['ntokens'],
}
if not model.training: if not model.training:
import editdistance import editdistance
...@@ -182,7 +180,7 @@ class LabelSmoothedCrossEntropyCriterionWithCTC( ...@@ -182,7 +180,7 @@ class LabelSmoothedCrossEntropyCriterionWithCTC(
logging_output["c_errors"] = c_err logging_output["c_errors"] = c_err
logging_output["c_total"] = c_len logging_output["c_total"] = c_len
return loss return loss, logging_output
@staticmethod @staticmethod
def reduce_metrics(logging_outputs) -> None: def reduce_metrics(logging_outputs) -> None:
...@@ -205,18 +203,20 @@ class LabelSmoothedCrossEntropyCriterionWithCTC( ...@@ -205,18 +203,20 @@ class LabelSmoothedCrossEntropyCriterionWithCTC(
metrics.log_scalar( metrics.log_scalar(
"loss", loss_sum / sample_size / math.log(2), sample_size, round=3 "loss", loss_sum / sample_size / math.log(2), sample_size, round=3
) )
metrics.log_scalar( if trans_loss_sum != loss_sum:
"trans_loss", trans_loss_sum / ntokens / math.log(2), ntokens, round=3 metrics.log_scalar(
) "trans_loss", trans_loss_sum / ntokens / math.log(2), ntokens, round=3
)
metrics.log_scalar( metrics.log_scalar(
"nll_loss", nll_loss_sum / ntokens / math.log(2), ntokens, round=3 "nll_loss", nll_loss_sum / ntokens / math.log(2), ntokens, round=3
) )
metrics.log_scalar( if ctc_loss_sum > 0:
"ctc_loss", metrics.log_scalar(
ctc_loss_sum / sample_size / math.log(2), "ctc_loss",
sample_size, ctc_loss_sum / sample_size / math.log(2),
round=3, sample_size,
) round=3,
)
metrics.log_derived( metrics.log_derived(
"ppl", lambda meters: utils.get_perplexity(meters["nll_loss"].avg) "ppl", lambda meters: utils.get_perplexity(meters["nll_loss"].avg)
) )
...@@ -250,7 +250,7 @@ class LabelSmoothedCrossEntropyCriterionWithCTC( ...@@ -250,7 +250,7 @@ class LabelSmoothedCrossEntropyCriterionWithCTC(
if c_total > 0: if c_total > 0:
metrics.log_derived( metrics.log_derived(
"uer", "cer",
lambda meters: safe_round( lambda meters: safe_round(
meters["_c_errors"].sum * 100.0 / meters["_c_total"].sum, 3 meters["_c_errors"].sum * 100.0 / meters["_c_total"].sum, 3
) )
......
...@@ -6,7 +6,10 @@ import torch ...@@ -6,7 +6,10 @@ import torch
import torchaudio import torchaudio
def get_waveform( def get_waveform(
path_or_fp: Union[str, BinaryIO], normalization=True path_or_fp: Union[str, BinaryIO],
normalization=True,
offset=None,
size=None
) -> Tuple[np.ndarray, int]: ) -> Tuple[np.ndarray, int]:
"""Get the waveform and sample rate of a 16-bit mono-channel WAV or FLAC. """Get the waveform and sample rate of a 16-bit mono-channel WAV or FLAC.
...@@ -19,7 +22,10 @@ def get_waveform( ...@@ -19,7 +22,10 @@ def get_waveform(
if ext not in {".flac", ".wav"}: if ext not in {".flac", ".wav"}:
raise ValueError(f"Unsupported audio format: {ext}") raise ValueError(f"Unsupported audio format: {ext}")
waveform, sample_rate = torchaudio.load(path_or_fp) if offset is not None and size is not None:
waveform, sample_rate = torchaudio.load(path_or_fp, frame_offset=offset, num_frames=size)
else:
waveform, sample_rate = torchaudio.load(path_or_fp)
waveform = waveform.squeeze().numpy() waveform = waveform.squeeze().numpy()
if not normalization: if not normalization:
...@@ -73,12 +79,17 @@ def _get_torchaudio_fbank(waveform, sample_rate, n_bins=80) -> Optional[np.ndarr ...@@ -73,12 +79,17 @@ def _get_torchaudio_fbank(waveform, sample_rate, n_bins=80) -> Optional[np.ndarr
return None return None
def get_fbank(path_or_fp: Union[str, BinaryIO], n_bins=80) -> np.ndarray: def get_fbank(
path_or_fp: Union[str, BinaryIO],
n_bins=80,
offset=None,
size=None,
) -> np.ndarray:
"""Get mel-filter bank features via PyKaldi or TorchAudio. Prefer PyKaldi """Get mel-filter bank features via PyKaldi or TorchAudio. Prefer PyKaldi
(faster CPP implementation) to TorchAudio (Python implementation). Note that (faster CPP implementation) to TorchAudio (Python implementation). Note that
Kaldi/TorchAudio requires 16-bit signed integers as inputs and hence the Kaldi/TorchAudio requires 16-bit signed integers as inputs and hence the
waveform should not be normalized.""" waveform should not be normalized."""
sound, sample_rate = get_waveform(path_or_fp, normalization=False) sound, sample_rate = get_waveform(path_or_fp, normalization=False, offset=offset, size=size)
features = _get_kaldi_fbank(sound, sample_rate, n_bins) features = _get_kaldi_fbank(sound, sample_rate, n_bins)
if features is None: if features is None:
......
...@@ -182,6 +182,14 @@ def get_features_or_waveform_from_uncompressed_zip( ...@@ -182,6 +182,14 @@ def get_features_or_waveform_from_uncompressed_zip(
return features_or_waveform return features_or_waveform
def get_features_or_waveform_from_audio(
        path, offset, size, need_waveform=False
):
    """Load a segment of an audio file as a waveform or as fbank features.

    Args:
        path: path to the audio file; must end with ".wav" or ".flac".
        offset: start of the segment, forwarded to ``get_waveform`` /
            ``get_fbank`` (which hand it to ``torchaudio.load`` as
            ``frame_offset``).
        size: length of the segment, forwarded the same way (as
            ``num_frames``).
        need_waveform: if True return the waveform from ``get_waveform``,
            otherwise return the filter-bank features from ``get_fbank``.

    Returns:
        The waveform array or the fbank feature matrix for the segment.

    Raises:
        ValueError: if ``path`` does not have a supported audio extension.
    """
    # Explicit check instead of `assert`: assertions are stripped under
    # `python -O`, and get_waveform itself accepts both .wav and .flac.
    if not path.endswith((".wav", ".flac")):
        raise ValueError(f"Unsupported audio format: {path}")

    if need_waveform:
        # get_waveform returns (waveform, sample_rate); only the samples
        # are needed here.
        return get_waveform(path, offset=offset, size=size)[0]
    return get_fbank(path, offset=offset, size=size)
def get_features_or_waveform(path: str, need_waveform=False): def get_features_or_waveform(path: str, need_waveform=False):
"""Get speech features from .npy file or waveform from .wav/.flac file. """Get speech features from .npy file or waveform from .wav/.flac file.
The file may be inside an uncompressed ZIP file and is accessed via byte The file may be inside an uncompressed ZIP file and is accessed via byte
...@@ -205,9 +213,14 @@ def get_features_or_waveform(path: str, need_waveform=False): ...@@ -205,9 +213,14 @@ def get_features_or_waveform(path: str, need_waveform=False):
return get_features_from_npy_or_audio(_path) return get_features_from_npy_or_audio(_path)
elif len(extra) == 2: elif len(extra) == 2:
extra = [int(i) for i in extra] extra = [int(i) for i in extra]
features_or_waveform = get_features_or_waveform_from_uncompressed_zip( if _path.endswith('.zip'):
_path, extra[0], extra[1], need_waveform=need_waveform features_or_waveform = get_features_or_waveform_from_uncompressed_zip(
) _path, extra[0], extra[1], need_waveform=need_waveform
)
else:
features_or_waveform = get_features_or_waveform_from_audio(
_path, extra[0], extra[1], need_waveform=need_waveform
)
else: else:
raise ValueError(f"Invalid path: {path}") raise ValueError(f"Invalid path: {path}")
......
...@@ -54,12 +54,12 @@ class DLCLTransformerModel(TransformerModel): ...@@ -54,12 +54,12 @@ class DLCLTransformerModel(TransformerModel):
TransformerModel.add_args(parser) TransformerModel.add_args(parser)
# dense layer parameters # dense layer parameters
parser.add_argument('--encoder-history-type', # parser.add_argument('--encoder-history-type',
default="learnable_dense", # default="learnable_dense",
help='encoder layer history type') # help='encoder layer history type')
parser.add_argument('--decoder-history-type', # parser.add_argument('--decoder-history-type',
default="learnable_dense", # default="learnable_dense",
help='decoder layer history type') # help='decoder layer history type')
parser.add_argument('--encoder-integration-type', choices=['avg', 'sum'], parser.add_argument('--encoder-integration-type', choices=['avg', 'sum'],
help='encoder layer integration type') help='encoder layer integration type')
parser.add_argument('--decoder-integration-type', choices=['avg', 'sum'], parser.add_argument('--decoder-integration-type', choices=['avg', 'sum'],
......
...@@ -463,7 +463,11 @@ class PDSS2TTransformerModel(S2TTransformerModel): ...@@ -463,7 +463,11 @@ class PDSS2TTransformerModel(S2TTransformerModel):
type=float, type=float,
help="dropout in each stage", help="dropout in each stage",
) )
parser.add_argument(
"--ctc-layer",
type=int,
help="the layer of ctc",
)
pass pass
@classmethod @classmethod
...@@ -516,8 +520,9 @@ class PDSS2TTransformerEncoder(FairseqEncoder): ...@@ -516,8 +520,9 @@ class PDSS2TTransformerEncoder(FairseqEncoder):
else: else:
self.pds_attn_ds_ratios = None self.pds_attn_ds_ratios = None
self.fusion = getattr(args, "pds_fusion", False) self.pds_fusion = args.pds_fusion
self.pds_fusion_method = getattr(args, "pds_fusion_method", "all_conv") self.pds_fusion_method = args.pds_fusion_method
self.pds_fusion_transform = "conv" self.pds_fusion_transform = "conv"
if len(self.pds_fusion_method.split("_")) == 2: if len(self.pds_fusion_method.split("_")) == 2:
items = self.pds_fusion_method.split("_") items = self.pds_fusion_method.split("_")
...@@ -525,10 +530,10 @@ class PDSS2TTransformerEncoder(FairseqEncoder): ...@@ -525,10 +530,10 @@ class PDSS2TTransformerEncoder(FairseqEncoder):
self.pds_fusion_transform = items[1] self.pds_fusion_transform = items[1]
fusion_stages_num = 0 fusion_stages_num = 0
if self.fusion: if self.pds_fusion:
if self.pds_fusion_way == "all": if self.pds_fusion_method == "all":
fusion_stages_num = self.pds_stages fusion_stages_num = self.pds_stages
elif self.pds_fusion_way == "same": elif self.pds_fusion_method == "same":
for dim in self.pds_embed_dims: for dim in self.pds_embed_dims:
if dim == self.embed_dim: if dim == self.embed_dim:
fusion_stages_num += 1 fusion_stages_num += 1
...@@ -555,7 +560,7 @@ class PDSS2TTransformerEncoder(FairseqEncoder): ...@@ -555,7 +560,7 @@ class PDSS2TTransformerEncoder(FairseqEncoder):
"fusion {}, fusion method {}, fusion transformer {}.". "fusion {}, fusion method {}, fusion transformer {}.".
format(i, num_layers, ds_ratio, embed_dim, format(i, num_layers, ds_ratio, embed_dim,
kernel_size, use_pos_embed, ffn_ratio, num_head, kernel_size, use_pos_embed, ffn_ratio, num_head,
self.fusion, self.pds_fusion_method, self.pds_fusion_transform)) self.pds_fusion, self.pds_fusion_method, self.pds_fusion_transform))
if i == 0: if i == 0:
self.embed_scale = math.sqrt(embed_dim) self.embed_scale = math.sqrt(embed_dim)
...@@ -588,7 +593,7 @@ class PDSS2TTransformerEncoder(FairseqEncoder): ...@@ -588,7 +593,7 @@ class PDSS2TTransformerEncoder(FairseqEncoder):
self.pds_fusion_method == "same" and self.embed_dim == embed_dim self.pds_fusion_method == "same" and self.embed_dim == embed_dim
): ):
if i != self.pds_stages - 1: if i != self.pds_stages - 1:
ratio = reduce(lambda a, b: a * b, self.pds_sr_ratios[i + 1:]) ratio = reduce(lambda a, b: a * b, self.pds_ratios[i + 1:])
else: else:
ratio = 1 ratio = 1
...@@ -636,8 +641,10 @@ class PDSS2TTransformerEncoder(FairseqEncoder): ...@@ -636,8 +641,10 @@ class PDSS2TTransformerEncoder(FairseqEncoder):
(("ctc" in getattr(args, "criterion", False)) and (("ctc" in getattr(args, "criterion", False)) and
(getattr(args, "ctc_weight", False) > 0)) (getattr(args, "ctc_weight", False) > 0))
if self.use_ctc: if self.use_ctc:
self.ctc_layer = (args.encoder_layers + args.ctc_layer) % args.encoder_layers # self.ctc_layer = (args.encoder_layers + args.ctc_layer) % args.encoder_layers
self.inter_ctc = True if self.ctc_layer != args.encoder_layers else False # self.inter_ctc = True if self.ctc_layer != args.encoder_layers else False
self.ctc_layer = args.encoder_layers
self.inter_ctc = True if self.ctc_layer != 0 else False
if self.inter_ctc: if self.inter_ctc:
logger.info("Intermedia CTC loss in layer %d" % self.ctc_layer) logger.info("Intermedia CTC loss in layer %d" % self.ctc_layer)
...@@ -655,7 +662,7 @@ class PDSS2TTransformerEncoder(FairseqEncoder): ...@@ -655,7 +662,7 @@ class PDSS2TTransformerEncoder(FairseqEncoder):
dropout=args.dropout, dropout=args.dropout,
need_layernorm=True if self.inter_ctc else False) need_layernorm=True if self.inter_ctc else False)
if task.source_dictionary == task.target_dictionary: if task.source_dictionary == task.target_dictionary and embed_tokens is not None:
self.ctc.ctc_projection.weight = embed_tokens.weight self.ctc.ctc_projection.weight = embed_tokens.weight
if args.encoder_normalize_before: if args.encoder_normalize_before:
...@@ -747,7 +754,7 @@ class PDSS2TTransformerEncoder(FairseqEncoder): ...@@ -747,7 +754,7 @@ class PDSS2TTransformerEncoder(FairseqEncoder):
self.add_to_dict(x, dis, cos_sim_idx) self.add_to_dict(x, dis, cos_sim_idx)
if self.use_ctc and self.inter_ctc and self.ctc_layer == layer_idx: if self.use_ctc and self.inter_ctc and self.ctc_layer == layer_idx:
ctc_logit = self.CTC(x) ctc_logit = self.ctc(x.clone())
prev_state.append(x) prev_state.append(x)
prev_padding.append(encoder_padding_mask) prev_padding.append(encoder_padding_mask)
...@@ -787,7 +794,7 @@ class PDSS2TTransformerEncoder(FairseqEncoder): ...@@ -787,7 +794,7 @@ class PDSS2TTransformerEncoder(FairseqEncoder):
return { return {
"encoder_out": [x], # T x B x C "encoder_out": [x], # T x B x C
"ctc_logit": [ctc_logit], # T x B x C "ctc_logit": [] if ctc_logit is None else [ctc_logit], # T x B x C
"encoder_padding_mask": [encoder_padding_mask], # B x T "encoder_padding_mask": [encoder_padding_mask], # B x T
"encoder_embedding": [], # B x T x C "encoder_embedding": [], # B x T x C
"encoder_states": [], # List[T x B x C] "encoder_states": [], # List[T x B x C]
...@@ -800,10 +807,9 @@ class PDSS2TTransformerEncoder(FairseqEncoder): ...@@ -800,10 +807,9 @@ class PDSS2TTransformerEncoder(FairseqEncoder):
[] if len(encoder_out["encoder_out"]) == 0 [] if len(encoder_out["encoder_out"]) == 0
else [x.index_select(1, new_order) for x in encoder_out["encoder_out"]] else [x.index_select(1, new_order) for x in encoder_out["encoder_out"]]
) )
new_ctc_logit = ( new_ctc_logit = (
[] if len(encoder_out["ctc_logit"]) == 0 [] if len(encoder_out["ctc_logit"]) == 0
else [x.index_select(1, new_order) for x in encoder_out["ctc_logit"]] else [x.index_select(1, new_order) for x in encoder_out["ctc_logit"] if x is not None]
) )
new_encoder_padding_mask = ( new_encoder_padding_mask = (
...@@ -823,7 +829,7 @@ class PDSS2TTransformerEncoder(FairseqEncoder): ...@@ -823,7 +829,7 @@ class PDSS2TTransformerEncoder(FairseqEncoder):
return { return {
"encoder_out": new_encoder_out, # T x B x C "encoder_out": new_encoder_out, # T x B x C
"ctc_logit": [new_ctc_logit], # T x B x C "ctc_logit": new_ctc_logit, # T x B x C
"encoder_padding_mask": new_encoder_padding_mask, # B x T "encoder_padding_mask": new_encoder_padding_mask, # B x T
"encoder_embedding": new_encoder_embedding, # B x T x C "encoder_embedding": new_encoder_embedding, # B x T x C
"encoder_states": encoder_states, # List[T x B x C] "encoder_states": encoder_states, # List[T x B x C]
...@@ -901,8 +907,8 @@ def base_architecture(args): ...@@ -901,8 +907,8 @@ def base_architecture(args):
args.ctc_layer = getattr(args, "ctc_layer", 0) args.ctc_layer = getattr(args, "ctc_layer", 0)
args.pds_dropout = getattr(args, "pds_dropout", args.dropout) args.pds_dropout = getattr(args, "pds_dropout", args.dropout)
args.fusion = getattr(args, "fusion", False) args.pds_fusion = getattr(args, "pds_fusion", False)
args.fusion_method = getattr(args, "fusion_method", "all_conv") args.pds_fusion_method = getattr(args, "pds_fusion_method", "all_conv")
def set_pds_base_8(args): def set_pds_base_8(args):
......
...@@ -130,8 +130,8 @@ class S2TSATEModel(S2TTransformerModel): ...@@ -130,8 +130,8 @@ class S2TSATEModel(S2TTransformerModel):
component=encoder.text_encoder, checkpoint=args.load_pretrained_text_encoder_from, strict=False component=encoder.text_encoder, checkpoint=args.load_pretrained_text_encoder_from, strict=False
) )
if args.share_ctc_and_adapter and hasattr(encoder.adapter, "linear_adapter"): if args.share_ctc_and_adapter and hasattr(encoder.adapter, "embed_adapter"):
encoder.acoustic_encoder.ctc_projection.weight = encoder.adapter.linear_adapter[0].weight encoder.acoustic_encoder.ctc.ctc_projection.weight = encoder.adapter.embed_adapter.weight
return encoder return encoder
...@@ -175,10 +175,7 @@ class Adapter(nn.Module): ...@@ -175,10 +175,7 @@ class Adapter(nn.Module):
self.gate_linear1 = nn.Linear(embed_dim, embed_dim) self.gate_linear1 = nn.Linear(embed_dim, embed_dim)
self.gate_linear2 = nn.Linear(embed_dim, embed_dim) self.gate_linear2 = nn.Linear(embed_dim, embed_dim)
self.out_layernorm = LayerNorm(embed_dim) def forward(self, x, padding):
# self.out_layernorm = nn.Identity()
def forward(self, x, padding):
representation, distribution = x representation, distribution = x
batch, seq_len, embed_dim = representation.size() batch, seq_len, embed_dim = representation.size()
...@@ -188,30 +185,25 @@ class Adapter(nn.Module): ...@@ -188,30 +185,25 @@ class Adapter(nn.Module):
if self.adapter_type == "linear": if self.adapter_type == "linear":
out = self.linear_adapter(representation) out = self.linear_adapter(representation)
out = self.out_layernorm(out)
elif self.adapter_type == "context": elif self.adapter_type == "context":
out = torch.mm(distribution, self.embed_adapter.weight).view(batch, seq_len, -1) out = torch.mm(distribution, self.embed_adapter.weight).view(batch, seq_len, -1)
out = self.out_layernorm(out)
elif self.adapter_type == "subsample": elif self.adapter_type == "subsample":
representation = representation.transpose(0, 1) representation = representation.transpose(0, 1)
out, input_lengths = self.subsample_adaptor(representation, lengths) out, input_lengths = self.subsample_adaptor(representation, lengths)
padding = lengths_to_padding_mask(input_lengths) padding = lengths_to_padding_mask(input_lengths)
out = self.out_layernorm(out)
elif self.adapter_type == "league": elif self.adapter_type == "league":
linear_out = self.linear_adapter(representation) linear_out = self.linear_adapter(representation)
soft_out = torch.mm(distribution, self.embed_adapter.weight).view(batch, seq_len, -1) soft_out = torch.mm(distribution, self.embed_adapter.weight).view(batch, seq_len, -1)
out = linear_out + soft_out out = linear_out + soft_out
out = self.out_layernorm(out)
elif self.adapter_type == "gated_league": elif self.adapter_type == "gated_league":
linear_out = self.linear_adapter(representation) linear_out = self.linear_adapter(representation)
soft_out = torch.mm(distribution, self.embed_adapter.weight).view(batch, seq_len, -1) soft_out = torch.mm(distribution, self.embed_adapter.weight).view(batch, seq_len, -1)
coef = (self.gate_linear(torch.cat([linear_out, soft_out], dim=-1))).sigmoid() coef = (self.gate_linear(torch.cat([linear_out, soft_out], dim=-1))).sigmoid()
out = coef * linear_out + (1 - coef) * soft_out out = coef * linear_out + (1 - coef) * soft_out
out = self.out_layernorm(out)
elif self.adapter_type == "none": elif self.adapter_type == "none":
out = representation out = representation
...@@ -223,10 +215,10 @@ class Adapter(nn.Module): ...@@ -223,10 +215,10 @@ class Adapter(nn.Module):
return out, padding return out, padding
class TextEncoder(nn.Module): class TextEncoder(FairseqEncoder):
def __init__(self, args, dictionary): def __init__(self, args, dictionary):
super().__init__() super().__init__(None)
self.embed_tokens = None self.embed_tokens = None
...@@ -285,9 +277,9 @@ class S2TSATEEncoder(FairseqEncoder): ...@@ -285,9 +277,9 @@ class S2TSATEEncoder(FairseqEncoder):
# acoustic encoder # acoustic encoder
acoustic_encoder_type = args.acoustic_encoder acoustic_encoder_type = args.acoustic_encoder
if acoustic_encoder_type == "transformer": if acoustic_encoder_type == "transformer":
self.acoustic_encoder = S2TTransformerEncoder(args, task, embed_tokens) self.acoustic_encoder = S2TTransformerEncoder(args, task)
elif acoustic_encoder_type == "pds": elif acoustic_encoder_type == "pds":
self.acoustic_encoder = PDSS2TTransformerEncoder(args, task, embed_tokens) self.acoustic_encoder = PDSS2TTransformerEncoder(args, task)
else: else:
logging.error("Unsupported model arch {}!".format(acoustic_encoder_type)) logging.error("Unsupported model arch {}!".format(acoustic_encoder_type))
...@@ -295,8 +287,8 @@ class S2TSATEEncoder(FairseqEncoder): ...@@ -295,8 +287,8 @@ class S2TSATEEncoder(FairseqEncoder):
self.temperature = args.temperature self.temperature = args.temperature
self.adapter = Adapter(args, task.source_dictionary, embed_tokens) self.adapter = Adapter(args, task.source_dictionary, embed_tokens)
if args.share_ctc_and_adapter and hasattr(self.adapter, "linear_adapter"): if args.share_ctc_and_adapter and hasattr(self.adapter, "embed_adapter"):
self.acoustic_encoder.ctc_projection.weight = self.adapter.linear_adapter[0].weight self.acoustic_encoder.ctc.ctc_projection.weight = self.adapter.embed_adapter.weight
# self.length_adapter = Conv1dSubsampler( # self.length_adapter = Conv1dSubsampler(
# args.encoder_embed_dim, # args.encoder_embed_dim,
...@@ -463,6 +455,7 @@ def base_architecture(args): ...@@ -463,6 +455,7 @@ def base_architecture(args):
args.temperature = getattr(args, "temperature", 1.0) args.temperature = getattr(args, "temperature", 1.0)
args.text_encoder_layers = getattr(args, "text_encoder_layers", 6) args.text_encoder_layers = getattr(args, "text_encoder_layers", 6)
args.text_attention_type = getattr(args, "text_attention_type", "selfattn") args.text_attention_type = getattr(args, "text_attention_type", "selfattn")
args.share_ctc_and_adapter = getattr(args, "share_ctc_and_adapter", False)
# PDS # PDS
args.pds_stages = getattr(args, "pds_stages", None) args.pds_stages = getattr(args, "pds_stages", None)
......
...@@ -511,8 +511,8 @@ class S2TTransformerEncoder(FairseqEncoder): ...@@ -511,8 +511,8 @@ class S2TTransformerEncoder(FairseqEncoder):
self.use_ctc = "sate" in args.arch or \ self.use_ctc = "sate" in args.arch or \
(("ctc" in getattr(args, "criterion", "")) and (getattr(args, "ctc_weight", 0) > 0)) (("ctc" in getattr(args, "criterion", "")) and (getattr(args, "ctc_weight", 0) > 0))
if self.use_ctc: if self.use_ctc:
self.ctc_layer = (args.encoder_layers + args.ctc_layer) % args.encoder_layers self.ctc_layer = args.ctc_layer
self.inter_ctc = True if self.ctc_layer != args.encoder_layers else False self.inter_ctc = True if self.ctc_layer != 0 and self.ctc_layer != args.encoder_layers else False
if self.inter_ctc: if self.inter_ctc:
logger.info("Intermedia CTC loss in layer %d" % self.ctc_layer) logger.info("Intermedia CTC loss in layer %d" % self.ctc_layer)
self.ctc = CTC(args.encoder_embed_dim, self.ctc = CTC(args.encoder_embed_dim,
...@@ -520,7 +520,7 @@ class S2TTransformerEncoder(FairseqEncoder): ...@@ -520,7 +520,7 @@ class S2TTransformerEncoder(FairseqEncoder):
dropout=args.dropout, dropout=args.dropout,
need_layernorm=True if self.inter_ctc else False) need_layernorm=True if self.inter_ctc else False)
if task.source_dictionary == task.target_dictionary: if task.source_dictionary == task.target_dictionary and embed_tokens is not None:
self.ctc.ctc_projection.weight = embed_tokens.weight self.ctc.ctc_projection.weight = embed_tokens.weight
self.interleaved_dropout = getattr(args, "interleave_dropout", None) self.interleaved_dropout = getattr(args, "interleave_dropout", None)
...@@ -554,7 +554,6 @@ class S2TTransformerEncoder(FairseqEncoder): ...@@ -554,7 +554,6 @@ class S2TTransformerEncoder(FairseqEncoder):
def forward(self, src_tokens, src_lengths): def forward(self, src_tokens, src_lengths):
ctc_input = None
if self.history is not None: if self.history is not None:
self.history.clean() self.history.clean()
...@@ -632,7 +631,7 @@ class S2TTransformerEncoder(FairseqEncoder): ...@@ -632,7 +631,7 @@ class S2TTransformerEncoder(FairseqEncoder):
return { return {
"encoder_out": [x], # T x B x C "encoder_out": [x], # T x B x C
"ctc_logit": [ctc_logit], # B x T x C "ctc_logit": [] if ctc_logit is None else [ctc_logit], # B x T x C
"encoder_padding_mask": [encoder_padding_mask], # B x T "encoder_padding_mask": [encoder_padding_mask], # B x T
"encoder_embedding": [], # B x T x C "encoder_embedding": [], # B x T x C
"encoder_states": [], # List[T x B x C] "encoder_states": [], # List[T x B x C]
...@@ -648,7 +647,7 @@ class S2TTransformerEncoder(FairseqEncoder): ...@@ -648,7 +647,7 @@ class S2TTransformerEncoder(FairseqEncoder):
new_ctc_logit = ( new_ctc_logit = (
[] if len(encoder_out["ctc_logit"]) == 0 [] if len(encoder_out["ctc_logit"]) == 0
else [x.index_select(1, new_order) for x in encoder_out["ctc_logit"]] else [x.index_select(1, new_order) for x in encoder_out["ctc_logit"] if x is not None]
) )
new_encoder_padding_mask = ( new_encoder_padding_mask = (
......
...@@ -218,6 +218,28 @@ class TransformerModel(FairseqEncoderDecoderModel): ...@@ -218,6 +218,28 @@ class TransformerModel(FairseqEncoderDecoderModel):
], ],
help="transformer decoder self-attention layer type" help="transformer decoder self-attention layer type"
) )
parser.add_argument(
"--use-enc-dlcl",
default=False,
action='store_true',
help="use dlcl encoder",
)
parser.add_argument(
"--use-dec-dlcl",
default=False,
action='store_true',
help="use dlcl encoder",
)
parser.add_argument(
'--encoder-history-type',
default="learnable_dense",
help='encoder layer history type'
)
parser.add_argument(
'--decoder-history-type',
default="learnable_dense",
help='decoder layer history type'
)
parser.add_argument('--max-encoder-relative-length', type=int, default=-1, parser.add_argument('--max-encoder-relative-length', type=int, default=-1,
help='the max encoder relative length') help='the max encoder relative length')
parser.add_argument('--max-decoder-relative-length', type=int, default=-1, parser.add_argument('--max-decoder-relative-length', type=int, default=-1,
...@@ -474,6 +496,11 @@ class TransformerEncoder(FairseqEncoder): ...@@ -474,6 +496,11 @@ class TransformerEncoder(FairseqEncoder):
else: else:
self.layer_norm = None self.layer_norm = None
if getattr(args, "use_enc_dlcl", False):
self.history = CreateLayerHistory(args, is_encoder=True)
else:
self.history = None
def build_encoder_layer(self, args): def build_encoder_layer(self, args):
layer = TransformerEncoderLayer(args) layer = TransformerEncoderLayer(args)
if getattr(args, "checkpoint_activations", False): if getattr(args, "checkpoint_activations", False):
...@@ -571,6 +598,9 @@ class TransformerEncoder(FairseqEncoder): ...@@ -571,6 +598,9 @@ class TransformerEncoder(FairseqEncoder):
encoder_padding_mask = src_tokens.eq(self.padding_idx) encoder_padding_mask = src_tokens.eq(self.padding_idx)
has_pads = (src_tokens.device.type == "xla" or encoder_padding_mask.any()) has_pads = (src_tokens.device.type == "xla" or encoder_padding_mask.any())
if self.history is not None:
self.history.clean()
x, encoder_embedding = self.forward_embedding(src_tokens, token_embeddings) x, encoder_embedding = self.forward_embedding(src_tokens, token_embeddings)
# account for padding while computing the representation # account for padding while computing the representation
...@@ -585,8 +615,15 @@ class TransformerEncoder(FairseqEncoder): ...@@ -585,8 +615,15 @@ class TransformerEncoder(FairseqEncoder):
if return_all_hiddens: if return_all_hiddens:
encoder_states.append(x) encoder_states.append(x)
# add emb into history
if self.history is not None:
self.history.add(x)
# encoder layers # encoder layers
for layer in self.layers: for layer in self.layers:
if self.history is not None:
x = self.history.pop()
x = layer( x = layer(
x, encoder_padding_mask=encoder_padding_mask if has_pads else None x, encoder_padding_mask=encoder_padding_mask if has_pads else None
) )
...@@ -594,6 +631,12 @@ class TransformerEncoder(FairseqEncoder): ...@@ -594,6 +631,12 @@ class TransformerEncoder(FairseqEncoder):
assert encoder_states is not None assert encoder_states is not None
encoder_states.append(x) encoder_states.append(x)
if self.history is not None:
self.history.add(x)
if self.history is not None:
x = self.history.pop()
if self.layer_norm is not None: if self.layer_norm is not None:
x = self.layer_norm(x) x = self.layer_norm(x)
......
...@@ -67,6 +67,9 @@ def main(cfg: FairseqConfig) -> None: ...@@ -67,6 +67,9 @@ def main(cfg: FairseqConfig) -> None:
# Print args # Print args
logger.info(cfg) logger.info(cfg)
with open(os.path.join(cfg.checkpoint.save_dir, "config.yaml"), 'w') as f:
f.write("%s" % OmegaConf.to_yaml(cfg))
if cfg.checkpoint.write_checkpoints_asynchronously: if cfg.checkpoint.write_checkpoints_asynchronously:
try: try:
import iopath # noqa: F401 import iopath # noqa: F401
......
...@@ -73,33 +73,38 @@ def average_checkpoints(inputs): ...@@ -73,33 +73,38 @@ def average_checkpoints(inputs):
return new_state return new_state
def last_n_checkpoints(paths, n, combine_choice, upper_bound=None, max_metric=False):
    """Pick ``n`` checkpoint files from a single checkpoint directory.

    Args:
        paths: list containing exactly one directory path.
        n: number of checkpoint files to return.
        combine_choice: selects the file-name pattern and the ranking key.
            ``"update"`` matches ``checkpoint_<digits>_<digits>.pt`` and ranks
            by the second number; ``"best"`` matches
            ``checkpoint.best_loss_<digits>_<value>.pt`` and ranks by
            ``<value>``; any other value matches ``checkpoint<digits>.pt``
            and ranks by that number.
        upper_bound: when not ``None``, files whose ranking key is greater
            than this value are ignored.
        max_metric: only consulted for ``combine_choice == "best"``. When
            true the largest values are ranked first, otherwise the smallest.

    Returns:
        A list of ``n`` paths (directory joined with file name), in ranking
        order.

    Raises:
        Exception: if fewer than ``n`` matching files are found.
    """
    assert len(paths) == 1
    path = paths[0]

    # "update" and epoch selection always take the highest-numbered files.
    reverse = True
    if combine_choice == "update":
        pt_regexp = re.compile(r"checkpoint_\d+_(\d+)\.pt")
    elif combine_choice == "best":
        # Lower is better unless the caller says the metric is maximized.
        reverse = bool(max_metric)
        # NOTE(review): only checkpoints whose metric name is "loss" match
        # this pattern; confirm whether other metrics need to be supported.
        pt_regexp = re.compile(r"checkpoint\.best_loss_\d+_(\d+\.?\d*)\.pt")
    else:
        pt_regexp = re.compile(r"checkpoint(\d+)\.pt")

    entries = []
    for f in PathManager.ls(path):
        m = pt_regexp.fullmatch(f)
        if m is None:
            continue
        # float() covers both integer counters and fractional metric values.
        sort_key = float(m.group(1))
        if upper_bound is None or sort_key <= upper_bound:
            entries.append((sort_key, m.group(0)))

    if len(entries) < n:
        # The template must be formatted explicitly; passing the values as
        # extra Exception arguments leaves the "{}" placeholders unfilled.
        raise Exception(
            "Found {} checkpoint files but need at least {}".format(len(entries), n)
        )
    return [os.path.join(path, x[1]) for x in sorted(entries, reverse=reverse)[:n]]
def main(): def main():
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="Tool to average the params of input checkpoints to " description="Tool to average the params of input checkpoints to "
"produce a new checkpoint", "produce a new checkpoint",
) )
# fmt: off # fmt: off
parser.add_argument('--inputs', required=True, nargs='+', parser.add_argument('--inputs', required=True, nargs='+',
...@@ -109,42 +114,55 @@ def main(): ...@@ -109,42 +114,55 @@ def main():
num_group = parser.add_mutually_exclusive_group() num_group = parser.add_mutually_exclusive_group()
num_group.add_argument('--num-epoch-checkpoints', type=int, num_group.add_argument('--num-epoch-checkpoints', type=int,
help='if set, will try to find checkpoints with names checkpoint_xx.pt in the path specified by input, ' help='if set, will try to find checkpoints with names checkpoint_xx.pt in the path specified by input, '
'and average last this many of them.') 'and average last this many of them.')
num_group.add_argument('--num-update-checkpoints', type=int, num_group.add_argument('--num-update-checkpoints', type=int,
help='if set, will try to find checkpoints with names checkpoint_ee_xx.pt in the path specified by input, ' help='if set, will try to find checkpoints with names checkpoint_ee_xx.pt in the path specified by input, '
'and average last this many of them.') 'and average last this many of them.')
num_group.add_argument('--num-best-checkpoints', type=int,
help='if set, will try to find checkpoints with names checkpoint.best_{metric}_{epoch}_{performance}.pt in the path specified by input, '
'and average last this many of them.')
num_group.add_argument('--maximize-best-checkpoint-metric', default=False, action="store_true",
help='if set, will try to find checkpoints with names checkpoint.best_{metric}_{epoch}_{performance}.pt in the path specified by input, '
'and average last this many of them.')
parser.add_argument('--checkpoint-upper-bound', type=int, parser.add_argument('--checkpoint-upper-bound', type=int,
help='when using --num-epoch-checkpoints, this will set an upper bound on which epoch to use, ' help='when using --num-epoch-checkpoints, this will set an upper bound on which epoch to use, '
'when using --num-update-checkpoints, this will set an upper bound on which update to use' 'when using --num-update-checkpoints, this will set an upper bound on which update to use'
'e.g., with --num-epoch-checkpoints=10 --checkpoint-upper-bound=50, checkpoints 41-50 would be averaged.' 'e.g., with --num-epoch-checkpoints=10 --checkpoint-upper-bound=50, checkpoints 41-50 would be averaged.'
'e.g., with --num-update-checkpoints=10 --checkpoint-upper-bound=50000, checkpoints 40500-50000 would be averaged assuming --save-interval-updates 500' 'e.g., with --num-update-checkpoints=10 --checkpoint-upper-bound=50000, checkpoints 40500-50000 would be averaged assuming --save-interval-updates 500.'
'e.g., with --num-best-checkpoints=10 --checkpoint-upper-bound=5, checkpoints (loss) <= 5 would be averaged.'
) )
# fmt: on # fmt: on
args = parser.parse_args() args = parser.parse_args()
print(args) print(args)
num = None num = None
is_update_based = False combine_choice = "epoch"
max_metric = args.maximize_best_checkpoint_metric
if args.num_update_checkpoints is not None: if args.num_update_checkpoints is not None:
num = args.num_update_checkpoints num = args.num_update_checkpoints
is_update_based = True combine_choice = "update"
elif args.num_epoch_checkpoints is not None: elif args.num_epoch_checkpoints is not None:
num = args.num_epoch_checkpoints num = args.num_epoch_checkpoints
elif args.num_best_checkpoints is not None:
num = args.num_best_checkpoints
combine_choice = "best"
assert args.checkpoint_upper_bound is None or ( assert args.checkpoint_upper_bound is None or (
args.num_epoch_checkpoints is not None args.num_epoch_checkpoints is not None
or args.num_update_checkpoints is not None or args.num_update_checkpoints is not None
), "--checkpoint-upper-bound requires --num-epoch-checkpoints or --num-update-checkpoints" or args.num_best_checkpoints is not None
), "--checkpoint-upper-bound requires --num-epoch-checkpoints or --num-update-checkpoints or --num-best-checkpoints"
assert ( assert (
args.num_epoch_checkpoints is None or args.num_update_checkpoints is None args.num_epoch_checkpoints is None or args.num_update_checkpoints is None
), "Cannot combine --num-epoch-checkpoints and --num-update-checkpoints" ), "Cannot combine --num-epoch-checkpoints and --num-update-checkpoints"
if num is not None: if num is not None:
args.inputs = last_n_checkpoints( args.inputs = last_n_checkpoints(
args.inputs, args.inputs,
num, num,
is_update_based, combine_choice,
upper_bound=args.checkpoint_upper_bound, upper_bound=args.checkpoint_upper_bound,
max_metric=max_metric
) )
print("averaging checkpoints: ", args.inputs) print("averaging checkpoints: ", args.inputs)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论