Commit da4e7dc3 by xuchen

fix the bugs

parent 093098e4
train-subset: train_st
valid-subset: dev_st
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_transformer_s
share-decoder-input-output-embed: True
optimizer: adam
......
train-subset: train
valid-subset: dev
max-epoch: 100
max-update: 100000
patience: 20
best_checkpoint_metric: loss
maximize_best_checkpoint_metric: False
no-epoch-checkpoints: True
#keep-last-epochs: 10
keep-best-checkpoints: 10
num-workers: 8
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
\ No newline at end of file
ctc-weight: 0.3
post-process: sentencepiece
\ No newline at end of file
......@@ -15,6 +15,7 @@ report-accuracy: True
#load-pretrained-decoder-from:
arch: s2t_transformer_s
#arch: pdss2t_transformer_s
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
......
arch: pdss2t_transformer_s_8
train-subset: train_asr
valid-subset: dev_asr
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
pds-fusion: True
ctc-layer: 12
share-decoder-input-output-embed: True
optimizer: adam
......
arch: pdss2t_transformer_s_8
train-subset: train_st
valid-subset: dev_st
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: pdss2t_transformer_s_16
encoder-embed-dim: 256
pds-stages: 4
ctc-layer: 12
pds-layers: 2_2_6_2
pds-ratios: 2_2_2_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
......
arch: pdss2t_transformer_s_8
train-subset: train_asr
valid-subset: dev_asr
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: pdss2t_transformer_s_32
encoder-embed-dim: 256
pds-stages: 5
ctc-layer: 12
pds-layers: 2_2_3_3_2
pds-ratios: 2_2_2_2_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 256_256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1_1
pds-kernel-sizes: 5_5_5_5_5
pds-ffn-ratios: 8_8_8_8_8
pds-attn-heads: 4_4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
......
arch: pdss2t_transformer_s_8
train-subset: train_st
valid-subset: dev_st
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
encoder-embed-dim: 256
pds-stages: 4
ctc-layer: 12
pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
......
......@@ -10,6 +10,7 @@ if [ "$#" -eq 1 ]; then
exp_name=$1
fi
cer=1
n_average=10
beam_size=5
len_penalty=1.0
......@@ -21,6 +22,7 @@ cmd="./run.sh
--stop_stage 2
--gpu_num ${gpu_num}
--exp_name ${exp_name}
--cer ${cer}
--n_average ${n_average}
--beam_size ${beam_size}
--len_penalty ${len_penalty}
......
......@@ -24,31 +24,31 @@ stop_stage=0
gpu_num=0
update_freq=1
s2t_dir=~/Code/st
root_dir=${s2t_dir}/Fairseq-S2T
root_dir=~/st
code_dir=${root_dir}/Fairseq-S2T
pwd_dir=$PWD
# dataset
src_lang=en
tgt_lang=fr
lang=${src_lang}-${tgt_lang}
src_lang=zh
lang=${src_lang}
dataset=libri_trans
dataset=aishell
task=speech_to_text
vocab_type=unigram
vocab_size=1000
speed_perturb=0
lcrm=1
vocab_type=char
vocab_size=5000
speed_perturb=1
lcrm=0
tokenizer=0
use_raw_audio=1
use_raw_audio=0
use_specific_dict=0
specific_prefix=st
specific_dir=${s2t_dir}/data/mustc/st/en-de
specific_dir=${root_dir}/data/mustc/st/en-de
asr_vocab_prefix=spm_unigram10000_st_share
org_data_dir=${s2t_dir}/data/${dataset}
data_dir=${s2t_dir}/data/${dataset}/asr
org_data_dir=${root_dir}/data/${dataset}
data_dir=${root_dir}/data/${dataset}/asr
train_split=train
valid_split=dev
test_split=test
......@@ -71,6 +71,7 @@ max_tokens=40000
step_valid=0
# decoding setting
cer=1
dec_model=checkpoint_best.pt
n_average=10
beam_size=5
......@@ -96,6 +97,10 @@ if [[ ${use_raw_audio} -eq 1 ]]; then
data_dir=${data_dir}_raw
exp_prefix=${exp_prefix}_raw
fi
if [[ "${vocab_type}" == "char" ]]; then
data_dir=${data_dir}_char
exp_prefix=${exp_prefix}_char
fi
. ./local/parse_options.sh || exit 1;
......@@ -106,7 +111,7 @@ if [[ -z ${exp_name} ]]; then
exp_name=${exp_name}_${extra_tag}
fi
fi
model_dir=$root_dir/../checkpoints/$dataset/asr/${exp_name}
model_dir=$code_dir/../checkpoints/$dataset/asr/${exp_name}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download"
......@@ -120,13 +125,19 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
if [[ ! -e ${data_dir} ]]; then
mkdir -p ${data_dir}
fi
if [[ ! -f ${data_dir}/fbank80.zip && -f ${data_dir}/../fbank80.zip ]]; then
ln -s ${data_dir}/../fbank80.zip ${data_dir}
fi
if [[ ! -f ${data_dir}/fbank80_sp.zip && -f ${data_dir}/../fbank80_sp.zip ]]; then
ln -s ${data_dir}/../fbank80_sp.zip ${data_dir}
fi
cmd="python ${root_dir}/examples/speech_to_text/prep_audio_data.py
cmd="python ${code_dir}/examples/speech_to_text/prep_audio_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}
--task asr
--splits ${train_split},${valid_split},${test_split}
--src-lang ${src_lang}
--splits ${valid_split},${test_split},${train_split}
--vocab-type ${vocab_type}
--vocab-size ${vocab_size}"
......@@ -135,7 +146,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--raw"
fi
if [[ ${use_specific_dict} -eq 1 ]]; then
cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}/${lang}
cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}
cmd="$cmd
--asr-prefix ${asr_vocab_prefix}"
fi
......@@ -155,6 +166,15 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
if [[ ! -f ${data_dir}/../fbank80.zip ]]; then
mv ${data_dir}/fbank80.zip ${data_dir}/..
ln -s ${data_dir}/../fbank80.zip ${data_dir}
fi
if [[ ! -f ${data_dir}/../fbank80_sp.zip ]]; then
mv ${data_dir}/fbank80_sp.zip ${data_dir}/..
ln -s ${data_dir}/../fbank80_sp.zip ${data_dir}
fi
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
......@@ -181,28 +201,26 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
cp ${BASH_SOURCE[0]} ${model_dir}
cp ${PWD}/train.sh ${model_dir}
extra_parameter="${extra_parameter}
--train-config ${pwd_dir}/conf/basis.yaml"
cp ${pwd_dir}/conf/basis.yaml ${model_dir}
config_list="${train_config//,/ }"
idx=0
idx=1
for config in ${config_list[@]}
do
config_path=$pwd_dir/conf/${config}.yaml
config_path=${pwd_dir}/conf/${config}.yaml
if [[ ! -f ${config_path} ]]; then
echo "No config file ${config_path}"
exit
fi
cp ${config_path} ${model_dir}
if [[ idx -eq 0 ]]; then
extra_parameter="${extra_parameter}
--train-config ${config_path}"
else
extra_parameter="${extra_parameter}
extra_parameter="${extra_parameter}
--train-config${idx} ${config_path}"
fi
idx=$((idx + 1))
done
cmd="python3 -u ${root_dir}/fairseq_cli/train.py
cmd="python3 -u ${code_dir}/fairseq_cli/train.py
${data_dir}
--config-yaml ${data_config}
--task ${task}
......@@ -286,12 +304,14 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# Average models
dec_model=avg_${n_average}_checkpoint.pt
cmd="python ${root_dir}/scripts/average_checkpoints.py
--inputs ${model_dir}
--num-epoch-checkpoints ${n_average}
--output ${model_dir}/${dec_model}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval $cmd
if [[ ! -f ${model_dir}/${dec_model} ]]; then
cmd="python ${code_dir}/scripts/average_checkpoints.py
--inputs ${model_dir}
--num-best-checkpoints ${n_average}
--output ${model_dir}/${dec_model}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval $cmd
fi
else
dec_model=${dec_model}
fi
......@@ -311,8 +331,8 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
test_subset=${test_subset//,/ }
for subset in ${test_subset[@]}; do
subset=${subset}_asr
cmd="python ${root_dir}/fairseq_cli/generate.py
subset=${subset}
cmd="python ${code_dir}/fairseq_cli/generate.py
${data_dir}
--config-yaml ${data_config}
--gen-subset ${subset}
......@@ -323,10 +343,13 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--beam ${beam_size}
--lenpen ${len_penalty}
--scoring wer
--wer-tokenizer 13a
--wer-lowercase
--wer-remove-punct
"
if [[ ${cer} -eq 1 ]]; then
cmd="${cmd}
--wer-char-level"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
if [[ $eval -eq 1 ]]; then
......
set -e
eval=1
lcrm=0
root_dir=~/st/Fairseq-S2T
data_dir=/home/xuchen/st/data/wmt/test
......
train-subset: train
valid-subset: valid
max-epoch: 50
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: transformer
share-decoder-input-output-embed: True
optimizer: adam
......@@ -28,19 +11,22 @@ adam_betas: (0.9,0.997)
criterion: label_smoothed_cross_entropy
label_smoothing: 0.1
dropout: 0.1
attention-dropout: 0.1
activation-dropout: 0.1
dropout: 0.3
attention-dropout: 0.0
activation-dropout: 0.0
activation-fn: relu
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-ffn-embed-dim: 1024
encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 8
encoder-attention-heads: 4
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
decoder-ffn-embed-dim: 1024
decoder-attention-heads: 4
load-pretrained-encoder-from:
load-pretrained-decoder-from:
\ No newline at end of file
arch: transformer_iwslt_de_en
share-decoder-input-output-embed: True
optimizer: adam
#clip-norm: 10.0
lr-scheduler: inverse_sqrt
weight-decay: 0.0001
warmup-init-lr: 1e-7
warmup-updates: 4000
lr: 5e-4
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy
label_smoothing: 0.1
dropout: 0.3
activation-fn: relu
encoder-normalize-before: False
decoder-normalize-before: False
encoder-embed-dim: 512
encoder-ffn-embed-dim: 1024
encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 512
decoder-ffn-embed-dim: 1024
decoder-attention-heads: 4
load-pretrained-encoder-from:
load-pretrained-decoder-from:
\ No newline at end of file
train-subset: train
valid-subset: valid
max-epoch: 50
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: transformer
share-decoder-input-output-embed: True
optimizer: adam
......@@ -28,19 +11,22 @@ adam_betas: (0.9,0.997)
criterion: label_smoothed_cross_entropy
label_smoothing: 0.1
dropout: 0.1
attention-dropout: 0.1
activation-dropout: 0.1
dropout: 0.3
attention-dropout: 0.0
activation-dropout: 0.0
activation-fn: relu
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-embed-dim: 512
encoder-ffn-embed-dim: 1024
encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-embed-dim: 512
decoder-ffn-embed-dim: 1024
decoder-attention-heads: 4
load-pretrained-encoder-from:
load-pretrained-decoder-from:
\ No newline at end of file
train-subset: train
valid-subset: valid
max-epoch: 50
max-update: 50000
patience: 20
best_checkpoint_metric: loss
maximize_best_checkpoint_metric: False
no-epoch-checkpoints: True
#keep-last-epochs: 10
keep-best-checkpoints: 10
num-workers: 8
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
\ No newline at end of file
......@@ -2,4 +2,4 @@
encoder-attention-type: relative
decoder-attention-type: relative
max-encoder-relative-length: 20
max-decoder-relative-length: 20
\ No newline at end of file
max-decoder-relative-length: 20
......@@ -10,6 +10,7 @@ if [ "$#" -eq 1 ]; then
exp_name=$1
fi
sacrebleu=0
n_average=10
beam_size=5
len_penalty=1.0
......@@ -21,6 +22,7 @@ cmd="./run.sh
--stop_stage 2
--gpu_num ${gpu_num}
--exp_name ${exp_name}
--sacrebleu ${sacrebleu}
--n_average ${n_average}
--beam_size ${beam_size}
--len_penalty ${len_penalty}
......
......@@ -13,7 +13,7 @@ set -o pipefail
export PYTHONIOENCODING=UTF-8
eval=1
time=$(date "+%m%d_%H%M")
time=$(date "+%m%d")
stage=0
stop_stage=0
......@@ -24,33 +24,34 @@ device=()
gpu_num=8
update_freq=1
root_dir=~/st/Fairseq-S2T
root_dir=~/st
code_dir=${root_dir}/Fairseq-S2T
pwd_dir=$PWD
# dataset
src_lang=en
tgt_lang=de
src_lang=de
tgt_lang=en
lang=${src_lang}-${tgt_lang}
dataset=mt
dataset=iwslt14
task=translation
vocab_type=unigram
vocab_size=10000
share_dict=1
lcrm=0
tokenizer=0
tokenizer=1
use_specific_dict=0
specific_prefix=st
specific_dir=/home/xuchen/st/data/mustc/st/en-de/
specific_dir=${root_dir}/data/mustc/st/en-de/
src_vocab_prefix=spm_unigram10000_st_share
tgt_vocab_prefix=spm_unigram10000_st_share
org_data_dir=~/st/data/${dataset}
data_dir=~/st/data/${dataset}/mt/${lang}
org_data_dir=${root_dir}/data/${dataset}
data_dir=${root_dir}/data/${dataset}/mt
train_subset=train
valid_subset=dev
trans_subset=tst-COMMON
trans_subset=test
test_subset=test
# exp
......@@ -70,6 +71,7 @@ step_valid=0
bleu_valid=0
# decoding setting
sacrebleu=0
dec_model=checkpoint_best.pt
n_average=10
beam_size=5
......@@ -80,13 +82,19 @@ if [[ ${use_specific_dict} -eq 1 ]]; then
data_dir=${data_dir}/${specific_prefix}
mkdir -p ${data_dir}
else
data_dir=${data_dir}/${vocab_type}${vocab_size}
src_vocab_prefix=spm_${vocab_type}${vocab_size}_${src_lang}
tgt_vocab_prefix=spm_${vocab_type}${vocab_size}_${tgt_lang}
if [[ "${vocab_type}" == "char" ]]; then
vocab_name=${vocab_type}
exp_prefix=${exp_prefix}_${vocab_type}
else
vocab_name=${vocab_type}${vocab_size}
fi
data_dir=${data_dir}/${vocab_name}
src_vocab_prefix=spm_${vocab_name}_${src_lang}
tgt_vocab_prefix=spm_${vocab_name}_${tgt_lang}
if [[ $share_dict -eq 1 ]]; then
data_dir=${data_dir}_share
src_vocab_prefix=spm_${vocab_type}${vocab_size}_share
tgt_vocab_prefix=spm_${vocab_type}${vocab_size}_share
src_vocab_prefix=spm_${vocab_name}_share
tgt_vocab_prefix=spm_${vocab_name}_share
fi
fi
if [[ ${lcrm} -eq 1 ]]; then
......@@ -111,7 +119,7 @@ if [[ -z ${exp_name} ]]; then
exp_name=${exp_name}_${extra_tag}
fi
fi
model_dir=$root_dir/../checkpoints/$dataset/mt/${exp_name}
model_dir=$code_dir/../checkpoints/$dataset/mt/${exp_name}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download"
......@@ -127,7 +135,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
if [[ ! -f ${data_dir}/${src_vocab_prefix}.txt || ! -f ${data_dir}/${tgt_vocab_prefix}.txt ]]; then
if [[ ${use_specific_dict} -eq 0 ]]; then
cmd="python ${root_dir}/examples/speech_to_text/prep_mt_data.py
cmd="python ${code_dir}/examples/speech_to_text/prep_mt_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}
--splits ${train_subset},${valid_subset},${trans_subset}
......@@ -150,9 +158,14 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
mkdir -p ${data_dir}/data
for split in ${train_subset} ${valid_subset} ${trans_subset}; do
{
cmd="cat ${org_data_dir}/${lang}/data/${split}/txt/${split}.${src_lang}"
if [[ -d ${org_data_dir}/data/${split}/txt ]]; then
txt_dir=${org_data_dir}/data/${split}/txt
else
txt_dir=${org_data_dir}/data/${split}
fi
cmd="cat ${txt_dir}/${split}.${src_lang}"
if [[ ${lcrm} -eq 1 ]]; then
cmd="python local/lower_rm.py ${org_data_dir}/${lang}/data/${split}.${src_lang}"
cmd="python local/lower_rm.py ${org_data_dir}/data/${split}.${src_lang}"
fi
cmd="${cmd}
| spm_encode --model ${data_dir}/${src_vocab_prefix}.model
......@@ -165,7 +178,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
cmd="spm_encode
--model ${data_dir}/${tgt_vocab_prefix}.model
--output_format=piece
< ${org_data_dir}/${lang}/data/${split}.${tgt_lang}
< ${txt_dir}/${split}.${tgt_lang}
> ${data_dir}/data/${split}.${tgt_lang}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
......@@ -174,7 +187,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
done
wait
cmd="python ${root_dir}/fairseq_cli/preprocess.py
cmd="python ${code_dir}/fairseq_cli/preprocess.py
--source-lang ${src_lang} --target-lang ${tgt_lang}
--trainpref ${data_dir}/data/${train_subset}
--validpref ${data_dir}/data/${valid_subset}
......@@ -214,28 +227,26 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
cp ${BASH_SOURCE[0]} ${model_dir}
cp ${PWD}/train.sh ${model_dir}
extra_parameter="${extra_parameter}
--train-config ${pwd_dir}/conf/basis.yaml"
cp ${pwd_dir}/conf/basis.yaml ${model_dir}
config_list="${train_config//,/ }"
idx=0
idx=1
for config in ${config_list[@]}
do
config_path=$pwd_dir/conf/${config}.yaml
config_path=${pwd_dir}/conf/${config}.yaml
if [[ ! -f ${config_path} ]]; then
echo "No config file ${config_path}"
exit
fi
cp ${config_path} ${model_dir}
if [[ idx -eq 0 ]]; then
extra_parameter="${extra_parameter}
--train-config ${config_path}"
else
extra_parameter="${extra_parameter}
extra_parameter="${extra_parameter}
--train-config${idx} ${config_path}"
fi
idx=$((idx + 1))
done
cmd="python3 -u ${root_dir}/fairseq_cli/train.py
cmd="python3 -u ${code_dir}/fairseq_cli/train.py
${data_dir}
--source-lang ${src_lang}
--target-lang ${tgt_lang}
......@@ -263,13 +274,9 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
if [[ $step_valid -eq 1 ]]; then
validate_interval=1
save_interval=1
keep_last_epochs=10
no_epoch_checkpoints=0
save_interval_updates=500
keep_interval_updates=10
else
validate_interval=1
keep_last_epochs=10
fi
if [[ $bleu_valid -eq 1 ]]; then
cmd="$cmd
......@@ -292,10 +299,6 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
cmd="${cmd}
--save-interval $save_interval "
fi
if [[ -n $keep_last_epochs ]]; then
cmd="${cmd}
--keep-last-epochs $keep_last_epochs "
fi
if [[ -n $save_interval_updates ]]; then
cmd="${cmd}
--save-interval-updates $save_interval_updates"
......@@ -329,9 +332,9 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# Average models
dec_model=avg_${n_average}_checkpoint.pt
cmd="python ${root_dir}/scripts/average_checkpoints.py
cmd="python ${code_dir}/scripts/average_checkpoints.py
--inputs ${model_dir}
--num-epoch-checkpoints ${n_average}
--num-best-checkpoints ${n_average}
--output ${model_dir}/${dec_model}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval $cmd
......@@ -354,7 +357,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
test_subset=(${test_subset//,/ })
for subset in ${test_subset[@]}; do
cmd="python ${root_dir}/fairseq_cli/generate.py
cmd="python ${code_dir}/fairseq_cli/generate.py
${data_dir}
--source-lang ${src_lang}
--target-lang ${tgt_lang}
......@@ -365,14 +368,17 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--max-tokens ${max_tokens}
--beam ${beam_size}
--lenpen ${len_penalty}
--post-process sentencepiece
--scoring sacrebleu"
--post-process sentencepiece"
if [[ ${tokenizer} -eq 1 ]]; then
if [[ ${sacrebleu} -eq 1 ]]; then
cmd="${cmd}
--scoring sacrebleu"
if [[ ${tokenizer} -eq 1 ]]; then
cmd="${cmd}
--tokenizer moses
--moses-source-lang ${src_lang}
--moses-target-lang ${tgt_lang}"
fi
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
......
arch: pdss2t_transformer_s_16
encoder-embed-dim: 256
pyramid-stages: 4
#pyramid-dropout: 0
pyramid-layers: 2_2_6_2
pyramid-ratios: 2_2_2_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4
train-subset: train_asr
valid-subset: dev_asr
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
#adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
arch: pdss2t_transformer_s_32
encoder-embed-dim: 256
pyramid-stages: 5
#pyramid-dropout: 0
pyramid-layers: 2_2_3_3_2
pyramid-ratios: 2_2_2_2_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1_1
pyramid-kernel-sizes: 5_5_5_5_5
pyramid-ffn-ratios: 8_8_8_8_8
pyramid-attn-heads: 4_4_4_4_4
train-subset: train_asr
valid-subset: dev_asr
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
#adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
arch: pdss2t_transformer_s_8
encoder-embed-dim: 256
pyramid-stages: 4
#pyramid-dropout: 0
pyramid-layers: 3_3_3_3
pyramid-ratios: 2_2_1_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4
train-subset: train_asr
valid-subset: dev_asr
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
#adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
set -e
eval=1
lcrm=1
tokenizer=0
root_dir=~/st/Fairseq-S2T
data_dir=/home/xuchen/st/data/test
vocab_dir=/home/xuchen/st/data/mustc/st_lcrm/en-de
asr_vocab_prefix=spm_unigram10000_st_share
st_vocab_prefix=spm_unigram10000_st_share
src_lang=en
tgt_lang=de
splits=(2019)
splits=$(echo ${splits[*]} | sed 's/ /_/g')
cp -r ${vocab_dir}/${asr_vocab_prefix}.* ${data_dir}/${src_lang}-${tgt_lang}
cp -r ${vocab_dir}/${st_vocab_prefix}.* ${data_dir}/${src_lang}-${tgt_lang}
rm -rf ${data_dir}/${src_lang}-${tgt_lang}/fbank80.zip
cmd="python ${root_dir}/examples/speech_to_text/prep_st_data.py
--data-root ${data_dir}
--output-root ${data_dir}
--splits ${splits}
--task st
--src-lang ${src_lang}
--tgt-lang ${tgt_lang}
--add-src
--share
--asr-prefix ${asr_vocab_prefix}
--st-spm-prefix ${st_vocab_prefix}
--cmvn-type utterance"
if [[ ${lcrm} -eq 1 ]]; then
cmd="$cmd
--lowercase-src
--rm-punc-src"
fi
if [[ ${tokenizer} -eq 1 ]]; then
cmd="$cmd
--tokenizer"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 31
ctc-weight: 0.3
\ No newline at end of file
use-enc-dlcl: True
use-dec-dlcl: True
encoder-attention-type: local
hard-mask-window: 0
gauss-mask-sigma: 3
init-mask-weight: 0
\ No newline at end of file
arch: pdss2t_transformer_s_16
encoder-embed-dim: 256
pyramid-stages: 4
#pyramid-dropout: 0
pyramid-layers: 2_2_6_2
pyramid-ratios: 2_2_2_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4
train-subset: train_st
valid-subset: dev_st
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
#adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
arch: pdss2t_transformer_s_32
encoder-embed-dim: 256
pyramid-stages: 5
#pyramid-dropout: 0
pyramid-layers: 2_2_3_3_2
pyramid-ratios: 2_2_2_2_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1_1
pyramid-kernel-sizes: 5_5_5_5_5
pyramid-ffn-ratios: 8_8_8_8_8
pyramid-attn-heads: 4_4_4_4_4
train-subset: train_st
valid-subset: dev_st
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
#adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
arch: pdss2t_transformer_s_8
encoder-embed-dim: 256
pyramid-stages: 4
#pyramid-dropout: 0
pyramid-layers: 3_3_3_3
pyramid-ratios: 2_2_1_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4
train-subset: train_st
valid-subset: dev_st
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
#adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
encoder-attention-type: rel_selfattn
#encoder-attention-type: relative
#decoder-attention-type: relative
#max-encoder-relative-length: 100
#max-decoder-relative-length: 20
train-subset: train_st
valid-subset: dev_st
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
#adam_betas: (0.9,0.98)
ctc-weight: 0.3
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
encoder-normalize-before: True
decoder-normalize-before: True
conv-kernel-sizes: 5,5
conv-channels: 1024
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
#macaron-style: True
#use-cnn-module: True
#cnn-module-kernel: 31
#acoustic-encoder: pds
acoustic-encoder: transformer
adapter: league
encoder-embed-dim: 256
pyramid-stages: 4
#pyramid-dropout: 0
pyramid-layers: 3_3_3_3
pyramid-ratios: 2_2_1_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#! /bin/bash
gpu_num=1
data_dir=
test_subset=(tst-COMMON)
exp_name=
if [ "$#" -eq 1 ]; then
exp_name=$1
fi
n_average=10
beam_size=5
len_penalty=1.0
max_tokens=80000
dec_model=checkpoint_best.pt
cmd="./run.sh
--stage 2
--stop_stage 2
--gpu_num ${gpu_num}
--exp_name ${exp_name}
--n_average ${n_average}
--beam_size ${beam_size}
--len_penalty ${len_penalty}
--max_tokens ${max_tokens}
--dec_model ${dec_model}
"
if [[ -n ${data_dir} ]]; then
cmd="$cmd --data_dir ${data_dir}"
fi
if [[ ${#test_subset[@]} -eq 0 ]]; then
subsets=$(echo ${test_subset[*]} | sed 's/ /,/g')
cmd="$cmd --test_subset ${subsets}"
fi
echo $cmd
eval $cmd
set -e
gpu_num=1
root_dir=/home/xuchen/st/Fairseq-S2T
ckpt=/home/xuchen/st/checkpoints/mustc-v2/st
model_txt=$1
set=$2
test_subset=$3
#data_dir=/home/xuchen/st/data/mustc-v2/st_lcrm/en-de
#test_subset=(tst-COMMON)
data_dir=/media/data/tst/$set/en-de
#test_subset=(office)
#test_subset=(webrtc1)
#test_subset=(adap2)
data_config=config_st_share.yaml
result_file=./result
beam_size=5
lenpen=0.6
max_tokens=10000
models=()
i=0
for line in `cat $model_txt`; do
i=`expr $i + 1`
model_dir=$ckpt/$line
[[ ! -d $model_dir ]] && echo $model_dir && exit 1;
if [[ -f $model_dir/avg_10_checkpoint.pt ]]; then
model=$model_dir/avg_10_checkpoint.pt
else
model=$model_dir/checkpoint_best.pt
fi
[[ ! -f $model ]] && echo $model && exit 1;
models[$i]=$model
done
models=`echo ${models[*]} | sed 's/ /:/g'`
res_dir=$ckpt/ensemble/$set
i=0
while :
do
if [[ -d $res_dir/$i ]]; then
i=`expr $i + 1`
else
res_dir=$res_dir/$i
break
fi
done
mkdir -p $res_dir
cp $model_txt $res_dir
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then
device=()
else
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
fi
export CUDA_VISIBLE_DEVICES=${device}
for subset in ${test_subset[@]}; do
subset=${subset}_st
cmd="python ${root_dir}/fairseq_cli/generate.py
${data_dir}
--config-yaml ${data_config}
--gen-subset ${subset}
--task speech_to_text
--path ${models}
--results-path ${res_dir}
--skip-invalid-size-inputs-valid-test
--max-tokens ${max_tokens}
--beam ${beam_size}
--lenpen ${lenpen}
--scoring sacrebleu"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
eval $cmd
tail -n 1 ${res_dir}/generate-${subset}.txt
cd $res_dir
evaluate.sh translation-${subset}.txt $set
cd -
done
gpu_num=4
cmd="sh train.sh"
while :
do
record=$(mktemp -t temp.record.XXXXXX)
gpustat > $record
all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
count=0
for dev in ${all_devices[@]}
do
line=$((dev + 2))
use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
if [[ $use -lt 100 ]]; then
device[$count]=$dev
count=$((count + 1))
if [[ $count -eq $gpu_num ]]; then
break
fi
fi
done
if [[ ${#device[@]} -lt $gpu_num ]]; then
sleep 60s
else
echo "Run $cmd"
eval $cmd
sleep 10s
exit
fi
done
#!/usr/bin/env bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey);
# Arnab Ghoshal, Karel Vesely
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Parse command-line options.
# To be sourced by another script (as in ". parse_options.sh").
# Option format is: --option-name arg
# and shell variable "option_name" gets set to value "arg."
# The exception is --help, which takes no arguments, but prints the
# $help_message variable (if defined).
###
### The --config file options have lower priority to command line
### options, so we need to import them first...
###
# Now import all the configs specified by command-line, in left-to-right order
for ((argpos=1; argpos<$#; argpos++)); do
if [ "${!argpos}" == "--config" ]; then
argpos_plus1=$((argpos+1))
config=${!argpos_plus1}
[ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
. $config # source the config file.
fi
done
###
### Now we process the command line options
###
while true; do
[ -z "${1:-}" ] && break; # break if there are no arguments
case "$1" in
# If the enclosing script is called with --help option, print the help
# message and exit. Scripts should put help messages in $help_message
--help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
else printf "$help_message\n" 1>&2 ; fi;
exit 0 ;;
--*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
exit 1 ;;
# If the first command-line argument begins with "--" (e.g. --foo-bar),
# then work out the variable name as $name, which will equal "foo_bar".
--*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
# Next we test whether the variable in question is undefned-- if so it's
# an invalid option and we die. Note: $0 evaluates to the name of the
# enclosing script.
# The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
# is undefined. We then have to wrap this test inside "eval" because
# foo_bar is itself inside a variable ($name).
eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
oldval="`eval echo \\$$name`";
# Work out whether we seem to be expecting a Boolean argument.
if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
was_bool=true;
else
was_bool=false;
fi
# Set the variable to the right value-- the escaped quotes make it work if
# the option had spaces, like --cmd "queue.pl -sync y"
eval $name=\"$2\";
# Check that Boolean-valued arguments are really Boolean.
if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
exit 1;
fi
shift 2;
;;
*) break;
esac
done
# Check for an empty argument to the --cmd option, which can easily occur as a
# result of scripting errors.
[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
true; # so this script returns exit code 0.
get_devices(){
gpu_num=$1
use_cpu=$2
device=()
while :
do
record=$(mktemp -t temp.record.XXXXXX)
gpustat > $record
all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
count=0
for dev in ${all_devices[@]}
do
line=$((dev + 2))
use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
if [[ $use -lt 100 ]]; then
device[$count]=$dev
count=$((count + 1))
if [[ $count -eq $gpu_num ]]; then
break
fi
fi
done
if [[ ${#device[@]} -lt $gpu_num ]]; then
if [[ $use_cpu -eq 1 ]]; then
device=(-1)
else
sleep 60s
fi
else
break
fi
done
echo ${device[*]} | sed 's/ /,/g'
return $?
}
#! /bin/bash
# Processing MuST-C Datasets
# Copyright 2021 Natural Language Processing Laboratory
# Xu Chen (xuchenneu@163.com)
# Set bash to 'debug' mode, it will exit on :
# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
set -e
#set -u
set -o pipefail
export PYTHONIOENCODING=UTF-8
eval=1
time=$(date "+%m%d_%H%M")
stage=0
stop_stage=0
######## hardware ########
# devices
#device=()
gpu_num=8
update_freq=1
root_dir=~/st/Fairseq-S2T
pwd_dir=$PWD
# dataset
src_lang=en
tgt_lang=de
lang=${src_lang}-${tgt_lang}
dataset=st
task=speech_to_text
vocab_type=unigram
asr_vocab_size=5000
vocab_size=10000
share_dict=1
speed_perturb=0
lcrm=0
tokenizer=0
use_specific_dict=0
specific_prefix=valid
specific_dir=/home/xuchen/st/data/mustc/st_lcrm/en-de
asr_vocab_prefix=spm_unigram10000_st_share
st_vocab_prefix=spm_unigram10000_st_share
org_data_dir=~/st/data/${dataset}
data_dir=~/st/data/${dataset}/st
test_subset=tst-COMMON
# exp
exp_prefix=$(date "+%m%d")
extra_tag=
extra_parameter=
exp_tag=baseline
exp_name=
# config
train_config=ctc
# training setting
fp16=1
max_tokens=40000
step_valid=0
bleu_valid=0
# decoding setting
dec_model=checkpoint_best.pt
n_average=10
beam_size=5
len_penalty=1.0
if [[ ${share_dict} -eq 1 ]]; then
data_config=config_st_share.yaml
else
data_config=config_st.yaml
fi
if [[ ${speed_perturb} -eq 1 ]]; then
data_dir=${data_dir}_sp
exp_prefix=${exp_prefix}_sp
fi
if [[ ${lcrm} -eq 1 ]]; then
data_dir=${data_dir}_lcrm
exp_prefix=${exp_prefix}_lcrm
fi
if [[ ${use_specific_dict} -eq 1 ]]; then
data_dir=${data_dir}_${specific_prefix}
exp_prefix=${exp_prefix}_${specific_prefix}
fi
if [[ ${tokenizer} -eq 1 ]]; then
data_dir=${data_dir}_tok
exp_prefix=${exp_prefix}_tok
fi
. ./local/parse_options.sh || exit 1;
if [[ -z ${exp_name} ]]; then
config_string=${train_config//,/_}
exp_name=${exp_prefix}_${config_string}_${exp_tag}
if [[ -n ${extra_tag} ]]; then
exp_name=${exp_name}_${extra_tag}
fi
fi
model_dir=$root_dir/../checkpoints/$dataset/st/${exp_name}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download"
# pass
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
### Task dependent. You have to make data the following preparation part by yourself.
### But you can utilize Kaldi recipes in most cases
echo "stage 0: ASR Data Preparation"
if [[ ! -e ${data_dir}/${lang} ]]; then
mkdir -p ${data_dir}/${lang}
fi
cmd="python ${root_dir}/examples/speech_to_text/prep_asr_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}
--task asr
--vocab-type ${vocab_type}
--vocab-size ${asr_vocab_size}"
if [[ ${speed_perturb} -eq 1 ]]; then
cmd="$cmd
--speed-perturb"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 && ${share_dict} -ne 1 && ${use_specific_dict} -ne 1 ]] && eval $cmd
asr_prefix=spm_${vocab_type}${asr_vocab_size}_asr
echo "stage 0: ST Data Preparation"
cmd="python ${root_dir}/examples/speech_to_text/prep_st_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}
--task st
--add-src
--cmvn-type utterance
--vocab-type ${vocab_type}
--vocab-size ${vocab_size}"
if [[ ${use_specific_dict} -eq 1 ]]; then
cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}/${lang}
cp -r ${specific_dir}/${st_vocab_prefix}.* ${data_dir}/${lang}
if [[ $share_dict -eq 1 ]]; then
cmd="$cmd
--share
--st-spm-prefix ${st_vocab_prefix}"
else
cmd="$cmd
--st-spm-prefix ${st_vocab_prefix}
--asr-prefix ${asr_vocab_prefix}"
fi
else
if [[ $share_dict -eq 1 ]]; then
cmd="$cmd
--share"
else
cmd="$cmd
--asr-prefix ${asr_prefix}"
fi
fi
if [[ ${speed_perturb} -eq 1 ]]; then
cmd="$cmd
--speed-perturb"
fi
if [[ ${lcrm} -eq 1 ]]; then
cmd="$cmd
--lowercase-src
--rm-punc-src"
fi
if [[ ${tokenizer} -eq 1 ]]; then
cmd="$cmd
--tokenizer"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
fi
data_dir=${data_dir}/${lang}
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "stage 1: ST Network Training"
[[ ! -d ${data_dir} ]] && echo "The data dir ${data_dir} is not existing!" && exit 1;
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then
device=""
else
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
fi
echo -e "dev=${device} data=${data_dir} model=${model_dir}"
if [[ ! -d ${model_dir} ]]; then
mkdir -p ${model_dir}
else
echo "${model_dir} exists."
fi
cp ${BASH_SOURCE[0]} ${model_dir}
cp ${PWD}/train.sh ${model_dir}
config_list="${train_config//,/ }"
idx=0
for config in ${config_list[@]}
do
config_path=$pwd_dir/conf/${config}.yaml
if [[ ! -f ${config_path} ]]; then
echo "No config file ${config_path}"
exit
fi
cp ${config_path} ${model_dir}
if [[ idx -eq 0 ]]; then
extra_parameter="${extra_parameter}
--train-config ${config_path}"
else
extra_parameter="${extra_parameter}
--train-config${idx} ${config_path}"
fi
idx=$((idx + 1))
done
cmd="python3 -u ${root_dir}/fairseq_cli/train.py
${data_dir}
--config-yaml ${data_config}
--task ${task}
--max-tokens ${max_tokens}
--skip-invalid-size-inputs-valid-test
--update-freq ${update_freq}
--log-interval 100
--save-dir ${model_dir}
--tensorboard-logdir ${model_dir}"
if [[ -n ${extra_parameter} ]]; then
cmd="${cmd}
${extra_parameter}"
fi
if [[ ${gpu_num} -gt 0 ]]; then
cmd="${cmd}
--distributed-world-size $gpu_num
--ddp-backend no_c10d"
fi
if [[ $fp16 -eq 1 ]]; then
cmd="${cmd}
--fp16"
fi
if [[ $step_valid -eq 1 ]]; then
validate_interval=1
save_interval=1
keep_last_epochs=10
no_epoch_checkpoints=0
save_interval_updates=500
keep_interval_updates=10
else
validate_interval=1
keep_last_epochs=10
fi
if [[ $bleu_valid -eq 1 ]]; then
cmd="$cmd
--eval-bleu
--eval-bleu-args '{\"beam\": 1}'
--eval-tokenized-bleu
--eval-bleu-remove-bpe
--best-checkpoint-metric bleu
--maximize-best-checkpoint-metric"
fi
if [[ -n $no_epoch_checkpoints && $no_epoch_checkpoints -eq 1 ]]; then
cmd="$cmd
--no-epoch-checkpoints"
fi
if [[ -n $validate_interval ]]; then
cmd="${cmd}
--validate-interval $validate_interval "
fi
if [[ -n $save_interval ]]; then
cmd="${cmd}
--save-interval $save_interval "
fi
if [[ -n $keep_last_epochs ]]; then
cmd="${cmd}
--keep-last-epochs $keep_last_epochs "
fi
if [[ -n $save_interval_updates ]]; then
cmd="${cmd}
--save-interval-updates $save_interval_updates"
if [[ -n $keep_interval_updates ]]; then
cmd="${cmd}
--keep-interval-updates $keep_interval_updates"
fi
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
# save info
log=./history.log
echo "${time} | ${device} | ${data_dir} | ${exp_name} | ${model_dir} " >> $log
tail -n 50 ${log} > tmp.log
mv tmp.log $log
export CUDA_VISIBLE_DEVICES=${device}
cmd="nohup ${cmd} >> ${model_dir}/train.log 2>&1 &"
if [[ $eval -eq 1 ]]; then
eval $cmd
sleep 2s
tail -n "$(wc -l ${model_dir}/train.log | awk '{print $1+1}')" -f ${model_dir}/train.log
fi
fi
wait
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "stage 2: ST Decoding"
if [[ ${n_average} -ne 1 ]]; then
# Average models
dec_model=avg_${n_average}_checkpoint.pt
cmd="python ${root_dir}/scripts/average_checkpoints.py
--inputs ${model_dir}
--num-epoch-checkpoints ${n_average}
--output ${model_dir}/${dec_model}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval $cmd
else
dec_model=${dec_model}
fi
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then
device=""
else
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
fi
export CUDA_VISIBLE_DEVICES=${device}
result_file=${model_dir}/decode_result
[[ -f ${result_file} ]] && rm ${result_file}
test_subset=${test_subset//,/ }
for subset in "${test_subset[@]}"; do
subset=${subset}_st
cmd="python ${root_dir}/fairseq_cli/generate.py
${data_dir}
--config-yaml ${data_config}
--gen-subset ${subset}
--task speech_to_text
--path ${model_dir}/${dec_model}
--results-path ${model_dir}
--max-tokens ${max_tokens}
--beam ${beam_size}
--lenpen ${len_penalty}
--scoring sacrebleu"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
if [[ $eval -eq 1 ]]; then
eval $cmd
tail -n 1 ${model_dir}/generate-${subset}.txt >> ${result_file}
fi
done
cat ${result_file}
fi
#! /bin/bash
# training the model
gpu_num=8
update_freq=1
max_tokens=40000
extra_tag=
extra_parameter=
#extra_tag="${extra_tag}"
#extra_parameter="${extra_parameter} "
exp_tag=
#config_list=(base)
config_list=(ctc)
#config_list=(sate_ctc)
#config_list=(ctc conformer rpr)
#config_list=(base sate)
#config_list=(pds_base)
#config_list=(pds_base conformer)
# exp full name
exp_name=
train_config=$(echo ${config_list[*]} | sed 's/ /,/g')
cmd="./run.sh
--stage 1
--stop_stage 1
--gpu_num ${gpu_num}
--update_freq ${update_freq}
--train_config ${train_config}
--max_tokens ${max_tokens}
"
if [[ -n ${exp_name} ]]; then
cmd="$cmd --exp_name ${exp_name}"
fi
if [[ -n ${exp_tag} ]]; then
cmd="$cmd --exp_tag ${exp_tag}"
fi
if [[ -n ${extra_tag} ]]; then
cmd="$cmd --extra_tag ${extra_tag}"
fi
if [[ -n ${extra_parameter} ]]; then
cmd="$cmd --extra_parameter \"${extra_parameter}\""
fi
echo ${cmd}
eval ${cmd}
train-subset: train-clean-100,train-clean-360,train-other-500
valid-subset: dev-clean
max-epoch: 100
max-update: 300000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_transformer_s
share-decoder-input-output-embed: True
optimizer: adam
......@@ -39,4 +23,4 @@ encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
decoder-attention-heads: 4
\ No newline at end of file
train-subset: train-clean-100,train-clean-360,train-other-500
valid-subset: dev-clean
max-epoch: 100
max-update: 300000
patience: 20
best-checkpoint-metric: loss
maximize-best-checkpoint-metric: False
no-epoch-checkpoints: True
#keep-last-epochs: 10
keep-best-checkpoints: 10
num-workers: 8
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
\ No newline at end of file
train-subset: train-clean-100,train-clean-360,train-other-500
valid-subset: dev-clean
max-epoch: 100
max-update: 300000
num-workers: 8
patience: 20
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_transformer_m
share-decoder-input-output-embed: True
optimizer: adam
......
......@@ -2,19 +2,19 @@
#arch: s2t_transformer_s
arch: s2t_sate
encoder-embed-dim: 256
pyramid-stages: 4
#pyramid-dropout: 0
pyramid-layers: 2_2_6_2
pyramid-ratios: 2_2_2_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4
pds-stages: 4
#pds-dropout: 0
pds-layers: 2_2_6_2
pds-ratios: 2_2_2_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
cl-dropout: True
cl-dropout-epoch: 50
......
train-subset: train-clean-100
valid-subset: dev-clean
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
#load-pretrained-acoustic-encoder-from: /home/xuchen/st/checkpoints/mustc/asr/1007_st_ctc_baseline/avg_10_checkpoint.pt
#load-pretrained-acoustic-encoder-from: /home/xuchen/st/checkpoints/mustc/asr/1111_st_ctc_conformer_lr0.001/avg_10_checkpoint.pt
#load-pretrained-acoustic-encoder-from: /home/xuchen/st/checkpoints/mustc/asr/1007_st_pyramid4_all256_3333_sr8_ctc/avg_10_checkpoint.pt
#load-pretrained-acoustic-encoder-from: /home/xuchen/st/checkpoints/mustc/asr/1114_st_pyramid4_all256_ctc_fix/avg_10_checkpoint.pt
#load-pretrained-acoustic-encoder-from: /home/xuchen/st/checkpoints/mustc/asr/1015_st_pyramid4_all256_conformer_baseline/avg_10_checkpoint.pt
#load-pretrained-acoustic-encoder-from: /home/xuchen/st/checkpoints/mustc/asr/1111_st_pyramid4_all256_conformer_ctc/avg_10_checkpoint.pt
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
#macaron-style: True
#use-cnn-module: True
#cnn-module-kernel: 31
#acoustic-encoder: transformer
#acoustic-encoder: conformer
acoustic-encoder: pyramid
adapter: league
#adapter: none
#adapter: context
encoder-embed-dim: 256
pyramid-stages: 4
#pyramid-dropout: 0
pyramid-layers: 3_3_3_3
pyramid-sr-ratios: 2_2_1_2
pyramid-embed-dims: 256_256_256_256
pyramid-fuse: True
pyramid-reduced-embed: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8
pyramid-heads: 4_4_4_4
\ No newline at end of file
arch: pdss2t_transformer_s_8
train-subset: train-clean-100,train-clean-360,train-other-500
valid-subset: dev-clean
max-epoch: 100
max-update: 300000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
......@@ -37,4 +20,4 @@ encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
decoder-attention-heads: 4
\ No newline at end of file
arch: pdss2t_transformer_s_16
encoder-embed-dim: 256
pyramid-stages: 4
#pyramid-dropout: 0
pyramid-layers: 2_2_6_2
pyramid-ratios: 2_2_2_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4
train-subset: train-clean-100,train-clean-360,train-other-500
valid-subset: dev-clean
max-epoch: 100
max-update: 300000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
pds-stages: 4
#pds-dropout: 0
pds-layers: 2_2_6_2
pds-ratios: 2_2_2_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
......@@ -52,4 +36,4 @@ encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
decoder-attention-heads: 4
\ No newline at end of file
arch: pdss2t_transformer_s_32
encoder-embed-dim: 256
pyramid-stages: 5
#pyramid-dropout: 0
pyramid-layers: 2_2_3_3_2
pyramid-ratios: 2_2_2_2_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1_1
pyramid-kernel-sizes: 5_5_5_5_5
pyramid-ffn-ratios: 8_8_8_8_8
pyramid-attn-heads: 4_4_4_4_4
train-subset: train-clean-100,train-clean-360,train-other-500
valid-subset: dev-clean
max-epoch: 100
max-update: 300000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
pds-stages: 5
#pds-dropout: 0
pds-layers: 2_2_3_3_2
pds-ratios: 2_2_2_2_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 256_256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1_1
pds-kernel-sizes: 5_5_5_5_5
pds-ffn-ratios: 8_8_8_8_8
pds-attn-heads: 4_4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
......@@ -52,4 +36,4 @@ encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
decoder-attention-heads: 4
\ No newline at end of file
arch: pdss2t_transformer_s_8
encoder-embed-dim: 256
pyramid-stages: 4
#pyramid-dropout: 0
pyramid-layers: 3_3_3_3
pyramid-ratios: 2_2_1_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4
train-subset: train-clean-100,train-clean-360,train-other-500
valid-subset: dev-clean
max-epoch: 100
max-update: 300000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
pds-stages: 4
#pds-dropout: 0
pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
......@@ -52,4 +36,4 @@ encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
decoder-attention-heads: 4
\ No newline at end of file
......@@ -2,22 +2,6 @@ arch: pdss2t_transformer_m_8
#arch: pdss2t_transformer_m_16
#arch: pdss2t_transformer_m_32
train-subset: train-clean-100,train-clean-360,train-other-500
valid-subset: dev-clean
max-epoch: 100
max-update: 300000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
......
arch: pdss2t_transformer_m_16
encoder-embed-dim: 512
pyramid-stages: 4
#pyramid-dropout: 0
pyramid-layers: 2_2_6_2
pyramid-ratios: 2_2_2_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 512_512_512_512
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 4_4_4_4
pyramid-attn-heads: 8_8_8_8
train-subset: train-clean-100,train-clean-360,train-other-500
valid-subset: dev-clean
max-epoch: 100
max-update: 300000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
pds-stages: 4
#pds-dropout: 0
pds-layers: 2_2_6_2
pds-ratios: 2_2_2_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 512_512_512_512
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 4_4_4_4
pds-attn-heads: 8_8_8_8
share-decoder-input-output-embed: True
optimizer: adam
......
arch: pdss2t_transformer_m_32
encoder-embed-dim: 512
pyramid-stages: 5
#pyramid-dropout: 0
pyramid-layers: 2_2_3_3_2
pyramid-ratios: 2_2_2_2_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 512_512_512_512_512
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1_1
pyramid-kernel-sizes: 5_5_5_5_5
pyramid-ffn-ratios: 4_4_4_4_4
pyramid-attn-heads: 8_8_8_8_8
train-subset: train-clean-100,train-clean-360,train-other-500
valid-subset: dev-clean
max-epoch: 100
max-update: 300000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
pds-stages: 5
#pds-dropout: 0
pds-layers: 2_2_3_3_2
pds-ratios: 2_2_2_2_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 512_512_512_512_512
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1_1
pds-kernel-sizes: 5_5_5_5_5
pds-ffn-ratios: 4_4_4_4_4
pds-attn-heads: 8_8_8_8_8
share-decoder-input-output-embed: True
optimizer: adam
......
arch: pdss2t_transformer_m_8
encoder-embed-dim: 512
pyramid-stages: 4
#pyramid-dropout: 0
pyramid-layers: 3_3_3_3
pyramid-ratios: 2_2_1_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 512_512_512_512
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 4_4_4_4
pyramid-attn-heads: 8_8_8_8
train-subset: train-clean-100,train-clean-360,train-other-500
valid-subset: dev-clean
max-epoch: 100
max-update: 300000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
pds-stages: 4
#pds-dropout: 0
pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 512_512_512_512
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 4_4_4_4
pds-attn-heads: 8_8_8_8
share-decoder-input-output-embed: True
optimizer: adam
......
......@@ -2,22 +2,6 @@ arch: pdss2t_transformer_sd_8
#arch: pdss2t_transformer_sd_16
#arch: pdss2t_transformer_sd_32
train-subset: train-clean-100,train-clean-360,train-other-500
valid-subset: dev-clean
max-epoch: 100
max-update: 300000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
......
arch: pdss2t_transformer_sd_16
encoder-embed-dim: 256
pyramid-stages: 4
#pyramid-dropout: 0
pyramid-layers: 5_5_12_8
pyramid-ratios: 2_2_2_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4
train-subset: train-clean-100,train-clean-360,train-other-500
valid-subset: dev-clean
max-epoch: 100
max-update: 300000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
pds-stages: 4
#pds-dropout: 0
pds-layers: 5_5_12_8
pds-ratios: 2_2_2_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
......
arch: pdss2t_transformer_sd_32
encoder-embed-dim: 256
pyramid-stages: 5
#pyramid-dropout: 0
pyramid-layers: 5_5_7_7_6
pyramid-ratios: 2_2_2_2_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1_1
pyramid-kernel-sizes: 5_5_5_5_5
pyramid-ffn-ratios: 8_8_8_8_8
pyramid-attn-heads: 4_4_4_4_4
train-subset: train-clean-100,train-clean-360,train-other-500
valid-subset: dev-clean
max-epoch: 100
max-update: 300000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
pds-stages: 5
#pds-dropout: 0
pds-layers: 5_5_7_7_6
pds-ratios: 2_2_2_2_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 256_256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1_1
pds-kernel-sizes: 5_5_5_5_5
pds-ffn-ratios: 8_8_8_8_8
pds-attn-heads: 4_4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
......
arch: pdss2t_transformer_sd_8
encoder-embed-dim: 256
pyramid-stages: 4
#pyramid-dropout: 0
pyramid-layers: 7_7_7_9
pyramid-ratios: 2_2_1_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4
train-subset: train-clean-100,train-clean-360,train-other-500
valid-subset: dev-clean
max-epoch: 100
max-update: 300000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
pds-stages: 4
#pds-dropout: 0
pds-layers: 7_7_7_9
pds-ratios: 2_2_1_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
......
......@@ -24,7 +24,8 @@ device=()
gpu_num=8
update_freq=1
root_dir=~/st/Fairseq-S2T
root_dir=~/st
code_dir=${root_dir}/Fairseq-S2T
pwd_dir=$PWD
# dataset
......@@ -42,8 +43,8 @@ specific_prefix=valid
specific_dir=/home/xuchen/st/data/mustc/st_lcrm/en-de
asr_vocab_prefix=spm_unigram10000_st_share
org_data_dir=~/st/data/${dataset}
data_dir=~/st/data/${dataset}
org_data_dir=${root_dir}/data/${dataset}
data_dir=${root_dir}/data/${dataset}
test_subset=dev-clean,dev-other,test-clean,test-other
# exp
......@@ -81,13 +82,12 @@ fi
if [[ -z ${exp_name} ]]; then
config_string=${train_config//,/_}
# exp_name=${exp_prefix}_$(basename ${train_config%.*})_${exp_tag}
exp_name=${exp_prefix}_${config_string}_${exp_tag}
if [[ -n ${extra_tag} ]]; then
exp_name=${exp_name}_${extra_tag}
fi
fi
model_dir=$root_dir/../checkpoints/$dataset/asr/${exp_name}
model_dir=$code_dir/../checkpoints/$dataset/asr/${exp_name}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download"
......@@ -103,7 +103,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
mkdir -p ${data_dir}
fi
cmd="python ${root_dir}/examples/speech_to_text/prep_librispeech_data.py
cmd="python ${code_dir}/examples/speech_to_text/prep_librispeech_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}
--vocab-type ${vocab_type}
......@@ -146,28 +146,26 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
cp ${BASH_SOURCE[0]} ${model_dir}
cp ${PWD}/train.sh ${model_dir}
extra_parameter="${extra_parameter}
--train-config ${pwd_dir}/conf/basis.yaml"
cp ${pwd_dir}/conf/basis.yaml ${model_dir}
config_list="${train_config//,/ }"
idx=0
idx=1
for config in ${config_list[@]}
do
config_path=$pwd_dir/conf/${config}.yaml
config_path=${pwd_dir}/conf/${config}.yaml
if [[ ! -f ${config_path} ]]; then
echo "No config file ${config_path}"
exit
fi
cp ${config_path} ${model_dir}
if [[ idx -eq 0 ]]; then
extra_parameter="${extra_parameter}
--train-config ${config_path}"
else
extra_parameter="${extra_parameter}
extra_parameter="${extra_parameter}
--train-config${idx} ${config_path}"
fi
idx=$((idx + 1))
done
cmd="python3 -u ${root_dir}/fairseq_cli/train.py
cmd="python3 -u ${code_dir}/fairseq_cli/train.py
${data_dir}
--config-yaml ${data_config}
--task ${task}
......@@ -252,9 +250,9 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
dec_model=avg_${n_average}_checkpoint.pt
if [[ ! -f ${model_dir}/${dec_model} ]]; then
cmd="python ${root_dir}/scripts/average_checkpoints.py
cmd="python ${code_dir}/scripts/average_checkpoints.py
--inputs ${model_dir}
--num-epoch-checkpoints ${n_average}
--num-best-checkpoints ${n_average}
--output ${model_dir}/${dec_model}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval $cmd
......@@ -279,7 +277,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
test_subset=(${test_subset//,/ })
for subset in ${test_subset[@]}; do
subset=${subset}
cmd="python ${root_dir}/fairseq_cli/generate.py
cmd="python ${code_dir}/fairseq_cli/generate.py
${data_dir}
--config-yaml ${data_config}
--gen-subset ${subset}
......
train-subset: train_asr
valid-subset: dev_asr
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_transformer_s
share-decoder-input-output-embed: True
optimizer: adam
......
train-subset: train
valid-subset: dev
max-epoch: 100
max-update: 100000
patience: 20
best_checkpoint_metric: loss
maximize_best_checkpoint_metric: False
no-epoch-checkpoints: True
#keep-last-epochs: 10
keep-best-checkpoints: 10
num-workers: 8
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
\ No newline at end of file
ctc-weight: 0.3
post-process: sentencepiece
arch: pdss2t_transformer_s_8
train-subset: train_asr
valid-subset: dev_asr
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
pds-fusion: True
ctc-layer: 12
share-decoder-input-output-embed: True
optimizer: adam
......
arch: pdss2t_transformer_s_16
encoder-embed-dim: 256
pyramid-stages: 4
#pyramid-dropout: 0
pyramid-layers: 2_2_6_2
pyramid-ratios: 2_2_2_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4
train-subset: train_asr
valid-subset: dev_asr
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
pds-stages: 4
ctc-layer: 12
pds-layers: 2_2_6_2
pds-ratios: 2_2_2_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
......
arch: pdss2t_transformer_s_32
encoder-embed-dim: 256
pyramid-stages: 5
#pyramid-dropout: 0
pyramid-layers: 2_2_3_3_2
pyramid-ratios: 2_2_2_2_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1_1
pyramid-kernel-sizes: 5_5_5_5_5
pyramid-ffn-ratios: 8_8_8_8_8
pyramid-attn-heads: 4_4_4_4_4
train-subset: train_asr
valid-subset: dev_asr
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
pds-stages: 5
ctc-layer: 12
pds-layers: 2_2_3_3_2
pds-ratios: 2_2_2_2_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 256_256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1_1
pds-kernel-sizes: 5_5_5_5_5
pds-ffn-ratios: 8_8_8_8_8
pds-attn-heads: 4_4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
......
arch: pdss2t_transformer_s_8
encoder-embed-dim: 256
pyramid-stages: 4
#pyramid-dropout: 0
pyramid-layers: 3_3_3_3
pyramid-ratios: 2_2_1_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4
train-subset: train_asr
valid-subset: dev_asr
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
pds-stages: 4
ctc-layer: 12
pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
......
......@@ -24,7 +24,8 @@ stop_stage=0
gpu_num=8
update_freq=1
root_dir=~/st/Fairseq-S2T
root_dir=~/st
code_dir=${root_dir}/Fairseq-S2T
pwd_dir=$PWD
# dataset
......@@ -41,13 +42,16 @@ lcrm=0
tokenizer=0
use_raw_audio=0
use_specific_dict=0
use_specific_dict=1
specific_prefix=st
specific_dir=/home/xuchen/st/data/mustc/st/en-de
specific_dir=${root_dir}/data/mustc/st
asr_vocab_prefix=spm_unigram10000_st_share
org_data_dir=~/st/data/${dataset}
data_dir=~/st/data/${dataset}/asr
org_data_dir=${root_dir}/data/${dataset}
data_dir=${root_dir}/data/${dataset}/asr
train_split=train
valid_split=dev
test_split=tst-COMMON
test_subset=tst-COMMON
# exp
......@@ -59,7 +63,7 @@ exp_name=
# config
train_config=ctc
data_config=config_asr.yaml
data_config=config.yaml
# training setting
fp16=1
......@@ -97,13 +101,12 @@ fi
if [[ -z ${exp_name} ]]; then
config_string=${train_config//,/_}
# exp_name=${exp_prefix}_$(basename ${train_config%.*})_${exp_tag}
exp_name=${exp_prefix}_${config_string}_${exp_tag}
if [[ -n ${extra_tag} ]]; then
exp_name=${exp_name}_${extra_tag}
fi
fi
model_dir=$root_dir/../checkpoints/$dataset/asr/${exp_name}
model_dir=${code_dir}/../checkpoints/${dataset}/asr/${exp_name}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download"
......@@ -114,11 +117,23 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
### Task dependent. You have to make data the following preparation part by yourself.
### But you can utilize Kaldi recipes in most cases
echo "stage 0: ASR Data Preparation"
if [[ ! -e ${data_dir} ]]; then
mkdir -p ${data_dir}
fi
feature_zip=fbank80.zip
if [[ ${speed_perturb} -eq 1 ]]; then
feature_zip=fbank80_sp.zip
fi
if [[ ! -f ${data_dir}/${feature_zip} && -f ${data_dir}/../feature_zip ]]; then
ln -s ${data_dir}/../feature_zip ${data_dir}
fi
cmd="python ${root_dir}/examples/speech_to_text/prep_mustc_data.py
cmd="python ${code_dir}/examples/speech_to_text/prep_audio_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}
--task asr
--src-lang ${src_lang}
--splits ${valid_split},${test_split},${train_split}
--vocab-type ${vocab_type}
--vocab-size ${vocab_size}"
......@@ -127,7 +142,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--raw"
fi
if [[ ${use_specific_dict} -eq 1 ]]; then
cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}/${lang}
cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}
cmd="$cmd
--asr-prefix ${asr_vocab_prefix}"
fi
......@@ -147,6 +162,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
if [[ -f ${data_dir}/${feature_zip} && ! -f ${data_dir}/../${feature_zip} ]]; then
mv ${data_dir}/${feature_zip} ${data_dir}/..
ln -s ${data_dir}/../${feature_zip} ${data_dir}
fi
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
......@@ -173,28 +193,26 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
cp ${BASH_SOURCE[0]} ${model_dir}
cp ${PWD}/train.sh ${model_dir}
extra_parameter="${extra_parameter}
--train-config ${pwd_dir}/conf/basis.yaml"
cp ${pwd_dir}/conf/basis.yaml ${model_dir}
config_list="${train_config//,/ }"
idx=0
idx=1
for config in ${config_list[@]}
do
config_path=$pwd_dir/conf/${config}.yaml
config_path=${pwd_dir}/conf/${config}.yaml
if [[ ! -f ${config_path} ]]; then
echo "No config file ${config_path}"
exit
fi
cp ${config_path} ${model_dir}
if [[ idx -eq 0 ]]; then
extra_parameter="${extra_parameter}
--train-config ${config_path}"
else
extra_parameter="${extra_parameter}
extra_parameter="${extra_parameter}
--train-config${idx} ${config_path}"
fi
idx=$((idx + 1))
done
cmd="python3 -u ${root_dir}/fairseq_cli/train.py
cmd="python3 -u ${code_dir}/fairseq_cli/train.py
${data_dir}
--config-yaml ${data_config}
--task ${task}
......@@ -278,12 +296,14 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# Average models
dec_model=avg_${n_average}_checkpoint.pt
cmd="python ${root_dir}/scripts/average_checkpoints.py
--inputs ${model_dir}
--num-epoch-checkpoints ${n_average}
--output ${model_dir}/${dec_model}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval $cmd
if [[ ! -f ${model_dir}/${dec_model} ]]; then
cmd="python ${code_dir}/scripts/average_checkpoints.py
--inputs ${model_dir}
--num-best-checkpoints ${n_average}
--output ${model_dir}/${dec_model}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval $cmd
fi
else
dec_model=${dec_model}
fi
......@@ -303,8 +323,8 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
test_subset=${test_subset//,/ }
for subset in ${test_subset[@]}; do
subset=${subset}_asr
cmd="python ${root_dir}/fairseq_cli/generate.py
subset=${subset}
cmd="python ${code_dir}/fairseq_cli/generate.py
${data_dir}
--config-yaml ${data_config}
--gen-subset ${subset}
......
train-subset: train
valid-subset: valid
max-epoch: 50
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: transformer
share-decoder-input-output-embed: True
optimizer: adam
......
train-subset: train
valid-subset: valid
max-epoch: 50
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: transformer
share-decoder-input-output-embed: True
optimizer: adam
......
train-subset: train
valid-subset: valid
max-epoch: 50
max-update: 100000
patience: 20
best_checkpoint_metric: loss
maximize_best_checkpoint_metric: False
no-epoch-checkpoints: True
#keep-last-epochs: 10
keep-best-checkpoints: 10
num-workers: 8
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
\ No newline at end of file
......@@ -10,6 +10,7 @@ if [ "$#" -eq 1 ]; then
exp_name=$1
fi
sacrebleu=1
n_average=10
beam_size=5
len_penalty=1.0
......@@ -21,6 +22,7 @@ cmd="./run.sh
--stop_stage 2
--gpu_num ${gpu_num}
--exp_name ${exp_name}
--sacrebleu ${sacrebleu}
--n_average ${n_average}
--beam_size ${beam_size}
--len_penalty ${len_penalty}
......
......@@ -13,7 +13,7 @@ set -o pipefail
export PYTHONIOENCODING=UTF-8
eval=1
time=$(date "+%m%d_%H%M")
time=$(date "+%m%d")
stage=0
stop_stage=0
......@@ -24,7 +24,8 @@ device=()
gpu_num=8
update_freq=1
root_dir=~/st/Fairseq-S2T
root_dir=~/st
code_dir=${root_dir}/Fairseq-S2T
pwd_dir=$PWD
# dataset
......@@ -42,12 +43,12 @@ tokenizer=0
use_specific_dict=0
specific_prefix=st
specific_dir=/home/xuchen/st/data/mustc/st/en-de/
specific_dir=${root_dir}/data/mustc/st
src_vocab_prefix=spm_unigram10000_st_share
tgt_vocab_prefix=spm_unigram10000_st_share
org_data_dir=~/st/data/${dataset}
data_dir=~/st/data/${dataset}/mt/${lang}
org_data_dir=${root_dir}/data/${dataset}
data_dir=${root_dir}/data/${dataset}/mt
train_subset=train
valid_subset=dev
trans_subset=tst-COMMON
......@@ -70,6 +71,7 @@ step_valid=0
bleu_valid=0
# decoding setting
sacrebleu=1
dec_model=checkpoint_best.pt
n_average=10
beam_size=5
......@@ -106,7 +108,6 @@ fi
# full path
if [[ -z ${exp_name} ]]; then
config_string=${train_config//,/_}
# exp_name=${exp_prefix}_$(basename ${train_config%.*})_${exp_tag}
exp_name=${exp_prefix}_${config_string}_${exp_tag}
if [[ -n ${extra_tag} ]]; then
exp_name=${exp_name}_${extra_tag}
......@@ -128,7 +129,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
if [[ ! -f ${data_dir}/${src_vocab_prefix}.txt || ! -f ${data_dir}/${tgt_vocab_prefix}.txt ]]; then
if [[ ${use_specific_dict} -eq 0 ]]; then
cmd="python ${root_dir}/examples/speech_to_text/prep_mt_data.py
cmd="python ${code_dir}/examples/speech_to_text/prep_mt_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}
--splits ${train_subset},${valid_subset},${trans_subset}
......@@ -151,9 +152,10 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
mkdir -p ${data_dir}/data
for split in ${train_subset} ${valid_subset} ${trans_subset}; do
{
cmd="cat ${org_data_dir}/${lang}/data/${split}/txt/${split}.${src_lang}"
txt_dir=${org_data_dir}/data/${split}/txt
cmd="cat ${txt_dir}/${split}.${src_lang}"
if [[ ${lcrm} -eq 1 ]]; then
cmd="python local/lower_rm.py ${org_data_dir}/${lang}/data/${split}.${src_lang}"
cmd="python local/lower_rm.py ${org_data_dir}/data/${split}.${src_lang}"
fi
cmd="${cmd}
| spm_encode --model ${data_dir}/${src_vocab_prefix}.model
......@@ -166,7 +168,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
cmd="spm_encode
--model ${data_dir}/${tgt_vocab_prefix}.model
--output_format=piece
< ${org_data_dir}/${lang}/data/${split}/txt/${split}.${tgt_lang}
< ${txt_dir}/${split}.${tgt_lang}
> ${data_dir}/data/${split}.${tgt_lang}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
......@@ -175,7 +177,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
done
wait
cmd="python ${root_dir}/fairseq_cli/preprocess.py
cmd="python ${code_dir}/fairseq_cli/preprocess.py
--source-lang ${src_lang} --target-lang ${tgt_lang}
--trainpref ${data_dir}/data/${train_subset}
--validpref ${data_dir}/data/${valid_subset}
......@@ -215,28 +217,26 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
cp ${BASH_SOURCE[0]} ${model_dir}
cp ${PWD}/train.sh ${model_dir}
extra_parameter="${extra_parameter}
--train-config ${pwd_dir}/conf/basis.yaml"
cp ${pwd_dir}/conf/basis.yaml ${model_dir}
config_list="${train_config//,/ }"
idx=0
idx=1
for config in ${config_list[@]}
do
config_path=$pwd_dir/conf/${config}.yaml
config_path=${pwd_dir}/conf/${config}.yaml
if [[ ! -f ${config_path} ]]; then
echo "No config file ${config_path}"
exit
fi
cp ${config_path} ${model_dir}
if [[ idx -eq 0 ]]; then
extra_parameter="${extra_parameter}
--train-config ${config_path}"
else
extra_parameter="${extra_parameter}
extra_parameter="${extra_parameter}
--train-config${idx} ${config_path}"
fi
idx=$((idx + 1))
done
cmd="python3 -u ${root_dir}/fairseq_cli/train.py
cmd="python3 -u ${code_dir}/fairseq_cli/train.py
${data_dir}
--source-lang ${src_lang}
--target-lang ${tgt_lang}
......@@ -330,12 +330,14 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# Average models
dec_model=avg_${n_average}_checkpoint.pt
cmd="python ${root_dir}/scripts/average_checkpoints.py
--inputs ${model_dir}
--num-epoch-checkpoints ${n_average}
--output ${model_dir}/${dec_model}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval $cmd
if [[ ! -f ${model_dir}/${dec_model} ]]; then
cmd="python ${code_dir}/scripts/average_checkpoints.py
--inputs ${model_dir}
--num-best-checkpoints ${n_average}
--output ${model_dir}/${dec_model}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval $cmd
fi
else
dec_model=${dec_model}
fi
......@@ -355,7 +357,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
test_subset=(${test_subset//,/ })
for subset in ${test_subset[@]}; do
cmd="python ${root_dir}/fairseq_cli/generate.py
cmd="python ${code_dir}/fairseq_cli/generate.py
${data_dir}
--source-lang ${src_lang}
--target-lang ${tgt_lang}
......@@ -366,14 +368,17 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--max-tokens ${max_tokens}
--beam ${beam_size}
--lenpen ${len_penalty}
--post-process sentencepiece
--scoring sacrebleu"
--post-process sentencepiece"
if [[ ${tokenizer} -eq 1 ]]; then
if [[ ${sacrebleu} -eq 1 ]]; then
cmd="${cmd}
--scoring sacrebleu"
if [[ ${tokenizer} -eq 1 ]]; then
cmd="${cmd}
--tokenizer moses
--moses-source-lang ${src_lang}
--moses-target-lang ${tgt_lang}"
fi
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
......
train-subset: train_st
valid-subset: dev_st
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_transformer_s
share-decoder-input-output-embed: True
optimizer: adam
......@@ -42,3 +26,6 @@ decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
attention-dropout: 0.1
activation-dropout: 0.1
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
train-subset: train
valid-subset: dev
max-epoch: 100
max-update: 100000
patience: 20
best_checkpoint_metric: loss
maximize_best_checkpoint_metric: False
no-epoch-checkpoints: True
#keep-last-epochs: 10
keep-best-checkpoints: 10
num-workers: 8
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
\ No newline at end of file
ctc-weight: 0.3
\ No newline at end of file
ctc-weight: 0.3
post-process: sentencepiece
\ No newline at end of file
arch: pdss2t_transformer_s_8
train-subset: train_st
valid-subset: dev_st
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
pds-fusion: True
ctc-layer: 12
share-decoder-input-output-embed: True
optimizer: adam
......@@ -38,3 +24,6 @@ encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: pdss2t_transformer_s_16
encoder-embed-dim: 256
pyramid-stages: 4
#pyramid-dropout: 0
pyramid-layers: 2_2_6_2
pyramid-ratios: 2_2_2_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4
train-subset: train_st
valid-subset: dev_st
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
pds-stages: 4
ctc-layer: 12
pds-layers: 2_2_6_2
pds-ratios: 2_2_2_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
......@@ -53,3 +37,6 @@ encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: pdss2t_transformer_s_32
encoder-embed-dim: 256
pyramid-stages: 5
#pyramid-dropout: 0
pyramid-layers: 2_2_3_3_2
pyramid-ratios: 2_2_2_2_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1_1
pyramid-kernel-sizes: 5_5_5_5_5
pyramid-ffn-ratios: 8_8_8_8_8
pyramid-attn-heads: 4_4_4_4_4
train-subset: train_st
valid-subset: dev_st
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
pds-stages: 5
ctc-layer: 12
pds-layers: 2_2_3_3_2
pds-ratios: 2_2_2_2_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 256_256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1_1
pds-kernel-sizes: 5_5_5_5_5
pds-ffn-ratios: 8_8_8_8_8
pds-attn-heads: 4_4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
......@@ -53,3 +37,6 @@ encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: pdss2t_transformer_s_8
encoder-embed-dim: 256
pyramid-stages: 4
#pyramid-dropout: 0
pyramid-layers: 3_3_3_3
pyramid-ratios: 2_2_1_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4
train-subset: train_st
valid-subset: dev_st
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
pds-stages: 4
ctc-layer: 12
pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
......@@ -53,3 +37,6 @@ encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
train-subset: train_st
valid-subset: dev_st
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
......@@ -43,6 +25,11 @@ text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
#macaron-style: True
#use-cnn-module: True
#cnn-module-kernel: 31
......@@ -52,20 +39,20 @@ acoustic-encoder: transformer
adapter: league
encoder-embed-dim: 256
pyramid-stages: 4
#pyramid-dropout: 0
pyramid-layers: 3_3_3_3
pyramid-ratios: 2_2_1_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4
pds-stages: 4
#pds-dropout: 0
pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
decoder-attention-heads: 4
\ No newline at end of file
......@@ -3,13 +3,14 @@
gpu_num=1
data_dir=
test_subset=(tst-COMMON)
test_subset=(dev tst-COMMON)
exp_name=
if [ "$#" -eq 1 ]; then
exp_name=$1
fi
sacrebleu=1
n_average=10
beam_size=5
len_penalty=1.0
......@@ -21,6 +22,7 @@ cmd="./run.sh
--stop_stage 2
--gpu_num ${gpu_num}
--exp_name ${exp_name}
--sacrebleu ${sacrebleu}
--n_average ${n_average}
--beam_size ${beam_size}
--len_penalty ${len_penalty}
......@@ -31,7 +33,7 @@ cmd="./run.sh
if [[ -n ${data_dir} ]]; then
cmd="$cmd --data_dir ${data_dir}"
fi
if [[ ${#test_subset[@]} -eq 0 ]]; then
if [[ ${#test_subset[@]} -ne 0 ]]; then
subsets=$(echo ${test_subset[*]} | sed 's/ /,/g')
cmd="$cmd --test_subset ${subsets}"
fi
......
......@@ -24,7 +24,8 @@ stop_stage=0
gpu_num=8
update_freq=1
root_dir=~/st/Fairseq-S2T
root_dir=~/st
code_dir=${root_dir}/Fairseq-S2T
pwd_dir=$PWD
# dataset
......@@ -41,15 +42,19 @@ share_dict=1
speed_perturb=0
lcrm=0
tokenizer=0
use_raw_audio=0
use_specific_dict=0
specific_prefix=valid
specific_dir=/home/xuchen/st/data/mustc/st_lcrm/en-de
specific_dir=${root_dir}/data/mustc/st
asr_vocab_prefix=spm_unigram10000_st_share
st_vocab_prefix=spm_unigram10000_st_share
org_data_dir=~/st/data/${dataset}
data_dir=~/st/data/${dataset}/st
org_data_dir=${root_dir}/data/${dataset}
data_dir=${root_dir}/data/${dataset}/st
train_split=train
valid_split=dev
test_split=tst-COMMON
test_subset=tst-COMMON
# exp
......@@ -60,7 +65,7 @@ exp_tag=baseline
exp_name=
# config
train_config=ctc
train_config=base,ctc
# training setting
fp16=1
......@@ -69,15 +74,16 @@ step_valid=0
bleu_valid=0
# decoding setting
sacrebleu=1
dec_model=checkpoint_best.pt
n_average=10
beam_size=5
len_penalty=1.0
if [[ ${share_dict} -eq 1 ]]; then
data_config=config_st_share.yaml
data_config=config_share.yaml
else
data_config=config_st.yaml
data_config=config.yaml
fi
if [[ ${speed_perturb} -eq 1 ]]; then
data_dir=${data_dir}_sp
......@@ -95,18 +101,21 @@ if [[ ${tokenizer} -eq 1 ]]; then
data_dir=${data_dir}_tok
exp_prefix=${exp_prefix}_tok
fi
if [[ ${use_raw_audio} -eq 1 ]]; then
data_dir=${data_dir}_raw
exp_prefix=${exp_prefix}_raw
fi
. ./local/parse_options.sh || exit 1;
if [[ -z ${exp_name} ]]; then
config_string=${train_config//,/_}
# exp_name=${exp_prefix}_$(basename ${train_config%.*})_${exp_tag}
exp_name=${exp_prefix}_${config_string}_${exp_tag}
if [[ -n ${extra_tag} ]]; then
exp_name=${exp_name}_${extra_tag}
fi
fi
model_dir=$root_dir/../checkpoints/$dataset/st/${exp_name}
model_dir=$code_dir/../checkpoints/$dataset/st/${exp_name}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download"
......@@ -117,37 +126,49 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
### Task dependent. You have to make data the following preparation part by yourself.
### But you can utilize Kaldi recipes in most cases
echo "stage 0: ASR Data Preparation"
if [[ ! -e ${data_dir}/${lang} ]]; then
mkdir -p ${data_dir}/${lang}
if [[ ! -e ${data_dir} ]]; then
mkdir -p ${data_dir}
fi
if [[ ! -f ${data_dir}/fbank80.zip && -f ${data_dir}/../fbank80.zip ]]; then
ln -s ${data_dir}/../fbank80.zip ${data_dir}
fi
if [[ ! -f ${data_dir}/fbank80_sp.zip && -f ${data_dir}/../fbank80_sp.zip ]]; then
ln -s ${data_dir}/../fbank80_sp.zip ${data_dir}
fi
cmd="python ${root_dir}/examples/speech_to_text/prep_mustc_data.py
# create ASR vocabulary if necessary
cmd="python ${code_dir}/examples/speech_to_text/prep_audio_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}
--output-root ${data_dir}/asr4st
--task asr
--raw
--src-lang ${src_lang}
--splits ${valid_split},${test_split},${train_split}
--vocab-type ${vocab_type}
--vocab-size ${asr_vocab_size}"
if [[ ${speed_perturb} -eq 1 ]]; then
cmd="$cmd
--speed-perturb"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 && ${share_dict} -ne 1 && ${use_specific_dict} -ne 1 ]] && eval $cmd
[[ $eval -eq 1 && ${share_dict} -ne 1 && ${use_specific_dict} -ne 1 ]] && (echo -e "\033[34mRun command: \n${cmd} \033[0m" && eval $cmd)
asr_prefix=spm_${vocab_type}${asr_vocab_size}_asr
echo "stage 0: ST Data Preparation"
cmd="python ${root_dir}/examples/speech_to_text/prep_mustc_data.py
cmd="python ${code_dir}/examples/speech_to_text/prep_audio_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}
--task st
--add-src
--src-lang ${src_lang}
--tgt-lang ${tgt_lang}
--splits ${valid_split},${test_split},${train_split}
--cmvn-type utterance
--vocab-type ${vocab_type}
--vocab-size ${vocab_size}"
if [[ ${use_raw_audio} -eq 1 ]]; then
cmd="$cmd
--raw"
fi
if [[ ${use_specific_dict} -eq 1 ]]; then
cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}/${lang}
cp -r ${specific_dir}/${st_vocab_prefix}.* ${data_dir}/${lang}
cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}
cp -r ${specific_dir}/${st_vocab_prefix}.* ${data_dir}
if [[ $share_dict -eq 1 ]]; then
cmd="$cmd
--share
......@@ -182,9 +203,16 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
fi
data_dir=${data_dir}/${lang}
if [[ ! -f ${data_dir}/../fbank80.zip ]]; then
mv ${data_dir}/fbank80.zip ${data_dir}/..
ln -s ${data_dir}/../fbank80.zip ${data_dir}
fi
if [[ ! -f ${data_dir}/../fbank80_sp.zip ]]; then
mv ${data_dir}/fbank80_sp.zip ${data_dir}/..
ln -s ${data_dir}/../fbank80_sp.zip ${data_dir}
fi
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "stage 1: ST Network Training"
......@@ -210,28 +238,26 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
cp ${BASH_SOURCE[0]} ${model_dir}
cp ${PWD}/train.sh ${model_dir}
extra_parameter="${extra_parameter}
--train-config ${pwd_dir}/conf/basis.yaml"
cp ${pwd_dir}/conf/basis.yaml ${model_dir}
config_list="${train_config//,/ }"
idx=0
idx=1
for config in ${config_list[@]}
do
config_path=$pwd_dir/conf/${config}.yaml
config_path=${pwd_dir}/conf/${config}.yaml
if [[ ! -f ${config_path} ]]; then
echo "No config file ${config_path}"
exit
fi
cp ${config_path} ${model_dir}
if [[ idx -eq 0 ]]; then
extra_parameter="${extra_parameter}
--train-config ${config_path}"
else
extra_parameter="${extra_parameter}
extra_parameter="${extra_parameter}
--train-config${idx} ${config_path}"
fi
idx=$((idx + 1))
done
cmd="python3 -u ${root_dir}/fairseq_cli/train.py
cmd="python3 -u ${code_dir}/fairseq_cli/train.py
${data_dir}
--config-yaml ${data_config}
--task ${task}
......@@ -324,12 +350,14 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# Average models
dec_model=avg_${n_average}_checkpoint.pt
cmd="python ${root_dir}/scripts/average_checkpoints.py
--inputs ${model_dir}
--num-epoch-checkpoints ${n_average}
--output ${model_dir}/${dec_model}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval $cmd
if [[ ! -f ${model_dir}/${dec_model} ]]; then
cmd="python ${code_dir}/scripts/average_checkpoints.py
--inputs ${model_dir}
--num-best-checkpoints ${n_average}
--output ${model_dir}/${dec_model}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval $cmd
fi
else
dec_model=${dec_model}
fi
......@@ -348,9 +376,9 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
[[ -f ${result_file} ]] && rm ${result_file}
test_subset=${test_subset//,/ }
for subset in "${test_subset[@]}"; do
subset=${subset}_st
cmd="python ${root_dir}/fairseq_cli/generate.py
for subset in ${test_subset[@]}; do
subset=${subset}
cmd="python ${code_dir}/fairseq_cli/generate.py
${data_dir}
--config-yaml ${data_config}
--gen-subset ${subset}
......@@ -359,8 +387,19 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--results-path ${model_dir}
--max-tokens ${max_tokens}
--beam ${beam_size}
--lenpen ${len_penalty}
--lenpen ${len_penalty}"
if [[ ${sacrebleu} -eq 1 ]]; then
cmd="${cmd}
--scoring sacrebleu"
if [[ ${tokenizer} -eq 1 ]]; then
cmd="${cmd}
--tokenizer moses
--moses-source-lang ${src_lang}
--moses-target-lang ${tgt_lang}"
fi
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
if [[ $eval -eq 1 ]]; then
......
set -e
eval=1
lcrm=0
tokenizer=0
root_dir=~/st/Fairseq-S2T
data_dir=~/st/data/test
vocab_dir=~/st/data/mustc/st/en-de
asr_vocab_prefix=spm_unigram10000_st_share
src_lang=en
tgt_lang=de
subsets=(2019)
cp -r ${vocab_dir}/${asr_vocab_prefix}.* ${data_dir}/${src_lang}-${tgt_lang}
rm -rf ${data_dir}/${src_lang}-${tgt_lang}/fbank80.zip
splits=$(echo ${subsets[*]} | sed 's/ /,/g')
cmd="python ${root_dir}/examples/speech_to_text/prep_st_data.py
--data-root ${data_dir}
--output-root ${data_dir}
--splits ${splits}
--task asr
--src-lang ${src_lang}
--tgt-lang ${tgt_lang}
--add-src
--share
--asr-prefix ${asr_vocab_prefix}
--cmvn-type utterance"
if [[ ${lcrm} -eq 1 ]]; then
cmd="$cmd
--lowercase-src
--rm-punc-src"
fi
if [[ ${tokenizer} -eq 1 ]]; then
cmd="$cmd
--tokenizer"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
train-subset: train_asr
valid-subset: dev_asr
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_transformer_s
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
#adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
conv-kernel-sizes: 5,5
conv-channels: 1024
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
attention-dropout: 0.1
activation-dropout: 0.1
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 31
use-enc-dlcl: True
use-dec-dlcl: True
encoder-attention-type: local
hard-mask-window: 0
gauss-mask-sigma: 3
init-mask-weight: 0
\ No newline at end of file
arch: pdss2t_transformer_s_16
encoder-embed-dim: 256
pyramid-stages: 4
#pyramid-dropout: 0
pyramid-layers: 2_2_6_2
pyramid-ratios: 2_2_2_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4
train-subset: train_asr
valid-subset: dev_asr
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
#adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
arch: pdss2t_transformer_s_32
encoder-embed-dim: 256
pyramid-stages: 5
#pyramid-dropout: 0
pyramid-layers: 2_2_3_3_2
pyramid-ratios: 2_2_2_2_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1_1
pyramid-kernel-sizes: 5_5_5_5_5
pyramid-ffn-ratios: 8_8_8_8_8
pyramid-attn-heads: 4_4_4_4_4
train-subset: train_asr
valid-subset: dev_asr
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
#adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
arch: pdss2t_transformer_s_8
encoder-embed-dim: 256
pyramid-stages: 4
#pyramid-dropout: 0
pyramid-layers: 3_3_3_3
pyramid-ratios: 2_2_1_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4
train-subset: train_asr
valid-subset: dev_asr
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
#adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
encoder-attention-type: rel_selfattn
#encoder-attention-type: relative
#max-encoder-relative-length: 100
#! /bin/bash
gpu_num=1
data_dir=
test_subset=(test)
exp_name=
if [ "$#" -eq 1 ]; then
exp_name=$1
fi
n_average=10
beam_size=5
len_penalty=1.0
max_tokens=80000
dec_model=checkpoint_best.pt
cmd="./run.sh
--stage 2
--stop_stage 2
--gpu_num ${gpu_num}
--exp_name ${exp_name}
--n_average ${n_average}
--beam_size ${beam_size}
--len_penalty ${len_penalty}
--max_tokens ${max_tokens}
--dec_model ${dec_model}
"
if [[ -n ${data_dir} ]]; then
cmd="$cmd --data_dir ${data_dir}"
fi
if [[ ${#test_subset[@]} -ne 0 ]]; then
subsets=$(echo ${test_subset[*]} | sed 's/ /,/g')
cmd="$cmd --test_subset ${subsets}"
fi
echo $cmd
eval $cmd
gpu_num=4
cmd="sh train.sh"
while :
do
record=$(mktemp -t temp.record.XXXXXX)
gpustat > $record
all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
count=0
for dev in ${all_devices[@]}
do
line=$((dev + 2))
use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
if [[ $use -lt 100 ]]; then
device[$count]=$dev
count=$((count + 1))
if [[ $count -eq $gpu_num ]]; then
break
fi
fi
done
if [[ ${#device[@]} -lt $gpu_num ]]; then
sleep 60s
else
echo "Run $cmd"
eval $cmd
sleep 10s
exit
fi
done
#!/usr/bin/env bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey);
# Arnab Ghoshal, Karel Vesely
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Parse command-line options.
# To be sourced by another script (as in ". parse_options.sh").
# Option format is: --option-name arg
# and shell variable "option_name" gets set to value "arg."
# The exception is --help, which takes no arguments, but prints the
# $help_message variable (if defined).
###
### The --config file options have lower priority to command line
### options, so we need to import them first...
###
# Now import all the configs specified by command-line, in left-to-right order
for ((argpos=1; argpos<$#; argpos++)); do
if [ "${!argpos}" == "--config" ]; then
argpos_plus1=$((argpos+1))
config=${!argpos_plus1}
[ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
. $config # source the config file.
fi
done
###
### Now we process the command line options
###
while true; do
[ -z "${1:-}" ] && break; # break if there are no arguments
case "$1" in
# If the enclosing script is called with --help option, print the help
# message and exit. Scripts should put help messages in $help_message
--help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
else printf "$help_message\n" 1>&2 ; fi;
exit 0 ;;
--*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
exit 1 ;;
# If the first command-line argument begins with "--" (e.g. --foo-bar),
# then work out the variable name as $name, which will equal "foo_bar".
--*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
# Next we test whether the variable in question is undefned-- if so it's
# an invalid option and we die. Note: $0 evaluates to the name of the
# enclosing script.
# The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
# is undefined. We then have to wrap this test inside "eval" because
# foo_bar is itself inside a variable ($name).
eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
oldval="`eval echo \\$$name`";
# Work out whether we seem to be expecting a Boolean argument.
if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
was_bool=true;
else
was_bool=false;
fi
# Set the variable to the right value-- the escaped quotes make it work if
# the option had spaces, like --cmd "queue.pl -sync y"
eval $name=\"$2\";
# Check that Boolean-valued arguments are really Boolean.
if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
exit 1;
fi
shift 2;
;;
*) break;
esac
done
# Check for an empty argument to the --cmd option, which can easily occur as a
# result of scripting errors.
[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
true; # so this script returns exit code 0.
get_devices(){
gpu_num=$1
use_cpu=$2
device=()
while :
do
record=$(mktemp -t temp.record.XXXXXX)
gpustat > $record
all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
count=0
for dev in ${all_devices[@]}
do
line=$((dev + 2))
use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
if [[ $use -lt 100 ]]; then
device[$count]=$dev
count=$((count + 1))
if [[ $count -eq $gpu_num ]]; then
break
fi
fi
done
if [[ ${#device[@]} -lt $gpu_num ]]; then
if [[ $use_cpu -eq 1 ]]; then
device=(-1)
else
sleep 60s
fi
else
break
fi
done
echo ${device[*]} | sed 's/ /,/g'
return $?
}
#! /bin/bash
# Processing ASR Datasets
# Copyright 2021 Natural Language Processing Laboratory
# Xu Chen (xuchenneu@163.com)
# Set bash to 'debug' mode, it will exit on :
# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
set -e
#set -u
set -o pipefail
export PYTHONIOENCODING=UTF-8
eval=1
time=$(date "+%m%d_%H%M")
stage=0
stop_stage=0
######## hardware ########
# devices
#device=()
gpu_num=8
update_freq=1
root_dir=~/st/Fairseq-S2T
pwd_dir=$PWD
# dataset
src_lang=en
lang=${src_lang}
dataset=asr
task=speech_to_text
vocab_type=unigram
vocab_size=5000
speed_perturb=0
lcrm=1
tokenizer=0
use_specific_dict=0
specific_prefix=st
specific_dir=/home/xuchen/st/data/mustc/st/en-de
asr_vocab_prefix=spm_unigram10000_st_share
org_data_dir=~/st/data/${dataset}
data_dir=~/st/data/${dataset}/asr
train_split=train
valid_split=dev
test_split=test
test_subset=test
# exp
exp_prefix=$(date "+%m%d")
extra_tag=
extra_parameter=
exp_tag=baseline
exp_name=
# config
train_config=ctc
data_config=config_asr.yaml
# training setting
fp16=1
max_tokens=40000
step_valid=0
# decoding setting
dec_model=checkpoint_best.pt
n_average=10
beam_size=5
len_penalty=1.0
if [[ ${speed_perturb} -eq 1 ]]; then
data_dir=${data_dir}_sp
exp_prefix=${exp_prefix}_sp
fi
if [[ ${lcrm} -eq 1 ]]; then
data_dir=${data_dir}_lcrm
exp_prefix=${exp_prefix}_lcrm
fi
if [[ ${use_specific_dict} -eq 1 ]]; then
data_dir=${data_dir}_${specific_prefix}
exp_prefix=${exp_prefix}_${specific_prefix}
fi
if [[ ${tokenizer} -eq 1 ]]; then
data_dir=${data_dir}_tok
exp_prefix=${exp_prefix}_tok
fi
. ./local/parse_options.sh || exit 1;
if [[ -z ${exp_name} ]]; then
config_string=${train_config//,/_}
exp_name=${exp_prefix}_${config_string}_${exp_tag}
if [[ -n ${extra_tag} ]]; then
exp_name=${exp_name}_${extra_tag}
fi
fi
model_dir=$root_dir/../checkpoints/$dataset/asr/${exp_name}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download"
# pass
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
### Task dependent. You have to make data the following preparation part by yourself.
### But you can utilize Kaldi recipes in most cases
echo "stage 0: ASR Data Preparation"
if [[ ! -e ${data_dir} ]]; then
mkdir -p ${data_dir}
fi
cmd="python ${root_dir}/examples/speech_to_text/prep_asr_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}
--task asr
--splits ${train_split},${valid_split},${test_split}
--lang ${lang}
--vocab-type ${vocab_type}
--vocab-size ${vocab_size}"
if [[ ${use_specific_dict} -eq 1 ]]; then
cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}/${lang}
cmd="$cmd
--asr-prefix ${asr_vocab_prefix}"
fi
if [[ ${speed_perturb} -eq 1 ]]; then
cmd="$cmd
--speed-perturb"
fi
if [[ ${lcrm} -eq 1 ]]; then
cmd="$cmd
--lowercase-src
--rm-punc-src"
fi
if [[ ${tokenizer} -eq 1 ]]; then
cmd="$cmd
--tokenizer"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
fi
data_dir=${data_dir}/${lang}
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "stage 1: ASR Network Training"
[[ ! -d ${data_dir} ]] && echo "The data dir ${data_dir} is not existing!" && exit 1;
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then
device=""
else
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
fi
echo -e "dev=${device} data=${data_dir} model=${model_dir}"
if [[ ! -d ${model_dir} ]]; then
mkdir -p ${model_dir}
else
echo "${model_dir} exists."
fi
cp ${BASH_SOURCE[0]} ${model_dir}
cp ${PWD}/train.sh ${model_dir}
config_list="${train_config//,/ }"
idx=0
for config in ${config_list[@]}
do
config_path=$pwd_dir/conf/${config}.yaml
if [[ ! -f ${config_path} ]]; then
echo "No config file ${config_path}"
exit
fi
cp ${config_path} ${model_dir}
if [[ idx -eq 0 ]]; then
extra_parameter="${extra_parameter}
--train-config ${config_path}"
else
extra_parameter="${extra_parameter}
--train-config${idx} ${config_path}"
fi
idx=$((idx + 1))
done
cmd="python3 -u ${root_dir}/fairseq_cli/train.py
${data_dir}
--config-yaml ${data_config}
--task ${task}
--max-tokens ${max_tokens}
--skip-invalid-size-inputs-valid-test
--update-freq ${update_freq}
--log-interval 100
--save-dir ${model_dir}
--tensorboard-logdir ${model_dir}"
if [[ -n ${extra_parameter} ]]; then
cmd="${cmd}
${extra_parameter}"
fi
if [[ ${gpu_num} -gt 0 ]]; then
cmd="${cmd}
--distributed-world-size $gpu_num
--ddp-backend no_c10d"
fi
if [[ $fp16 -eq 1 ]]; then
cmd="${cmd}
--fp16"
fi
if [[ $step_valid -eq 1 ]]; then
validate_interval=1
save_interval=1
keep_last_epochs=10
no_epoch_checkpoints=0
save_interval_updates=500
keep_interval_updates=10
else
validate_interval=1
keep_last_epochs=10
fi
if [[ -n $no_epoch_checkpoints && $no_epoch_checkpoints -eq 1 ]]; then
cmd="$cmd
--no-epoch-checkpoints"
fi
if [[ -n $validate_interval ]]; then
cmd="${cmd}
--validate-interval $validate_interval "
fi
if [[ -n $save_interval ]]; then
cmd="${cmd}
--save-interval $save_interval "
fi
if [[ -n $keep_last_epochs ]]; then
cmd="${cmd}
--keep-last-epochs $keep_last_epochs "
fi
if [[ -n $save_interval_updates ]]; then
cmd="${cmd}
--save-interval-updates $save_interval_updates"
if [[ -n $keep_interval_updates ]]; then
cmd="${cmd}
--keep-interval-updates $keep_interval_updates"
fi
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
# save info
log=./history.log
echo "${time} | ${device} | ${data_dir} | ${exp_name} | ${model_dir} " >> $log
tail -n 50 ${log} > tmp.log
mv tmp.log $log
export CUDA_VISIBLE_DEVICES=${device}
cmd="nohup ${cmd} >> ${model_dir}/train.log 2>&1 &"
if [[ $eval -eq 1 ]]; then
eval $cmd
sleep 2s
tail -n "$(wc -l ${model_dir}/train.log | awk '{print $1+1}')" -f ${model_dir}/train.log
fi
fi
wait
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "stage 2: ASR Decoding"
if [[ ${n_average} -ne 1 ]]; then
# Average models
dec_model=avg_${n_average}_checkpoint.pt
cmd="python ${root_dir}/scripts/average_checkpoints.py
--inputs ${model_dir}
--num-epoch-checkpoints ${n_average}
--output ${model_dir}/${dec_model}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval $cmd
else
dec_model=${dec_model}
fi
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then
device=""
else
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
fi
export CUDA_VISIBLE_DEVICES=${device}
result_file=${model_dir}/decode_result
[[ -f ${result_file} ]] && rm ${result_file}
test_subset=${test_subset//,/ }
for subset in ${test_subset[@]}; do
subset=${subset}_asr
cmd="python ${root_dir}/fairseq_cli/generate.py
${data_dir}
--config-yaml ${data_config}
--gen-subset ${subset}
--task speech_to_text
--path ${model_dir}/${dec_model}
--results-path ${model_dir}
--max-tokens ${max_tokens}
--beam ${beam_size}
--lenpen ${len_penalty}
--scoring wer
--wer-tokenizer 13a
--wer-lowercase
--wer-remove-punct
"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
if [[ $eval -eq 1 ]]; then
eval $cmd
tail -n 1 ${model_dir}/generate-${subset}.txt >> ${result_file}
fi
done
cat ${result_file}
fi
#! /bin/bash
# training the model
gpu_num=8
update_freq=1
max_tokens=40000
extra_tag=
extra_parameter=
#extra_tag="${extra_tag}"
#extra_parameter="${extra_parameter} "
exp_tag=
#config_list=(base)
#config_list=(ctc)
#config_list=(base conformer)
#config_list=(pds_base_16)
config_list=(pds_base_16 conformer rpr)
# exp full name
exp_name=
train_config=$(echo ${config_list[*]} | sed 's/ /,/g')
cmd="./run.sh
--stage 1
--stop_stage 1
--gpu_num ${gpu_num}
--update_freq ${update_freq}
--train_config ${train_config}
--max_tokens ${max_tokens}
"
if [[ -n ${exp_name} ]]; then
cmd="$cmd --exp_name ${exp_name}"
fi
if [[ -n ${exp_tag} ]]; then
cmd="$cmd --exp_tag ${exp_tag}"
fi
if [[ -n ${extra_tag} ]]; then
cmd="$cmd --extra_tag ${extra_tag}"
fi
if [[ -n ${extra_parameter} ]]; then
cmd="$cmd --extra_parameter \"${extra_parameter}\""
fi
echo ${cmd}
eval ${cmd}
set -e
eval=1
root_dir=~/st/Fairseq-S2T
data_dir=/home/xuchen/st/data/wmt/test
vocab_dir=/home/xuchen/st/data/wmt/mt/en-de/unigram32000_share
src_vocab_prefix=spm_unigram32000_share
tgt_vocab_prefix=spm_unigram32000_share
src_lang=en
tgt_lang=de
tokenize=1
splits=(newstest2014 newstest2016)
for split in ${splits[@]}; do
src_file=${data_dir}/${split}.${src_lang}
tgt_file=${data_dir}/${split}.${tgt_lang}
if [[ ${tokenize} -eq 1 ]]; then
cmd="tokenizer.perl -l ${src_lang} --threads 8 -no-escape < ${src_file} > ${src_file}.tok"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
cmd="tokenizer.perl -l ${tgt_lang} --threads 8 -no-escape < ${tgt_file} > ${tgt_file}.tok"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
src_file=${src_file}.tok
tgt_file=${tgt_file}.tok
fi
cmd="cat ${src_file}"
if [[ ${lcrm} -eq 1 ]]; then
cmd="python local/lower_rm.py ${src_file}"
fi
cmd="${cmd}
| spm_encode --model ${vocab_dir}/${src_vocab_prefix}.model
--output_format=piece
> ${src_file}.spm"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
cmd="spm_encode
--model ${vocab_dir}/${tgt_vocab_prefix}.model
--output_format=piece
< ${tgt_file}
> ${tgt_file}.spm"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
src_file=${src_file}.spm
tgt_file=${tgt_file}.spm
mkdir -p ${data_dir}/final
cmd="cp ${src_file} ${data_dir}/final/${split}.${src_lang}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
cmd="cp ${tgt_file} ${data_dir}/final/${split}.${tgt_lang}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
done
n_set=${#splits[*]}
for ((i=0;i<$n_set;i++)); do
dataset[$i]=${data_dir}/final/${splits[$i]}
done
pref=`echo ${dataset[*]} | sed 's/ /,/g'`
cmd="python ${root_dir}/fairseq_cli/preprocess.py
--source-lang ${src_lang}
--target-lang ${tgt_lang}
--testpref ${pref}
--destdir ${data_dir}/data-bin
--srcdict ${vocab_dir}/${src_vocab_prefix}.txt
--tgtdict ${vocab_dir}/${tgt_vocab_prefix}.txt
--workers 64"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
\ No newline at end of file
train-subset: train
valid-subset: valid
max-epoch: 50
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: transformer
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 8000
lr: 1e-3
adam_betas: (0.9,0.997)
criterion: label_smoothed_cross_entropy
label_smoothing: 0.1
dropout: 0.1
attention-dropout: 0.1
activation-dropout: 0.1
activation-fn: relu
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
train-subset: train
valid-subset: valid
max-epoch: 50
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: transformer
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 8000
lr: 1e-3
adam_betas: (0.9,0.997)
criterion: label_smoothed_cross_entropy
label_smoothing: 0.1
dropout: 0.1
attention-dropout: 0.1
activation-dropout: 0.1
activation-fn: relu
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
use-enc-dlcl: True
use-dec-dlcl: True
#encoder-attention-type: rel_selfattn
encoder-attention-type: relative
decoder-attention-type: relative
max-encoder-relative-length: 20
max-decoder-relative-length: 20
\ No newline at end of file
#! /bin/bash
gpu_num=1
data_dir=
test_subset=(test)
exp_name=
if [ "$#" -eq 1 ]; then
exp_name=$1
fi
n_average=10
beam_size=5
len_penalty=1.0
max_tokens=80000
dec_model=checkpoint_best.pt
cmd="./run.sh
--stage 2
--stop_stage 2
--gpu_num ${gpu_num}
--exp_name ${exp_name}
--n_average ${n_average}
--beam_size ${beam_size}
--len_penalty ${len_penalty}
--max_tokens ${max_tokens}
--dec_model ${dec_model}
"
if [[ -n ${data_dir} ]]; then
cmd="$cmd --data_dir ${data_dir}"
fi
if [[ -n ${test_subset} ]]; then
test_subset=`echo ${test_subset[*]} | sed 's/ /,/g'`
cmd="$cmd --test_subset ${test_subset}"
fi
echo $cmd
eval $cmd
import sys
import string
in_file = sys.argv[1]
with open(in_file, "r", encoding="utf-8") as f:
for line in f.readlines():
line = line.strip().lower()
for w in string.punctuation:
line = line.replace(w, "")
line = line.replace(" ", "")
print(line)
gpu_num=4
cmd="sh train.sh"
while :
do
record=$(mktemp -t temp.record.XXXXXX)
gpustat > $record
all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
count=0
for dev in ${all_devices[@]}
do
line=$((dev + 2))
use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
if [[ $use -lt 100 ]]; then
device[$count]=$dev
count=$((count + 1))
if [[ $count -eq $gpu_num ]]; then
break
fi
fi
done
if [[ ${#device[@]} -lt $gpu_num ]]; then
sleep 60s
else
echo "Run $cmd"
eval $cmd
sleep 10s
exit
fi
done
#!/usr/bin/env bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey);
# Arnab Ghoshal, Karel Vesely
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Parse command-line options.
# To be sourced by another script (as in ". parse_options.sh").
# Option format is: --option-name arg
# and shell variable "option_name" gets set to value "arg."
# The exception is --help, which takes no arguments, but prints the
# $help_message variable (if defined).
###
### The --config file options have lower priority to command line
### options, so we need to import them first...
###
# Now import all the configs specified by command-line, in left-to-right order
for ((argpos=1; argpos<$#; argpos++)); do
if [ "${!argpos}" == "--config" ]; then
argpos_plus1=$((argpos+1))
config=${!argpos_plus1}
[ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
. $config # source the config file.
fi
done
###
### Now we process the command line options
###
while true; do
[ -z "${1:-}" ] && break; # break if there are no arguments
case "$1" in
# If the enclosing script is called with --help option, print the help
# message and exit. Scripts should put help messages in $help_message
--help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
else printf "$help_message\n" 1>&2 ; fi;
exit 0 ;;
--*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
exit 1 ;;
# If the first command-line argument begins with "--" (e.g. --foo-bar),
# then work out the variable name as $name, which will equal "foo_bar".
--*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
# Next we test whether the variable in question is undefned-- if so it's
# an invalid option and we die. Note: $0 evaluates to the name of the
# enclosing script.
# The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
# is undefined. We then have to wrap this test inside "eval" because
# foo_bar is itself inside a variable ($name).
eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
oldval="`eval echo \\$$name`";
# Work out whether we seem to be expecting a Boolean argument.
if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
was_bool=true;
else
was_bool=false;
fi
# Set the variable to the right value-- the escaped quotes make it work if
# the option had spaces, like --cmd "queue.pl -sync y"
eval $name=\"$2\";
# Check that Boolean-valued arguments are really Boolean.
if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
exit 1;
fi
shift 2;
;;
*) break;
esac
done
# Check for an empty argument to the --cmd option, which can easily occur as a
# result of scripting errors.
[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
true; # so this script returns exit code 0.
get_devices(){
gpu_num=$1
use_cpu=$2
device=()
while :
do
record=$(mktemp -t temp.record.XXXXXX)
gpustat > $record
all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
count=0
for dev in ${all_devices[@]}
do
line=$((dev + 2))
use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
if [[ $use -lt 100 ]]; then
device[$count]=$dev
count=$((count + 1))
if [[ $count -eq $gpu_num ]]; then
break
fi
fi
done
if [[ ${#device[@]} -lt $gpu_num ]]; then
if [[ $use_cpu -eq 1 ]]; then
device=(-1)
else
sleep 60s
fi
else
break
fi
done
echo ${device[*]} | sed 's/ /,/g'
return $?
}
#! /bin/bash
# Processing MuST-C Datasets
# Copyright 2021 Natural Language Processing Laboratory
# Xu Chen (xuchenneu@163.com)
# Set bash to 'debug' mode, it will exit on :
# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
set -e
#set -u
set -o pipefail
export PYTHONIOENCODING=UTF-8
eval=1
time=$(date "+%m%d_%H%M")
stage=0
stop_stage=0
######## hardware ########
# devices
device=()
gpu_num=8
update_freq=1
root_dir=~/st/Fairseq-S2T
pwd_dir=$PWD
# dataset
src_lang=en
tgt_lang=de
lang=${src_lang}-${tgt_lang}
dataset=mt
task=translation
vocab_type=unigram
vocab_size=10000
share_dict=1
lcrm=0
tokenizer=0
use_specific_dict=0
specific_prefix=st
specific_dir=/home/xuchen/st/data/mustc/st/en-de/
src_vocab_prefix=spm_unigram10000_st_share
tgt_vocab_prefix=spm_unigram10000_st_share
org_data_dir=~/st/data/${dataset}
data_dir=~/st/data/${dataset}/mt/${lang}
train_subset=train
valid_subset=dev
trans_subset=tst-COMMON
test_subset=test
# exp
exp_prefix=${time}
extra_tag=
extra_parameter=
exp_tag=baseline
exp_name=
# config
train_config=base_s
# training setting
fp16=1
max_tokens=4096
step_valid=0
bleu_valid=0
# decoding setting
dec_model=checkpoint_best.pt
n_average=10
beam_size=5
len_penalty=1.0
if [[ ${use_specific_dict} -eq 1 ]]; then
exp_prefix=${specific_prefix}_${exp_prefix}
data_dir=${data_dir}/${specific_prefix}
mkdir -p ${data_dir}
else
data_dir=${data_dir}/${vocab_type}${vocab_size}
src_vocab_prefix=spm_${vocab_type}${vocab_size}_${src_lang}
tgt_vocab_prefix=spm_${vocab_type}${vocab_size}_${tgt_lang}
if [[ $share_dict -eq 1 ]]; then
data_dir=${data_dir}_share
src_vocab_prefix=spm_${vocab_type}${vocab_size}_share
tgt_vocab_prefix=spm_${vocab_type}${vocab_size}_share
fi
fi
if [[ ${lcrm} -eq 1 ]]; then
data_dir=${data_dir}_lcrm
exp_prefix=${exp_prefix}_lcrm
fi
if [[ ${tokenizer} -eq 1 ]]; then
train_subset=${train_subset}.tok
valid_subset=${valid_subset}.tok
trans_subset=${trans_subset}.tok
data_dir=${data_dir}_tok
exp_prefix=${exp_prefix}_tok
fi
. ./local/parse_options.sh || exit 1;
# full path
if [[ -z ${exp_name} ]]; then
config_string=${train_config//,/_}
exp_name=${exp_prefix}_${config_string}_${exp_tag}
if [[ -n ${extra_tag} ]]; then
exp_name=${exp_name}_${extra_tag}
fi
fi
model_dir=$root_dir/../checkpoints/$dataset/mt/${exp_name}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download"
# pass
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
### Task dependent. You have to make data the following preparation part by yourself.
echo "stage 0: MT Data Preparation"
if [[ ! -e ${data_dir} ]]; then
mkdir -p ${data_dir}
fi
if [[ ! -f ${data_dir}/${src_vocab_prefix}.txt || ! -f ${data_dir}/${tgt_vocab_prefix}.txt ]]; then
if [[ ${use_specific_dict} -eq 0 ]]; then
cmd="python ${root_dir}/examples/speech_to_text/prep_mt_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}
--splits ${train_subset},${valid_subset},${trans_subset}
--src-lang ${src_lang}
--tgt-lang ${tgt_lang}
--vocab-type ${vocab_type}
--vocab-size ${vocab_size}"
if [[ $share_dict -eq 1 ]]; then
cmd="$cmd
--share"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
else
cp -r ${specific_dir}/${src_vocab_prefix}.* ${data_dir}
cp ${specific_dir}/${tgt_vocab_prefix}.* ${data_dir}
fi
fi
mkdir -p ${data_dir}/data
for split in ${train_subset} ${valid_subset} ${trans_subset}; do
{
cmd="cat ${org_data_dir}/${lang}/data/${split}/txt/${split}.${src_lang}"
if [[ ${lcrm} -eq 1 ]]; then
cmd="python local/lower_rm.py ${org_data_dir}/${lang}/data/${split}.${src_lang}"
fi
cmd="${cmd}
| spm_encode --model ${data_dir}/${src_vocab_prefix}.model
--output_format=piece
> ${data_dir}/data/${split}.${src_lang}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
cmd="spm_encode
--model ${data_dir}/${tgt_vocab_prefix}.model
--output_format=piece
< ${org_data_dir}/${lang}/data/${split}.${tgt_lang}
> ${data_dir}/data/${split}.${tgt_lang}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
}&
done
wait
cmd="python ${root_dir}/fairseq_cli/preprocess.py
--source-lang ${src_lang} --target-lang ${tgt_lang}
--trainpref ${data_dir}/data/${train_subset}
--validpref ${data_dir}/data/${valid_subset}
--testpref ${data_dir}/data/${trans_subset}
--destdir ${data_dir}/data-bin
--srcdict ${data_dir}/${src_vocab_prefix}.txt
--tgtdict ${data_dir}/${tgt_vocab_prefix}.txt
--workers 64"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
fi
data_dir=${data_dir}/data-bin
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "stage 1: MT Network Training"
[[ ! -d ${data_dir} ]] && echo "The data dir ${data_dir} is not existing!" && exit 1;
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then
device=""
else
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
fi
echo -e "dev=${device} data=${data_dir} model=${model_dir}"
if [[ ! -d ${model_dir} ]]; then
mkdir -p ${model_dir}
else
echo "${model_dir} exists."
fi
cp ${BASH_SOURCE[0]} ${model_dir}
cp ${PWD}/train.sh ${model_dir}
config_list="${train_config//,/ }"
idx=0
for config in ${config_list[@]}
do
config_path=$pwd_dir/conf/${config}.yaml
if [[ ! -f ${config_path} ]]; then
echo "No config file ${config_path}"
exit
fi
cp ${config_path} ${model_dir}
if [[ idx -eq 0 ]]; then
extra_parameter="${extra_parameter}
--train-config ${config_path}"
else
extra_parameter="${extra_parameter}
--train-config${idx} ${config_path}"
fi
idx=$((idx + 1))
done
cmd="python3 -u ${root_dir}/fairseq_cli/train.py
${data_dir}
--source-lang ${src_lang}
--target-lang ${tgt_lang}
--task ${task}
--max-tokens ${max_tokens}
--skip-invalid-size-inputs-valid-test
--update-freq ${update_freq}
--log-interval 100
--save-dir ${model_dir}
--tensorboard-logdir ${model_dir}"
if [[ -n ${extra_parameter} ]]; then
cmd="${cmd}
${extra_parameter}"
fi
if [[ ${gpu_num} -gt 0 ]]; then
cmd="${cmd}
--distributed-world-size $gpu_num
--ddp-backend no_c10d"
fi
if [[ $fp16 -eq 1 ]]; then
cmd="${cmd}
--fp16"
fi
if [[ $step_valid -eq 1 ]]; then
validate_interval=1
save_interval=1
keep_last_epochs=10
no_epoch_checkpoints=0
save_interval_updates=500
keep_interval_updates=10
else
validate_interval=1
keep_last_epochs=10
fi
if [[ $bleu_valid -eq 1 ]]; then
cmd="$cmd
--eval-bleu
--eval-bleu-args '{\"beam\": 1}'
--eval-tokenized-bleu
--eval-bleu-remove-bpe
--best-checkpoint-metric bleu
--maximize-best-checkpoint-metric"
fi
if [[ -n $no_epoch_checkpoints && $no_epoch_checkpoints -eq 1 ]]; then
cmd="$cmd
--no-epoch-checkpoints"
fi
if [[ -n $validate_interval ]]; then
cmd="${cmd}
--validate-interval $validate_interval "
fi
if [[ -n $save_interval ]]; then
cmd="${cmd}
--save-interval $save_interval "
fi
if [[ -n $keep_last_epochs ]]; then
cmd="${cmd}
--keep-last-epochs $keep_last_epochs "
fi
if [[ -n $save_interval_updates ]]; then
cmd="${cmd}
--save-interval-updates $save_interval_updates"
if [[ -n $keep_interval_updates ]]; then
cmd="${cmd}
--keep-interval-updates $keep_interval_updates"
fi
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
# save info
log=./history.log
echo "${time} | ${device} | ${data_dir} | ${exp_name} | ${model_dir} " >> $log
tail -n 50 ${log} > tmp.log
mv tmp.log $log
export CUDA_VISIBLE_DEVICES=${device}
cmd="nohup ${cmd} >> ${model_dir}/train.log 2>&1 &"
if [[ $eval -eq 1 ]]; then
eval $cmd
sleep 2s
tail -n "$(wc -l ${model_dir}/train.log | awk '{print $1+1}')" -f ${model_dir}/train.log
fi
fi
wait
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "stage 2: MT Decoding"
if [[ ${n_average} -ne 1 ]]; then
# Average models
dec_model=avg_${n_average}_checkpoint.pt
cmd="python ${root_dir}/scripts/average_checkpoints.py
--inputs ${model_dir}
--num-epoch-checkpoints ${n_average}
--output ${model_dir}/${dec_model}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval $cmd
else
dec_model=${dec_model}
fi
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then
device=""
else
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
fi
export CUDA_VISIBLE_DEVICES=${device}
result_file=${model_dir}/decode_result
[[ -f ${result_file} ]] && rm ${result_file}
test_subset=(${test_subset//,/ })
for subset in ${test_subset[@]}; do
cmd="python ${root_dir}/fairseq_cli/generate.py
${data_dir}
--source-lang ${src_lang}
--target-lang ${tgt_lang}
--gen-subset ${subset}
--task ${task}
--path ${model_dir}/${dec_model}
--results-path ${model_dir}
--max-tokens ${max_tokens}
--beam ${beam_size}
--lenpen ${len_penalty}
--post-process sentencepiece
--scoring sacrebleu"
if [[ ${tokenizer} -eq 1 ]]; then
cmd="${cmd}
--tokenizer moses
--moses-source-lang ${src_lang}
--moses-target-lang ${tgt_lang}"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
if [[ $eval -eq 1 ]]; then
eval $cmd
tail -n 1 ${model_dir}/generate-${subset}.txt >> ${result_file}
fi
done
cat ${result_file}
fi
#! /bin/bash
# training the model
gpu_num=1
update_freq=1
max_tokens=8192
exp_tag=baseline
config_list=(base)
# exp full name
exp_name=
extra_tag=
extra_parameter=
#extra_tag="${extra_tag}"
#extra_parameter="${extra_parameter} "
train_config=$(echo ${config_list[*]} | sed 's/ /,/g')
cmd="./run.sh
--stage 1
--stop_stage 1
--gpu_num ${gpu_num}
--update_freq ${update_freq}
--train_config ${train_config}
--max_tokens ${max_tokens}
"
if [[ -n ${exp_name} ]]; then
cmd="$cmd --exp_name ${exp_name}"
fi
if [[ -n ${exp_tag} ]]; then
cmd="$cmd --exp_tag ${exp_tag}"
fi
if [[ -n ${extra_tag} ]]; then
cmd="$cmd --extra_tag ${extra_tag}"
fi
if [[ -n ${extra_parameter} ]]; then
cmd="$cmd --extra_parameter \"${extra_parameter}\""
fi
echo ${cmd}
eval ${cmd}
set -e
eval=1
lcrm=1
tokenizer=0
root_dir=~/st/Fairseq-S2T
data_dir=/home/xuchen/st/data/test
vocab_dir=/home/xuchen/st/data/mustc/st_lcrm/en-de
asr_vocab_prefix=spm_unigram10000_st_share
st_vocab_prefix=spm_unigram10000_st_share
src_lang=en
tgt_lang=de
splits=(2019)
splits=$(echo ${splits[*]} | sed 's/ /_/g')
cp -r ${vocab_dir}/${asr_vocab_prefix}.* ${data_dir}/${src_lang}-${tgt_lang}
cp -r ${vocab_dir}/${st_vocab_prefix}.* ${data_dir}/${src_lang}-${tgt_lang}
rm -rf ${data_dir}/${src_lang}-${tgt_lang}/fbank80.zip
cmd="python ${root_dir}/examples/speech_to_text/prep_st_data.py
--data-root ${data_dir}
--output-root ${data_dir}
--splits ${splits}
--task st
--src-lang ${src_lang}
--tgt-lang ${tgt_lang}
--add-src
--share
--asr-prefix ${asr_vocab_prefix}
--st-spm-prefix ${st_vocab_prefix}
--cmvn-type utterance"
if [[ ${lcrm} -eq 1 ]]; then
cmd="$cmd
--lowercase-src
--rm-punc-src"
fi
if [[ ${tokenizer} -eq 1 ]]; then
cmd="$cmd
--tokenizer"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
train-subset: train_st
valid-subset: dev_st
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_transformer_s
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
#adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
conv-kernel-sizes: 5,5
conv-channels: 1024
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
attention-dropout: 0.1
activation-dropout: 0.1
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 31
ctc-weight: 0.3
\ No newline at end of file
use-enc-dlcl: True
use-dec-dlcl: True
encoder-attention-type: local
hard-mask-window: 0
gauss-mask-sigma: 3
init-mask-weight: 0
\ No newline at end of file
arch: pdss2t_transformer_s_16
encoder-embed-dim: 256
pyramid-stages: 4
#pyramid-dropout: 0
pyramid-layers: 2_2_6_2
pyramid-ratios: 2_2_2_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4
train-subset: train_st
valid-subset: dev_st
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
#adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
arch: pdss2t_transformer_s_32
encoder-embed-dim: 256
pyramid-stages: 5
#pyramid-dropout: 0
pyramid-layers: 2_2_3_3_2
pyramid-ratios: 2_2_2_2_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1_1
pyramid-kernel-sizes: 5_5_5_5_5
pyramid-ffn-ratios: 8_8_8_8_8
pyramid-attn-heads: 4_4_4_4_4
train-subset: train_st
valid-subset: dev_st
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
#adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
arch: pdss2t_transformer_s_8
encoder-embed-dim: 256
pyramid-stages: 4
#pyramid-dropout: 0
pyramid-layers: 3_3_3_3
pyramid-ratios: 2_2_1_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4
train-subset: train_st
valid-subset: dev_st
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
#adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
encoder-attention-type: rel_selfattn
#encoder-attention-type: relative
#decoder-attention-type: relative
#max-encoder-relative-length: 100
#max-decoder-relative-length: 20
train-subset: train_st
valid-subset: dev_st
max-epoch: 100
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
#adam_betas: (0.9,0.98)
ctc-weight: 0.3
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
encoder-normalize-before: True
decoder-normalize-before: True
conv-kernel-sizes: 5,5
conv-channels: 1024
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
#macaron-style: True
#use-cnn-module: True
#cnn-module-kernel: 31
#acoustic-encoder: pds
acoustic-encoder: transformer
adapter: league
encoder-embed-dim: 256
pyramid-stages: 4
#pyramid-dropout: 0
pyramid-layers: 3_3_3_3
pyramid-ratios: 2_2_1_2
pyramid-fusion: True
pyramid-fusion-method: all_conv
pyramid-embed-dims: 256_256_256_256
pyramid-ds-method: conv
pyramid-embed-norm: True
pyramid-position-embed: 1_1_1_1
pyramid-kernel-sizes: 5_5_5_5
pyramid-ffn-ratios: 8_8_8_8
pyramid-attn-heads: 4_4_4_4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#! /bin/bash
gpu_num=1
data_dir=
test_subset=(tst-COMMON)
exp_name=
if [ "$#" -eq 1 ]; then
exp_name=$1
fi
n_average=10
beam_size=5
len_penalty=1.0
max_tokens=80000
dec_model=checkpoint_best.pt
cmd="./run.sh
--stage 2
--stop_stage 2
--gpu_num ${gpu_num}
--exp_name ${exp_name}
--n_average ${n_average}
--beam_size ${beam_size}
--len_penalty ${len_penalty}
--max_tokens ${max_tokens}
--dec_model ${dec_model}
"
if [[ -n ${data_dir} ]]; then
cmd="$cmd --data_dir ${data_dir}"
fi
if [[ ${#test_subset[@]} -eq 0 ]]; then
subsets=$(echo ${test_subset[*]} | sed 's/ /,/g')
cmd="$cmd --test_subset ${subsets}"
fi
echo $cmd
eval $cmd
set -e
gpu_num=1
root_dir=/home/xuchen/st/Fairseq-S2T
ckpt=/home/xuchen/st/checkpoints/mustc-v2/st
model_txt=$1
set=$2
test_subset=$3
#data_dir=/home/xuchen/st/data/mustc-v2/st_lcrm/en-de
#test_subset=(tst-COMMON)
data_dir=/media/data/tst/$set/en-de
#test_subset=(office)
#test_subset=(webrtc1)
#test_subset=(adap2)
data_config=config_st_share.yaml
result_file=./result
beam_size=5
lenpen=0.6
max_tokens=10000
models=()
i=0
for line in `cat $model_txt`; do
i=`expr $i + 1`
model_dir=$ckpt/$line
[[ ! -d $model_dir ]] && echo $model_dir && exit 1;
if [[ -f $model_dir/avg_10_checkpoint.pt ]]; then
model=$model_dir/avg_10_checkpoint.pt
else
model=$model_dir/checkpoint_best.pt
fi
[[ ! -f $model ]] && echo $model && exit 1;
models[$i]=$model
done
models=`echo ${models[*]} | sed 's/ /:/g'`
res_dir=$ckpt/ensemble/$set
i=0
while :
do
if [[ -d $res_dir/$i ]]; then
i=`expr $i + 1`
else
res_dir=$res_dir/$i
break
fi
done
mkdir -p $res_dir
cp $model_txt $res_dir
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then
device=()
else
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
fi
export CUDA_VISIBLE_DEVICES=${device}
for subset in ${test_subset[@]}; do
subset=${subset}_st
cmd="python ${root_dir}/fairseq_cli/generate.py
${data_dir}
--config-yaml ${data_config}
--gen-subset ${subset}
--task speech_to_text
--path ${models}
--results-path ${res_dir}
--skip-invalid-size-inputs-valid-test
--max-tokens ${max_tokens}
--beam ${beam_size}
--lenpen ${lenpen}
--scoring sacrebleu"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
eval $cmd
tail -n 1 ${res_dir}/generate-${subset}.txt
cd $res_dir
evaluate.sh translation-${subset}.txt $set
cd -
done
gpu_num=4
cmd="sh train.sh"
while :
do
record=$(mktemp -t temp.record.XXXXXX)
gpustat > $record
all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
count=0
for dev in ${all_devices[@]}
do
line=$((dev + 2))
use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
if [[ $use -lt 100 ]]; then
device[$count]=$dev
count=$((count + 1))
if [[ $count -eq $gpu_num ]]; then
break
fi
fi
done
if [[ ${#device[@]} -lt $gpu_num ]]; then
sleep 60s
else
echo "Run $cmd"
eval $cmd
sleep 10s
exit
fi
done
#!/usr/bin/env bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey);
# Arnab Ghoshal, Karel Vesely
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Parse command-line options.
# To be sourced by another script (as in ". parse_options.sh").
# Option format is: --option-name arg
# and shell variable "option_name" gets set to value "arg."
# The exception is --help, which takes no arguments, but prints the
# $help_message variable (if defined).
###
### The --config file options have lower priority to command line
### options, so we need to import them first...
###
# Now import all the configs specified by command-line, in left-to-right order
for ((argpos=1; argpos<$#; argpos++)); do
if [ "${!argpos}" == "--config" ]; then
argpos_plus1=$((argpos+1))
config=${!argpos_plus1}
[ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
. $config # source the config file.
fi
done
###
### Now we process the command line options
###
while true; do
[ -z "${1:-}" ] && break; # break if there are no arguments
case "$1" in
# If the enclosing script is called with --help option, print the help
# message and exit. Scripts should put help messages in $help_message
--help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
else printf "$help_message\n" 1>&2 ; fi;
exit 0 ;;
--*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
exit 1 ;;
# If the first command-line argument begins with "--" (e.g. --foo-bar),
# then work out the variable name as $name, which will equal "foo_bar".
--*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
# Next we test whether the variable in question is undefned-- if so it's
# an invalid option and we die. Note: $0 evaluates to the name of the
# enclosing script.
# The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
# is undefined. We then have to wrap this test inside "eval" because
# foo_bar is itself inside a variable ($name).
eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
oldval="`eval echo \\$$name`";
# Work out whether we seem to be expecting a Boolean argument.
if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
was_bool=true;
else
was_bool=false;
fi
# Set the variable to the right value-- the escaped quotes make it work if
# the option had spaces, like --cmd "queue.pl -sync y"
eval $name=\"$2\";
# Check that Boolean-valued arguments are really Boolean.
if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
exit 1;
fi
shift 2;
;;
*) break;
esac
done
# Check for an empty argument to the --cmd option, which can easily occur as a
# result of scripting errors.
[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
true; # so this script returns exit code 0.
get_devices(){
gpu_num=$1
use_cpu=$2
device=()
while :
do
record=$(mktemp -t temp.record.XXXXXX)
gpustat > $record
all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
count=0
for dev in ${all_devices[@]}
do
line=$((dev + 2))
use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
if [[ $use -lt 100 ]]; then
device[$count]=$dev
count=$((count + 1))
if [[ $count -eq $gpu_num ]]; then
break
fi
fi
done
if [[ ${#device[@]} -lt $gpu_num ]]; then
if [[ $use_cpu -eq 1 ]]; then
device=(-1)
else
sleep 60s
fi
else
break
fi
done
echo ${device[*]} | sed 's/ /,/g'
return $?
}
#! /bin/bash
# Processing MuST-C Datasets
# Copyright 2021 Natural Language Processing Laboratory
# Xu Chen (xuchenneu@163.com)
# Set bash to 'debug' mode, it will exit on :
# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
set -e
#set -u
set -o pipefail
export PYTHONIOENCODING=UTF-8
eval=1
time=$(date "+%m%d_%H%M")
stage=0
stop_stage=0
######## hardware ########
# devices
#device=()
gpu_num=8
update_freq=1
root_dir=~/st/Fairseq-S2T
pwd_dir=$PWD
# dataset
src_lang=en
tgt_lang=de
lang=${src_lang}-${tgt_lang}
dataset=st
task=speech_to_text
vocab_type=unigram
asr_vocab_size=5000
vocab_size=10000
share_dict=1
speed_perturb=0
lcrm=0
tokenizer=0
use_specific_dict=0
specific_prefix=valid
specific_dir=/home/xuchen/st/data/mustc/st_lcrm/en-de
asr_vocab_prefix=spm_unigram10000_st_share
st_vocab_prefix=spm_unigram10000_st_share
org_data_dir=~/st/data/${dataset}
data_dir=~/st/data/${dataset}/st
test_subset=tst-COMMON
# exp
exp_prefix=$(date "+%m%d")
extra_tag=
extra_parameter=
exp_tag=baseline
exp_name=
# config
train_config=ctc
# training setting
fp16=1
max_tokens=40000
step_valid=0
bleu_valid=0
# decoding setting
dec_model=checkpoint_best.pt
n_average=10
beam_size=5
len_penalty=1.0
if [[ ${share_dict} -eq 1 ]]; then
data_config=config_st_share.yaml
else
data_config=config_st.yaml
fi
if [[ ${speed_perturb} -eq 1 ]]; then
data_dir=${data_dir}_sp
exp_prefix=${exp_prefix}_sp
fi
if [[ ${lcrm} -eq 1 ]]; then
data_dir=${data_dir}_lcrm
exp_prefix=${exp_prefix}_lcrm
fi
if [[ ${use_specific_dict} -eq 1 ]]; then
data_dir=${data_dir}_${specific_prefix}
exp_prefix=${exp_prefix}_${specific_prefix}
fi
if [[ ${tokenizer} -eq 1 ]]; then
data_dir=${data_dir}_tok
exp_prefix=${exp_prefix}_tok
fi
. ./local/parse_options.sh || exit 1;
if [[ -z ${exp_name} ]]; then
config_string=${train_config//,/_}
exp_name=${exp_prefix}_${config_string}_${exp_tag}
if [[ -n ${extra_tag} ]]; then
exp_name=${exp_name}_${extra_tag}
fi
fi
model_dir=$root_dir/../checkpoints/$dataset/st/${exp_name}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download"
# pass
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
### Task dependent. You have to make data the following preparation part by yourself.
### But you can utilize Kaldi recipes in most cases
echo "stage 0: ASR Data Preparation"
if [[ ! -e ${data_dir}/${lang} ]]; then
mkdir -p ${data_dir}/${lang}
fi
cmd="python ${root_dir}/examples/speech_to_text/prep_asr_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}
--task asr
--vocab-type ${vocab_type}
--vocab-size ${asr_vocab_size}"
if [[ ${speed_perturb} -eq 1 ]]; then
cmd="$cmd
--speed-perturb"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 && ${share_dict} -ne 1 && ${use_specific_dict} -ne 1 ]] && eval $cmd
asr_prefix=spm_${vocab_type}${asr_vocab_size}_asr
echo "stage 0: ST Data Preparation"
cmd="python ${root_dir}/examples/speech_to_text/prep_st_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}
--task st
--add-src
--cmvn-type utterance
--vocab-type ${vocab_type}
--vocab-size ${vocab_size}"
if [[ ${use_specific_dict} -eq 1 ]]; then
cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}/${lang}
cp -r ${specific_dir}/${st_vocab_prefix}.* ${data_dir}/${lang}
if [[ $share_dict -eq 1 ]]; then
cmd="$cmd
--share
--st-spm-prefix ${st_vocab_prefix}"
else
cmd="$cmd
--st-spm-prefix ${st_vocab_prefix}
--asr-prefix ${asr_vocab_prefix}"
fi
else
if [[ $share_dict -eq 1 ]]; then
cmd="$cmd
--share"
else
cmd="$cmd
--asr-prefix ${asr_prefix}"
fi
fi
if [[ ${speed_perturb} -eq 1 ]]; then
cmd="$cmd
--speed-perturb"
fi
if [[ ${lcrm} -eq 1 ]]; then
cmd="$cmd
--lowercase-src
--rm-punc-src"
fi
if [[ ${tokenizer} -eq 1 ]]; then
cmd="$cmd
--tokenizer"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
fi
data_dir=${data_dir}/${lang}
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "stage 1: ST Network Training"
[[ ! -d ${data_dir} ]] && echo "The data dir ${data_dir} is not existing!" && exit 1;
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then
device=""
else
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
fi
echo -e "dev=${device} data=${data_dir} model=${model_dir}"
if [[ ! -d ${model_dir} ]]; then
mkdir -p ${model_dir}
else
echo "${model_dir} exists."
fi
cp ${BASH_SOURCE[0]} ${model_dir}
cp ${PWD}/train.sh ${model_dir}
config_list="${train_config//,/ }"
idx=0
for config in ${config_list[@]}
do
config_path=$pwd_dir/conf/${config}.yaml
if [[ ! -f ${config_path} ]]; then
echo "No config file ${config_path}"
exit
fi
cp ${config_path} ${model_dir}
if [[ idx -eq 0 ]]; then
extra_parameter="${extra_parameter}
--train-config ${config_path}"
else
extra_parameter="${extra_parameter}
--train-config${idx} ${config_path}"
fi
idx=$((idx + 1))
done
cmd="python3 -u ${root_dir}/fairseq_cli/train.py
${data_dir}
--config-yaml ${data_config}
--task ${task}
--max-tokens ${max_tokens}
--skip-invalid-size-inputs-valid-test
--update-freq ${update_freq}
--log-interval 100
--save-dir ${model_dir}
--tensorboard-logdir ${model_dir}"
if [[ -n ${extra_parameter} ]]; then
cmd="${cmd}
${extra_parameter}"
fi
if [[ ${gpu_num} -gt 0 ]]; then
cmd="${cmd}
--distributed-world-size $gpu_num
--ddp-backend no_c10d"
fi
if [[ $fp16 -eq 1 ]]; then
cmd="${cmd}
--fp16"
fi
if [[ $step_valid -eq 1 ]]; then
validate_interval=1
save_interval=1
keep_last_epochs=10
no_epoch_checkpoints=0
save_interval_updates=500
keep_interval_updates=10
else
validate_interval=1
keep_last_epochs=10
fi
if [[ $bleu_valid -eq 1 ]]; then
cmd="$cmd
--eval-bleu
--eval-bleu-args '{\"beam\": 1}'
--eval-tokenized-bleu
--eval-bleu-remove-bpe
--best-checkpoint-metric bleu
--maximize-best-checkpoint-metric"
fi
if [[ -n $no_epoch_checkpoints && $no_epoch_checkpoints -eq 1 ]]; then
cmd="$cmd
--no-epoch-checkpoints"
fi
if [[ -n $validate_interval ]]; then
cmd="${cmd}
--validate-interval $validate_interval "
fi
if [[ -n $save_interval ]]; then
cmd="${cmd}
--save-interval $save_interval "
fi
if [[ -n $keep_last_epochs ]]; then
cmd="${cmd}
--keep-last-epochs $keep_last_epochs "
fi
if [[ -n $save_interval_updates ]]; then
cmd="${cmd}
--save-interval-updates $save_interval_updates"
if [[ -n $keep_interval_updates ]]; then
cmd="${cmd}
--keep-interval-updates $keep_interval_updates"
fi
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
# save info
log=./history.log
echo "${time} | ${device} | ${data_dir} | ${exp_name} | ${model_dir} " >> $log
tail -n 50 ${log} > tmp.log
mv tmp.log $log
export CUDA_VISIBLE_DEVICES=${device}
cmd="nohup ${cmd} >> ${model_dir}/train.log 2>&1 &"
if [[ $eval -eq 1 ]]; then
eval $cmd
sleep 2s
tail -n "$(wc -l ${model_dir}/train.log | awk '{print $1+1}')" -f ${model_dir}/train.log
fi
fi
wait
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "stage 2: ST Decoding"
if [[ ${n_average} -ne 1 ]]; then
# Average models
dec_model=avg_${n_average}_checkpoint.pt
cmd="python ${root_dir}/scripts/average_checkpoints.py
--inputs ${model_dir}
--num-epoch-checkpoints ${n_average}
--output ${model_dir}/${dec_model}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval $cmd
else
dec_model=${dec_model}
fi
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then
device=""
else
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
fi
export CUDA_VISIBLE_DEVICES=${device}
result_file=${model_dir}/decode_result
[[ -f ${result_file} ]] && rm ${result_file}
test_subset=${test_subset//,/ }
for subset in "${test_subset[@]}"; do
subset=${subset}_st
cmd="python ${root_dir}/fairseq_cli/generate.py
${data_dir}
--config-yaml ${data_config}
--gen-subset ${subset}
--task speech_to_text
--path ${model_dir}/${dec_model}
--results-path ${model_dir}
--max-tokens ${max_tokens}
--beam ${beam_size}
--lenpen ${len_penalty}
--scoring sacrebleu"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
if [[ $eval -eq 1 ]]; then
eval $cmd
tail -n 1 ${model_dir}/generate-${subset}.txt >> ${result_file}
fi
done
cat ${result_file}
fi
#! /bin/bash
# training the model
gpu_num=8
update_freq=1
max_tokens=40000
extra_tag=
extra_parameter=
#extra_tag="${extra_tag}"
#extra_parameter="${extra_parameter} "
exp_tag=
#config_list=(base)
config_list=(ctc)
#config_list=(sate_ctc)
#config_list=(ctc conformer rpr)
#config_list=(base sate)
#config_list=(pds_base)
#config_list=(pds_base conformer)
# exp full name
exp_name=
train_config=$(echo ${config_list[*]} | sed 's/ /,/g')
cmd="./run.sh
--stage 1
--stop_stage 1
--gpu_num ${gpu_num}
--update_freq ${update_freq}
--train_config ${train_config}
--max_tokens ${max_tokens}
"
if [[ -n ${exp_name} ]]; then
cmd="$cmd --exp_name ${exp_name}"
fi
if [[ -n ${exp_tag} ]]; then
cmd="$cmd --exp_tag ${exp_tag}"
fi
if [[ -n ${extra_tag} ]]; then
cmd="$cmd --extra_tag ${extra_tag}"
fi
if [[ -n ${extra_parameter} ]]; then
cmd="$cmd --extra_parameter \"${extra_parameter}\""
fi
echo ${cmd}
eval ${cmd}
......@@ -13,6 +13,8 @@ from itertools import groupby
from tempfile import NamedTemporaryFile
import string
import csv
import yaml
import copy
import numpy as np
import pandas as pd
......@@ -50,8 +52,6 @@ class AudioDataset(Dataset):
tokenizer: bool = False) -> None:
_root = Path(root) / "data" / split
wav_root, txt_root = _root / "wav", _root / "txt"
if tokenizer:
txt_root = _root / "txt.tok"
assert wav_root.is_dir() and txt_root.is_dir(), (_root, wav_root, txt_root)
self.use_raw = use_raw
......@@ -62,25 +62,30 @@ class AudioDataset(Dataset):
yaml_file = txt_root / f"{split}.yaml"
if yaml_file.is_file():
self.mode = "yaml"
try:
import yaml
except ImportError:
print("Please install PyYAML to load the MuST-C YAML files")
with open(yaml_file) as f:
segments = yaml.load(f, Loader=yaml.BaseLoader)
total_length = len(segments)
if 0 < self.size < total_length:
segments = segments[:self.size]
# for idx, seg in enumerate(content):
# segments[idx] = seg
# if 0 < self.size < idx:
# break
else:
self.mode = "easy"
segments = dict()
audio_file = txt_root / f"{split}.audio"
assert audio_file.is_file(), audio_file
with open(audio_file) as f:
audios = [line.strip() for line in f.readlines()]
total_length = len(audios)
segments = dict()
if 0 < self.size < total_length:
audios = audios[:self.size]
for idx, audio in enumerate(audios):
segments[idx] = {"audio": audio}
if 0 < self.size < idx:
break
# Load source and target utterances
self.have_src_utt = False
......@@ -88,31 +93,44 @@ class AudioDataset(Dataset):
for _lang in [src_lang, tgt_lang]:
if _lang is None:
continue
if Path.exists(txt_root / f"{split}.{_lang}"):
txt_path = txt_root / f"{split}.{_lang}"
if tokenizer:
txt_path = txt_root / f"{split}.{_lang}.tok"
if Path.exists(txt_path):
if _lang == src_lang:
self.have_src_utt = True
else:
self.have_tgt_utt = True
with open(txt_root / f"{split}.{_lang}") as f:
with open(txt_path) as f:
utterances = [r.strip() for r in f]
assert len(audios) == len(utterances)
assert total_length == len(utterances), (total_length, len(utterances))
if 0 < self.size < total_length:
utterances = utterances[:self.size]
for idx, u in enumerate(utterances):
segments[idx][_lang] = u
if 0 < self.size < idx:
break
# Gather info
self.data = dict()
if self.mode == "easy":
real_idx = 0
for idx, v in segments.items():
audio_name = v["audio"]
v["audio"] = (wav_root / v["audio"].strip()).as_posix() + ".wav"
if self.speed_perturb is not None:
for perturb in self.speed_perturb:
v["sp"] = perturb
v["id"] = f"{v['audio']}_sp{perturb}"
sp_item = copy.deepcopy(v)
sp_item["perturb"] = perturb
sp_item["id"] = f"{audio_name}_sp{perturb}"
self.data[real_idx] = sp_item
real_idx += 1
else:
v["id"] = v['audio']
v["audio"] = (wav_root / v["audio"].strip()).as_posix() + ".wav"
self.data[idx] = v
v["id"] = audio_name
self.data[real_idx] = v
real_idx += 1
if 0 < self.size <= real_idx:
break
elif self.mode == "yaml":
idx = 0
......@@ -129,14 +147,14 @@ class AudioDataset(Dataset):
item["audio"] = wav_path.as_posix()
item["offset"] = offset
item["n_frames"] = n_frames
item["sample_rate"] = sample_rate,
item["sample_rate"] = sample_rate
item[src_lang] = segment[src_lang]
if tgt_lang is not None:
item[tgt_lang] = segment[tgt_lang]
if self.speed_perturb is not None:
for perturb in self.speed_perturb:
sp_item = item
sp_item = copy.deepcopy(item)
sp_item["id"] = f"{_id}_sp{perturb}"
sp_item["perturb"] = perturb
self.data[idx] = sp_item
......@@ -145,7 +163,7 @@ class AudioDataset(Dataset):
item["id"] = _id
self.data[idx] = item
idx += 1
if 0 < self.size < idx:
if 0 < self.size <= idx:
break
def __getitem__(self, n: int):
......@@ -155,7 +173,7 @@ class AudioDataset(Dataset):
item = self.data[n]
audio = item["audio"]
if getattr(item, "n_frames", False) and getattr(item, "sample_rate", False):
if item.get("n_frames", False) and item.get("sample_rate", False):
n_frames = item["n_frames"]
sample_rate = item["sample_rate"]
else:
......@@ -164,16 +182,19 @@ class AudioDataset(Dataset):
n_frames = info.num_frames
waveform = None
if item.get("perturb", False):
n_frames = n_frames / item['perturb']
if need_waveform:
if getattr(item, "offset", False):
offset = item.get('offset', False)
if offset:
waveform, sample_rate = torchaudio.load(audio,
frame_offset=item["sample_rate"],
frame_offset=offset,
num_frames=item["n_frames"])
else:
waveform, sample_rate = torchaudio.load(audio)
if getattr(item, "perturb", False):
n_frames = n_frames / item['perturb']
if item.get("perturb", False):
effects = [
["speed", f"{item['perturb']}"],
["rate", f"{sample_rate}"]
......@@ -204,6 +225,7 @@ def process(args):
output_root = Path(args.output_root).absolute()
# Extract features
datasets = dict()
use_raw = args.raw
size = args.size
if args.speed_perturb:
......@@ -233,6 +255,8 @@ def process(args):
src_lang, tgt_lang, split,
args.speed_perturb, size, use_raw,
args.tokenizer)
if split not in datasets:
datasets[split] = dataset
if is_train_split and args.cmvn_type == "global":
print("And estimating cepstral mean and variance stats...")
......@@ -287,10 +311,13 @@ def process(args):
is_train_split = split.startswith("train")
manifest = {c: [] for c in MANIFEST_COLUMNS}
dataset = AudioDataset(root.as_posix(),
src_lang, tgt_lang, split,
args.speed_perturb, size, use_raw,
args.tokenizer)
if split in datasets:
dataset = datasets[split]
else:
dataset = AudioDataset(root.as_posix(),
src_lang, tgt_lang, split,
args.speed_perturb, size, use_raw,
args.tokenizer)
if args.task == "st" and args.add_src and dataset.have_src_utt:
manifest["src_text"] = []
for idx in tqdm(range(len(dataset))):
......@@ -303,7 +330,7 @@ def process(args):
audio_path = item["audio"]
# add offset and frames info
if getattr(item, "offset", False):
if item.get("offset", False):
audio_path = f"{audio_path}:{item['offset']}:{n_frames}"
manifest["audio"].append(audio_path)
else:
......@@ -371,7 +398,7 @@ def process(args):
if len(train_text) == 0:
print("Loading the training text to build dictionary...")
for split in args.SPLITS:
for split in splits:
if split.startswith("train"):
csv_path = output_root / f"{split}.tsv"
with open(csv_path) as f:
......@@ -384,18 +411,18 @@ def process(args):
quoting=csv.QUOTE_NONE,
)
if task == "st" and args.add_src and args.share:
for e in reader:
src_utt = dict(e)["src_text"]
if args.lowercase_src:
src_utt = src_utt.lower()
if args.rm_punc_src:
for w in string.punctuation:
src_utt = src_utt.replace(w, "")
src_utt = " ".join(src_utt.split(" "))
train_text.append(src_utt)
tgt_text = [dict(e)["tgt_text"] for e in reader]
train_text.extend(tgt_text)
if task == "st" and args.add_src and args.share:
for e in reader:
src_utt = dict(e)["src_text"]
if args.lowercase_src:
src_utt = src_utt.lower()
if args.rm_punc_src:
for w in string.punctuation:
src_utt = src_utt.replace(w, "")
src_utt = " ".join(src_utt.split(" "))
train_text.append(src_utt)
tgt_text = [dict(e)["tgt_text"] for e in reader]
train_text.extend(tgt_text)
with NamedTemporaryFile(mode="w") as f:
for t in train_text:
......
......@@ -33,8 +33,8 @@ class MTDataset(Dataset):
"""
def __init__(self, root: str, src_lang, tgt_lang: str, split: str) -> None:
_root = Path(root) / "data"
txt_root = _root
_root = Path(root) / "data" / split
txt_root = _root / "txt" if (_root / "txt").is_dir() else _root
assert _root.is_dir() and txt_root.is_dir(), (_root, txt_root)
# Load source and target text
self.data = []
......
......@@ -23,7 +23,6 @@ from fairseq.file_io import PathManager
from fairseq.models import FairseqDecoder, FairseqEncoder
from omegaconf import Container, DictConfig, open_dict, OmegaConf
logger = logging.getLogger(__name__)
......@@ -62,23 +61,28 @@ def save_checkpoint(cfg: CheckpointConfig, trainer, epoch_itr, val_loss):
suffix = trainer.checkpoint_suffix
checkpoint_conds = collections.OrderedDict()
checkpoint_conds["checkpoint{}{}.pt".format(epoch, suffix)] = (
end_of_epoch and not cfg.no_epoch_checkpoints and epoch % cfg.save_interval == 0
end_of_epoch and not cfg.no_epoch_checkpoints and epoch % cfg.save_interval == 0
)
checkpoint_conds["checkpoint_{}_{}{}.pt".format(epoch, updates, suffix)] = (
not end_of_epoch
and cfg.save_interval_updates > 0
and updates % cfg.save_interval_updates == 0
not end_of_epoch
and cfg.save_interval_updates > 0
and updates % cfg.save_interval_updates == 0
)
checkpoint_conds["checkpoint_best{}.pt".format(suffix)] = val_loss is not None and (
not hasattr(save_checkpoint, "best")
or is_better(val_loss, save_checkpoint.best)
not hasattr(save_checkpoint, "best")
or is_better(val_loss, save_checkpoint.best)
)
if val_loss is not None and cfg.keep_best_checkpoints > 0:
if not end_of_epoch and cfg.save_interval_updates > 0 and updates % cfg.save_interval_updates == 0:
epoch_or_step = updates
else:
epoch_or_step = epoch
checkpoint_conds[
"checkpoint.best_{}_{:.2f}.pt".format(cfg.best_checkpoint_metric, val_loss)
] = not hasattr(save_checkpoint, "best") or is_better(
val_loss, save_checkpoint.best
)
"checkpoint.best_{}_{}_{:.3f}.pt".format(cfg.best_checkpoint_metric, epoch_or_step, val_loss)
] = True
# not hasattr(save_checkpoint, "best") or is_better(
# val_loss, save_checkpoint.best
# )
checkpoint_conds[
"checkpoint_last{}.pt".format(suffix)
] = not cfg.no_last_checkpoints
......@@ -117,14 +121,14 @@ def save_checkpoint(cfg: CheckpointConfig, trainer, epoch_itr, val_loss):
checkpoints = checkpoint_paths(
cfg.save_dir, pattern=r"checkpoint_\d+_(\d+)\.pt"
)
for old_chk in checkpoints[cfg.keep_interval_updates :]:
for old_chk in checkpoints[cfg.keep_interval_updates:]:
if os.path.lexists(old_chk):
os.remove(old_chk)
if cfg.keep_last_epochs > 0:
# remove old epoch checkpoints; checkpoints are sorted in descending order
checkpoints = checkpoint_paths(cfg.save_dir, pattern=r"checkpoint(\d+)\.pt")
for old_chk in checkpoints[cfg.keep_last_epochs :]:
for old_chk in checkpoints[cfg.keep_last_epochs:]:
if os.path.lexists(old_chk):
os.remove(old_chk)
......@@ -132,13 +136,13 @@ def save_checkpoint(cfg: CheckpointConfig, trainer, epoch_itr, val_loss):
# only keep the best N checkpoints according to validation metric
checkpoints = checkpoint_paths(
cfg.save_dir,
pattern=r"checkpoint\.best_{}_(\d+\.?\d*)\.pt".format(
pattern=r"checkpoint\.best_{}_\d+_(\d+\.?\d*)\.pt".format(
cfg.best_checkpoint_metric
),
)
if not cfg.maximize_best_checkpoint_metric:
checkpoints = checkpoints[::-1]
for old_chk in checkpoints[cfg.keep_best_checkpoints :]:
for old_chk in checkpoints[cfg.keep_best_checkpoints:]:
if os.path.lexists(old_chk):
os.remove(old_chk)
......@@ -158,7 +162,7 @@ def load_checkpoint(cfg: CheckpointConfig, trainer, **passthrough_args):
reset_dataloader = cfg.reset_dataloader
if cfg.finetune_from_model is not None and (
reset_optimizer or reset_lr_scheduler or reset_meters or reset_dataloader
reset_optimizer or reset_lr_scheduler or reset_meters or reset_dataloader
):
raise ValueError(
"--finetune-from-model can not be set together with either --reset-optimizer"
......@@ -167,7 +171,7 @@ def load_checkpoint(cfg: CheckpointConfig, trainer, **passthrough_args):
suffix = trainer.checkpoint_suffix
if (
cfg.restore_file == "checkpoint_last.pt"
cfg.restore_file == "checkpoint_last.pt"
): # default value of restore_file is 'checkpoint_last.pt'
checkpoint_path = os.path.join(
cfg.save_dir, "checkpoint_last{}.pt".format(suffix)
......@@ -210,10 +214,10 @@ def load_checkpoint(cfg: CheckpointConfig, trainer, **passthrough_args):
)
if (
extra_state is not None
and "best" in extra_state
and not reset_optimizer
and not reset_meters
extra_state is not None
and "best" in extra_state
and not reset_optimizer
and not reset_meters
):
save_checkpoint.best = extra_state["best"]
......@@ -297,13 +301,13 @@ def load_checkpoint_to_cpu(path, arg_overrides=None, load_on_all_ranks=False):
def load_model_ensemble(
filenames,
arg_overrides: Optional[Dict[str, Any]] = None,
task=None,
strict=True,
suffix="",
num_shards=1,
state=None,
filenames,
arg_overrides: Optional[Dict[str, Any]] = None,
task=None,
strict=True,
suffix="",
num_shards=1,
state=None,
):
"""Loads an ensemble of models.
......@@ -314,7 +318,7 @@ def load_model_ensemble(
task (fairseq.tasks.FairseqTask, optional): task to use for loading
"""
assert not (
strict and num_shards > 1
strict and num_shards > 1
), "Cannot load state dict with strict=True and checkpoint shards > 1"
ensemble, args, _task = load_model_ensemble_and_task(
filenames,
......@@ -329,20 +333,20 @@ def load_model_ensemble(
def load_model_ensemble_and_task(
filenames,
arg_overrides: Optional[Dict[str, Any]] = None,
task=None,
strict=True,
suffix="",
num_shards=1,
state=None,
filenames,
arg_overrides: Optional[Dict[str, Any]] = None,
task=None,
strict=True,
suffix="",
num_shards=1,
state=None,
):
assert state is None or len(filenames) == 1
from fairseq import tasks
assert not (
strict and num_shards > 1
strict and num_shards > 1
), "Cannot load state dict with strict=True and checkpoint shards > 1"
ensemble = []
cfg = None
......@@ -483,7 +487,7 @@ def _upgrade_state_dict(state):
state["optimizer_history"][-1]["num_updates"] = 0
# old model checkpoints may not have separate source/target positions
if "args" in state and hasattr(state["args"], "max_positions") and not hasattr(
state["args"], "max_source_positions"
state["args"], "max_source_positions"
):
state["args"].max_source_positions = state["args"].max_positions
state["args"].max_target_positions = state["args"].max_positions
......@@ -518,14 +522,14 @@ def _upgrade_state_dict(state):
del state["args"].min_lr
# binary_cross_entropy => wav2vec criterion
if (
hasattr(state["args"], "criterion")
and state["args"].criterion == "binary_cross_entropy"
hasattr(state["args"], "criterion")
and state["args"].criterion == "binary_cross_entropy"
):
state["args"].criterion = "wav2vec"
# speech_pretraining => audio pretraining
if (
hasattr(state["args"], "task")
and state["args"].task == "speech_pretraining"
hasattr(state["args"], "task")
and state["args"].task == "speech_pretraining"
):
state["args"].task = "audio_pretraining"
# audio_cpc => wav2vec
......@@ -536,9 +540,9 @@ def _upgrade_state_dict(state):
state["args"].lr = [state["args"].lr]
# convert task data arg to a string instead of List[string]
if (
hasattr(state["args"], "data")
and isinstance(state["args"].data, list)
and len(state["args"].data) > 0
hasattr(state["args"], "data")
and isinstance(state["args"].data, list)
and len(state["args"].data) > 0
):
state["args"].data = state["args"].data[0]
......@@ -549,23 +553,23 @@ def _upgrade_state_dict(state):
with open_dict(cfg):
# any upgrades for Hydra-based configs
if (
"task" in cfg
and "eval_wer_config" in cfg.task
and isinstance(cfg.task.eval_wer_config.print_alignment, bool)
"task" in cfg
and "eval_wer_config" in cfg.task
and isinstance(cfg.task.eval_wer_config.print_alignment, bool)
):
cfg.task.eval_wer_config.print_alignment = "hard"
if "generation" in cfg and isinstance(cfg.generation.print_alignment, bool):
cfg.generation.print_alignment = "hard"
if (
"model" in cfg
and "w2v_args" in cfg.model
and cfg.model.w2v_args is not None
and (
"model" in cfg
and "w2v_args" in cfg.model
and cfg.model.w2v_args is not None
and (
hasattr(cfg.model.w2v_args, "task") or "task" in cfg.model.w2v_args
)
and isinstance(
cfg.model.w2v_args.task.eval_wer_config.print_alignment, bool
)
)
and isinstance(
cfg.model.w2v_args.task.eval_wer_config.print_alignment, bool
)
):
cfg.model.w2v_args.task.eval_wer_config.print_alignment = "hard"
......@@ -644,9 +648,9 @@ def prune_state_dict(state_dict, model_cfg: Optional[DictConfig]):
layer_name
)
new_state_key = (
layer_name[: substitution_match.start(1)]
+ new_layer_number
+ layer_name[substitution_match.end(1) :]
layer_name[: substitution_match.start(1)]
+ new_layer_number
+ layer_name[substitution_match.end(1):]
)
new_state_dict[new_state_key] = state_dict[layer_name]
......@@ -666,7 +670,7 @@ def prune_state_dict(state_dict, model_cfg: Optional[DictConfig]):
def load_pretrained_component_from_model(
component: Union[FairseqEncoder, FairseqDecoder], checkpoint: str, strict: bool = True,
component: Union[FairseqEncoder, FairseqDecoder], checkpoint: str, strict: bool = True,
):
"""
Load a pretrained FairseqEncoder or FairseqDecoder from checkpoint into the
......@@ -699,7 +703,7 @@ def load_pretrained_component_from_model(
for key in state["model"].keys():
if key.startswith(component_type):
# encoder.input_layers.0.0.weight --> input_layers.0.0.weight
component_subkey = key[len(component_type) + 1 :]
component_subkey = key[len(component_type) + 1:]
component_state_dict[component_subkey] = state["model"][key]
mismatch_keys = []
......
......@@ -251,7 +251,7 @@ class CtcCriterion(FairseqCriterion):
if c_total > 0:
metrics.log_derived(
"uer",
"cer",
lambda meters: safe_round(
meters["_c_errors"].sum * 100.0 / meters["_c_total"].sum, 3
)
......
......@@ -92,14 +92,14 @@ class LabelSmoothedCrossEntropyCriterionWithCTC(
logging_output["total"] = utils.item(total.data)
if self.ctc_weight > 0:
ctc_loss = self.compute_ctc_loss(model, sample, encoder_out)
ctc_loss, logging_output = self.compute_ctc_loss(model, sample, encoder_out, logging_output)
logging_output["ctc_loss"] = utils.item(ctc_loss.data)
loss = (1 - self.ctc_weight) * loss + self.ctc_weight * ctc_loss
logging_output["loss"] = utils.item(loss.data) if reduce else loss.data
return loss, sample_size, logging_output
def compute_ctc_loss(self, model, sample, encoder_out):
def compute_ctc_loss(self, model, sample, encoder_out, logging_output):
transcript = sample["transcript"]
ctc_logit = encoder_out["ctc_logit"][0]
lprobs = model.get_normalized_probs(
......@@ -124,9 +124,7 @@ class LabelSmoothedCrossEntropyCriterionWithCTC(
transcript_lengths,
)
logging_output = {
"ctc_loss": utils.item(loss.data), # * sample['ntokens'],
}
logging_output["ctc_loss"] = utils.item(loss.data)
if not model.training:
import editdistance
......@@ -182,7 +180,7 @@ class LabelSmoothedCrossEntropyCriterionWithCTC(
logging_output["c_errors"] = c_err
logging_output["c_total"] = c_len
return loss
return loss, logging_output
@staticmethod
def reduce_metrics(logging_outputs) -> None:
......@@ -205,18 +203,20 @@ class LabelSmoothedCrossEntropyCriterionWithCTC(
metrics.log_scalar(
"loss", loss_sum / sample_size / math.log(2), sample_size, round=3
)
metrics.log_scalar(
"trans_loss", trans_loss_sum / ntokens / math.log(2), ntokens, round=3
)
if trans_loss_sum != loss_sum:
metrics.log_scalar(
"trans_loss", trans_loss_sum / ntokens / math.log(2), ntokens, round=3
)
metrics.log_scalar(
"nll_loss", nll_loss_sum / ntokens / math.log(2), ntokens, round=3
)
metrics.log_scalar(
"ctc_loss",
ctc_loss_sum / sample_size / math.log(2),
sample_size,
round=3,
)
if ctc_loss_sum > 0:
metrics.log_scalar(
"ctc_loss",
ctc_loss_sum / sample_size / math.log(2),
sample_size,
round=3,
)
metrics.log_derived(
"ppl", lambda meters: utils.get_perplexity(meters["nll_loss"].avg)
)
......@@ -250,7 +250,7 @@ class LabelSmoothedCrossEntropyCriterionWithCTC(
if c_total > 0:
metrics.log_derived(
"uer",
"cer",
lambda meters: safe_round(
meters["_c_errors"].sum * 100.0 / meters["_c_total"].sum, 3
)
......
......@@ -6,7 +6,10 @@ import torch
import torchaudio
def get_waveform(
path_or_fp: Union[str, BinaryIO], normalization=True
path_or_fp: Union[str, BinaryIO],
normalization=True,
offset=None,
size=None
) -> Tuple[np.ndarray, int]:
"""Get the waveform and sample rate of a 16-bit mono-channel WAV or FLAC.
......@@ -19,7 +22,10 @@ def get_waveform(
if ext not in {".flac", ".wav"}:
raise ValueError(f"Unsupported audio format: {ext}")
waveform, sample_rate = torchaudio.load(path_or_fp)
if offset is not None and size is not None:
waveform, sample_rate = torchaudio.load(path_or_fp, frame_offset=offset, num_frames=size)
else:
waveform, sample_rate = torchaudio.load(path_or_fp)
waveform = waveform.squeeze().numpy()
if not normalization:
......@@ -73,12 +79,17 @@ def _get_torchaudio_fbank(waveform, sample_rate, n_bins=80) -> Optional[np.ndarr
return None
def get_fbank(path_or_fp: Union[str, BinaryIO], n_bins=80) -> np.ndarray:
def get_fbank(
path_or_fp: Union[str, BinaryIO],
n_bins=80,
offset=None,
size=None,
) -> np.ndarray:
"""Get mel-filter bank features via PyKaldi or TorchAudio. Prefer PyKaldi
(faster CPP implementation) to TorchAudio (Python implementation). Note that
Kaldi/TorchAudio requires 16-bit signed integers as inputs and hence the
waveform should not be normalized."""
sound, sample_rate = get_waveform(path_or_fp, normalization=False)
sound, sample_rate = get_waveform(path_or_fp, normalization=False, offset=offset, size=size)
features = _get_kaldi_fbank(sound, sample_rate, n_bins)
if features is None:
......
......@@ -182,6 +182,14 @@ def get_features_or_waveform_from_uncompressed_zip(
return features_or_waveform
def get_features_or_waveform_from_audio(
path, offset, size, need_waveform=False
):
assert path.endswith(".wav")
features_or_waveform = get_waveform(path, offset=offset, size=size)[0] if need_waveform \
else get_fbank(path, offset=offset, size=size)
return features_or_waveform
def get_features_or_waveform(path: str, need_waveform=False):
"""Get speech features from .npy file or waveform from .wav/.flac file.
The file may be inside an uncompressed ZIP file and is accessed via byte
......@@ -205,9 +213,14 @@ def get_features_or_waveform(path: str, need_waveform=False):
return get_features_from_npy_or_audio(_path)
elif len(extra) == 2:
extra = [int(i) for i in extra]
features_or_waveform = get_features_or_waveform_from_uncompressed_zip(
_path, extra[0], extra[1], need_waveform=need_waveform
)
if _path.endswith('.zip'):
features_or_waveform = get_features_or_waveform_from_uncompressed_zip(
_path, extra[0], extra[1], need_waveform=need_waveform
)
else:
features_or_waveform = get_features_or_waveform_from_audio(
_path, extra[0], extra[1], need_waveform=need_waveform
)
else:
raise ValueError(f"Invalid path: {path}")
......
......@@ -54,12 +54,12 @@ class DLCLTransformerModel(TransformerModel):
TransformerModel.add_args(parser)
# dense layer parameters
parser.add_argument('--encoder-history-type',
default="learnable_dense",
help='encoder layer history type')
parser.add_argument('--decoder-history-type',
default="learnable_dense",
help='decoder layer history type')
# parser.add_argument('--encoder-history-type',
# default="learnable_dense",
# help='encoder layer history type')
# parser.add_argument('--decoder-history-type',
# default="learnable_dense",
# help='decoder layer history type')
parser.add_argument('--encoder-integration-type', choices=['avg', 'sum'],
help='encoder layer integration type')
parser.add_argument('--decoder-integration-type', choices=['avg', 'sum'],
......
......@@ -463,7 +463,11 @@ class PDSS2TTransformerModel(S2TTransformerModel):
type=float,
help="dropout in each stage",
)
parser.add_argument(
"--ctc-layer",
type=int,
help="the layer of ctc",
)
pass
@classmethod
......@@ -516,8 +520,9 @@ class PDSS2TTransformerEncoder(FairseqEncoder):
else:
self.pds_attn_ds_ratios = None
self.fusion = getattr(args, "pds_fusion", False)
self.pds_fusion_method = getattr(args, "pds_fusion_method", "all_conv")
self.pds_fusion = args.pds_fusion
self.pds_fusion_method = args.pds_fusion_method
self.pds_fusion_transform = "conv"
if len(self.pds_fusion_method.split("_")) == 2:
items = self.pds_fusion_method.split("_")
......@@ -525,10 +530,10 @@ class PDSS2TTransformerEncoder(FairseqEncoder):
self.pds_fusion_transform = items[1]
fusion_stages_num = 0
if self.fusion:
if self.pds_fusion_way == "all":
if self.pds_fusion:
if self.pds_fusion_method == "all":
fusion_stages_num = self.pds_stages
elif self.pds_fusion_way == "same":
elif self.pds_fusion_method == "same":
for dim in self.pds_embed_dims:
if dim == self.embed_dim:
fusion_stages_num += 1
......@@ -555,7 +560,7 @@ class PDSS2TTransformerEncoder(FairseqEncoder):
"fusion {}, fusion method {}, fusion transformer {}.".
format(i, num_layers, ds_ratio, embed_dim,
kernel_size, use_pos_embed, ffn_ratio, num_head,
self.fusion, self.pds_fusion_method, self.pds_fusion_transform))
self.pds_fusion, self.pds_fusion_method, self.pds_fusion_transform))
if i == 0:
self.embed_scale = math.sqrt(embed_dim)
......@@ -588,7 +593,7 @@ class PDSS2TTransformerEncoder(FairseqEncoder):
self.pds_fusion_method == "same" and self.embed_dim == embed_dim
):
if i != self.pds_stages - 1:
ratio = reduce(lambda a, b: a * b, self.pds_sr_ratios[i + 1:])
ratio = reduce(lambda a, b: a * b, self.pds_ratios[i + 1:])
else:
ratio = 1
......@@ -636,8 +641,10 @@ class PDSS2TTransformerEncoder(FairseqEncoder):
(("ctc" in getattr(args, "criterion", False)) and
(getattr(args, "ctc_weight", False) > 0))
if self.use_ctc:
self.ctc_layer = (args.encoder_layers + args.ctc_layer) % args.encoder_layers
self.inter_ctc = True if self.ctc_layer != args.encoder_layers else False
# self.ctc_layer = (args.encoder_layers + args.ctc_layer) % args.encoder_layers
# self.inter_ctc = True if self.ctc_layer != args.encoder_layers else False
self.ctc_layer = args.encoder_layers
self.inter_ctc = True if self.ctc_layer != 0 else False
if self.inter_ctc:
logger.info("Intermedia CTC loss in layer %d" % self.ctc_layer)
......@@ -655,7 +662,7 @@ class PDSS2TTransformerEncoder(FairseqEncoder):
dropout=args.dropout,
need_layernorm=True if self.inter_ctc else False)
if task.source_dictionary == task.target_dictionary:
if task.source_dictionary == task.target_dictionary and embed_tokens is not None:
self.ctc.ctc_projection.weight = embed_tokens.weight
if args.encoder_normalize_before:
......@@ -747,7 +754,7 @@ class PDSS2TTransformerEncoder(FairseqEncoder):
self.add_to_dict(x, dis, cos_sim_idx)
if self.use_ctc and self.inter_ctc and self.ctc_layer == layer_idx:
ctc_logit = self.CTC(x)
ctc_logit = self.ctc(x.clone())
prev_state.append(x)
prev_padding.append(encoder_padding_mask)
......@@ -787,7 +794,7 @@ class PDSS2TTransformerEncoder(FairseqEncoder):
return {
"encoder_out": [x], # T x B x C
"ctc_logit": [ctc_logit], # T x B x C
"ctc_logit": [] if ctc_logit is None else [ctc_logit], # T x B x C
"encoder_padding_mask": [encoder_padding_mask], # B x T
"encoder_embedding": [], # B x T x C
"encoder_states": [], # List[T x B x C]
......@@ -800,10 +807,9 @@ class PDSS2TTransformerEncoder(FairseqEncoder):
[] if len(encoder_out["encoder_out"]) == 0
else [x.index_select(1, new_order) for x in encoder_out["encoder_out"]]
)
new_ctc_logit = (
[] if len(encoder_out["ctc_logit"]) == 0
else [x.index_select(1, new_order) for x in encoder_out["ctc_logit"]]
else [x.index_select(1, new_order) for x in encoder_out["ctc_logit"] if x is not None]
)
new_encoder_padding_mask = (
......@@ -823,7 +829,7 @@ class PDSS2TTransformerEncoder(FairseqEncoder):
return {
"encoder_out": new_encoder_out, # T x B x C
"ctc_logit": [new_ctc_logit], # T x B x C
"ctc_logit": new_ctc_logit, # T x B x C
"encoder_padding_mask": new_encoder_padding_mask, # B x T
"encoder_embedding": new_encoder_embedding, # B x T x C
"encoder_states": encoder_states, # List[T x B x C]
......@@ -901,8 +907,8 @@ def base_architecture(args):
args.ctc_layer = getattr(args, "ctc_layer", 0)
args.pds_dropout = getattr(args, "pds_dropout", args.dropout)
args.fusion = getattr(args, "fusion", False)
args.fusion_method = getattr(args, "fusion_method", "all_conv")
args.pds_fusion = getattr(args, "pds_fusion", False)
args.pds_fusion_method = getattr(args, "pds_fusion_method", "all_conv")
def set_pds_base_8(args):
......
......@@ -130,8 +130,8 @@ class S2TSATEModel(S2TTransformerModel):
component=encoder.text_encoder, checkpoint=args.load_pretrained_text_encoder_from, strict=False
)
if args.share_ctc_and_adapter and hasattr(encoder.adapter, "linear_adapter"):
encoder.acoustic_encoder.ctc_projection.weight = encoder.adapter.linear_adapter[0].weight
if args.share_ctc_and_adapter and hasattr(encoder.adapter, "embed_adapter"):
encoder.acoustic_encoder.ctc.ctc_projection.weight = encoder.adapter.embed_adapter.weight
return encoder
......@@ -175,10 +175,7 @@ class Adapter(nn.Module):
self.gate_linear1 = nn.Linear(embed_dim, embed_dim)
self.gate_linear2 = nn.Linear(embed_dim, embed_dim)
self.out_layernorm = LayerNorm(embed_dim)
# self.out_layernorm = nn.Identity()
def forward(self, x, padding):
def forward(self, x, padding):
representation, distribution = x
batch, seq_len, embed_dim = representation.size()
......@@ -188,30 +185,25 @@ class Adapter(nn.Module):
if self.adapter_type == "linear":
out = self.linear_adapter(representation)
out = self.out_layernorm(out)
elif self.adapter_type == "context":
out = torch.mm(distribution, self.embed_adapter.weight).view(batch, seq_len, -1)
out = self.out_layernorm(out)
elif self.adapter_type == "subsample":
representation = representation.transpose(0, 1)
out, input_lengths = self.subsample_adaptor(representation, lengths)
padding = lengths_to_padding_mask(input_lengths)
out = self.out_layernorm(out)
elif self.adapter_type == "league":
linear_out = self.linear_adapter(representation)
soft_out = torch.mm(distribution, self.embed_adapter.weight).view(batch, seq_len, -1)
out = linear_out + soft_out
out = self.out_layernorm(out)
elif self.adapter_type == "gated_league":
linear_out = self.linear_adapter(representation)
soft_out = torch.mm(distribution, self.embed_adapter.weight).view(batch, seq_len, -1)
coef = (self.gate_linear(torch.cat([linear_out, soft_out], dim=-1))).sigmoid()
out = coef * linear_out + (1 - coef) * soft_out
out = self.out_layernorm(out)
elif self.adapter_type == "none":
out = representation
......@@ -223,10 +215,10 @@ class Adapter(nn.Module):
return out, padding
class TextEncoder(nn.Module):
class TextEncoder(FairseqEncoder):
def __init__(self, args, dictionary):
super().__init__()
super().__init__(None)
self.embed_tokens = None
......@@ -285,9 +277,9 @@ class S2TSATEEncoder(FairseqEncoder):
# acoustic encoder
acoustic_encoder_type = args.acoustic_encoder
if acoustic_encoder_type == "transformer":
self.acoustic_encoder = S2TTransformerEncoder(args, task, embed_tokens)
self.acoustic_encoder = S2TTransformerEncoder(args, task)
elif acoustic_encoder_type == "pds":
self.acoustic_encoder = PDSS2TTransformerEncoder(args, task, embed_tokens)
self.acoustic_encoder = PDSS2TTransformerEncoder(args, task)
else:
logging.error("Unsupported model arch {}!".format(acoustic_encoder_type))
......@@ -295,8 +287,8 @@ class S2TSATEEncoder(FairseqEncoder):
self.temperature = args.temperature
self.adapter = Adapter(args, task.source_dictionary, embed_tokens)
if args.share_ctc_and_adapter and hasattr(self.adapter, "linear_adapter"):
self.acoustic_encoder.ctc_projection.weight = self.adapter.linear_adapter[0].weight
if args.share_ctc_and_adapter and hasattr(self.adapter, "embed_adapter"):
self.acoustic_encoder.ctc.ctc_projection.weight = self.adapter.embed_adapter.weight
# self.length_adapter = Conv1dSubsampler(
# args.encoder_embed_dim,
......@@ -463,6 +455,7 @@ def base_architecture(args):
args.temperature = getattr(args, "temperature", 1.0)
args.text_encoder_layers = getattr(args, "text_encoder_layers", 6)
args.text_attention_type = getattr(args, "text_attention_type", "selfattn")
args.share_ctc_and_adapter = getattr(args, "share_ctc_and_adapter", False)
# PDS
args.pds_stages = getattr(args, "pds_stages", None)
......
......@@ -511,8 +511,8 @@ class S2TTransformerEncoder(FairseqEncoder):
self.use_ctc = "sate" in args.arch or \
(("ctc" in getattr(args, "criterion", "")) and (getattr(args, "ctc_weight", 0) > 0))
if self.use_ctc:
self.ctc_layer = (args.encoder_layers + args.ctc_layer) % args.encoder_layers
self.inter_ctc = True if self.ctc_layer != args.encoder_layers else False
self.ctc_layer = args.ctc_layer
self.inter_ctc = True if self.ctc_layer != 0 and self.ctc_layer != args.encoder_layers else False
if self.inter_ctc:
logger.info("Intermedia CTC loss in layer %d" % self.ctc_layer)
self.ctc = CTC(args.encoder_embed_dim,
......@@ -520,7 +520,7 @@ class S2TTransformerEncoder(FairseqEncoder):
dropout=args.dropout,
need_layernorm=True if self.inter_ctc else False)
if task.source_dictionary == task.target_dictionary:
if task.source_dictionary == task.target_dictionary and embed_tokens is not None:
self.ctc.ctc_projection.weight = embed_tokens.weight
self.interleaved_dropout = getattr(args, "interleave_dropout", None)
......@@ -554,7 +554,6 @@ class S2TTransformerEncoder(FairseqEncoder):
def forward(self, src_tokens, src_lengths):
ctc_input = None
if self.history is not None:
self.history.clean()
......@@ -632,7 +631,7 @@ class S2TTransformerEncoder(FairseqEncoder):
return {
"encoder_out": [x], # T x B x C
"ctc_logit": [ctc_logit], # B x T x C
"ctc_logit": [] if ctc_logit is None else [ctc_logit], # B x T x C
"encoder_padding_mask": [encoder_padding_mask], # B x T
"encoder_embedding": [], # B x T x C
"encoder_states": [], # List[T x B x C]
......@@ -648,7 +647,7 @@ class S2TTransformerEncoder(FairseqEncoder):
new_ctc_logit = (
[] if len(encoder_out["ctc_logit"]) == 0
else [x.index_select(1, new_order) for x in encoder_out["ctc_logit"]]
else [x.index_select(1, new_order) for x in encoder_out["ctc_logit"] if x is not None]
)
new_encoder_padding_mask = (
......
......@@ -218,6 +218,28 @@ class TransformerModel(FairseqEncoderDecoderModel):
],
help="transformer decoder self-attention layer type"
)
parser.add_argument(
"--use-enc-dlcl",
default=False,
action='store_true',
help="use dlcl encoder",
)
parser.add_argument(
"--use-dec-dlcl",
default=False,
action='store_true',
help="use dlcl encoder",
)
parser.add_argument(
'--encoder-history-type',
default="learnable_dense",
help='encoder layer history type'
)
parser.add_argument(
'--decoder-history-type',
default="learnable_dense",
help='decoder layer history type'
)
parser.add_argument('--max-encoder-relative-length', type=int, default=-1,
help='the max encoder relative length')
parser.add_argument('--max-decoder-relative-length', type=int, default=-1,
......@@ -474,6 +496,11 @@ class TransformerEncoder(FairseqEncoder):
else:
self.layer_norm = None
if getattr(args, "use_enc_dlcl", False):
self.history = CreateLayerHistory(args, is_encoder=True)
else:
self.history = None
def build_encoder_layer(self, args):
layer = TransformerEncoderLayer(args)
if getattr(args, "checkpoint_activations", False):
......@@ -571,6 +598,9 @@ class TransformerEncoder(FairseqEncoder):
encoder_padding_mask = src_tokens.eq(self.padding_idx)
has_pads = (src_tokens.device.type == "xla" or encoder_padding_mask.any())
if self.history is not None:
self.history.clean()
x, encoder_embedding = self.forward_embedding(src_tokens, token_embeddings)
# account for padding while computing the representation
......@@ -585,8 +615,15 @@ class TransformerEncoder(FairseqEncoder):
if return_all_hiddens:
encoder_states.append(x)
# add emb into history
if self.history is not None:
self.history.add(x)
# encoder layers
for layer in self.layers:
if self.history is not None:
x = self.history.pop()
x = layer(
x, encoder_padding_mask=encoder_padding_mask if has_pads else None
)
......@@ -594,6 +631,12 @@ class TransformerEncoder(FairseqEncoder):
assert encoder_states is not None
encoder_states.append(x)
if self.history is not None:
self.history.add(x)
if self.history is not None:
x = self.history.pop()
if self.layer_norm is not None:
x = self.layer_norm(x)
......
......@@ -67,6 +67,9 @@ def main(cfg: FairseqConfig) -> None:
# Print args
logger.info(cfg)
with open(os.path.join(cfg.checkpoint.save_dir, "config.yaml"), 'w') as f:
f.write("%s" % OmegaConf.to_yaml(cfg))
if cfg.checkpoint.write_checkpoints_asynchronously:
try:
import iopath # noqa: F401
......
......@@ -73,33 +73,38 @@ def average_checkpoints(inputs):
return new_state
def last_n_checkpoints(paths, n, update_based, upper_bound=None):
def last_n_checkpoints(paths, n, combine_choice, upper_bound=None, max_metric=False):
assert len(paths) == 1
path = paths[0]
if update_based:
reverse = True
if combine_choice == "update":
pt_regexp = re.compile(r"checkpoint_\d+_(\d+)\.pt")
elif combine_choice == "best":
reverse = True if max_metric else False
pt_regexp = re.compile(r"checkpoint\.best_loss_\d+_(\d+\.?\d*)\.pt")
else:
pt_regexp = re.compile(r"checkpoint(\d+)\.pt")
files = PathManager.ls(path)
entries = []
for f in files:
m = pt_regexp.fullmatch(f)
if m is not None:
sort_key = int(m.group(1))
sort_key = float(m.group(1))
if upper_bound is None or sort_key <= upper_bound:
entries.append((sort_key, m.group(0)))
if len(entries) < n:
raise Exception(
"Found {} checkpoint files but need at least {}", len(entries), n
)
return [os.path.join(path, x[1]) for x in sorted(entries, reverse=True)[:n]]
return [os.path.join(path, x[1]) for x in sorted(entries, reverse=reverse)[:n]]
def main():
parser = argparse.ArgumentParser(
description="Tool to average the params of input checkpoints to "
"produce a new checkpoint",
"produce a new checkpoint",
)
# fmt: off
parser.add_argument('--inputs', required=True, nargs='+',
......@@ -109,42 +114,55 @@ def main():
num_group = parser.add_mutually_exclusive_group()
num_group.add_argument('--num-epoch-checkpoints', type=int,
help='if set, will try to find checkpoints with names checkpoint_xx.pt in the path specified by input, '
'and average last this many of them.')
'and average last this many of them.')
num_group.add_argument('--num-update-checkpoints', type=int,
help='if set, will try to find checkpoints with names checkpoint_ee_xx.pt in the path specified by input, '
'and average last this many of them.')
'and average last this many of them.')
num_group.add_argument('--num-best-checkpoints', type=int,
help='if set, will try to find checkpoints with names checkpoint.best_{metric}_{epoch}_{performance}.pt in the path specified by input, '
'and average last this many of them.')
num_group.add_argument('--maximize-best-checkpoint-metric', default=False, action="store_true",
help='if set, will try to find checkpoints with names checkpoint.best_{metric}_{epoch}_{performance}.pt in the path specified by input, '
'and average last this many of them.')
parser.add_argument('--checkpoint-upper-bound', type=int,
help='when using --num-epoch-checkpoints, this will set an upper bound on which epoch to use, '
'when using --num-update-checkpoints, this will set an upper bound on which update to use'
'e.g., with --num-epoch-checkpoints=10 --checkpoint-upper-bound=50, checkpoints 41-50 would be averaged.'
'e.g., with --num-update-checkpoints=10 --checkpoint-upper-bound=50000, checkpoints 40500-50000 would be averaged assuming --save-interval-updates 500'
'when using --num-update-checkpoints, this will set an upper bound on which update to use'
'e.g., with --num-epoch-checkpoints=10 --checkpoint-upper-bound=50, checkpoints 41-50 would be averaged.'
'e.g., with --num-update-checkpoints=10 --checkpoint-upper-bound=50000, checkpoints 40500-50000 would be averaged assuming --save-interval-updates 500.'
'e.g., with --num-best-checkpoints=10 --checkpoint-upper-bound=5, checkpoints (loss) <= 5 would be averaged.'
)
# fmt: on
args = parser.parse_args()
print(args)
num = None
is_update_based = False
combine_choice = "epoch"
max_metric = args.maximize_best_checkpoint_metric
if args.num_update_checkpoints is not None:
num = args.num_update_checkpoints
is_update_based = True
combine_choice = "update"
elif args.num_epoch_checkpoints is not None:
num = args.num_epoch_checkpoints
elif args.num_best_checkpoints is not None:
num = args.num_best_checkpoints
combine_choice = "best"
assert args.checkpoint_upper_bound is None or (
args.num_epoch_checkpoints is not None
or args.num_update_checkpoints is not None
), "--checkpoint-upper-bound requires --num-epoch-checkpoints or --num-update-checkpoints"
args.num_epoch_checkpoints is not None
or args.num_update_checkpoints is not None
or args.num_best_checkpoints is not None
), "--checkpoint-upper-bound requires --num-epoch-checkpoints or --num-update-checkpoints or --num-best-checkpoints"
assert (
args.num_epoch_checkpoints is None or args.num_update_checkpoints is None
args.num_epoch_checkpoints is None or args.num_update_checkpoints is None
), "Cannot combine --num-epoch-checkpoints and --num-update-checkpoints"
if num is not None:
args.inputs = last_n_checkpoints(
args.inputs,
num,
is_update_based,
combine_choice,
upper_bound=args.checkpoint_upper_bound,
max_metric=max_metric
)
print("averaging checkpoints: ", args.inputs)
......
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论