fix the bugs

da4e7dc3 · xuchen · 093098e4 · da4e7dc3 · da4e7dc3 · da4e7dc3
Commit da4e7dc3 authored Dec 20, 2021 by xuchen
--- a/egs/libri_trans/asr/binary.sh
+++ b/egs/libri_trans/asr/binary.sh
--- a/egs/libri_trans/st/conf/base.yaml
+++ b/egs/libri_trans/st/conf/base.yaml
-train-subset: train_st
-valid-subset: dev_st
-
-max-epoch: 100
-max-update: 100000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
-
 arch: s2t_transformer_s
 share-decoder-input-output-embed: True
 optimizer: adam

--- a/egs/aishell/asr/conf/basis.yaml
+++ b/egs/aishell/asr/conf/basis.yaml
+train-subset: train
+valid-subset: dev
+
+max-epoch: 100
+max-update: 100000
+patience: 20
+best_checkpoint_metric: loss
+maximize_best_checkpoint_metric: False
+
+no-epoch-checkpoints: True
+#keep-last-epochs: 10
+keep-best-checkpoints: 10
+
+num-workers: 8
+no-progress-bar: True
+log-interval: 100
+seed: 1
+report-accuracy: True
+skip-invalid-size-inputs-valid-test: True
\ No newline at end of file
--- a/egs/libri_trans/asr/conf/conformer.yaml
+++ b/egs/libri_trans/asr/conf/conformer.yaml
--- a/egs/libri_trans/asr/conf/ctc.yaml
+++ b/egs/libri_trans/asr/conf/ctc.yaml
 ctc-weight: 0.3
+post-process: sentencepiece
\ No newline at end of file
--- a/egs/libri_trans/asr/conf/base.yaml
+++ b/egs/libri_trans/asr/conf/base.yaml
@@ -15,6 +15,7 @@ report-accuracy: True
 #load-pretrained-decoder-from:

 arch: s2t_transformer_s
+#arch: pdss2t_transformer_s
 share-decoder-input-output-embed: True
 optimizer: adam
 clip-norm: 10.0

--- a/egs/libri_trans/asr/conf/dlcl.yaml
+++ b/egs/libri_trans/asr/conf/dlcl.yaml
--- a/egs/libri_trans/asr/conf/local_attn.yaml
+++ b/egs/libri_trans/asr/conf/local_attn.yaml
--- a/egs/libri_trans/asr/conf/pds_base.yaml
+++ b/egs/libri_trans/asr/conf/pds_base.yaml
 arch: pdss2t_transformer_s_8
-
-train-subset: train_asr
-valid-subset: dev_asr
-
-max-epoch: 100
-max-update: 100000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
+pds-fusion: True
+ctc-layer: 12

 share-decoder-input-output-embed: True
 optimizer: adam

--- a/egs/template/st/conf/pds_base.yaml
+++ b/egs/template/st/conf/pds_base.yaml
-arch: pdss2t_transformer_s_8
-
-train-subset: train_st
-valid-subset: dev_st
-
-max-epoch: 100
-max-update: 100000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
+arch: pdss2t_transformer_s_16
+
+encoder-embed-dim: 256
+pds-stages: 4
+ctc-layer: 12
+pds-layers: 2_2_6_2
+pds-ratios: 2_2_2_2
+pds-fusion: True
+pds-fusion-method: all_conv
+pds-embed-dims: 256_256_256_256
+pds-ds-method: conv
+pds-embed-norm: True
+pds-position-embed: 1_1_1_1
+pds-kernel-sizes: 5_5_5_5
+pds-ffn-ratios: 8_8_8_8
+pds-attn-heads: 4_4_4_4

 share-decoder-input-output-embed: True
 optimizer: adam

--- a/egs/template/asr/conf/pds_base.yaml
+++ b/egs/template/asr/conf/pds_base.yaml
-arch: pdss2t_transformer_s_8
-
-train-subset: train_asr
-valid-subset: dev_asr
-
-max-epoch: 100
-max-update: 100000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
+arch: pdss2t_transformer_s_32
+
+encoder-embed-dim: 256
+pds-stages: 5
+ctc-layer: 12
+pds-layers: 2_2_3_3_2
+pds-ratios: 2_2_2_2_2
+pds-fusion: True
+pds-fusion-method: all_conv
+pds-embed-dims: 256_256_256_256_256
+pds-ds-method: conv
+pds-embed-norm: True
+pds-position-embed: 1_1_1_1_1
+pds-kernel-sizes: 5_5_5_5_5
+pds-ffn-ratios: 8_8_8_8_8
+pds-attn-heads: 4_4_4_4_4

 share-decoder-input-output-embed: True
 optimizer: adam

--- a/egs/libri_trans/st/conf/pds_base.yaml
+++ b/egs/libri_trans/st/conf/pds_base.yaml
 arch: pdss2t_transformer_s_8

-train-subset: train_st
-valid-subset: dev_st
-
-max-epoch: 100
-max-update: 100000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
+encoder-embed-dim: 256
+pds-stages: 4
+ctc-layer: 12
+pds-layers: 3_3_3_3
+pds-ratios: 2_2_1_2
+pds-fusion: True
+pds-fusion-method: all_conv
+pds-embed-dims: 256_256_256_256
+pds-ds-method: conv
+pds-embed-norm: True
+pds-position-embed: 1_1_1_1
+pds-kernel-sizes: 5_5_5_5
+pds-ffn-ratios: 8_8_8_8
+pds-attn-heads: 4_4_4_4

 share-decoder-input-output-embed: True
 optimizer: adam

--- a/egs/libri_trans/asr/conf/rpr.yaml
+++ b/egs/libri_trans/asr/conf/rpr.yaml
--- a/egs/libri_trans/asr/decode.sh
+++ b/egs/libri_trans/asr/decode.sh
@@ -10,6 +10,7 @@ if [ "$#" -eq 1 ]; then
    exp_name=$1
 fi

+cer=1
 n_average=10
 beam_size=5
 len_penalty=1.0
@@ -21,6 +22,7 @@ cmd="./run.sh
    --stop_stage 2
    --gpu_num ${gpu_num}
    --exp_name ${exp_name}
+    --cer ${cer}
    --n_average ${n_average}
    --beam_size ${beam_size}
    --len_penalty ${len_penalty}

--- a/egs/libri_trans/asr/local/monitor.sh
+++ b/egs/libri_trans/asr/local/monitor.sh
--- a/egs/libri_trans/asr/local/parse_options.sh
+++ b/egs/libri_trans/asr/local/parse_options.sh
--- a/egs/libri_trans/asr/local/utils.sh
+++ b/egs/libri_trans/asr/local/utils.sh
--- a/egs/libri_trans/asr/run.sh
+++ b/egs/libri_trans/asr/run.sh
@@ -24,31 +24,31 @@ stop_stage=0
 gpu_num=0
 update_freq=1

-s2t_dir=~/Code/st
-root_dir=${s2t_dir}/Fairseq-S2T
+root_dir=~/st
+code_dir=${root_dir}/Fairseq-S2T
 pwd_dir=$PWD

 # dataset
-src_lang=en
-tgt_lang=fr
-lang=${src_lang}-${tgt_lang}
+src_lang=zh
+lang=${src_lang}

-dataset=libri_trans
+dataset=aishell
 task=speech_to_text
 vocab_type=unigram
-vocab_size=1000
-speed_perturb=0
-lcrm=1
+vocab_type=char
+vocab_size=5000
+speed_perturb=1
+lcrm=0
 tokenizer=0
-use_raw_audio=1
+use_raw_audio=0

 use_specific_dict=0
 specific_prefix=st
-specific_dir=${s2t_dir}/data/mustc/st/en-de
+specific_dir=${root_dir}/data/mustc/st/en-de
 asr_vocab_prefix=spm_unigram10000_st_share

-org_data_dir=${s2t_dir}/data/${dataset}
-data_dir=${s2t_dir}/data/${dataset}/asr
+org_data_dir=${root_dir}/data/${dataset}
+data_dir=${root_dir}/data/${dataset}/asr
 train_split=train
 valid_split=dev
 test_split=test
@@ -71,6 +71,7 @@ max_tokens=40000
 step_valid=0

 # decoding setting
+cer=1
 dec_model=checkpoint_best.pt
 n_average=10
 beam_size=5
@@ -96,6 +97,10 @@ if [[ ${use_raw_audio} -eq 1 ]]; then
    data_dir=${data_dir}_raw
    exp_prefix=${exp_prefix}_raw
 fi
+if [[ "${vocab_type}" == "char" ]]; then
+    data_dir=${data_dir}_char
+    exp_prefix=${exp_prefix}_char
+fi

 . ./local/parse_options.sh || exit 1;

@@ -106,7 +111,7 @@ if [[ -z ${exp_name} ]]; then
        exp_name=${exp_name}_${extra_tag}
    fi
 fi
-model_dir=$root_dir/../checkpoints/$dataset/asr/${exp_name}
+model_dir=$code_dir/../checkpoints/$dataset/asr/${exp_name}

 if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
    echo "stage -1: Data Download"
@@ -120,13 +125,19 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    if [[ ! -e ${data_dir} ]]; then
        mkdir -p ${data_dir}
    fi
+    if [[ ! -f ${data_dir}/fbank80.zip && -f ${data_dir}/../fbank80.zip ]]; then
+        ln -s ${data_dir}/../fbank80.zip ${data_dir}
+    fi
+    if [[ ! -f ${data_dir}/fbank80_sp.zip && -f ${data_dir}/../fbank80_sp.zip ]]; then
+        ln -s ${data_dir}/../fbank80_sp.zip ${data_dir}
+    fi

-    cmd="python ${root_dir}/examples/speech_to_text/prep_audio_data.py
+    cmd="python ${code_dir}/examples/speech_to_text/prep_audio_data.py
        --data-root ${org_data_dir}
        --output-root ${data_dir}
        --task asr
-        --splits ${train_split},${valid_split},${test_split}
        --src-lang ${src_lang}
+        --splits ${valid_split},${test_split},${train_split}
        --vocab-type ${vocab_type}
        --vocab-size ${vocab_size}"

@@ -135,7 +146,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --raw"
    fi
    if [[ ${use_specific_dict} -eq 1 ]]; then
-        cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}/${lang}
+        cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}
        cmd="$cmd
        --asr-prefix ${asr_vocab_prefix}"
    fi
@@ -155,6 +166,15 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then

    echo -e "\033[34mRun command: \n${cmd} \033[0m"
    [[ $eval -eq 1 ]] && eval ${cmd}
+
+    if [[ ! -f ${data_dir}/../fbank80.zip ]]; then
+        mv ${data_dir}/fbank80.zip ${data_dir}/..
+        ln -s ${data_dir}/../fbank80.zip ${data_dir}
+    fi
+    if [[ ! -f ${data_dir}/../fbank80_sp.zip ]]; then
+        mv ${data_dir}/fbank80_sp.zip ${data_dir}/..
+        ln -s ${data_dir}/../fbank80_sp.zip ${data_dir}
+    fi
 fi

 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
@@ -181,28 +201,26 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    cp ${BASH_SOURCE[0]} ${model_dir}
    cp ${PWD}/train.sh ${model_dir}

+    extra_parameter="${extra_parameter}
+        --train-config ${pwd_dir}/conf/basis.yaml"
+    cp ${pwd_dir}/conf/basis.yaml ${model_dir}
    config_list="${train_config//,/ }"
-    idx=0
+    idx=1
    for config in ${config_list[@]}
    do
-        config_path=$pwd_dir/conf/${config}.yaml
+        config_path=${pwd_dir}/conf/${config}.yaml
        if [[ ! -f ${config_path} ]]; then
            echo "No config file ${config_path}"
            exit
        fi
        cp ${config_path} ${model_dir}

-        if [[ idx -eq 0 ]]; then
-            extra_parameter="${extra_parameter}
-        --train-config ${config_path}"
-        else
-            extra_parameter="${extra_parameter}
+        extra_parameter="${extra_parameter}
        --train-config${idx} ${config_path}"
-        fi
        idx=$((idx + 1))
    done

-    cmd="python3 -u ${root_dir}/fairseq_cli/train.py
+    cmd="python3 -u ${code_dir}/fairseq_cli/train.py
        ${data_dir}
        --config-yaml ${data_config}
        --task ${task}
@@ -286,12 +304,14 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
        # Average models
 		dec_model=avg_${n_average}_checkpoint.pt

-		cmd="python ${root_dir}/scripts/average_checkpoints.py
-        --inputs ${model_dir}
-        --num-epoch-checkpoints ${n_average}
-        --output ${model_dir}/${dec_model}"
-    	echo -e "\033[34mRun command: \n${cmd} \033[0m"
-    	[[ $eval -eq 1 ]] && eval $cmd
+        if [[ ! -f ${model_dir}/${dec_model} ]]; then
+            cmd="python ${code_dir}/scripts/average_checkpoints.py
+            --inputs ${model_dir}
+            --num-best-checkpoints ${n_average}
+            --output ${model_dir}/${dec_model}"
+            echo -e "\033[34mRun command: \n${cmd} \033[0m"
+            [[ $eval -eq 1 ]] && eval $cmd
+        fi
 	else
 		dec_model=${dec_model}
 	fi
@@ -311,8 +331,8 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then

    test_subset=${test_subset//,/ }
 	for subset in ${test_subset[@]}; do
-        subset=${subset}_asr
-  		cmd="python ${root_dir}/fairseq_cli/generate.py
+        subset=${subset}
+  		cmd="python ${code_dir}/fairseq_cli/generate.py
        ${data_dir}
        --config-yaml ${data_config}
        --gen-subset ${subset}
@@ -323,10 +343,13 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
        --beam ${beam_size}
        --lenpen ${len_penalty}
        --scoring wer
-        --wer-tokenizer 13a
-        --wer-lowercase
-        --wer-remove-punct
        "
+
+        if [[ ${cer} -eq 1 ]]; then
+            cmd="${cmd}
+        --wer-char-level"
+        fi
+
    	echo -e "\033[34mRun command: \n${cmd} \033[0m"

        if [[ $eval -eq 1 ]]; then

--- a/egs/libri_trans/asr/train.sh
+++ b/egs/libri_trans/asr/train.sh
--- a/egs/libri_trans/mt/binary.sh
+++ b/egs/libri_trans/mt/binary.sh
 set -e

 eval=1
+lcrm=0

 root_dir=~/st/Fairseq-S2T
 data_dir=/home/xuchen/st/data/wmt/test

--- a/egs/libri_trans/mt/conf/base.yaml
+++ b/egs/libri_trans/mt/conf/base.yaml
-train-subset: train
-valid-subset: valid
-
-max-epoch: 50
-max-update: 100000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-skip-invalid-size-inputs-valid-test: True
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
-
 arch: transformer
 share-decoder-input-output-embed: True
 optimizer: adam
@@ -28,19 +11,22 @@ adam_betas: (0.9,0.997)
 criterion: label_smoothed_cross_entropy
 label_smoothing: 0.1

-dropout: 0.1
-attention-dropout: 0.1
-activation-dropout: 0.1
+dropout: 0.3
+attention-dropout: 0.0
+activation-dropout: 0.0

 activation-fn: relu
 encoder-normalize-before: True
 decoder-normalize-before: True
 encoder-embed-dim: 512
-encoder-ffn-embed-dim: 2048
+encoder-ffn-embed-dim: 1024
 encoder-layers: 6
 decoder-layers: 6
-encoder-attention-heads: 8
+encoder-attention-heads: 4

 decoder-embed-dim: 512
-decoder-ffn-embed-dim: 2048
-decoder-attention-heads: 8
+decoder-ffn-embed-dim: 1024
+decoder-attention-heads: 4
+
+load-pretrained-encoder-from:
+load-pretrained-decoder-from:
\ No newline at end of file
--- a/egs/iwslt14/mt/conf/base_fair.yaml
+++ b/egs/iwslt14/mt/conf/base_fair.yaml
+arch: transformer_iwslt_de_en
+share-decoder-input-output-embed: True
+optimizer: adam
+#clip-norm: 10.0
+lr-scheduler: inverse_sqrt
+weight-decay: 0.0001
+warmup-init-lr: 1e-7
+warmup-updates: 4000
+lr: 5e-4
+adam_betas: (0.9,0.98)
+
+criterion: label_smoothed_cross_entropy
+label_smoothing: 0.1
+
+dropout: 0.3
+
+activation-fn: relu
+encoder-normalize-before: False
+decoder-normalize-before: False
+encoder-embed-dim: 512
+encoder-ffn-embed-dim: 1024
+encoder-layers: 6
+decoder-layers: 6
+encoder-attention-heads: 4
+
+decoder-embed-dim: 512
+decoder-ffn-embed-dim: 1024
+decoder-attention-heads: 4
+
+load-pretrained-encoder-from:
+load-pretrained-decoder-from:
\ No newline at end of file
--- a/egs/libri_trans/mt/conf/base_s.yaml
+++ b/egs/libri_trans/mt/conf/base_s.yaml
-train-subset: train
-valid-subset: valid
-
-max-epoch: 50
-max-update: 100000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-skip-invalid-size-inputs-valid-test: True
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
-
 arch: transformer
 share-decoder-input-output-embed: True
 optimizer: adam
@@ -28,19 +11,22 @@ adam_betas: (0.9,0.997)
 criterion: label_smoothed_cross_entropy
 label_smoothing: 0.1

-dropout: 0.1
-attention-dropout: 0.1
-activation-dropout: 0.1
+dropout: 0.3
+attention-dropout: 0.0
+activation-dropout: 0.0

 activation-fn: relu
 encoder-normalize-before: True
 decoder-normalize-before: True
-encoder-embed-dim: 256
-encoder-ffn-embed-dim: 2048
+encoder-embed-dim: 512
+encoder-ffn-embed-dim: 1024
 encoder-layers: 6
 decoder-layers: 6
 encoder-attention-heads: 4

-decoder-embed-dim: 256
-decoder-ffn-embed-dim: 2048
+decoder-embed-dim: 512
+decoder-ffn-embed-dim: 1024
 decoder-attention-heads: 4
+
+load-pretrained-encoder-from:
+load-pretrained-decoder-from:
\ No newline at end of file
--- a/egs/iwslt14/mt/conf/basis.yaml
+++ b/egs/iwslt14/mt/conf/basis.yaml
+train-subset: train
+valid-subset: valid
+
+max-epoch: 50
+max-update: 50000
+patience: 20
+best_checkpoint_metric: loss
+maximize_best_checkpoint_metric: False
+
+no-epoch-checkpoints: True
+#keep-last-epochs: 10
+keep-best-checkpoints: 10
+
+num-workers: 8
+no-progress-bar: True
+log-interval: 100
+seed: 1
+report-accuracy: True
+skip-invalid-size-inputs-valid-test: True
\ No newline at end of file
--- a/egs/libri_trans/mt/conf/dlcl.yaml
+++ b/egs/libri_trans/mt/conf/dlcl.yaml
--- a/egs/libri_trans/mt/conf/rpr.yaml
+++ b/egs/libri_trans/mt/conf/rpr.yaml
@@ -2,4 +2,4 @@
 encoder-attention-type: relative
 decoder-attention-type: relative
 max-encoder-relative-length: 20
-max-decoder-relative-length: 20
\ No newline at end of file
+max-decoder-relative-length: 20
--- a/egs/libri_trans/mt/decode.sh
+++ b/egs/libri_trans/mt/decode.sh
@@ -10,6 +10,7 @@ if [ "$#" -eq 1 ]; then
    exp_name=$1
 fi

+sacrebleu=0
 n_average=10
 beam_size=5
 len_penalty=1.0
@@ -21,6 +22,7 @@ cmd="./run.sh
    --stop_stage 2
    --gpu_num ${gpu_num}
    --exp_name ${exp_name}
+    --sacrebleu ${sacrebleu}
    --n_average ${n_average}
    --beam_size ${beam_size}
    --len_penalty ${len_penalty}

--- a/egs/libri_trans/mt/local/lower_rm.py
+++ b/egs/libri_trans/mt/local/lower_rm.py
--- a/egs/libri_trans/mt/local/monitor.sh
+++ b/egs/libri_trans/mt/local/monitor.sh
--- a/egs/libri_trans/mt/local/parse_options.sh
+++ b/egs/libri_trans/mt/local/parse_options.sh
--- a/egs/libri_trans/mt/local/utils.sh
+++ b/egs/libri_trans/mt/local/utils.sh
--- a/egs/libri_trans/mt/run.sh
+++ b/egs/libri_trans/mt/run.sh
@@ -13,7 +13,7 @@ set -o pipefail
 export PYTHONIOENCODING=UTF-8

 eval=1
-time=$(date "+%m%d_%H%M")
+time=$(date "+%m%d")

 stage=0
 stop_stage=0
@@ -24,33 +24,34 @@ device=()
 gpu_num=8
 update_freq=1

-root_dir=~/st/Fairseq-S2T
+root_dir=~/st
+code_dir=${root_dir}/Fairseq-S2T
 pwd_dir=$PWD

 # dataset
-src_lang=en
-tgt_lang=de
+src_lang=de
+tgt_lang=en
 lang=${src_lang}-${tgt_lang}

-dataset=mt
+dataset=iwslt14
 task=translation
 vocab_type=unigram
 vocab_size=10000
 share_dict=1
 lcrm=0
-tokenizer=0
+tokenizer=1

 use_specific_dict=0
 specific_prefix=st
-specific_dir=/home/xuchen/st/data/mustc/st/en-de/
+specific_dir=${root_dir}/data/mustc/st/en-de/
 src_vocab_prefix=spm_unigram10000_st_share
 tgt_vocab_prefix=spm_unigram10000_st_share

-org_data_dir=~/st/data/${dataset}
-data_dir=~/st/data/${dataset}/mt/${lang}
+org_data_dir=${root_dir}/data/${dataset}
+data_dir=${root_dir}/data/${dataset}/mt
 train_subset=train
 valid_subset=dev
-trans_subset=tst-COMMON
+trans_subset=test
 test_subset=test

 # exp
@@ -70,6 +71,7 @@ step_valid=0
 bleu_valid=0

 # decoding setting
+sacrebleu=0
 dec_model=checkpoint_best.pt
 n_average=10
 beam_size=5
@@ -80,13 +82,19 @@ if [[ ${use_specific_dict} -eq 1 ]]; then
    data_dir=${data_dir}/${specific_prefix}
    mkdir -p ${data_dir}
 else
-    data_dir=${data_dir}/${vocab_type}${vocab_size}
-    src_vocab_prefix=spm_${vocab_type}${vocab_size}_${src_lang}
-    tgt_vocab_prefix=spm_${vocab_type}${vocab_size}_${tgt_lang}
+    if [[ "${vocab_type}" == "char" ]]; then
+        vocab_name=${vocab_type}
+        exp_prefix=${exp_prefix}_${vocab_type}
+    else
+        vocab_name=${vocab_type}${vocab_size}
+    fi
+    data_dir=${data_dir}/${vocab_name}
+    src_vocab_prefix=spm_${vocab_name}_${src_lang}
+    tgt_vocab_prefix=spm_${vocab_name}_${tgt_lang}
    if [[ $share_dict -eq 1 ]]; then
        data_dir=${data_dir}_share
-        src_vocab_prefix=spm_${vocab_type}${vocab_size}_share
-        tgt_vocab_prefix=spm_${vocab_type}${vocab_size}_share
+        src_vocab_prefix=spm_${vocab_name}_share
+        tgt_vocab_prefix=spm_${vocab_name}_share
    fi
 fi
 if [[ ${lcrm} -eq 1 ]]; then
@@ -111,7 +119,7 @@ if [[ -z ${exp_name} ]]; then
        exp_name=${exp_name}_${extra_tag}
    fi
 fi
-model_dir=$root_dir/../checkpoints/$dataset/mt/${exp_name}
+model_dir=$code_dir/../checkpoints/$dataset/mt/${exp_name}

 if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
    echo "stage -1: Data Download"
@@ -127,7 +135,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then

    if [[ ! -f ${data_dir}/${src_vocab_prefix}.txt || ! -f ${data_dir}/${tgt_vocab_prefix}.txt ]]; then
        if [[ ${use_specific_dict} -eq 0 ]]; then
-            cmd="python ${root_dir}/examples/speech_to_text/prep_mt_data.py
+            cmd="python ${code_dir}/examples/speech_to_text/prep_mt_data.py
                --data-root ${org_data_dir}
                --output-root ${data_dir}
                --splits ${train_subset},${valid_subset},${trans_subset}
@@ -150,9 +158,14 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    mkdir -p ${data_dir}/data
    for split in ${train_subset} ${valid_subset} ${trans_subset}; do
    {
-        cmd="cat ${org_data_dir}/${lang}/data/${split}/txt/${split}.${src_lang}"
+        if [[ -d ${org_data_dir}/data/${split}/txt ]]; then
+            txt_dir=${org_data_dir}/data/${split}/txt
+        else
+            txt_dir=${org_data_dir}/data/${split}
+        fi
+        cmd="cat ${txt_dir}/${split}.${src_lang}"
        if [[ ${lcrm} -eq 1 ]]; then
-            cmd="python local/lower_rm.py ${org_data_dir}/${lang}/data/${split}.${src_lang}"
+            cmd="python local/lower_rm.py ${org_data_dir}/data/${split}.${src_lang}"
        fi
        cmd="${cmd}
        | spm_encode --model ${data_dir}/${src_vocab_prefix}.model
@@ -165,7 +178,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        cmd="spm_encode
        --model ${data_dir}/${tgt_vocab_prefix}.model
        --output_format=piece
-        < ${org_data_dir}/${lang}/data/${split}.${tgt_lang}
+        < ${txt_dir}/${split}.${tgt_lang}
        > ${data_dir}/data/${split}.${tgt_lang}"

        echo -e "\033[34mRun command: \n${cmd} \033[0m"
@@ -174,7 +187,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    done
    wait

-    cmd="python ${root_dir}/fairseq_cli/preprocess.py
+    cmd="python ${code_dir}/fairseq_cli/preprocess.py
        --source-lang ${src_lang} --target-lang ${tgt_lang}
        --trainpref ${data_dir}/data/${train_subset}
        --validpref ${data_dir}/data/${valid_subset}
@@ -214,28 +227,26 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    cp ${BASH_SOURCE[0]} ${model_dir}
    cp ${PWD}/train.sh ${model_dir}

+    extra_parameter="${extra_parameter}
+        --train-config ${pwd_dir}/conf/basis.yaml"
+    cp ${pwd_dir}/conf/basis.yaml ${model_dir}
    config_list="${train_config//,/ }"
-    idx=0
+    idx=1
    for config in ${config_list[@]}
    do
-        config_path=$pwd_dir/conf/${config}.yaml
+        config_path=${pwd_dir}/conf/${config}.yaml
        if [[ ! -f ${config_path} ]]; then
            echo "No config file ${config_path}"
            exit
        fi
        cp ${config_path} ${model_dir}

-        if [[ idx -eq 0 ]]; then
-            extra_parameter="${extra_parameter}
-        --train-config ${config_path}"
-        else
-            extra_parameter="${extra_parameter}
+        extra_parameter="${extra_parameter}
        --train-config${idx} ${config_path}"
-        fi
        idx=$((idx + 1))
    done

-    cmd="python3 -u ${root_dir}/fairseq_cli/train.py
+    cmd="python3 -u ${code_dir}/fairseq_cli/train.py
        ${data_dir}
        --source-lang ${src_lang}
        --target-lang ${tgt_lang}
@@ -263,13 +274,9 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    if [[ $step_valid -eq 1 ]]; then
        validate_interval=1
        save_interval=1
-        keep_last_epochs=10
        no_epoch_checkpoints=0
        save_interval_updates=500
        keep_interval_updates=10
-    else
-        validate_interval=1
-        keep_last_epochs=10
    fi
    if [[ $bleu_valid -eq 1 ]]; then
        cmd="$cmd
@@ -292,10 +299,6 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        cmd="${cmd}
        --save-interval $save_interval "
    fi
-    if [[ -n $keep_last_epochs ]]; then
-        cmd="${cmd}
-        --keep-last-epochs $keep_last_epochs "
-    fi
    if [[ -n $save_interval_updates ]]; then
        cmd="${cmd}
        --save-interval-updates $save_interval_updates"
@@ -329,9 +332,9 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
        # Average models
 		dec_model=avg_${n_average}_checkpoint.pt

-		cmd="python ${root_dir}/scripts/average_checkpoints.py
+		cmd="python ${code_dir}/scripts/average_checkpoints.py
        --inputs ${model_dir}
-        --num-epoch-checkpoints ${n_average}
+        --num-best-checkpoints ${n_average}
        --output ${model_dir}/${dec_model}"
    	echo -e "\033[34mRun command: \n${cmd} \033[0m"
    	[[ $eval -eq 1 ]] && eval $cmd
@@ -354,7 +357,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then

    test_subset=(${test_subset//,/ })
 	for subset in ${test_subset[@]}; do
-  		cmd="python ${root_dir}/fairseq_cli/generate.py
+  		cmd="python ${code_dir}/fairseq_cli/generate.py
        ${data_dir}
        --source-lang ${src_lang}
        --target-lang ${tgt_lang}
@@ -365,14 +368,17 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
        --max-tokens ${max_tokens}
        --beam ${beam_size}
        --lenpen ${len_penalty}
-        --post-process sentencepiece
-        --scoring sacrebleu"
+        --post-process sentencepiece"

-        if [[ ${tokenizer} -eq 1 ]]; then
+        if [[ ${sacrebleu} -eq 1 ]]; then
            cmd="${cmd}
+        --scoring sacrebleu"
+            if [[ ${tokenizer} -eq 1 ]]; then
+                cmd="${cmd}
        --tokenizer moses
        --moses-source-lang ${src_lang}
        --moses-target-lang ${tgt_lang}"
+            fi
        fi

    	echo -e "\033[34mRun command: \n${cmd} \033[0m"

--- a/egs/libri_trans/mt/train.sh
+++ b/egs/libri_trans/mt/train.sh
--- a/egs/libri_trans/asr/conf/pds_base_16.yaml
+++ b/egs/libri_trans/asr/conf/pds_base_16.yaml
-arch: pdss2t_transformer_s_16
-
-encoder-embed-dim: 256
-pyramid-stages: 4
-#pyramid-dropout: 0
-pyramid-layers: 2_2_6_2
-pyramid-ratios: 2_2_2_2
-pyramid-fusion: True
-pyramid-fusion-method: all_conv
-pyramid-embed-dims: 256_256_256_256
-pyramid-ds-method: conv
-pyramid-embed-norm: True
-pyramid-position-embed: 1_1_1_1
-pyramid-kernel-sizes: 5_5_5_5
-pyramid-ffn-ratios: 8_8_8_8
-pyramid-attn-heads: 4_4_4_4
-
-train-subset: train_asr
-valid-subset: dev_asr
-
-max-epoch: 100
-max-update: 100000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
-
-share-decoder-input-output-embed: True
-optimizer: adam
-clip-norm: 10.0
-lr-scheduler: inverse_sqrt
-warmup-init-lr: 1e-7
-warmup-updates: 10000
-lr: 2e-3
-#adam_betas: (0.9,0.98)
-
-criterion: label_smoothed_cross_entropy_with_ctc
-label_smoothing: 0.1
-
-dropout: 0.1
-activation-fn: relu
-encoder-ffn-embed-dim: 2048
-encoder-layers: 12
-decoder-layers: 6
-encoder-attention-heads: 4
-
-decoder-embed-dim: 256
-decoder-ffn-embed-dim: 2048
-decoder-attention-heads: 4
--- a/egs/libri_trans/asr/conf/pds_base_32.yaml
+++ b/egs/libri_trans/asr/conf/pds_base_32.yaml
-arch: pdss2t_transformer_s_32
-
-encoder-embed-dim: 256
-pyramid-stages: 5
-#pyramid-dropout: 0
-pyramid-layers: 2_2_3_3_2
-pyramid-ratios: 2_2_2_2_2
-pyramid-fusion: True
-pyramid-fusion-method: all_conv
-pyramid-embed-dims: 256_256_256_256_256
-pyramid-ds-method: conv
-pyramid-embed-norm: True
-pyramid-position-embed: 1_1_1_1_1
-pyramid-kernel-sizes: 5_5_5_5_5
-pyramid-ffn-ratios: 8_8_8_8_8
-pyramid-attn-heads: 4_4_4_4_4
-
-train-subset: train_asr
-valid-subset: dev_asr
-
-max-epoch: 100
-max-update: 100000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
-
-share-decoder-input-output-embed: True
-optimizer: adam
-clip-norm: 10.0
-lr-scheduler: inverse_sqrt
-warmup-init-lr: 1e-7
-warmup-updates: 10000
-lr: 2e-3
-#adam_betas: (0.9,0.98)
-
-criterion: label_smoothed_cross_entropy_with_ctc
-label_smoothing: 0.1
-
-dropout: 0.1
-activation-fn: relu
-encoder-ffn-embed-dim: 2048
-encoder-layers: 12
-decoder-layers: 6
-encoder-attention-heads: 4
-
-decoder-embed-dim: 256
-decoder-ffn-embed-dim: 2048
-decoder-attention-heads: 4
--- a/egs/libri_trans/asr/conf/pds_base_8.yaml
+++ b/egs/libri_trans/asr/conf/pds_base_8.yaml
-arch: pdss2t_transformer_s_8
-
-encoder-embed-dim: 256
-pyramid-stages: 4
-#pyramid-dropout: 0
-pyramid-layers: 3_3_3_3
-pyramid-ratios: 2_2_1_2
-pyramid-fusion: True
-pyramid-fusion-method: all_conv
-pyramid-embed-dims: 256_256_256_256
-pyramid-ds-method: conv
-pyramid-embed-norm: True
-pyramid-position-embed: 1_1_1_1
-pyramid-kernel-sizes: 5_5_5_5
-pyramid-ffn-ratios: 8_8_8_8
-pyramid-attn-heads: 4_4_4_4
-
-train-subset: train_asr
-valid-subset: dev_asr
-
-max-epoch: 100
-max-update: 100000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
-
-share-decoder-input-output-embed: True
-optimizer: adam
-clip-norm: 10.0
-lr-scheduler: inverse_sqrt
-warmup-init-lr: 1e-7
-warmup-updates: 10000
-lr: 2e-3
-#adam_betas: (0.9,0.98)
-
-criterion: label_smoothed_cross_entropy_with_ctc
-label_smoothing: 0.1
-
-dropout: 0.1
-activation-fn: relu
-encoder-ffn-embed-dim: 2048
-encoder-layers: 12
-decoder-layers: 6
-encoder-attention-heads: 4
-
-decoder-embed-dim: 256
-decoder-ffn-embed-dim: 2048
-decoder-attention-heads: 4
--- a/egs/libri_trans/st/binary.sh
+++ b/egs/libri_trans/st/binary.sh
-set -e
-
-eval=1
-
-lcrm=1
-tokenizer=0
-
-root_dir=~/st/Fairseq-S2T
-data_dir=/home/xuchen/st/data/test
-vocab_dir=/home/xuchen/st/data/mustc/st_lcrm/en-de
-asr_vocab_prefix=spm_unigram10000_st_share
-st_vocab_prefix=spm_unigram10000_st_share
-
-src_lang=en
-tgt_lang=de
-splits=(2019)
-
-splits=$(echo ${splits[*]} | sed 's/ /_/g')
-
-cp -r ${vocab_dir}/${asr_vocab_prefix}.* ${data_dir}/${src_lang}-${tgt_lang}
-cp -r ${vocab_dir}/${st_vocab_prefix}.* ${data_dir}/${src_lang}-${tgt_lang}
-rm -rf ${data_dir}/${src_lang}-${tgt_lang}/fbank80.zip
-
-cmd="python ${root_dir}/examples/speech_to_text/prep_st_data.py
-    --data-root ${data_dir}
-    --output-root ${data_dir}
-    --splits ${splits}
-    --task st
-    --src-lang ${src_lang}
-    --tgt-lang ${tgt_lang}
-    --add-src
-    --share
-    --asr-prefix ${asr_vocab_prefix}
-    --st-spm-prefix ${st_vocab_prefix}
-    --cmvn-type utterance"
-
-    if [[ ${lcrm} -eq 1 ]]; then
-        cmd="$cmd
-    --lowercase-src
-    --rm-punc-src"
-    fi
-    if [[ ${tokenizer} -eq 1 ]]; then
-        cmd="$cmd
-    --tokenizer"
-    fi
-
-echo -e "\033[34mRun command: \n${cmd} \033[0m"
-[[ $eval -eq 1 ]] && eval ${cmd}
--- a/egs/libri_trans/st/conf/conformer.yaml
+++ b/egs/libri_trans/st/conf/conformer.yaml
-macaron-style: True
-use-cnn-module: True
-cnn-module-kernel: 31
--- a/egs/libri_trans/st/conf/ctc.yaml
+++ b/egs/libri_trans/st/conf/ctc.yaml
-ctc-weight: 0.3
\ No newline at end of file
--- a/egs/libri_trans/st/conf/dlcl.yaml
+++ b/egs/libri_trans/st/conf/dlcl.yaml
-use-enc-dlcl: True
-use-dec-dlcl: True
--- a/egs/libri_trans/st/conf/local_attn.yaml
+++ b/egs/libri_trans/st/conf/local_attn.yaml
-encoder-attention-type: local
-hard-mask-window: 0
-gauss-mask-sigma: 3
-init-mask-weight: 0
\ No newline at end of file
--- a/egs/libri_trans/st/conf/pds_base_16.yaml
+++ b/egs/libri_trans/st/conf/pds_base_16.yaml
-arch: pdss2t_transformer_s_16
-
-encoder-embed-dim: 256
-pyramid-stages: 4
-#pyramid-dropout: 0
-pyramid-layers: 2_2_6_2
-pyramid-ratios: 2_2_2_2
-pyramid-fusion: True
-pyramid-fusion-method: all_conv
-pyramid-embed-dims: 256_256_256_256
-pyramid-ds-method: conv
-pyramid-embed-norm: True
-pyramid-position-embed: 1_1_1_1
-pyramid-kernel-sizes: 5_5_5_5
-pyramid-ffn-ratios: 8_8_8_8
-pyramid-attn-heads: 4_4_4_4
-
-train-subset: train_st
-valid-subset: dev_st
-
-max-epoch: 100
-max-update: 100000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
-
-share-decoder-input-output-embed: True
-optimizer: adam
-clip-norm: 10.0
-lr-scheduler: inverse_sqrt
-warmup-init-lr: 1e-7
-warmup-updates: 10000
-lr: 2e-3
-#adam_betas: (0.9,0.98)
-
-criterion: label_smoothed_cross_entropy_with_ctc
-label_smoothing: 0.1
-
-dropout: 0.1
-activation-fn: relu
-encoder-ffn-embed-dim: 2048
-encoder-layers: 12
-decoder-layers: 6
-encoder-attention-heads: 4
-
-decoder-embed-dim: 256
-decoder-ffn-embed-dim: 2048
-decoder-attention-heads: 4
--- a/egs/libri_trans/st/conf/pds_base_32.yaml
+++ b/egs/libri_trans/st/conf/pds_base_32.yaml
-arch: pdss2t_transformer_s_32
-
-encoder-embed-dim: 256
-pyramid-stages: 5
-#pyramid-dropout: 0
-pyramid-layers: 2_2_3_3_2
-pyramid-ratios: 2_2_2_2_2
-pyramid-fusion: True
-pyramid-fusion-method: all_conv
-pyramid-embed-dims: 256_256_256_256_256
-pyramid-ds-method: conv
-pyramid-embed-norm: True
-pyramid-position-embed: 1_1_1_1_1
-pyramid-kernel-sizes: 5_5_5_5_5
-pyramid-ffn-ratios: 8_8_8_8_8
-pyramid-attn-heads: 4_4_4_4_4
-
-train-subset: train_st
-valid-subset: dev_st
-
-max-epoch: 100
-max-update: 100000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
-
-share-decoder-input-output-embed: True
-optimizer: adam
-clip-norm: 10.0
-lr-scheduler: inverse_sqrt
-warmup-init-lr: 1e-7
-warmup-updates: 10000
-lr: 2e-3
-#adam_betas: (0.9,0.98)
-
-criterion: label_smoothed_cross_entropy_with_ctc
-label_smoothing: 0.1
-
-dropout: 0.1
-activation-fn: relu
-encoder-ffn-embed-dim: 2048
-encoder-layers: 12
-decoder-layers: 6
-encoder-attention-heads: 4
-
-decoder-embed-dim: 256
-decoder-ffn-embed-dim: 2048
-decoder-attention-heads: 4
--- a/egs/libri_trans/st/conf/pds_base_8.yaml
+++ b/egs/libri_trans/st/conf/pds_base_8.yaml
-arch: pdss2t_transformer_s_8
-
-encoder-embed-dim: 256
-pyramid-stages: 4
-#pyramid-dropout: 0
-pyramid-layers: 3_3_3_3
-pyramid-ratios: 2_2_1_2
-pyramid-fusion: True
-pyramid-fusion-method: all_conv
-pyramid-embed-dims: 256_256_256_256
-pyramid-ds-method: conv
-pyramid-embed-norm: True
-pyramid-position-embed: 1_1_1_1
-pyramid-kernel-sizes: 5_5_5_5
-pyramid-ffn-ratios: 8_8_8_8
-pyramid-attn-heads: 4_4_4_4
-
-train-subset: train_st
-valid-subset: dev_st
-
-max-epoch: 100
-max-update: 100000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
-
-share-decoder-input-output-embed: True
-optimizer: adam
-clip-norm: 10.0
-lr-scheduler: inverse_sqrt
-warmup-init-lr: 1e-7
-warmup-updates: 10000
-lr: 2e-3
-#adam_betas: (0.9,0.98)
-
-criterion: label_smoothed_cross_entropy_with_ctc
-label_smoothing: 0.1
-
-dropout: 0.1
-activation-fn: relu
-encoder-ffn-embed-dim: 2048
-encoder-layers: 12
-decoder-layers: 6
-encoder-attention-heads: 4
-
-decoder-embed-dim: 256
-decoder-ffn-embed-dim: 2048
-decoder-attention-heads: 4
--- a/egs/libri_trans/st/conf/rpr.yaml
+++ b/egs/libri_trans/st/conf/rpr.yaml
-encoder-attention-type: rel_selfattn
-#encoder-attention-type: relative
-#decoder-attention-type: relative
-#max-encoder-relative-length: 100
-#max-decoder-relative-length: 20
--- a/egs/libri_trans/st/conf/sate_ctc.yaml
+++ b/egs/libri_trans/st/conf/sate_ctc.yaml
-train-subset: train_st
-valid-subset: dev_st
-
-max-epoch: 100
-max-update: 100000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-#load-pretrained-encoder-from:
-#load-pretrained-acoustic-encoder-from:
-#load-pretrained-text-encoder-from:
-#load-pretrained-decoder-from:
-
-arch: s2t_sate
-share-decoder-input-output-embed: True
-optimizer: adam
-clip-norm: 10.0
-lr-scheduler: inverse_sqrt
-warmup-init-lr: 1e-7
-warmup-updates: 10000
-lr: 2e-3
-#adam_betas: (0.9,0.98)
-
-ctc-weight: 0.3
-criterion: label_smoothed_cross_entropy_with_ctc
-label_smoothing: 0.1
-
-encoder-normalize-before: True
-decoder-normalize-before: True
-conv-kernel-sizes: 5,5
-conv-channels: 1024
-dropout: 0.1
-activation-fn: relu
-encoder-embed-dim: 256
-encoder-ffn-embed-dim: 2048
-encoder-layers: 12
-text-encoder-layers: 6
-decoder-layers: 6
-encoder-attention-heads: 4
-
-#macaron-style: True
-#use-cnn-module: True
-#cnn-module-kernel: 31
-
-#acoustic-encoder: pds
-acoustic-encoder: transformer
-adapter: league
-
-encoder-embed-dim: 256
-pyramid-stages: 4
-#pyramid-dropout: 0
-pyramid-layers: 3_3_3_3
-pyramid-ratios: 2_2_1_2
-pyramid-fusion: True
-pyramid-fusion-method: all_conv
-pyramid-embed-dims: 256_256_256_256
-pyramid-ds-method: conv
-pyramid-embed-norm: True
-pyramid-position-embed: 1_1_1_1
-pyramid-kernel-sizes: 5_5_5_5
-pyramid-ffn-ratios: 8_8_8_8
-pyramid-attn-heads: 4_4_4_4
-
-decoder-embed-dim: 256
-decoder-ffn-embed-dim: 2048
-decoder-attention-heads: 4
--- a/egs/libri_trans/st/decode.sh
+++ b/egs/libri_trans/st/decode.sh
-#! /bin/bash
-
-gpu_num=1
-
-data_dir=
-test_subset=(tst-COMMON)
-
-exp_name=
-if [ "$#" -eq 1 ]; then
-    exp_name=$1
-fi
-
-n_average=10
-beam_size=5
-len_penalty=1.0
-max_tokens=80000
-dec_model=checkpoint_best.pt
-
-cmd="./run.sh
-    --stage 2
-    --stop_stage 2
-    --gpu_num ${gpu_num}
-    --exp_name ${exp_name}
-    --n_average ${n_average}
-    --beam_size ${beam_size}
-    --len_penalty ${len_penalty}
-    --max_tokens ${max_tokens}
-    --dec_model ${dec_model}
-    "
-
-if [[ -n ${data_dir} ]]; then
-    cmd="$cmd --data_dir ${data_dir}"
-fi
-if [[ ${#test_subset[@]} -eq 0 ]]; then
-    subsets=$(echo ${test_subset[*]} | sed 's/ /,/g')
-    cmd="$cmd --test_subset ${subsets}"
-fi
-
-echo $cmd
-eval $cmd
--- a/egs/libri_trans/st/ensemble.sh
+++ b/egs/libri_trans/st/ensemble.sh
-set -e
-
-gpu_num=1
-root_dir=/home/xuchen/st/Fairseq-S2T
-ckpt=/home/xuchen/st/checkpoints/mustc-v2/st
-
-model_txt=$1
-set=$2
-test_subset=$3
-
-#data_dir=/home/xuchen/st/data/mustc-v2/st_lcrm/en-de
-#test_subset=(tst-COMMON)
-
-data_dir=/media/data/tst/$set/en-de
-#test_subset=(office)
-#test_subset=(webrtc1)
-#test_subset=(adap2)
-
-data_config=config_st_share.yaml
-result_file=./result
-
-beam_size=5
-lenpen=0.6
-max_tokens=10000
-
-models=()
-i=0
-for line in `cat $model_txt`; do
-    i=`expr $i + 1`
-    
-    model_dir=$ckpt/$line
-    [[ ! -d $model_dir ]] && echo $model_dir && exit 1;
-
-    if [[ -f $model_dir/avg_10_checkpoint.pt ]]; then
-        model=$model_dir/avg_10_checkpoint.pt
-    else
-        model=$model_dir/checkpoint_best.pt
-    fi
-    [[ ! -f $model ]] && echo $model && exit 1;
-
-    models[$i]=$model
-done
-
-models=`echo ${models[*]} | sed 's/ /:/g'`
-
-res_dir=$ckpt/ensemble/$set
-i=0
-while : 
-do
-    if [[ -d $res_dir/$i ]]; then
-        i=`expr $i + 1`
-    else
-        res_dir=$res_dir/$i
-        break
-    fi 
-done
-
-mkdir -p $res_dir
-cp $model_txt $res_dir
-
-
-if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
-    if [[ ${gpu_num} -eq 0 ]]; then
-        device=()
-    else
-        source ./local/utils.sh
-        device=$(get_devices $gpu_num 0)
-    fi
-fi
-export CUDA_VISIBLE_DEVICES=${device}
-
-for subset in ${test_subset[@]}; do
-    subset=${subset}_st
-    cmd="python ${root_dir}/fairseq_cli/generate.py
-    ${data_dir}
-    --config-yaml ${data_config}
-    --gen-subset ${subset}
-    --task speech_to_text
-    --path ${models}
-    --results-path ${res_dir}
-    --skip-invalid-size-inputs-valid-test
-    --max-tokens ${max_tokens}
-    --beam ${beam_size}
-    --lenpen ${lenpen}
-    --scoring sacrebleu"
-    echo -e "\033[34mRun command: \n${cmd} \033[0m"
-
-    eval $cmd
-    tail -n 1 ${res_dir}/generate-${subset}.txt
-
-    cd $res_dir
-    evaluate.sh translation-${subset}.txt $set
-    cd -
-done
-
--- a/egs/libri_trans/st/local/monitor.sh
+++ b/egs/libri_trans/st/local/monitor.sh
-gpu_num=4
-cmd="sh train.sh"
-
-while :
-do
-    record=$(mktemp -t temp.record.XXXXXX)
-    gpustat > $record
-    all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
-
-    count=0
-    for dev in ${all_devices[@]}
-    do
-        line=$((dev + 2))
-        use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
-
-        if [[ $use -lt 100 ]]; then
-            device[$count]=$dev
-            count=$((count + 1))
-            if [[ $count -eq $gpu_num ]]; then
-                break
-            fi
-        fi
-    done
-    if [[ ${#device[@]} -lt $gpu_num ]]; then
-        sleep 60s
-    else
-        echo "Run $cmd"
-        eval $cmd
-        sleep 10s
-        exit
-    fi
-done
--- a/egs/libri_trans/st/local/parse_options.sh
+++ b/egs/libri_trans/st/local/parse_options.sh
-#!/usr/bin/env bash
-
-# Copyright 2012  Johns Hopkins University (Author: Daniel Povey);
-#                 Arnab Ghoshal, Karel Vesely
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#  http://www.apache.org/licenses/LICENSE-2.0
-#
-# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-# MERCHANTABLITY OR NON-INFRINGEMENT.
-# See the Apache 2 License for the specific language governing permissions and
-# limitations under the License.
-
-
-# Parse command-line options.
-# To be sourced by another script (as in ". parse_options.sh").
-# Option format is: --option-name arg
-# and shell variable "option_name" gets set to value "arg."
-# The exception is --help, which takes no arguments, but prints the
-# $help_message variable (if defined).
-
-
-###
-### The --config file options have lower priority to command line
-### options, so we need to import them first...
-###
-
-# Now import all the configs specified by command-line, in left-to-right order
-for ((argpos=1; argpos<$#; argpos++)); do
-  if [ "${!argpos}" == "--config" ]; then
-    argpos_plus1=$((argpos+1))
-    config=${!argpos_plus1}
-    [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
-    . $config  # source the config file.
-  fi
-done
-
-
-###
-### Now we process the command line options
-###
-while true; do
-  [ -z "${1:-}" ] && break;  # break if there are no arguments
-  case "$1" in
-    # If the enclosing script is called with --help option, print the help
-    # message and exit.  Scripts should put help messages in $help_message
-    --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
-      else printf "$help_message\n" 1>&2 ; fi;
-      exit 0 ;;
-    --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
-      exit 1 ;;
-    # If the first command-line argument begins with "--" (e.g. --foo-bar),
-    # then work out the variable name as $name, which will equal "foo_bar".
-    --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
-      # Next we test whether the variable in question is undefned-- if so it's
-      # an invalid option and we die.  Note: $0 evaluates to the name of the
-      # enclosing script.
-      # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
-      # is undefined.  We then have to wrap this test inside "eval" because
-      # foo_bar is itself inside a variable ($name).
-      eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
-
-      oldval="`eval echo \\$$name`";
-      # Work out whether we seem to be expecting a Boolean argument.
-      if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
-        was_bool=true;
-      else
-        was_bool=false;
-      fi
-
-      # Set the variable to the right value-- the escaped quotes make it work if
-      # the option had spaces, like --cmd "queue.pl -sync y"
-      eval $name=\"$2\";
-
-      # Check that Boolean-valued arguments are really Boolean.
-      if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
-        echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
-        exit 1;
-      fi
-      shift 2;
-      ;;
-  *) break;
-  esac
-done
-
-
-# Check for an empty argument to the --cmd option, which can easily occur as a
-# result of scripting errors.
-[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
-
-
-true; # so this script returns exit code 0.
--- a/egs/libri_trans/st/local/utils.sh
+++ b/egs/libri_trans/st/local/utils.sh
-
-get_devices(){
-    gpu_num=$1
-    use_cpu=$2
-    device=()
-    while :
-    do
-        record=$(mktemp -t temp.record.XXXXXX)
-        gpustat > $record
-        all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
-
-        count=0
-        for dev in ${all_devices[@]}
-        do
-            line=$((dev + 2))
-            use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
-            if [[ $use -lt 100 ]]; then
-                device[$count]=$dev
-                count=$((count + 1))
-                if [[ $count -eq $gpu_num ]]; then
-                    break
-                fi
-            fi
-        done
-        if [[ ${#device[@]} -lt $gpu_num ]]; then
-            if [[ $use_cpu -eq 1 ]]; then
-                device=(-1)
-            else
-                sleep 60s
-            fi
-        else
-            break
-        fi
-    done
-
-    echo ${device[*]} | sed 's/ /,/g'
-    return $?
-}
-
-
--- a/egs/libri_trans/st/run.sh
+++ b/egs/libri_trans/st/run.sh
-#! /bin/bash
-
-# Processing MuST-C Datasets
-
-# Copyright 2021 Natural Language Processing Laboratory 
-# Xu Chen (xuchenneu@163.com)
-
-# Set bash to 'debug' mode, it will exit on :
-# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
-set -e
-#set -u
-set -o pipefail
-export PYTHONIOENCODING=UTF-8
-
-eval=1
-time=$(date "+%m%d_%H%M")
-
-stage=0
-stop_stage=0
-
-######## hardware ########
-# devices
-#device=()
-gpu_num=8
-update_freq=1
-
-root_dir=~/st/Fairseq-S2T
-pwd_dir=$PWD
-
-# dataset
-src_lang=en
-tgt_lang=de
-lang=${src_lang}-${tgt_lang}
-
-dataset=st
-task=speech_to_text
-vocab_type=unigram
-asr_vocab_size=5000
-vocab_size=10000
-share_dict=1
-speed_perturb=0
-lcrm=0
-tokenizer=0
-
-use_specific_dict=0
-specific_prefix=valid
-specific_dir=/home/xuchen/st/data/mustc/st_lcrm/en-de
-asr_vocab_prefix=spm_unigram10000_st_share
-st_vocab_prefix=spm_unigram10000_st_share
-
-org_data_dir=~/st/data/${dataset}
-data_dir=~/st/data/${dataset}/st
-test_subset=tst-COMMON
-
-# exp
-exp_prefix=$(date "+%m%d")
-extra_tag=
-extra_parameter=
-exp_tag=baseline
-exp_name=
-
-# config
-train_config=ctc
-
-# training setting
-fp16=1
-max_tokens=40000
-step_valid=0
-bleu_valid=0
-
-# decoding setting
-dec_model=checkpoint_best.pt
-n_average=10
-beam_size=5
-len_penalty=1.0
-
-if [[ ${share_dict} -eq 1 ]]; then
-	data_config=config_st_share.yaml
-else
-	data_config=config_st.yaml
-fi
-if [[ ${speed_perturb} -eq 1 ]]; then
-    data_dir=${data_dir}_sp
-    exp_prefix=${exp_prefix}_sp
-fi
-if [[ ${lcrm} -eq 1 ]]; then
-    data_dir=${data_dir}_lcrm
-    exp_prefix=${exp_prefix}_lcrm
-fi
-if [[ ${use_specific_dict} -eq 1 ]]; then
-    data_dir=${data_dir}_${specific_prefix}
-    exp_prefix=${exp_prefix}_${specific_prefix}
-fi
-if [[ ${tokenizer} -eq 1 ]]; then
-    data_dir=${data_dir}_tok
-    exp_prefix=${exp_prefix}_tok
-fi
-
-. ./local/parse_options.sh || exit 1;
-
-if [[ -z ${exp_name} ]]; then
-    config_string=${train_config//,/_}
-    exp_name=${exp_prefix}_${config_string}_${exp_tag}
-    if [[ -n ${extra_tag} ]]; then
-        exp_name=${exp_name}_${extra_tag}
-    fi
-fi
-model_dir=$root_dir/../checkpoints/$dataset/st/${exp_name}
-
-if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
-    echo "stage -1: Data Download"
-    # pass
-fi
-
-if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-    ### Task dependent. You have to make data the following preparation part by yourself.
-    ### But you can utilize Kaldi recipes in most cases
-    echo "stage 0: ASR Data Preparation"
-    if [[ ! -e ${data_dir}/${lang} ]]; then
-        mkdir -p ${data_dir}/${lang}
-    fi
-
-    cmd="python ${root_dir}/examples/speech_to_text/prep_asr_data.py
-        --data-root ${org_data_dir}
-        --output-root ${data_dir}
-        --task asr
-        --vocab-type ${vocab_type}
-        --vocab-size ${asr_vocab_size}"
-    if [[ ${speed_perturb} -eq 1 ]]; then
-        cmd="$cmd
-        --speed-perturb"
-    fi
-    echo -e "\033[34mRun command: \n${cmd} \033[0m"
-    [[ $eval -eq 1 && ${share_dict} -ne 1 && ${use_specific_dict} -ne 1 ]] && eval $cmd
-    asr_prefix=spm_${vocab_type}${asr_vocab_size}_asr
-
-    echo "stage 0: ST Data Preparation"
-    cmd="python ${root_dir}/examples/speech_to_text/prep_st_data.py
-        --data-root ${org_data_dir}
-        --output-root ${data_dir}
-        --task st
-        --add-src
-        --cmvn-type utterance
-        --vocab-type ${vocab_type}
-        --vocab-size ${vocab_size}"
-
-    if [[ ${use_specific_dict} -eq 1 ]]; then
-        cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}/${lang}
-        cp -r ${specific_dir}/${st_vocab_prefix}.* ${data_dir}/${lang}
-        if [[ $share_dict -eq 1 ]]; then
-            cmd="$cmd
-        --share
-        --st-spm-prefix ${st_vocab_prefix}"
-        else
-            cmd="$cmd
-        --st-spm-prefix ${st_vocab_prefix}
-        --asr-prefix ${asr_vocab_prefix}"
-        fi
-    else
-        if [[ $share_dict -eq 1 ]]; then
-            cmd="$cmd
-        --share"
-        else
-            cmd="$cmd
-        --asr-prefix ${asr_prefix}"
-        fi
-    fi
-    if [[ ${speed_perturb} -eq 1 ]]; then
-        cmd="$cmd
-        --speed-perturb"
-    fi
-    if [[ ${lcrm} -eq 1 ]]; then
-        cmd="$cmd
-        --lowercase-src
-        --rm-punc-src"
-    fi
-    if [[ ${tokenizer} -eq 1 ]]; then
-        cmd="$cmd
-        --tokenizer"
-    fi
-
-    echo -e "\033[34mRun command: \n${cmd} \033[0m"
-    [[ $eval -eq 1 ]] && eval ${cmd}
-fi
-
-data_dir=${data_dir}/${lang}
-
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-    echo "stage 1: ST Network Training"
-    [[ ! -d ${data_dir} ]] && echo "The data dir ${data_dir} is not existing!" && exit 1;
-
-    if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
-		if [[ ${gpu_num} -eq 0 ]]; then
-			device=""
-		else
-        	source ./local/utils.sh
-        	device=$(get_devices $gpu_num 0)
-		fi
-    fi
-
-    echo -e "dev=${device} data=${data_dir} model=${model_dir}"
-
-    if [[ ! -d ${model_dir} ]]; then
-        mkdir -p ${model_dir}
-    else
-        echo "${model_dir} exists."
-    fi
-
-    cp ${BASH_SOURCE[0]} ${model_dir}
-    cp ${PWD}/train.sh ${model_dir}
-
-    config_list="${train_config//,/ }"
-    idx=0
-    for config in ${config_list[@]}
-    do
-        config_path=$pwd_dir/conf/${config}.yaml
-        if [[ ! -f ${config_path} ]]; then
-            echo "No config file ${config_path}"
-            exit
-        fi
-        cp ${config_path} ${model_dir}
-
-        if [[ idx -eq 0 ]]; then
-            extra_parameter="${extra_parameter}
-        --train-config ${config_path}"
-        else
-            extra_parameter="${extra_parameter}
-        --train-config${idx} ${config_path}"
-        fi
-        idx=$((idx + 1))
-    done
-
-    cmd="python3 -u ${root_dir}/fairseq_cli/train.py
-        ${data_dir}
-        --config-yaml ${data_config}
-        --task ${task}
-        --max-tokens ${max_tokens}
-        --skip-invalid-size-inputs-valid-test
-        --update-freq ${update_freq}
-        --log-interval 100
-        --save-dir ${model_dir}
-        --tensorboard-logdir ${model_dir}"
-
-	if [[ -n ${extra_parameter} ]]; then
-        cmd="${cmd}
-        ${extra_parameter}"
-    fi
-	if [[ ${gpu_num} -gt 0 ]]; then
-		cmd="${cmd}
-        --distributed-world-size $gpu_num
-        --ddp-backend no_c10d"
-	fi
-    if [[ $fp16 -eq 1 ]]; then
-        cmd="${cmd}
-        --fp16"
-    fi
-    if [[ $step_valid -eq 1 ]]; then
-        validate_interval=1
-        save_interval=1
-        keep_last_epochs=10
-        no_epoch_checkpoints=0
-        save_interval_updates=500
-        keep_interval_updates=10
-    else
-        validate_interval=1
-        keep_last_epochs=10
-    fi
-    if [[ $bleu_valid -eq 1 ]]; then
-        cmd="$cmd
-        --eval-bleu
-        --eval-bleu-args '{\"beam\": 1}'
-        --eval-tokenized-bleu
-        --eval-bleu-remove-bpe
-        --best-checkpoint-metric bleu
-        --maximize-best-checkpoint-metric"
-    fi
-    if [[ -n $no_epoch_checkpoints && $no_epoch_checkpoints -eq 1 ]]; then
-        cmd="$cmd
-        --no-epoch-checkpoints"
-    fi
-    if [[ -n $validate_interval ]]; then
-        cmd="${cmd}
-        --validate-interval $validate_interval "
-    fi
-    if [[ -n $save_interval ]]; then
-        cmd="${cmd}
-        --save-interval $save_interval "
-    fi
-    if [[ -n $keep_last_epochs ]]; then
-        cmd="${cmd}
-        --keep-last-epochs $keep_last_epochs "
-    fi
-    if [[ -n $save_interval_updates ]]; then
-        cmd="${cmd}
-        --save-interval-updates $save_interval_updates"
-        if [[ -n $keep_interval_updates ]]; then
-        cmd="${cmd}
-        --keep-interval-updates $keep_interval_updates"
-        fi
-    fi
-
-    echo -e "\033[34mRun command: \n${cmd} \033[0m"
-
-    # save info
-    log=./history.log
-    echo "${time} | ${device} | ${data_dir} | ${exp_name} | ${model_dir} " >> $log
-    tail -n 50 ${log} > tmp.log
-    mv tmp.log $log
-    export CUDA_VISIBLE_DEVICES=${device}
-
-    cmd="nohup ${cmd} >> ${model_dir}/train.log 2>&1 &"
-    if [[ $eval -eq 1 ]]; then
-		eval $cmd
-		sleep 2s
-		tail -n "$(wc -l ${model_dir}/train.log | awk '{print $1+1}')" -f ${model_dir}/train.log
-	fi
-fi
-wait
-
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-    echo "stage 2: ST Decoding"
-    if [[ ${n_average} -ne 1 ]]; then
-        # Average models
-		dec_model=avg_${n_average}_checkpoint.pt
-
-		cmd="python ${root_dir}/scripts/average_checkpoints.py
-        --inputs ${model_dir}
-        --num-epoch-checkpoints ${n_average}
-        --output ${model_dir}/${dec_model}"
-    	echo -e "\033[34mRun command: \n${cmd} \033[0m"
-    	[[ $eval -eq 1 ]] && eval $cmd
-	else
-		dec_model=${dec_model}
-	fi
-
-    if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
-		if [[ ${gpu_num} -eq 0 ]]; then
-			device=""
-		else
-        	source ./local/utils.sh
-        	device=$(get_devices $gpu_num 0)
-		fi
-    fi
-    export CUDA_VISIBLE_DEVICES=${device}
-
-	result_file=${model_dir}/decode_result
-	[[ -f ${result_file} ]] && rm ${result_file}
-
-    test_subset=${test_subset//,/ }
-	for subset in "${test_subset[@]}"; do
-        subset=${subset}_st
-  		cmd="python ${root_dir}/fairseq_cli/generate.py
-        ${data_dir}
-        --config-yaml ${data_config}
-        --gen-subset ${subset}
-        --task speech_to_text
-        --path ${model_dir}/${dec_model}
-        --results-path ${model_dir}
-        --max-tokens ${max_tokens}
-        --beam ${beam_size}
-        --lenpen ${len_penalty}
-        --scoring sacrebleu"
-    	echo -e "\033[34mRun command: \n${cmd} \033[0m"
-
-        if [[ $eval -eq 1 ]]; then
-    	    eval $cmd
-    	    tail -n 1 ${model_dir}/generate-${subset}.txt >> ${result_file}
-        fi
-	done
-    cat ${result_file}
-fi
--- a/egs/libri_trans/st/train.sh
+++ b/egs/libri_trans/st/train.sh
-#! /bin/bash
-
-# training the model
-
-gpu_num=8
-update_freq=1
-max_tokens=40000
-
-extra_tag=
-extra_parameter=
-#extra_tag="${extra_tag}"
-#extra_parameter="${extra_parameter} "
-
-exp_tag=
-
-#config_list=(base)
-config_list=(ctc)
-#config_list=(sate_ctc)
-#config_list=(ctc conformer rpr)
-#config_list=(base sate)
-
-#config_list=(pds_base)
-#config_list=(pds_base conformer)
-
-# exp full name
-exp_name=
-
-train_config=$(echo ${config_list[*]} | sed 's/ /,/g')
-
-cmd="./run.sh
-    --stage 1
-    --stop_stage 1
-    --gpu_num ${gpu_num}
-    --update_freq ${update_freq}
-    --train_config ${train_config}
-    --max_tokens ${max_tokens}
-    "
-
-if [[ -n ${exp_name} ]]; then
-    cmd="$cmd --exp_name ${exp_name}"
-fi
-if [[ -n ${exp_tag} ]]; then
-    cmd="$cmd --exp_tag ${exp_tag}"
-fi
-if [[ -n ${extra_tag} ]]; then
-    cmd="$cmd --extra_tag ${extra_tag}"
-fi
-if [[ -n ${extra_parameter} ]]; then
-    cmd="$cmd --extra_parameter \"${extra_parameter}\""
-fi
-
-echo ${cmd}
-eval ${cmd}
--- a/egs/librispeech/asr/conf/base.yaml
+++ b/egs/librispeech/asr/conf/base.yaml
-train-subset: train-clean-100,train-clean-360,train-other-500
-valid-subset: dev-clean
-
-max-epoch: 100
-max-update: 300000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
-
 arch: s2t_transformer_s
 share-decoder-input-output-embed: True
 optimizer: adam
@@ -39,4 +23,4 @@ encoder-attention-heads: 4

 decoder-embed-dim: 256
 decoder-ffn-embed-dim: 2048
-decoder-attention-heads: 4
+decoder-attention-heads: 4
\ No newline at end of file
--- a/egs/librispeech/asr/conf/basis.yaml
+++ b/egs/librispeech/asr/conf/basis.yaml
+train-subset: train-clean-100,train-clean-360,train-other-500
+valid-subset: dev-clean
+
+max-epoch: 100
+max-update: 300000
+patience: 20
+best-checkpoint-metric: loss
+maximize-best-checkpoint-metric: False
+
+no-epoch-checkpoints: True
+#keep-last-epochs: 10
+keep-best-checkpoints: 10
+
+num-workers: 8
+no-progress-bar: True
+log-interval: 100
+seed: 1
+report-accuracy: True
+skip-invalid-size-inputs-valid-test: True
\ No newline at end of file
--- a/egs/librispeech/asr/conf/big.yaml
+++ b/egs/librispeech/asr/conf/big.yaml
-train-subset: train-clean-100,train-clean-360,train-other-500
-valid-subset: dev-clean
-
-max-epoch: 100
-max-update: 300000
-
-num-workers: 8
-patience: 20
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
-
 arch: s2t_transformer_m
 share-decoder-input-output-embed: True
 optimizer: adam

--- a/egs/librispeech/asr/conf/debug.yaml
+++ b/egs/librispeech/asr/conf/debug.yaml
@@ -2,19 +2,19 @@
 #arch: s2t_transformer_s
 arch: s2t_sate
 encoder-embed-dim: 256
-pyramid-stages: 4
-#pyramid-dropout: 0
-pyramid-layers: 2_2_6_2
-pyramid-ratios: 2_2_2_2
-pyramid-fusion: True
-pyramid-fusion-method: all_conv
-pyramid-embed-dims: 256_256_256_256
-pyramid-ds-method: conv
-pyramid-embed-norm: True
-pyramid-position-embed: 1_1_1_1
-pyramid-kernel-sizes: 5_5_5_5
-pyramid-ffn-ratios: 8_8_8_8
-pyramid-attn-heads: 4_4_4_4
+pds-stages: 4
+#pds-dropout: 0
+pds-layers: 2_2_6_2
+pds-ratios: 2_2_2_2
+pds-fusion: True
+pds-fusion-method: all_conv
+pds-embed-dims: 256_256_256_256
+pds-ds-method: conv
+pds-embed-norm: True
+pds-position-embed: 1_1_1_1
+pds-kernel-sizes: 5_5_5_5
+pds-ffn-ratios: 8_8_8_8
+pds-attn-heads: 4_4_4_4

 cl-dropout: True
 cl-dropout-epoch: 50

--- a/egs/librispeech/asr/conf/debug2.yaml
+++ b/egs/librispeech/asr/conf/debug2.yaml
-train-subset: train-clean-100
-valid-subset: dev-clean
-
-max-epoch: 100
-max-update: 100000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-#load-pretrained-encoder-from:
-#load-pretrained-acoustic-encoder-from:
-#load-pretrained-text-encoder-from:
-#load-pretrained-decoder-from:
-#load-pretrained-acoustic-encoder-from: /home/xuchen/st/checkpoints/mustc/asr/1007_st_ctc_baseline/avg_10_checkpoint.pt
-#load-pretrained-acoustic-encoder-from: /home/xuchen/st/checkpoints/mustc/asr/1111_st_ctc_conformer_lr0.001/avg_10_checkpoint.pt
-
-#load-pretrained-acoustic-encoder-from: /home/xuchen/st/checkpoints/mustc/asr/1007_st_pyramid4_all256_3333_sr8_ctc/avg_10_checkpoint.pt
-#load-pretrained-acoustic-encoder-from: /home/xuchen/st/checkpoints/mustc/asr/1114_st_pyramid4_all256_ctc_fix/avg_10_checkpoint.pt
-
-#load-pretrained-acoustic-encoder-from: /home/xuchen/st/checkpoints/mustc/asr/1015_st_pyramid4_all256_conformer_baseline/avg_10_checkpoint.pt
-#load-pretrained-acoustic-encoder-from: /home/xuchen/st/checkpoints/mustc/asr/1111_st_pyramid4_all256_conformer_ctc/avg_10_checkpoint.pt
-
-arch: s2t_sate
-share-decoder-input-output-embed: True
-optimizer: adam
-clip-norm: 10.0
-lr-scheduler: inverse_sqrt
-warmup-init-lr: 1e-7
-warmup-updates: 10000
-lr: 2e-3
-
-
-dropout: 0.1
-activation-fn: relu
-encoder-ffn-embed-dim: 2048
-encoder-layers: 12
-text-encoder-layers: 6
-decoder-layers: 6
-encoder-attention-heads: 4
-
-#macaron-style: True
-#use-cnn-module: True
-#cnn-module-kernel: 31
-
-#acoustic-encoder: transformer
-#acoustic-encoder: conformer
-acoustic-encoder: pyramid
-adapter: league
-#adapter: none
-#adapter: context
-
-encoder-embed-dim: 256
-pyramid-stages: 4
-#pyramid-dropout: 0
-pyramid-layers: 3_3_3_3
-pyramid-sr-ratios: 2_2_1_2
-pyramid-embed-dims: 256_256_256_256
-pyramid-fuse: True
-pyramid-reduced-embed: conv
-pyramid-embed-norm: True
-pyramid-position-embed: 1_1_1_1
-pyramid-kernel-sizes: 5_5_5_5
-pyramid-ffn-ratios: 8_8_8_8
-pyramid-heads: 4_4_4_4
\ No newline at end of file
--- a/egs/librispeech/asr/conf/pds_base.yaml
+++ b/egs/librispeech/asr/conf/pds_base.yaml
 arch: pdss2t_transformer_s_8
-
-train-subset: train-clean-100,train-clean-360,train-other-500
-valid-subset: dev-clean
-
-max-epoch: 100
-max-update: 300000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
-
 share-decoder-input-output-embed: True
 optimizer: adam
 clip-norm: 10.0
@@ -37,4 +20,4 @@ encoder-attention-heads: 4

 decoder-embed-dim: 256
 decoder-ffn-embed-dim: 2048
-decoder-attention-heads: 4
+decoder-attention-heads: 4
\ No newline at end of file
--- a/egs/librispeech/asr/conf/pds_base_16.yaml
+++ b/egs/librispeech/asr/conf/pds_base_16.yaml
 arch: pdss2t_transformer_s_16

 encoder-embed-dim: 256
-pyramid-stages: 4
-#pyramid-dropout: 0
-pyramid-layers: 2_2_6_2
-pyramid-ratios: 2_2_2_2
-pyramid-fusion: True
-pyramid-fusion-method: all_conv
-pyramid-embed-dims: 256_256_256_256
-pyramid-ds-method: conv
-pyramid-embed-norm: True
-pyramid-position-embed: 1_1_1_1
-pyramid-kernel-sizes: 5_5_5_5
-pyramid-ffn-ratios: 8_8_8_8
-pyramid-attn-heads: 4_4_4_4
-
-train-subset: train-clean-100,train-clean-360,train-other-500
-valid-subset: dev-clean
-
-max-epoch: 100
-max-update: 300000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
+pds-stages: 4
+#pds-dropout: 0
+pds-layers: 2_2_6_2
+pds-ratios: 2_2_2_2
+pds-fusion: True
+pds-fusion-method: all_conv
+pds-embed-dims: 256_256_256_256
+pds-ds-method: conv
+pds-embed-norm: True
+pds-position-embed: 1_1_1_1
+pds-kernel-sizes: 5_5_5_5
+pds-ffn-ratios: 8_8_8_8
+pds-attn-heads: 4_4_4_4

 share-decoder-input-output-embed: True
 optimizer: adam
@@ -52,4 +36,4 @@ encoder-attention-heads: 4

 decoder-embed-dim: 256
 decoder-ffn-embed-dim: 2048
-decoder-attention-heads: 4
+decoder-attention-heads: 4
\ No newline at end of file
--- a/egs/librispeech/asr/conf/pds_base_32.yaml
+++ b/egs/librispeech/asr/conf/pds_base_32.yaml
 arch: pdss2t_transformer_s_32

 encoder-embed-dim: 256
-pyramid-stages: 5
-#pyramid-dropout: 0
-pyramid-layers: 2_2_3_3_2
-pyramid-ratios: 2_2_2_2_2
-pyramid-fusion: True
-pyramid-fusion-method: all_conv
-pyramid-embed-dims: 256_256_256_256_256
-pyramid-ds-method: conv
-pyramid-embed-norm: True
-pyramid-position-embed: 1_1_1_1_1
-pyramid-kernel-sizes: 5_5_5_5_5
-pyramid-ffn-ratios: 8_8_8_8_8
-pyramid-attn-heads: 4_4_4_4_4
-
-train-subset: train-clean-100,train-clean-360,train-other-500
-valid-subset: dev-clean
-
-max-epoch: 100
-max-update: 300000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
+pds-stages: 5
+#pds-dropout: 0
+pds-layers: 2_2_3_3_2
+pds-ratios: 2_2_2_2_2
+pds-fusion: True
+pds-fusion-method: all_conv
+pds-embed-dims: 256_256_256_256_256
+pds-ds-method: conv
+pds-embed-norm: True
+pds-position-embed: 1_1_1_1_1
+pds-kernel-sizes: 5_5_5_5_5
+pds-ffn-ratios: 8_8_8_8_8
+pds-attn-heads: 4_4_4_4_4

 share-decoder-input-output-embed: True
 optimizer: adam
@@ -52,4 +36,4 @@ encoder-attention-heads: 4

 decoder-embed-dim: 256
 decoder-ffn-embed-dim: 2048
-decoder-attention-heads: 4
+decoder-attention-heads: 4
\ No newline at end of file
--- a/egs/librispeech/asr/conf/pds_base_8.yaml
+++ b/egs/librispeech/asr/conf/pds_base_8.yaml
 arch: pdss2t_transformer_s_8

 encoder-embed-dim: 256
-pyramid-stages: 4
-#pyramid-dropout: 0
-pyramid-layers: 3_3_3_3
-pyramid-ratios: 2_2_1_2
-pyramid-fusion: True
-pyramid-fusion-method: all_conv
-pyramid-embed-dims: 256_256_256_256
-pyramid-ds-method: conv
-pyramid-embed-norm: True
-pyramid-position-embed: 1_1_1_1
-pyramid-kernel-sizes: 5_5_5_5
-pyramid-ffn-ratios: 8_8_8_8
-pyramid-attn-heads: 4_4_4_4
-
-train-subset: train-clean-100,train-clean-360,train-other-500
-valid-subset: dev-clean
-
-max-epoch: 100
-max-update: 300000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
+pds-stages: 4
+#pds-dropout: 0
+pds-layers: 3_3_3_3
+pds-ratios: 2_2_1_2
+pds-fusion: True
+pds-fusion-method: all_conv
+pds-embed-dims: 256_256_256_256
+pds-ds-method: conv
+pds-embed-norm: True
+pds-position-embed: 1_1_1_1
+pds-kernel-sizes: 5_5_5_5
+pds-ffn-ratios: 8_8_8_8
+pds-attn-heads: 4_4_4_4

 share-decoder-input-output-embed: True
 optimizer: adam
@@ -52,4 +36,4 @@ encoder-attention-heads: 4

 decoder-embed-dim: 256
 decoder-ffn-embed-dim: 2048
-decoder-attention-heads: 4
+decoder-attention-heads: 4
\ No newline at end of file
--- a/egs/librispeech/asr/conf/pds_big.yaml
+++ b/egs/librispeech/asr/conf/pds_big.yaml
@@ -2,22 +2,6 @@ arch: pdss2t_transformer_m_8
 #arch: pdss2t_transformer_m_16
 #arch: pdss2t_transformer_m_32

-train-subset: train-clean-100,train-clean-360,train-other-500
-valid-subset: dev-clean
-
-max-epoch: 100
-max-update: 300000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
-
 share-decoder-input-output-embed: True
 optimizer: adam
 clip-norm: 10.0

--- a/egs/librispeech/asr/conf/pds_big_16.yaml
+++ b/egs/librispeech/asr/conf/pds_big_16.yaml
 arch: pdss2t_transformer_m_16

 encoder-embed-dim: 512
-pyramid-stages: 4
-#pyramid-dropout: 0
-pyramid-layers: 2_2_6_2
-pyramid-ratios: 2_2_2_2
-pyramid-fusion: True
-pyramid-fusion-method: all_conv
-pyramid-embed-dims: 512_512_512_512
-pyramid-ds-method: conv
-pyramid-embed-norm: True
-pyramid-position-embed: 1_1_1_1
-pyramid-kernel-sizes: 5_5_5_5
-pyramid-ffn-ratios: 4_4_4_4
-pyramid-attn-heads: 8_8_8_8
-
-train-subset: train-clean-100,train-clean-360,train-other-500
-valid-subset: dev-clean
-
-max-epoch: 100
-max-update: 300000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
+pds-stages: 4
+#pds-dropout: 0
+pds-layers: 2_2_6_2
+pds-ratios: 2_2_2_2
+pds-fusion: True
+pds-fusion-method: all_conv
+pds-embed-dims: 512_512_512_512
+pds-ds-method: conv
+pds-embed-norm: True
+pds-position-embed: 1_1_1_1
+pds-kernel-sizes: 5_5_5_5
+pds-ffn-ratios: 4_4_4_4
+pds-attn-heads: 8_8_8_8

 share-decoder-input-output-embed: True
 optimizer: adam

--- a/egs/librispeech/asr/conf/pds_big_32.yaml
+++ b/egs/librispeech/asr/conf/pds_big_32.yaml
 arch: pdss2t_transformer_m_32

 encoder-embed-dim: 512
-pyramid-stages: 5
-#pyramid-dropout: 0
-pyramid-layers: 2_2_3_3_2
-pyramid-ratios: 2_2_2_2_2
-pyramid-fusion: True
-pyramid-fusion-method: all_conv
-pyramid-embed-dims: 512_512_512_512_512
-pyramid-ds-method: conv
-pyramid-embed-norm: True
-pyramid-position-embed: 1_1_1_1_1
-pyramid-kernel-sizes: 5_5_5_5_5
-pyramid-ffn-ratios: 4_4_4_4_4
-pyramid-attn-heads: 8_8_8_8_8
-
-train-subset: train-clean-100,train-clean-360,train-other-500
-valid-subset: dev-clean
-
-max-epoch: 100
-max-update: 300000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
+pds-stages: 5
+#pds-dropout: 0
+pds-layers: 2_2_3_3_2
+pds-ratios: 2_2_2_2_2
+pds-fusion: True
+pds-fusion-method: all_conv
+pds-embed-dims: 512_512_512_512_512
+pds-ds-method: conv
+pds-embed-norm: True
+pds-position-embed: 1_1_1_1_1
+pds-kernel-sizes: 5_5_5_5_5
+pds-ffn-ratios: 4_4_4_4_4
+pds-attn-heads: 8_8_8_8_8

 share-decoder-input-output-embed: True
 optimizer: adam

--- a/egs/librispeech/asr/conf/pds_big_8.yaml
+++ b/egs/librispeech/asr/conf/pds_big_8.yaml
 arch: pdss2t_transformer_m_8

 encoder-embed-dim: 512
-pyramid-stages: 4
-#pyramid-dropout: 0
-pyramid-layers: 3_3_3_3
-pyramid-ratios: 2_2_1_2
-pyramid-fusion: True
-pyramid-fusion-method: all_conv
-pyramid-embed-dims: 512_512_512_512
-pyramid-ds-method: conv
-pyramid-embed-norm: True
-pyramid-position-embed: 1_1_1_1
-pyramid-kernel-sizes: 5_5_5_5
-pyramid-ffn-ratios: 4_4_4_4
-pyramid-attn-heads: 8_8_8_8
-
-train-subset: train-clean-100,train-clean-360,train-other-500
-valid-subset: dev-clean
-
-max-epoch: 100
-max-update: 300000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
+pds-stages: 4
+#pds-dropout: 0
+pds-layers: 3_3_3_3
+pds-ratios: 2_2_1_2
+pds-fusion: True
+pds-fusion-method: all_conv
+pds-embed-dims: 512_512_512_512
+pds-ds-method: conv
+pds-embed-norm: True
+pds-position-embed: 1_1_1_1
+pds-kernel-sizes: 5_5_5_5
+pds-ffn-ratios: 4_4_4_4
+pds-attn-heads: 8_8_8_8

 share-decoder-input-output-embed: True
 optimizer: adam

--- a/egs/librispeech/asr/conf/pds_deep.yaml
+++ b/egs/librispeech/asr/conf/pds_deep.yaml
@@ -2,22 +2,6 @@ arch: pdss2t_transformer_sd_8
 #arch: pdss2t_transformer_sd_16
 #arch: pdss2t_transformer_sd_32

-train-subset: train-clean-100,train-clean-360,train-other-500
-valid-subset: dev-clean
-
-max-epoch: 100
-max-update: 300000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
-
 share-decoder-input-output-embed: True
 optimizer: adam
 clip-norm: 10.0

--- a/egs/librispeech/asr/conf/pds_deep_16.yaml
+++ b/egs/librispeech/asr/conf/pds_deep_16.yaml
 arch: pdss2t_transformer_sd_16

 encoder-embed-dim: 256
-pyramid-stages: 4
-#pyramid-dropout: 0
-pyramid-layers: 5_5_12_8
-pyramid-ratios: 2_2_2_2
-pyramid-fusion: True
-pyramid-fusion-method: all_conv
-pyramid-embed-dims: 256_256_256_256
-pyramid-ds-method: conv
-pyramid-embed-norm: True
-pyramid-position-embed: 1_1_1_1
-pyramid-kernel-sizes: 5_5_5_5
-pyramid-ffn-ratios: 8_8_8_8
-pyramid-attn-heads: 4_4_4_4
-
-train-subset: train-clean-100,train-clean-360,train-other-500
-valid-subset: dev-clean
-
-max-epoch: 100
-max-update: 300000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
+pds-stages: 4
+#pds-dropout: 0
+pds-layers: 5_5_12_8
+pds-ratios: 2_2_2_2
+pds-fusion: True
+pds-fusion-method: all_conv
+pds-embed-dims: 256_256_256_256
+pds-ds-method: conv
+pds-embed-norm: True
+pds-position-embed: 1_1_1_1
+pds-kernel-sizes: 5_5_5_5
+pds-ffn-ratios: 8_8_8_8
+pds-attn-heads: 4_4_4_4

 share-decoder-input-output-embed: True
 optimizer: adam

--- a/egs/librispeech/asr/conf/pds_deep_32.yaml
+++ b/egs/librispeech/asr/conf/pds_deep_32.yaml
 arch: pdss2t_transformer_sd_32

 encoder-embed-dim: 256
-pyramid-stages: 5
-#pyramid-dropout: 0
-pyramid-layers: 5_5_7_7_6
-pyramid-ratios: 2_2_2_2_2
-pyramid-fusion: True
-pyramid-fusion-method: all_conv
-pyramid-embed-dims: 256_256_256_256_256
-pyramid-ds-method: conv
-pyramid-embed-norm: True
-pyramid-position-embed: 1_1_1_1_1
-pyramid-kernel-sizes: 5_5_5_5_5
-pyramid-ffn-ratios: 8_8_8_8_8
-pyramid-attn-heads: 4_4_4_4_4
-
-train-subset: train-clean-100,train-clean-360,train-other-500
-valid-subset: dev-clean
-
-max-epoch: 100
-max-update: 300000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
+pds-stages: 5
+#pds-dropout: 0
+pds-layers: 5_5_7_7_6
+pds-ratios: 2_2_2_2_2
+pds-fusion: True
+pds-fusion-method: all_conv
+pds-embed-dims: 256_256_256_256_256
+pds-ds-method: conv
+pds-embed-norm: True
+pds-position-embed: 1_1_1_1_1
+pds-kernel-sizes: 5_5_5_5_5
+pds-ffn-ratios: 8_8_8_8_8
+pds-attn-heads: 4_4_4_4_4

 share-decoder-input-output-embed: True
 optimizer: adam

--- a/egs/librispeech/asr/conf/pds_deep_8.yaml
+++ b/egs/librispeech/asr/conf/pds_deep_8.yaml
 arch: pdss2t_transformer_sd_8

 encoder-embed-dim: 256
-pyramid-stages: 4
-#pyramid-dropout: 0
-pyramid-layers: 7_7_7_9
-pyramid-ratios: 2_2_1_2
-pyramid-fusion: True
-pyramid-fusion-method: all_conv
-pyramid-embed-dims: 256_256_256_256
-pyramid-ds-method: conv
-pyramid-embed-norm: True
-pyramid-position-embed: 1_1_1_1
-pyramid-kernel-sizes: 5_5_5_5
-pyramid-ffn-ratios: 8_8_8_8
-pyramid-attn-heads: 4_4_4_4
-
-train-subset: train-clean-100,train-clean-360,train-other-500
-valid-subset: dev-clean
-
-max-epoch: 100
-max-update: 300000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
+pds-stages: 4
+#pds-dropout: 0
+pds-layers: 7_7_7_9
+pds-ratios: 2_2_1_2
+pds-fusion: True
+pds-fusion-method: all_conv
+pds-embed-dims: 256_256_256_256
+pds-ds-method: conv
+pds-embed-norm: True
+pds-position-embed: 1_1_1_1
+pds-kernel-sizes: 5_5_5_5
+pds-ffn-ratios: 8_8_8_8
+pds-attn-heads: 4_4_4_4

 share-decoder-input-output-embed: True
 optimizer: adam

--- a/egs/librispeech/asr/run.sh
+++ b/egs/librispeech/asr/run.sh
@@ -24,7 +24,8 @@ device=()
 gpu_num=8
 update_freq=1

-root_dir=~/st/Fairseq-S2T
+root_dir=~/st
+code_dir=${root_dir}/Fairseq-S2T
 pwd_dir=$PWD

 # dataset
@@ -42,8 +43,8 @@ specific_prefix=valid
 specific_dir=/home/xuchen/st/data/mustc/st_lcrm/en-de
 asr_vocab_prefix=spm_unigram10000_st_share

-org_data_dir=~/st/data/${dataset}
-data_dir=~/st/data/${dataset}
+org_data_dir=${root_dir}/data/${dataset}
+data_dir=${root_dir}/data/${dataset}
 test_subset=dev-clean,dev-other,test-clean,test-other

 # exp
@@ -81,13 +82,12 @@ fi

 if [[ -z ${exp_name} ]]; then
    config_string=${train_config//,/_}
-#    exp_name=${exp_prefix}_$(basename ${train_config%.*})_${exp_tag}
    exp_name=${exp_prefix}_${config_string}_${exp_tag}
    if [[ -n ${extra_tag} ]]; then
        exp_name=${exp_name}_${extra_tag}
    fi
 fi
-model_dir=$root_dir/../checkpoints/$dataset/asr/${exp_name}
+model_dir=$code_dir/../checkpoints/$dataset/asr/${exp_name}

 if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
    echo "stage -1: Data Download"
@@ -103,7 +103,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        mkdir -p ${data_dir}
    fi

-    cmd="python ${root_dir}/examples/speech_to_text/prep_librispeech_data.py
+    cmd="python ${code_dir}/examples/speech_to_text/prep_librispeech_data.py
        --data-root ${org_data_dir}
        --output-root ${data_dir}
        --vocab-type ${vocab_type}
@@ -146,28 +146,26 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    cp ${BASH_SOURCE[0]} ${model_dir}
    cp ${PWD}/train.sh ${model_dir}

+    extra_parameter="${extra_parameter}
+        --train-config ${pwd_dir}/conf/basis.yaml"
+    cp ${pwd_dir}/conf/basis.yaml ${model_dir}
    config_list="${train_config//,/ }"
-    idx=0
+    idx=1
    for config in ${config_list[@]}
    do
-        config_path=$pwd_dir/conf/${config}.yaml
+        config_path=${pwd_dir}/conf/${config}.yaml
        if [[ ! -f ${config_path} ]]; then
            echo "No config file ${config_path}"
            exit
        fi
        cp ${config_path} ${model_dir}

-        if [[ idx -eq 0 ]]; then
-            extra_parameter="${extra_parameter}
-        --train-config ${config_path}"
-        else
-            extra_parameter="${extra_parameter}
+        extra_parameter="${extra_parameter}
        --train-config${idx} ${config_path}"
-        fi
        idx=$((idx + 1))
    done

-    cmd="python3 -u ${root_dir}/fairseq_cli/train.py
+    cmd="python3 -u ${code_dir}/fairseq_cli/train.py
        ${data_dir}
        --config-yaml ${data_config}
        --task ${task}
@@ -252,9 +250,9 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
 		dec_model=avg_${n_average}_checkpoint.pt

        if [[ ! -f ${model_dir}/${dec_model} ]]; then
-            cmd="python ${root_dir}/scripts/average_checkpoints.py
+            cmd="python ${code_dir}/scripts/average_checkpoints.py
            --inputs ${model_dir}
-            --num-epoch-checkpoints ${n_average}
+            --num-best-checkpoints ${n_average}
            --output ${model_dir}/${dec_model}"
            echo -e "\033[34mRun command: \n${cmd} \033[0m"
            [[ $eval -eq 1 ]] && eval $cmd
@@ -279,7 +277,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    test_subset=(${test_subset//,/ })
 	for subset in ${test_subset[@]}; do
        subset=${subset}
-  		cmd="python ${root_dir}/fairseq_cli/generate.py
+  		cmd="python ${code_dir}/fairseq_cli/generate.py
        ${data_dir}
        --config-yaml ${data_config}
        --gen-subset ${subset}

--- a/egs/mustc/asr/conf/base.yaml
+++ b/egs/mustc/asr/conf/base.yaml
-train-subset: train_asr
-valid-subset: dev_asr
-
-max-epoch: 100
-max-update: 100000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
-
 arch: s2t_transformer_s
 share-decoder-input-output-embed: True
 optimizer: adam

--- a/egs/mustc/asr/conf/basis.yaml
+++ b/egs/mustc/asr/conf/basis.yaml
+train-subset: train
+valid-subset: dev
+
+max-epoch: 100
+max-update: 100000
+patience: 20
+best_checkpoint_metric: loss
+maximize_best_checkpoint_metric: False
+
+no-epoch-checkpoints: True
+#keep-last-epochs: 10
+keep-best-checkpoints: 10
+
+num-workers: 8
+no-progress-bar: True
+log-interval: 100
+seed: 1
+report-accuracy: True
+skip-invalid-size-inputs-valid-test: True
\ No newline at end of file
--- a/egs/mustc/asr/conf/ctc.yaml
+++ b/egs/mustc/asr/conf/ctc.yaml
 ctc-weight: 0.3
+post-process: sentencepiece
--- a/egs/mustc/asr/conf/pds_base.yaml
+++ b/egs/mustc/asr/conf/pds_base.yaml
 arch: pdss2t_transformer_s_8
-
-train-subset: train_asr
-valid-subset: dev_asr
-
-max-epoch: 100
-max-update: 100000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
+pds-fusion: True
+ctc-layer: 12

 share-decoder-input-output-embed: True
 optimizer: adam

--- a/egs/mustc/asr/conf/pds_base_16.yaml
+++ b/egs/mustc/asr/conf/pds_base_16.yaml
 arch: pdss2t_transformer_s_16

 encoder-embed-dim: 256
-pyramid-stages: 4
-#pyramid-dropout: 0
-pyramid-layers: 2_2_6_2
-pyramid-ratios: 2_2_2_2
-pyramid-fusion: True
-pyramid-fusion-method: all_conv
-pyramid-embed-dims: 256_256_256_256
-pyramid-ds-method: conv
-pyramid-embed-norm: True
-pyramid-position-embed: 1_1_1_1
-pyramid-kernel-sizes: 5_5_5_5
-pyramid-ffn-ratios: 8_8_8_8
-pyramid-attn-heads: 4_4_4_4
-
-train-subset: train_asr
-valid-subset: dev_asr
-
-max-epoch: 100
-max-update: 100000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
+pds-stages: 4
+ctc-layer: 12
+pds-layers: 2_2_6_2
+pds-ratios: 2_2_2_2
+pds-fusion: True
+pds-fusion-method: all_conv
+pds-embed-dims: 256_256_256_256
+pds-ds-method: conv
+pds-embed-norm: True
+pds-position-embed: 1_1_1_1
+pds-kernel-sizes: 5_5_5_5
+pds-ffn-ratios: 8_8_8_8
+pds-attn-heads: 4_4_4_4

 share-decoder-input-output-embed: True
 optimizer: adam

--- a/egs/mustc/asr/conf/pds_base_32.yaml
+++ b/egs/mustc/asr/conf/pds_base_32.yaml
 arch: pdss2t_transformer_s_32

 encoder-embed-dim: 256
-pyramid-stages: 5
-#pyramid-dropout: 0
-pyramid-layers: 2_2_3_3_2
-pyramid-ratios: 2_2_2_2_2
-pyramid-fusion: True
-pyramid-fusion-method: all_conv
-pyramid-embed-dims: 256_256_256_256_256
-pyramid-ds-method: conv
-pyramid-embed-norm: True
-pyramid-position-embed: 1_1_1_1_1
-pyramid-kernel-sizes: 5_5_5_5_5
-pyramid-ffn-ratios: 8_8_8_8_8
-pyramid-attn-heads: 4_4_4_4_4
-
-train-subset: train_asr
-valid-subset: dev_asr
-
-max-epoch: 100
-max-update: 100000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
+pds-stages: 5
+ctc-layer: 12
+pds-layers: 2_2_3_3_2
+pds-ratios: 2_2_2_2_2
+pds-fusion: True
+pds-fusion-method: all_conv
+pds-embed-dims: 256_256_256_256_256
+pds-ds-method: conv
+pds-embed-norm: True
+pds-position-embed: 1_1_1_1_1
+pds-kernel-sizes: 5_5_5_5_5
+pds-ffn-ratios: 8_8_8_8_8
+pds-attn-heads: 4_4_4_4_4

 share-decoder-input-output-embed: True
 optimizer: adam

--- a/egs/mustc/asr/conf/pds_base_8.yaml
+++ b/egs/mustc/asr/conf/pds_base_8.yaml
 arch: pdss2t_transformer_s_8

 encoder-embed-dim: 256
-pyramid-stages: 4
-#pyramid-dropout: 0
-pyramid-layers: 3_3_3_3
-pyramid-ratios: 2_2_1_2
-pyramid-fusion: True
-pyramid-fusion-method: all_conv
-pyramid-embed-dims: 256_256_256_256
-pyramid-ds-method: conv
-pyramid-embed-norm: True
-pyramid-position-embed: 1_1_1_1
-pyramid-kernel-sizes: 5_5_5_5
-pyramid-ffn-ratios: 8_8_8_8
-pyramid-attn-heads: 4_4_4_4
-
-train-subset: train_asr
-valid-subset: dev_asr
-
-max-epoch: 100
-max-update: 100000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
+pds-stages: 4
+ctc-layer: 12
+pds-layers: 3_3_3_3
+pds-ratios: 2_2_1_2
+pds-fusion: True
+pds-fusion-method: all_conv
+pds-embed-dims: 256_256_256_256
+pds-ds-method: conv
+pds-embed-norm: True
+pds-position-embed: 1_1_1_1
+pds-kernel-sizes: 5_5_5_5
+pds-ffn-ratios: 8_8_8_8
+pds-attn-heads: 4_4_4_4

 share-decoder-input-output-embed: True
 optimizer: adam

--- a/egs/mustc/asr/run.sh
+++ b/egs/mustc/asr/run.sh
@@ -24,7 +24,8 @@ stop_stage=0
 gpu_num=8
 update_freq=1

-root_dir=~/st/Fairseq-S2T
+root_dir=~/st
+code_dir=${root_dir}/Fairseq-S2T
 pwd_dir=$PWD

 # dataset
@@ -41,13 +42,16 @@ lcrm=0
 tokenizer=0
 use_raw_audio=0

-use_specific_dict=0
+use_specific_dict=1
 specific_prefix=st
-specific_dir=/home/xuchen/st/data/mustc/st/en-de
+specific_dir=${root_dir}/data/mustc/st
 asr_vocab_prefix=spm_unigram10000_st_share

-org_data_dir=~/st/data/${dataset}
-data_dir=~/st/data/${dataset}/asr
+org_data_dir=${root_dir}/data/${dataset}
+data_dir=${root_dir}/data/${dataset}/asr
+train_split=train
+valid_split=dev
+test_split=tst-COMMON
 test_subset=tst-COMMON

 # exp
@@ -59,7 +63,7 @@ exp_name=

 # config
 train_config=ctc
-data_config=config_asr.yaml
+data_config=config.yaml

 # training setting
 fp16=1
@@ -97,13 +101,12 @@ fi

 if [[ -z ${exp_name} ]]; then
    config_string=${train_config//,/_}
-#    exp_name=${exp_prefix}_$(basename ${train_config%.*})_${exp_tag}
    exp_name=${exp_prefix}_${config_string}_${exp_tag}
    if [[ -n ${extra_tag} ]]; then
        exp_name=${exp_name}_${extra_tag}
    fi
 fi
-model_dir=$root_dir/../checkpoints/$dataset/asr/${exp_name}
+model_dir=${code_dir}/../checkpoints/${dataset}/asr/${exp_name}

 if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
    echo "stage -1: Data Download"
@@ -114,11 +117,23 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    ### Task dependent. You have to do the following data preparation part by yourself.
    ### But you can utilize Kaldi recipes in most cases
    echo "stage 0: ASR Data Preparation"
+    if [[ ! -e ${data_dir} ]]; then
+        mkdir -p ${data_dir}
+    fi
+    feature_zip=fbank80.zip
+    if [[ ${speed_perturb} -eq 1 ]]; then
+        feature_zip=fbank80_sp.zip
+    fi
+    if [[ ! -f ${data_dir}/${feature_zip} && -f ${data_dir}/../${feature_zip} ]]; then
+        ln -s ${data_dir}/../${feature_zip} ${data_dir}
+    fi

-    cmd="python ${root_dir}/examples/speech_to_text/prep_mustc_data.py
+    cmd="python ${code_dir}/examples/speech_to_text/prep_audio_data.py
        --data-root ${org_data_dir}
        --output-root ${data_dir}
        --task asr
+        --src-lang ${src_lang}
+        --splits ${valid_split},${test_split},${train_split}
        --vocab-type ${vocab_type}
        --vocab-size ${vocab_size}"

@@ -127,7 +142,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --raw"
    fi
    if [[ ${use_specific_dict} -eq 1 ]]; then
-        cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}/${lang}
+        cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}
        cmd="$cmd
        --asr-prefix ${asr_vocab_prefix}"
    fi
@@ -147,6 +162,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then

    echo -e "\033[34mRun command: \n${cmd} \033[0m"
    [[ $eval -eq 1 ]] && eval ${cmd}
+
+    if [[ -f ${data_dir}/${feature_zip} && ! -f ${data_dir}/../${feature_zip} ]]; then
+        mv ${data_dir}/${feature_zip} ${data_dir}/..
+        ln -s ${data_dir}/../${feature_zip} ${data_dir}
+    fi
 fi

 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
@@ -173,28 +193,26 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    cp ${BASH_SOURCE[0]} ${model_dir}
    cp ${PWD}/train.sh ${model_dir}

+    extra_parameter="${extra_parameter}
+        --train-config ${pwd_dir}/conf/basis.yaml"
+    cp ${pwd_dir}/conf/basis.yaml ${model_dir}
    config_list="${train_config//,/ }"
-    idx=0
+    idx=1
    for config in ${config_list[@]}
    do
-        config_path=$pwd_dir/conf/${config}.yaml
+        config_path=${pwd_dir}/conf/${config}.yaml
        if [[ ! -f ${config_path} ]]; then
            echo "No config file ${config_path}"
            exit
        fi
        cp ${config_path} ${model_dir}

-        if [[ idx -eq 0 ]]; then
-            extra_parameter="${extra_parameter}
-        --train-config ${config_path}"
-        else
-            extra_parameter="${extra_parameter}
+        extra_parameter="${extra_parameter}
        --train-config${idx} ${config_path}"
-        fi
        idx=$((idx + 1))
    done

-    cmd="python3 -u ${root_dir}/fairseq_cli/train.py
+    cmd="python3 -u ${code_dir}/fairseq_cli/train.py
        ${data_dir}
        --config-yaml ${data_config}
        --task ${task}
@@ -278,12 +296,14 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
        # Average models
 		dec_model=avg_${n_average}_checkpoint.pt

-		cmd="python ${root_dir}/scripts/average_checkpoints.py
-        --inputs ${model_dir}
-        --num-epoch-checkpoints ${n_average}
-        --output ${model_dir}/${dec_model}"
-    	echo -e "\033[34mRun command: \n${cmd} \033[0m"
-    	[[ $eval -eq 1 ]] && eval $cmd
+        if [[ ! -f ${model_dir}/${dec_model} ]]; then
+            cmd="python ${code_dir}/scripts/average_checkpoints.py
+            --inputs ${model_dir}
+            --num-best-checkpoints ${n_average}
+            --output ${model_dir}/${dec_model}"
+            echo -e "\033[34mRun command: \n${cmd} \033[0m"
+            [[ $eval -eq 1 ]] && eval $cmd
+        fi
 	else
 		dec_model=${dec_model}
 	fi
@@ -303,8 +323,8 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then

    test_subset=${test_subset//,/ }
 	for subset in ${test_subset[@]}; do
-        subset=${subset}_asr
-  		cmd="python ${root_dir}/fairseq_cli/generate.py
+        subset=${subset}
+  		cmd="python ${code_dir}/fairseq_cli/generate.py
        ${data_dir}
        --config-yaml ${data_config}
        --gen-subset ${subset}

--- a/egs/mustc/mt/conf/base.yaml
+++ b/egs/mustc/mt/conf/base.yaml
-train-subset: train
-valid-subset: valid
-
-max-epoch: 50
-max-update: 100000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-skip-invalid-size-inputs-valid-test: True
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
-
 arch: transformer
 share-decoder-input-output-embed: True
 optimizer: adam

--- a/egs/mustc/mt/conf/base_s.yaml
+++ b/egs/mustc/mt/conf/base_s.yaml
-train-subset: train
-valid-subset: valid
-
-max-epoch: 50
-max-update: 100000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-skip-invalid-size-inputs-valid-test: True
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
-
 arch: transformer
 share-decoder-input-output-embed: True
 optimizer: adam

--- a/egs/mustc/mt/conf/basis.yaml
+++ b/egs/mustc/mt/conf/basis.yaml
+train-subset: train
+valid-subset: valid
+
+max-epoch: 50
+max-update: 100000
+patience: 20
+best_checkpoint_metric: loss
+maximize_best_checkpoint_metric: False
+
+no-epoch-checkpoints: True
+#keep-last-epochs: 10
+keep-best-checkpoints: 10
+
+num-workers: 8
+no-progress-bar: True
+log-interval: 100
+seed: 1
+report-accuracy: True
+skip-invalid-size-inputs-valid-test: True
\ No newline at end of file
--- a/egs/mustc/mt/decode.sh
+++ b/egs/mustc/mt/decode.sh
@@ -10,6 +10,7 @@ if [ "$#" -eq 1 ]; then
    exp_name=$1
 fi

+sacrebleu=1
 n_average=10
 beam_size=5
 len_penalty=1.0
@@ -21,6 +22,7 @@ cmd="./run.sh
    --stop_stage 2
    --gpu_num ${gpu_num}
    --exp_name ${exp_name}
+    --sacrebleu ${sacrebleu}
    --n_average ${n_average}
    --beam_size ${beam_size}
    --len_penalty ${len_penalty}

--- a/egs/mustc/mt/run.sh
+++ b/egs/mustc/mt/run.sh
@@ -13,7 +13,7 @@ set -o pipefail
 export PYTHONIOENCODING=UTF-8

 eval=1
-time=$(date "+%m%d_%H%M")
+time=$(date "+%m%d")

 stage=0
 stop_stage=0
@@ -24,7 +24,8 @@ device=()
 gpu_num=8
 update_freq=1

-root_dir=~/st/Fairseq-S2T
+root_dir=~/st
+code_dir=${root_dir}/Fairseq-S2T
 pwd_dir=$PWD

 # dataset
@@ -42,12 +43,12 @@ tokenizer=0

 use_specific_dict=0
 specific_prefix=st
-specific_dir=/home/xuchen/st/data/mustc/st/en-de/
+specific_dir=${root_dir}/data/mustc/st
 src_vocab_prefix=spm_unigram10000_st_share
 tgt_vocab_prefix=spm_unigram10000_st_share

-org_data_dir=~/st/data/${dataset}
-data_dir=~/st/data/${dataset}/mt/${lang}
+org_data_dir=${root_dir}/data/${dataset}
+data_dir=${root_dir}/data/${dataset}/mt
 train_subset=train
 valid_subset=dev
 trans_subset=tst-COMMON
@@ -70,6 +71,7 @@ step_valid=0
 bleu_valid=0

 # decoding setting
+sacrebleu=1
 dec_model=checkpoint_best.pt
 n_average=10
 beam_size=5
@@ -106,7 +108,6 @@ fi
 # full path
 if [[ -z ${exp_name} ]]; then
    config_string=${train_config//,/_}
-#    exp_name=${exp_prefix}_$(basename ${train_config%.*})_${exp_tag}
    exp_name=${exp_prefix}_${config_string}_${exp_tag}
    if [[ -n ${extra_tag} ]]; then
        exp_name=${exp_name}_${extra_tag}
@@ -128,7 +129,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then

    if [[ ! -f ${data_dir}/${src_vocab_prefix}.txt || ! -f ${data_dir}/${tgt_vocab_prefix}.txt ]]; then
        if [[ ${use_specific_dict} -eq 0 ]]; then
-            cmd="python ${root_dir}/examples/speech_to_text/prep_mt_data.py
+            cmd="python ${code_dir}/examples/speech_to_text/prep_mt_data.py
                --data-root ${org_data_dir}
                --output-root ${data_dir}
                --splits ${train_subset},${valid_subset},${trans_subset}
@@ -151,9 +152,10 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    mkdir -p ${data_dir}/data
    for split in ${train_subset} ${valid_subset} ${trans_subset}; do
    {
-        cmd="cat ${org_data_dir}/${lang}/data/${split}/txt/${split}.${src_lang}"
+        txt_dir=${org_data_dir}/data/${split}/txt
+        cmd="cat ${txt_dir}/${split}.${src_lang}"
        if [[ ${lcrm} -eq 1 ]]; then
-            cmd="python local/lower_rm.py ${org_data_dir}/${lang}/data/${split}.${src_lang}"
+            cmd="python local/lower_rm.py ${org_data_dir}/data/${split}.${src_lang}"
        fi
        cmd="${cmd}
        | spm_encode --model ${data_dir}/${src_vocab_prefix}.model
@@ -166,7 +168,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        cmd="spm_encode
        --model ${data_dir}/${tgt_vocab_prefix}.model
        --output_format=piece
-        < ${org_data_dir}/${lang}/data/${split}/txt/${split}.${tgt_lang}
+        < ${txt_dir}/${split}.${tgt_lang}
        > ${data_dir}/data/${split}.${tgt_lang}"

        echo -e "\033[34mRun command: \n${cmd} \033[0m"
@@ -175,7 +177,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    done
    wait

-    cmd="python ${root_dir}/fairseq_cli/preprocess.py
+    cmd="python ${code_dir}/fairseq_cli/preprocess.py
        --source-lang ${src_lang} --target-lang ${tgt_lang}
        --trainpref ${data_dir}/data/${train_subset}
        --validpref ${data_dir}/data/${valid_subset}
@@ -215,28 +217,26 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    cp ${BASH_SOURCE[0]} ${model_dir}
    cp ${PWD}/train.sh ${model_dir}

+    extra_parameter="${extra_parameter}
+        --train-config ${pwd_dir}/conf/basis.yaml"
+    cp ${pwd_dir}/conf/basis.yaml ${model_dir}
    config_list="${train_config//,/ }"
-    idx=0
+    idx=1
    for config in ${config_list[@]}
    do
-        config_path=$pwd_dir/conf/${config}.yaml
+        config_path=${pwd_dir}/conf/${config}.yaml
        if [[ ! -f ${config_path} ]]; then
            echo "No config file ${config_path}"
            exit
        fi
        cp ${config_path} ${model_dir}

-        if [[ idx -eq 0 ]]; then
-            extra_parameter="${extra_parameter}
-        --train-config ${config_path}"
-        else
-            extra_parameter="${extra_parameter}
+        extra_parameter="${extra_parameter}
        --train-config${idx} ${config_path}"
-        fi
        idx=$((idx + 1))
    done

-    cmd="python3 -u ${root_dir}/fairseq_cli/train.py
+    cmd="python3 -u ${code_dir}/fairseq_cli/train.py
        ${data_dir}
        --source-lang ${src_lang}
        --target-lang ${tgt_lang}
@@ -330,12 +330,14 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
        # Average models
 		dec_model=avg_${n_average}_checkpoint.pt

-		cmd="python ${root_dir}/scripts/average_checkpoints.py
-        --inputs ${model_dir}
-        --num-epoch-checkpoints ${n_average}
-        --output ${model_dir}/${dec_model}"
-    	echo -e "\033[34mRun command: \n${cmd} \033[0m"
-    	[[ $eval -eq 1 ]] && eval $cmd
+        if [[ ! -f ${model_dir}/${dec_model} ]]; then
+            cmd="python ${code_dir}/scripts/average_checkpoints.py
+            --inputs ${model_dir}
+            --num-best-checkpoints ${n_average}
+            --output ${model_dir}/${dec_model}"
+            echo -e "\033[34mRun command: \n${cmd} \033[0m"
+            [[ $eval -eq 1 ]] && eval $cmd
+        fi
 	else
 		dec_model=${dec_model}
 	fi
@@ -355,7 +357,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then

    test_subset=(${test_subset//,/ })
 	for subset in ${test_subset[@]}; do
-  		cmd="python ${root_dir}/fairseq_cli/generate.py
+  		cmd="python ${code_dir}/fairseq_cli/generate.py
        ${data_dir}
        --source-lang ${src_lang}
        --target-lang ${tgt_lang}
@@ -366,14 +368,17 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
        --max-tokens ${max_tokens}
        --beam ${beam_size}
        --lenpen ${len_penalty}
-        --post-process sentencepiece
-        --scoring sacrebleu"
+        --post-process sentencepiece"

-        if [[ ${tokenizer} -eq 1 ]]; then
+        if [[ ${sacrebleu} -eq 1 ]]; then
+            cmd="${cmd}
+        --scoring sacrebleu"
+            if [[ ${tokenizer} -eq 1 ]]; then
            cmd="${cmd}
        --tokenizer moses
        --moses-source-lang ${src_lang}
        --moses-target-lang ${tgt_lang}"
+            fi
        fi

    	echo -e "\033[34mRun command: \n${cmd} \033[0m"

--- a/egs/mustc/st/conf/base.yaml
+++ b/egs/mustc/st/conf/base.yaml
-train-subset: train_st
-valid-subset: dev_st
-
-max-epoch: 100
-max-update: 100000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
-
 arch: s2t_transformer_s
 share-decoder-input-output-embed: True
 optimizer: adam
@@ -42,3 +26,6 @@ decoder-ffn-embed-dim: 2048
 decoder-attention-heads: 4
 attention-dropout: 0.1
 activation-dropout: 0.1
+
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
\ No newline at end of file
--- a/egs/mustc/st/conf/basis.yaml
+++ b/egs/mustc/st/conf/basis.yaml
+train-subset: train
+valid-subset: dev
+
+max-epoch: 100
+max-update: 100000
+patience: 20
+best_checkpoint_metric: loss
+maximize_best_checkpoint_metric: False
+
+no-epoch-checkpoints: True
+#keep-last-epochs: 10
+keep-best-checkpoints: 10
+
+num-workers: 8
+no-progress-bar: True
+log-interval: 100
+seed: 1
+report-accuracy: True
+skip-invalid-size-inputs-valid-test: True
\ No newline at end of file
--- a/egs/mustc/st/conf/ctc.yaml
+++ b/egs/mustc/st/conf/ctc.yaml
-ctc-weight: 0.3
\ No newline at end of file
+ctc-weight: 0.3
+post-process: sentencepiece
\ No newline at end of file
--- a/egs/mustc/st/conf/pds_base.yaml
+++ b/egs/mustc/st/conf/pds_base.yaml
 arch: pdss2t_transformer_s_8
-
-train-subset: train_st
-valid-subset: dev_st
-
-max-epoch: 100
-max-update: 100000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
+pds-fusion: True
+ctc-layer: 12

 share-decoder-input-output-embed: True
 optimizer: adam
@@ -38,3 +24,6 @@ encoder-attention-heads: 4
 decoder-embed-dim: 256
 decoder-ffn-embed-dim: 2048
 decoder-attention-heads: 4
+
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
\ No newline at end of file
--- a/egs/mustc/st/conf/pds_base_16.yaml
+++ b/egs/mustc/st/conf/pds_base_16.yaml
 arch: pdss2t_transformer_s_16

 encoder-embed-dim: 256
-pyramid-stages: 4
-#pyramid-dropout: 0
-pyramid-layers: 2_2_6_2
-pyramid-ratios: 2_2_2_2
-pyramid-fusion: True
-pyramid-fusion-method: all_conv
-pyramid-embed-dims: 256_256_256_256
-pyramid-ds-method: conv
-pyramid-embed-norm: True
-pyramid-position-embed: 1_1_1_1
-pyramid-kernel-sizes: 5_5_5_5
-pyramid-ffn-ratios: 8_8_8_8
-pyramid-attn-heads: 4_4_4_4
-
-train-subset: train_st
-valid-subset: dev_st
-
-max-epoch: 100
-max-update: 100000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
+pds-stages: 4
+ctc-layer: 12
+pds-layers: 2_2_6_2
+pds-ratios: 2_2_2_2
+pds-fusion: True
+pds-fusion-method: all_conv
+pds-embed-dims: 256_256_256_256
+pds-ds-method: conv
+pds-embed-norm: True
+pds-position-embed: 1_1_1_1
+pds-kernel-sizes: 5_5_5_5
+pds-ffn-ratios: 8_8_8_8
+pds-attn-heads: 4_4_4_4

 share-decoder-input-output-embed: True
 optimizer: adam
@@ -53,3 +37,6 @@ encoder-attention-heads: 4
 decoder-embed-dim: 256
 decoder-ffn-embed-dim: 2048
 decoder-attention-heads: 4
+
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
\ No newline at end of file
--- a/egs/mustc/st/conf/pds_base_32.yaml
+++ b/egs/mustc/st/conf/pds_base_32.yaml
 arch: pdss2t_transformer_s_32

 encoder-embed-dim: 256
-pyramid-stages: 5
-#pyramid-dropout: 0
-pyramid-layers: 2_2_3_3_2
-pyramid-ratios: 2_2_2_2_2
-pyramid-fusion: True
-pyramid-fusion-method: all_conv
-pyramid-embed-dims: 256_256_256_256_256
-pyramid-ds-method: conv
-pyramid-embed-norm: True
-pyramid-position-embed: 1_1_1_1_1
-pyramid-kernel-sizes: 5_5_5_5_5
-pyramid-ffn-ratios: 8_8_8_8_8
-pyramid-attn-heads: 4_4_4_4_4
-
-train-subset: train_st
-valid-subset: dev_st
-
-max-epoch: 100
-max-update: 100000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
+pds-stages: 5
+ctc-layer: 12
+pds-layers: 2_2_3_3_2
+pds-ratios: 2_2_2_2_2
+pds-fusion: True
+pds-fusion-method: all_conv
+pds-embed-dims: 256_256_256_256_256
+pds-ds-method: conv
+pds-embed-norm: True
+pds-position-embed: 1_1_1_1_1
+pds-kernel-sizes: 5_5_5_5_5
+pds-ffn-ratios: 8_8_8_8_8
+pds-attn-heads: 4_4_4_4_4

 share-decoder-input-output-embed: True
 optimizer: adam
@@ -53,3 +37,6 @@ encoder-attention-heads: 4
 decoder-embed-dim: 256
 decoder-ffn-embed-dim: 2048
 decoder-attention-heads: 4
+
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
\ No newline at end of file
--- a/egs/mustc/st/conf/pds_base_8.yaml
+++ b/egs/mustc/st/conf/pds_base_8.yaml
 arch: pdss2t_transformer_s_8

 encoder-embed-dim: 256
-pyramid-stages: 4
-#pyramid-dropout: 0
-pyramid-layers: 3_3_3_3
-pyramid-ratios: 2_2_1_2
-pyramid-fusion: True
-pyramid-fusion-method: all_conv
-pyramid-embed-dims: 256_256_256_256
-pyramid-ds-method: conv
-pyramid-embed-norm: True
-pyramid-position-embed: 1_1_1_1
-pyramid-kernel-sizes: 5_5_5_5
-pyramid-ffn-ratios: 8_8_8_8
-pyramid-attn-heads: 4_4_4_4
-
-train-subset: train_st
-valid-subset: dev_st
-
-max-epoch: 100
-max-update: 100000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
+pds-stages: 4
+ctc-layer: 12
+pds-layers: 3_3_3_3
+pds-ratios: 2_2_1_2
+pds-fusion: True
+pds-fusion-method: all_conv
+pds-embed-dims: 256_256_256_256
+pds-ds-method: conv
+pds-embed-norm: True
+pds-position-embed: 1_1_1_1
+pds-kernel-sizes: 5_5_5_5
+pds-ffn-ratios: 8_8_8_8
+pds-attn-heads: 4_4_4_4

 share-decoder-input-output-embed: True
 optimizer: adam
@@ -53,3 +37,6 @@ encoder-attention-heads: 4
 decoder-embed-dim: 256
 decoder-ffn-embed-dim: 2048
 decoder-attention-heads: 4
+
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
\ No newline at end of file
--- a/egs/mustc/st/conf/sate_ctc.yaml
+++ b/egs/mustc/st/conf/sate_ctc.yaml
-train-subset: train_st
-valid-subset: dev_st
-
-max-epoch: 100
-max-update: 100000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-#load-pretrained-encoder-from:
-#load-pretrained-acoustic-encoder-from:
-#load-pretrained-text-encoder-from:
-#load-pretrained-decoder-from:
-
 arch: s2t_sate
 share-decoder-input-output-embed: True
 optimizer: adam
@@ -43,6 +25,11 @@ text-encoder-layers: 6
 decoder-layers: 6
 encoder-attention-heads: 4

+#load-pretrained-encoder-from:
+#load-pretrained-acoustic-encoder-from:
+#load-pretrained-text-encoder-from:
+#load-pretrained-decoder-from:
+
 #macaron-style: True
 #use-cnn-module: True
 #cnn-module-kernel: 31
@@ -52,20 +39,20 @@ acoustic-encoder: transformer
 adapter: league

 encoder-embed-dim: 256
-pyramid-stages: 4
-#pyramid-dropout: 0
-pyramid-layers: 3_3_3_3
-pyramid-ratios: 2_2_1_2
-pyramid-fusion: True
-pyramid-fusion-method: all_conv
-pyramid-embed-dims: 256_256_256_256
-pyramid-ds-method: conv
-pyramid-embed-norm: True
-pyramid-position-embed: 1_1_1_1
-pyramid-kernel-sizes: 5_5_5_5
-pyramid-ffn-ratios: 8_8_8_8
-pyramid-attn-heads: 4_4_4_4
+pds-stages: 4
+#pds-dropout: 0
+pds-layers: 3_3_3_3
+pds-ratios: 2_2_1_2
+pds-fusion: True
+pds-fusion-method: all_conv
+pds-embed-dims: 256_256_256_256
+pds-ds-method: conv
+pds-embed-norm: True
+pds-position-embed: 1_1_1_1
+pds-kernel-sizes: 5_5_5_5
+pds-ffn-ratios: 8_8_8_8
+pds-attn-heads: 4_4_4_4

 decoder-embed-dim: 256
 decoder-ffn-embed-dim: 2048
-decoder-attention-heads: 4
+decoder-attention-heads: 4
\ No newline at end of file
--- a/egs/mustc/st/decode.sh
+++ b/egs/mustc/st/decode.sh
@@ -3,13 +3,14 @@
 gpu_num=1

 data_dir=
-test_subset=(tst-COMMON)
+test_subset=(dev tst-COMMON)

 exp_name=
 if [ "$#" -eq 1 ]; then
    exp_name=$1
 fi

+sacrebleu=1
 n_average=10
 beam_size=5
 len_penalty=1.0
@@ -21,6 +22,7 @@ cmd="./run.sh
    --stop_stage 2
    --gpu_num ${gpu_num}
    --exp_name ${exp_name}
+    --sacrebleu ${sacrebleu}
    --n_average ${n_average}
    --beam_size ${beam_size}
    --len_penalty ${len_penalty}
@@ -31,7 +33,7 @@ cmd="./run.sh
 if [[ -n ${data_dir} ]]; then
    cmd="$cmd --data_dir ${data_dir}"
 fi
-if [[ ${#test_subset[@]} -eq 0 ]]; then
+if [[ ${#test_subset[@]} -ne 0 ]]; then
    subsets=$(echo ${test_subset[*]} | sed 's/ /,/g')
    cmd="$cmd --test_subset ${subsets}"
 fi

--- a/egs/mustc/st/run.sh
+++ b/egs/mustc/st/run.sh
@@ -24,7 +24,8 @@ stop_stage=0
 gpu_num=8
 update_freq=1

-root_dir=~/st/Fairseq-S2T
+root_dir=~/st
+code_dir=${root_dir}/Fairseq-S2T
 pwd_dir=$PWD

 # dataset
@@ -41,15 +42,19 @@ share_dict=1
 speed_perturb=0
 lcrm=0
 tokenizer=0
+use_raw_audio=0

 use_specific_dict=0
 specific_prefix=valid
-specific_dir=/home/xuchen/st/data/mustc/st_lcrm/en-de
+specific_dir=${root_dir}/data/mustc/st
 asr_vocab_prefix=spm_unigram10000_st_share
 st_vocab_prefix=spm_unigram10000_st_share

-org_data_dir=~/st/data/${dataset}
-data_dir=~/st/data/${dataset}/st
+org_data_dir=${root_dir}/data/${dataset}
+data_dir=${root_dir}/data/${dataset}/st
+train_split=train
+valid_split=dev
+test_split=tst-COMMON
 test_subset=tst-COMMON

 # exp
@@ -60,7 +65,7 @@ exp_tag=baseline
 exp_name=

 # config
-train_config=ctc
+train_config=base,ctc

 # training setting
 fp16=1
@@ -69,15 +74,16 @@ step_valid=0
 bleu_valid=0

 # decoding setting
+sacrebleu=1
 dec_model=checkpoint_best.pt
 n_average=10
 beam_size=5
 len_penalty=1.0

 if [[ ${share_dict} -eq 1 ]]; then
-	data_config=config_st_share.yaml
+	data_config=config_share.yaml
 else
-	data_config=config_st.yaml
+	data_config=config.yaml
 fi
 if [[ ${speed_perturb} -eq 1 ]]; then
    data_dir=${data_dir}_sp
@@ -95,18 +101,21 @@ if [[ ${tokenizer} -eq 1 ]]; then
    data_dir=${data_dir}_tok
    exp_prefix=${exp_prefix}_tok
 fi
+if [[ ${use_raw_audio} -eq 1 ]]; then
+    data_dir=${data_dir}_raw
+    exp_prefix=${exp_prefix}_raw
+fi

 . ./local/parse_options.sh || exit 1;

 if [[ -z ${exp_name} ]]; then
    config_string=${train_config//,/_}
-#    exp_name=${exp_prefix}_$(basename ${train_config%.*})_${exp_tag}
    exp_name=${exp_prefix}_${config_string}_${exp_tag}
    if [[ -n ${extra_tag} ]]; then
        exp_name=${exp_name}_${extra_tag}
    fi
 fi
-model_dir=$root_dir/../checkpoints/$dataset/st/${exp_name}
+model_dir=$code_dir/../checkpoints/$dataset/st/${exp_name}

 if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
    echo "stage -1: Data Download"
@@ -117,37 +126,49 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    ### Task dependent. You have to make data the following preparation part by yourself.
    ### But you can utilize Kaldi recipes in most cases
    echo "stage 0: ASR Data Preparation"
-    if [[ ! -e ${data_dir}/${lang} ]]; then
-        mkdir -p ${data_dir}/${lang}
+    if [[ ! -e ${data_dir} ]]; then
+        mkdir -p ${data_dir}
+    fi
+    if [[ ! -f ${data_dir}/fbank80.zip && -f ${data_dir}/../fbank80.zip ]]; then
+        ln -s ${data_dir}/../fbank80.zip ${data_dir}
+    fi
+    if [[ ! -f ${data_dir}/fbank80_sp.zip && -f ${data_dir}/../fbank80_sp.zip ]]; then
+        ln -s ${data_dir}/../fbank80_sp.zip ${data_dir}
    fi

-    cmd="python ${root_dir}/examples/speech_to_text/prep_mustc_data.py
+    # create ASR vocabulary if necessary
+    cmd="python ${code_dir}/examples/speech_to_text/prep_audio_data.py
        --data-root ${org_data_dir}
-        --output-root ${data_dir}
+        --output-root ${data_dir}/asr4st
        --task asr
+        --raw
+        --src-lang ${src_lang}
+        --splits ${valid_split},${test_split},${train_split}
        --vocab-type ${vocab_type}
        --vocab-size ${asr_vocab_size}"
-    if [[ ${speed_perturb} -eq 1 ]]; then
-        cmd="$cmd
-        --speed-perturb"
-    fi
-    echo -e "\033[34mRun command: \n${cmd} \033[0m"
-    [[ $eval -eq 1 && ${share_dict} -ne 1 && ${use_specific_dict} -ne 1 ]] && eval $cmd
+    [[ $eval -eq 1 && ${share_dict} -ne 1 && ${use_specific_dict} -ne 1 ]] && (echo -e "\033[34mRun command: \n${cmd} \033[0m" && eval $cmd)
    asr_prefix=spm_${vocab_type}${asr_vocab_size}_asr

    echo "stage 0: ST Data Preparation"
-    cmd="python ${root_dir}/examples/speech_to_text/prep_mustc_data.py
+    cmd="python ${code_dir}/examples/speech_to_text/prep_audio_data.py
        --data-root ${org_data_dir}
        --output-root ${data_dir}
        --task st
        --add-src
+        --src-lang ${src_lang}
+        --tgt-lang ${tgt_lang}
+        --splits ${valid_split},${test_split},${train_split}
        --cmvn-type utterance
        --vocab-type ${vocab_type}
        --vocab-size ${vocab_size}"

+    if [[ ${use_raw_audio} -eq 1 ]]; then
+        cmd="$cmd
+        --raw"
+    fi
    if [[ ${use_specific_dict} -eq 1 ]]; then
-        cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}/${lang}
-        cp -r ${specific_dir}/${st_vocab_prefix}.* ${data_dir}/${lang}
+        cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}
+        cp -r ${specific_dir}/${st_vocab_prefix}.* ${data_dir}
        if [[ $share_dict -eq 1 ]]; then
            cmd="$cmd
        --share
@@ -182,9 +203,16 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then

    echo -e "\033[34mRun command: \n${cmd} \033[0m"
    [[ $eval -eq 1 ]] && eval ${cmd}
-fi

-data_dir=${data_dir}/${lang}
+    if [[ ! -f ${data_dir}/../fbank80.zip ]]; then
+        mv ${data_dir}/fbank80.zip ${data_dir}/..
+        ln -s ${data_dir}/../fbank80.zip ${data_dir}
+    fi
+    if [[ ! -f ${data_dir}/../fbank80_sp.zip ]]; then
+        mv ${data_dir}/fbank80_sp.zip ${data_dir}/..
+        ln -s ${data_dir}/../fbank80_sp.zip ${data_dir}
+    fi
+fi

 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    echo "stage 1: ST Network Training"
@@ -210,28 +238,26 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    cp ${BASH_SOURCE[0]} ${model_dir}
    cp ${PWD}/train.sh ${model_dir}

+    extra_parameter="${extra_parameter}
+        --train-config ${pwd_dir}/conf/basis.yaml"
+    cp ${pwd_dir}/conf/basis.yaml ${model_dir}
    config_list="${train_config//,/ }"
-    idx=0
+    idx=1
    for config in ${config_list[@]}
    do
-        config_path=$pwd_dir/conf/${config}.yaml
+        config_path=${pwd_dir}/conf/${config}.yaml
        if [[ ! -f ${config_path} ]]; then
            echo "No config file ${config_path}"
            exit
        fi
        cp ${config_path} ${model_dir}

-        if [[ idx -eq 0 ]]; then
-            extra_parameter="${extra_parameter}
-        --train-config ${config_path}"
-        else
-            extra_parameter="${extra_parameter}
+        extra_parameter="${extra_parameter}
        --train-config${idx} ${config_path}"
-        fi
        idx=$((idx + 1))
    done

-    cmd="python3 -u ${root_dir}/fairseq_cli/train.py
+    cmd="python3 -u ${code_dir}/fairseq_cli/train.py
        ${data_dir}
        --config-yaml ${data_config}
        --task ${task}
@@ -324,12 +350,14 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
        # Average models
 		dec_model=avg_${n_average}_checkpoint.pt

-		cmd="python ${root_dir}/scripts/average_checkpoints.py
-        --inputs ${model_dir}
-        --num-epoch-checkpoints ${n_average}
-        --output ${model_dir}/${dec_model}"
-    	echo -e "\033[34mRun command: \n${cmd} \033[0m"
-    	[[ $eval -eq 1 ]] && eval $cmd
+        if [[ ! -f ${model_dir}/${dec_model} ]]; then
+            cmd="python ${code_dir}/scripts/average_checkpoints.py
+            --inputs ${model_dir}
+            --num-best-checkpoints ${n_average}
+            --output ${model_dir}/${dec_model}"
+            echo -e "\033[34mRun command: \n${cmd} \033[0m"
+            [[ $eval -eq 1 ]] && eval $cmd
+        fi
 	else
 		dec_model=${dec_model}
 	fi
@@ -348,9 +376,9 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
 	[[ -f ${result_file} ]] && rm ${result_file}

    test_subset=${test_subset//,/ }
-	for subset in "${test_subset[@]}"; do
-        subset=${subset}_st
-  		cmd="python ${root_dir}/fairseq_cli/generate.py
+	for subset in ${test_subset[@]}; do
+        subset=${subset}
+  		cmd="python ${code_dir}/fairseq_cli/generate.py
        ${data_dir}
        --config-yaml ${data_config}
        --gen-subset ${subset}
@@ -359,8 +387,19 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
        --results-path ${model_dir}
        --max-tokens ${max_tokens}
        --beam ${beam_size}
-        --lenpen ${len_penalty}
+        --lenpen ${len_penalty}"
+
+        if [[ ${sacrebleu} -eq 1 ]]; then
+            cmd="${cmd}
        --scoring sacrebleu"
+            if [[ ${tokenizer} -eq 1 ]]; then
+            cmd="${cmd}
+        --tokenizer moses
+        --moses-source-lang ${src_lang}
+        --moses-target-lang ${tgt_lang}"
+            fi
+        fi
+
    	echo -e "\033[34mRun command: \n${cmd} \033[0m"

        if [[ $eval -eq 1 ]]; then

--- a/egs/template/asr/binary.sh
+++ b/egs/template/asr/binary.sh
-set -e
-
-eval=1
-
-lcrm=0
-tokenizer=0
-
-root_dir=~/st/Fairseq-S2T
-data_dir=~/st/data/test
-vocab_dir=~/st/data/mustc/st/en-de
-asr_vocab_prefix=spm_unigram10000_st_share
-
-src_lang=en
-tgt_lang=de
-subsets=(2019)
-
-cp -r ${vocab_dir}/${asr_vocab_prefix}.* ${data_dir}/${src_lang}-${tgt_lang}
-rm -rf ${data_dir}/${src_lang}-${tgt_lang}/fbank80.zip
-
-splits=$(echo ${subsets[*]} | sed 's/ /,/g')
-cmd="python ${root_dir}/examples/speech_to_text/prep_st_data.py
-    --data-root ${data_dir}
-    --output-root ${data_dir}
-    --splits ${splits}
-    --task asr
-    --src-lang ${src_lang}
-    --tgt-lang ${tgt_lang}
-    --add-src
-    --share
-    --asr-prefix ${asr_vocab_prefix}
-    --cmvn-type utterance"
-
-    if [[ ${lcrm} -eq 1 ]]; then
-        cmd="$cmd
-    --lowercase-src
-    --rm-punc-src"
-    fi
-    if [[ ${tokenizer} -eq 1 ]]; then
-        cmd="$cmd
-    --tokenizer"
-    fi
-
-echo -e "\033[34mRun command: \n${cmd} \033[0m"
-[[ $eval -eq 1 ]] && eval ${cmd}
--- a/egs/template/asr/conf/base.yaml
+++ b/egs/template/asr/conf/base.yaml
-train-subset: train_asr
-valid-subset: dev_asr
-
-max-epoch: 100
-max-update: 100000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
-
-arch: s2t_transformer_s
-share-decoder-input-output-embed: True
-optimizer: adam
-clip-norm: 10.0
-lr-scheduler: inverse_sqrt
-warmup-init-lr: 1e-7
-warmup-updates: 10000
-lr: 2e-3
-#adam_betas: (0.9,0.98)
-
-criterion: label_smoothed_cross_entropy_with_ctc
-label_smoothing: 0.1
-
-conv-kernel-sizes: 5,5
-conv-channels: 1024
-dropout: 0.1
-activation-fn: relu
-encoder-embed-dim: 256
-encoder-ffn-embed-dim: 2048
-encoder-layers: 12
-decoder-layers: 6
-encoder-attention-heads: 4
-
-decoder-embed-dim: 256
-decoder-ffn-embed-dim: 2048
-decoder-attention-heads: 4
-attention-dropout: 0.1
-activation-dropout: 0.1
--- a/egs/template/asr/conf/conformer.yaml
+++ b/egs/template/asr/conf/conformer.yaml
-macaron-style: True
-use-cnn-module: True
-cnn-module-kernel: 31
--- a/egs/template/asr/conf/ctc.yaml
+++ b/egs/template/asr/conf/ctc.yaml
-ctc-weight: 0.3
--- a/egs/template/asr/conf/dlcl.yaml
+++ b/egs/template/asr/conf/dlcl.yaml
-use-enc-dlcl: True
-use-dec-dlcl: True
--- a/egs/template/asr/conf/local_attn.yaml
+++ b/egs/template/asr/conf/local_attn.yaml
-encoder-attention-type: local
-hard-mask-window: 0
-gauss-mask-sigma: 3
-init-mask-weight: 0
\ No newline at end of file
--- a/egs/template/asr/conf/pds_base_16.yaml
+++ b/egs/template/asr/conf/pds_base_16.yaml
-arch: pdss2t_transformer_s_16
-
-encoder-embed-dim: 256
-pyramid-stages: 4
-#pyramid-dropout: 0
-pyramid-layers: 2_2_6_2
-pyramid-ratios: 2_2_2_2
-pyramid-fusion: True
-pyramid-fusion-method: all_conv
-pyramid-embed-dims: 256_256_256_256
-pyramid-ds-method: conv
-pyramid-embed-norm: True
-pyramid-position-embed: 1_1_1_1
-pyramid-kernel-sizes: 5_5_5_5
-pyramid-ffn-ratios: 8_8_8_8
-pyramid-attn-heads: 4_4_4_4
-
-train-subset: train_asr
-valid-subset: dev_asr
-
-max-epoch: 100
-max-update: 100000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
-
-share-decoder-input-output-embed: True
-optimizer: adam
-clip-norm: 10.0
-lr-scheduler: inverse_sqrt
-warmup-init-lr: 1e-7
-warmup-updates: 10000
-lr: 2e-3
-#adam_betas: (0.9,0.98)
-
-criterion: label_smoothed_cross_entropy_with_ctc
-label_smoothing: 0.1
-
-dropout: 0.1
-activation-fn: relu
-encoder-ffn-embed-dim: 2048
-encoder-layers: 12
-decoder-layers: 6
-encoder-attention-heads: 4
-
-decoder-embed-dim: 256
-decoder-ffn-embed-dim: 2048
-decoder-attention-heads: 4
--- a/egs/template/asr/conf/pds_base_32.yaml
+++ b/egs/template/asr/conf/pds_base_32.yaml
-arch: pdss2t_transformer_s_32
-
-encoder-embed-dim: 256
-pyramid-stages: 5
-#pyramid-dropout: 0
-pyramid-layers: 2_2_3_3_2
-pyramid-ratios: 2_2_2_2_2
-pyramid-fusion: True
-pyramid-fusion-method: all_conv
-pyramid-embed-dims: 256_256_256_256_256
-pyramid-ds-method: conv
-pyramid-embed-norm: True
-pyramid-position-embed: 1_1_1_1_1
-pyramid-kernel-sizes: 5_5_5_5_5
-pyramid-ffn-ratios: 8_8_8_8_8
-pyramid-attn-heads: 4_4_4_4_4
-
-train-subset: train_asr
-valid-subset: dev_asr
-
-max-epoch: 100
-max-update: 100000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
-
-share-decoder-input-output-embed: True
-optimizer: adam
-clip-norm: 10.0
-lr-scheduler: inverse_sqrt
-warmup-init-lr: 1e-7
-warmup-updates: 10000
-lr: 2e-3
-#adam_betas: (0.9,0.98)
-
-criterion: label_smoothed_cross_entropy_with_ctc
-label_smoothing: 0.1
-
-dropout: 0.1
-activation-fn: relu
-encoder-ffn-embed-dim: 2048
-encoder-layers: 12
-decoder-layers: 6
-encoder-attention-heads: 4
-
-decoder-embed-dim: 256
-decoder-ffn-embed-dim: 2048
-decoder-attention-heads: 4
--- a/egs/template/asr/conf/pds_base_8.yaml
+++ b/egs/template/asr/conf/pds_base_8.yaml
-arch: pdss2t_transformer_s_8
-
-encoder-embed-dim: 256
-pyramid-stages: 4
-#pyramid-dropout: 0
-pyramid-layers: 3_3_3_3
-pyramid-ratios: 2_2_1_2
-pyramid-fusion: True
-pyramid-fusion-method: all_conv
-pyramid-embed-dims: 256_256_256_256
-pyramid-ds-method: conv
-pyramid-embed-norm: True
-pyramid-position-embed: 1_1_1_1
-pyramid-kernel-sizes: 5_5_5_5
-pyramid-ffn-ratios: 8_8_8_8
-pyramid-attn-heads: 4_4_4_4
-
-train-subset: train_asr
-valid-subset: dev_asr
-
-max-epoch: 100
-max-update: 100000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
-
-share-decoder-input-output-embed: True
-optimizer: adam
-clip-norm: 10.0
-lr-scheduler: inverse_sqrt
-warmup-init-lr: 1e-7
-warmup-updates: 10000
-lr: 2e-3
-#adam_betas: (0.9,0.98)
-
-criterion: label_smoothed_cross_entropy_with_ctc
-label_smoothing: 0.1
-
-dropout: 0.1
-activation-fn: relu
-encoder-ffn-embed-dim: 2048
-encoder-layers: 12
-decoder-layers: 6
-encoder-attention-heads: 4
-
-decoder-embed-dim: 256
-decoder-ffn-embed-dim: 2048
-decoder-attention-heads: 4
--- a/egs/template/asr/conf/rpr.yaml
+++ b/egs/template/asr/conf/rpr.yaml
-encoder-attention-type: rel_selfattn
-#encoder-attention-type: relative
-#max-encoder-relative-length: 100
--- a/egs/template/asr/decode.sh
+++ b/egs/template/asr/decode.sh
-#! /bin/bash
-
-gpu_num=1
-
-data_dir=
-test_subset=(test)
-
-exp_name=
-if [ "$#" -eq 1 ]; then
-    exp_name=$1
-fi
-
-n_average=10
-beam_size=5
-len_penalty=1.0
-max_tokens=80000
-dec_model=checkpoint_best.pt
-
-cmd="./run.sh
-    --stage 2
-    --stop_stage 2
-    --gpu_num ${gpu_num}
-    --exp_name ${exp_name}
-    --n_average ${n_average}
-    --beam_size ${beam_size}
-    --len_penalty ${len_penalty}
-    --max_tokens ${max_tokens}
-    --dec_model ${dec_model}
-    "
-
-if [[ -n ${data_dir} ]]; then
-    cmd="$cmd --data_dir ${data_dir}"
-fi
-if [[ ${#test_subset[@]} -ne 0 ]]; then
-    subsets=$(echo ${test_subset[*]} | sed 's/ /,/g')
-    cmd="$cmd --test_subset ${subsets}"
-fi
-
-echo $cmd
-eval $cmd
--- a/egs/template/asr/local/monitor.sh
+++ b/egs/template/asr/local/monitor.sh
-gpu_num=4
-cmd="sh train.sh"
-
-while :
-do
-    record=$(mktemp -t temp.record.XXXXXX)
-    gpustat > $record
-    all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
-
-    count=0
-    for dev in ${all_devices[@]}
-    do
-        line=$((dev + 2))
-        use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
-
-        if [[ $use -lt 100 ]]; then
-            device[$count]=$dev
-            count=$((count + 1))
-            if [[ $count -eq $gpu_num ]]; then
-                break
-            fi
-        fi
-    done
-    if [[ ${#device[@]} -lt $gpu_num ]]; then
-        sleep 60s
-    else
-        echo "Run $cmd"
-        eval $cmd
-        sleep 10s
-        exit
-    fi
-done
--- a/egs/template/asr/local/parse_options.sh
+++ b/egs/template/asr/local/parse_options.sh
-#!/usr/bin/env bash
-
-# Copyright 2012  Johns Hopkins University (Author: Daniel Povey);
-#                 Arnab Ghoshal, Karel Vesely
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#  http://www.apache.org/licenses/LICENSE-2.0
-#
-# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-# MERCHANTABLITY OR NON-INFRINGEMENT.
-# See the Apache 2 License for the specific language governing permissions and
-# limitations under the License.
-
-
-# Parse command-line options.
-# To be sourced by another script (as in ". parse_options.sh").
-# Option format is: --option-name arg
-# and shell variable "option_name" gets set to value "arg."
-# The exception is --help, which takes no arguments, but prints the
-# $help_message variable (if defined).
-
-
-###
-### The --config file options have lower priority to command line
-### options, so we need to import them first...
-###
-
-# Now import all the configs specified by command-line, in left-to-right order
-for ((argpos=1; argpos<$#; argpos++)); do
-  if [ "${!argpos}" == "--config" ]; then
-    argpos_plus1=$((argpos+1))
-    config=${!argpos_plus1}
-    [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
-    . $config  # source the config file.
-  fi
-done
-
-
-###
-### Now we process the command line options
-###
-while true; do
-  [ -z "${1:-}" ] && break;  # break if there are no arguments
-  case "$1" in
-    # If the enclosing script is called with --help option, print the help
-    # message and exit.  Scripts should put help messages in $help_message
-    --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
-      else printf "$help_message\n" 1>&2 ; fi;
-      exit 0 ;;
-    --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
-      exit 1 ;;
-    # If the first command-line argument begins with "--" (e.g. --foo-bar),
-    # then work out the variable name as $name, which will equal "foo_bar".
-    --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
-      # Next we test whether the variable in question is undefned-- if so it's
-      # an invalid option and we die.  Note: $0 evaluates to the name of the
-      # enclosing script.
-      # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
-      # is undefined.  We then have to wrap this test inside "eval" because
-      # foo_bar is itself inside a variable ($name).
-      eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
-
-      oldval="`eval echo \\$$name`";
-      # Work out whether we seem to be expecting a Boolean argument.
-      if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
-        was_bool=true;
-      else
-        was_bool=false;
-      fi
-
-      # Set the variable to the right value-- the escaped quotes make it work if
-      # the option had spaces, like --cmd "queue.pl -sync y"
-      eval $name=\"$2\";
-
-      # Check that Boolean-valued arguments are really Boolean.
-      if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
-        echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
-        exit 1;
-      fi
-      shift 2;
-      ;;
-  *) break;
-  esac
-done
-
-
-# Check for an empty argument to the --cmd option, which can easily occur as a
-# result of scripting errors.
-[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
-
-
-true; # so this script returns exit code 0.
--- a/egs/template/asr/local/utils.sh
+++ b/egs/template/asr/local/utils.sh
-
-get_devices(){
-    gpu_num=$1
-    use_cpu=$2
-    device=()
-    while :
-    do
-        record=$(mktemp -t temp.record.XXXXXX)
-        gpustat > $record
-        all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
-
-        count=0
-        for dev in ${all_devices[@]}
-        do
-            line=$((dev + 2))
-            use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
-            if [[ $use -lt 100 ]]; then
-                device[$count]=$dev
-                count=$((count + 1))
-                if [[ $count -eq $gpu_num ]]; then
-                    break
-                fi
-            fi
-        done
-        if [[ ${#device[@]} -lt $gpu_num ]]; then
-            if [[ $use_cpu -eq 1 ]]; then
-                device=(-1)
-            else
-                sleep 60s
-            fi
-        else
-            break
-        fi
-    done
-
-    echo ${device[*]} | sed 's/ /,/g'
-    return $?
-}
-
-
--- a/egs/template/asr/run.sh
+++ b/egs/template/asr/run.sh
-#! /bin/bash
-
-# Processing ASR Datasets
-
-# Copyright 2021 Natural Language Processing Laboratory 
-# Xu Chen (xuchenneu@163.com)
-
-# Set bash to 'debug' mode, it will exit on :
-# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
-set -e
-#set -u
-set -o pipefail
-export PYTHONIOENCODING=UTF-8
-
-eval=1
-time=$(date "+%m%d_%H%M")
-
-stage=0
-stop_stage=0
-
-######## hardware ########
-# devices
-#device=()
-gpu_num=8
-update_freq=1
-
-root_dir=~/st/Fairseq-S2T
-pwd_dir=$PWD
-
-# dataset
-src_lang=en
-lang=${src_lang}
-
-dataset=asr
-task=speech_to_text
-vocab_type=unigram
-vocab_size=5000
-speed_perturb=0
-lcrm=1
-tokenizer=0
-
-use_specific_dict=0
-specific_prefix=st
-specific_dir=/home/xuchen/st/data/mustc/st/en-de
-asr_vocab_prefix=spm_unigram10000_st_share
-
-org_data_dir=~/st/data/${dataset}
-data_dir=~/st/data/${dataset}/asr
-train_split=train
-valid_split=dev
-test_split=test
-test_subset=test
-
-# exp
-exp_prefix=$(date "+%m%d")
-extra_tag=
-extra_parameter=
-exp_tag=baseline
-exp_name=
-
-# config
-train_config=ctc
-data_config=config_asr.yaml
-
-# training setting
-fp16=1
-max_tokens=40000
-step_valid=0
-
-# decoding setting
-dec_model=checkpoint_best.pt
-n_average=10
-beam_size=5
-len_penalty=1.0
-
-if [[ ${speed_perturb} -eq 1 ]]; then
-    data_dir=${data_dir}_sp
-    exp_prefix=${exp_prefix}_sp
-fi
-if [[ ${lcrm} -eq 1 ]]; then
-    data_dir=${data_dir}_lcrm
-    exp_prefix=${exp_prefix}_lcrm
-fi
-if [[ ${use_specific_dict} -eq 1 ]]; then
-    data_dir=${data_dir}_${specific_prefix}
-    exp_prefix=${exp_prefix}_${specific_prefix}
-fi
-if [[ ${tokenizer} -eq 1 ]]; then
-    data_dir=${data_dir}_tok
-    exp_prefix=${exp_prefix}_tok
-fi
-
-. ./local/parse_options.sh || exit 1;
-
-if [[ -z ${exp_name} ]]; then
-    config_string=${train_config//,/_}
-    exp_name=${exp_prefix}_${config_string}_${exp_tag}
-    if [[ -n ${extra_tag} ]]; then
-        exp_name=${exp_name}_${extra_tag}
-    fi
-fi
-model_dir=$root_dir/../checkpoints/$dataset/asr/${exp_name}
-
-if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
-    echo "stage -1: Data Download"
-    # pass
-fi
-
-if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-    ### Task dependent. You have to make data the following preparation part by yourself.
-    ### But you can utilize Kaldi recipes in most cases
-    echo "stage 0: ASR Data Preparation"
-    if [[ ! -e ${data_dir} ]]; then
-        mkdir -p ${data_dir}
-    fi
-
-    cmd="python ${root_dir}/examples/speech_to_text/prep_asr_data.py
-        --data-root ${org_data_dir}
-        --output-root ${data_dir}
-        --task asr
-        --splits ${train_split},${valid_split},${test_split}
-        --lang ${lang}
-        --vocab-type ${vocab_type}
-        --vocab-size ${vocab_size}"
-
-    if [[ ${use_specific_dict} -eq 1 ]]; then
-        cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}/${lang}
-        cmd="$cmd
-        --asr-prefix ${asr_vocab_prefix}"
-    fi
-    if [[ ${speed_perturb} -eq 1 ]]; then
-        cmd="$cmd
-        --speed-perturb"
-    fi
-    if [[ ${lcrm} -eq 1 ]]; then
-        cmd="$cmd
-        --lowercase-src
-        --rm-punc-src"
-    fi
-    if [[ ${tokenizer} -eq 1 ]]; then
-        cmd="$cmd
-        --tokenizer"
-    fi
-
-    echo -e "\033[34mRun command: \n${cmd} \033[0m"
-    [[ $eval -eq 1 ]] && eval ${cmd}
-fi
-
-data_dir=${data_dir}/${lang}
-
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-    echo "stage 1: ASR Network Training"
-    [[ ! -d ${data_dir} ]] && echo "The data dir ${data_dir} is not existing!" && exit 1;
-
-    if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
-		if [[ ${gpu_num} -eq 0 ]]; then
-			device=""
-		else
-        	source ./local/utils.sh
-        	device=$(get_devices $gpu_num 0)
-		fi
-    fi
-
-    echo -e "dev=${device} data=${data_dir} model=${model_dir}"
-
-    if [[ ! -d ${model_dir} ]]; then
-        mkdir -p ${model_dir}
-    else
-        echo "${model_dir} exists."
-    fi
-
-    cp ${BASH_SOURCE[0]} ${model_dir}
-    cp ${PWD}/train.sh ${model_dir}
-
-    config_list="${train_config//,/ }"
-    idx=0
-    for config in ${config_list[@]}
-    do
-        config_path=$pwd_dir/conf/${config}.yaml
-        if [[ ! -f ${config_path} ]]; then
-            echo "No config file ${config_path}"
-            exit
-        fi
-        cp ${config_path} ${model_dir}
-
-        if [[ idx -eq 0 ]]; then
-            extra_parameter="${extra_parameter}
-        --train-config ${config_path}"
-        else
-            extra_parameter="${extra_parameter}
-        --train-config${idx} ${config_path}"
-        fi
-        idx=$((idx + 1))
-    done
-
-    cmd="python3 -u ${root_dir}/fairseq_cli/train.py
-        ${data_dir}
-        --config-yaml ${data_config}
-        --task ${task}
-        --max-tokens ${max_tokens}
-        --skip-invalid-size-inputs-valid-test
-        --update-freq ${update_freq}
-        --log-interval 100
-        --save-dir ${model_dir}
-        --tensorboard-logdir ${model_dir}"
-
-	if [[ -n ${extra_parameter} ]]; then
-        cmd="${cmd}
-        ${extra_parameter}"
-    fi
-	if [[ ${gpu_num} -gt 0 ]]; then
-		cmd="${cmd}
-        --distributed-world-size $gpu_num
-        --ddp-backend no_c10d"
-	fi
-    if [[ $fp16 -eq 1 ]]; then
-        cmd="${cmd}
-        --fp16"
-    fi
-    if [[ $step_valid -eq 1 ]]; then
-        validate_interval=1
-        save_interval=1
-        keep_last_epochs=10
-        no_epoch_checkpoints=0
-        save_interval_updates=500
-        keep_interval_updates=10
-    else
-        validate_interval=1
-        keep_last_epochs=10
-    fi
-    if [[ -n $no_epoch_checkpoints && $no_epoch_checkpoints -eq 1 ]]; then
-        cmd="$cmd
-        --no-epoch-checkpoints"
-    fi
-    if [[ -n $validate_interval ]]; then
-        cmd="${cmd}
-        --validate-interval $validate_interval "
-    fi
-    if [[ -n $save_interval ]]; then
-        cmd="${cmd}
-        --save-interval $save_interval "
-    fi
-    if [[ -n $keep_last_epochs ]]; then
-        cmd="${cmd}
-        --keep-last-epochs $keep_last_epochs "
-    fi
-    if [[ -n $save_interval_updates ]]; then
-        cmd="${cmd}
-        --save-interval-updates $save_interval_updates"
-        if [[ -n $keep_interval_updates ]]; then
-        cmd="${cmd}
-        --keep-interval-updates $keep_interval_updates"
-        fi
-    fi
-
-    echo -e "\033[34mRun command: \n${cmd} \033[0m"
-
-    # save info
-    log=./history.log
-    echo "${time} | ${device} | ${data_dir} | ${exp_name} | ${model_dir} " >> $log
-    tail -n 50 ${log} > tmp.log
-    mv tmp.log $log
-    export CUDA_VISIBLE_DEVICES=${device}
-
-    cmd="nohup ${cmd} >> ${model_dir}/train.log 2>&1 &"
-    if [[ $eval -eq 1 ]]; then
-		eval $cmd
-		sleep 2s
-		tail -n "$(wc -l ${model_dir}/train.log | awk '{print $1+1}')" -f ${model_dir}/train.log
-	fi
-fi
-wait
-
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-    echo "stage 2: ASR Decoding"
-    if [[ ${n_average} -ne 1 ]]; then
-        # Average models
-		dec_model=avg_${n_average}_checkpoint.pt
-
-		cmd="python ${root_dir}/scripts/average_checkpoints.py
-        --inputs ${model_dir}
-        --num-epoch-checkpoints ${n_average}
-        --output ${model_dir}/${dec_model}"
-    	echo -e "\033[34mRun command: \n${cmd} \033[0m"
-    	[[ $eval -eq 1 ]] && eval $cmd
-	else
-		dec_model=${dec_model}
-	fi
-
-    if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
-		if [[ ${gpu_num} -eq 0 ]]; then
-			device=""
-		else
-        	source ./local/utils.sh
-        	device=$(get_devices $gpu_num 0)
-		fi
-    fi
-    export CUDA_VISIBLE_DEVICES=${device}
-
-	result_file=${model_dir}/decode_result
-	[[ -f ${result_file} ]] && rm ${result_file}
-
-    test_subset=${test_subset//,/ }
-	for subset in ${test_subset[@]}; do
-        subset=${subset}_asr
-  		cmd="python ${root_dir}/fairseq_cli/generate.py
-        ${data_dir}
-        --config-yaml ${data_config}
-        --gen-subset ${subset}
-        --task speech_to_text
-        --path ${model_dir}/${dec_model}
-        --results-path ${model_dir}
-        --max-tokens ${max_tokens}
-        --beam ${beam_size}
-        --lenpen ${len_penalty}
-        --scoring wer
-        --wer-tokenizer 13a
-        --wer-lowercase
-        --wer-remove-punct
-        "
-    	echo -e "\033[34mRun command: \n${cmd} \033[0m"
-
-        if [[ $eval -eq 1 ]]; then
-    	    eval $cmd
-    	    tail -n 1 ${model_dir}/generate-${subset}.txt >> ${result_file}
-        fi
-	done
-    cat ${result_file}
-fi
--- a/egs/template/asr/train.sh
+++ b/egs/template/asr/train.sh
-#! /bin/bash
-
-# training the model
-
-gpu_num=8
-update_freq=1
-max_tokens=40000
-
-
-extra_tag=
-extra_parameter=
-#extra_tag="${extra_tag}"
-#extra_parameter="${extra_parameter} "
-
-exp_tag=
-
-#config_list=(base)
-#config_list=(ctc)
-#config_list=(base conformer)
-
-#config_list=(pds_base_16)
-config_list=(pds_base_16 conformer rpr)
-
-# exp full name
-exp_name=
-
-train_config=$(echo ${config_list[*]} | sed 's/ /,/g')
-
-cmd="./run.sh
-    --stage 1
-    --stop_stage 1
-    --gpu_num ${gpu_num}
-    --update_freq ${update_freq}
-    --train_config ${train_config}
-    --max_tokens ${max_tokens}
-    "
-
-if [[ -n ${exp_name} ]]; then
-    cmd="$cmd --exp_name ${exp_name}"
-fi
-if [[ -n ${exp_tag} ]]; then
-    cmd="$cmd --exp_tag ${exp_tag}"
-fi
-if [[ -n ${extra_tag} ]]; then
-    cmd="$cmd --extra_tag ${extra_tag}"
-fi
-if [[ -n ${extra_parameter} ]]; then
-    cmd="$cmd --extra_parameter \"${extra_parameter}\""
-fi
-
-echo ${cmd}
-eval ${cmd}
--- a/egs/template/mt/binary.sh
+++ b/egs/template/mt/binary.sh
-set -e
-
-eval=1
-
-root_dir=~/st/Fairseq-S2T
-data_dir=/home/xuchen/st/data/wmt/test
-vocab_dir=/home/xuchen/st/data/wmt/mt/en-de/unigram32000_share
-src_vocab_prefix=spm_unigram32000_share
-tgt_vocab_prefix=spm_unigram32000_share
-
-src_lang=en
-tgt_lang=de
-tokenize=1
-splits=(newstest2014 newstest2016)
-
-for split in ${splits[@]}; do
-    src_file=${data_dir}/${split}.${src_lang}
-    tgt_file=${data_dir}/${split}.${tgt_lang}
-
-    if [[ ${tokenize} -eq 1 ]]; then
-        cmd="tokenizer.perl -l ${src_lang} --threads 8 -no-escape < ${src_file} > ${src_file}.tok"
-        echo -e "\033[34mRun command: \n${cmd} \033[0m"
-        [[ $eval -eq 1 ]] && eval ${cmd}
-
-        cmd="tokenizer.perl -l ${tgt_lang} --threads 8 -no-escape < ${tgt_file} > ${tgt_file}.tok"
-        echo -e "\033[34mRun command: \n${cmd} \033[0m"
-        [[ $eval -eq 1 ]] && eval ${cmd}
-        src_file=${src_file}.tok
-        tgt_file=${tgt_file}.tok
-    fi
-
-    cmd="cat ${src_file}"
-    if [[ ${lcrm} -eq 1 ]]; then
-        cmd="python local/lower_rm.py ${src_file}"
-    fi
-    cmd="${cmd}
-    | spm_encode --model ${vocab_dir}/${src_vocab_prefix}.model
-    --output_format=piece
-    > ${src_file}.spm"
-
-    echo -e "\033[34mRun command: \n${cmd} \033[0m"
-    [[ $eval -eq 1 ]] && eval ${cmd}
-
-    cmd="spm_encode
-    --model ${vocab_dir}/${tgt_vocab_prefix}.model
-    --output_format=piece
-    < ${tgt_file}
-    > ${tgt_file}.spm"
-    echo -e "\033[34mRun command: \n${cmd} \033[0m"
-    [[ $eval -eq 1 ]] && eval ${cmd}
-
-    src_file=${src_file}.spm
-    tgt_file=${tgt_file}.spm
-
-    mkdir -p ${data_dir}/final
-    cmd="cp ${src_file} ${data_dir}/final/${split}.${src_lang}"
-    echo -e "\033[34mRun command: \n${cmd} \033[0m"
-    [[ $eval -eq 1 ]] && eval ${cmd}
-
-    cmd="cp ${tgt_file} ${data_dir}/final/${split}.${tgt_lang}"
-    echo -e "\033[34mRun command: \n${cmd} \033[0m"
-    [[ $eval -eq 1 ]] && eval ${cmd}
-done
-
-n_set=${#splits[*]}
-for ((i=0;i<$n_set;i++)); do
-    dataset[$i]=${data_dir}/final/${splits[$i]}
-done
-pref=`echo ${dataset[*]} | sed 's/ /,/g'`
-
-cmd="python ${root_dir}/fairseq_cli/preprocess.py
-    --source-lang ${src_lang}
-    --target-lang ${tgt_lang}
-    --testpref ${pref}
-    --destdir ${data_dir}/data-bin
-    --srcdict ${vocab_dir}/${src_vocab_prefix}.txt
-    --tgtdict ${vocab_dir}/${tgt_vocab_prefix}.txt
-    --workers 64"
-
-echo -e "\033[34mRun command: \n${cmd} \033[0m"
-[[ $eval -eq 1 ]] && eval ${cmd}
\ No newline at end of file
--- a/egs/template/mt/conf/base.yaml
+++ b/egs/template/mt/conf/base.yaml
-train-subset: train
-valid-subset: valid
-
-max-epoch: 50
-max-update: 100000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-skip-invalid-size-inputs-valid-test: True
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
-
-arch: transformer
-share-decoder-input-output-embed: True
-optimizer: adam
-clip-norm: 10.0
-lr-scheduler: inverse_sqrt
-warmup-init-lr: 1e-7
-warmup-updates: 8000
-lr: 1e-3
-adam_betas: (0.9,0.997)
-
-criterion: label_smoothed_cross_entropy
-label_smoothing: 0.1
-
-dropout: 0.1
-attention-dropout: 0.1
-activation-dropout: 0.1
-
-activation-fn: relu
-encoder-normalize-before: True
-decoder-normalize-before: True
-encoder-embed-dim: 512
-encoder-ffn-embed-dim: 2048
-encoder-layers: 6
-decoder-layers: 6
-encoder-attention-heads: 8
-
-decoder-embed-dim: 512
-decoder-ffn-embed-dim: 2048
-decoder-attention-heads: 8
--- a/egs/template/mt/conf/base_s.yaml
+++ b/egs/template/mt/conf/base_s.yaml
-train-subset: train
-valid-subset: valid
-
-max-epoch: 50
-max-update: 100000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-skip-invalid-size-inputs-valid-test: True
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
-
-arch: transformer
-share-decoder-input-output-embed: True
-optimizer: adam
-clip-norm: 10.0
-lr-scheduler: inverse_sqrt
-warmup-init-lr: 1e-7
-warmup-updates: 8000
-lr: 1e-3
-adam_betas: (0.9,0.997)
-
-criterion: label_smoothed_cross_entropy
-label_smoothing: 0.1
-
-dropout: 0.1
-attention-dropout: 0.1
-activation-dropout: 0.1
-
-activation-fn: relu
-encoder-normalize-before: True
-decoder-normalize-before: True
-encoder-embed-dim: 256
-encoder-ffn-embed-dim: 2048
-encoder-layers: 6
-decoder-layers: 6
-encoder-attention-heads: 4
-
-decoder-embed-dim: 256
-decoder-ffn-embed-dim: 2048
-decoder-attention-heads: 4
--- a/egs/template/mt/conf/dlcl.yaml
+++ b/egs/template/mt/conf/dlcl.yaml
-use-enc-dlcl: True
-use-dec-dlcl: True
--- a/egs/template/mt/conf/rpr.yaml
+++ b/egs/template/mt/conf/rpr.yaml
-#encoder-attention-type: rel_selfattn
-encoder-attention-type: relative
-decoder-attention-type: relative
-max-encoder-relative-length: 20
-max-decoder-relative-length: 20
\ No newline at end of file
--- a/egs/template/mt/decode.sh
+++ b/egs/template/mt/decode.sh
-#! /bin/bash
-
-gpu_num=1
-
-data_dir=
-test_subset=(test)
-
-exp_name=
-if [ "$#" -eq 1 ]; then
-    exp_name=$1
-fi
-
-n_average=10
-beam_size=5
-len_penalty=1.0
-max_tokens=80000
-dec_model=checkpoint_best.pt
-
-cmd="./run.sh
-    --stage 2
-    --stop_stage 2
-    --gpu_num ${gpu_num}
-    --exp_name ${exp_name}
-    --n_average ${n_average}
-    --beam_size ${beam_size}
-    --len_penalty ${len_penalty}
-    --max_tokens ${max_tokens}
-    --dec_model ${dec_model}
-    "
-
-if [[ -n ${data_dir} ]]; then
-    cmd="$cmd --data_dir ${data_dir}"
-fi
-if [[ -n ${test_subset} ]]; then
-    test_subset=`echo ${test_subset[*]} | sed 's/ /,/g'`
-    cmd="$cmd --test_subset ${test_subset}"
-fi
-
-echo $cmd
-eval $cmd
--- a/egs/template/mt/local/lower_rm.py
+++ b/egs/template/mt/local/lower_rm.py
-import sys
-import string
-
-
-in_file = sys.argv[1]
-
-with open(in_file, "r", encoding="utf-8") as f:
-    for line in f.readlines():
-        line = line.strip().lower()
-        for w in string.punctuation:
-            line = line.replace(w, "")
-        line = line.replace("  ", "")
-        print(line)
-
--- a/egs/template/mt/local/monitor.sh
+++ b/egs/template/mt/local/monitor.sh
-gpu_num=4
-cmd="sh train.sh"
-
-while :
-do
-    record=$(mktemp -t temp.record.XXXXXX)
-    gpustat > $record
-    all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
-
-    count=0
-    for dev in ${all_devices[@]}
-    do
-        line=$((dev + 2))
-        use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
-
-        if [[ $use -lt 100 ]]; then
-            device[$count]=$dev
-            count=$((count + 1))
-            if [[ $count -eq $gpu_num ]]; then
-                break
-            fi
-        fi
-    done
-    if [[ ${#device[@]} -lt $gpu_num ]]; then
-        sleep 60s
-    else
-        echo "Run $cmd"
-        eval $cmd
-        sleep 10s
-        exit
-    fi
-done
--- a/egs/template/mt/local/parse_options.sh
+++ b/egs/template/mt/local/parse_options.sh
-#!/usr/bin/env bash
-
-# Copyright 2012  Johns Hopkins University (Author: Daniel Povey);
-#                 Arnab Ghoshal, Karel Vesely
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#  http://www.apache.org/licenses/LICENSE-2.0
-#
-# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-# MERCHANTABLITY OR NON-INFRINGEMENT.
-# See the Apache 2 License for the specific language governing permissions and
-# limitations under the License.
-
-
-# Parse command-line options.
-# To be sourced by another script (as in ". parse_options.sh").
-# Option format is: --option-name arg
-# and shell variable "option_name" gets set to value "arg."
-# The exception is --help, which takes no arguments, but prints the
-# $help_message variable (if defined).
-
-
-###
-### The --config file options have lower priority to command line
-### options, so we need to import them first...
-###
-
-# Now import all the configs specified by command-line, in left-to-right order
-for ((argpos=1; argpos<$#; argpos++)); do
-  if [ "${!argpos}" == "--config" ]; then
-    argpos_plus1=$((argpos+1))
-    config=${!argpos_plus1}
-    [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
-    . $config  # source the config file.
-  fi
-done
-
-
-###
-### Now we process the command line options
-###
-while true; do
-  [ -z "${1:-}" ] && break;  # break if there are no arguments
-  case "$1" in
-    # If the enclosing script is called with --help option, print the help
-    # message and exit.  Scripts should put help messages in $help_message
-    --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
-      else printf "$help_message\n" 1>&2 ; fi;
-      exit 0 ;;
-    --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
-      exit 1 ;;
-    # If the first command-line argument begins with "--" (e.g. --foo-bar),
-    # then work out the variable name as $name, which will equal "foo_bar".
-    --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
-      # Next we test whether the variable in question is undefned-- if so it's
-      # an invalid option and we die.  Note: $0 evaluates to the name of the
-      # enclosing script.
-      # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
-      # is undefined.  We then have to wrap this test inside "eval" because
-      # foo_bar is itself inside a variable ($name).
-      eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
-
-      oldval="`eval echo \\$$name`";
-      # Work out whether we seem to be expecting a Boolean argument.
-      if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
-        was_bool=true;
-      else
-        was_bool=false;
-      fi
-
-      # Set the variable to the right value-- the escaped quotes make it work if
-      # the option had spaces, like --cmd "queue.pl -sync y"
-      eval $name=\"$2\";
-
-      # Check that Boolean-valued arguments are really Boolean.
-      if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
-        echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
-        exit 1;
-      fi
-      shift 2;
-      ;;
-  *) break;
-  esac
-done
-
-
-# Check for an empty argument to the --cmd option, which can easily occur as a
-# result of scripting errors.
-[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
-
-
-true; # so this script returns exit code 0.
--- a/egs/template/mt/local/utils.sh
+++ b/egs/template/mt/local/utils.sh
-
-get_devices(){
-    gpu_num=$1
-    use_cpu=$2
-    device=()
-    while :
-    do
-        record=$(mktemp -t temp.record.XXXXXX)
-        gpustat > $record
-        all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
-
-        count=0
-        for dev in ${all_devices[@]}
-        do
-            line=$((dev + 2))
-            use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
-            if [[ $use -lt 100 ]]; then
-                device[$count]=$dev
-                count=$((count + 1))
-                if [[ $count -eq $gpu_num ]]; then
-                    break
-                fi
-            fi
-        done
-        if [[ ${#device[@]} -lt $gpu_num ]]; then
-            if [[ $use_cpu -eq 1 ]]; then
-                device=(-1)
-            else
-                sleep 60s
-            fi
-        else
-            break
-        fi
-    done
-
-    echo ${device[*]} | sed 's/ /,/g'
-    return $?
-}
-
-
--- a/egs/template/mt/run.sh
+++ b/egs/template/mt/run.sh
-#! /bin/bash
-
-# Processing MuST-C Datasets
-
-# Copyright 2021 Natural Language Processing Laboratory 
-# Xu Chen (xuchenneu@163.com)
-
-# Set bash to 'debug' mode, it will exit on :
-# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
-set -e
-#set -u
-set -o pipefail
-export PYTHONIOENCODING=UTF-8
-
-eval=1
-time=$(date "+%m%d_%H%M")
-
-stage=0
-stop_stage=0
-
-######## hardware ########
-# devices
-device=()
-gpu_num=8
-update_freq=1
-
-root_dir=~/st/Fairseq-S2T
-pwd_dir=$PWD
-
-# dataset
-src_lang=en
-tgt_lang=de
-lang=${src_lang}-${tgt_lang}
-
-dataset=mt
-task=translation
-vocab_type=unigram
-vocab_size=10000
-share_dict=1
-lcrm=0
-tokenizer=0
-
-use_specific_dict=0
-specific_prefix=st
-specific_dir=/home/xuchen/st/data/mustc/st/en-de/
-src_vocab_prefix=spm_unigram10000_st_share
-tgt_vocab_prefix=spm_unigram10000_st_share
-
-org_data_dir=~/st/data/${dataset}
-data_dir=~/st/data/${dataset}/mt/${lang}
-train_subset=train
-valid_subset=dev
-trans_subset=tst-COMMON
-test_subset=test
-
-# exp
-exp_prefix=${time}
-extra_tag=
-extra_parameter=
-exp_tag=baseline
-exp_name=
-
-# config
-train_config=base_s
-
-# training setting
-fp16=1
-max_tokens=4096
-step_valid=0
-bleu_valid=0
-
-# decoding setting
-dec_model=checkpoint_best.pt
-n_average=10
-beam_size=5
-len_penalty=1.0
-
-if [[ ${use_specific_dict} -eq 1 ]]; then
-    exp_prefix=${specific_prefix}_${exp_prefix}
-    data_dir=${data_dir}/${specific_prefix}
-    mkdir -p ${data_dir}
-else
-    data_dir=${data_dir}/${vocab_type}${vocab_size}
-    src_vocab_prefix=spm_${vocab_type}${vocab_size}_${src_lang}
-    tgt_vocab_prefix=spm_${vocab_type}${vocab_size}_${tgt_lang}
-    if [[ $share_dict -eq 1 ]]; then
-        data_dir=${data_dir}_share
-        src_vocab_prefix=spm_${vocab_type}${vocab_size}_share
-        tgt_vocab_prefix=spm_${vocab_type}${vocab_size}_share
-    fi
-fi
-if [[ ${lcrm} -eq 1 ]]; then
-    data_dir=${data_dir}_lcrm
-    exp_prefix=${exp_prefix}_lcrm
-fi
-if [[ ${tokenizer} -eq 1 ]]; then
-    train_subset=${train_subset}.tok
-    valid_subset=${valid_subset}.tok
-    trans_subset=${trans_subset}.tok
-    data_dir=${data_dir}_tok
-    exp_prefix=${exp_prefix}_tok
-fi
-
-. ./local/parse_options.sh || exit 1;
-
-# full path
-if [[ -z ${exp_name} ]]; then
-    config_string=${train_config//,/_}
-    exp_name=${exp_prefix}_${config_string}_${exp_tag}
-    if [[ -n ${extra_tag} ]]; then
-        exp_name=${exp_name}_${extra_tag}
-    fi
-fi
-model_dir=$root_dir/../checkpoints/$dataset/mt/${exp_name}
-
-if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
-    echo "stage -1: Data Download"
-    # pass
-fi
-
-if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-    ### Task dependent. You have to make data the following preparation part by yourself.
-    echo "stage 0: MT Data Preparation"
-    if [[ ! -e ${data_dir} ]]; then
-        mkdir -p ${data_dir}
-    fi
-
-    if [[ ! -f ${data_dir}/${src_vocab_prefix}.txt || ! -f ${data_dir}/${tgt_vocab_prefix}.txt ]]; then
-        if [[ ${use_specific_dict} -eq 0 ]]; then
-            cmd="python ${root_dir}/examples/speech_to_text/prep_mt_data.py
-                --data-root ${org_data_dir}
-                --output-root ${data_dir}
-                --splits ${train_subset},${valid_subset},${trans_subset}
-                --src-lang ${src_lang}
-                --tgt-lang ${tgt_lang}
-                --vocab-type ${vocab_type}
-                --vocab-size ${vocab_size}"
-            if [[ $share_dict -eq 1 ]]; then
-                cmd="$cmd
-                --share"
-            fi
-            echo -e "\033[34mRun command: \n${cmd} \033[0m"
-            [[ $eval -eq 1 ]] && eval ${cmd}
-        else
-            cp -r ${specific_dir}/${src_vocab_prefix}.* ${data_dir}
-            cp ${specific_dir}/${tgt_vocab_prefix}.* ${data_dir}
-        fi
-    fi
-
-    mkdir -p ${data_dir}/data
-    for split in ${train_subset} ${valid_subset} ${trans_subset}; do
-    {
-        cmd="cat ${org_data_dir}/${lang}/data/${split}/txt/${split}.${src_lang}"
-        if [[ ${lcrm} -eq 1 ]]; then
-            cmd="python local/lower_rm.py ${org_data_dir}/${lang}/data/${split}.${src_lang}"
-        fi
-        cmd="${cmd}
-        | spm_encode --model ${data_dir}/${src_vocab_prefix}.model
-        --output_format=piece
-        > ${data_dir}/data/${split}.${src_lang}"
-
-        echo -e "\033[34mRun command: \n${cmd} \033[0m"
-        [[ $eval -eq 1 ]] && eval ${cmd}
-
-        cmd="spm_encode
-        --model ${data_dir}/${tgt_vocab_prefix}.model
-        --output_format=piece
-        < ${org_data_dir}/${lang}/data/${split}.${tgt_lang}
-        > ${data_dir}/data/${split}.${tgt_lang}"
-
-        echo -e "\033[34mRun command: \n${cmd} \033[0m"
-        [[ $eval -eq 1 ]] && eval ${cmd}
-    }&
-    done
-    wait
-
-    cmd="python ${root_dir}/fairseq_cli/preprocess.py
-        --source-lang ${src_lang} --target-lang ${tgt_lang}
-        --trainpref ${data_dir}/data/${train_subset}
-        --validpref ${data_dir}/data/${valid_subset}
-        --testpref ${data_dir}/data/${trans_subset}
-        --destdir ${data_dir}/data-bin
-        --srcdict ${data_dir}/${src_vocab_prefix}.txt
-        --tgtdict ${data_dir}/${tgt_vocab_prefix}.txt
-        --workers 64"
-
-    echo -e "\033[34mRun command: \n${cmd} \033[0m"
-    [[ $eval -eq 1 ]] && eval ${cmd}
-fi
-
-data_dir=${data_dir}/data-bin
-
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-    echo "stage 1: MT Network Training"
-    [[ ! -d ${data_dir} ]] && echo "The data dir ${data_dir} is not existing!" && exit 1;
-
-    if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
-		if [[ ${gpu_num} -eq 0 ]]; then
-			device=""
-		else
-        	source ./local/utils.sh
-        	device=$(get_devices $gpu_num 0)
-		fi
-    fi
-
-    echo -e "dev=${device} data=${data_dir} model=${model_dir}"
-
-    if [[ ! -d ${model_dir} ]]; then
-        mkdir -p ${model_dir}
-    else
-        echo "${model_dir} exists."
-    fi
-
-    cp ${BASH_SOURCE[0]} ${model_dir}
-    cp ${PWD}/train.sh ${model_dir}
-
-    config_list="${train_config//,/ }"
-    idx=0
-    for config in ${config_list[@]}
-    do
-        config_path=$pwd_dir/conf/${config}.yaml
-        if [[ ! -f ${config_path} ]]; then
-            echo "No config file ${config_path}"
-            exit
-        fi
-        cp ${config_path} ${model_dir}
-
-        if [[ idx -eq 0 ]]; then
-            extra_parameter="${extra_parameter}
-        --train-config ${config_path}"
-        else
-            extra_parameter="${extra_parameter}
-        --train-config${idx} ${config_path}"
-        fi
-        idx=$((idx + 1))
-    done
-
-    cmd="python3 -u ${root_dir}/fairseq_cli/train.py
-        ${data_dir}
-        --source-lang ${src_lang}
-        --target-lang ${tgt_lang}
-        --task ${task}
-        --max-tokens ${max_tokens}
-        --skip-invalid-size-inputs-valid-test
-        --update-freq ${update_freq}
-        --log-interval 100
-        --save-dir ${model_dir}
-        --tensorboard-logdir ${model_dir}"
-
-	if [[ -n ${extra_parameter} ]]; then
-        cmd="${cmd}
-        ${extra_parameter}"
-    fi
-	if [[ ${gpu_num} -gt 0 ]]; then
-		cmd="${cmd}
-        --distributed-world-size $gpu_num
-        --ddp-backend no_c10d"
-	fi
-    if [[ $fp16 -eq 1 ]]; then
-        cmd="${cmd}
-        --fp16"
-    fi
-    if [[ $step_valid -eq 1 ]]; then
-        validate_interval=1
-        save_interval=1
-        keep_last_epochs=10
-        no_epoch_checkpoints=0
-        save_interval_updates=500
-        keep_interval_updates=10
-    else
-        validate_interval=1
-        keep_last_epochs=10
-    fi
-    if [[ $bleu_valid -eq 1 ]]; then
-        cmd="$cmd
-        --eval-bleu
-        --eval-bleu-args '{\"beam\": 1}'
-        --eval-tokenized-bleu
-        --eval-bleu-remove-bpe
-        --best-checkpoint-metric bleu
-        --maximize-best-checkpoint-metric"
-    fi
-    if [[ -n $no_epoch_checkpoints && $no_epoch_checkpoints -eq 1 ]]; then
-        cmd="$cmd
-        --no-epoch-checkpoints"
-    fi
-    if [[ -n $validate_interval ]]; then
-        cmd="${cmd}
-        --validate-interval $validate_interval "
-    fi
-    if [[ -n $save_interval ]]; then
-        cmd="${cmd}
-        --save-interval $save_interval "
-    fi
-    if [[ -n $keep_last_epochs ]]; then
-        cmd="${cmd}
-        --keep-last-epochs $keep_last_epochs "
-    fi
-    if [[ -n $save_interval_updates ]]; then
-        cmd="${cmd}
-        --save-interval-updates $save_interval_updates"
-        if [[ -n $keep_interval_updates ]]; then
-        cmd="${cmd}
-        --keep-interval-updates $keep_interval_updates"
-        fi
-    fi
-
-    echo -e "\033[34mRun command: \n${cmd} \033[0m"
-
-    # save info
-    log=./history.log
-    echo "${time} | ${device} | ${data_dir} | ${exp_name} | ${model_dir} " >> $log
-    tail -n 50 ${log} > tmp.log
-    mv tmp.log $log
-    export CUDA_VISIBLE_DEVICES=${device}
-
-    cmd="nohup ${cmd} >> ${model_dir}/train.log 2>&1 &"
-    if [[ $eval -eq 1 ]]; then
-		eval $cmd
-		sleep 2s
-		tail -n "$(wc -l ${model_dir}/train.log | awk '{print $1+1}')" -f ${model_dir}/train.log
-	fi
-fi
-wait
-
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-    echo "stage 2: MT Decoding"
-    if [[ ${n_average} -ne 1 ]]; then
-        # Average models
-		dec_model=avg_${n_average}_checkpoint.pt
-
-		cmd="python ${root_dir}/scripts/average_checkpoints.py
-        --inputs ${model_dir}
-        --num-epoch-checkpoints ${n_average}
-        --output ${model_dir}/${dec_model}"
-    	echo -e "\033[34mRun command: \n${cmd} \033[0m"
-    	[[ $eval -eq 1 ]] && eval $cmd
-	else
-		dec_model=${dec_model}
-	fi
-
-    if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
-		if [[ ${gpu_num} -eq 0 ]]; then
-			device=""
-		else
-        	source ./local/utils.sh
-        	device=$(get_devices $gpu_num 0)
-		fi
-    fi
-    export CUDA_VISIBLE_DEVICES=${device}
-
-	result_file=${model_dir}/decode_result
-	[[ -f ${result_file} ]] && rm ${result_file}
-
-    test_subset=(${test_subset//,/ })
-	for subset in ${test_subset[@]}; do
-  		cmd="python ${root_dir}/fairseq_cli/generate.py
-        ${data_dir}
-        --source-lang ${src_lang}
-        --target-lang ${tgt_lang}
-        --gen-subset ${subset}
-        --task ${task}
-        --path ${model_dir}/${dec_model}
-        --results-path ${model_dir}
-        --max-tokens ${max_tokens}
-        --beam ${beam_size}
-        --lenpen ${len_penalty}
-        --post-process sentencepiece
-        --scoring sacrebleu"
-
-        if [[ ${tokenizer} -eq 1 ]]; then
-            cmd="${cmd}
-        --tokenizer moses
-        --moses-source-lang ${src_lang}
-        --moses-target-lang ${tgt_lang}"
-        fi
-
-    	echo -e "\033[34mRun command: \n${cmd} \033[0m"
-
-        if [[ $eval -eq 1 ]]; then
-    	    eval $cmd
-    	    tail -n 1 ${model_dir}/generate-${subset}.txt >> ${result_file}
-        fi
-	done
-    cat ${result_file}
-fi
--- a/egs/template/mt/train.sh
+++ b/egs/template/mt/train.sh
-#! /bin/bash
-
-# training the model
-
-gpu_num=1
-update_freq=1
-max_tokens=8192
-
-exp_tag=baseline
-config_list=(base)
-
-# exp full name
-exp_name=
-
-extra_tag=
-extra_parameter=
-#extra_tag="${extra_tag}"
-#extra_parameter="${extra_parameter} "
-
-train_config=$(echo ${config_list[*]} | sed 's/ /,/g')
-
-cmd="./run.sh
-    --stage 1
-    --stop_stage 1
-    --gpu_num ${gpu_num}
-    --update_freq ${update_freq}
-    --train_config ${train_config}
-    --max_tokens ${max_tokens}
-    "
-
-if [[ -n ${exp_name} ]]; then
-    cmd="$cmd --exp_name ${exp_name}"
-fi
-if [[ -n ${exp_tag} ]]; then
-    cmd="$cmd --exp_tag ${exp_tag}"
-fi
-if [[ -n ${extra_tag} ]]; then
-    cmd="$cmd --extra_tag ${extra_tag}"
-fi
-if [[ -n ${extra_parameter} ]]; then
-    cmd="$cmd --extra_parameter \"${extra_parameter}\""
-fi
-
-echo ${cmd}
-eval ${cmd}
--- a/egs/template/st/binary.sh
+++ b/egs/template/st/binary.sh
-set -e
-
-eval=1
-
-lcrm=1
-tokenizer=0
-
-root_dir=~/st/Fairseq-S2T
-data_dir=/home/xuchen/st/data/test
-vocab_dir=/home/xuchen/st/data/mustc/st_lcrm/en-de
-asr_vocab_prefix=spm_unigram10000_st_share
-st_vocab_prefix=spm_unigram10000_st_share
-
-src_lang=en
-tgt_lang=de
-splits=(2019)
-
-splits=$(echo ${splits[*]} | sed 's/ /_/g')
-
-cp -r ${vocab_dir}/${asr_vocab_prefix}.* ${data_dir}/${src_lang}-${tgt_lang}
-cp -r ${vocab_dir}/${st_vocab_prefix}.* ${data_dir}/${src_lang}-${tgt_lang}
-rm -rf ${data_dir}/${src_lang}-${tgt_lang}/fbank80.zip
-
-cmd="python ${root_dir}/examples/speech_to_text/prep_st_data.py
-    --data-root ${data_dir}
-    --output-root ${data_dir}
-    --splits ${splits}
-    --task st
-    --src-lang ${src_lang}
-    --tgt-lang ${tgt_lang}
-    --add-src
-    --share
-    --asr-prefix ${asr_vocab_prefix}
-    --st-spm-prefix ${st_vocab_prefix}
-    --cmvn-type utterance"
-
-    if [[ ${lcrm} -eq 1 ]]; then
-        cmd="$cmd
-    --lowercase-src
-    --rm-punc-src"
-    fi
-    if [[ ${tokenizer} -eq 1 ]]; then
-        cmd="$cmd
-    --tokenizer"
-    fi
-
-echo -e "\033[34mRun command: \n${cmd} \033[0m"
-[[ $eval -eq 1 ]] && eval ${cmd}
--- a/egs/template/st/conf/base.yaml
+++ b/egs/template/st/conf/base.yaml
-train-subset: train_st
-valid-subset: dev_st
-
-max-epoch: 100
-max-update: 100000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
-
-arch: s2t_transformer_s
-share-decoder-input-output-embed: True
-optimizer: adam
-clip-norm: 10.0
-lr-scheduler: inverse_sqrt
-warmup-init-lr: 1e-7
-warmup-updates: 10000
-lr: 2e-3
-#adam_betas: (0.9,0.98)
-
-criterion: label_smoothed_cross_entropy_with_ctc
-label_smoothing: 0.1
-
-conv-kernel-sizes: 5,5
-conv-channels: 1024
-dropout: 0.1
-activation-fn: relu
-encoder-embed-dim: 256
-encoder-ffn-embed-dim: 2048
-encoder-layers: 12
-decoder-layers: 6
-encoder-attention-heads: 4
-
-decoder-embed-dim: 256
-decoder-ffn-embed-dim: 2048
-decoder-attention-heads: 4
-attention-dropout: 0.1
-activation-dropout: 0.1
--- a/egs/template/st/conf/conformer.yaml
+++ b/egs/template/st/conf/conformer.yaml
-macaron-style: True
-use-cnn-module: True
-cnn-module-kernel: 31
--- a/egs/template/st/conf/ctc.yaml
+++ b/egs/template/st/conf/ctc.yaml
-ctc-weight: 0.3
\ No newline at end of file
--- a/egs/template/st/conf/dlcl.yaml
+++ b/egs/template/st/conf/dlcl.yaml
-use-enc-dlcl: True
-use-dec-dlcl: True
--- a/egs/template/st/conf/local_attn.yaml
+++ b/egs/template/st/conf/local_attn.yaml
-encoder-attention-type: local
-hard-mask-window: 0
-gauss-mask-sigma: 3
-init-mask-weight: 0
\ No newline at end of file
--- a/egs/template/st/conf/pds_base_16.yaml
+++ b/egs/template/st/conf/pds_base_16.yaml
-arch: pdss2t_transformer_s_16
-
-encoder-embed-dim: 256
-pyramid-stages: 4
-#pyramid-dropout: 0
-pyramid-layers: 2_2_6_2
-pyramid-ratios: 2_2_2_2
-pyramid-fusion: True
-pyramid-fusion-method: all_conv
-pyramid-embed-dims: 256_256_256_256
-pyramid-ds-method: conv
-pyramid-embed-norm: True
-pyramid-position-embed: 1_1_1_1
-pyramid-kernel-sizes: 5_5_5_5
-pyramid-ffn-ratios: 8_8_8_8
-pyramid-attn-heads: 4_4_4_4
-
-train-subset: train_st
-valid-subset: dev_st
-
-max-epoch: 100
-max-update: 100000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
-
-share-decoder-input-output-embed: True
-optimizer: adam
-clip-norm: 10.0
-lr-scheduler: inverse_sqrt
-warmup-init-lr: 1e-7
-warmup-updates: 10000
-lr: 2e-3
-#adam_betas: (0.9,0.98)
-
-criterion: label_smoothed_cross_entropy_with_ctc
-label_smoothing: 0.1
-
-dropout: 0.1
-activation-fn: relu
-encoder-ffn-embed-dim: 2048
-encoder-layers: 12
-decoder-layers: 6
-encoder-attention-heads: 4
-
-decoder-embed-dim: 256
-decoder-ffn-embed-dim: 2048
-decoder-attention-heads: 4
--- a/egs/template/st/conf/pds_base_32.yaml
+++ b/egs/template/st/conf/pds_base_32.yaml
-arch: pdss2t_transformer_s_32
-
-encoder-embed-dim: 256
-pyramid-stages: 5
-#pyramid-dropout: 0
-pyramid-layers: 2_2_3_3_2
-pyramid-ratios: 2_2_2_2_2
-pyramid-fusion: True
-pyramid-fusion-method: all_conv
-pyramid-embed-dims: 256_256_256_256_256
-pyramid-ds-method: conv
-pyramid-embed-norm: True
-pyramid-position-embed: 1_1_1_1_1
-pyramid-kernel-sizes: 5_5_5_5_5
-pyramid-ffn-ratios: 8_8_8_8_8
-pyramid-attn-heads: 4_4_4_4_4
-
-train-subset: train_st
-valid-subset: dev_st
-
-max-epoch: 100
-max-update: 100000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
-
-share-decoder-input-output-embed: True
-optimizer: adam
-clip-norm: 10.0
-lr-scheduler: inverse_sqrt
-warmup-init-lr: 1e-7
-warmup-updates: 10000
-lr: 2e-3
-#adam_betas: (0.9,0.98)
-
-criterion: label_smoothed_cross_entropy_with_ctc
-label_smoothing: 0.1
-
-dropout: 0.1
-activation-fn: relu
-encoder-ffn-embed-dim: 2048
-encoder-layers: 12
-decoder-layers: 6
-encoder-attention-heads: 4
-
-decoder-embed-dim: 256
-decoder-ffn-embed-dim: 2048
-decoder-attention-heads: 4
--- a/egs/template/st/conf/pds_base_8.yaml
+++ b/egs/template/st/conf/pds_base_8.yaml
-arch: pdss2t_transformer_s_8
-
-encoder-embed-dim: 256
-pyramid-stages: 4
-#pyramid-dropout: 0
-pyramid-layers: 3_3_3_3
-pyramid-ratios: 2_2_1_2
-pyramid-fusion: True
-pyramid-fusion-method: all_conv
-pyramid-embed-dims: 256_256_256_256
-pyramid-ds-method: conv
-pyramid-embed-norm: True
-pyramid-position-embed: 1_1_1_1
-pyramid-kernel-sizes: 5_5_5_5
-pyramid-ffn-ratios: 8_8_8_8
-pyramid-attn-heads: 4_4_4_4
-
-train-subset: train_st
-valid-subset: dev_st
-
-max-epoch: 100
-max-update: 100000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
-
-share-decoder-input-output-embed: True
-optimizer: adam
-clip-norm: 10.0
-lr-scheduler: inverse_sqrt
-warmup-init-lr: 1e-7
-warmup-updates: 10000
-lr: 2e-3
-#adam_betas: (0.9,0.98)
-
-criterion: label_smoothed_cross_entropy_with_ctc
-label_smoothing: 0.1
-
-dropout: 0.1
-activation-fn: relu
-encoder-ffn-embed-dim: 2048
-encoder-layers: 12
-decoder-layers: 6
-encoder-attention-heads: 4
-
-decoder-embed-dim: 256
-decoder-ffn-embed-dim: 2048
-decoder-attention-heads: 4
--- a/egs/template/st/conf/rpr.yaml
+++ b/egs/template/st/conf/rpr.yaml
-encoder-attention-type: rel_selfattn
-#encoder-attention-type: relative
-#decoder-attention-type: relative
-#max-encoder-relative-length: 100
-#max-decoder-relative-length: 20
--- a/egs/template/st/conf/sate_ctc.yaml
+++ b/egs/template/st/conf/sate_ctc.yaml
-train-subset: train_st
-valid-subset: dev_st
-
-max-epoch: 100
-max-update: 100000
-
-num-workers: 8
-patience: 10
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-
-#load-pretrained-encoder-from:
-#load-pretrained-acoustic-encoder-from:
-#load-pretrained-text-encoder-from:
-#load-pretrained-decoder-from:
-
-arch: s2t_sate
-share-decoder-input-output-embed: True
-optimizer: adam
-clip-norm: 10.0
-lr-scheduler: inverse_sqrt
-warmup-init-lr: 1e-7
-warmup-updates: 10000
-lr: 2e-3
-#adam_betas: (0.9,0.98)
-
-ctc-weight: 0.3
-criterion: label_smoothed_cross_entropy_with_ctc
-label_smoothing: 0.1
-
-encoder-normalize-before: True
-decoder-normalize-before: True
-conv-kernel-sizes: 5,5
-conv-channels: 1024
-dropout: 0.1
-activation-fn: relu
-encoder-embed-dim: 256
-encoder-ffn-embed-dim: 2048
-encoder-layers: 12
-text-encoder-layers: 6
-decoder-layers: 6
-encoder-attention-heads: 4
-
-#macaron-style: True
-#use-cnn-module: True
-#cnn-module-kernel: 31
-
-#acoustic-encoder: pds
-acoustic-encoder: transformer
-adapter: league
-
-encoder-embed-dim: 256
-pyramid-stages: 4
-#pyramid-dropout: 0
-pyramid-layers: 3_3_3_3
-pyramid-ratios: 2_2_1_2
-pyramid-fusion: True
-pyramid-fusion-method: all_conv
-pyramid-embed-dims: 256_256_256_256
-pyramid-ds-method: conv
-pyramid-embed-norm: True
-pyramid-position-embed: 1_1_1_1
-pyramid-kernel-sizes: 5_5_5_5
-pyramid-ffn-ratios: 8_8_8_8
-pyramid-attn-heads: 4_4_4_4
-
-decoder-embed-dim: 256
-decoder-ffn-embed-dim: 2048
-decoder-attention-heads: 4
--- a/egs/template/st/decode.sh
+++ b/egs/template/st/decode.sh
-#! /bin/bash
-
-gpu_num=1
-
-data_dir=
-test_subset=(tst-COMMON)
-
-exp_name=
-if [ "$#" -eq 1 ]; then
-    exp_name=$1
-fi
-
-n_average=10
-beam_size=5
-len_penalty=1.0
-max_tokens=80000
-dec_model=checkpoint_best.pt
-
-cmd="./run.sh
-    --stage 2
-    --stop_stage 2
-    --gpu_num ${gpu_num}
-    --exp_name ${exp_name}
-    --n_average ${n_average}
-    --beam_size ${beam_size}
-    --len_penalty ${len_penalty}
-    --max_tokens ${max_tokens}
-    --dec_model ${dec_model}
-    "
-
-if [[ -n ${data_dir} ]]; then
-    cmd="$cmd --data_dir ${data_dir}"
-fi
-if [[ ${#test_subset[@]} -eq 0 ]]; then
-    subsets=$(echo ${test_subset[*]} | sed 's/ /,/g')
-    cmd="$cmd --test_subset ${subsets}"
-fi
-
-echo $cmd
-eval $cmd
--- a/egs/template/st/ensemble.sh
+++ b/egs/template/st/ensemble.sh
-set -e
-
-gpu_num=1
-root_dir=/home/xuchen/st/Fairseq-S2T
-ckpt=/home/xuchen/st/checkpoints/mustc-v2/st
-
-model_txt=$1
-set=$2
-test_subset=$3
-
-#data_dir=/home/xuchen/st/data/mustc-v2/st_lcrm/en-de
-#test_subset=(tst-COMMON)
-
-data_dir=/media/data/tst/$set/en-de
-#test_subset=(office)
-#test_subset=(webrtc1)
-#test_subset=(adap2)
-
-data_config=config_st_share.yaml
-result_file=./result
-
-beam_size=5
-lenpen=0.6
-max_tokens=10000
-
-models=()
-i=0
-for line in `cat $model_txt`; do
-    i=`expr $i + 1`
-    
-    model_dir=$ckpt/$line
-    [[ ! -d $model_dir ]] && echo $model_dir && exit 1;
-
-    if [[ -f $model_dir/avg_10_checkpoint.pt ]]; then
-        model=$model_dir/avg_10_checkpoint.pt
-    else
-        model=$model_dir/checkpoint_best.pt
-    fi
-    [[ ! -f $model ]] && echo $model && exit 1;
-
-    models[$i]=$model
-done
-
-models=`echo ${models[*]} | sed 's/ /:/g'`
-
-res_dir=$ckpt/ensemble/$set
-i=0
-while : 
-do
-    if [[ -d $res_dir/$i ]]; then
-        i=`expr $i + 1`
-    else
-        res_dir=$res_dir/$i
-        break
-    fi 
-done
-
-mkdir -p $res_dir
-cp $model_txt $res_dir
-
-
-if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
-    if [[ ${gpu_num} -eq 0 ]]; then
-        device=()
-    else
-        source ./local/utils.sh
-        device=$(get_devices $gpu_num 0)
-    fi
-fi
-export CUDA_VISIBLE_DEVICES=${device}
-
-for subset in ${test_subset[@]}; do
-    subset=${subset}_st
-    cmd="python ${root_dir}/fairseq_cli/generate.py
-    ${data_dir}
-    --config-yaml ${data_config}
-    --gen-subset ${subset}
-    --task speech_to_text
-    --path ${models}
-    --results-path ${res_dir}
-    --skip-invalid-size-inputs-valid-test
-    --max-tokens ${max_tokens}
-    --beam ${beam_size}
-    --lenpen ${lenpen}
-    --scoring sacrebleu"
-    echo -e "\033[34mRun command: \n${cmd} \033[0m"
-
-    eval $cmd
-    tail -n 1 ${res_dir}/generate-${subset}.txt
-
-    cd $res_dir
-    evaluate.sh translation-${subset}.txt $set
-    cd -
-done
-
--- a/egs/template/st/local/monitor.sh
+++ b/egs/template/st/local/monitor.sh
-gpu_num=4
-cmd="sh train.sh"
-
-while :
-do
-    record=$(mktemp -t temp.record.XXXXXX)
-    gpustat > $record
-    all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
-
-    count=0
-    for dev in ${all_devices[@]}
-    do
-        line=$((dev + 2))
-        use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
-
-        if [[ $use -lt 100 ]]; then
-            device[$count]=$dev
-            count=$((count + 1))
-            if [[ $count -eq $gpu_num ]]; then
-                break
-            fi
-        fi
-    done
-    if [[ ${#device[@]} -lt $gpu_num ]]; then
-        sleep 60s
-    else
-        echo "Run $cmd"
-        eval $cmd
-        sleep 10s
-        exit
-    fi
-done
--- a/egs/template/st/local/parse_options.sh
+++ b/egs/template/st/local/parse_options.sh
-#!/usr/bin/env bash
-
-# Copyright 2012  Johns Hopkins University (Author: Daniel Povey);
-#                 Arnab Ghoshal, Karel Vesely
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#  http://www.apache.org/licenses/LICENSE-2.0
-#
-# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-# MERCHANTABLITY OR NON-INFRINGEMENT.
-# See the Apache 2 License for the specific language governing permissions and
-# limitations under the License.
-
-
-# Parse command-line options.
-# To be sourced by another script (as in ". parse_options.sh").
-# Option format is: --option-name arg
-# and shell variable "option_name" gets set to value "arg."
-# The exception is --help, which takes no arguments, but prints the
-# $help_message variable (if defined).
-
-
-###
-### The --config file options have lower priority to command line
-### options, so we need to import them first...
-###
-
-# Now import all the configs specified by command-line, in left-to-right order
-for ((argpos=1; argpos<$#; argpos++)); do
-  if [ "${!argpos}" == "--config" ]; then
-    argpos_plus1=$((argpos+1))
-    config=${!argpos_plus1}
-    [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
-    . $config  # source the config file.
-  fi
-done
-
-
-###
-### Now we process the command line options
-###
-while true; do
-  [ -z "${1:-}" ] && break;  # break if there are no arguments
-  case "$1" in
-    # If the enclosing script is called with --help option, print the help
-    # message and exit.  Scripts should put help messages in $help_message
-    --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
-      else printf "$help_message\n" 1>&2 ; fi;
-      exit 0 ;;
-    --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
-      exit 1 ;;
-    # If the first command-line argument begins with "--" (e.g. --foo-bar),
-    # then work out the variable name as $name, which will equal "foo_bar".
-    --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
-      # Next we test whether the variable in question is undefned-- if so it's
-      # an invalid option and we die.  Note: $0 evaluates to the name of the
-      # enclosing script.
-      # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
-      # is undefined.  We then have to wrap this test inside "eval" because
-      # foo_bar is itself inside a variable ($name).
-      eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
-
-      oldval="`eval echo \\$$name`";
-      # Work out whether we seem to be expecting a Boolean argument.
-      if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
-        was_bool=true;
-      else
-        was_bool=false;
-      fi
-
-      # Set the variable to the right value-- the escaped quotes make it work if
-      # the option had spaces, like --cmd "queue.pl -sync y"
-      eval $name=\"$2\";
-
-      # Check that Boolean-valued arguments are really Boolean.
-      if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
-        echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
-        exit 1;
-      fi
-      shift 2;
-      ;;
-  *) break;
-  esac
-done
-
-
-# Check for an empty argument to the --cmd option, which can easily occur as a
-# result of scripting errors.
-[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
-
-
-true; # so this script returns exit code 0.
--- a/egs/template/st/local/utils.sh
+++ b/egs/template/st/local/utils.sh
-
-get_devices(){
-    gpu_num=$1
-    use_cpu=$2
-    device=()
-    while :
-    do
-        record=$(mktemp -t temp.record.XXXXXX)
-        gpustat > $record
-        all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
-
-        count=0
-        for dev in ${all_devices[@]}
-        do
-            line=$((dev + 2))
-            use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
-            if [[ $use -lt 100 ]]; then
-                device[$count]=$dev
-                count=$((count + 1))
-                if [[ $count -eq $gpu_num ]]; then
-                    break
-                fi
-            fi
-        done
-        if [[ ${#device[@]} -lt $gpu_num ]]; then
-            if [[ $use_cpu -eq 1 ]]; then
-                device=(-1)
-            else
-                sleep 60s
-            fi
-        else
-            break
-        fi
-    done
-
-    echo ${device[*]} | sed 's/ /,/g'
-    return $?
-}
-
-
--- a/egs/template/st/run.sh
+++ b/egs/template/st/run.sh
-#! /bin/bash
-
-# Processing MuST-C Datasets
-
-# Copyright 2021 Natural Language Processing Laboratory 
-# Xu Chen (xuchenneu@163.com)
-
-# Set bash to 'debug' mode, it will exit on :
-# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
-set -e
-#set -u
-set -o pipefail
-export PYTHONIOENCODING=UTF-8
-
-eval=1
-time=$(date "+%m%d_%H%M")
-
-stage=0
-stop_stage=0
-
-######## hardware ########
-# devices
-#device=()
-gpu_num=8
-update_freq=1
-
-root_dir=~/st/Fairseq-S2T
-pwd_dir=$PWD
-
-# dataset
-src_lang=en
-tgt_lang=de
-lang=${src_lang}-${tgt_lang}
-
-dataset=st
-task=speech_to_text
-vocab_type=unigram
-asr_vocab_size=5000
-vocab_size=10000
-share_dict=1
-speed_perturb=0
-lcrm=0
-tokenizer=0
-
-use_specific_dict=0
-specific_prefix=valid
-specific_dir=/home/xuchen/st/data/mustc/st_lcrm/en-de
-asr_vocab_prefix=spm_unigram10000_st_share
-st_vocab_prefix=spm_unigram10000_st_share
-
-org_data_dir=~/st/data/${dataset}
-data_dir=~/st/data/${dataset}/st
-test_subset=tst-COMMON
-
-# exp
-exp_prefix=$(date "+%m%d")
-extra_tag=
-extra_parameter=
-exp_tag=baseline
-exp_name=
-
-# config
-train_config=ctc
-
-# training setting
-fp16=1
-max_tokens=40000
-step_valid=0
-bleu_valid=0
-
-# decoding setting
-dec_model=checkpoint_best.pt
-n_average=10
-beam_size=5
-len_penalty=1.0
-
-if [[ ${share_dict} -eq 1 ]]; then
-	data_config=config_st_share.yaml
-else
-	data_config=config_st.yaml
-fi
-if [[ ${speed_perturb} -eq 1 ]]; then
-    data_dir=${data_dir}_sp
-    exp_prefix=${exp_prefix}_sp
-fi
-if [[ ${lcrm} -eq 1 ]]; then
-    data_dir=${data_dir}_lcrm
-    exp_prefix=${exp_prefix}_lcrm
-fi
-if [[ ${use_specific_dict} -eq 1 ]]; then
-    data_dir=${data_dir}_${specific_prefix}
-    exp_prefix=${exp_prefix}_${specific_prefix}
-fi
-if [[ ${tokenizer} -eq 1 ]]; then
-    data_dir=${data_dir}_tok
-    exp_prefix=${exp_prefix}_tok
-fi
-
-. ./local/parse_options.sh || exit 1;
-
-if [[ -z ${exp_name} ]]; then
-    config_string=${train_config//,/_}
-    exp_name=${exp_prefix}_${config_string}_${exp_tag}
-    if [[ -n ${extra_tag} ]]; then
-        exp_name=${exp_name}_${extra_tag}
-    fi
-fi
-model_dir=$root_dir/../checkpoints/$dataset/st/${exp_name}
-
-if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
-    echo "stage -1: Data Download"
-    # pass
-fi
-
-if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-    ### Task dependent. You have to make data the following preparation part by yourself.
-    ### But you can utilize Kaldi recipes in most cases
-    echo "stage 0: ASR Data Preparation"
-    if [[ ! -e ${data_dir}/${lang} ]]; then
-        mkdir -p ${data_dir}/${lang}
-    fi
-
-    cmd="python ${root_dir}/examples/speech_to_text/prep_asr_data.py
-        --data-root ${org_data_dir}
-        --output-root ${data_dir}
-        --task asr
-        --vocab-type ${vocab_type}
-        --vocab-size ${asr_vocab_size}"
-    if [[ ${speed_perturb} -eq 1 ]]; then
-        cmd="$cmd
-        --speed-perturb"
-    fi
-    echo -e "\033[34mRun command: \n${cmd} \033[0m"
-    [[ $eval -eq 1 && ${share_dict} -ne 1 && ${use_specific_dict} -ne 1 ]] && eval $cmd
-    asr_prefix=spm_${vocab_type}${asr_vocab_size}_asr
-
-    echo "stage 0: ST Data Preparation"
-    cmd="python ${root_dir}/examples/speech_to_text/prep_st_data.py
-        --data-root ${org_data_dir}
-        --output-root ${data_dir}
-        --task st
-        --add-src
-        --cmvn-type utterance
-        --vocab-type ${vocab_type}
-        --vocab-size ${vocab_size}"
-
-    if [[ ${use_specific_dict} -eq 1 ]]; then
-        cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}/${lang}
-        cp -r ${specific_dir}/${st_vocab_prefix}.* ${data_dir}/${lang}
-        if [[ $share_dict -eq 1 ]]; then
-            cmd="$cmd
-        --share
-        --st-spm-prefix ${st_vocab_prefix}"
-        else
-            cmd="$cmd
-        --st-spm-prefix ${st_vocab_prefix}
-        --asr-prefix ${asr_vocab_prefix}"
-        fi
-    else
-        if [[ $share_dict -eq 1 ]]; then
-            cmd="$cmd
-        --share"
-        else
-            cmd="$cmd
-        --asr-prefix ${asr_prefix}"
-        fi
-    fi
-    if [[ ${speed_perturb} -eq 1 ]]; then
-        cmd="$cmd
-        --speed-perturb"
-    fi
-    if [[ ${lcrm} -eq 1 ]]; then
-        cmd="$cmd
-        --lowercase-src
-        --rm-punc-src"
-    fi
-    if [[ ${tokenizer} -eq 1 ]]; then
-        cmd="$cmd
-        --tokenizer"
-    fi
-
-    echo -e "\033[34mRun command: \n${cmd} \033[0m"
-    [[ $eval -eq 1 ]] && eval ${cmd}
-fi
-
-data_dir=${data_dir}/${lang}
-
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-    echo "stage 1: ST Network Training"
-    [[ ! -d ${data_dir} ]] && echo "The data dir ${data_dir} is not existing!" && exit 1;
-
-    if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
-		if [[ ${gpu_num} -eq 0 ]]; then
-			device=""
-		else
-        	source ./local/utils.sh
-        	device=$(get_devices $gpu_num 0)
-		fi
-    fi
-
-    echo -e "dev=${device} data=${data_dir} model=${model_dir}"
-
-    if [[ ! -d ${model_dir} ]]; then
-        mkdir -p ${model_dir}
-    else
-        echo "${model_dir} exists."
-    fi
-
-    cp ${BASH_SOURCE[0]} ${model_dir}
-    cp ${PWD}/train.sh ${model_dir}
-
-    config_list="${train_config//,/ }"
-    idx=0
-    for config in ${config_list[@]}
-    do
-        config_path=$pwd_dir/conf/${config}.yaml
-        if [[ ! -f ${config_path} ]]; then
-            echo "No config file ${config_path}"
-            exit
-        fi
-        cp ${config_path} ${model_dir}
-
-        if [[ idx -eq 0 ]]; then
-            extra_parameter="${extra_parameter}
-        --train-config ${config_path}"
-        else
-            extra_parameter="${extra_parameter}
-        --train-config${idx} ${config_path}"
-        fi
-        idx=$((idx + 1))
-    done
-
-    cmd="python3 -u ${root_dir}/fairseq_cli/train.py
-        ${data_dir}
-        --config-yaml ${data_config}
-        --task ${task}
-        --max-tokens ${max_tokens}
-        --skip-invalid-size-inputs-valid-test
-        --update-freq ${update_freq}
-        --log-interval 100
-        --save-dir ${model_dir}
-        --tensorboard-logdir ${model_dir}"
-
-	if [[ -n ${extra_parameter} ]]; then
-        cmd="${cmd}
-        ${extra_parameter}"
-    fi
-	if [[ ${gpu_num} -gt 0 ]]; then
-		cmd="${cmd}
-        --distributed-world-size $gpu_num
-        --ddp-backend no_c10d"
-	fi
-    if [[ $fp16 -eq 1 ]]; then
-        cmd="${cmd}
-        --fp16"
-    fi
-    if [[ $step_valid -eq 1 ]]; then
-        validate_interval=1
-        save_interval=1
-        keep_last_epochs=10
-        no_epoch_checkpoints=0
-        save_interval_updates=500
-        keep_interval_updates=10
-    else
-        validate_interval=1
-        keep_last_epochs=10
-    fi
-    if [[ $bleu_valid -eq 1 ]]; then
-        cmd="$cmd
-        --eval-bleu
-        --eval-bleu-args '{\"beam\": 1}'
-        --eval-tokenized-bleu
-        --eval-bleu-remove-bpe
-        --best-checkpoint-metric bleu
-        --maximize-best-checkpoint-metric"
-    fi
-    if [[ -n $no_epoch_checkpoints && $no_epoch_checkpoints -eq 1 ]]; then
-        cmd="$cmd
-        --no-epoch-checkpoints"
-    fi
-    if [[ -n $validate_interval ]]; then
-        cmd="${cmd}
-        --validate-interval $validate_interval "
-    fi
-    if [[ -n $save_interval ]]; then
-        cmd="${cmd}
-        --save-interval $save_interval "
-    fi
-    if [[ -n $keep_last_epochs ]]; then
-        cmd="${cmd}
-        --keep-last-epochs $keep_last_epochs "
-    fi
-    if [[ -n $save_interval_updates ]]; then
-        cmd="${cmd}
-        --save-interval-updates $save_interval_updates"
-        if [[ -n $keep_interval_updates ]]; then
-        cmd="${cmd}
-        --keep-interval-updates $keep_interval_updates"
-        fi
-    fi
-
-    echo -e "\033[34mRun command: \n${cmd} \033[0m"
-
-    # save info
-    log=./history.log
-    echo "${time} | ${device} | ${data_dir} | ${exp_name} | ${model_dir} " >> $log
-    tail -n 50 ${log} > tmp.log
-    mv tmp.log $log
-    export CUDA_VISIBLE_DEVICES=${device}
-
-    cmd="nohup ${cmd} >> ${model_dir}/train.log 2>&1 &"
-    if [[ $eval -eq 1 ]]; then
-		eval $cmd
-		sleep 2s
-		tail -n "$(wc -l ${model_dir}/train.log | awk '{print $1+1}')" -f ${model_dir}/train.log
-	fi
-fi
-wait
-
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-    echo "stage 2: ST Decoding"
-    if [[ ${n_average} -ne 1 ]]; then
-        # Average models
-		dec_model=avg_${n_average}_checkpoint.pt
-
-		cmd="python ${root_dir}/scripts/average_checkpoints.py
-        --inputs ${model_dir}
-        --num-epoch-checkpoints ${n_average}
-        --output ${model_dir}/${dec_model}"
-    	echo -e "\033[34mRun command: \n${cmd} \033[0m"
-    	[[ $eval -eq 1 ]] && eval $cmd
-	else
-		dec_model=${dec_model}
-	fi
-
-    if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
-		if [[ ${gpu_num} -eq 0 ]]; then
-			device=""
-		else
-        	source ./local/utils.sh
-        	device=$(get_devices $gpu_num 0)
-		fi
-    fi
-    export CUDA_VISIBLE_DEVICES=${device}
-
-	result_file=${model_dir}/decode_result
-	[[ -f ${result_file} ]] && rm ${result_file}
-
-    test_subset=${test_subset//,/ }
-	for subset in "${test_subset[@]}"; do
-        subset=${subset}_st
-  		cmd="python ${root_dir}/fairseq_cli/generate.py
-        ${data_dir}
-        --config-yaml ${data_config}
-        --gen-subset ${subset}
-        --task speech_to_text
-        --path ${model_dir}/${dec_model}
-        --results-path ${model_dir}
-        --max-tokens ${max_tokens}
-        --beam ${beam_size}
-        --lenpen ${len_penalty}
-        --scoring sacrebleu"
-    	echo -e "\033[34mRun command: \n${cmd} \033[0m"
-
-        if [[ $eval -eq 1 ]]; then
-    	    eval $cmd
-    	    tail -n 1 ${model_dir}/generate-${subset}.txt >> ${result_file}
-        fi
-	done
-    cat ${result_file}
-fi
--- a/egs/template/st/train.sh
+++ b/egs/template/st/train.sh
-#! /bin/bash
-
-# training the model
-
-gpu_num=8
-update_freq=1
-max_tokens=40000
-
-extra_tag=
-extra_parameter=
-#extra_tag="${extra_tag}"
-#extra_parameter="${extra_parameter} "
-
-exp_tag=
-
-#config_list=(base)
-config_list=(ctc)
-#config_list=(sate_ctc)
-#config_list=(ctc conformer rpr)
-#config_list=(base sate)
-
-#config_list=(pds_base)
-#config_list=(pds_base conformer)
-
-# exp full name
-exp_name=
-
-train_config=$(echo ${config_list[*]} | sed 's/ /,/g')
-
-cmd="./run.sh
-    --stage 1
-    --stop_stage 1
-    --gpu_num ${gpu_num}
-    --update_freq ${update_freq}
-    --train_config ${train_config}
-    --max_tokens ${max_tokens}
-    "
-
-if [[ -n ${exp_name} ]]; then
-    cmd="$cmd --exp_name ${exp_name}"
-fi
-if [[ -n ${exp_tag} ]]; then
-    cmd="$cmd --exp_tag ${exp_tag}"
-fi
-if [[ -n ${extra_tag} ]]; then
-    cmd="$cmd --extra_tag ${extra_tag}"
-fi
-if [[ -n ${extra_parameter} ]]; then
-    cmd="$cmd --extra_parameter \"${extra_parameter}\""
-fi
-
-echo ${cmd}
-eval ${cmd}
--- a/examples/speech_to_text/prep_audio_data.py
+++ b/examples/speech_to_text/prep_audio_data.py
@@ -13,6 +13,8 @@ from itertools import groupby
 from tempfile import NamedTemporaryFile
 import string
 import csv
+import yaml
+import copy

 import numpy as np
 import pandas as pd
@@ -50,8 +52,6 @@ class AudioDataset(Dataset):
                 tokenizer: bool = False) -> None:
        _root = Path(root) / "data" / split
        wav_root, txt_root = _root / "wav", _root / "txt"
-        if tokenizer:
-            txt_root = _root / "txt.tok"
        assert wav_root.is_dir() and txt_root.is_dir(), (_root, wav_root, txt_root)

        self.use_raw = use_raw
@@ -62,25 +62,30 @@ class AudioDataset(Dataset):
        yaml_file = txt_root / f"{split}.yaml"
        if yaml_file.is_file():
            self.mode = "yaml"
-            try:
-                import yaml
-            except ImportError:
-                print("Please install PyYAML to load the MuST-C YAML files")
-
            with open(yaml_file) as f:
                segments = yaml.load(f, Loader=yaml.BaseLoader)
+                total_length = len(segments)
+
+                if 0 < self.size < total_length:
+                    segments = segments[:self.size]
+                # for idx, seg in enumerate(content):
+                #     segments[idx] = seg
+                #     if 0 < self.size < idx:
+                #         break
        else:
            self.mode = "easy"
+
+            segments = dict()
            audio_file = txt_root / f"{split}.audio"
            assert audio_file.is_file(), audio_file
            with open(audio_file) as f:
                audios = [line.strip() for line in f.readlines()]
+            total_length = len(audios)

-            segments = dict()
+            if 0 < self.size < total_length:
+                audios = audios[:self.size]
            for idx, audio in enumerate(audios):
                segments[idx] = {"audio": audio}
-                if 0 < self.size < idx:
-                    break

        # Load source and target utterances
        self.have_src_utt = False
@@ -88,31 +93,44 @@ class AudioDataset(Dataset):
        for _lang in [src_lang, tgt_lang]:
            if _lang is None:
                continue
-            if Path.exists(txt_root / f"{split}.{_lang}"):
+            txt_path = txt_root / f"{split}.{_lang}"
+            if tokenizer:
+                txt_path = txt_root / f"{split}.{_lang}.tok"
+
+            if Path.exists(txt_path):
                if _lang == src_lang:
                    self.have_src_utt = True
                else:
                    self.have_tgt_utt = True
-                with open(txt_root / f"{split}.{_lang}") as f:
+                with open(txt_path) as f:
                    utterances = [r.strip() for r in f]
-                assert len(audios) == len(utterances)
+                assert total_length == len(utterances), (total_length, len(utterances))
+
+                if 0 < self.size < total_length:
+                    utterances = utterances[:self.size]
                for idx, u in enumerate(utterances):
                    segments[idx][_lang] = u
-                    if 0 < self.size < idx:
-                        break

        # Gather info
        self.data = dict()
        if self.mode == "easy":
+            real_idx = 0
            for idx, v in segments.items():
+                audio_name = v["audio"]
+                v["audio"] = (wav_root / v["audio"].strip()).as_posix() + ".wav"
                if self.speed_perturb is not None:
                    for perturb in self.speed_perturb:
-                        v["sp"] = perturb
-                        v["id"] = f"{v['audio']}_sp{perturb}"
+                        sp_item = copy.deepcopy(v)
+                        sp_item["perturb"] = perturb
+                        sp_item["id"] = f"{audio_name}_sp{perturb}"
+                        self.data[real_idx] = sp_item
+                        real_idx += 1
                else:
-                    v["id"] = v['audio']
-                v["audio"] = (wav_root / v["audio"].strip()).as_posix() + ".wav"
-                self.data[idx] = v
+                    v["id"] = audio_name
+                    self.data[real_idx] = v
+                    real_idx += 1
+                if 0 < self.size <= real_idx:
+                    break

        elif self.mode == "yaml":
            idx = 0
@@ -129,14 +147,14 @@ class AudioDataset(Dataset):
                    item["audio"] = wav_path.as_posix()
                    item["offset"] = offset
                    item["n_frames"] = n_frames
-                    item["sample_rate"] = sample_rate,
+                    item["sample_rate"] = sample_rate
                    item[src_lang] = segment[src_lang]
                    if tgt_lang is not None:
                        item[tgt_lang] = segment[tgt_lang]

                    if self.speed_perturb is not None:
                        for perturb in self.speed_perturb:
-                            sp_item = item
+                            sp_item = copy.deepcopy(item)
                            sp_item["id"] = f"{_id}_sp{perturb}"
                            sp_item["perturb"] = perturb
                            self.data[idx] = sp_item
@@ -145,7 +163,7 @@ class AudioDataset(Dataset):
                        item["id"] = _id
                        self.data[idx] = item
                        idx += 1
-                    if 0 < self.size < idx:
+                    if 0 < self.size <= idx:
                        break

    def __getitem__(self, n: int):
@@ -155,7 +173,7 @@ class AudioDataset(Dataset):
        item = self.data[n]
        audio = item["audio"]

-        if getattr(item, "n_frames", False) and getattr(item, "sample_rate", False):
+        if item.get("n_frames", False) and item.get("sample_rate", False):
            n_frames = item["n_frames"]
            sample_rate = item["sample_rate"]
        else:
@@ -164,16 +182,19 @@ class AudioDataset(Dataset):
            n_frames = info.num_frames

        waveform = None
+        if item.get("perturb", False):
+            n_frames = n_frames / item['perturb']
+
        if need_waveform:
-            if getattr(item, "offset", False):
+            offset = item.get('offset', False)
+            if offset:
                waveform, sample_rate = torchaudio.load(audio,
-                                                        frame_offset=item["sample_rate"],
+                                                        frame_offset=offset,
                                                        num_frames=item["n_frames"])
            else:
                waveform, sample_rate = torchaudio.load(audio)

-            if getattr(item, "perturb", False):
-                n_frames = n_frames / item['perturb']
+            if item.get("perturb", False):
                effects = [
                    ["speed", f"{item['perturb']}"],
                    ["rate", f"{sample_rate}"]
@@ -204,6 +225,7 @@ def process(args):
        output_root = Path(args.output_root).absolute()

    # Extract features
+    datasets = dict()
    use_raw = args.raw
    size = args.size
    if args.speed_perturb:
@@ -233,6 +255,8 @@ def process(args):
                                   src_lang, tgt_lang, split,
                                   args.speed_perturb, size, use_raw,
                                   args.tokenizer)
+            if split not in datasets:
+                datasets[split] = dataset

            if is_train_split and args.cmvn_type == "global":
                print("And estimating cepstral mean and variance stats...")
@@ -287,10 +311,13 @@ def process(args):
            is_train_split = split.startswith("train")
            manifest = {c: [] for c in MANIFEST_COLUMNS}

-            dataset = AudioDataset(root.as_posix(),
-                                   src_lang, tgt_lang, split,
-                                   args.speed_perturb, size, use_raw,
-                                   args.tokenizer)
+            if split in datasets:
+                dataset = datasets[split]
+            else:
+                dataset = AudioDataset(root.as_posix(),
+                                       src_lang, tgt_lang, split,
+                                       args.speed_perturb, size, use_raw,
+                                       args.tokenizer)
            if args.task == "st" and args.add_src and dataset.have_src_utt:
                manifest["src_text"] = []
            for idx in tqdm(range(len(dataset))):
@@ -303,7 +330,7 @@ def process(args):
                    audio_path = item["audio"]

                    # add offset and frames info
-                    if getattr(item, "offset", False):
+                    if item.get("offset", False):
                        audio_path = f"{audio_path}:{item['offset']}:{n_frames}"
                    manifest["audio"].append(audio_path)
                else:
@@ -371,7 +398,7 @@ def process(args):
        if len(train_text) == 0:
            print("Loading the training text to build dictionary...")

-            for split in args.SPLITS:
+            for split in splits:
                if split.startswith("train"):
                    csv_path = output_root / f"{split}.tsv"
                    with open(csv_path) as f:
@@ -384,18 +411,18 @@ def process(args):
                            quoting=csv.QUOTE_NONE,
                        )

-                    if task == "st" and args.add_src and args.share:
-                        for e in reader:
-                            src_utt = dict(e)["src_text"]
-                            if args.lowercase_src:
-                                src_utt = src_utt.lower()
-                            if args.rm_punc_src:
-                                for w in string.punctuation:
-                                    src_utt = src_utt.replace(w, "")
-                                src_utt = " ".join(src_utt.split(" "))
-                            train_text.append(src_utt)
-                    tgt_text = [dict(e)["tgt_text"] for e in reader]
-                    train_text.extend(tgt_text)
+                        if task == "st" and args.add_src and args.share:
+                            for e in reader:
+                                src_utt = dict(e)["src_text"]
+                                if args.lowercase_src:
+                                    src_utt = src_utt.lower()
+                                if args.rm_punc_src:
+                                    for w in string.punctuation:
+                                        src_utt = src_utt.replace(w, "")
+                                    src_utt = " ".join(src_utt.split(" "))
+                                train_text.append(src_utt)
+                        tgt_text = [dict(e)["tgt_text"] for e in reader]
+                        train_text.extend(tgt_text)

        with NamedTemporaryFile(mode="w") as f:
            for t in train_text:

--- a/examples/speech_to_text/prep_mt_data.py
+++ b/examples/speech_to_text/prep_mt_data.py
@@ -33,8 +33,8 @@ class MTDataset(Dataset):
    """

    def __init__(self, root: str, src_lang, tgt_lang: str, split: str) -> None:
-        _root = Path(root) / "data"
-        txt_root = _root
+        _root = Path(root) / "data" / split
+        txt_root = _root / "txt" if (_root / "txt").is_dir() else _root
        assert _root.is_dir() and txt_root.is_dir(), (_root, txt_root)
        # Load source and target text
        self.data = []

--- a/fairseq/checkpoint_utils.py
+++ b/fairseq/checkpoint_utils.py
@@ -23,7 +23,6 @@ from fairseq.file_io import PathManager
 from fairseq.models import FairseqDecoder, FairseqEncoder
 from omegaconf import Container, DictConfig, open_dict, OmegaConf

-
 logger = logging.getLogger(__name__)


@@ -62,23 +61,28 @@ def save_checkpoint(cfg: CheckpointConfig, trainer, epoch_itr, val_loss):
    suffix = trainer.checkpoint_suffix
    checkpoint_conds = collections.OrderedDict()
    checkpoint_conds["checkpoint{}{}.pt".format(epoch, suffix)] = (
-        end_of_epoch and not cfg.no_epoch_checkpoints and epoch % cfg.save_interval == 0
+            end_of_epoch and not cfg.no_epoch_checkpoints and epoch % cfg.save_interval == 0
    )
    checkpoint_conds["checkpoint_{}_{}{}.pt".format(epoch, updates, suffix)] = (
-        not end_of_epoch
-        and cfg.save_interval_updates > 0
-        and updates % cfg.save_interval_updates == 0
+            not end_of_epoch
+            and cfg.save_interval_updates > 0
+            and updates % cfg.save_interval_updates == 0
    )
    checkpoint_conds["checkpoint_best{}.pt".format(suffix)] = val_loss is not None and (
-        not hasattr(save_checkpoint, "best")
-        or is_better(val_loss, save_checkpoint.best)
+            not hasattr(save_checkpoint, "best")
+            or is_better(val_loss, save_checkpoint.best)
    )
    if val_loss is not None and cfg.keep_best_checkpoints > 0:
+        if not end_of_epoch and cfg.save_interval_updates > 0 and updates % cfg.save_interval_updates == 0:
+            epoch_or_step = updates
+        else:
+            epoch_or_step = epoch
        checkpoint_conds[
-            "checkpoint.best_{}_{:.2f}.pt".format(cfg.best_checkpoint_metric, val_loss)
-        ] = not hasattr(save_checkpoint, "best") or is_better(
-            val_loss, save_checkpoint.best
-        )
+            "checkpoint.best_{}_{}_{:.3f}.pt".format(cfg.best_checkpoint_metric, epoch_or_step, val_loss)
+        ] = True
+        # not hasattr(save_checkpoint, "best") or is_better(
+        # val_loss, save_checkpoint.best
+        # )
    checkpoint_conds[
        "checkpoint_last{}.pt".format(suffix)
    ] = not cfg.no_last_checkpoints
@@ -117,14 +121,14 @@ def save_checkpoint(cfg: CheckpointConfig, trainer, epoch_itr, val_loss):
        checkpoints = checkpoint_paths(
            cfg.save_dir, pattern=r"checkpoint_\d+_(\d+)\.pt"
        )
-        for old_chk in checkpoints[cfg.keep_interval_updates :]:
+        for old_chk in checkpoints[cfg.keep_interval_updates:]:
            if os.path.lexists(old_chk):
                os.remove(old_chk)

    if cfg.keep_last_epochs > 0:
        # remove old epoch checkpoints; checkpoints are sorted in descending order
        checkpoints = checkpoint_paths(cfg.save_dir, pattern=r"checkpoint(\d+)\.pt")
-        for old_chk in checkpoints[cfg.keep_last_epochs :]:
+        for old_chk in checkpoints[cfg.keep_last_epochs:]:
            if os.path.lexists(old_chk):
                os.remove(old_chk)

@@ -132,13 +136,13 @@ def save_checkpoint(cfg: CheckpointConfig, trainer, epoch_itr, val_loss):
        # only keep the best N checkpoints according to validation metric
        checkpoints = checkpoint_paths(
            cfg.save_dir,
-            pattern=r"checkpoint\.best_{}_(\d+\.?\d*)\.pt".format(
+            pattern=r"checkpoint\.best_{}_\d+_(\d+\.?\d*)\.pt".format(
                cfg.best_checkpoint_metric
            ),
        )
        if not cfg.maximize_best_checkpoint_metric:
            checkpoints = checkpoints[::-1]
-        for old_chk in checkpoints[cfg.keep_best_checkpoints :]:
+        for old_chk in checkpoints[cfg.keep_best_checkpoints:]:
            if os.path.lexists(old_chk):
                os.remove(old_chk)

@@ -158,7 +162,7 @@ def load_checkpoint(cfg: CheckpointConfig, trainer, **passthrough_args):
    reset_dataloader = cfg.reset_dataloader

    if cfg.finetune_from_model is not None and (
-        reset_optimizer or reset_lr_scheduler or reset_meters or reset_dataloader
+            reset_optimizer or reset_lr_scheduler or reset_meters or reset_dataloader
    ):
        raise ValueError(
            "--finetune-from-model can not be set together with either --reset-optimizer"
@@ -167,7 +171,7 @@ def load_checkpoint(cfg: CheckpointConfig, trainer, **passthrough_args):

    suffix = trainer.checkpoint_suffix
    if (
-        cfg.restore_file == "checkpoint_last.pt"
+            cfg.restore_file == "checkpoint_last.pt"
    ):  # default value of restore_file is 'checkpoint_last.pt'
        checkpoint_path = os.path.join(
            cfg.save_dir, "checkpoint_last{}.pt".format(suffix)
@@ -210,10 +214,10 @@ def load_checkpoint(cfg: CheckpointConfig, trainer, **passthrough_args):
    )

    if (
-        extra_state is not None
-        and "best" in extra_state
-        and not reset_optimizer
-        and not reset_meters
+            extra_state is not None
+            and "best" in extra_state
+            and not reset_optimizer
+            and not reset_meters
    ):
        save_checkpoint.best = extra_state["best"]

@@ -297,13 +301,13 @@ def load_checkpoint_to_cpu(path, arg_overrides=None, load_on_all_ranks=False):


 def load_model_ensemble(
-    filenames,
-    arg_overrides: Optional[Dict[str, Any]] = None,
-    task=None,
-    strict=True,
-    suffix="",
-    num_shards=1,
-    state=None,
+        filenames,
+        arg_overrides: Optional[Dict[str, Any]] = None,
+        task=None,
+        strict=True,
+        suffix="",
+        num_shards=1,
+        state=None,
 ):
    """Loads an ensemble of models.

@@ -314,7 +318,7 @@ def load_model_ensemble(
        task (fairseq.tasks.FairseqTask, optional): task to use for loading
    """
    assert not (
-        strict and num_shards > 1
+            strict and num_shards > 1
    ), "Cannot load state dict with strict=True and checkpoint shards > 1"
    ensemble, args, _task = load_model_ensemble_and_task(
        filenames,
@@ -329,20 +333,20 @@ def load_model_ensemble(


 def load_model_ensemble_and_task(
-    filenames,
-    arg_overrides: Optional[Dict[str, Any]] = None,
-    task=None,
-    strict=True,
-    suffix="",
-    num_shards=1,
-    state=None,
+        filenames,
+        arg_overrides: Optional[Dict[str, Any]] = None,
+        task=None,
+        strict=True,
+        suffix="",
+        num_shards=1,
+        state=None,
 ):
    assert state is None or len(filenames) == 1

    from fairseq import tasks

    assert not (
-        strict and num_shards > 1
+            strict and num_shards > 1
    ), "Cannot load state dict with strict=True and checkpoint shards > 1"
    ensemble = []
    cfg = None
@@ -483,7 +487,7 @@ def _upgrade_state_dict(state):
        state["optimizer_history"][-1]["num_updates"] = 0
    # old model checkpoints may not have separate source/target positions
    if "args" in state and hasattr(state["args"], "max_positions") and not hasattr(
-        state["args"], "max_source_positions"
+            state["args"], "max_source_positions"
    ):
        state["args"].max_source_positions = state["args"].max_positions
        state["args"].max_target_positions = state["args"].max_positions
@@ -518,14 +522,14 @@ def _upgrade_state_dict(state):
            del state["args"].min_lr
        # binary_cross_entropy => wav2vec criterion
        if (
-            hasattr(state["args"], "criterion")
-            and state["args"].criterion == "binary_cross_entropy"
+                hasattr(state["args"], "criterion")
+                and state["args"].criterion == "binary_cross_entropy"
        ):
            state["args"].criterion = "wav2vec"
        # speech_pretraining => audio pretraining
        if (
-            hasattr(state["args"], "task")
-            and state["args"].task == "speech_pretraining"
+                hasattr(state["args"], "task")
+                and state["args"].task == "speech_pretraining"
        ):
            state["args"].task = "audio_pretraining"
        # audio_cpc => wav2vec
@@ -536,9 +540,9 @@ def _upgrade_state_dict(state):
            state["args"].lr = [state["args"].lr]
        # convert task data arg to a string instead of List[string]
        if (
-            hasattr(state["args"], "data")
-            and isinstance(state["args"].data, list)
-            and len(state["args"].data) > 0
+                hasattr(state["args"], "data")
+                and isinstance(state["args"].data, list)
+                and len(state["args"].data) > 0
        ):
            state["args"].data = state["args"].data[0]

@@ -549,23 +553,23 @@ def _upgrade_state_dict(state):
        with open_dict(cfg):
            # any upgrades for Hydra-based configs
            if (
-                "task" in cfg
-                and "eval_wer_config" in cfg.task
-                and isinstance(cfg.task.eval_wer_config.print_alignment, bool)
+                    "task" in cfg
+                    and "eval_wer_config" in cfg.task
+                    and isinstance(cfg.task.eval_wer_config.print_alignment, bool)
            ):
                cfg.task.eval_wer_config.print_alignment = "hard"
            if "generation" in cfg and isinstance(cfg.generation.print_alignment, bool):
                cfg.generation.print_alignment = "hard"
            if (
-                "model" in cfg
-                and "w2v_args" in cfg.model
-                and cfg.model.w2v_args is not None
-                and (
+                    "model" in cfg
+                    and "w2v_args" in cfg.model
+                    and cfg.model.w2v_args is not None
+                    and (
                    hasattr(cfg.model.w2v_args, "task") or "task" in cfg.model.w2v_args
-                )
-                and isinstance(
-                    cfg.model.w2v_args.task.eval_wer_config.print_alignment, bool
-                )
+            )
+                    and isinstance(
+                cfg.model.w2v_args.task.eval_wer_config.print_alignment, bool
+            )
            ):
                cfg.model.w2v_args.task.eval_wer_config.print_alignment = "hard"

@@ -644,9 +648,9 @@ def prune_state_dict(state_dict, model_cfg: Optional[DictConfig]):
                    layer_name
                )
                new_state_key = (
-                    layer_name[: substitution_match.start(1)]
-                    + new_layer_number
-                    + layer_name[substitution_match.end(1) :]
+                        layer_name[: substitution_match.start(1)]
+                        + new_layer_number
+                        + layer_name[substitution_match.end(1):]
                )
                new_state_dict[new_state_key] = state_dict[layer_name]

@@ -666,7 +670,7 @@ def prune_state_dict(state_dict, model_cfg: Optional[DictConfig]):


 def load_pretrained_component_from_model(
-    component: Union[FairseqEncoder, FairseqDecoder], checkpoint: str, strict: bool = True,
+        component: Union[FairseqEncoder, FairseqDecoder], checkpoint: str, strict: bool = True,
 ):
    """
    Load a pretrained FairseqEncoder or FairseqDecoder from checkpoint into the
@@ -699,7 +703,7 @@ def load_pretrained_component_from_model(
    for key in state["model"].keys():
        if key.startswith(component_type):
            # encoder.input_layers.0.0.weight --> input_layers.0.0.weight
-            component_subkey = key[len(component_type) + 1 :]
+            component_subkey = key[len(component_type) + 1:]
            component_state_dict[component_subkey] = state["model"][key]

    mismatch_keys = []

--- a/fairseq/criterions/ctc.py
+++ b/fairseq/criterions/ctc.py
@@ -251,7 +251,7 @@ class CtcCriterion(FairseqCriterion):

        if c_total > 0:
            metrics.log_derived(
-                "uer",
+                "cer",
                lambda meters: safe_round(
                    meters["_c_errors"].sum * 100.0 / meters["_c_total"].sum, 3
                )

--- a/fairseq/criterions/label_smoothed_cross_entropy_with_ctc.py
+++ b/fairseq/criterions/label_smoothed_cross_entropy_with_ctc.py
@@ -92,14 +92,14 @@ class LabelSmoothedCrossEntropyCriterionWithCTC(
            logging_output["total"] = utils.item(total.data)

        if self.ctc_weight > 0:
-            ctc_loss = self.compute_ctc_loss(model, sample, encoder_out)
+            ctc_loss, logging_output = self.compute_ctc_loss(model, sample, encoder_out, logging_output)
            logging_output["ctc_loss"] = utils.item(ctc_loss.data)
            loss = (1 - self.ctc_weight) * loss + self.ctc_weight * ctc_loss
        logging_output["loss"] = utils.item(loss.data) if reduce else loss.data

        return loss, sample_size, logging_output

-    def compute_ctc_loss(self, model, sample, encoder_out):
+    def compute_ctc_loss(self, model, sample, encoder_out, logging_output):
        transcript = sample["transcript"]
        ctc_logit = encoder_out["ctc_logit"][0]
        lprobs = model.get_normalized_probs(
@@ -124,9 +124,7 @@ class LabelSmoothedCrossEntropyCriterionWithCTC(
                transcript_lengths,
            )

-        logging_output = {
-            "ctc_loss": utils.item(loss.data),  # * sample['ntokens'],
-        }
+        logging_output["ctc_loss"] = utils.item(loss.data)

        if not model.training:
            import editdistance
@@ -182,7 +180,7 @@ class LabelSmoothedCrossEntropyCriterionWithCTC(
                logging_output["c_errors"] = c_err
                logging_output["c_total"] = c_len

-        return loss
+        return loss, logging_output

    @staticmethod
    def reduce_metrics(logging_outputs) -> None:
@@ -205,18 +203,20 @@ class LabelSmoothedCrossEntropyCriterionWithCTC(
        metrics.log_scalar(
            "loss", loss_sum / sample_size / math.log(2), sample_size, round=3
        )
-        metrics.log_scalar(
-            "trans_loss", trans_loss_sum / ntokens / math.log(2), ntokens, round=3
-        )
+        if trans_loss_sum != loss_sum:
+            metrics.log_scalar(
+                "trans_loss", trans_loss_sum / ntokens / math.log(2), ntokens, round=3
+            )
        metrics.log_scalar(
            "nll_loss", nll_loss_sum / ntokens / math.log(2), ntokens, round=3
        )
-        metrics.log_scalar(
-            "ctc_loss",
-            ctc_loss_sum / sample_size / math.log(2),
-            sample_size,
-            round=3,
-        )
+        if ctc_loss_sum > 0:
+            metrics.log_scalar(
+                "ctc_loss",
+                ctc_loss_sum / sample_size / math.log(2),
+                sample_size,
+                round=3,
+            )
        metrics.log_derived(
            "ppl", lambda meters: utils.get_perplexity(meters["nll_loss"].avg)
        )
@@ -250,7 +250,7 @@ class LabelSmoothedCrossEntropyCriterionWithCTC(

        if c_total > 0:
            metrics.log_derived(
-                "uer",
+                "cer",
                lambda meters: safe_round(
                    meters["_c_errors"].sum * 100.0 / meters["_c_total"].sum, 3
                )

--- a/fairseq/data/audio/audio_utils.py
+++ b/fairseq/data/audio/audio_utils.py
@@ -6,7 +6,10 @@ import torch
 import torchaudio

 def get_waveform(
-    path_or_fp: Union[str, BinaryIO], normalization=True
+        path_or_fp: Union[str, BinaryIO],
+        normalization=True,
+        offset=None,
+        size=None
 ) -> Tuple[np.ndarray, int]:
    """Get the waveform and sample rate of a 16-bit mono-channel WAV or FLAC.

@@ -19,7 +22,10 @@ def get_waveform(
        if ext not in {".flac", ".wav"}:
            raise ValueError(f"Unsupported audio format: {ext}")

-    waveform, sample_rate = torchaudio.load(path_or_fp)
+    if offset is not None and size is not None:
+        waveform, sample_rate = torchaudio.load(path_or_fp, frame_offset=offset, num_frames=size)
+    else:
+        waveform, sample_rate = torchaudio.load(path_or_fp)
    waveform = waveform.squeeze().numpy()

    if not normalization:
@@ -73,12 +79,17 @@ def _get_torchaudio_fbank(waveform, sample_rate, n_bins=80) -> Optional[np.ndarr
        return None


-def get_fbank(path_or_fp: Union[str, BinaryIO], n_bins=80) -> np.ndarray:
+def get_fbank(
+        path_or_fp: Union[str, BinaryIO],
+        n_bins=80,
+        offset=None,
+        size=None,
+) -> np.ndarray:
    """Get mel-filter bank features via PyKaldi or TorchAudio. Prefer PyKaldi
    (faster CPP implementation) to TorchAudio (Python implementation). Note that
    Kaldi/TorchAudio requires 16-bit signed integers as inputs and hence the
    waveform should not be normalized."""
-    sound, sample_rate = get_waveform(path_or_fp, normalization=False)
+    sound, sample_rate = get_waveform(path_or_fp, normalization=False, offset=offset, size=size)

    features = _get_kaldi_fbank(sound, sample_rate, n_bins)
    if features is None:

--- a/fairseq/data/audio/speech_to_text_dataset.py
+++ b/fairseq/data/audio/speech_to_text_dataset.py
@@ -182,6 +182,14 @@ def get_features_or_waveform_from_uncompressed_zip(
    return features_or_waveform


+def get_features_or_waveform_from_audio(path, offset, size, need_waveform=False):
+    """Load `size` frames starting at frame `offset`: waveform if asked, else fbank."""
+    if not path.endswith((".wav", ".flac")):  # explicit raise: asserts vanish under -O
+        raise ValueError(f"Unsupported audio format: {path}")
+    if need_waveform:
+        return get_waveform(path, offset=offset, size=size)[0]
+    return get_fbank(path, offset=offset, size=size)
+
 def get_features_or_waveform(path: str, need_waveform=False):
    """Get speech features from .npy file or waveform from .wav/.flac file.
    The file may be inside an uncompressed ZIP file and is accessed via byte
@@ -205,9 +213,14 @@ def get_features_or_waveform(path: str, need_waveform=False):
        return get_features_from_npy_or_audio(_path)
    elif len(extra) == 2:
        extra = [int(i) for i in extra]
-        features_or_waveform = get_features_or_waveform_from_uncompressed_zip(
-            _path, extra[0], extra[1], need_waveform=need_waveform
-        )
+        if _path.endswith('.zip'):
+            features_or_waveform = get_features_or_waveform_from_uncompressed_zip(
+                _path, extra[0], extra[1], need_waveform=need_waveform
+            )
+        else:
+            features_or_waveform = get_features_or_waveform_from_audio(
+                _path, extra[0], extra[1], need_waveform=need_waveform
+            )
    else:
        raise ValueError(f"Invalid path: {path}")


--- a/fairseq/models/dlcl_transformer.py
+++ b/fairseq/models/dlcl_transformer.py
@@ -54,12 +54,12 @@ class DLCLTransformerModel(TransformerModel):
        TransformerModel.add_args(parser)
        
        # dense layer parameters
-        parser.add_argument('--encoder-history-type',
-                            default="learnable_dense",
-                            help='encoder layer history type')
-        parser.add_argument('--decoder-history-type',
-                            default="learnable_dense",
-                            help='decoder layer history type')
+        # parser.add_argument('--encoder-history-type',
+        #                     default="learnable_dense",
+        #                     help='encoder layer history type')
+        # parser.add_argument('--decoder-history-type',
+        #                     default="learnable_dense",
+        #                     help='decoder layer history type')
        parser.add_argument('--encoder-integration-type', choices=['avg', 'sum'],
                            help='encoder layer integration type')
        parser.add_argument('--decoder-integration-type', choices=['avg', 'sum'],

--- a/fairseq/models/speech_to_text/pdss2t_transformer.py
+++ b/fairseq/models/speech_to_text/pdss2t_transformer.py
@@ -463,7 +463,11 @@ class PDSS2TTransformerModel(S2TTransformerModel):
            type=float,
            help="dropout in each stage",
        )
-
+        parser.add_argument(
+            "--ctc-layer",
+            type=int,
+            help="the layer of ctc",
+        )
        pass

    @classmethod
@@ -516,8 +520,9 @@ class PDSS2TTransformerEncoder(FairseqEncoder):
        else:
            self.pds_attn_ds_ratios = None

-        self.fusion = getattr(args, "pds_fusion", False)
-        self.pds_fusion_method = getattr(args, "pds_fusion_method", "all_conv")
+        self.pds_fusion = args.pds_fusion
+        self.pds_fusion_method = args.pds_fusion_method
+
        self.pds_fusion_transform = "conv"
        if len(self.pds_fusion_method.split("_")) == 2:
            items = self.pds_fusion_method.split("_")
@@ -525,10 +530,10 @@ class PDSS2TTransformerEncoder(FairseqEncoder):
            self.pds_fusion_transform = items[1]

        fusion_stages_num = 0
-        if self.fusion:
-            if self.pds_fusion_way == "all":
+        if self.pds_fusion:
+            if self.pds_fusion_method == "all":
                fusion_stages_num = self.pds_stages
-            elif self.pds_fusion_way == "same":
+            elif self.pds_fusion_method == "same":
                for dim in self.pds_embed_dims:
                    if dim == self.embed_dim:
                        fusion_stages_num += 1
@@ -555,7 +560,7 @@ class PDSS2TTransformerEncoder(FairseqEncoder):
                        "fusion {}, fusion method {}, fusion transformer {}.".
                        format(i, num_layers, ds_ratio, embed_dim,
                               kernel_size, use_pos_embed, ffn_ratio, num_head,
-                               self.fusion, self.pds_fusion_method, self.pds_fusion_transform))
+                               self.pds_fusion, self.pds_fusion_method, self.pds_fusion_transform))

            if i == 0:
                self.embed_scale = math.sqrt(embed_dim)
@@ -588,7 +593,7 @@ class PDSS2TTransformerEncoder(FairseqEncoder):
                    self.pds_fusion_method == "same" and self.embed_dim == embed_dim
                ):
                    if i != self.pds_stages - 1:
-                        ratio = reduce(lambda a, b: a * b, self.pds_sr_ratios[i + 1:])
+                        ratio = reduce(lambda a, b: a * b, self.pds_ratios[i + 1:])
                    else:
                        ratio = 1

@@ -636,8 +641,10 @@ class PDSS2TTransformerEncoder(FairseqEncoder):
                       (("ctc" in getattr(args, "criterion", False)) and
                        (getattr(args, "ctc_weight", False) > 0))
        if self.use_ctc:
-            self.ctc_layer = (args.encoder_layers + args.ctc_layer) % args.encoder_layers
-            self.inter_ctc = True if self.ctc_layer != args.encoder_layers else False
+            # self.ctc_layer = (args.encoder_layers + args.ctc_layer) % args.encoder_layers
+            # self.inter_ctc = True if self.ctc_layer != args.encoder_layers else False
+            self.ctc_layer = args.encoder_layers
+            self.inter_ctc = True if self.ctc_layer != 0 else False
            if self.inter_ctc:
                logger.info("Intermedia CTC loss in layer %d" % self.ctc_layer)

@@ -655,7 +662,7 @@ class PDSS2TTransformerEncoder(FairseqEncoder):
                           dropout=args.dropout,
                           need_layernorm=True if self.inter_ctc else False)

-            if task.source_dictionary == task.target_dictionary:
+            if task.source_dictionary == task.target_dictionary and embed_tokens is not None:
                self.ctc.ctc_projection.weight = embed_tokens.weight

        if args.encoder_normalize_before:
@@ -747,7 +754,7 @@ class PDSS2TTransformerEncoder(FairseqEncoder):
                    self.add_to_dict(x, dis, cos_sim_idx)

                if self.use_ctc and self.inter_ctc and self.ctc_layer == layer_idx:
-                    ctc_logit = self.CTC(x)
+                    ctc_logit = self.ctc(x.clone())

            prev_state.append(x)
            prev_padding.append(encoder_padding_mask)
@@ -787,7 +794,7 @@ class PDSS2TTransformerEncoder(FairseqEncoder):

        return {
            "encoder_out": [x],  # T x B x C
-            "ctc_logit": [ctc_logit],  # T x B x C
+            "ctc_logit": [] if ctc_logit is None else [ctc_logit],  # T x B x C
            "encoder_padding_mask": [encoder_padding_mask],  # B x T
            "encoder_embedding": [],  # B x T x C
            "encoder_states": [],  # List[T x B x C]
@@ -800,10 +807,9 @@ class PDSS2TTransformerEncoder(FairseqEncoder):
            [] if len(encoder_out["encoder_out"]) == 0
            else [x.index_select(1, new_order) for x in encoder_out["encoder_out"]]
        )
-
        new_ctc_logit = (
            [] if len(encoder_out["ctc_logit"]) == 0
-            else [x.index_select(1, new_order) for x in encoder_out["ctc_logit"]]
+            else [x.index_select(1, new_order) for x in encoder_out["ctc_logit"] if x is not None]
        )

        new_encoder_padding_mask = (
@@ -823,7 +829,7 @@ class PDSS2TTransformerEncoder(FairseqEncoder):

        return {
            "encoder_out": new_encoder_out,  # T x B x C
-            "ctc_logit": [new_ctc_logit],  # T x B x C
+            "ctc_logit": new_ctc_logit,  # T x B x C
            "encoder_padding_mask": new_encoder_padding_mask,  # B x T
            "encoder_embedding": new_encoder_embedding,  # B x T x C
            "encoder_states": encoder_states,  # List[T x B x C]
@@ -901,8 +907,8 @@ def base_architecture(args):
    args.ctc_layer = getattr(args, "ctc_layer", 0)
    args.pds_dropout = getattr(args, "pds_dropout", args.dropout)

-    args.fusion = getattr(args, "fusion", False)
-    args.fusion_method = getattr(args, "fusion_method", "all_conv")
+    args.pds_fusion = getattr(args, "pds_fusion", False)
+    args.pds_fusion_method = getattr(args, "pds_fusion_method", "all_conv")


 def set_pds_base_8(args):

--- a/fairseq/models/speech_to_text/s2t_sate.py
+++ b/fairseq/models/speech_to_text/s2t_sate.py
@@ -130,8 +130,8 @@ class S2TSATEModel(S2TTransformerModel):
                component=encoder.text_encoder, checkpoint=args.load_pretrained_text_encoder_from, strict=False
            )

-        if args.share_ctc_and_adapter and hasattr(encoder.adapter, "linear_adapter"):
-            encoder.acoustic_encoder.ctc_projection.weight = encoder.adapter.linear_adapter[0].weight
+        if args.share_ctc_and_adapter and hasattr(encoder.adapter, "embed_adapter"):
+            encoder.acoustic_encoder.ctc.ctc_projection.weight = encoder.adapter.embed_adapter.weight

        return encoder

@@ -175,10 +175,7 @@ class Adapter(nn.Module):
            self.gate_linear1 = nn.Linear(embed_dim, embed_dim)
            self.gate_linear2 = nn.Linear(embed_dim, embed_dim)
    
-        self.out_layernorm = LayerNorm(embed_dim)
-        # self.out_layernorm = nn.Identity()
-
-    def forward(self, x, padding): 
+    def forward(self, x, padding):

        representation, distribution = x
        batch, seq_len, embed_dim = representation.size()
@@ -188,30 +185,25 @@ class Adapter(nn.Module):

        if self.adapter_type == "linear":
            out = self.linear_adapter(representation)
-            out = self.out_layernorm(out)

        elif self.adapter_type == "context":
            out = torch.mm(distribution, self.embed_adapter.weight).view(batch, seq_len, -1)
-            out = self.out_layernorm(out)

        elif self.adapter_type == "subsample":
            representation = representation.transpose(0, 1)
            out, input_lengths = self.subsample_adaptor(representation, lengths)
            padding = lengths_to_padding_mask(input_lengths)
-            out = self.out_layernorm(out)

        elif self.adapter_type == "league":
            linear_out = self.linear_adapter(representation)
            soft_out = torch.mm(distribution, self.embed_adapter.weight).view(batch, seq_len, -1)
            out = linear_out + soft_out
-            out = self.out_layernorm(out)

        elif self.adapter_type == "gated_league":
            linear_out = self.linear_adapter(representation)
            soft_out = torch.mm(distribution, self.embed_adapter.weight).view(batch, seq_len, -1)
            coef = (self.gate_linear(torch.cat([linear_out, soft_out], dim=-1))).sigmoid()
            out = coef * linear_out + (1 - coef) * soft_out
-            out = self.out_layernorm(out)

        elif self.adapter_type == "none":
            out = representation
@@ -223,10 +215,10 @@ class Adapter(nn.Module):
        return out, padding


-class TextEncoder(nn.Module):
+class TextEncoder(FairseqEncoder):
    def __init__(self, args, dictionary):

-        super().__init__()
+        super().__init__(None)

        self.embed_tokens = None

@@ -285,9 +277,9 @@ class S2TSATEEncoder(FairseqEncoder):
        # acoustic encoder
        acoustic_encoder_type = args.acoustic_encoder
        if acoustic_encoder_type == "transformer":
-            self.acoustic_encoder = S2TTransformerEncoder(args, task, embed_tokens)
+            self.acoustic_encoder = S2TTransformerEncoder(args, task)
        elif acoustic_encoder_type == "pds":
-            self.acoustic_encoder = PDSS2TTransformerEncoder(args, task, embed_tokens)
+            self.acoustic_encoder = PDSS2TTransformerEncoder(args, task)
        else:
            logging.error("Unsupported model arch {}!".format(acoustic_encoder_type))

@@ -295,8 +287,8 @@ class S2TSATEEncoder(FairseqEncoder):
        self.temperature = args.temperature
        self.adapter = Adapter(args, task.source_dictionary, embed_tokens)

-        if args.share_ctc_and_adapter and hasattr(self.adapter, "linear_adapter"):
-            self.acoustic_encoder.ctc_projection.weight = self.adapter.linear_adapter[0].weight
+        if args.share_ctc_and_adapter and hasattr(self.adapter, "embed_adapter"):
+            self.acoustic_encoder.ctc.ctc_projection.weight = self.adapter.embed_adapter.weight

        # self.length_adapter = Conv1dSubsampler(
        #     args.encoder_embed_dim,
@@ -463,6 +455,7 @@ def base_architecture(args):
    args.temperature = getattr(args, "temperature", 1.0)
    args.text_encoder_layers = getattr(args, "text_encoder_layers", 6)
    args.text_attention_type = getattr(args, "text_attention_type", "selfattn")
+    args.share_ctc_and_adapter = getattr(args, "share_ctc_and_adapter", False)

    # PDS
    args.pds_stages = getattr(args, "pds_stages", None)

--- a/fairseq/models/speech_to_text/s2t_transformer.py
+++ b/fairseq/models/speech_to_text/s2t_transformer.py
@@ -511,8 +511,8 @@ class S2TTransformerEncoder(FairseqEncoder):
        self.use_ctc = "sate" in args.arch or \
                       (("ctc" in getattr(args, "criterion", "")) and (getattr(args, "ctc_weight", 0) > 0))
        if self.use_ctc:
-            self.ctc_layer = (args.encoder_layers + args.ctc_layer) % args.encoder_layers
-            self.inter_ctc = True if self.ctc_layer != args.encoder_layers else False
+            self.ctc_layer = args.ctc_layer
+            self.inter_ctc = True if self.ctc_layer != 0 and self.ctc_layer != args.encoder_layers else False
            if self.inter_ctc:
                logger.info("Intermedia CTC loss in layer %d" % self.ctc_layer)
            self.ctc = CTC(args.encoder_embed_dim,
@@ -520,7 +520,7 @@ class S2TTransformerEncoder(FairseqEncoder):
                           dropout=args.dropout,
                           need_layernorm=True if self.inter_ctc else False)

-            if task.source_dictionary == task.target_dictionary:
+            if task.source_dictionary == task.target_dictionary and embed_tokens is not None:
                self.ctc.ctc_projection.weight = embed_tokens.weight

        self.interleaved_dropout = getattr(args, "interleave_dropout", None)
@@ -554,7 +554,6 @@ class S2TTransformerEncoder(FairseqEncoder):

    def forward(self, src_tokens, src_lengths):

-        ctc_input = None
        if self.history is not None:
            self.history.clean()

@@ -632,7 +631,7 @@ class S2TTransformerEncoder(FairseqEncoder):

        return {
            "encoder_out": [x],  # T x B x C
-            "ctc_logit": [ctc_logit],    # B x T x C
+            "ctc_logit": [] if ctc_logit is None else [ctc_logit],    # B x T x C
            "encoder_padding_mask": [encoder_padding_mask],  # B x T
            "encoder_embedding": [],  # B x T x C
            "encoder_states": [],  # List[T x B x C]
@@ -648,7 +647,7 @@ class S2TTransformerEncoder(FairseqEncoder):

        new_ctc_logit = (
            [] if len(encoder_out["ctc_logit"]) == 0
-            else [x.index_select(1, new_order) for x in encoder_out["ctc_logit"]]
+            else [x.index_select(1, new_order) for x in encoder_out["ctc_logit"] if x is not None]
        )

        new_encoder_padding_mask = (

--- a/fairseq/models/transformer.py
+++ b/fairseq/models/transformer.py
@@ -218,6 +218,28 @@ class TransformerModel(FairseqEncoderDecoderModel):
            ],
            help="transformer decoder self-attention layer type"
        )
+        parser.add_argument(
+            "--use-enc-dlcl",
+            default=False,
+            action='store_true',
+            help="use dlcl encoder",
+        )
+        parser.add_argument(
+            "--use-dec-dlcl",
+            default=False,
+            action='store_true',
+            help="use dlcl encoder",
+        )
+        parser.add_argument(
+            '--encoder-history-type',
+            default="learnable_dense",
+            help='encoder layer history type'
+        )
+        parser.add_argument(
+            '--decoder-history-type',
+            default="learnable_dense",
+            help='decoder layer history type'
+        )
        parser.add_argument('--max-encoder-relative-length', type=int, default=-1,
                            help='the max encoder relative length')
        parser.add_argument('--max-decoder-relative-length', type=int, default=-1,
@@ -474,6 +496,11 @@ class TransformerEncoder(FairseqEncoder):
        else:
            self.layer_norm = None

+        if getattr(args, "use_enc_dlcl", False):
+            self.history = CreateLayerHistory(args, is_encoder=True)
+        else:
+            self.history = None
+
    def build_encoder_layer(self, args):
        layer = TransformerEncoderLayer(args)
        if getattr(args, "checkpoint_activations", False):
@@ -571,6 +598,9 @@ class TransformerEncoder(FairseqEncoder):
        encoder_padding_mask = src_tokens.eq(self.padding_idx)
        has_pads = (src_tokens.device.type == "xla" or encoder_padding_mask.any())

+        if self.history is not None:
+            self.history.clean()
+
        x, encoder_embedding = self.forward_embedding(src_tokens, token_embeddings)

        # account for padding while computing the representation
@@ -585,8 +615,15 @@ class TransformerEncoder(FairseqEncoder):
        if return_all_hiddens:
            encoder_states.append(x)

+        # add emb into history
+        if self.history is not None:
+            self.history.add(x)
+
        # encoder layers
        for layer in self.layers:
+            if self.history is not None:
+                x = self.history.pop()
+
            x = layer(
                x, encoder_padding_mask=encoder_padding_mask if has_pads else None
            )
@@ -594,6 +631,12 @@ class TransformerEncoder(FairseqEncoder):
                assert encoder_states is not None
                encoder_states.append(x)

+            if self.history is not None:
+                self.history.add(x)
+
+        if self.history is not None:
+            x = self.history.pop()
+
        if self.layer_norm is not None:
            x = self.layer_norm(x)


--- a/fairseq_cli/train.py
+++ b/fairseq_cli/train.py
@@ -67,6 +67,9 @@ def main(cfg: FairseqConfig) -> None:
    # Print args
    logger.info(cfg)

+    with open(os.path.join(cfg.checkpoint.save_dir, "config.yaml"), 'w') as f:
+        f.write("%s" % OmegaConf.to_yaml(cfg))
+
    if cfg.checkpoint.write_checkpoints_asynchronously:
        try:
            import iopath  # noqa: F401

--- a/scripts/average_checkpoints.py
+++ b/scripts/average_checkpoints.py
@@ -73,33 +73,38 @@ def average_checkpoints(inputs):
    return new_state


-def last_n_checkpoints(paths, n, update_based, upper_bound=None):
+def last_n_checkpoints(paths, n, combine_choice, upper_bound=None, max_metric=False):
+    """Return the n top-ranked checkpoint files in paths[0]; combine_choice is "update", "best", else epoch."""
    assert len(paths) == 1
    path = paths[0]
-    if update_based:
+    reverse = True  # epoch/update modes: largest number first
+    if combine_choice == "update":
        pt_regexp = re.compile(r"checkpoint_\d+_(\d+)\.pt")
+    elif combine_choice == "best":
+        reverse = bool(max_metric)  # sort ascending unless the metric is to be maximized
+        pt_regexp = re.compile(r"checkpoint\.best_\w+_\d+_(\d+\.?\d*)\.pt")  # any metric name, not only "loss"
    else:
        pt_regexp = re.compile(r"checkpoint(\d+)\.pt")
    files = PathManager.ls(path)

    entries = []
    for f in files:
        m = pt_regexp.fullmatch(f)
        if m is not None:
-            sort_key = int(m.group(1))
+            sort_key = float(m.group(1))  # the "best" pattern can capture a fractional value
            if upper_bound is None or sort_key <= upper_bound:
                entries.append((sort_key, m.group(0)))
    if len(entries) < n:
        raise Exception(
-            "Found {} checkpoint files but need at least {}", len(entries), n
+            "Found {} checkpoint files but need at least {}".format(len(entries), n)
        )
-    return [os.path.join(path, x[1]) for x in sorted(entries, reverse=True)[:n]]
+    return [os.path.join(path, x[1]) for x in sorted(entries, reverse=reverse)[:n]]


 def main():
    parser = argparse.ArgumentParser(
        description="Tool to average the params of input checkpoints to "
-        "produce a new checkpoint",
+                    "produce a new checkpoint",
    )
    # fmt: off
    parser.add_argument('--inputs', required=True, nargs='+',
@@ -109,42 +114,55 @@ def main():
    num_group = parser.add_mutually_exclusive_group()
    num_group.add_argument('--num-epoch-checkpoints', type=int,
                           help='if set, will try to find checkpoints with names checkpoint_xx.pt in the path specified by input, '
-                           'and average last this many of them.')
+                                'and average last this many of them.')
    num_group.add_argument('--num-update-checkpoints', type=int,
                           help='if set, will try to find checkpoints with names checkpoint_ee_xx.pt in the path specified by input, '
-                           'and average last this many of them.')
+                                'and average last this many of them.')
+    num_group.add_argument('--num-best-checkpoints', type=int,
+                           help='if set, will try to find checkpoints with names checkpoint.best_{metric}_{epoch}_{performance}.pt in the path specified by input, '
+                                'and average last this many of them.')
+    num_group.add_argument('--maximize-best-checkpoint-metric', default=False, action="store_true",
+                           help='if set, will try to find checkpoints with names checkpoint.best_{metric}_{epoch}_{performance}.pt in the path specified by input, '
+                                'and average last this many of them.')
    parser.add_argument('--checkpoint-upper-bound', type=int,
                        help='when using --num-epoch-checkpoints, this will set an upper bound on which epoch to use, '
-                        'when using --num-update-checkpoints, this will set an upper bound on which update to use'
-                        'e.g., with --num-epoch-checkpoints=10 --checkpoint-upper-bound=50, checkpoints 41-50 would be averaged.'
-                        'e.g., with --num-update-checkpoints=10 --checkpoint-upper-bound=50000, checkpoints 40500-50000 would be averaged assuming --save-interval-updates 500'
+                             'when using --num-update-checkpoints, this will set an upper bound on which update to use'
+                             'e.g., with --num-epoch-checkpoints=10 --checkpoint-upper-bound=50, checkpoints 41-50 would be averaged.'
+                             'e.g., with --num-update-checkpoints=10 --checkpoint-upper-bound=50000, checkpoints 40500-50000 would be averaged assuming --save-interval-updates 500.'
+                             'e.g., with --num-best-checkpoints=10 --checkpoint-upper-bound=5, checkpoints (loss) <= 5 would be averaged.'
                        )
    # fmt: on
    args = parser.parse_args()
    print(args)

    num = None
-    is_update_based = False
+    combine_choice = "epoch"
+    max_metric = args.maximize_best_checkpoint_metric
    if args.num_update_checkpoints is not None:
        num = args.num_update_checkpoints
-        is_update_based = True
+        combine_choice = "update"
    elif args.num_epoch_checkpoints is not None:
        num = args.num_epoch_checkpoints
+    elif args.num_best_checkpoints is not None:
+        num = args.num_best_checkpoints
+        combine_choice = "best"

    assert args.checkpoint_upper_bound is None or (
-        args.num_epoch_checkpoints is not None
-        or args.num_update_checkpoints is not None
-    ), "--checkpoint-upper-bound requires --num-epoch-checkpoints or --num-update-checkpoints"
+            args.num_epoch_checkpoints is not None
+            or args.num_update_checkpoints is not None
+            or args.num_best_checkpoints is not None
+    ), "--checkpoint-upper-bound requires --num-epoch-checkpoints or --num-update-checkpoints or --num-best-checkpoints"
    assert (
-        args.num_epoch_checkpoints is None or args.num_update_checkpoints is None
+            args.num_epoch_checkpoints is None or args.num_update_checkpoints is None
    ), "Cannot combine --num-epoch-checkpoints and --num-update-checkpoints"

    if num is not None:
        args.inputs = last_n_checkpoints(
            args.inputs,
            num,
-            is_update_based,
+            combine_choice,
            upper_bound=args.checkpoint_upper_bound,
+            max_metric=max_metric
        )
        print("averaging checkpoints: ", args.inputs)