Commit bab6c520 by xuchen

fix the bugs and use float32 for softmax

parent b0a45459
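The recipe scripts and configs below are the part of this commit shown here; the float32-softmax change itself lands in the model code, which this listing does not include. As a rough, generic sketch of the technique named in the commit title (not the actual patch), computing an attention softmax in float32 under fp16 training looks like:

import torch
import torch.nn.functional as F

def softmax_fp32(scores: torch.Tensor) -> torch.Tensor:
    # Compute the softmax in float32 even when the model runs in fp16
    # (these recipes train with --fp16), then cast back to the input
    # dtype. Half precision can overflow in the exponentials; float32
    # keeps the normalization numerically stable.
    return F.softmax(scores, dim=-1, dtype=torch.float32).type_as(scores)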
set -e
eval=1
lcrm=1
tokenizer=0
vocab_type=unigram
vocab_size=5000
use_raw_audio=0
speed_perturb=0
dataset=iwslt2022
root_dir=~/st
code_dir=${root_dir}/Fairseq-S2T
org_data_dir=${root_dir}/data/${dataset}
data_dir=${root_dir}/data/${dataset}/asr
use_specific_dict=0
specific_prefix=st
specific_dir=${root_dir}/data/mustc/st
asr_vocab_prefix=spm_unigram10000_st_share
src_lang=en
tgt_lang=zh
subsets=(train_covost)
mkdir -p $data_dir
splits=$(echo ${subsets[*]} | sed 's/ /,/g')
cmd="python ${code_dir}/examples/speech_to_text/prep_audio_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}
--task asr
--src-lang ${src_lang}
--splits ${splits}
--vocab-type ${vocab_type}
--vocab-size ${vocab_size}"
if [[ ${use_raw_audio} -eq 1 ]]; then
cmd="$cmd
--raw"
fi
if [[ ${use_specific_dict} -eq 1 ]]; then
cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}
cmd="$cmd
--asr-prefix ${asr_vocab_prefix}"
fi
if [[ ${speed_perturb} -eq 1 ]]; then
cmd="$cmd
--speed-perturb"
fi
if [[ ${lcrm} -eq 1 ]]; then
cmd="$cmd
--lowercase-src
--rm-punc-src"
fi
if [[ ${tokenizer} -eq 1 ]]; then
cmd="$cmd
--tokenizer"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
arch: s2t_transformer_s
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
attention-dropout: 0.1
activation-dropout: 0.1
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
train-subset: train
valid-subset: dev
max-epoch: 100
max-update: 100000
patience: 20
best_checkpoint_metric: loss
maximize_best_checkpoint_metric: False
no-epoch-checkpoints: True
#keep-last-epochs: 10
keep-best-checkpoints: 10
num-workers: 8
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
arch: s2t_transformer_m
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 31
encoder-attention-type: rel_pos
encoder-activation-fn: swish
ctc-weight: 0.3
post-process: sentencepiece
use-enc-dlcl: True
use-dec-dlcl: True
ctc-weight: 0.2
intermedia-ctc-layers: 6,9
intermedia-adapter: league
intermedia-ctc-weight: 0.1
ctc-self-distill-weight: 0
post-process: sentencepiece
encoder-attention-type: local
hard-mask-window: 0
gauss-mask-sigma: 3
init-mask-weight: 0
arch: pdss2t_transformer_s_8
pds-fusion: True
ctc-layer: 12
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: pdss2t_transformer_s_16
encoder-embed-dim: 256
pds-stages: 4
ctc-layer: 12
pds-layers: 2_2_6_2
pds-ratios: 2_2_2_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: pdss2t_transformer_s_32
encoder-embed-dim: 256
pds-stages: 5
ctc-layer: 12
pds-layers: 2_2_3_3_2
pds-ratios: 2_2_2_2_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 256_256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1_1
pds-kernel-sizes: 5_5_5_5_5
pds-ffn-ratios: 8_8_8_8_8
pds-attn-heads: 4_4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: pdss2t_transformer_s_8
encoder-embed-dim: 256
pds-stages: 4
ctc-layer: 12
pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: pdss2t_transformer_m_8
encoder-embed-dim: 512
pds-stages: 4
ctc-layer: 12
pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 512_512_512_512
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 4_4_4_4
pds-attn-heads: 8_8_8_8
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.15
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_ctc
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: ctc
zero_infinity: True
post-process: sentencepiece
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
attention-dropout: 0.1
activation-dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
encoder-attention-heads: 4
#load-pretrained-encoder-from:
encoder-attention-type: rel_selfattn
#encoder-attention-type: relative
#max-encoder-relative-length: 100
#! /bin/bash
gpu_num=1
data_dir=
test_subset=(dev tst-COMMON)
exp_name=
if [ "$#" -eq 1 ]; then
exp_name=$1
fi
cer=0
n_average=10
beam_size=5
len_penalty=1.0
max_tokens=80000
dec_model=checkpoint_best.pt
cmd="./run.sh
--stage 2
--stop_stage 2
--gpu_num ${gpu_num}
--exp_name ${exp_name}
--n_average ${n_average}
--cer ${cer}
--beam_size ${beam_size}
--len_penalty ${len_penalty}
--max_tokens ${max_tokens}
--dec_model ${dec_model}
"
if [[ -n ${data_dir} ]]; then
cmd="$cmd --data_dir ${data_dir}"
fi
if [[ ${#test_subset[@]} -ne 0 ]]; then
subsets=$(echo ${test_subset[*]} | sed 's/ /,/g')
cmd="$cmd --test_subset ${subsets}"
fi
echo $cmd
eval $cmd
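# Wait until ${gpu_num} idle GPUs are available, then launch train.sh.
# A GPU counts as idle when the memory usage reported by gpustat is below 100 MB.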
gpu_num=4
cmd="sh train.sh"
while :
do
record=$(mktemp -t temp.record.XXXXXX)
gpustat > $record
all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
count=0
for dev in ${all_devices[@]}
do
line=$((dev + 2))
use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
if [[ $use -lt 100 ]]; then
device[$count]=$dev
count=$((count + 1))
if [[ $count -eq $gpu_num ]]; then
break
fi
fi
done
if [[ ${#device[@]} -lt $gpu_num ]]; then
sleep 60s
else
echo "Run $cmd"
eval $cmd
sleep 10s
exit
fi
done
#!/usr/bin/env bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey);
# Arnab Ghoshal, Karel Vesely
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Parse command-line options.
# To be sourced by another script (as in ". parse_options.sh").
# Option format is: --option-name arg
# and shell variable "option_name" gets set to value "arg."
# The exception is --help, which takes no arguments, but prints the
# $help_message variable (if defined).
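# For example (a sketch; "foo.sh" and its option variable are hypothetical):
#   # inside foo.sh:
#   gpu_num=1                    # default value, defined before parsing
#   . ./local/parse_options.sh   # consumes foo.sh's command-line arguments
#   # calling ./foo.sh --gpu_num 4 then sets gpu_num=4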
###
### The --config file options have lower priority than command-line
### options, so we need to import them first...
###
# Now import all the configs specified by command-line, in left-to-right order
for ((argpos=1; argpos<$#; argpos++)); do
if [ "${!argpos}" == "--config" ]; then
argpos_plus1=$((argpos+1))
config=${!argpos_plus1}
[ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
. $config # source the config file.
fi
done
###
### Now we process the command line options
###
while true; do
[ -z "${1:-}" ] && break; # break if there are no arguments
case "$1" in
# If the enclosing script is called with --help option, print the help
# message and exit. Scripts should put help messages in $help_message
--help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
else printf "$help_message\n" 1>&2 ; fi;
exit 0 ;;
--*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
exit 1 ;;
# If the first command-line argument begins with "--" (e.g. --foo-bar),
# then work out the variable name as $name, which will equal "foo_bar".
--*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
# Next we test whether the variable in question is undefined-- if so it's
# an invalid option and we die. Note: $0 evaluates to the name of the
# enclosing script.
# The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
# is undefined. We then have to wrap this test inside "eval" because
# foo_bar is itself inside a variable ($name).
eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
oldval="`eval echo \\$$name`";
# Work out whether we seem to be expecting a Boolean argument.
if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
was_bool=true;
else
was_bool=false;
fi
# Set the variable to the right value-- the escaped quotes make it work if
# the option had spaces, like --cmd "queue.pl -sync y"
eval $name=\"$2\";
# Check that Boolean-valued arguments are really Boolean.
if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
exit 1;
fi
shift 2;
;;
*) break;
esac
done
# Check for an empty argument to the --cmd option, which can easily occur as a
# result of scripting errors.
[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
true; # so this script returns exit code 0.
get_devices(){
gpu_num=$1
use_cpu=$2
device=()
while :
do
record=$(mktemp -t temp.record.XXXXXX)
gpustat > $record
all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
count=0
for dev in ${all_devices[@]}
do
line=$((dev + 2))
use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
if [[ $use -lt 100 ]]; then
device[$count]=$dev
count=$((count + 1))
if [[ $count -eq $gpu_num ]]; then
break
fi
fi
done
if [[ ${#device[@]} -lt $gpu_num ]]; then
if [[ $use_cpu -eq 1 ]]; then
device=(-1)
else
sleep 60s
fi
else
break
fi
done
echo ${device[*]} | sed 's/ /,/g'
return $?
}
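# Example usage (a sketch, matching how the run scripts below call it):
#   source ./local/utils.sh
#   device=$(get_devices 2 0)    # blocks until 2 idle GPUs are found
#   export CUDA_VISIBLE_DEVICES=${device}
# With use_cpu=1 it yields -1 (CPU) instead of waiting for a free GPU.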
#! /bin/bash
# Processing MuST-C Datasets
# Copyright 2021 Natural Language Processing Laboratory
# Xu Chen (xuchenneu@163.com)
# Set bash to 'debug' mode; it will exit on:
# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline', -x 'print commands'
set -e
#set -u
set -o pipefail
export PYTHONIOENCODING=UTF-8
eval=1
time=$(date "+%m%d_%H%M")
stage=0
stop_stage=0
######## hardware ########
# devices
#device=()
gpu_num=8
update_freq=1
root_dir=~/st
code_dir=${root_dir}/Fairseq-S2T
pwd_dir=$PWD
# dataset
src_lang=en
tgt_lang=de
lang=${src_lang}-${tgt_lang}
dataset=mustc
task=speech_to_text
vocab_type=unigram
vocab_size=5000
speed_perturb=0
lcrm=0
tokenizer=0
use_raw_audio=0
use_specific_dict=1
specific_prefix=st
specific_dir=${root_dir}/data/mustc/st
asr_vocab_prefix=spm_unigram10000_st_share
org_data_dir=${root_dir}/data/${dataset}
data_dir=${root_dir}/data/${dataset}/asr
train_split=train
valid_split=dev
test_split=tst-COMMON
test_subset=tst-COMMON
# exp
exp_prefix=$(date "+%m%d")
extra_tag=
extra_parameter=
exp_tag=baseline
exp_name=
# config
train_config=base
data_config=config.yaml
# training setting
fp16=1
max_tokens=40000
step_valid=0
# decoding setting
cer=0
dec_model=checkpoint_best.pt
n_average=10
beam_size=5
len_penalty=1.0
if [[ ${speed_perturb} -eq 1 ]]; then
data_dir=${data_dir}_sp
exp_prefix=${exp_prefix}_sp
fi
if [[ ${lcrm} -eq 1 ]]; then
data_dir=${data_dir}_lcrm
exp_prefix=${exp_prefix}_lcrm
fi
if [[ ${use_specific_dict} -eq 1 ]]; then
data_dir=${data_dir}_${specific_prefix}
exp_prefix=${exp_prefix}_${specific_prefix}
fi
if [[ ${tokenizer} -eq 1 ]]; then
data_dir=${data_dir}_tok
exp_prefix=${exp_prefix}_tok
fi
if [[ ${use_raw_audio} -eq 1 ]]; then
data_dir=${data_dir}_raw
exp_prefix=${exp_prefix}_raw
fi
. ./local/parse_options.sh || exit 1;
if [[ -z ${exp_name} ]]; then
config_string=${train_config//,/_}
exp_name=${exp_prefix}_${config_string}_${exp_tag}
if [[ -n ${extra_tag} ]]; then
exp_name=${exp_name}_${extra_tag}
fi
fi
model_dir=${root_dir}/checkpoints/${dataset}/asr/${exp_name}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download"
# pass
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
### Task dependent. You have to do the following data preparation part yourself.
### But you can utilize Kaldi recipes in most cases
echo "stage 0: ASR Data Preparation"
if [[ ! -e ${data_dir} ]]; then
mkdir -p ${data_dir}
fi
feature_zip=fbank80.zip
if [[ ${speed_perturb} -eq 1 ]]; then
feature_zip=fbank80_sp.zip
fi
if [[ ! -f ${data_dir}/${feature_zip} && -f ${data_dir}/../${feature_zip} ]]; then
ln -s ${data_dir}/../${feature_zip} ${data_dir}
fi
cmd="python ${code_dir}/examples/speech_to_text/prep_audio_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}
--task asr
--src-lang ${src_lang}
--splits ${valid_split},${test_split},${train_split}
--vocab-type ${vocab_type}
--vocab-size ${vocab_size}"
if [[ ${use_raw_audio} -eq 1 ]]; then
cmd="$cmd
--raw"
fi
if [[ ${use_specific_dict} -eq 1 ]]; then
cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}
cmd="$cmd
--asr-prefix ${asr_vocab_prefix}"
fi
if [[ ${speed_perturb} -eq 1 ]]; then
cmd="$cmd
--speed-perturb"
fi
if [[ ${lcrm} -eq 1 ]]; then
cmd="$cmd
--lowercase-src
--rm-punc-src"
fi
if [[ ${tokenizer} -eq 1 ]]; then
cmd="$cmd
--tokenizer"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
if [[ -f ${data_dir}/${feature_zip} && ! -f ${data_dir}/../${feature_zip} ]]; then
mv ${data_dir}/${feature_zip} ${data_dir}/..
ln -s ${data_dir}/../${feature_zip} ${data_dir}
fi
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "stage 1: ASR Network Training"
[[ ! -d ${data_dir} ]] && echo "The data dir ${data_dir} does not exist!" && exit 1;
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then
device=""
else
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
fi
echo -e "dev=${device} data=${data_dir} model=${model_dir}"
if [[ ! -d ${model_dir} ]]; then
mkdir -p ${model_dir}
else
echo "${model_dir} exists."
fi
cp ${BASH_SOURCE[0]} ${model_dir}
cp ${PWD}/train.sh ${model_dir}
extra_parameter="${extra_parameter}
--train-config ${pwd_dir}/conf/basis.yaml"
cp ${pwd_dir}/conf/basis.yaml ${model_dir}
config_list="${train_config//,/ }"
idx=1
for config in ${config_list[@]}
do
config_path=${pwd_dir}/conf/${config}.yaml
if [[ ! -f ${config_path} ]]; then
echo "No config file ${config_path}"
exit
fi
cp ${config_path} ${model_dir}
extra_parameter="${extra_parameter}
--train-config${idx} ${config_path}"
idx=$((idx + 1))
done
cmd="python3 -u ${code_dir}/fairseq_cli/train.py
${data_dir}
--config-yaml ${data_config}
--task ${task}
--max-tokens ${max_tokens}
--skip-invalid-size-inputs-valid-test
--update-freq ${update_freq}
--log-interval 100
--save-dir ${model_dir}
--tensorboard-logdir ${model_dir}"
if [[ -n ${extra_parameter} ]]; then
cmd="${cmd}
${extra_parameter}"
fi
if [[ ${gpu_num} -gt 0 ]]; then
cmd="${cmd}
--distributed-world-size $gpu_num
--ddp-backend no_c10d"
fi
if [[ $fp16 -eq 1 ]]; then
cmd="${cmd}
--fp16"
fi
if [[ $step_valid -eq 1 ]]; then
validate_interval=1
save_interval=1
keep_last_epochs=10
no_epoch_checkpoints=0
save_interval_updates=500
keep_interval_updates=10
else
validate_interval=1
keep_last_epochs=10
fi
if [[ -n $no_epoch_checkpoints && $no_epoch_checkpoints -eq 1 ]]; then
cmd="$cmd
--no-epoch-checkpoints"
fi
if [[ -n $validate_interval ]]; then
cmd="${cmd}
--validate-interval $validate_interval "
fi
if [[ -n $save_interval ]]; then
cmd="${cmd}
--save-interval $save_interval "
fi
if [[ -n $keep_last_epochs ]]; then
cmd="${cmd}
--keep-last-epochs $keep_last_epochs "
fi
if [[ -n $save_interval_updates ]]; then
cmd="${cmd}
--save-interval-updates $save_interval_updates"
if [[ -n $keep_interval_updates ]]; then
cmd="${cmd}
--keep-interval-updates $keep_interval_updates"
fi
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
# save info
log=./history.log
echo "${time} | ${device} | ${data_dir} | ${exp_name} | ${model_dir} " >> $log
tail -n 50 ${log} > tmp.log
mv tmp.log $log
export CUDA_VISIBLE_DEVICES=${device}
cmd="nohup ${cmd} >> ${model_dir}/train.log 2>&1 &"
if [[ $eval -eq 1 ]]; then
eval $cmd
sleep 2s
tail -n "$(wc -l ${model_dir}/train.log | awk '{print $1+1}')" -f ${model_dir}/train.log
fi
fi
wait
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "stage 2: ASR Decoding"
if [[ ${n_average} -ne 1 ]]; then
# Average models
dec_model=avg_${n_average}_checkpoint.pt
if [[ ! -f ${model_dir}/${dec_model} ]]; then
cmd="python ${code_dir}/scripts/average_checkpoints.py
--inputs ${model_dir}
--num-best-checkpoints ${n_average}
--output ${model_dir}/${dec_model}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval $cmd
fi
else
dec_model=${dec_model}
fi
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then
device=""
else
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
fi
export CUDA_VISIBLE_DEVICES=${device}
result_file=${model_dir}/decode_result
[[ -f ${result_file} ]] && rm ${result_file}
test_subset=${test_subset//,/ }
for subset in ${test_subset[@]}; do
subset=${subset}
cmd="python ${code_dir}/fairseq_cli/generate.py
${data_dir}
--config-yaml ${data_config}
--gen-subset ${subset}
--task speech_to_text
--path ${model_dir}/${dec_model}
--results-path ${model_dir}
--max-tokens ${max_tokens}
--beam ${beam_size}
--lenpen ${len_penalty}
--scoring wer
--wer-tokenizer 13a
--wer-lowercase
--wer-remove-punct
"
if [[ ${cer} -eq 1 ]]; then
cmd="${cmd}
--wer-char-level"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
if [[ $eval -eq 1 ]]; then
eval $cmd
tail -n 1 ${model_dir}/generate-${subset}.txt >> ${result_file}
fi
done
cat ${result_file}
fi
#! /bin/bash
# training the model
gpu_num=8
update_freq=1
max_tokens=40000
extra_tag=
extra_parameter=
#extra_tag="${extra_tag}"
#extra_parameter="${extra_parameter} "
exp_tag=
#config_list=(base ctc)
config_list=(purectc)
#config_list=(base conformer)
#config_list=(pds_base_16)
#config_list=(pds_base_16 conformer rpr)
# exp full name
exp_name=
train_config=$(echo ${config_list[*]} | sed 's/ /,/g')
cmd="./run.sh
--stage 1
--stop_stage 1
--gpu_num ${gpu_num}
--update_freq ${update_freq}
--train_config ${train_config}
--max_tokens ${max_tokens}
"
if [[ -n ${exp_name} ]]; then
cmd="$cmd --exp_name ${exp_name}"
fi
if [[ -n ${exp_tag} ]]; then
cmd="$cmd --exp_tag ${exp_tag}"
fi
if [[ -n ${extra_tag} ]]; then
cmd="$cmd --extra_tag ${extra_tag}"
fi
if [[ -n ${extra_parameter} ]]; then
cmd="$cmd --extra_parameter \"${extra_parameter}\""
fi
echo ${cmd}
eval ${cmd}
set -e
eval=1
lcrm=0
root_dir=~/st/Fairseq-S2T
data_dir=/home/xuchen/st/data/wmt/test
vocab_dir=/home/xuchen/st/data/wmt/mt/en-de/unigram32000_share
src_vocab_prefix=spm_unigram32000_share
tgt_vocab_prefix=spm_unigram32000_share
src_lang=en
tgt_lang=de
tokenize=1
splits=(newstest2014 newstest2016)
for split in ${splits[@]}; do
src_file=${data_dir}/${split}.${src_lang}
tgt_file=${data_dir}/${split}.${tgt_lang}
if [[ ${tokenize} -eq 1 ]]; then
cmd="tokenizer.perl -l ${src_lang} --threads 8 -no-escape < ${src_file} > ${src_file}.tok"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
cmd="tokenizer.perl -l ${tgt_lang} --threads 8 -no-escape < ${tgt_file} > ${tgt_file}.tok"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
src_file=${src_file}.tok
tgt_file=${tgt_file}.tok
fi
cmd="cat ${src_file}"
if [[ ${lcrm} -eq 1 ]]; then
cmd="python local/lower_rm.py ${src_file}"
fi
cmd="${cmd}
| spm_encode --model ${vocab_dir}/${src_vocab_prefix}.model
--output_format=piece
> ${src_file}.spm"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
cmd="spm_encode
--model ${vocab_dir}/${tgt_vocab_prefix}.model
--output_format=piece
< ${tgt_file}
> ${tgt_file}.spm"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
src_file=${src_file}.spm
tgt_file=${tgt_file}.spm
mkdir -p ${data_dir}/final
cmd="cp ${src_file} ${data_dir}/final/${split}.${src_lang}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
cmd="cp ${tgt_file} ${data_dir}/final/${split}.${tgt_lang}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
done
n_set=${#splits[*]}
for ((i=0;i<$n_set;i++)); do
dataset[$i]=${data_dir}/final/${splits[$i]}
done
pref=`echo ${dataset[*]} | sed 's/ /,/g'`
cmd="python ${root_dir}/fairseq_cli/preprocess.py
--source-lang ${src_lang}
--target-lang ${tgt_lang}
--testpref ${pref}
--destdir ${data_dir}/data-bin
--srcdict ${vocab_dir}/${src_vocab_prefix}.txt
--tgtdict ${vocab_dir}/${tgt_vocab_prefix}.txt
--workers 64"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
arch: transformer
share-all-embeddings: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 8000
lr: 1e-3
adam_betas: (0.9,0.997)
criterion: label_smoothed_cross_entropy
label_smoothing: 0.1
dropout: 0.1
attention-dropout: 0.1
activation-dropout: 0.1
activation-fn: relu
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
train-subset: train
valid-subset: valid
max-epoch: 50
max-update: 100000
patience: 20
best_checkpoint_metric: loss
maximize_best_checkpoint_metric: False
no-epoch-checkpoints: True
#keep-last-epochs: 10
keep-best-checkpoints: 10
num-workers: 8
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
use-enc-dlcl: True
use-dec-dlcl: True
#encoder-attention-type: rel_selfattn
encoder-attention-type: relative
decoder-attention-type: relative
max-encoder-relative-length: 20
max-decoder-relative-length: 20
arch: transformer
share-all-embeddings: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 8000
lr: 1e-3
adam_betas: (0.9,0.997)
criterion: label_smoothed_cross_entropy
label_smoothing: 0.1
dropout: 0.1
attention-dropout: 0.1
activation-dropout: 0.1
activation-fn: relu
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
#! /bin/bash
gpu_num=1
data_dir=
test_subset=(test)
exp_name=
if [ "$#" -eq 1 ]; then
exp_name=$1
fi
sacrebleu=1
n_average=10
beam_size=5
len_penalty=1.0
max_tokens=80000
dec_model=checkpoint_best.pt
cmd="./run.sh
--stage 2
--stop_stage 2
--gpu_num ${gpu_num}
--exp_name ${exp_name}
--sacrebleu ${sacrebleu}
--n_average ${n_average}
--beam_size ${beam_size}
--len_penalty ${len_penalty}
--max_tokens ${max_tokens}
--dec_model ${dec_model}
"
if [[ -n ${data_dir} ]]; then
cmd="$cmd --data_dir ${data_dir}"
fi
if [[ -n ${test_subset} ]]; then
test_subset=`echo ${test_subset[*]} | sed 's/ /,/g'`
cmd="$cmd --test_subset ${test_subset}"
fi
echo $cmd
eval $cmd
import sys
import string
in_file = sys.argv[1]
with open(in_file, "r", encoding="utf-8") as f:
for line in f.readlines():
line = line.strip().lower()
for w in string.punctuation:
line = line.replace(w, "")
line = line.replace(" ", "")
print(line)
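# Usage sketch (as invoked by the lcrm branches of these recipes):
#   python local/lower_rm.py input.en > input.lcrm.en
# Each line is lowercased, punctuation is stripped, and all spaces are
# removed before the line is written to stdout.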
gpu_num=4
cmd="sh train.sh"
while :
do
record=$(mktemp -t temp.record.XXXXXX)
gpustat > $record
all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
count=0
for dev in ${all_devices[@]}
do
line=$((dev + 2))
use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
if [[ $use -lt 100 ]]; then
device[$count]=$dev
count=$((count + 1))
if [[ $count -eq $gpu_num ]]; then
break
fi
fi
done
if [[ ${#device[@]} -lt $gpu_num ]]; then
sleep 60s
else
echo "Run $cmd"
eval $cmd
sleep 10s
exit
fi
done
#!/usr/bin/env bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey);
# Arnab Ghoshal, Karel Vesely
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Parse command-line options.
# To be sourced by another script (as in ". parse_options.sh").
# Option format is: --option-name arg
# and shell variable "option_name" gets set to value "arg."
# The exception is --help, which takes no arguments, but prints the
# $help_message variable (if defined).
###
### The --config file options have lower priority than command-line
### options, so we need to import them first...
###
# Now import all the configs specified by command-line, in left-to-right order
for ((argpos=1; argpos<$#; argpos++)); do
if [ "${!argpos}" == "--config" ]; then
argpos_plus1=$((argpos+1))
config=${!argpos_plus1}
[ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
. $config # source the config file.
fi
done
###
### Now we process the command line options
###
while true; do
[ -z "${1:-}" ] && break; # break if there are no arguments
case "$1" in
# If the enclosing script is called with --help option, print the help
# message and exit. Scripts should put help messages in $help_message
--help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
else printf "$help_message\n" 1>&2 ; fi;
exit 0 ;;
--*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
exit 1 ;;
# If the first command-line argument begins with "--" (e.g. --foo-bar),
# then work out the variable name as $name, which will equal "foo_bar".
--*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
# Next we test whether the variable in question is undefined-- if so it's
# an invalid option and we die. Note: $0 evaluates to the name of the
# enclosing script.
# The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
# is undefined. We then have to wrap this test inside "eval" because
# foo_bar is itself inside a variable ($name).
eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
oldval="`eval echo \\$$name`";
# Work out whether we seem to be expecting a Boolean argument.
if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
was_bool=true;
else
was_bool=false;
fi
# Set the variable to the right value-- the escaped quotes make it work if
# the option had spaces, like --cmd "queue.pl -sync y"
eval $name=\"$2\";
# Check that Boolean-valued arguments are really Boolean.
if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
exit 1;
fi
shift 2;
;;
*) break;
esac
done
# Check for an empty argument to the --cmd option, which can easily occur as a
# result of scripting errors.
[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
true; # so this script returns exit code 0.
get_devices(){
gpu_num=$1
use_cpu=$2
device=()
while :
do
record=$(mktemp -t temp.record.XXXXXX)
gpustat > $record
all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
count=0
for dev in ${all_devices[@]}
do
line=$((dev + 2))
use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
if [[ $use -lt 100 ]]; then
device[$count]=$dev
count=$((count + 1))
if [[ $count -eq $gpu_num ]]; then
break
fi
fi
done
if [[ ${#device[@]} -lt $gpu_num ]]; then
if [[ $use_cpu -eq 1 ]]; then
device=(-1)
else
sleep 60s
fi
else
break
fi
done
echo ${device[*]} | sed 's/ /,/g'
return $?
}
#! /bin/bash
# Processing MuST-C Datasets
# Copyright 2021 Natural Language Processing Laboratory
# Xu Chen (xuchenneu@163.com)
# Set bash to 'debug' mode; it will exit on:
# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline', -x 'print commands'
set -e
#set -u
set -o pipefail
export PYTHONIOENCODING=UTF-8
eval=1
time=$(date "+%m%d")
stage=0
stop_stage=0
######## hardware ########
# devices
device=()
gpu_num=8
update_freq=1
root_dir=~/st
code_dir=${root_dir}/Fairseq-S2T
pwd_dir=$PWD
# dataset
src_lang=en
tgt_lang=de
lang=${src_lang}-${tgt_lang}
dataset=mustc
task=translation
vocab_type=unigram
vocab_size=10000
share_dict=1
lcrm=0
tokenizer=0
use_specific_dict=1
specific_prefix=st
specific_dir=${root_dir}/data/mustc/st
src_vocab_prefix=spm_unigram10000_st_share
tgt_vocab_prefix=spm_unigram10000_st_share
org_data_dir=${root_dir}/data/${dataset}
data_dir=${root_dir}/data/${dataset}/mt
train_subset=train
valid_subset=dev
trans_subset=tst-COMMON
test_subset=test
# exp
exp_prefix=${time}
extra_tag=
extra_parameter=
exp_tag=baseline
exp_name=
# config
train_config=base_s
# training setting
fp16=1
max_tokens=4096
step_valid=0
bleu_valid=0
# decoding setting
sacrebleu=1
dec_model=checkpoint_best.pt
n_average=10
beam_size=5
len_penalty=1.0
if [[ ${use_specific_dict} -eq 1 ]]; then
exp_prefix=${exp_prefix}_${specific_prefix}
data_dir=${data_dir}/${specific_prefix}
mkdir -p ${data_dir}
else
if [[ "${vocab_type}" == "char" ]]; then
vocab_name=${vocab_type}
exp_prefix=${exp_prefix}_${vocab_type}
else
vocab_name=${vocab_type}${vocab_size}
fi
data_dir=${data_dir}/${vocab_name}
src_vocab_prefix=spm_${vocab_name}_${src_lang}
tgt_vocab_prefix=spm_${vocab_name}_${tgt_lang}
if [[ $share_dict -eq 1 ]]; then
data_dir=${data_dir}_share
src_vocab_prefix=spm_${vocab_name}_share
tgt_vocab_prefix=spm_${vocab_name}_share
fi
fi
if [[ ${lcrm} -eq 1 ]]; then
data_dir=${data_dir}_lcrm
exp_prefix=${exp_prefix}_lcrm
fi
if [[ ${tokenizer} -eq 1 ]]; then
train_subset=${train_subset}.tok
valid_subset=${valid_subset}.tok
trans_subset=${trans_subset}.tok
data_dir=${data_dir}_tok
exp_prefix=${exp_prefix}_tok
fi
. ./local/parse_options.sh || exit 1;
# full path
if [[ -z ${exp_name} ]]; then
config_string=${train_config//,/_}
exp_name=${exp_prefix}_${config_string}_${exp_tag}
if [[ -n ${extra_tag} ]]; then
exp_name=${exp_name}_${extra_tag}
fi
fi
model_dir=${root_dir}/checkpoints/${dataset}/mt/${exp_name}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download"
# pass
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
### Task dependent. You have to do the following data preparation part yourself.
echo "stage 0: MT Data Preparation"
if [[ ! -e ${data_dir} ]]; then
mkdir -p ${data_dir}
fi
if [[ ! -f ${data_dir}/${src_vocab_prefix}.txt || ! -f ${data_dir}/${tgt_vocab_prefix}.txt ]]; then
if [[ ${use_specific_dict} -eq 0 ]]; then
cmd="python ${code_dir}/examples/speech_to_text/prep_mt_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}
--splits ${train_subset},${valid_subset},${trans_subset}
--src-lang ${src_lang}
--tgt-lang ${tgt_lang}
--vocab-type ${vocab_type}
--vocab-size ${vocab_size}"
if [[ $share_dict -eq 1 ]]; then
cmd="$cmd
--share"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
else
cp -r ${specific_dir}/${src_vocab_prefix}.* ${data_dir}
cp ${specific_dir}/${tgt_vocab_prefix}.* ${data_dir}
fi
fi
mkdir -p ${data_dir}/data
for split in ${train_subset} ${valid_subset} ${trans_subset}; do
{
if [[ -d ${org_data_dir}/data/${split}/txt ]]; then
txt_dir=${org_data_dir}/data/${split}/txt
else
txt_dir=${org_data_dir}/data/${split}
fi
cmd="cat ${txt_dir}/${split}.${src_lang}"
if [[ ${lcrm} -eq 1 ]]; then
cmd="python local/lower_rm.py ${org_data_dir}/data/${split}.${src_lang}"
fi
cmd="${cmd}
| spm_encode --model ${data_dir}/${src_vocab_prefix}.model
--output_format=piece
> ${data_dir}/data/${split}.${src_lang}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
cmd="spm_encode
--model ${data_dir}/${tgt_vocab_prefix}.model
--output_format=piece
< ${txt_dir}/${split}.${tgt_lang}
> ${data_dir}/data/${split}.${tgt_lang}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
}&
done
wait
cmd="python ${code_dir}/fairseq_cli/preprocess.py
--source-lang ${src_lang} --target-lang ${tgt_lang}
--trainpref ${data_dir}/data/${train_subset}
--validpref ${data_dir}/data/${valid_subset}
--testpref ${data_dir}/data/${trans_subset}
--destdir ${data_dir}/data-bin
--srcdict ${data_dir}/${src_vocab_prefix}.txt
--tgtdict ${data_dir}/${tgt_vocab_prefix}.txt
--workers 64"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
fi
data_dir=${data_dir}/data-bin
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "stage 1: MT Network Training"
[[ ! -d ${data_dir} ]] && echo "The data dir ${data_dir} does not exist!" && exit 1;
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then
device=""
else
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
fi
echo -e "dev=${device} data=${data_dir} model=${model_dir}"
if [[ ! -d ${model_dir} ]]; then
mkdir -p ${model_dir}
else
echo "${model_dir} exists."
fi
cp ${BASH_SOURCE[0]} ${model_dir}
cp ${PWD}/train.sh ${model_dir}
extra_parameter="${extra_parameter}
--train-config ${pwd_dir}/conf/basis.yaml"
cp ${pwd_dir}/conf/basis.yaml ${model_dir}
config_list="${train_config//,/ }"
idx=1
for config in ${config_list[@]}
do
config_path=${pwd_dir}/conf/${config}.yaml
if [[ ! -f ${config_path} ]]; then
echo "No config file ${config_path}"
exit
fi
cp ${config_path} ${model_dir}
extra_parameter="${extra_parameter}
--train-config${idx} ${config_path}"
idx=$((idx + 1))
done
cmd="python3 -u ${code_dir}/fairseq_cli/train.py
${data_dir}
--source-lang ${src_lang}
--target-lang ${tgt_lang}
--task ${task}
--max-tokens ${max_tokens}
--skip-invalid-size-inputs-valid-test
--update-freq ${update_freq}
--log-interval 100
--save-dir ${model_dir}
--tensorboard-logdir ${model_dir}"
if [[ -n ${extra_parameter} ]]; then
cmd="${cmd}
${extra_parameter}"
fi
if [[ ${gpu_num} -gt 0 ]]; then
cmd="${cmd}
--distributed-world-size $gpu_num
--ddp-backend no_c10d"
fi
if [[ $fp16 -eq 1 ]]; then
cmd="${cmd}
--fp16"
fi
if [[ $step_valid -eq 1 ]]; then
validate_interval=1
save_interval=1
no_epoch_checkpoints=0
save_interval_updates=500
keep_interval_updates=10
fi
if [[ $bleu_valid -eq 1 ]]; then
cmd="$cmd
--eval-bleu
--eval-bleu-args '{\"beam\": 1}'
--eval-tokenized-bleu
--eval-bleu-remove-bpe
--best-checkpoint-metric bleu
--maximize-best-checkpoint-metric"
fi
if [[ -n $no_epoch_checkpoints && $no_epoch_checkpoints -eq 1 ]]; then
cmd="$cmd
--no-epoch-checkpoints"
fi
if [[ -n $validate_interval ]]; then
cmd="${cmd}
--validate-interval $validate_interval "
fi
if [[ -n $save_interval ]]; then
cmd="${cmd}
--save-interval $save_interval "
fi
if [[ -n $save_interval_updates ]]; then
cmd="${cmd}
--save-interval-updates $save_interval_updates"
if [[ -n $keep_interval_updates ]]; then
cmd="${cmd}
--keep-interval-updates $keep_interval_updates"
fi
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
# save info
log=./history.log
echo "${time} | ${device} | ${data_dir} | ${exp_name} | ${model_dir} " >> $log
tail -n 50 ${log} > tmp.log
mv tmp.log $log
export CUDA_VISIBLE_DEVICES=${device}
cmd="nohup ${cmd} >> ${model_dir}/train.log 2>&1 &"
if [[ $eval -eq 1 ]]; then
eval $cmd
sleep 2s
tail -n "$(wc -l ${model_dir}/train.log | awk '{print $1+1}')" -f ${model_dir}/train.log
fi
fi
wait
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "stage 2: MT Decoding"
if [[ ${n_average} -ne 1 ]]; then
# Average models
dec_model=avg_${n_average}_checkpoint.pt
if [[ ! -f ${model_dir}/${dec_model} ]]; then
cmd="python ${code_dir}/scripts/average_checkpoints.py
--inputs ${model_dir}
--num-best-checkpoints ${n_average}
--output ${model_dir}/${dec_model}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval $cmd
fi
else
dec_model=${dec_model}
fi
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then
device=""
else
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
fi
export CUDA_VISIBLE_DEVICES=${device}
result_file=${model_dir}/decode_result
[[ -f ${result_file} ]] && rm ${result_file}
test_subset=(${test_subset//,/ })
for subset in ${test_subset[@]}; do
cmd="python ${code_dir}/fairseq_cli/generate.py
${data_dir}
--source-lang ${src_lang}
--target-lang ${tgt_lang}
--gen-subset ${subset}
--task ${task}
--path ${model_dir}/${dec_model}
--results-path ${model_dir}
--max-tokens ${max_tokens}
--beam ${beam_size}
--lenpen ${len_penalty}
--post-process sentencepiece"
if [[ ${sacrebleu} -eq 1 ]]; then
cmd="${cmd}
--scoring sacrebleu"
if [[ ${tokenizer} -eq 1 ]]; then
cmd="${cmd}
--tokenizer moses
--moses-source-lang ${src_lang}
--moses-target-lang ${tgt_lang}"
fi
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
if [[ $eval -eq 1 ]]; then
eval $cmd
tail -n 1 ${model_dir}/generate-${subset}.txt >> ${result_file}
fi
done
cat ${result_file}
fi
#! /bin/bash
# training the model
gpu_num=1
update_freq=1
max_tokens=8192
exp_tag=baseline
config_list=(base)
# exp full name
exp_name=
extra_tag=
extra_parameter=
#extra_tag="${extra_tag}"
#extra_parameter="${extra_parameter} "
train_config=$(echo ${config_list[*]} | sed 's/ /,/g')
cmd="./run.sh
--stage 1
--stop_stage 1
--gpu_num ${gpu_num}
--update_freq ${update_freq}
--train_config ${train_config}
--max_tokens ${max_tokens}
"
if [[ -n ${exp_name} ]]; then
cmd="$cmd --exp_name ${exp_name}"
fi
if [[ -n ${exp_tag} ]]; then
cmd="$cmd --exp_tag ${exp_tag}"
fi
if [[ -n ${extra_tag} ]]; then
cmd="$cmd --extra_tag ${extra_tag}"
fi
if [[ -n ${extra_parameter} ]]; then
cmd="$cmd --extra_parameter \"${extra_parameter}\""
fi
echo ${cmd}
eval ${cmd}
set -e
eval=1
lcrm=1
tokenizer=0
vocab_type=unigram
vocab_size=5000
use_raw_audio=0
speed_perturb=0
dataset=mustc
root_dir=~/st
code_dir=${root_dir}/Fairseq-S2T
org_data_dir=${root_dir}/data/${dataset}
data_dir=${root_dir}/data/${dataset}/st
use_specific_dict=0
specific_prefix=st
specific_dir=${root_dir}/data/mustc/st
asr_vocab_prefix=spm_unigram10000_st_share
src_lang=en
tgt_lang=zh
subsets=(2019)
splits=$(echo ${subsets[*]} | sed 's/ /,/g')
cmd="python ${code_dir}/examples/speech_to_text/prep_audio_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}
--task st
--src-lang ${src_lang}
--tgt-lang ${tgt_lang}
--splits ${splits}
--vocab-type ${vocab_type}
--vocab-size ${vocab_size}"
if [[ ${use_raw_audio} -eq 1 ]]; then
cmd="$cmd
--raw"
fi
if [[ ${use_specific_dict} -eq 1 ]]; then
cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}
cmd="$cmd
--asr-prefix ${asr_vocab_prefix}"
fi
if [[ ${speed_perturb} -eq 1 ]]; then
cmd="$cmd
--speed-perturb"
fi
if [[ ${lcrm} -eq 1 ]]; then
cmd="$cmd
--lowercase-src
--rm-punc-src"
fi
if [[ ${tokenizer} -eq 1 ]]; then
cmd="$cmd
--tokenizer"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
arch: s2t_transformer_s
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
attention-dropout: 0.1
activation-dropout: 0.1
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
train-subset: train
valid-subset: dev
max-epoch: 100
max-update: 100000
patience: 20
best_checkpoint_metric: loss
maximize_best_checkpoint_metric: False
no-epoch-checkpoints: True
#keep-last-epochs: 10
keep-best-checkpoints: 10
num-workers: 8
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 31
encoder-attention-type: rel_pos
encoder-activation-fn: swish
ctc-weight: 0.3
post-process: sentencepiece
use-enc-dlcl: True
use-dec-dlcl: True
ctc-weight: 0.2
intermedia-ctc-layers: 6,9
intermedia-adapter: league
intermedia-ctc-weight: 0.1
ctc-self-distill-weight: 0
post-process: sentencepiece
encoder-attention-type: local
hard-mask-window: 0
gauss-mask-sigma: 3
init-mask-weight: 0
arch: pdss2t_transformer_s_8
pds-fusion: True
ctc-layer: 12
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: pdss2t_transformer_s_16
encoder-embed-dim: 256
pds-stages: 4
ctc-layer: 12
pds-layers: 2_2_6_2
pds-ratios: 2_2_2_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: pdss2t_transformer_s_32
encoder-embed-dim: 256
pds-stages: 5
ctc-layer: 12
pds-layers: 2_2_3_3_2
pds-ratios: 2_2_2_2_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 256_256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1_1
pds-kernel-sizes: 5_5_5_5_5
pds-ffn-ratios: 8_8_8_8_8
pds-attn-heads: 4_4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: pdss2t_transformer_s_8
pds-ctc: 1_1_1_1
intermedia-adapter: league
intermedia-ctc-weight: 0.15
encoder-embed-dim: 256
pds-stages: 4
ctc-layer: 12
pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
encoder-attention-type: rel_selfattn
#encoder-attention-type: relative
#decoder-attention-type: relative
#max-encoder-relative-length: 100
#max-decoder-relative-length: 20
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
encoder-normalize-before: True
decoder-normalize-before: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
acoustic-encoder: transformer
adapter: league
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
encoder-normalize-before: True
decoder-normalize-before: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
acoustic-encoder: transformer
adapter: league
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
encoder-normalize-before: True
decoder-normalize-before: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
acoustic-encoder: pds
adapter: league
encoder-embed-dim: 512
ctc-layer: 12
pds-stages: 4
pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 512_512_512_512
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 4_4_4_4
pds-attn-heads: 8_8_8_8
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
encoder-normalize-before: True
decoder-normalize-before: True
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
acoustic-encoder: pds
adapter: league
encoder-embed-dim: 256
ctc-layer: 12
pds-stages: 4
pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: True
pds-fusion-method: all_conv
pds-embed-dims: 256_256_256_256
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1_1
pds-kernel-sizes: 5_5_5_5
pds-ffn-ratios: 8_8_8_8
pds-attn-heads: 4_4_4_4
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
#! /bin/bash
gpu_num=1
data_dir=
test_subset=(dev tst-COMMON)
exp_name=
if [ "$#" -eq 1 ]; then
exp_name=$1
fi
sacrebleu=1
n_average=10
beam_size=5
len_penalty=1.0
max_tokens=80000
dec_model=checkpoint_best.pt
cmd="./run.sh
--stage 2
--stop_stage 2
--gpu_num ${gpu_num}
--exp_name ${exp_name}
--sacrebleu ${sacrebleu}
--n_average ${n_average}
--beam_size ${beam_size}
--len_penalty ${len_penalty}
--max_tokens ${max_tokens}
--dec_model ${dec_model}
"
if [[ -n ${data_dir} ]]; then
cmd="$cmd --data_dir ${data_dir}"
fi
if [[ ${#test_subset[@]} -ne 0 ]]; then
subsets=$(echo ${test_subset[*]} | sed 's/ /,/g')
cmd="$cmd --test_subset ${subsets}"
fi
echo $cmd
eval $cmd
set -e
gpu_num=1
root_dir=/home/xuchen/st/Fairseq-S2T
ckpt=/home/xuchen/st/checkpoints/mustc-v2/st
model_txt=$1
set=$2
test_subset=$3
#data_dir=/home/xuchen/st/data/mustc-v2/st_lcrm/en-de
#test_subset=(tst-COMMON)
data_dir=/media/data/tst/$set/en-de
#test_subset=(office)
#test_subset=(webrtc1)
#test_subset=(adap2)
data_config=config_st_share.yaml
result_file=./result
beam_size=5
lenpen=0.6
max_tokens=10000
models=()
i=0
for line in `cat $model_txt`; do
i=`expr $i + 1`
model_dir=$ckpt/$line
[[ ! -d $model_dir ]] && echo $model_dir && exit 1;
if [[ -f $model_dir/avg_10_checkpoint.pt ]]; then
model=$model_dir/avg_10_checkpoint.pt
else
model=$model_dir/checkpoint_best.pt
fi
[[ ! -f $model ]] && echo $model && exit 1;
models[$i]=$model
done
models=`echo ${models[*]} | sed 's/ /:/g'`
res_dir=$ckpt/ensemble/$set
i=0
while :
do
if [[ -d $res_dir/$i ]]; then
i=`expr $i + 1`
else
res_dir=$res_dir/$i
break
fi
done
mkdir -p $res_dir
cp $model_txt $res_dir
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then
device=()
else
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
fi
export CUDA_VISIBLE_DEVICES=${device}
for subset in ${test_subset[@]}; do
subset=${subset}_st
cmd="python ${root_dir}/fairseq_cli/generate.py
${data_dir}
--config-yaml ${data_config}
--gen-subset ${subset}
--task speech_to_text
--path ${models}
--results-path ${res_dir}
--skip-invalid-size-inputs-valid-test
--max-tokens ${max_tokens}
--beam ${beam_size}
--lenpen ${lenpen}
--scoring sacrebleu"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
eval $cmd
tail -n 1 ${res_dir}/generate-${subset}.txt
cd $res_dir
evaluate.sh translation-${subset}.txt $set
cd -
done
gpu_num=4
cmd="sh train.sh"
while :
do
record=$(mktemp -t temp.record.XXXXXX)
gpustat > $record
all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
count=0
for dev in ${all_devices[@]}
do
line=$((dev + 2))
use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
if [[ $use -lt 100 ]]; then
device[$count]=$dev
count=$((count + 1))
if [[ $count -eq $gpu_num ]]; then
break
fi
fi
done
if [[ ${#device[@]} -lt $gpu_num ]]; then
sleep 60s
else
echo "Run $cmd"
eval $cmd
sleep 10s
exit
fi
done
#!/usr/bin/env bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey);
# Arnab Ghoshal, Karel Vesely
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Parse command-line options.
# To be sourced by another script (as in ". parse_options.sh").
# Option format is: --option-name arg
# and shell variable "option_name" gets set to value "arg."
# The exception is --help, which takes no arguments, but prints the
# $help_message variable (if defined).
###
### The --config file options have lower priority than command-line
### options, so we need to import them first...
###
# Now import all the configs specified by command-line, in left-to-right order
for ((argpos=1; argpos<$#; argpos++)); do
if [ "${!argpos}" == "--config" ]; then
argpos_plus1=$((argpos+1))
config=${!argpos_plus1}
[ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
. $config # source the config file.
fi
done
###
### Now we process the command line options
###
while true; do
[ -z "${1:-}" ] && break; # break if there are no arguments
case "$1" in
# If the enclosing script is called with --help option, print the help
# message and exit. Scripts should put help messages in $help_message
--help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
else printf "$help_message\n" 1>&2 ; fi;
exit 0 ;;
--*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
exit 1 ;;
# If the first command-line argument begins with "--" (e.g. --foo-bar),
# then work out the variable name as $name, which will equal "foo_bar".
--*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
# Next we test whether the variable in question is undefined -- if so it's
# an invalid option and we die. Note: $0 evaluates to the name of the
# enclosing script.
# The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
# is undefined. We then have to wrap this test inside "eval" because
# foo_bar is itself inside a variable ($name).
eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
oldval="`eval echo \\$$name`";
# Work out whether we seem to be expecting a Boolean argument.
if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
was_bool=true;
else
was_bool=false;
fi
# Set the variable to the right value-- the escaped quotes make it work if
# the option had spaces, like --cmd "queue.pl -sync y"
eval $name=\"$2\";
# Check that Boolean-valued arguments are really Boolean.
if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
exit 1;
fi
shift 2;
;;
*) break;
esac
done
# Check for an empty argument to the --cmd option, which can easily occur as a
# result of scripting errors.
[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
true; # so this script returns exit code 0.
get_devices(){
gpu_num=$1
use_cpu=$2
device=()
while :
do
record=$(mktemp -t temp.record.XXXXXX)
gpustat > $record
all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
count=0
for dev in ${all_devices[@]}
do
line=$((dev + 2))
use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
if [[ $use -lt 100 ]]; then
device[$count]=$dev
count=$((count + 1))
if [[ $count -eq $gpu_num ]]; then
break
fi
fi
done
if [[ ${#device[@]} -lt $gpu_num ]]; then
if [[ $use_cpu -eq 1 ]]; then
device=(-1)
break
else
sleep 60s
fi
else
break
fi
done
echo ${device[*]} | sed 's/ /,/g'
return $?
}
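A rough Python analogue of get_devices above, for reuse outside bash. It shells out to nvidia-smi (assumed to be on PATH) instead of gpustat, and mirrors the memory cut above: a card counts as free when it uses under 100 MB. The function name and parameters are illustrative, not part of this repo.

import subprocess
import time

def get_free_gpus(gpu_num, use_cpu=False, poll_seconds=60):
    while True:
        # One integer per line: memory used on each GPU, in MB.
        out = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=memory.used",
             "--format=csv,noheader,nounits"],
            text=True,
        )
        free = [i for i, mb in enumerate(out.split()) if int(mb) < 100]
        if len(free) >= gpu_num:
            return free[:gpu_num]
        if use_cpu:
            return [-1]           # fall back to CPU, like device=(-1)
        time.sleep(poll_seconds)  # wait and poll again, like sleep 60s

# Example: print(",".join(map(str, get_free_gpus(2))))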
#! /bin/bash
# Processing MuST-C Datasets
# Copyright 2021 Natural Language Processing Laboratory
# Xu Chen (xuchenneu@163.com)
# Set bash to 'strict' mode, it will exit on:
# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline'; -x prints commands
set -e
#set -u
set -o pipefail
export PYTHONIOENCODING=UTF-8
eval=1
time=$(date "+%m%d_%H%M")
stage=0
stop_stage=0
######## hardware ########
# devices
#device=()
gpu_num=8
update_freq=1
root_dir=~/st
code_dir=${root_dir}/Fairseq-S2T
pwd_dir=$PWD
# dataset
src_lang=en
tgt_lang=de
lang=${src_lang}-${tgt_lang}
dataset=mustc
task=speech_to_text
vocab_type=unigram
asr_vocab_size=5000
vocab_size=10000
share_dict=1
speed_perturb=0
lcrm=0
tokenizer=0
use_raw_audio=0
use_specific_dict=0
specific_prefix=valid
specific_dir=${root_dir}/data/mustc/st
asr_vocab_prefix=spm_unigram10000_st_share
st_vocab_prefix=spm_unigram10000_st_share
org_data_dir=${root_dir}/data/${dataset}
data_dir=${root_dir}/data/${dataset}/st
train_split=train
valid_split=dev
test_split=tst-COMMON
test_subset=tst-COMMON
# exp
exp_prefix=$(date "+%m%d")
extra_tag=
extra_parameter=
exp_tag=baseline
exp_name=
# config
train_config=base,ctc
# training setting
fp16=1
max_tokens=40000
step_valid=0
bleu_valid=0
# decoding setting
sacrebleu=1
dec_model=checkpoint_best.pt
n_average=10
beam_size=5
len_penalty=1.0
if [[ ${share_dict} -eq 1 ]]; then
data_config=config_share.yaml
else
data_config=config.yaml
fi
if [[ ${speed_perturb} -eq 1 ]]; then
data_dir=${data_dir}_sp
exp_prefix=${exp_prefix}_sp
fi
if [[ ${lcrm} -eq 1 ]]; then
data_dir=${data_dir}_lcrm
exp_prefix=${exp_prefix}_lcrm
fi
if [[ ${use_specific_dict} -eq 1 ]]; then
data_dir=${data_dir}_${specific_prefix}
exp_prefix=${exp_prefix}_${specific_prefix}
fi
if [[ ${tokenizer} -eq 1 ]]; then
data_dir=${data_dir}_tok
exp_prefix=${exp_prefix}_tok
fi
if [[ ${use_raw_audio} -eq 1 ]]; then
data_dir=${data_dir}_raw
exp_prefix=${exp_prefix}_raw
fi
. ./local/parse_options.sh || exit 1;
if [[ -z ${exp_name} ]]; then
config_string=${train_config//,/_}
exp_name=${exp_prefix}_${config_string}_${exp_tag}
if [[ -n ${extra_tag} ]]; then
exp_name=${exp_name}_${extra_tag}
fi
fi
model_dir=${root_dir}/checkpoints/${dataset}/st/${exp_name}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "stage -1: Data Download"
# pass
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
### Task dependent. You have to prepare the data by yourself in the following stage.
### But you can utilize Kaldi recipes in most cases
echo "stage 0: ASR Data Preparation"
if [[ ! -e ${data_dir} ]]; then
mkdir -p ${data_dir}
fi
feature_zip=fbank80.zip
if [[ ${speed_perturb} -eq 1 ]]; then
feature_zip=fbank80_sp.zip
fi
if [[ ! -f ${data_dir}/${feature_zip} && -f ${data_dir}/../${feature_zip} ]]; then
ln -s ${data_dir}/../${feature_zip} ${data_dir}
fi
# create ASR vocabulary if necessary
cmd="python ${code_dir}/examples/speech_to_text/prep_audio_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}/asr4st
--task asr
--raw
--src-lang ${src_lang}
--splits ${valid_split},${test_split},${train_split}
--vocab-type ${vocab_type}
--vocab-size ${asr_vocab_size}"
[[ $eval -eq 1 && ${share_dict} -ne 1 && ${use_specific_dict} -ne 1 ]] && (echo -e "\033[34mRun command: \n${cmd} \033[0m" && eval $cmd)
asr_prefix=spm_${vocab_type}${asr_vocab_size}_asr
echo "stage 0: ST Data Preparation"
cmd="python ${code_dir}/examples/speech_to_text/prep_audio_data.py
--data-root ${org_data_dir}
--output-root ${data_dir}
--task st
--add-src
--src-lang ${src_lang}
--tgt-lang ${tgt_lang}
--splits ${valid_split},${test_split},${train_split}
--cmvn-type utterance
--vocab-type ${vocab_type}
--vocab-size ${vocab_size}"
if [[ ${use_raw_audio} -eq 1 ]]; then
cmd="$cmd
--raw"
fi
if [[ ${use_specific_dict} -eq 1 ]]; then
cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}
cp -r ${specific_dir}/${st_vocab_prefix}.* ${data_dir}
if [[ $share_dict -eq 1 ]]; then
cmd="$cmd
--share
--st-spm-prefix ${st_vocab_prefix}"
else
cmd="$cmd
--st-spm-prefix ${st_vocab_prefix}
--asr-prefix ${asr_vocab_prefix}"
fi
else
if [[ $share_dict -eq 1 ]]; then
cmd="$cmd
--share"
else
cmd="$cmd
--asr-prefix ${asr_prefix}"
fi
fi
if [[ ${speed_perturb} -eq 1 ]]; then
cmd="$cmd
--speed-perturb"
fi
if [[ ${lcrm} -eq 1 ]]; then
cmd="$cmd
--lowercase-src
--rm-punc-src"
fi
if [[ ${tokenizer} -eq 1 ]]; then
cmd="$cmd
--tokenizer"
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
if [[ -f ${data_dir}/${feature_zip} && ! -f ${data_dir}/../${feature_zip} ]]; then
mv ${data_dir}/${feature_zip} ${data_dir}/..
ln -s ${data_dir}/../${feature_zip} ${data_dir}
fi
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "stage 1: ST Network Training"
[[ ! -d ${data_dir} ]] && echo "The data dir ${data_dir} does not exist!" && exit 1;
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then
device=""
else
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
fi
echo -e "dev=${device} data=${data_dir} model=${model_dir}"
if [[ ! -d ${model_dir} ]]; then
mkdir -p ${model_dir}
else
echo "${model_dir} exists."
fi
cp ${BASH_SOURCE[0]} ${model_dir}
cp ${PWD}/train.sh ${model_dir}
extra_parameter="${extra_parameter}
--train-config ${pwd_dir}/conf/basis.yaml"
cp ${pwd_dir}/conf/basis.yaml ${model_dir}
config_list="${train_config//,/ }"
idx=1
for config in ${config_list[@]}
do
config_path=${pwd_dir}/conf/${config}.yaml
if [[ ! -f ${config_path} ]]; then
echo "No config file ${config_path}"
exit 1
fi
cp ${config_path} ${model_dir}
extra_parameter="${extra_parameter}
--train-config${idx} ${config_path}"
idx=$((idx + 1))
done
cmd="python3 -u ${code_dir}/fairseq_cli/train.py
${data_dir}
--config-yaml ${data_config}
--task ${task}
--max-tokens ${max_tokens}
--skip-invalid-size-inputs-valid-test
--update-freq ${update_freq}
--log-interval 100
--save-dir ${model_dir}
--tensorboard-logdir ${model_dir}"
if [[ -n ${extra_parameter} ]]; then
cmd="${cmd}
${extra_parameter}"
fi
if [[ ${gpu_num} -gt 0 ]]; then
cmd="${cmd}
--distributed-world-size $gpu_num
--ddp-backend no_c10d"
fi
if [[ $fp16 -eq 1 ]]; then
cmd="${cmd}
--fp16"
fi
if [[ $step_valid -eq 1 ]]; then
validate_interval=1
save_interval=1
keep_last_epochs=10
no_epoch_checkpoints=0
save_interval_updates=500
keep_interval_updates=10
else
validate_interval=1
keep_last_epochs=10
fi
if [[ $bleu_valid -eq 1 ]]; then
cmd="$cmd
--eval-bleu
--eval-bleu-args '{\"beam\": 1}'
--eval-tokenized-bleu
--eval-bleu-remove-bpe
--best-checkpoint-metric bleu
--maximize-best-checkpoint-metric"
fi
if [[ -n $no_epoch_checkpoints && $no_epoch_checkpoints -eq 1 ]]; then
cmd="$cmd
--no-epoch-checkpoints"
fi
if [[ -n $validate_interval ]]; then
cmd="${cmd}
--validate-interval $validate_interval "
fi
if [[ -n $save_interval ]]; then
cmd="${cmd}
--save-interval $save_interval "
fi
if [[ -n $keep_last_epochs ]]; then
cmd="${cmd}
--keep-last-epochs $keep_last_epochs "
fi
if [[ -n $save_interval_updates ]]; then
cmd="${cmd}
--save-interval-updates $save_interval_updates"
if [[ -n $keep_interval_updates ]]; then
cmd="${cmd}
--keep-interval-updates $keep_interval_updates"
fi
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
# save info
log=./history.log
echo "${time} | ${device} | ${data_dir} | ${exp_name} | ${model_dir} " >> $log
tail -n 50 ${log} > tmp.log
mv tmp.log $log
export CUDA_VISIBLE_DEVICES=${device}
cmd="nohup ${cmd} >> ${model_dir}/train.log 2>&1 &"
if [[ $eval -eq 1 ]]; then
eval $cmd
sleep 2s
tail -n "$(wc -l ${model_dir}/train.log | awk '{print $1+1}')" -f ${model_dir}/train.log
fi
fi
wait
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "stage 2: ST Decoding"
if [[ ${n_average} -ne 1 ]]; then
# Average models
dec_model=avg_${n_average}_checkpoint.pt
if [[ ! -f ${model_dir}/${dec_model} ]]; then
cmd="python ${code_dir}/scripts/average_checkpoints.py
--inputs ${model_dir}
--num-best-checkpoints ${n_average}
--output ${model_dir}/${dec_model}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval $cmd
fi
else
dec_model=${dec_model}
fi
if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
if [[ ${gpu_num} -eq 0 ]]; then
device=""
else
source ./local/utils.sh
device=$(get_devices $gpu_num 0)
fi
fi
export CUDA_VISIBLE_DEVICES=${device}
result_file=${model_dir}/decode_result
[[ -f ${result_file} ]] && rm ${result_file}
test_subset=${test_subset//,/ }
for subset in ${test_subset[@]}; do
subset=${subset}
cmd="python ${code_dir}/fairseq_cli/generate.py
${data_dir}
--config-yaml ${data_config}
--gen-subset ${subset}
--task speech_to_text
--path ${model_dir}/${dec_model}
--results-path ${model_dir}
--max-tokens ${max_tokens}
--beam ${beam_size}
--lenpen ${len_penalty}"
if [[ ${sacrebleu} -eq 1 ]]; then
cmd="${cmd}
--scoring sacrebleu"
if [[ ${tokenizer} -eq 1 ]]; then
cmd="${cmd}
--tokenizer moses
--moses-source-lang ${src_lang}
--moses-target-lang ${tgt_lang}"
fi
fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
if [[ $eval -eq 1 ]]; then
eval $cmd
tail -n 1 ${model_dir}/generate-${subset}.txt >> ${result_file}
fi
done
cat ${result_file}
fi
#! /bin/bash
# training the model
gpu_num=8
update_freq=1
max_tokens=40000
extra_tag=
extra_parameter=
#extra_tag="${extra_tag}"
#extra_parameter="${extra_parameter} "
exp_tag=
#config_list=(base)
config_list=(ctc)
#config_list=(sate_ctc)
#config_list=(ctc conformer rpr)
#config_list=(base sate)
#config_list=(pds_base)
#config_list=(pds_base conformer)
# exp full name
exp_name=
train_config=$(echo ${config_list[*]} | sed 's/ /,/g')
cmd="./run.sh
--stage 1
--stop_stage 1
--gpu_num ${gpu_num}
--update_freq ${update_freq}
--train_config ${train_config}
--max_tokens ${max_tokens}
"
if [[ -n ${exp_name} ]]; then
cmd="$cmd --exp_name ${exp_name}"
fi
if [[ -n ${exp_tag} ]]; then
cmd="$cmd --exp_tag ${exp_tag}"
fi
if [[ -n ${extra_tag} ]]; then
cmd="$cmd --extra_tag ${extra_tag}"
fi
if [[ -n ${extra_parameter} ]]; then
cmd="$cmd --extra_parameter \"${extra_parameter}\""
fi
echo ${cmd}
eval ${cmd}
@@ -5,6 +5,7 @@ clip-norm: 10.0
 lr-scheduler: inverse_sqrt
 warmup-init-lr: 1e-7
 warmup-updates: 10000
+weight-decay: 1e-6
 lr: 2e-3
 adam_betas: (0.9,0.98)
@@ -13,12 +14,21 @@ label_smoothing: 0.1
 subsampling-type: conv1d
 subsmapling-layers: 2
-subsampling-filter: 512
+subsampling-filter: 1024
 subsampling-kernel: 5
 subsampling-stride: 2
 subsampling-norm: none
 subsampling-activation: glu
+
+ctc-weight: 0.2
+intermedia-ctc-layers: 6,9
+intermedia-adapter: league
+intermedia-ctc-weight: 0.1
+intermedia-drop-prob: 0.5
+ctc-self-distill-weight: 0
+post-process: sentencepiece
+
 dropout: 0.1
 activation-fn: relu
 encoder-embed-dim: 256
@@ -33,8 +43,5 @@ decoder-attention-heads: 4
 attention-dropout: 0.1
 activation-dropout: 0.1

-macaron-style: True
-use-cnn-module: True
-cnn-module-kernel: 31
-encoder-activation-fn: swish
-encoder-attention-type: rel_pos_legacy
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
\ No newline at end of file
@@ -2,43 +2,61 @@ set -e
 eval=1
-lcrm=0
+lcrm=1
 tokenizer=0

-root_dir=~/st/Fairseq-S2T
-data_dir=~/st/data/test
-vocab_dir=~/st/data/mustc/st/en-de
+vocab_type=unigram
+vocab_size=5000
+use_raw_audio=0
+speed_perturb=0
+dataset=mustc
+root_dir=~/st
+code_dir=${root_dir}/Fairseq-S2T
+org_data_dir=${root_dir}/data/${dataset}
+data_dir=${root_dir}/data/${dataset}/asr
+use_specific_dict=0
+specific_prefix=st
+specific_dir=${root_dir}/data/mustc/st
 asr_vocab_prefix=spm_unigram10000_st_share

 src_lang=en
-tgt_lang=de
+tgt_lang=zh
 subsets=(2019)

-cp -r ${vocab_dir}/${asr_vocab_prefix}.* ${data_dir}/${src_lang}-${tgt_lang}
-rm -rf ${data_dir}/${src_lang}-${tgt_lang}/fbank80.zip
-
-splits=$(echo ${subsets[*]} | sed 's/ /,/g')
-cmd="python ${root_dir}/examples/speech_to_text/prep_st_data.py
-    --data-root ${data_dir}
+splits=$(echo ${subsets[*]} | sed 's/ /_/g')
+cmd="python ${code_dir}/examples/speech_to_text/prep_audio_data.py
+    --data-root ${org_data_dir}
     --output-root ${data_dir}
-    --splits ${splits}
     --task asr
     --src-lang ${src_lang}
-    --tgt-lang ${tgt_lang}
-    --add-src
-    --share
-    --asr-prefix ${asr_vocab_prefix}
-    --cmvn-type utterance"
+    --splits ${splits}
+    --vocab-type ${vocab_type}
+    --vocab-size ${vocab_size}"
+
+if [[ ${use_raw_audio} -eq 1 ]]; then
+    cmd="$cmd
+    --raw"
+fi
+if [[ ${use_specific_dict} -eq 1 ]]; then
+    cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}
+    cmd="$cmd
+    --asr-prefix ${asr_vocab_prefix}"
+fi
+if [[ ${speed_perturb} -eq 1 ]]; then
+    cmd="$cmd
+    --speed-perturb"
+fi
 if [[ ${lcrm} -eq 1 ]]; then
     cmd="$cmd
     --lowercase-src
     --rm-punc-src"
 fi
 if [[ ${tokenizer} -eq 1 ]]; then
     cmd="$cmd
     --tokenizer"
 fi

 echo -e "\033[34mRun command: \n${cmd} \033[0m"
 [[ $eval -eq 1 ]] && eval ${cmd}
@@ -5,44 +5,59 @@ eval=1
 lcrm=1
 tokenizer=0

-root_dir=~/st/Fairseq-S2T
-data_dir=/home/xuchen/st/data/test
-vocab_dir=/home/xuchen/st/data/mustc/st_lcrm/en-de
+vocab_type=unigram
+vocab_size=5000
+use_raw_audio=0
+speed_perturb=0
+dataset=mustc
+root_dir=~/st
+code_dir=${root_dir}/Fairseq-S2T
+org_data_dir=${root_dir}/data/${dataset}
+data_dir=${root_dir}/data/${dataset}/st
+use_specific_dict=0
+specific_prefix=st
+specific_dir=${root_dir}/data/mustc/st
 asr_vocab_prefix=spm_unigram10000_st_share
-st_vocab_prefix=spm_unigram10000_st_share

 src_lang=en
-tgt_lang=de
-splits=(2019)
-splits=$(echo ${splits[*]} | sed 's/ /_/g')
+tgt_lang=zh
+subsets=(2019)
+splits=$(echo ${subsets[*]} | sed 's/ /_/g')

-cp -r ${vocab_dir}/${asr_vocab_prefix}.* ${data_dir}/${src_lang}-${tgt_lang}
-cp -r ${vocab_dir}/${st_vocab_prefix}.* ${data_dir}/${src_lang}-${tgt_lang}
-rm -rf ${data_dir}/${src_lang}-${tgt_lang}/fbank80.zip
-
-cmd="python ${root_dir}/examples/speech_to_text/prep_st_data.py
-    --data-root ${data_dir}
+cmd="python ${code_dir}/examples/speech_to_text/prep_audio_data.py
+    --data-root ${org_data_dir}
     --output-root ${data_dir}
-    --splits ${splits}
-    --task st
+    --task asr
     --src-lang ${src_lang}
     --tgt-lang ${tgt_lang}
-    --add-src
-    --share
-    --asr-prefix ${asr_vocab_prefix}
-    --st-spm-prefix ${st_vocab_prefix}
-    --cmvn-type utterance"
+    --splits ${splits}
+    --vocab-type ${vocab_type}
+    --vocab-size ${vocab_size}"
+
+if [[ ${use_raw_audio} -eq 1 ]]; then
+    cmd="$cmd
+    --raw"
+fi
+if [[ ${use_specific_dict} -eq 1 ]]; then
+    cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}
+    cmd="$cmd
+    --asr-prefix ${asr_vocab_prefix}"
+fi
+if [[ ${speed_perturb} -eq 1 ]]; then
+    cmd="$cmd
+    --speed-perturb"
+fi
 if [[ ${lcrm} -eq 1 ]]; then
     cmd="$cmd
     --lowercase-src
     --rm-punc-src"
 fi
 if [[ ${tokenizer} -eq 1 ]]; then
     cmd="$cmd
     --tokenizer"
 fi

 echo -e "\033[34mRun command: \n${cmd} \033[0m"
 [[ $eval -eq 1 ]] && eval ${cmd}
@@ -129,11 +129,12 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     if [[ ! -e ${data_dir} ]]; then
         mkdir -p ${data_dir}
     fi
-    if [[ ! -f ${data_dir}/fbank80.zip && -f ${data_dir}/../fbank80.zip ]]; then
-        ln -s ${data_dir}/../fbank80.zip ${data_dir}
+    feature_zip=fbank80.zip
+    if [[ ${speed_perturb} -eq 1 ]]; then
+        feature_zip=fbank80_sp.zip
     fi
-    if [[ ! -f ${data_dir}/fbank80_sp.zip && -f ${data_dir}/../fbank80_sp.zip ]]; then
-        ln -s ${data_dir}/../fbank80_sp.zip ${data_dir}
+    if [[ ! -f ${data_dir}/${feature_zip} && -f ${data_dir}/../${feature_zip} ]]; then
+        ln -s ${data_dir}/../${feature_zip} ${data_dir}
     fi

     # create ASR vocabulary if necessary
@@ -204,13 +205,9 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     echo -e "\033[34mRun command: \n${cmd} \033[0m"
     [[ $eval -eq 1 ]] && eval ${cmd}

-    if [[ ! -f ${data_dir}/../fbank80.zip ]]; then
-        mv ${data_dir}/fbank80.zip ${data_dir}/..
-        ln -s ${data_dir}/../fbank80.zip ${data_dir}
-    fi
-    if [[ ! -f ${data_dir}/../fbank80_sp.zip ]]; then
-        mv ${data_dir}/fbank80_sp.zip ${data_dir}/..
-        ln -s ${data_dir}/../fbank80_sp.zip ${data_dir}
+    if [[ -f ${data_dir}/${feature_zip} && ! -f ${data_dir}/../${feature_zip} ]]; then
+        mv ${data_dir}/${feature_zip} ${data_dir}/..
+        ln -s ${data_dir}/../${feature_zip} ${data_dir}
     fi
 fi
@@ -26,7 +26,8 @@ PAD_TOKEN, PAD_TOKEN_ID = "<pad>", 1
 def gen_vocab(
     input_path: Path, output_path_prefix: Path, model_type="bpe",
-    vocab_size=1000, special_symbols: Optional[List[str]] = None
+    vocab_size=1000, special_symbols: Optional[List[str]] = None,
+    normalization_rule_name=None
 ):
     # Train SentencePiece Model
     arguments = [
@@ -43,6 +44,8 @@ def gen_vocab(
         f"--eos_id={EOS_TOKEN_ID}",
         f"--pad_id={PAD_TOKEN_ID}",
     ]
+    if normalization_rule_name is not None:
+        arguments.append(f"--normalization_rule_name={normalization_rule_name}")
     if special_symbols is not None:
         _special_symbols = ",".join(special_symbols)
         arguments.append(f"--user_defined_symbols={_special_symbols}")
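The new normalization_rule_name argument is forwarded to the SentencePiece trainer; passing "identity" (as done for Chinese targets below) disables the default NFKC-style text normalization. A minimal standalone sketch of the effect, with corpus.txt as a placeholder input file:

import sentencepiece as spm

# "identity" leaves the training text untouched; the default rule (nmt_nfkc)
# would normalize full-width punctuation and similar characters.
spm.SentencePieceTrainer.Train(
    "--input=corpus.txt --model_prefix=spm_unigram_zh "
    "--vocab_size=5000 --model_type=unigram "
    "--normalization_rule_name=identity"
)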
@@ -68,10 +68,6 @@ class AudioDataset(Dataset):
             if 0 < self.size < total_length:
                 segments = segments[:self.size]
-            # for idx, seg in enumerate(content):
-            #     segments[idx] = seg
-            #     if 0 < self.size < idx:
-            #         break
         else:
             self.mode = "easy"
@@ -141,7 +137,7 @@ class AudioDataset(Dataset):
             for i, segment in enumerate(seg_group):
                 offset = int(float(segment["offset"]) * sample_rate)
                 n_frames = int(float(segment["duration"]) * sample_rate)
-                _id = f"{wav_path.stem}_{i}"
+                _id = f"{split}_{wav_path.stem}_{i}"

                 item = dict()
                 item["audio"] = wav_path.as_posix()
@@ -240,7 +236,7 @@ def process(args):
         if not Path.exists(zip_path) or args.overwrite:
             gen_feature_flag = True

-    if gen_feature_flag:
+    if True and gen_feature_flag:
         if args.speed_perturb:
             feature_root = output_root / "fbank80_sp"
         else:
@@ -265,7 +261,7 @@ def process(args):
         for idx in tqdm(range(len(dataset))):
             item = dataset[idx]
-            utt_id = item["id"]
+            utt_id = item['id']

             features_path = (feature_root / f"{utt_id}.npy").as_posix()
             if os.path.exists(features_path):
@@ -291,7 +287,7 @@ def process(args):
         create_zip(feature_root, zip_path)

         # Clean up
-        shutil.rmtree(feature_root)
+        # shutil.rmtree(feature_root)

         gen_manifest_flag = False
         for split in splits:
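The id change above matters because features are written to one directory keyed by utterance id: without the split prefix, two splits containing a wav file with the same stem would silently overwrite each other's .npy features. A toy illustration (names are made up):

for split in ("train", "dev"):
    stem, i = "talk_0001", 0
    print(f"{stem}_{i}")          # old: "talk_0001_0" for both splits (collision)
    print(f"{split}_{stem}_{i}")  # new: "train_talk_0001_0" / "dev_talk_0001_0"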
@@ -115,6 +115,7 @@ def process(args):
             output_root / tgt_spm_filename_prefix,
             args.vocab_type,
             args.vocab_size,
+            normalization_rule_name="identity" if tgt_lang == "zh" else None
         )

     if not args.share:
@@ -126,6 +127,7 @@ def process(args):
                 output_root / src_spm_filename_prefix,
                 args.vocab_type,
                 args.vocab_size,
+                normalization_rule_name="identity" if tgt_lang == "zh" else None
             )

     # Generate config YAML
@@ -235,8 +235,8 @@ class CtcCriterion(FairseqCriterion):
                 ctc_self_distill_num += 1

                 loss = F.kl_div(
-                    F.log_softmax(inter_ctc_logit, dim=-1),
-                    F.softmax(ctc_logit, dim=-1),
+                    F.log_softmax(inter_ctc_logit, dim=-1, dtype=torch.float32),
+                    F.softmax(ctc_logit, dim=-1, dtype=torch.float32),
                     reduction="none",
                 )
                 loss = loss.sum(-1).transpose(0, 1).masked_fill_(~non_padding_mask, 0.0)
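This hunk is the heart of the commit message: under --fp16 training, a softmax computed entirely in half precision can round small probabilities to exactly zero, after which log/KL terms become -inf or NaN. Computing just the (log-)softmax in float32 avoids that. A self-contained illustration, with made-up logit values:

import torch
import torch.nn.functional as F

logits = torch.tensor([12.0, -4.0, -9.5], dtype=torch.float16)

p_fp16 = F.softmax(logits, dim=-1)                       # smallest prob underflows to 0 in fp16
p_fp32 = F.softmax(logits, dim=-1, dtype=torch.float32)  # stays strictly positive
print(p_fp16[-1].item(), p_fp32[-1].item())

# The float32 pattern keeps the KL divergence finite:
kl = F.kl_div(F.log_softmax(logits, dim=-1, dtype=torch.float32),
              F.softmax(logits, dim=-1, dtype=torch.float32),
              reduction="sum")
print(kl.item())  # ~0.0, since both distributions are identical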
@@ -360,6 +360,7 @@ class SpeechToTextDataset(FairseqDataset):
             target = torch.cat((torch.LongTensor([lang_tag_idx]), target), 0)

         transcript = None
+
         if self.src_dict is not None and self.src_texts is not None and self.src_bpe_tokenizer is not None:
             tokenized = self.tokenize_text(self.src_texts[index], True)
             transcript = self.src_dict.encode_line(
@@ -443,6 +444,7 @@ class SpeechToTextDataset(FairseqDataset):
             "ntokens": ntokens,
             "nsentences": len(samples),
         }
+
         return out

     def num_tokens(self, index):
@@ -49,14 +49,16 @@ class CTCCompressStrategy:
             for t_idx, same in enumerate(pred):
                 new_processed_inputs_cnt = processed_inputs_cnt + same[1]
                 # Get the probabilities of the prediction for the different time steps as weight
-                weights = F.softmax(prob_ctc[b_idx, processed_inputs_cnt:new_processed_inputs_cnt, same[0]])
+                weights = F.softmax(
+                    prob_ctc[b_idx, processed_inputs_cnt:new_processed_inputs_cnt, same[0]], dtype=torch.float32
+                )
                 weights_matrix[b_idx, processed_inputs_cnt:new_processed_inputs_cnt, t_idx] = \
                     weights / weights.sum()
                 processed_inputs_cnt = new_processed_inputs_cnt
         return weights_matrix


-class InterAdapter(nn.Module):
+class Adapter(nn.Module):

     def __init__(self, dim, adapter_type, dictionary, embed_tokens=None, strategy=None):
         super().__init__()
@@ -84,6 +86,7 @@ class InterAdapter(nn.Module):
             self.gate_linear2 = nn.Linear(dim, dim)

         if self.adapter_type == "shrink":
+            assert strategy is not None
             self.ctc_compress = getattr(CTCCompressStrategy, strategy)
             logger.info("CTC Compress Strategy: %s" % strategy)
         elif self.adapter_type == "league":
@@ -94,7 +97,7 @@ class InterAdapter(nn.Module):
     def forward(self, x, padding):

         representation, distribution = x
-        dim1, dim2, dim = representation.size()
+        seq_len, bsz, dim = representation.size()
         org_distribution = distribution
         lengths = (~padding).long().sum(-1)
@@ -103,7 +106,9 @@ class InterAdapter(nn.Module):
         elif self.adapter_type == "context":
             distribution = distribution.view(-1, distribution.size(-1))
-            out = torch.mm(distribution, self.embed_adapter.weight).view(dim1, dim2, -1)
+            out = torch.mm(
+                distribution, self.embed_adapter.weight.float()
+            ).view(seq_len, bsz, -1).type_as(representation)

         elif self.adapter_type == "league":
             linear_out = self.linear_adapter(representation)
@@ -112,19 +117,25 @@ class InterAdapter(nn.Module):
                 threshold = distribution.sort(dim=-1, descending=True)[0][:, :, cutoff:cutoff+1]
                 distribution = torch.where(distribution > threshold, distribution, torch.zeros_like(distribution))
             distribution = distribution.view(-1, distribution.size(-1))
-            soft_out = torch.mm(distribution, self.embed_adapter.weight).view(dim1, dim2, -1)
+            soft_out = torch.mm(
+                distribution, self.embed_adapter.weight.float()
+            ).view(seq_len, bsz, -1).type_as(representation)
             out = linear_out + soft_out

         elif self.adapter_type == "gated_league":
             linear_out = self.linear_adapter(representation)
             distribution = distribution.view(-1, distribution.size(-1))
-            soft_out = torch.mm(distribution, self.embed_adapter.weight).view(dim1, dim2, -1)
+            soft_out = torch.mm(
+                distribution, self.embed_adapter.weight.float()
+            ).view(seq_len, bsz, -1).type_as(representation)
             coef = (self.gate_linear(torch.cat([linear_out, soft_out], dim=-1))).sigmoid()
             out = coef * linear_out + (1 - coef) * soft_out

         elif self.adapter_type == "inter_league":
             distribution = distribution.view(-1, distribution.size(-1))
-            soft_out = torch.mm(distribution, self.embed_adapter.weight).view(dim1, dim2, -1)
+            soft_out = torch.mm(
+                distribution, self.embed_adapter.weight.float()
+            ).view(seq_len, bsz, -1).type_as(representation)
             out = representation + soft_out

         elif self.adapter_type == "none":
@@ -142,10 +153,11 @@ class InterAdapter(nn.Module):
                 new_lengths = [len(p) for p in batch_predicted]
                 weights_matrix = self.ctc_compress(prob_ctc, batch_predicted, new_lengths,
-                                                   representation.dtype, representation.device)
+                                                   prob_ctc.dtype, prob_ctc.device)

             # x is T x B x C -> B x C x T; weights_matrix is B x T x T'
-            compressed_output = representation.permute(1, 2, 0).bmm(weights_matrix)  # B x C x T'
+            representation = representation.permute(1, 2, 0)
+            compressed_output = representation.float().bmm(weights_matrix).type_as(representation)  # B x C x T'
             out = compressed_output.permute(2, 0, 1)

             out_lengths = lengths.new(new_lengths)
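For readers new to the "shrink" adapter above: it collapses runs of consecutive frames that share the same CTC argmax into a single position. A minimal sketch of that idea, using a uniform average per run instead of the probability-weighted matrix so the shapes are easy to follow; nothing here is the repo's API:

from itertools import groupby
import torch

def ctc_shrink(states, probs):
    """states: T x C encoder output; probs: T x V CTC distribution."""
    preds = probs.argmax(-1).tolist()
    runs = [(tok, len(list(grp))) for tok, grp in groupby(preds)]  # merged segments
    out, start = [], 0
    for tok, length in runs:
        out.append(states[start:start + length].mean(0))  # average each run
        start += length
    return torch.stack(out)  # T' x C with T' <= T

states = torch.randn(10, 4)
probs = torch.softmax(torch.randn(10, 6), dim=-1)
print(ctc_shrink(states, probs).shape)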
@@ -38,10 +38,10 @@ class CTC(nn.Module):
         return x

     def softmax(self, x, temperature=1.0):
-        return F.softmax(self.ctc_projection(x) / temperature, dim=-1)
+        return F.softmax(self.ctc_projection(x) / temperature, dim=-1, dtype=torch.float32)

     def log_softmax(self, x, temperature=1.0):
-        return F.log_softmax(self.ctc_projection(x) / temperature, dim=-1)
+        return F.log_softmax(self.ctc_projection(x) / temperature, dim=-1, dtype=torch.float32)

     def argmax(self, x):
         return torch.argmax(self.ctc_projection(x), dim=-1)
@@ -13,7 +13,7 @@ from fairseq.models import (
     register_model_architecture,
 )
 from fairseq.models.speech_to_text import S2TTransformerModel
-from fairseq.models.speech_to_text.modules import CTC, InterAdapter
+from fairseq.models.speech_to_text.modules import CTC, Adapter

 from fairseq.modules import (
     FairseqDropout,
@@ -673,14 +673,14 @@ class PDSS2TTransformerEncoder(FairseqEncoder):
                     strategy = None
                     if args.intermedia_adapter == "shrink":
                         strategy = getattr(args, "ctc_compress_strategy", "avg")
-                    adapter = InterAdapter(embed_dim, args.intermedia_adapter,
-                                           task.source_dictionary, strategy=strategy)
+                    adapter = Adapter(embed_dim, args.intermedia_adapter,
+                                      task.source_dictionary, strategy=strategy)
                     inter_adapter = adapter
                 else:
                     adapter = inter_adapter
             else:
-                adapter = InterAdapter(embed_dim, "none",
-                                       task.source_dictionary)
+                adapter = Adapter(embed_dim, "none",
+                                  task.source_dictionary)
         else:
             ctc = None
             adapter = None
@@ -830,7 +830,7 @@ class PDSS2TTransformerEncoder(FairseqEncoder):
                 logit = ctc(x.clone())
                 intermedia_ctc_logits.append([logit, encoder_padding_mask])
-                prob = F.softmax(logit, dim=-1)
+                prob = utils.softmax(logit, dim=-1)
                 x, encoder_padding_mask = adapter([x, prob], encoder_padding_mask)

         if self.fusion_stages_num != 0:
@@ -14,7 +14,7 @@ from fairseq.models import (
     register_model,
     register_model_architecture,
 )
-from fairseq.models.speech_to_text.modules import InterAdapter, CTC
+from fairseq.models.speech_to_text.modules import Adapter, CTC
 from fairseq.modules import (
     FairseqDropout,
     LayerNorm,
@@ -382,6 +382,12 @@ class S2TCTCModel(FairseqEncoderModel):
             type=int,
             help="cutoff of the distribution",
         )
+        parser.add_argument(
+            "--intermedia-drop-prob",
+            default=0,
+            type=float,
+            help="probability of dropping the followed layers",
+        )
         pass

     @classmethod
@@ -424,9 +430,9 @@ class S2TCTCModel(FairseqEncoderModel):
         else:
             logits = net_output["ctc_logit"][0]
         if log_probs:
-            return utils.log_softmax(logits.float(), dim=-1)
+            return utils.log_softmax(logits, dim=-1)
         else:
-            return utils.softmax(logits.float(), dim=-1)
+            return utils.softmax(logits, dim=-1)

     def forward(self, src_tokens, src_lengths, prev_output_tokens=None):
         """
@@ -513,11 +519,12 @@ class S2TCTCEncoder(FairseqEncoder):
             strategy = None
             if args.intermedia_adapter == "shrink":
-                strategy = getattr(args, "ctc_compress_strategy", None)
+                strategy = getattr(args, "ctc_compress_strategy", "avg")
             elif args.intermedia_adapter == "league":
                 strategy = getattr(args, "intermedia_distribution_cutoff", -1)
-            self.adapter = InterAdapter(dim, args.intermedia_adapter,
-                                        task.source_dictionary, strategy=strategy)
+            self.adapter = Adapter(dim, args.intermedia_adapter,
+                                   task.source_dictionary, strategy=strategy)
+            self.intermedia_drop_prob = getattr(args, "intermedia_drop_prob", 0)

     def add_to_dict(self, x, dis, idx):
         sim = 0
@@ -595,11 +602,16 @@ class S2TCTCEncoder(FairseqEncoder):
             # interleave CTC
             if layer_idx in self.intermedia_ctc_layers:
+                if self.intermedia_drop_prob > 0:
+                    p = torch.rand(1).uniform_()
+                    if p < self.intermedia_drop_prob:
+                        break
+
                 norm_x = self.layer_norm(x)
                 logit = self.ctc(norm_x)
                 intermedia_ctc_logits.append(logit)

-                prob = F.softmax(logit, dim=-1)
+                prob = F.softmax(logit, dim=-1, dtype=torch.float32)
                 x, encoder_padding_mask = self.adapter([x, prob], encoder_padding_mask)

         # gather cosine similarity
@@ -702,7 +714,7 @@ class CTCDecoder(object):
             model_path=self.lm_model,
             alpha=self.lm_weight,
             beta=0,
-            cutoff_top_n=self.vocab_size,
+            cutoff_top_n=40,
             cutoff_prob=1.0,
             beam_width=self.beam_size,
             num_processes=20,
@@ -725,7 +737,9 @@ class CTCDecoder(object):
                                                src_lengths=src_lengths)

         ctc_logit = encoder_outs["ctc_logit"][0].transpose(0, 1)
-        beam_results, beam_scores, timesteps, out_lens = self.ctc_decoder.decode(F.softmax(ctc_logit, -1), src_lengths)
+        beam_results, beam_scores, time_steps, out_lens = self.ctc_decoder.decode(
+            utils.softmax(ctc_logit, -1), src_lengths
+        )

         finalized = []
         for idx in range(bsz):
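The new --intermedia-drop-prob option gives the interleaved CTC a LayerDrop-like twist: with the given probability the encoder stops at an intermediate CTC layer during a training forward pass, so the upper layers are occasionally skipped. A toy version of that control flow, with stand-in modules rather than the repo's classes:

import torch
import torch.nn as nn

class ToyEncoder(nn.Module):
    def __init__(self, dim=8, num_layers=4, ctc_layers=(2,), drop_prob=0.5):
        super().__init__()
        self.layers = nn.ModuleList(nn.Linear(dim, dim) for _ in range(num_layers))
        self.ctc_layers = set(ctc_layers)
        self.drop_prob = drop_prob
        self.ctc_proj = nn.Linear(dim, 11)  # toy vocabulary size

    def forward(self, x):
        inter_logits = []
        for idx, layer in enumerate(self.layers, start=1):
            x = layer(x).relu()
            if idx in self.ctc_layers:
                if self.training and torch.rand(1).item() < self.drop_prob:
                    break  # randomly truncate the stack, as in the encoder above
                inter_logits.append(self.ctc_proj(x))
        return x, inter_logits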
@@ -18,7 +18,7 @@ from fairseq.models.speech_to_text import (
     PDSS2TTransformerModel,
     PDSS2TTransformerEncoder,
 )
-from fairseq.models.speech_to_text.modules import CTCCompressStrategy
+from fairseq.models.speech_to_text.modules import CTCCompressStrategy, Adapter
 from fairseq.modules import (
     FairseqDropout,
     LayerNorm,
@@ -140,95 +140,103 @@ class S2TSATEModel(S2TTransformerModel):
         return encoder


-class Adapter(nn.Module):
-    def __init__(self, args, dictionary, embed_tokens):
-        super().__init__()
-
-        embed_dim = args.encoder_embed_dim
-
-        self.adapter_type = args.adapter
-        if self.adapter_type in ["linear", "league", "gated_league", "gated_league2"]:
-            self.linear_adapter = nn.Sequential(
-                nn.Linear(embed_dim, embed_dim),
-                LayerNorm(args.encoder_embed_dim),
-                nn.ReLU(),
-            )
-        elif self.adapter_type == "linear2":
-            self.linear_adapter = nn.Sequential(
-                nn.Linear(embed_dim, embed_dim),
-            )
-
-        if self.adapter_type in ["embed", "context", "league", "gated_league", "gated_league2"]:
-            if embed_tokens is None:
-                num_embeddings = len(dictionary)
-                self.embed_adapter = Embedding(num_embeddings, embed_dim, dictionary.pad())
-            else:
-                self.embed_adapter = embed_tokens
-
-        if self.adapter_type == "gated_league":
-            self.gate_linear = nn.Linear(2 * embed_dim, embed_dim)
-        elif self.adapter_type == "gated_league2":
-            self.gate_linear1 = nn.Linear(embed_dim, embed_dim)
-            self.gate_linear2 = nn.Linear(embed_dim, embed_dim)
-
-        if self.adapter_type == "shrink":
-            self.ctc_compress_method = getattr(CTCCompressStrategy, args.ctc_compress_strategy)
-
-    def forward(self, x, padding):
-
-        representation, distribution = x
-        batch, seq_len, embed_dim = representation.size()
-        org_distribution = distribution
-        if distribution is not None:
-            distribution = distribution.view(-1, distribution.size(-1))
-        lengths = (~padding).long().sum(-1)
-
-        if self.adapter_type == "linear":
-            out = self.linear_adapter(representation)
-
-        elif self.adapter_type == "context":
-            out = torch.mm(distribution, self.embed_adapter.weight).view(batch, seq_len, -1)
-
-        elif self.adapter_type == "league":
-            linear_out = self.linear_adapter(representation)
-            soft_out = torch.mm(distribution, self.embed_adapter.weight).view(batch, seq_len, -1)
-            out = linear_out + soft_out
-
-        elif self.adapter_type == "gated_league":
-            linear_out = self.linear_adapter(representation)
-            soft_out = torch.mm(distribution, self.embed_adapter.weight).view(batch, seq_len, -1)
-            coef = (self.gate_linear(torch.cat([linear_out, soft_out], dim=-1))).sigmoid()
-            out = coef * linear_out + (1 - coef) * soft_out
-
-        elif self.adapter_type == "none":
-            out = representation
-
-        elif self.adapter_type == "shrink":
-            from itertools import groupby
-
-            with torch.no_grad():
-                batch_predicted = []
-                prob_ctc = org_distribution.transpose(0, 1)  # T x B x D -> B x T x D
-                for b in range(prob_ctc.shape[0]):
-                    predicted = prob_ctc[b][: lengths[b]].argmax(-1).tolist()
-                    batch_predicted.append([(p[0], len(list(p[1]))) for p in groupby(predicted)])
-
-                new_lengths = [len(p) for p in batch_predicted]
-                weights_matrix = self.ctc_compress_method(prob_ctc, batch_predicted, new_lengths,
-                                                          representation.dtype, representation.device)
-
-            # x is T x B x C -> B x C x T; weights_matrix is B x T x T'
-            compressed_output = representation.permute(1, 2, 0).bmm(weights_matrix)  # B x C x T'
-            out = compressed_output.permute(2, 0, 1)
-
-            out_lengths = lengths.new(new_lengths)
-            padding = lengths_to_padding_mask(out_lengths)
-
-        else:
-            out = None
-            logging.error("Unsupported adapter type: {}.".format(self.adapter_type))
-
-        return out, padding
+# class Adapter(nn.Module):
+#     def __init__(self, args, dictionary, embed_tokens):
+#         super().__init__()
+#
+#         embed_dim = args.encoder_embed_dim
+#
+#         self.adapter_type = args.adapter
+#         if self.adapter_type in ["linear", "league", "gated_league", "gated_league2"]:
+#             self.linear_adapter = nn.Sequential(
+#                 nn.Linear(embed_dim, embed_dim),
+#                 LayerNorm(args.encoder_embed_dim),
+#                 nn.ReLU(),
+#             )
+#         elif self.adapter_type == "linear2":
+#             self.linear_adapter = nn.Sequential(
+#                 nn.Linear(embed_dim, embed_dim),
+#             )
+#
+#         if self.adapter_type in ["embed", "context", "league", "gated_league", "gated_league2"]:
+#             if embed_tokens is None:
+#                 num_embeddings = len(dictionary)
+#                 self.embed_adapter = Embedding(num_embeddings, embed_dim, dictionary.pad())
+#             else:
+#                 self.embed_adapter = embed_tokens
+#
+#         if self.adapter_type == "gated_league":
+#             self.gate_linear = nn.Linear(2 * embed_dim, embed_dim)
+#         elif self.adapter_type == "gated_league2":
+#             self.gate_linear1 = nn.Linear(embed_dim, embed_dim)
+#             self.gate_linear2 = nn.Linear(embed_dim, embed_dim)
+#
+#         if self.adapter_type == "shrink":
+#             self.ctc_compress_method = getattr(CTCCompressStrategy, args.ctc_compress_strategy)
+#
+#     def forward(self, x, padding):
+#
+#         representation, distribution = x
+#         batch, seq_len, embed_dim = representation.size()
+#         org_distribution = distribution
+#         if distribution is not None:
+#             distribution = distribution.view(-1, distribution.size(-1))
+#         lengths = (~padding).long().sum(-1)
+#
+#         if self.adapter_type == "linear":
+#             out = self.linear_adapter(representation)
+#
+#         elif self.adapter_type == "context":
+#             out = torch.mm(
+#                 distribution, self.embed_adapter.weight.float()
+#             ).view(batch, seq_len, -1).type_as(representation)
+#
+#         elif self.adapter_type == "league":
+#             linear_out = self.linear_adapter(representation)
+#             soft_out = torch.mm(
+#                 distribution, self.embed_adapter.weight.float()
+#             ).view(batch, seq_len, -1).type_as(linear_out)
+#             out = linear_out + soft_out
+#
+#         elif self.adapter_type == "gated_league":
+#             linear_out = self.linear_adapter(representation)
+#             soft_out = torch.mm(
+#                 distribution, self.embed_adapter.weight.float()
+#             ).view(batch, seq_len, -1).type_as(linear_out)
+#             coef = (self.gate_linear(torch.cat([linear_out, soft_out], dim=-1))).sigmoid()
+#             out = coef * linear_out + (1 - coef) * soft_out
+#
+#         elif self.adapter_type == "none":
+#             out = representation
+#
+#         elif self.adapter_type == "shrink":
+#             from itertools import groupby
+#
+#             with torch.no_grad():
+#                 batch_predicted = []
+#                 prob_ctc = org_distribution.transpose(0, 1)  # T x B x D -> B x T x D
+#                 for b in range(prob_ctc.shape[0]):
+#                     predicted = prob_ctc[b][: lengths[b]].argmax(-1).tolist()
+#                     batch_predicted.append([(p[0], len(list(p[1]))) for p in groupby(predicted)])
+#
+#                 new_lengths = [len(p) for p in batch_predicted]
+#                 weights_matrix = self.ctc_compress_method(prob_ctc, batch_predicted, new_lengths,
+#                                                           prob_ctc.dtype, prob_ctc.device)
+#
+#             # x is T x B x C -> B x C x T; weights_matrix is B x T x T'
+#             data_type = representation.dtype
+#             representation = representation.permute(1, 2, 0).float()
+#             compressed_output = representation.bmm(weights_matrix).type_as(data_type)  # B x C x T'
+#             out = compressed_output.permute(2, 0, 1)
+#
+#             out_lengths = lengths.new(new_lengths)
+#             padding = lengths_to_padding_mask(out_lengths)
+#
+#         else:
+#             out = None
+#             logging.error("Unsupported adapter type: {}.".format(self.adapter_type))
+#
+#         return out, padding


 class TextEncoder(FairseqEncoder):
@@ -301,7 +309,18 @@ class S2TSATEEncoder(FairseqEncoder):
         # adapter
         self.temperature = args.temperature

-        self.adapter = Adapter(args, task.source_dictionary, embed_tokens)
+        # self.adapter = Adapter(args, task.source_dictionary, embed_tokens)
+
+        strategy = None
+        if args.adapter == "shrink":
+            strategy = getattr(args, "ctc_compress_strategy", "avg")
+        elif args.adapter == "league":
+            strategy = getattr(args, "intermedia_distribution_cutoff", -1)
+
+        self.adapter = Adapter(args.encoder_embed_dim,
+                               args.adapter,
+                               task.source_dictionary,
+                               embed_tokens,
+                               strategy=strategy)

         if args.share_ctc_and_adapter and hasattr(self.adapter, "embed_adapter"):
             self.acoustic_encoder.ctc.ctc_projection.weight = self.adapter.embed_adapter.weight
@@ -332,7 +351,7 @@ class S2TSATEEncoder(FairseqEncoder):
         if "ctc_logit" in acoustic_encoder_out and len(acoustic_encoder_out["ctc_logit"]) > 0:
             ctc_logit = acoustic_encoder_out["ctc_logit"][0]
-            ctc_prob = F.softmax(ctc_logit / self.temperature, dim=-1)
+            ctc_prob = F.softmax(ctc_logit / self.temperature, dim=-1, dtype=torch.float32)
         else:
             ctc_logit = None
             ctc_prob = None
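The share_ctc_and_adapter branch kept above ties two modules to a single parameter: nn.Linear(dim, vocab) and nn.Embedding(vocab, dim) both store a vocab x dim weight, so assigning one Parameter to both keeps them synchronized during training. A minimal sketch of that tying, with made-up sizes:

import torch.nn as nn

vocab, dim = 100, 16
ctc_projection = nn.Linear(dim, vocab, bias=False)  # weight: vocab x dim
embed_adapter = nn.Embedding(vocab, dim)            # weight: vocab x dim

ctc_projection.weight = embed_adapter.weight  # one shared Parameter
assert ctc_projection.weight.data_ptr() == embed_adapter.weight.data_ptr()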
@@ -2,6 +2,7 @@ import logging
 import math
 from typing import Dict, List, Optional, Tuple

+import torch
 import torch.nn as nn
 import torch.nn.functional as F
@@ -13,7 +14,7 @@ from fairseq.models import (
     register_model,
     register_model_architecture,
 )
-from fairseq.models.speech_to_text.modules import InterAdapter, CTC
+from fairseq.models.speech_to_text.modules import Adapter, CTC
 from fairseq.models.transformer import Embedding, TransformerDecoder
 from fairseq.modules import (
     FairseqDropout,
@@ -388,6 +389,12 @@ class S2TTransformerModel(FairseqEncoderDecoderModel):
             type=int,
             help="cutoff of the distribution",
         )
+        parser.add_argument(
+            "--intermedia-drop-prob",
+            default=0,
+            type=float,
+            help="probability of dropping the followed layers",
+        )
         pass

     @classmethod
@@ -494,6 +501,7 @@ class S2TTransformerEncoder(FairseqEncoder):
         self.padding_idx = 1

         self.subsample = subsampling(args)
+        self.linear = nn.Linear(dim, dim)

         self.attn_type = getattr(args, "encoder_attention_type", "selfattn")
@@ -573,8 +581,9 @@ class S2TTransformerEncoder(FairseqEncoder):
                 strategy = getattr(args, "ctc_compress_strategy", None)
             elif args.intermedia_adapter == "league":
                 strategy = getattr(args, "intermedia_distribution_cutoff", -1)
-            self.adapter = InterAdapter(dim, args.intermedia_adapter,
-                                        task.source_dictionary, strategy=strategy)
+            self.adapter = Adapter(dim, args.intermedia_adapter,
+                                   task.source_dictionary, strategy=strategy)
+            self.intermedia_drop_prob = getattr(args, "intermedia_drop_prob", 0)

     @staticmethod
     def pooling_ratio():
@@ -631,6 +640,7 @@ class S2TTransformerEncoder(FairseqEncoder):
             x += positions
             positions = None

+        x = self.linear(x)
         x = self.dropout_module(x)

         # add emb into history
@@ -662,12 +672,17 @@ class S2TTransformerEncoder(FairseqEncoder):
             # interleave CTC
             if layer_idx in self.intermedia_ctc_layers:
+                if self.intermedia_drop_prob > 0:
+                    p = torch.rand(1).uniform_()
+                    if p < self.intermedia_drop_prob:
+                        break
+
                 norm_x = self.layer_norm(x)
                 logit = self.ctc(norm_x)
-                intermedia_ctc_logits.append(logit)
+                intermedia_ctc_logits.append(logit)

                 # prob = self.ctc.softmax(norm_x)
-                prob = F.softmax(logit, dim=-1)
+                prob = utils.softmax(logit, dim=-1)
                 x, encoder_padding_mask = self.adapter([x, prob], encoder_padding_mask)

         # gather cosine similarity
@@ -872,6 +887,7 @@ def base_architecture(args):
     # intermedia CTC
     args.intermedia_ctc_layers = getattr(args, "intermedia_ctc_layers", None)
     args.intermedia_adapter = getattr(args, "intermedia_adapter", None)
+    args.intermedia_drop_prob = getattr(args, "intermedia_drop_prob", 0)

 @register_model_architecture("s2t_transformer", "s2t_transformer_s")
@@ -478,6 +478,7 @@ class TransformerEncoder(FairseqEncoder):
         self.embed_scale = 1.0 if args.no_scale_embedding else math.sqrt(embed_dim)

+        self.attn_type = getattr(args, "encoder_attention_type", "selfattn")
         if self.attn_type == "rel_pos":
             self.embed_positions = RelPositionalEncoding(
                 args.max_source_positions, args.encoder_embed_dim
@@ -121,7 +121,7 @@ class DynamicLinearCombination(nn.Module):
             self.weight_mask = self.weight_mask.cuda()
         if self.normalize_learned_weight:
             weight = self.weight.masked_fill((self.weight_mask == 0).unsqueeze(2), float('-inf'))
-            self.normalized_weight = F.softmax(weight, dim=1)
+            self.normalized_weight = F.softmax(weight, dim=1, dtype=torch.float32)
         return

     # following layer
@@ -9,6 +9,7 @@
 import math
 import torch
 from torch import nn
+import torch.nn.functional as F
 from fairseq.modules.rotary_positional_embedding import (
     RotaryPositionalEmbedding,
     apply_rotary_pos_emb,
@@ -73,10 +74,8 @@ class ESPNETMultiHeadedAttention(nn.Module):
                 mask.unsqueeze(1).unsqueeze(2).to(bool),
                 float("-inf"),  # (batch, head, time1, time2)
             )
-            self.attn = torch.softmax(scores, dim=-1)  # (batch, head, time1, time2)
-        else:
-            self.attn = torch.softmax(scores, dim=-1)  # (batch, head, time1, time2)
+        self.attn = F.softmax(scores, dim=-1, dtype=torch.float32).type_as(scores)  # (batch, head, time1, time2)
+        # self.attn = torch.softmax(scores, dim=-1)  # (batch, head, time1, time2)

         p_attn = self.dropout(self.attn)
         x = torch.matmul(p_attn, value)  # (batch, head, time1, d_k)
         x = (
@@ -91,7 +90,7 @@ class ESPNETMultiHeadedAttention(nn.Module):
             query (torch.Tensor): Query tensor T X B X C
             key (torch.Tensor): Key tensor T X B X C
             value (torch.Tensor): Value tensor T X B X C
-            mask (torch.Tensor): Mask tensor T X B
+            key_padding_mask (torch.Tensor): Mask tensor T X B
         Returns:
             torch.Tensor: Output tensor T X B X D.
         """
@@ -151,6 +151,7 @@ class MultiheadAttention(nn.Module):
         assert list(query.size()) == [tgt_len, bsz, embed_dim]

         if (
+            False and
             not self.onnx_trace
             and not is_tpu  # don't use PyTorch version on TPUs
             and incremental_state is None
@@ -349,9 +350,7 @@ class MultiheadAttention(nn.Module):
         if before_softmax:
             return attn_weights, v

-        attn_weights_float = utils.softmax(
-            attn_weights, dim=-1, onnx_trace=self.onnx_trace
-        )
+        attn_weights_float = F.softmax(attn_weights, dim=-1, dtype=torch.float32)
         attn_weights = attn_weights_float.type_as(attn_weights)
         attn_probs = self.dropout_module(attn_weights)
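The replacement above is the standard mixed-precision idiom: run the softmax in float32 for numerical range, then cast the result back with type_as so the following matmuls stay in the ambient (possibly fp16) dtype. In isolation:

import torch
import torch.nn.functional as F

scores = torch.randn(2, 4, dtype=torch.float16)
weights = F.softmax(scores, dim=-1, dtype=torch.float32).type_as(scores)
assert weights.dtype == torch.float16  # downstream ops keep their fp16 speed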
...@@ -205,6 +205,7 @@ class RelativeMultiheadAttention(MultiheadAttention): ...@@ -205,6 +205,7 @@ class RelativeMultiheadAttention(MultiheadAttention):
# In this branch incremental_state is never None # In this branch incremental_state is never None
assert incremental_state is not None assert incremental_state is not None
incremental_state = self._set_input_buffer(incremental_state, saved_state) incremental_state = self._set_input_buffer(incremental_state, saved_state)
assert k is not None assert k is not None
src_len = k.size(1) src_len = k.size(1)
...@@ -271,7 +272,7 @@ class RelativeMultiheadAttention(MultiheadAttention): ...@@ -271,7 +272,7 @@ class RelativeMultiheadAttention(MultiheadAttention):
) )
else: else:
attn_weights = attn_weights.transpose(0, 2) attn_weights = attn_weights.transpose(0, 2)
attn_weights = attn_weights.masked_fill(key_padding_mask, float("-inf")) attn_weights = attn_weights.float().masked_fill(key_padding_mask, float("-inf")).type_as(attn_weights)
attn_weights = attn_weights.transpose(0, 2) attn_weights = attn_weights.transpose(0, 2)
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
...@@ -292,7 +293,6 @@ class RelativeMultiheadAttention(MultiheadAttention): ...@@ -292,7 +293,6 @@ class RelativeMultiheadAttention(MultiheadAttention):
else: else:
attn = self._relative_attention_inner(attn_probs, v, relation_values, transpose=False) attn = self._relative_attention_inner(attn_probs, v, relation_values, transpose=False)
# attn = torch.bmm(attn_probs, v)
assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim] assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]
if self.onnx_trace and attn.size(1) == 1: if self.onnx_trace and attn.size(1) == 1:
# when ONNX tracing a single decoder step (sequence length == 1) # when ONNX tracing a single decoder step (sequence length == 1)
@@ -316,11 +316,11 @@ class RelativeMultiheadAttention(MultiheadAttention):
     def _generate_relative_positions_matrix(length, max_relative_length, device, incremental_state):
         if not incremental_state:
             # training process
-            range_vec = torch.arange(length).to(device)
+            range_vec = torch.arange(length, device=device)
             range_mat = range_vec.repeat(length, 1)
             distance_mat = range_mat - range_mat.transpose(0, 1)
         else:
-            distance_mat = torch.arange(-length + 1, 1).view(1, -1).to(device)
+            distance_mat = torch.arange(-length + 1, 0, device=device).view(1, -1)
         distance_mat_clipped = torch.clamp(distance_mat, -max_relative_length, max_relative_length)
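For reference, the training branch builds an all-pairs distance matrix and clamps it to the relative window; a compact equivalent of that computation (illustrative only, not the repository's code):

import torch

def relative_positions(length, max_relative_length, device=None):
    range_vec = torch.arange(length, device=device)
    # distance[i, j] = j - i, clamped to [-max_relative_length, max_relative_length]
    distance = range_vec.unsqueeze(0) - range_vec.unsqueeze(1)
    return torch.clamp(distance, -max_relative_length, max_relative_length)

print(relative_positions(4, 2)[0])  # tensor([0, 1, 2, 2])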
@@ -337,7 +337,7 @@ class RelativeMultiheadAttention(MultiheadAttention):
         Args:
             x: Tensor with shape [batch_size*heads, length, length or depth].
-            y: Tensor with shap e [batch_size*heads, length, depth].
+            y: Tensor with shape [batch_size*heads, length, depth].
             z: Tensor with shape [length, length, depth].
             transpose: Whether to transpose inner matrices of y and z. Should be true if
                 last dimension of x is depth, not length.
...
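The docstring (typo fixed above) describes Shaw-style relative attention: a content term `x @ y` plus a positional term against the per-pair embeddings `z`. A minimal reimplementation of that contract under the stated shapes, offered as a sketch rather than the repository's exact method:

import torch

def relative_attention_inner(x, y, z, transpose):
    # Content term: batched matmul over the batch*heads dimension.
    xy = torch.matmul(x, y.transpose(-2, -1) if transpose else y)
    # Positional term: fold batch into the length dimension so each query
    # position multiplies its own [length, depth] slice of z.
    x_t = x.permute(1, 0, 2)
    xz = torch.matmul(x_t, z.transpose(-2, -1) if transpose else z)
    return xy + xz.permute(1, 0, 2)

q = torch.randn(8, 10, 64)        # [batch*heads, length, depth]
k = torch.randn(8, 10, 64)
rel_k = torch.randn(10, 10, 64)   # [length, length, depth]
print(relative_attention_inner(q, k, rel_k, transpose=True).shape)  # [8, 10, 10]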
@@ -161,7 +161,13 @@ class S2TTransformerEncoderLayer(nn.Module):
             else:
                 print("The maximum encoder relative length %d can not be -1!" % max_relative_length)
                 exit(1)
-        elif self.attn_type in ["rel_pos", "rel_pos_legacy"]:
+        elif self.attn_type == "rel_pos":
+            return RelPositionMultiHeadedAttention(
+                embed_dim,
+                attention_heads,
+                dropout=dropout,
+            )
+        elif self.attn_type == "rel_pos_legacy":
             return LegacyRelPositionMultiHeadedAttention(
                 embed_dim,
                 attention_heads,
@@ -236,8 +242,8 @@ class S2TTransformerEncoderLayer(nn.Module):
         # Note that we cannot use -inf here, because at some edge cases,
         # the attention weight (before softmax) for some padded element in query
         # will become -inf, which results in NaN in model parameters
-        # if attn_mask is not None:
-        #     attn_mask = attn_mask.masked_fill(attn_mask.to(torch.bool), -1e8)
+        if attn_mask is not None:
+            attn_mask = attn_mask.masked_fill(attn_mask.to(torch.bool), -1e8)

         # whether to use macaron style
         if self.macaron_norm is not None:
...
@@ -142,9 +142,6 @@ class Conv1dSubsampling(nn.Module):
                 get_activation_class(act, dim=1)
             ) for layer_id in range(num_layers)])

-        out_dim = filters[-1]
-        self.linear = nn.Linear(out_dim, out_dim)
-
     def forward(self, x, x_len):
         # (B, T, D) -> (B, D, T)
@@ -157,7 +154,6 @@ class Conv1dSubsampling(nn.Module):
             if x_len is not None:
                 x_len = torch.div(x_len - 1, 2, rounding_mode='floor') + 1
         x = x.transpose(1, 2)
-        x = self.linear(x)

         return x, x_len
...
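With the unused `nn.Linear` gone, the module reduces to the stacked strided convolutions, and the length update shown in the diff shrinks the sequence accordingly. A small sketch of how output lengths evolve, assuming the update is applied once per layer and two stride-2 layers are used:

import torch

def subsampled_lengths(x_len, num_layers=2, stride=2):
    # Each strided conv shortens the sequence: L' = floor((L - 1) / stride) + 1
    for _ in range(num_layers):
        x_len = torch.div(x_len - 1, stride, rounding_mode='floor') + 1
    return x_len

print(subsampled_lengths(torch.tensor([100, 37])))  # tensor([25, 10])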
@@ -169,7 +169,7 @@ class TransformerEncoderLayer(nn.Module):
                 `attn_mask[tgt_i, src_j] = 1` means that when calculating the
                 embedding for `tgt_i`, we exclude (mask out) `src_j`. This is
                 useful for strided self-attention.
-            positions (Tensor): the position embedding for relative position encoding
+            pos_emb (Tensor): the position embedding for relative position encoding

         Returns:
             encoded output of shape `(seq_len, batch, embed_dim)`
@@ -180,7 +180,9 @@ class TransformerEncoderLayer(nn.Module):
         # the attention weight (before softmax) for some padded element in query
         # will become -inf, which results in NaN in model parameters
         if attn_mask is not None:
-            attn_mask = attn_mask.masked_fill(attn_mask.to(torch.bool), -1e8)
+            attn_mask = attn_mask.masked_fill(
+                attn_mask.to(torch.bool), -1e8 if x.dtype == torch.float32 else -1e4
+            )

         residual = x
         if self.normalize_before:
...
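The fill value is now dtype-aware: -1e8 exceeds fp16's largest finite value (about 65504), so under half precision it would round to -inf and reintroduce exactly the NaN risk the comment warns about, while -1e4 stays finite in fp16 and still drives masked logits to zero probability after softmax. A standalone sketch:

import torch

def fill_attn_mask(attn_mask, x):
    # Choose the largest safe negative constant for the activation dtype.
    fill_val = -1e8 if x.dtype == torch.float32 else -1e4
    return attn_mask.masked_fill(attn_mask.to(torch.bool), fill_val)

x = torch.randn(4, 4).half()
causal = torch.triu(torch.ones(4, 4), diagonal=1)  # 1 above the diagonal
print(fill_attn_mask(causal, x))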