add the shrink method into the sate architecture

a2353895 · xuchen · da4e7dc3 · a2353895 · a2353895 · a2353895
Commit a2353895 authored Dec 21, 2021 by xuchen
--- a/egs/libri_trans/asr/binary.sh
+++ b/egs/libri_trans/asr/binary.sh
+# Binarize extra evaluation subsets for ASR: copy the vocabulary files next to
+# the data, then build and (optionally) run a prep_st_data.py command.
+set -e
+# eval=1 runs the assembled command; any other value only prints it.
+eval=1
+# lcrm=1 appends --lowercase-src and --rm-punc-src to the command.
+lcrm=0
+# tokenizer=1 appends --tokenizer to the command.
+tokenizer=0
+root_dir=~/st/Fairseq-S2T
+data_dir=~/st/data/test
+vocab_dir=~/st/data/mustc/st/en-de
+asr_vocab_prefix=spm_unigram10000_st_share
+src_lang=en
+tgt_lang=de
+# Subset names; joined with commas below and passed as --splits.
+subsets=(2019)
+# Copy every file whose name starts with the vocabulary prefix into the
+# language-pair directory of the data root.
+cp -r ${vocab_dir}/${asr_vocab_prefix}.* ${data_dir}/${src_lang}-${tgt_lang}
+# Remove any existing fbank80.zip in that directory before running the prep script.
+rm -rf ${data_dir}/${src_lang}-${tgt_lang}/fbank80.zip
+splits=$(echo ${subsets[*]} | sed 's/ /,/g')
+# The command is kept as one multi-line string so it can be echoed and then eval'ed.
+cmd="python ${root_dir}/examples/speech_to_text/prep_st_data.py
+    --data-root ${data_dir}
+    --output-root ${data_dir}
+    --splits ${splits}
+    --task asr
+    --src-lang ${src_lang}
+    --tgt-lang ${tgt_lang}
+    --add-src
+    --share
+    --asr-prefix ${asr_vocab_prefix}
+    --cmvn-type utterance"
+    # Optional flags are appended to the same string.
+    if [[ ${lcrm} -eq 1 ]]; then
+        cmd="$cmd
+    --lowercase-src
+    --rm-punc-src"
+    fi
+    if [[ ${tokenizer} -eq 1 ]]; then
+        cmd="$cmd
+    --tokenizer"
+    fi
+# Print the command in colour (ANSI escape 34), then run it when eval=1.
+echo -e "\033[34mRun command: \n${cmd} \033[0m"
+[[ $eval -eq 1 ]] && eval ${cmd}
--- a/egs/libri_trans/asr/conf/base.yaml
+++ b/egs/libri_trans/asr/conf/base.yaml
+arch: s2t_transformer_s
+share-decoder-input-output-embed: True
+optimizer: adam
+clip-norm: 10.0
+lr-scheduler: inverse_sqrt
+warmup-init-lr: 1e-7
+warmup-updates: 10000
+lr: 2e-3
+#adam_betas: (0.9,0.98)
+criterion: label_smoothed_cross_entropy_with_ctc
+label_smoothing: 0.1
+conv-kernel-sizes: 5,5
+conv-channels: 1024
+dropout: 0.1
+activation-fn: relu
+encoder-embed-dim: 256
+encoder-ffn-embed-dim: 2048
+encoder-layers: 12
+decoder-layers: 6
+encoder-attention-heads: 4
+decoder-embed-dim: 256
+decoder-ffn-embed-dim: 2048
+decoder-attention-heads: 4
+attention-dropout: 0.1
+activation-dropout: 0.1
--- a/egs/libri_trans/asr/conf/basis.yaml
+++ b/egs/libri_trans/asr/conf/basis.yaml
+train-subset: train
+valid-subset: dev
+max-epoch: 100
+max-update: 100000
+patience: 20
+best_checkpoint_metric: loss
+maximize_best_checkpoint_metric: False
+no-epoch-checkpoints: True
+#keep-last-epochs: 10
+keep-best-checkpoints: 10
+num-workers: 8
+no-progress-bar: True
+log-interval: 100
+seed: 1
+report-accuracy: True
+skip-invalid-size-inputs-valid-test: True
\ No newline at end of file
--- a/egs/libri_trans/asr/conf/conformer.yaml
+++ b/egs/libri_trans/asr/conf/conformer.yaml
+macaron-style: True
+use-cnn-module: True
+cnn-module-kernel: 31
--- a/egs/libri_trans/asr/conf/ctc.yaml
+++ b/egs/libri_trans/asr/conf/ctc.yaml
+ctc-weight: 0.3
+post-process: sentencepiece
--- a/egs/libri_trans/asr/conf/debug.yaml
+++ b/egs/libri_trans/asr/conf/debug.yaml
+arch: s2t_sate
+share-decoder-input-output-embed: True
+optimizer: adam
+clip-norm: 10.0
+lr-scheduler: inverse_sqrt
+warmup-init-lr: 1e-7
+warmup-updates: 10000
+lr: 2e-3
+#adam_betas: (0.9,0.98)
+ctc-weight: 0.3
+criterion: label_smoothed_cross_entropy_with_ctc
+label_smoothing: 0.1
+encoder-normalize-before: True
+decoder-normalize-before: True
+conv-kernel-sizes: 5,5
+conv-channels: 1024
+dropout: 0.1
+activation-fn: relu
+encoder-embed-dim: 256
+encoder-ffn-embed-dim: 2048
+encoder-layers: 2
+text-encoder-layers: 2
+decoder-layers: 2
+encoder-attention-heads: 4
+#load-pretrained-encoder-from:
+#load-pretrained-acoustic-encoder-from:
+#load-pretrained-text-encoder-from:
+#load-pretrained-decoder-from:
+#macaron-style: True
+#use-cnn-module: True
+#cnn-module-kernel: 31
+#acoustic-encoder: pds
+acoustic-encoder: transformer
+adapter: shrink
+encoder-embed-dim: 256
+pds-stages: 4
+#pds-dropout: 0
+pds-layers: 3_3_3_3
+pds-ratios: 2_2_1_2
+pds-fusion: True
+pds-fusion-method: all_conv
+pds-embed-dims: 256_256_256_256
+pds-ds-method: conv
+pds-embed-norm: True
+pds-position-embed: 1_1_1_1
+pds-kernel-sizes: 5_5_5_5
+pds-ffn-ratios: 8_8_8_8
+pds-attn-heads: 4_4_4_4
+decoder-embed-dim: 256
+decoder-ffn-embed-dim: 2048
+decoder-attention-heads: 4
\ No newline at end of file
--- a/egs/libri_trans/asr/conf/dlcl.yaml
+++ b/egs/libri_trans/asr/conf/dlcl.yaml
+use-enc-dlcl: True
+use-dec-dlcl: True
--- a/egs/libri_trans/asr/conf/local_attn.yaml
+++ b/egs/libri_trans/asr/conf/local_attn.yaml
+encoder-attention-type: local
+hard-mask-window: 0
+gauss-mask-sigma: 3
+init-mask-weight: 0
\ No newline at end of file
--- a/egs/libri_trans/asr/conf/pds_base.yaml
+++ b/egs/libri_trans/asr/conf/pds_base.yaml
+arch: pdss2t_transformer_s_8
+pds-fusion: True
+ctc-layer: 12
+share-decoder-input-output-embed: True
+optimizer: adam
+clip-norm: 10.0
+lr-scheduler: inverse_sqrt
+warmup-init-lr: 1e-7
+warmup-updates: 10000
+lr: 2e-3
+#adam_betas: (0.9,0.98)
+criterion: label_smoothed_cross_entropy_with_ctc
+label_smoothing: 0.1
+dropout: 0.1
+activation-fn: relu
+encoder-ffn-embed-dim: 2048
+encoder-layers: 12
+decoder-layers: 6
+encoder-attention-heads: 4
+decoder-embed-dim: 256
+decoder-ffn-embed-dim: 2048
+decoder-attention-heads: 4
--- a/egs/libri_trans/asr/conf/pds_base_16.yaml
+++ b/egs/libri_trans/asr/conf/pds_base_16.yaml
+arch: pdss2t_transformer_s_16
+encoder-embed-dim: 256
+pds-stages: 4
+ctc-layer: 12
+pds-layers: 2_2_6_2
+pds-ratios: 2_2_2_2
+pds-fusion: True
+pds-fusion-method: all_conv
+pds-embed-dims: 256_256_256_256
+pds-ds-method: conv
+pds-embed-norm: True
+pds-position-embed: 1_1_1_1
+pds-kernel-sizes: 5_5_5_5
+pds-ffn-ratios: 8_8_8_8
+pds-attn-heads: 4_4_4_4
+share-decoder-input-output-embed: True
+optimizer: adam
+clip-norm: 10.0
+lr-scheduler: inverse_sqrt
+warmup-init-lr: 1e-7
+warmup-updates: 10000
+lr: 2e-3
+#adam_betas: (0.9,0.98)
+criterion: label_smoothed_cross_entropy_with_ctc
+label_smoothing: 0.1
+dropout: 0.1
+activation-fn: relu
+encoder-ffn-embed-dim: 2048
+encoder-layers: 12
+decoder-layers: 6
+encoder-attention-heads: 4
+decoder-embed-dim: 256
+decoder-ffn-embed-dim: 2048
+decoder-attention-heads: 4
--- a/egs/libri_trans/asr/conf/pds_base_32.yaml
+++ b/egs/libri_trans/asr/conf/pds_base_32.yaml
+arch: pdss2t_transformer_s_32
+encoder-embed-dim: 256
+pds-stages: 5
+ctc-layer: 12
+pds-layers: 2_2_3_3_2
+pds-ratios: 2_2_2_2_2
+pds-fusion: True
+pds-fusion-method: all_conv
+pds-embed-dims: 256_256_256_256_256
+pds-ds-method: conv
+pds-embed-norm: True
+pds-position-embed: 1_1_1_1_1
+pds-kernel-sizes: 5_5_5_5_5
+pds-ffn-ratios: 8_8_8_8_8
+pds-attn-heads: 4_4_4_4_4
+share-decoder-input-output-embed: True
+optimizer: adam
+clip-norm: 10.0
+lr-scheduler: inverse_sqrt
+warmup-init-lr: 1e-7
+warmup-updates: 10000
+lr: 2e-3
+#adam_betas: (0.9,0.98)
+criterion: label_smoothed_cross_entropy_with_ctc
+label_smoothing: 0.1
+dropout: 0.1
+activation-fn: relu
+encoder-ffn-embed-dim: 2048
+encoder-layers: 12
+decoder-layers: 6
+encoder-attention-heads: 4
+decoder-embed-dim: 256
+decoder-ffn-embed-dim: 2048
+decoder-attention-heads: 4
--- a/egs/libri_trans/asr/conf/pds_base_8.yaml
+++ b/egs/libri_trans/asr/conf/pds_base_8.yaml
+arch: pdss2t_transformer_s_8
+encoder-embed-dim: 256
+pds-stages: 4
+ctc-layer: 12
+pds-layers: 3_3_3_3
+pds-ratios: 2_2_1_2
+pds-fusion: True
+pds-fusion-method: all_conv
+pds-embed-dims: 256_256_256_256
+pds-ds-method: conv
+pds-embed-norm: True
+pds-position-embed: 1_1_1_1
+pds-kernel-sizes: 5_5_5_5
+pds-ffn-ratios: 8_8_8_8
+pds-attn-heads: 4_4_4_4
+share-decoder-input-output-embed: True
+optimizer: adam
+clip-norm: 10.0
+lr-scheduler: inverse_sqrt
+warmup-init-lr: 1e-7
+warmup-updates: 10000
+lr: 2e-3
+#adam_betas: (0.9,0.98)
+criterion: label_smoothed_cross_entropy_with_ctc
+label_smoothing: 0.1
+dropout: 0.1
+activation-fn: relu
+encoder-ffn-embed-dim: 2048
+encoder-layers: 12
+decoder-layers: 6
+encoder-attention-heads: 4
+decoder-embed-dim: 256
+decoder-ffn-embed-dim: 2048
+decoder-attention-heads: 4
--- a/egs/libri_trans/asr/conf/rpr.yaml
+++ b/egs/libri_trans/asr/conf/rpr.yaml
+encoder-attention-type: rel_selfattn
+#encoder-attention-type: relative
+#max-encoder-relative-length: 100
--- a/egs/libri_trans/asr/decode.sh
+++ b/egs/libri_trans/asr/decode.sh
+#! /bin/bash
+# Decode a trained ASR model by calling run.sh with stage 2 only.
+# Usage: ./decode.sh [exp_name]
+gpu_num=1
+# Left empty, run.sh keeps its own default data_dir.
+data_dir=
+# NOTE(review): run.sh in this recipe defaults test_subset to "test"; confirm
+# that a "tst-COMMON" subset exists for this dataset.
+test_subset=(tst-COMMON)
+exp_name=
+# A single positional argument is taken as the experiment name.
+if [ "$#" -eq 1 ]; then
+    exp_name=$1
+fi
+n_average=10
+beam_size=5
+len_penalty=1.0
+max_tokens=80000
+dec_model=checkpoint_best.pt
+cmd="./run.sh
+    --stage 2
+    --stop_stage 2
+    --gpu_num ${gpu_num}
+    --n_average ${n_average}
+    --beam_size ${beam_size}
+    --len_penalty ${len_penalty}
+    --max_tokens ${max_tokens}
+    --dec_model ${dec_model}
+    "
+# Pass --exp_name only when it has a value (train.sh does the same). An empty
+# value would make the option parser take the following option name as the
+# experiment name and then stop parsing, silently dropping all later options.
+if [[ -n ${exp_name} ]]; then
+    cmd="$cmd --exp_name ${exp_name}"
+fi
+if [[ -n ${data_dir} ]]; then
+    cmd="$cmd --data_dir ${data_dir}"
+fi
+# Join the subset array with commas; run.sh splits it on commas again.
+if [[ ${#test_subset[@]} -ne 0 ]]; then
+    subsets=$(echo ${test_subset[*]} | sed 's/ /,/g')
+    cmd="$cmd --test_subset ${subsets}"
+fi
+echo $cmd
+eval $cmd
--- a/egs/libri_trans/asr/local/monitor.sh
+++ b/egs/libri_trans/asr/local/monitor.sh
+# Poll gpustat until ${gpu_num} GPUs look idle, then launch the training script once.
+gpu_num=4
+# train.sh has a bash shebang and uses arrays and [[ ]], so it is started with
+# bash; a plain POSIX sh cannot parse it.
+cmd="bash train.sh"
+while :
+do
+    # Start every poll from an empty list so entries from an earlier poll are
+    # never counted as currently free devices.
+    device=()
+    record=$(mktemp -t temp.record.XXXXXX)
+    gpustat > $record
+    # Candidate indices: 0 .. (number of lines left after dropping the first two).
+    all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
+    count=0
+    for dev in ${all_devices[@]}
+    do
+        # Device N is read from line N+2 of the snapshot.
+        line=$((dev + 2))
+        # Third '|'-separated field, text before '/'. Presumably the used
+        # memory in MB -- confirm against the local gpustat output format.
+        use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
+        if [[ $use -lt 100 ]]; then
+            device[$count]=$dev
+            count=$((count + 1))
+            if [[ $count -eq $gpu_num ]]; then
+                break
+            fi
+        fi
+    done
+    # The snapshot is no longer needed; without this every poll leaves a temp file behind.
+    rm -f ${record}
+    if [[ ${#device[@]} -lt $gpu_num ]]; then
+        sleep 60s
+    else
+        echo "Run $cmd"
+        eval $cmd
+        sleep 10s
+        exit
+    fi
+done
--- a/egs/libri_trans/asr/local/parse_options.sh
+++ b/egs/libri_trans/asr/local/parse_options.sh
+#!/usr/bin/env bash
+# Copyright 2012  Johns Hopkins University (Author: Daniel Povey);
+#                 Arnab Ghoshal, Karel Vesely
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+# Parse command-line options.
+# To be sourced by another script (as in ". parse_options.sh").
+# Option format is: --option-name arg
+# and shell variable "option_name" gets set to value "arg."
+# The exception is --help, which takes no arguments, but prints the
+# $help_message variable (if defined).
+###
+### The --config file options have lower priority to command line
+### options, so we need to import them first...
+###
+# Now import all the configs specified by command-line, in left-to-right order
+for ((argpos=1; argpos<$#; argpos++)); do
+  if [ "${!argpos}" == "--config" ]; then
+    argpos_plus1=$((argpos+1))
+    config=${!argpos_plus1}
+    [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
+    . $config  # source the config file.
+  fi
+done
+###
+### Now we process the command line options
+###
+while true; do
+  [ -z "${1:-}" ] && break;  # break if there are no arguments
+  case "$1" in
+    # If the enclosing script is called with --help option, print the help
+    # message and exit.  Scripts should put help messages in $help_message
+    --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
+      else printf "$help_message\n" 1>&2 ; fi;
+      exit 0 ;;
+    --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
+      exit 1 ;;
+    # If the first command-line argument begins with "--" (e.g. --foo-bar),
+    # then work out the variable name as $name, which will equal "foo_bar".
+    --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
+      # Next we test whether the variable in question is undefined-- if so it's
+      # an invalid option and we die.  Note: $0 evaluates to the name of the
+      # enclosing script.
+      # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
+      # is undefined.  We then have to wrap this test inside "eval" because
+      # foo_bar is itself inside a variable ($name).
+      eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
+      oldval="`eval echo \\$$name`";
+      # Work out whether we seem to be expecting a Boolean argument.
+      if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
+        was_bool=true;
+      else
+        was_bool=false;
+      fi
+      # Set the variable to the right value-- the escaped quotes make it work if
+      # the option had spaces, like --cmd "queue.pl -sync y"
+      eval $name=\"$2\";
+      # Check that Boolean-valued arguments are really Boolean.
+      if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
+        echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
+        exit 1;
+      fi
+      shift 2;
+      ;;
+  *) break;
+  esac
+done
+# Check for an empty argument to the --cmd option, which can easily occur as a
+# result of scripting errors.
+[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
+true; # so this script returns exit code 0.
--- a/egs/libri_trans/asr/local/utils.sh
+++ b/egs/libri_trans/asr/local/utils.sh
+# get_devices GPU_NUM USE_CPU
+#
+# Print a comma-separated list of GPU_NUM device indices that look idle
+# according to gpustat. While fewer than GPU_NUM are idle:
+#   USE_CPU = 1  -> print "-1" and return immediately;
+#   otherwise    -> sleep 60s and poll again.
+# The exit status is that of the final echo | sed pipeline.
+get_devices(){
+    gpu_num=$1
+    use_cpu=$2
+    device=()
+    while :
+    do
+        # Start every poll from an empty list so entries from an earlier poll
+        # are never reported as free devices.
+        device=()
+        record=$(mktemp -t temp.record.XXXXXX)
+        gpustat > $record
+        # Candidate indices: 0 .. (number of lines left after dropping the first two).
+        all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
+        count=0
+        for dev in ${all_devices[@]}
+        do
+            # Device N is read from line N+2 of the snapshot.
+            line=$((dev + 2))
+            # Third '|'-separated field, text before '/'. Presumably the used
+            # memory in MB -- confirm against the local gpustat output format.
+            use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
+            if [[ $use -lt 100 ]]; then
+                device[$count]=$dev
+                count=$((count + 1))
+                if [[ $count -eq $gpu_num ]]; then
+                    break
+                fi
+            fi
+        done
+        # The snapshot is no longer needed; without this every poll leaves a temp file behind.
+        rm -f ${record}
+        if [[ ${#device[@]} -lt $gpu_num ]]; then
+            if [[ $use_cpu -eq 1 ]]; then
+                device=(-1)
+                # Leave the loop now. Without this break a one-element list is
+                # still shorter than any gpu_num > 1, so the loop would spin
+                # forever without sleeping.
+                break
+            else
+                sleep 60s
+            fi
+        else
+            break
+        fi
+    done
+    echo ${device[*]} | sed 's/ /,/g'
+    return $?
+}
--- a/egs/libri_trans/asr/run.sh
+++ b/egs/libri_trans/asr/run.sh
+#! /bin/bash
+# ASR recipe for the libri_trans dataset
+# Copyright 2021 Natural Language Processing Laboratory 
+# Xu Chen (xuchenneu@163.com)
+# Set bash to 'debug' mode, it will exit on :
+# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
+set -e
+#set -u
+set -o pipefail
+export PYTHONIOENCODING=UTF-8
+eval=1
+time=$(date "+%m%d_%H%M")
+stage=0
+stop_stage=0
+######## hardware ########
+# devices
+#device=()
+gpu_num=8
+update_freq=1
+root_dir=~/st
+code_dir=${root_dir}/Fairseq-S2T
+pwd_dir=$PWD
+# dataset
+src_lang=en
+tgt_lang=fr
+lang=${src_lang}-${tgt_lang}
+dataset=libri_trans
+task=speech_to_text
+vocab_type=unigram
+vocab_size=1000
+speed_perturb=1
+lcrm=1
+tokenizer=0
+use_raw_audio=1
+use_specific_dict=0
+specific_prefix=st
+specific_dir=${root_dir}/data/${dataset}/st
+asr_vocab_prefix=spm_unigram1000_st_share
+org_data_dir=${root_dir}/data/${dataset}
+data_dir=${root_dir}/data/${dataset}/asr
+train_split=train
+valid_split=dev
+test_split=test
+test_subset=test
+# exp
+exp_prefix=$(date "+%m%d")
+extra_tag=
+extra_parameter=
+exp_tag=baseline
+exp_name=
+# config
+train_config=base,ctc
+data_config=config.yaml
+# training setting
+fp16=1
+max_tokens=40000
+step_valid=0
+# decoding setting
+dec_model=checkpoint_best.pt
+n_average=10
+beam_size=5
+len_penalty=1.0
+if [[ ${speed_perturb} -eq 1 ]]; then
+    data_dir=${data_dir}_sp
+    exp_prefix=${exp_prefix}_sp
+fi
+if [[ ${lcrm} -eq 1 ]]; then
+    data_dir=${data_dir}_lcrm
+    exp_prefix=${exp_prefix}_lcrm
+fi
+if [[ ${use_specific_dict} -eq 1 ]]; then
+    data_dir=${data_dir}_${specific_prefix}
+    exp_prefix=${exp_prefix}_${specific_prefix}
+fi
+if [[ ${tokenizer} -eq 1 ]]; then
+    data_dir=${data_dir}_tok
+    exp_prefix=${exp_prefix}_tok
+fi
+if [[ ${use_raw_audio} -eq 1 ]]; then
+    data_dir=${data_dir}_raw
+    exp_prefix=${exp_prefix}_raw
+fi
+. ./local/parse_options.sh || exit 1;
+if [[ -z ${exp_name} ]]; then
+    config_string=${train_config//,/_}
+    exp_name=${exp_prefix}_${config_string}_${exp_tag}
+    if [[ -n ${extra_tag} ]]; then
+        exp_name=${exp_name}_${extra_tag}
+    fi
+fi
+model_dir=${code_dir}/../checkpoints/${dataset}/asr/${exp_name}
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+    echo "stage -1: Data Download"
+    # pass
+fi
+# Stage 0: build the ASR data directory by calling prep_audio_data.py.
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    ### Task dependent. You have to write the following data preparation part yourself.
+    ### But you can utilize Kaldi recipes in most cases
+    echo "stage 0: ASR Data Preparation"
+    if [[ ! -e ${data_dir} ]]; then
+        mkdir -p ${data_dir}
+    fi
+    feature_zip=fbank80.zip
+    if [[ ${speed_perturb} -eq 1 ]]; then
+        feature_zip=fbank80_sp.zip
+    fi
+    # Reuse a feature archive that already sits one level above data_dir. The
+    # path must expand ${feature_zip}; a literal "feature_zip" file never
+    # exists, so the archive would never be linked. This mirrors the
+    # move-and-link step at the end of this stage.
+    if [[ ! -f ${data_dir}/${feature_zip} && -f ${data_dir}/../${feature_zip} ]]; then
+        ln -s ${data_dir}/../${feature_zip} ${data_dir}
+    fi
+    # The command is kept as one multi-line string so it can be echoed and then eval'ed.
+    cmd="python ${code_dir}/examples/speech_to_text/prep_audio_data.py
+        --data-root ${org_data_dir}
+        --output-root ${data_dir}
+        --task asr
+        --src-lang ${src_lang}
+        --splits ${valid_split},${test_split},${train_split}
+        --vocab-type ${vocab_type}
+        --vocab-size ${vocab_size}"
+    if [[ ${use_raw_audio} -eq 1 ]]; then
+        cmd="$cmd
+        --raw"
+    fi
+    if [[ ${use_specific_dict} -eq 1 ]]; then
+        # Copy the existing vocabulary files into data_dir and pass their prefix on.
+        cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}
+        cmd="$cmd
+        --asr-prefix ${asr_vocab_prefix}"
+    fi
+    if [[ ${speed_perturb} -eq 1 ]]; then
+        cmd="$cmd
+        --speed-perturb"
+    fi
+    if [[ ${lcrm} -eq 1 ]]; then
+        cmd="$cmd
+        --lowercase-src
+        --rm-punc-src"
+    fi
+    if [[ ${tokenizer} -eq 1 ]]; then
+        cmd="$cmd
+        --tokenizer"
+    fi
+    echo -e "\033[34mRun command: \n${cmd} \033[0m"
+    [[ $eval -eq 1 ]] && eval ${cmd}
+    # Move a freshly created archive one level up and link it back; presumably
+    # so sibling data directories can share it.
+    if [[ -f ${data_dir}/${feature_zip} && ! -f ${data_dir}/../${feature_zip} ]]; then
+        mv ${data_dir}/${feature_zip} ${data_dir}/..
+        ln -s ${data_dir}/../${feature_zip} ${data_dir}
+    fi
+fi
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    echo "stage 1: ASR Network Training"
+    [[ ! -d ${data_dir} ]] && echo "The data dir ${data_dir} is not existing!" && exit 1;
+    if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
+		if [[ ${gpu_num} -eq 0 ]]; then
+			device=""
+		else
+        	source ./local/utils.sh
+        	device=$(get_devices $gpu_num 0)
+		fi
+    fi
+    echo -e "dev=${device} data=${data_dir} model=${model_dir}"
+    if [[ ! -d ${model_dir} ]]; then
+        mkdir -p ${model_dir}
+    else
+        echo "${model_dir} exists."
+    fi
+    cp ${BASH_SOURCE[0]} ${model_dir}
+    cp ${PWD}/train.sh ${model_dir}
+    extra_parameter="${extra_parameter}
+        --train-config ${pwd_dir}/conf/basis.yaml"
+    cp ${pwd_dir}/conf/basis.yaml ${model_dir}
+    # Turn the comma-separated train_config into one --train-configN option per
+    # name, each pointing at conf/<name>.yaml, and copy that file to the model dir.
+    config_list="${train_config//,/ }"
+    idx=1
+    for config in ${config_list[@]}
+    do
+        config_path=${pwd_dir}/conf/${config}.yaml
+        if [[ ! -f ${config_path} ]]; then
+            echo "No config file ${config_path}"
+            # Exit with a failure status; a bare "exit" would return the
+            # status of the echo above (0) and hide the error from callers.
+            exit 1
+        fi
+        cp ${config_path} ${model_dir}
+        extra_parameter="${extra_parameter}
+        --train-config${idx} ${config_path}"
+        idx=$((idx + 1))
+    done
+    cmd="python3 -u ${code_dir}/fairseq_cli/train.py
+        ${data_dir}
+        --config-yaml ${data_config}
+        --task ${task}
+        --max-tokens ${max_tokens}
+        --skip-invalid-size-inputs-valid-test
+        --update-freq ${update_freq}
+        --log-interval 100
+        --save-dir ${model_dir}
+        --tensorboard-logdir ${model_dir}"
+	if [[ -n ${extra_parameter} ]]; then
+        cmd="${cmd}
+        ${extra_parameter}"
+    fi
+	if [[ ${gpu_num} -gt 0 ]]; then
+		cmd="${cmd}
+        --distributed-world-size $gpu_num
+        --ddp-backend no_c10d"
+	fi
+    if [[ $fp16 -eq 1 ]]; then
+        cmd="${cmd}
+        --fp16"
+    fi
+    if [[ $step_valid -eq 1 ]]; then
+        validate_interval=1
+        save_interval=1
+        keep_last_epochs=10
+        no_epoch_checkpoints=0
+        save_interval_updates=500
+        keep_interval_updates=10
+    else
+        validate_interval=1
+        keep_last_epochs=10
+    fi
+    if [[ -n $no_epoch_checkpoints && $no_epoch_checkpoints -eq 1 ]]; then
+        cmd="$cmd
+        --no-epoch-checkpoints"
+    fi
+    if [[ -n $validate_interval ]]; then
+        cmd="${cmd}
+        --validate-interval $validate_interval "
+    fi
+    if [[ -n $save_interval ]]; then
+        cmd="${cmd}
+        --save-interval $save_interval "
+    fi
+    if [[ -n $keep_last_epochs ]]; then
+        cmd="${cmd}
+        --keep-last-epochs $keep_last_epochs "
+    fi
+    if [[ -n $save_interval_updates ]]; then
+        cmd="${cmd}
+        --save-interval-updates $save_interval_updates"
+        if [[ -n $keep_interval_updates ]]; then
+        cmd="${cmd}
+        --keep-interval-updates $keep_interval_updates"
+        fi
+    fi
+    echo -e "\033[34mRun command: \n${cmd} \033[0m"
+    # save info
+    log=./history.log
+    echo "${time} | ${device} | ${data_dir} | ${exp_name} | ${model_dir} " >> $log
+    tail -n 50 ${log} > tmp.log
+    mv tmp.log $log
+    export CUDA_VISIBLE_DEVICES=${device}
+    cmd="nohup ${cmd} >> ${model_dir}/train.log 2>&1 &"
+    if [[ $eval -eq 1 ]]; then
+		eval $cmd
+		sleep 2s
+		tail -n "$(wc -l ${model_dir}/train.log | awk '{print $1+1}')" -f ${model_dir}/train.log
+	fi
+fi
+wait
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    echo "stage 2: ASR Decoding"
+    if [[ ${n_average} -ne 1 ]]; then
+        # Average models
+		dec_model=avg_${n_average}_checkpoint.pt
+        if [[ ! -f ${model_dir}/${dec_model} ]]; then
+            cmd="python ${code_dir}/scripts/average_checkpoints.py
+            --inputs ${model_dir}
+            --num-best-checkpoints ${n_average}
+            --output ${model_dir}/${dec_model}"
+            echo -e "\033[34mRun command: \n${cmd} \033[0m"
+            [[ $eval -eq 1 ]] && eval $cmd
+        fi
+	else
+		dec_model=${dec_model}
+	fi
+    if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
+		if [[ ${gpu_num} -eq 0 ]]; then
+			device=""
+		else
+        	source ./local/utils.sh
+        	device=$(get_devices $gpu_num 0)
+		fi
+    fi
+    export CUDA_VISIBLE_DEVICES=${device}
+	result_file=${model_dir}/decode_result
+	[[ -f ${result_file} ]] && rm ${result_file}
+    test_subset=${test_subset//,/ }
+	for subset in ${test_subset[@]}; do
+        subset=${subset}
+  		cmd="python ${code_dir}/fairseq_cli/generate.py
+        ${data_dir}
+        --config-yaml ${data_config}
+        --gen-subset ${subset}
+        --task speech_to_text
+        --path ${model_dir}/${dec_model}
+        --results-path ${model_dir}
+        --max-tokens ${max_tokens}
+        --beam ${beam_size}
+        --lenpen ${len_penalty}
+        --scoring wer
+        --wer-tokenizer 13a
+        --wer-lowercase
+        --wer-remove-punct
+        "
+    	echo -e "\033[34mRun command: \n${cmd} \033[0m"
+        if [[ $eval -eq 1 ]]; then
+    	    eval $cmd
+    	    tail -n 1 ${model_dir}/generate-${subset}.txt >> ${result_file}
+        fi
+	done
+    cat ${result_file}
+fi
--- a/egs/libri_trans/asr/train.sh
+++ b/egs/libri_trans/asr/train.sh
+#! /bin/bash
+# Launch ASR training by calling run.sh with stage 1 only.
+gpu_num=2
+update_freq=1
+max_tokens=40000
+# Optional values; each is forwarded to run.sh only when non-empty (see below).
+extra_tag=
+extra_parameter=
+#extra_tag="${extra_tag}"
+#extra_parameter="${extra_parameter} "
+exp_tag=
+# Names of the training configs to combine; run.sh looks each one up as
+# conf/<name>.yaml. Uncomment one of the alternatives to switch setups.
+#config_list=(base)
+#config_list=(ctc)
+#config_list=(base conformer)
+#config_list=(pds_base_16)
+config_list=(pds_base_8 conformer rpr)
+# Full experiment name; when left empty run.sh builds one itself.
+exp_name=
+# Join the config names with commas for the --train_config option.
+train_config=$(echo ${config_list[*]} | sed 's/ /,/g')
+cmd="./run.sh
+    --stage 1
+    --stop_stage 1
+    --gpu_num ${gpu_num}
+    --update_freq ${update_freq}
+    --train_config ${train_config}
+    --max_tokens ${max_tokens}
+    "
+if [[ -n ${exp_name} ]]; then
+    cmd="$cmd --exp_name ${exp_name}"
+fi
+if [[ -n ${exp_tag} ]]; then
+    cmd="$cmd --exp_tag ${exp_tag}"
+fi
+if [[ -n ${extra_tag} ]]; then
+    cmd="$cmd --extra_tag ${extra_tag}"
+fi
+# The value is wrapped in escaped quotes so that it survives the eval below as
+# a single argument even when it contains spaces.
+if [[ -n ${extra_parameter} ]]; then
+    cmd="$cmd --extra_parameter \"${extra_parameter}\""
+fi
+echo ${cmd}
+eval ${cmd}
--- a/egs/libri_trans/mt/binary.sh
+++ b/egs/libri_trans/mt/binary.sh
+# Prepare extra MT test sets: optionally tokenize, apply the SentencePiece
+# models, copy the results to ${data_dir}/final and binarize them with
+# fairseq's preprocess.py.
+set -e
+# eval=1 runs each assembled command; any other value only prints it.
+eval=1
+# lcrm=1 feeds the source side through local/lower_rm.py instead of cat.
+lcrm=0
+root_dir=~/st/Fairseq-S2T
+data_dir=/home/xuchen/st/data/wmt/test
+vocab_dir=/home/xuchen/st/data/wmt/mt/en-de/unigram32000_share
+src_vocab_prefix=spm_unigram32000_share
+tgt_vocab_prefix=spm_unigram32000_share
+src_lang=en
+tgt_lang=de
+# tokenize=1 runs tokenizer.perl on both sides before SentencePiece encoding.
+tokenize=1
+splits=(newstest2014 newstest2016)
+for split in ${splits[@]}; do
+    src_file=${data_dir}/${split}.${src_lang}
+    tgt_file=${data_dir}/${split}.${tgt_lang}
+    if [[ ${tokenize} -eq 1 ]]; then
+        cmd="tokenizer.perl -l ${src_lang} --threads 8 -no-escape < ${src_file} > ${src_file}.tok"
+        echo -e "\033[34mRun command: \n${cmd} \033[0m"
+        [[ $eval -eq 1 ]] && eval ${cmd}
+        cmd="tokenizer.perl -l ${tgt_lang} --threads 8 -no-escape < ${tgt_file} > ${tgt_file}.tok"
+        echo -e "\033[34mRun command: \n${cmd} \033[0m"
+        [[ $eval -eq 1 ]] && eval ${cmd}
+        # Later steps read the tokenized files.
+        src_file=${src_file}.tok
+        tgt_file=${tgt_file}.tok
+    fi
+    # Source side: cat (or lower_rm.py when lcrm=1) piped into spm_encode.
+    # The pipe and redirection live inside the string and take effect at eval time.
+    cmd="cat ${src_file}"
+    if [[ ${lcrm} -eq 1 ]]; then
+        cmd="python local/lower_rm.py ${src_file}"
+    fi
+    cmd="${cmd}
+    | spm_encode --model ${vocab_dir}/${src_vocab_prefix}.model
+    --output_format=piece
+    > ${src_file}.spm"
+    echo -e "\033[34mRun command: \n${cmd} \033[0m"
+    [[ $eval -eq 1 ]] && eval ${cmd}
+    # Target side: spm_encode only.
+    cmd="spm_encode
+    --model ${vocab_dir}/${tgt_vocab_prefix}.model
+    --output_format=piece
+    < ${tgt_file}
+    > ${tgt_file}.spm"
+    echo -e "\033[34mRun command: \n${cmd} \033[0m"
+    [[ $eval -eq 1 ]] && eval ${cmd}
+    src_file=${src_file}.spm
+    tgt_file=${tgt_file}.spm
+    # Copy the encoded files to final/<split>.<lang>, the prefix layout used below.
+    mkdir -p ${data_dir}/final
+    cmd="cp ${src_file} ${data_dir}/final/${split}.${src_lang}"
+    echo -e "\033[34mRun command: \n${cmd} \033[0m"
+    [[ $eval -eq 1 ]] && eval ${cmd}
+    cmd="cp ${tgt_file} ${data_dir}/final/${split}.${tgt_lang}"
+    echo -e "\033[34mRun command: \n${cmd} \033[0m"
+    [[ $eval -eq 1 ]] && eval ${cmd}
+done
+# Build the comma-separated list of final/<split> prefixes for --testpref.
+n_set=${#splits[*]}
+for ((i=0;i<$n_set;i++)); do
+    dataset[$i]=${data_dir}/final/${splits[$i]}
+done
+pref=`echo ${dataset[*]} | sed 's/ /,/g'`
+cmd="python ${root_dir}/fairseq_cli/preprocess.py
+    --source-lang ${src_lang}
+    --target-lang ${tgt_lang}
+    --testpref ${pref}
+    --destdir ${data_dir}/data-bin
+    --srcdict ${vocab_dir}/${src_vocab_prefix}.txt
+    --tgtdict ${vocab_dir}/${tgt_vocab_prefix}.txt
+    --workers 64"
+echo -e "\033[34mRun command: \n${cmd} \033[0m"
+[[ $eval -eq 1 ]] && eval ${cmd}
\ No newline at end of file
--- a/egs/libri_trans/mt/conf/base.yaml
+++ b/egs/libri_trans/mt/conf/base.yaml
+arch: transformer
+share-decoder-input-output-embed: True
+optimizer: adam
+clip-norm: 10.0
+lr-scheduler: inverse_sqrt
+warmup-init-lr: 1e-7
+warmup-updates: 8000
+lr: 1e-3
+adam_betas: (0.9,0.997)
+criterion: label_smoothed_cross_entropy
+label_smoothing: 0.1
+dropout: 0.1
+attention-dropout: 0.1
+activation-dropout: 0.1
+activation-fn: relu
+encoder-normalize-before: True
+decoder-normalize-before: True
+encoder-embed-dim: 512
+encoder-ffn-embed-dim: 2048
+encoder-layers: 6
+decoder-layers: 6
+encoder-attention-heads: 8
+decoder-embed-dim: 512
+decoder-ffn-embed-dim: 2048
+decoder-attention-heads: 8
--- a/egs/libri_trans/mt/conf/base_s.yaml
+++ b/egs/libri_trans/mt/conf/base_s.yaml
+arch: transformer
+share-decoder-input-output-embed: True
+optimizer: adam
+clip-norm: 10.0
+lr-scheduler: inverse_sqrt
+warmup-init-lr: 1e-7
+warmup-updates: 8000
+lr: 1e-3
+adam_betas: (0.9,0.997)
+criterion: label_smoothed_cross_entropy
+label_smoothing: 0.1
+dropout: 0.1
+attention-dropout: 0.1
+activation-dropout: 0.1
+activation-fn: relu
+encoder-normalize-before: True
+decoder-normalize-before: True
+encoder-embed-dim: 256
+encoder-ffn-embed-dim: 2048
+encoder-layers: 6
+decoder-layers: 6
+encoder-attention-heads: 4
+decoder-embed-dim: 256
+decoder-ffn-embed-dim: 2048
+decoder-attention-heads: 4
--- a/egs/libri_trans/mt/conf/basis.yaml
+++ b/egs/libri_trans/mt/conf/basis.yaml
+train-subset: train
+valid-subset: valid
+max-epoch: 50
+max-update: 100000
+patience: 20
+best_checkpoint_metric: loss
+maximize_best_checkpoint_metric: False
+no-epoch-checkpoints: True
+#keep-last-epochs: 10
+keep-best-checkpoints: 10
+num-workers: 8
+no-progress-bar: True
+log-interval: 100
+seed: 1
+report-accuracy: True
+skip-invalid-size-inputs-valid-test: True
\ No newline at end of file
--- a/egs/libri_trans/mt/conf/dlcl.yaml
+++ b/egs/libri_trans/mt/conf/dlcl.yaml
+use-enc-dlcl: True
+use-dec-dlcl: True
--- a/egs/libri_trans/mt/conf/rpr.yaml
+++ b/egs/libri_trans/mt/conf/rpr.yaml
+#encoder-attention-type: rel_selfattn
+encoder-attention-type: relative
+decoder-attention-type: relative
+max-encoder-relative-length: 20
+max-decoder-relative-length: 20
--- a/egs/libri_trans/mt/decode.sh
+++ b/egs/libri_trans/mt/decode.sh
+#! /bin/bash
+# Decode a trained MT model by calling run.sh with stage 2 only.
+# Usage: ./decode.sh [exp_name]
+gpu_num=1
+# Left empty, --data_dir is not passed to run.sh at all.
+data_dir=
+test_subset=(test)
+exp_name=
+# A single positional argument is taken as the experiment name.
+if [ "$#" -eq 1 ]; then
+    exp_name=$1
+fi
+sacrebleu=1
+n_average=10
+beam_size=5
+len_penalty=1.0
+max_tokens=80000
+dec_model=checkpoint_best.pt
+cmd="./run.sh
+    --stage 2
+    --stop_stage 2
+    --gpu_num ${gpu_num}
+    --sacrebleu ${sacrebleu}
+    --n_average ${n_average}
+    --beam_size ${beam_size}
+    --len_penalty ${len_penalty}
+    --max_tokens ${max_tokens}
+    --dec_model ${dec_model}
+    "
+# Pass --exp_name only when it has a value. An empty value would make the
+# "--name value" option parser take the following option name as the
+# experiment name and then stop parsing, silently dropping all later options.
+if [[ -n ${exp_name} ]]; then
+    cmd="$cmd --exp_name ${exp_name}"
+fi
+if [[ -n ${data_dir} ]]; then
+    cmd="$cmd --data_dir ${data_dir}"
+fi
+# Join the subset array with commas before passing it on.
+if [[ -n ${test_subset} ]]; then
+    test_subset=`echo ${test_subset[*]} | sed 's/ /,/g'`
+    cmd="$cmd --test_subset ${test_subset}"
+fi
+echo $cmd
+eval $cmd
--- a/egs/libri_trans/mt/local/lower_rm.py
+++ b/egs/libri_trans/mt/local/lower_rm.py
+"""Lowercase a text file and strip ASCII punctuation.
+
+Usage: python lower_rm.py <in_file>
+
+Writes exactly one output line to stdout per input line, so the result stays
+line-aligned with the input.
+"""
+import string
+import sys
+
+# Translation table that deletes every character in string.punctuation.
+_PUNCT_TABLE = str.maketrans("", "", string.punctuation)
+
+
+def normalize_line(line):
+    """Return *line* lowercased, without ASCII punctuation, single-spaced."""
+    line = line.strip().lower().translate(_PUNCT_TABLE)
+    # Removing punctuation can leave runs of spaces ("a , b" -> "a  b").
+    # Collapse them to one space; deleting them would glue the neighbouring
+    # words together ("ab").
+    return " ".join(line.split())
+
+
+def main():
+    """Normalize the file named by the first command-line argument."""
+    in_file = sys.argv[1]
+    with open(in_file, "r", encoding="utf-8") as f:
+        # Iterate the file directly instead of loading every line up front.
+        for line in f:
+            print(normalize_line(line))
+
+
+if __name__ == "__main__":
+    main()
--- a/egs/libri_trans/mt/local/monitor.sh
+++ b/egs/libri_trans/mt/local/monitor.sh
+# Poll gpustat until ${gpu_num} GPUs look idle, then launch the training script once.
+gpu_num=4
+cmd="sh train.sh"
+while :
+do
+    # Start every poll from an empty list so entries from an earlier poll are
+    # never counted as currently free devices.
+    device=()
+    record=$(mktemp -t temp.record.XXXXXX)
+    gpustat > $record
+    # Candidate indices: 0 .. (number of lines left after dropping the first two).
+    all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
+    count=0
+    for dev in ${all_devices[@]}
+    do
+        # Device N is read from line N+2 of the snapshot.
+        line=$((dev + 2))
+        # Third '|'-separated field, text before '/'. Presumably the used
+        # memory in MB -- confirm against the local gpustat output format.
+        use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
+        if [[ $use -lt 100 ]]; then
+            device[$count]=$dev
+            count=$((count + 1))
+            if [[ $count -eq $gpu_num ]]; then
+                break
+            fi
+        fi
+    done
+    # The snapshot is no longer needed; without this every poll leaves a temp file behind.
+    rm -f ${record}
+    if [[ ${#device[@]} -lt $gpu_num ]]; then
+        sleep 60s
+    else
+        echo "Run $cmd"
+        eval $cmd
+        sleep 10s
+        exit
+    fi
+done
--- a/egs/libri_trans/mt/local/parse_options.sh
+++ b/egs/libri_trans/mt/local/parse_options.sh
+#!/usr/bin/env bash
+# Copyright 2012  Johns Hopkins University (Author: Daniel Povey);
+#                 Arnab Ghoshal, Karel Vesely
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+# Parse command-line options.
+# To be sourced by another script (as in ". parse_options.sh").
+# Option format is: --option-name arg
+# and shell variable "option_name" gets set to value "arg."
+# The exception is --help, which takes no arguments, but prints the
+# $help_message variable (if defined).
+###
+### The --config file options have lower priority to command line
+### options, so we need to import them first...
+###
+# Now import all the configs specified by command-line, in left-to-right order
+for ((argpos=1; argpos<$#; argpos++)); do
+  if [ "${!argpos}" == "--config" ]; then
+    argpos_plus1=$((argpos+1))
+    config=${!argpos_plus1}
+    [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
+    . $config  # source the config file.
+  fi
+done
+###
+### Now we process the command line options
+###
+while true; do
+  [ -z "${1:-}" ] && break;  # break if there are no arguments
+  case "$1" in
+    # If the enclosing script is called with --help option, print the help
+    # message and exit.  Scripts should put help messages in $help_message
+    --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
+      else printf "$help_message\n" 1>&2 ; fi;
+      exit 0 ;;
+    --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
+      exit 1 ;;
+    # If the first command-line argument begins with "--" (e.g. --foo-bar),
+    # then work out the variable name as $name, which will equal "foo_bar".
+    --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
+      # Next we test whether the variable in question is undefined-- if so it's
+      # an invalid option and we die.  Note: $0 evaluates to the name of the
+      # enclosing script.
+      # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
+      # is undefined.  We then have to wrap this test inside "eval" because
+      # foo_bar is itself inside a variable ($name).
+      eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
+      # Every option needs a value after it.  Without this check a trailing
+      # "--name" would be assigned an empty string and "shift 2" below would
+      # fail without shifting, so the loop would see the same "$1" forever.
+      [ $# -lt 2 ] && echo "$0: option $1 requires an argument" 1>&2 && exit 1;
+      oldval="`eval echo \\$$name`";
+      # Work out whether we seem to be expecting a Boolean argument.
+      if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
+        was_bool=true;
+      else
+        was_bool=false;
+      fi
+      # Set the variable to the right value-- the escaped quotes make it work if
+      # the option had spaces, like --cmd "queue.pl -sync y"
+      eval $name=\"$2\";
+      # Check that Boolean-valued arguments are really Boolean.
+      if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
+        echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
+        exit 1;
+      fi
+      shift 2;
+      ;;
+  # Anything that does not start with "--" ends option parsing; the remaining
+  # positional arguments are left for the enclosing script.
+  *) break;
+  esac
+done
+# Check for an empty argument to the --cmd option, which can easily occur as a
+# result of scripting errors.
+[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
+true; # so this script returns exit code 0.
--- a/egs/libri_trans/mt/local/utils.sh
+++ b/egs/libri_trans/mt/local/utils.sh
+# Print a comma-separated list of GPU indices that look idle.
+#   $1 (gpu_num): number of GPUs to collect
+#   $2 (use_cpu): when 1, print -1 instead of waiting if too few GPUs are idle
+# Otherwise gpustat is polled once a minute until enough GPUs qualify.
+# The result is written to stdout, so call it through command substitution.
+get_devices(){
+    gpu_num=$1
+    use_cpu=$2
+    while :
+    do
+        # Start each poll from an empty list so it only reflects this snapshot.
+        device=()
+        record=$(mktemp -t temp.record.XXXXXX)
+        gpustat > $record
+        # NOTE(review): assumes gpustat prints one header line followed by one
+        # line per GPU, so dropping two lines and counting from 0 yields the
+        # GPU indices.
+        all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
+        count=0
+        for dev in ${all_devices[@]}
+        do
+            line=$((dev + 2))
+            # Third '|' field up to the '/': presumably memory in use — TODO confirm.
+            use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
+            if [[ $use -lt 100 ]]; then
+                device[$count]=$dev
+                count=$((count + 1))
+                if [[ $count -eq $gpu_num ]]; then
+                    break
+                fi
+            fi
+        done
+        # The snapshot has been consumed; delete it so polling does not leave
+        # one temporary file behind per iteration.
+        rm -f "${record}"
+        if [[ ${#device[@]} -lt $gpu_num ]]; then
+            if [[ $use_cpu -eq 1 ]]; then
+                # Fall back to the CPU marker and stop polling.  Without the
+                # break the loop restarted immediately (no sleep) and, for
+                # gpu_num > 1, could never satisfy the length check with (-1).
+                device=(-1)
+                break
+            else
+                sleep 60s
+            fi
+        else
+            break
+        fi
+    done
+    echo ${device[*]} | sed 's/ /,/g'
+    return $?
+}
--- a/egs/libri_trans/mt/run.sh
+++ b/egs/libri_trans/mt/run.sh
--- a/egs/libri_trans/mt/train.sh
+++ b/egs/libri_trans/mt/train.sh
+#! /bin/bash
+# Launch the training stage (stage 1 only) of run.sh with the settings below.
+
+# resources and batching
+gpu_num=1
+update_freq=1
+max_tokens=8192
+
+# experiment naming; empty values are not forwarded to run.sh
+exp_tag=baseline
+# exp full name
+exp_name=
+extra_tag=
+extra_parameter=
+#extra_tag="${extra_tag}"
+#extra_parameter="${extra_parameter} "
+
+# configuration names, handed to run.sh as one comma-separated list
+config_list=(base)
+train_config=$(echo ${config_list[*]} | tr ' ' ',')
+
+cmd="./run.sh --stage 1 --stop_stage 1"
+cmd="$cmd --gpu_num ${gpu_num} --update_freq ${update_freq}"
+cmd="$cmd --train_config ${train_config} --max_tokens ${max_tokens}"
+
+# optional settings, appended in this fixed order when they are non-empty
+for opt in exp_name exp_tag extra_tag; do
+    value=${!opt}
+    if [[ -n ${value} ]]; then
+        cmd="$cmd --${opt} ${value}"
+    fi
+done
+# extra_parameter may hold several words, so it travels inside escaped quotes
+if [[ -n ${extra_parameter} ]]; then
+    cmd="$cmd --extra_parameter \"${extra_parameter}\""
+fi
+
+echo ${cmd}
+eval ${cmd}
--- a/egs/libri_trans/st/binary.sh
+++ b/egs/libri_trans/st/binary.sh
+# Build and run the prep_st_data.py command that prepares the ST data.
+set -e
+# Set to 0 to print the command without running it.
+eval=1
+# 1 adds --lowercase-src and --rm-punc-src to the command.
+lcrm=1
+# 1 adds --tokenizer to the command.
+tokenizer=0
+root_dir=~/st/Fairseq-S2T
+data_dir=/home/xuchen/st/data/test
+vocab_dir=/home/xuchen/st/data/mustc/st_lcrm/en-de
+asr_vocab_prefix=spm_unigram10000_st_share
+st_vocab_prefix=spm_unigram10000_st_share
+src_lang=en
+tgt_lang=de
+subsets=(2019)
+# Join the subsets with commas, the separator asr/binary.sh uses for the same
+# --splits option, and keep the array separate from the joined string.
+# NOTE(review): confirm prep_st_data.py splits --splits on ','.
+splits=$(echo ${subsets[*]} | sed 's/ /,/g')
+# Copy the vocabulary files next to the data and drop any stale feature archive.
+cp -r ${vocab_dir}/${asr_vocab_prefix}.* ${data_dir}/${src_lang}-${tgt_lang}
+cp -r ${vocab_dir}/${st_vocab_prefix}.* ${data_dir}/${src_lang}-${tgt_lang}
+rm -rf ${data_dir}/${src_lang}-${tgt_lang}/fbank80.zip
+cmd="python ${root_dir}/examples/speech_to_text/prep_st_data.py
+    --data-root ${data_dir}
+    --output-root ${data_dir}
+    --splits ${splits}
+    --task st
+    --src-lang ${src_lang}
+    --tgt-lang ${tgt_lang}
+    --add-src
+    --share
+    --asr-prefix ${asr_vocab_prefix}
+    --st-spm-prefix ${st_vocab_prefix}
+    --cmvn-type utterance"
+    if [[ ${lcrm} -eq 1 ]]; then
+        cmd="$cmd
+    --lowercase-src
+    --rm-punc-src"
+    fi
+    if [[ ${tokenizer} -eq 1 ]]; then
+        cmd="$cmd
+    --tokenizer"
+    fi
+echo -e "\033[34mRun command: \n${cmd} \033[0m"
+# An explicit if keeps the exit status at 0 for a dry run (eval=0); the former
+# "[[ ... ]] && eval" form ended the script with a non-zero status in that case.
+if [[ $eval -eq 1 ]]; then
+    eval ${cmd}
+fi
--- a/egs/libri_trans/st/conf/base.yaml
+++ b/egs/libri_trans/st/conf/base.yaml
+arch: s2t_transformer_s
+share-decoder-input-output-embed: True
+optimizer: adam
+clip-norm: 10.0
+lr-scheduler: inverse_sqrt
+warmup-init-lr: 1e-7
+warmup-updates: 10000
+lr: 2e-3
+#adam_betas: (0.9,0.98)
+criterion: label_smoothed_cross_entropy_with_ctc
+label_smoothing: 0.1
+conv-kernel-sizes: 5,5
+conv-channels: 1024
+dropout: 0.1
+activation-fn: relu
+encoder-embed-dim: 256
+encoder-ffn-embed-dim: 2048
+encoder-layers: 12
+decoder-layers: 6
+encoder-attention-heads: 4
+decoder-embed-dim: 256
+decoder-ffn-embed-dim: 2048
+decoder-attention-heads: 4
+attention-dropout: 0.1
+activation-dropout: 0.1
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
\ No newline at end of file
--- a/egs/libri_trans/st/conf/basis.yaml
+++ b/egs/libri_trans/st/conf/basis.yaml
+train-subset: train
+valid-subset: dev
+max-epoch: 100
+max-update: 100000
+patience: 20
+best_checkpoint_metric: loss
+maximize_best_checkpoint_metric: False
+no-epoch-checkpoints: True
+#keep-last-epochs: 10
+keep-best-checkpoints: 10
+num-workers: 8
+no-progress-bar: True
+log-interval: 100
+seed: 1
+report-accuracy: True
+skip-invalid-size-inputs-valid-test: True
\ No newline at end of file
--- a/egs/libri_trans/st/conf/conformer.yaml
+++ b/egs/libri_trans/st/conf/conformer.yaml
+macaron-style: True
+use-cnn-module: True
+cnn-module-kernel: 31
--- a/egs/libri_trans/st/conf/ctc.yaml
+++ b/egs/libri_trans/st/conf/ctc.yaml
+ctc-weight: 0.3
+post-process: sentencepiece
\ No newline at end of file
--- a/egs/libri_trans/st/conf/dlcl.yaml
+++ b/egs/libri_trans/st/conf/dlcl.yaml
+use-enc-dlcl: True
+use-dec-dlcl: True
--- a/egs/libri_trans/st/conf/local_attn.yaml
+++ b/egs/libri_trans/st/conf/local_attn.yaml
+encoder-attention-type: local
+hard-mask-window: 0
+gauss-mask-sigma: 3
+init-mask-weight: 0
\ No newline at end of file
--- a/egs/libri_trans/st/conf/pds_base.yaml
+++ b/egs/libri_trans/st/conf/pds_base.yaml
+arch: pdss2t_transformer_s_8
+pds-fusion: True
+ctc-layer: 12
+share-decoder-input-output-embed: True
+optimizer: adam
+clip-norm: 10.0
+lr-scheduler: inverse_sqrt
+warmup-init-lr: 1e-7
+warmup-updates: 10000
+lr: 2e-3
+#adam_betas: (0.9,0.98)
+criterion: label_smoothed_cross_entropy_with_ctc
+label_smoothing: 0.1
+dropout: 0.1
+activation-fn: relu
+encoder-ffn-embed-dim: 2048
+encoder-layers: 12
+decoder-layers: 6
+encoder-attention-heads: 4
+decoder-embed-dim: 256
+decoder-ffn-embed-dim: 2048
+decoder-attention-heads: 4
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
\ No newline at end of file
--- a/egs/libri_trans/st/conf/pds_base_16.yaml
+++ b/egs/libri_trans/st/conf/pds_base_16.yaml
+arch: pdss2t_transformer_s_16
+encoder-embed-dim: 256
+pds-stages: 4
+ctc-layer: 12
+pds-layers: 2_2_6_2
+pds-ratios: 2_2_2_2
+pds-fusion: True
+pds-fusion-method: all_conv
+pds-embed-dims: 256_256_256_256
+pds-ds-method: conv
+pds-embed-norm: True
+pds-position-embed: 1_1_1_1
+pds-kernel-sizes: 5_5_5_5
+pds-ffn-ratios: 8_8_8_8
+pds-attn-heads: 4_4_4_4
+share-decoder-input-output-embed: True
+optimizer: adam
+clip-norm: 10.0
+lr-scheduler: inverse_sqrt
+warmup-init-lr: 1e-7
+warmup-updates: 10000
+lr: 2e-3
+#adam_betas: (0.9,0.98)
+criterion: label_smoothed_cross_entropy_with_ctc
+label_smoothing: 0.1
+dropout: 0.1
+activation-fn: relu
+encoder-ffn-embed-dim: 2048
+encoder-layers: 12
+decoder-layers: 6
+encoder-attention-heads: 4
+decoder-embed-dim: 256
+decoder-ffn-embed-dim: 2048
+decoder-attention-heads: 4
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
\ No newline at end of file
--- a/egs/libri_trans/st/conf/pds_base_32.yaml
+++ b/egs/libri_trans/st/conf/pds_base_32.yaml
+arch: pdss2t_transformer_s_32
+encoder-embed-dim: 256
+pds-stages: 5
+ctc-layer: 12
+pds-layers: 2_2_3_3_2
+pds-ratios: 2_2_2_2_2
+pds-fusion: True
+pds-fusion-method: all_conv
+pds-embed-dims: 256_256_256_256_256
+pds-ds-method: conv
+pds-embed-norm: True
+pds-position-embed: 1_1_1_1_1
+pds-kernel-sizes: 5_5_5_5_5
+pds-ffn-ratios: 8_8_8_8_8
+pds-attn-heads: 4_4_4_4_4
+share-decoder-input-output-embed: True
+optimizer: adam
+clip-norm: 10.0
+lr-scheduler: inverse_sqrt
+warmup-init-lr: 1e-7
+warmup-updates: 10000
+lr: 2e-3
+#adam_betas: (0.9,0.98)
+criterion: label_smoothed_cross_entropy_with_ctc
+label_smoothing: 0.1
+dropout: 0.1
+activation-fn: relu
+encoder-ffn-embed-dim: 2048
+encoder-layers: 12
+decoder-layers: 6
+encoder-attention-heads: 4
+decoder-embed-dim: 256
+decoder-ffn-embed-dim: 2048
+decoder-attention-heads: 4
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
\ No newline at end of file
--- a/egs/libri_trans/st/conf/pds_base_8.yaml
+++ b/egs/libri_trans/st/conf/pds_base_8.yaml
+arch: pdss2t_transformer_s_8
+encoder-embed-dim: 256
+pds-stages: 4
+ctc-layer: 12
+pds-layers: 3_3_3_3
+pds-ratios: 2_2_1_2
+pds-fusion: True
+pds-fusion-method: all_conv
+pds-embed-dims: 256_256_256_256
+pds-ds-method: conv
+pds-embed-norm: True
+pds-position-embed: 1_1_1_1
+pds-kernel-sizes: 5_5_5_5
+pds-ffn-ratios: 8_8_8_8
+pds-attn-heads: 4_4_4_4
+share-decoder-input-output-embed: True
+optimizer: adam
+clip-norm: 10.0
+lr-scheduler: inverse_sqrt
+warmup-init-lr: 1e-7
+warmup-updates: 10000
+lr: 2e-3
+#adam_betas: (0.9,0.98)
+criterion: label_smoothed_cross_entropy_with_ctc
+label_smoothing: 0.1
+dropout: 0.1
+activation-fn: relu
+encoder-ffn-embed-dim: 2048
+encoder-layers: 12
+decoder-layers: 6
+encoder-attention-heads: 4
+decoder-embed-dim: 256
+decoder-ffn-embed-dim: 2048
+decoder-attention-heads: 4
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
\ No newline at end of file
--- a/egs/libri_trans/st/conf/rpr.yaml
+++ b/egs/libri_trans/st/conf/rpr.yaml
+encoder-attention-type: rel_selfattn
+#encoder-attention-type: relative
+#decoder-attention-type: relative
+#max-encoder-relative-length: 100
+#max-decoder-relative-length: 20
--- a/egs/libri_trans/st/conf/sate_ctc.yaml
+++ b/egs/libri_trans/st/conf/sate_ctc.yaml
+arch: s2t_sate
+share-decoder-input-output-embed: True
+optimizer: adam
+clip-norm: 10.0
+lr-scheduler: inverse_sqrt
+warmup-init-lr: 1e-7
+warmup-updates: 10000
+lr: 2e-3
+#adam_betas: (0.9,0.98)
+ctc-weight: 0.3
+criterion: label_smoothed_cross_entropy_with_ctc
+label_smoothing: 0.1
+encoder-normalize-before: True
+decoder-normalize-before: True
+conv-kernel-sizes: 5,5
+conv-channels: 1024
+dropout: 0.1
+activation-fn: relu
+encoder-embed-dim: 256
+encoder-ffn-embed-dim: 2048
+encoder-layers: 12
+text-encoder-layers: 6
+decoder-layers: 6
+encoder-attention-heads: 4
+#load-pretrained-encoder-from:
+#load-pretrained-acoustic-encoder-from:
+#load-pretrained-text-encoder-from:
+#load-pretrained-decoder-from:
+#macaron-style: True
+#use-cnn-module: True
+#cnn-module-kernel: 31
+#acoustic-encoder: pds
+acoustic-encoder: transformer
+adapter: league
+# encoder-embed-dim is set once above; the second, identical entry that used to
+# sit here was a duplicate mapping key and has been removed.
+pds-stages: 4
+#pds-dropout: 0
+pds-layers: 3_3_3_3
+pds-ratios: 2_2_1_2
+pds-fusion: True
+pds-fusion-method: all_conv
+pds-embed-dims: 256_256_256_256
+pds-ds-method: conv
+pds-embed-norm: True
+pds-position-embed: 1_1_1_1
+pds-kernel-sizes: 5_5_5_5
+pds-ffn-ratios: 8_8_8_8
+pds-attn-heads: 4_4_4_4
+decoder-embed-dim: 256
+decoder-ffn-embed-dim: 2048
+decoder-attention-heads: 4
\ No newline at end of file
--- a/egs/libri_trans/st/decode.sh
+++ b/egs/libri_trans/st/decode.sh
+#! /bin/bash
+# Run the decoding stage (stage 2 only) of run.sh.
+# Usage: decode.sh [exp_name]
+gpu_num=1
+data_dir=
+test_subset=(dev tst-COMMON)
+# Experiment name; taken from the single command-line argument when given.
+exp_name=
+if [ "$#" -eq 1 ]; then
+    exp_name=$1
+fi
+sacrebleu=1
+n_average=10
+beam_size=5
+len_penalty=1.0
+max_tokens=80000
+dec_model=checkpoint_best.pt
+cmd="./run.sh
+    --stage 2
+    --stop_stage 2
+    --gpu_num ${gpu_num}
+    --sacrebleu ${sacrebleu}
+    --n_average ${n_average}
+    --beam_size ${beam_size}
+    --len_penalty ${len_penalty}
+    --max_tokens ${max_tokens}
+    --dec_model ${dec_model}
+    "
+# Forward --exp_name only when it has a value.  Emitting "--exp_name" with
+# nothing after it makes a "--name value" parser (such as
+# local/parse_options.sh) take the following option name as the value and stop
+# parsing at that option's own value, dropping every later option.
+if [[ -n ${exp_name} ]]; then
+    cmd="$cmd --exp_name ${exp_name}"
+fi
+if [[ -n ${data_dir} ]]; then
+    cmd="$cmd --data_dir ${data_dir}"
+fi
+# Pass the test subsets as one comma-separated list.
+if [[ ${#test_subset[@]} -ne 0 ]]; then
+    subsets=$(echo ${test_subset[*]} | sed 's/ /,/g')
+    cmd="$cmd --test_subset ${subsets}"
+fi
+echo $cmd
+eval $cmd
--- a/egs/libri_trans/st/ensemble.sh
+++ b/egs/libri_trans/st/ensemble.sh
+# Decode test subsets with an ensemble of checkpoints via fairseq generate.py.
+# Positional arguments:
+#   $1  text file listing model directory names (relative to $ckpt)
+#   $2  set name; selects the data directory and the result sub-directory
+#   $3  test subset name(s)
+set -e
+gpu_num=1
+root_dir=/home/xuchen/st/Fairseq-S2T
+ckpt=/home/xuchen/st/checkpoints/mustc-v2/st
+model_txt=$1
+set=$2
+test_subset=$3
+#data_dir=/home/xuchen/st/data/mustc-v2/st_lcrm/en-de
+#test_subset=(tst-COMMON)
+data_dir=/media/data/tst/$set/en-de
+#test_subset=(office)
+#test_subset=(webrtc1)
+#test_subset=(adap2)
+data_config=config_st_share.yaml
+result_file=./result
+beam_size=5
+lenpen=0.6
+max_tokens=10000
+# Resolve one checkpoint per entry of the model list: the averaged checkpoint
+# when it exists, otherwise checkpoint_best.pt.  A missing directory or
+# checkpoint prints its path and aborts with status 1.
+models=()
+i=0
+for line in `cat $model_txt`; do
+    i=`expr $i + 1`
+    model_dir=$ckpt/$line
+    [[ ! -d $model_dir ]] && echo $model_dir && exit 1;
+    if [[ -f $model_dir/avg_10_checkpoint.pt ]]; then
+        model=$model_dir/avg_10_checkpoint.pt
+    else
+        model=$model_dir/checkpoint_best.pt
+    fi
+    [[ ! -f $model ]] && echo $model && exit 1;
+    models[$i]=$model
+done
+# Join the checkpoint paths with ':' for the --path option below.
+models=`echo ${models[*]} | sed 's/ /:/g'`
+# Use the first numbered sub-directory of $ckpt/ensemble/$set that does not
+# exist yet, so earlier results are never overwritten.
+res_dir=$ckpt/ensemble/$set
+i=0
+while : 
+do
+    if [[ -d $res_dir/$i ]]; then
+        i=`expr $i + 1`
+    else
+        res_dir=$res_dir/$i
+        break
+    fi 
+done
+mkdir -p $res_dir
+# Keep a copy of the model list next to the results.
+cp $model_txt $res_dir
+# Pick devices only when none were supplied through $device: none for
+# gpu_num=0, otherwise whatever get_devices from local/utils.sh prints.
+if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
+    if [[ ${gpu_num} -eq 0 ]]; then
+        device=()
+    else
+        source ./local/utils.sh
+        device=$(get_devices $gpu_num 0)
+    fi
+fi
+export CUDA_VISIBLE_DEVICES=${device}
+for subset in ${test_subset[@]}; do
+    # The generation subset name is the test subset with an "_st" suffix.
+    subset=${subset}_st
+    cmd="python ${root_dir}/fairseq_cli/generate.py
+    ${data_dir}
+    --config-yaml ${data_config}
+    --gen-subset ${subset}
+    --task speech_to_text
+    --path ${models}
+    --results-path ${res_dir}
+    --skip-invalid-size-inputs-valid-test
+    --max-tokens ${max_tokens}
+    --beam ${beam_size}
+    --lenpen ${lenpen}
+    --scoring sacrebleu"
+    echo -e "\033[34mRun command: \n${cmd} \033[0m"
+    eval $cmd
+    # Show the last line of the generation log.
+    tail -n 1 ${res_dir}/generate-${subset}.txt
+    # NOTE(review): evaluate.sh is not defined in this script; it is presumably
+    # expected on PATH and run from inside the result directory — confirm.
+    cd $res_dir
+    evaluate.sh translation-${subset}.txt $set
+    cd -
+done
--- a/egs/libri_trans/st/local/monitor.sh
+++ b/egs/libri_trans/st/local/monitor.sh
+# Wait until enough GPUs look idle, then run the training command once and exit.
+# Number of idle GPUs required before the command is started.
+gpu_num=4
+# train.sh declares bash and uses arrays and [[ ]], so it is started with bash
+# rather than sh (which is not bash on every system).
+cmd="bash train.sh"
+while :
+do
+    # Start each poll from an empty list so it only reflects the current snapshot.
+    device=()
+    record=$(mktemp -t temp.record.XXXXXX)
+    gpustat > $record
+    # NOTE(review): assumes gpustat prints one header line followed by one line
+    # per GPU, so dropping two lines and counting from 0 yields the GPU indices.
+    all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
+    count=0
+    for dev in ${all_devices[@]}
+    do
+        line=$((dev + 2))
+        # Third '|' field up to the '/': presumably the memory in use — TODO confirm.
+        use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
+        if [[ $use -lt 100 ]]; then
+            device[$count]=$dev
+            count=$((count + 1))
+            if [[ $count -eq $gpu_num ]]; then
+                break
+            fi
+        fi
+    done
+    # The snapshot has been consumed; delete it so polling does not leave one
+    # temporary file behind per iteration.
+    rm -f "${record}"
+    if [[ ${#device[@]} -lt $gpu_num ]]; then
+        sleep 60s
+    else
+        echo "Run $cmd"
+        eval $cmd
+        sleep 10s
+        exit
+    fi
+done
--- a/egs/libri_trans/st/local/parse_options.sh
+++ b/egs/libri_trans/st/local/parse_options.sh
+#!/usr/bin/env bash
+# Copyright 2012  Johns Hopkins University (Author: Daniel Povey);
+#                 Arnab Ghoshal, Karel Vesely
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+# Parse command-line options.
+# To be sourced by another script (as in ". parse_options.sh").
+# Option format is: --option-name arg
+# and shell variable "option_name" gets set to value "arg."
+# The exception is --help, which takes no arguments, but prints the
+# $help_message variable (if defined).
+###
+### The --config file options have lower priority to command line
+### options, so we need to import them first...
+###
+# Now import all the configs specified by command-line, in left-to-right order
+for ((argpos=1; argpos<$#; argpos++)); do
+  if [ "${!argpos}" == "--config" ]; then
+    argpos_plus1=$((argpos+1))
+    config=${!argpos_plus1}
+    [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
+    . $config  # source the config file.
+  fi
+done
+###
+### Now we process the command line options
+###
+while true; do
+  [ -z "${1:-}" ] && break;  # break if there are no arguments
+  case "$1" in
+    # If the enclosing script is called with --help option, print the help
+    # message and exit.  Scripts should put help messages in $help_message
+    --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
+      else printf "$help_message\n" 1>&2 ; fi;
+      exit 0 ;;
+    --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
+      exit 1 ;;
+    # If the first command-line argument begins with "--" (e.g. --foo-bar),
+    # then work out the variable name as $name, which will equal "foo_bar".
+    --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
+      # Next we test whether the variable in question is undefined-- if so it's
+      # an invalid option and we die.  Note: $0 evaluates to the name of the
+      # enclosing script.
+      # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
+      # is undefined.  We then have to wrap this test inside "eval" because
+      # foo_bar is itself inside a variable ($name).
+      eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
+      # Every option needs a value after it.  Without this check a trailing
+      # "--name" would be assigned an empty string and "shift 2" below would
+      # fail without shifting, so the loop would see the same "$1" forever.
+      [ $# -lt 2 ] && echo "$0: option $1 requires an argument" 1>&2 && exit 1;
+      oldval="`eval echo \\$$name`";
+      # Work out whether we seem to be expecting a Boolean argument.
+      if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
+        was_bool=true;
+      else
+        was_bool=false;
+      fi
+      # Set the variable to the right value-- the escaped quotes make it work if
+      # the option had spaces, like --cmd "queue.pl -sync y"
+      eval $name=\"$2\";
+      # Check that Boolean-valued arguments are really Boolean.
+      if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
+        echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
+        exit 1;
+      fi
+      shift 2;
+      ;;
+  # Anything that does not start with "--" ends option parsing; the remaining
+  # positional arguments are left for the enclosing script.
+  *) break;
+  esac
+done
+# Check for an empty argument to the --cmd option, which can easily occur as a
+# result of scripting errors.
+[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
+true; # so this script returns exit code 0.
--- a/egs/libri_trans/st/local/utils.sh
+++ b/egs/libri_trans/st/local/utils.sh
+# Print a comma-separated list of GPU indices that look idle.
+#   $1 (gpu_num): number of GPUs to collect
+#   $2 (use_cpu): when 1, print -1 instead of waiting if too few GPUs are idle
+# Otherwise gpustat is polled once a minute until enough GPUs qualify.
+# The result is written to stdout, so call it through command substitution.
+get_devices(){
+    gpu_num=$1
+    use_cpu=$2
+    while :
+    do
+        # Start each poll from an empty list so it only reflects this snapshot.
+        device=()
+        record=$(mktemp -t temp.record.XXXXXX)
+        gpustat > $record
+        # NOTE(review): assumes gpustat prints one header line followed by one
+        # line per GPU, so dropping two lines and counting from 0 yields the
+        # GPU indices.
+        all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
+        count=0
+        for dev in ${all_devices[@]}
+        do
+            line=$((dev + 2))
+            # Third '|' field up to the '/': presumably memory in use — TODO confirm.
+            use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
+            if [[ $use -lt 100 ]]; then
+                device[$count]=$dev
+                count=$((count + 1))
+                if [[ $count -eq $gpu_num ]]; then
+                    break
+                fi
+            fi
+        done
+        # The snapshot has been consumed; delete it so polling does not leave
+        # one temporary file behind per iteration.
+        rm -f "${record}"
+        if [[ ${#device[@]} -lt $gpu_num ]]; then
+            if [[ $use_cpu -eq 1 ]]; then
+                # Fall back to the CPU marker and stop polling.  Without the
+                # break the loop restarted immediately (no sleep) and, for
+                # gpu_num > 1, could never satisfy the length check with (-1).
+                device=(-1)
+                break
+            else
+                sleep 60s
+            fi
+        else
+            break
+        fi
+    done
+    echo ${device[*]} | sed 's/ /,/g'
+    return $?
+}
--- a/egs/libri_trans/st/run.sh
+++ b/egs/libri_trans/st/run.sh
--- a/egs/libri_trans/st/train.sh
+++ b/egs/libri_trans/st/train.sh
+#! /bin/bash
+# Launch the training stage (stage 1 only) of run.sh with the settings below.
+
+# resources and batching
+gpu_num=8
+update_freq=1
+max_tokens=40000
+
+# experiment naming; empty values are not forwarded to run.sh
+extra_tag=
+extra_parameter=
+#extra_tag="${extra_tag}"
+#extra_parameter="${extra_parameter} "
+exp_tag=
+# exp full name
+exp_name=
+
+# configuration names, handed to run.sh as one comma-separated list
+#config_list=(base)
+config_list=(ctc)
+#config_list=(sate_ctc)
+#config_list=(ctc conformer rpr)
+#config_list=(base sate)
+#config_list=(pds_base)
+#config_list=(pds_base conformer)
+train_config=$(echo ${config_list[*]} | tr ' ' ',')
+
+cmd="./run.sh --stage 1 --stop_stage 1"
+cmd="$cmd --gpu_num ${gpu_num} --update_freq ${update_freq}"
+cmd="$cmd --train_config ${train_config} --max_tokens ${max_tokens}"
+
+# optional settings, appended in this fixed order when they are non-empty
+for opt in exp_name exp_tag extra_tag; do
+    value=${!opt}
+    if [[ -n ${value} ]]; then
+        cmd="$cmd --${opt} ${value}"
+    fi
+done
+# extra_parameter may hold several words, so it travels inside escaped quotes
+if [[ -n ${extra_parameter} ]]; then
+    cmd="$cmd --extra_parameter \"${extra_parameter}\""
+fi
+
+echo ${cmd}
+eval ${cmd}
--- a/egs/mustc/asr/run.sh
+++ b/egs/mustc/asr/run.sh
@@ -62,7 +62,7 @@ exp_tag=baseline
 exp_name=
 # config
-train_config=ctc
+train_config=base
 data_config=config.yaml
 # training setting

--- a/fairseq/criterions/label_smoothed_cross_entropy_with_ctc.py
+++ b/fairseq/criterions/label_smoothed_cross_entropy_with_ctc.py
@@ -107,6 +107,9 @@ class LabelSmoothedCrossEntropyCriterionWithCTC(
        ).contiguous()  # (T, B, C) from the encoder
        lprobs.batch_first = False
+        if "ctc_padding_mask" in encoder_out:
+            non_padding_mask = ~encoder_out["ctc_padding_mask"][0]
+        else:
            non_padding_mask = ~encoder_out["encoder_padding_mask"][0]
        input_lengths = non_padding_mask.long().sum(-1)

--- a/fairseq/models/speech_to_text/ctc.py
+++ b/fairseq/models/speech_to_text/ctc.py
@@ -4,6 +4,7 @@ import logging
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 from fairseq.modules import (
    FairseqDropout,
    LayerNorm,
@@ -50,3 +51,46 @@ class CTC(nn.Module):
    def argmax(ctc_logit):
        return torch.argmax(ctc_logit, dim=-1)
+class CTCCompressStrategy:
+    """Weight matrices that merge runs of identical CTC predictions.
+
+    All strategies share one signature:
+
+    prob_ctc: tensor indexed as [batch, time, class].
+    predicted: for each batch item, a list of (class index, run length)
+        entries describing consecutive frames with the same prediction.
+    new_lengths: number of runs of each batch item.
+    dtype, device: dtype and device of the returned matrix.
+
+    Each strategy returns a tensor of shape (batch, time, max(new_lengths)).
+    Column t of a batch item holds the weights of the frames forming run t and
+    is zero everywhere else, so every non-empty column sums to one.
+    """
+
+    @staticmethod
+    def avg(prob_ctc, predicted, new_lengths, dtype, device):
+        """Give every frame of a run the same weight, 1 / run length."""
+        new_maxlen = max(new_lengths)
+        # Allocated without a device argument (i.e. on the default device) and
+        # moved to `device` only once it has been filled.
+        weights_matrix = torch.zeros((prob_ctc.shape[0], prob_ctc.shape[1], new_maxlen), dtype=dtype)
+        for b_idx, pred in enumerate(predicted):
+            processed_inputs_cnt = 0
+            for t_idx, same in enumerate(pred):
+                new_processed_inputs_cnt = processed_inputs_cnt + same[1]
+                weights_matrix[b_idx, processed_inputs_cnt:new_processed_inputs_cnt, t_idx] = 1.0 / same[1]
+                processed_inputs_cnt = new_processed_inputs_cnt
+        return weights_matrix.to(device)
+
+    @staticmethod
+    def weighted(prob_ctc, predicted, new_lengths, dtype, device):
+        """Weight each frame of a run by its value for the predicted class."""
+        new_maxlen = max(new_lengths)
+        weights_matrix = torch.zeros((prob_ctc.shape[0], prob_ctc.shape[1], new_maxlen), dtype=dtype, device=device)
+        for b_idx, pred in enumerate(predicted):
+            processed_inputs_cnt = 0
+            for t_idx, same in enumerate(pred):
+                new_processed_inputs_cnt = processed_inputs_cnt + same[1]
+                # Get the probabilities of the prediction for the different time steps as weight
+                weights = prob_ctc[b_idx, processed_inputs_cnt:new_processed_inputs_cnt, same[0]]
+                # Normalize so the weights of one run sum to one.
+                weights_matrix[b_idx, processed_inputs_cnt:new_processed_inputs_cnt, t_idx] = \
+                    weights / weights.sum()
+                processed_inputs_cnt = new_processed_inputs_cnt
+        return weights_matrix
+
+    @staticmethod
+    def softmax(prob_ctc, predicted, new_lengths, dtype, device):
+        """Weight each frame of a run by a softmax over the run's values."""
+        new_maxlen = max(new_lengths)
+        weights_matrix = torch.zeros((prob_ctc.shape[0], prob_ctc.shape[1], new_maxlen), dtype=dtype, device=device)
+        for b_idx, pred in enumerate(predicted):
+            processed_inputs_cnt = 0
+            for t_idx, same in enumerate(pred):
+                new_processed_inputs_cnt = processed_inputs_cnt + same[1]
+                # The slice is one-dimensional (the frames of this run), so the
+                # softmax runs over its only axis.  Naming the axis avoids the
+                # deprecated implicit-dimension call and its warning.
+                weights = F.softmax(
+                    prob_ctc[b_idx, processed_inputs_cnt:new_processed_inputs_cnt, same[0]],
+                    dim=-1,
+                )
+                weights_matrix[b_idx, processed_inputs_cnt:new_processed_inputs_cnt, t_idx] = \
+                    weights / weights.sum()
+                processed_inputs_cnt = new_processed_inputs_cnt
+        return weights_matrix
--- a/fairseq/models/speech_to_text/s2t_sate.py
+++ b/fairseq/models/speech_to_text/s2t_sate.py
@@ -19,6 +19,7 @@ from fairseq.models.speech_to_text import (
    S2TConformerEncoder,
    PDSS2TTransformerModel,
    PDSS2TTransformerEncoder,
+    CTCCompressStrategy
 )
 from fairseq.models.speech_to_text.s2t_transformer import Conv1dSubsampler
 from fairseq.modules import (
@@ -68,6 +69,12 @@ class S2TSATEModel(S2TTransformerModel):
            help="adapter type",
        )
        parser.add_argument(
+            "--ctc-compress-strategy",
+            default="avg",
+            type=str,
+            help="compress strategy, such as avg, weighted, and softmax",
+        )
+        parser.add_argument(
            "--share-ctc-and-adapter",
            default=False,
            action="store_true",
@@ -143,7 +150,6 @@ class Adapter(nn.Module):
        embed_dim = args.encoder_embed_dim
        self.adapter_type = args.adapter
        if self.adapter_type in ["linear", "league", "gated_league", "gated_league2"]:
            self.linear_adapter = nn.Sequential(
                nn.Linear(embed_dim, embed_dim),
@@ -175,10 +181,14 @@ class Adapter(nn.Module):
            self.gate_linear1 = nn.Linear(embed_dim, embed_dim)
            self.gate_linear2 = nn.Linear(embed_dim, embed_dim)
+        if self.adapter_type == "shrink":
+            self.ctc_compress_method = getattr(CTCCompressStrategy, args.ctc_compress_strategy)
    def forward(self, x, padding):
        representation, distribution = x
        batch, seq_len, embed_dim = representation.size()
+        org_distribution = distribution
        if distribution is not None:
            distribution = distribution.view(-1, distribution.size(-1))
        lengths = (~padding).long().sum(-1)
@@ -208,6 +218,27 @@ class Adapter(nn.Module):
        elif self.adapter_type == "none":
            out = representation
+        elif self.adapter_type == "shrink":
+            from itertools import groupby
+            with torch.no_grad():
+                batch_predicted = []
+                prob_ctc = org_distribution.transpose(0, 1)  # T x B x D -> B x T x D
+                for b in range(prob_ctc.shape[0]):
+                    predicted = prob_ctc[b][: lengths[b]].argmax(-1).tolist()
+                    batch_predicted.append([(p[0], len(list(p[1]))) for p in groupby(predicted)])
+                new_lengths = [len(p) for p in batch_predicted]
+                weights_matrix = self.ctc_compress_method(prob_ctc, batch_predicted, new_lengths,
+                                                          representation.dtype, representation.device)
+            # x is T x B x C -> B x C x T; weights_matrix is B x T x T'
+            compressed_output = representation.permute(1, 2, 0).bmm(weights_matrix)  # B x C x T'
+            out = compressed_output.permute(2, 0, 1)
+            out_lengths = lengths.new(new_lengths)
+            padding = lengths_to_padding_mask(out_lengths)
        else:
            out = None
            logging.error("Unsupported adapter type: {}.".format(self.adapter_type))
@@ -290,13 +321,6 @@ class S2TSATEEncoder(FairseqEncoder):
        if args.share_ctc_and_adapter and hasattr(self.adapter, "embed_adapter"):
            self.acoustic_encoder.ctc.ctc_projection.weight = self.adapter.embed_adapter.weight
-        # self.length_adapter = Conv1dSubsampler(
-        #     args.encoder_embed_dim,
-        #     args.conv_channels,
-        #     args.encoder_embed_dim,
-        #     [int(k) for k in args.conv_kernel_sizes.split(",")],
-        # )
        acoustic_encoder_attention_type = args.encoder_attention_type
        args.encoder_attention_type = args.text_attention_type
@@ -321,6 +345,7 @@ class S2TSATEEncoder(FairseqEncoder):
        encoder_out = acoustic_encoder_out["encoder_out"][0]
        encoder_padding_mask = acoustic_encoder_out["encoder_padding_mask"][0]
+        ctc_padding_mask = encoder_padding_mask
        if "ctc_logit" in acoustic_encoder_out and len(acoustic_encoder_out["ctc_logit"]) > 0:
            ctc_logit = acoustic_encoder_out["ctc_logit"][0]
@@ -343,16 +368,12 @@ class S2TSATEEncoder(FairseqEncoder):
            self.history.add(x)
-        # src_lengths = (~encoder_padding_mask).sum(1)
-        # x = x.transpose(0, 1)
-        # x, input_lengths = self.length_adapter(x, src_lengths)
-        # encoder_padding_mask = lengths_to_padding_mask(input_lengths)
        x = self.text_encoder(x, encoder_padding_mask, self.history)
        return {
            "encoder_out": [x],  # T x B x C
            "ctc_logit": [ctc_logit],    # T x B x C
+            "ctc_padding_mask": [ctc_padding_mask], # B x T
            "encoder_padding_mask": [encoder_padding_mask],  # B x T
            "encoder_embedding": [],  # B x T x C
            "encoder_states": [],  # List[T x B x C]
@@ -452,6 +473,7 @@ def base_architecture(args):
    # SATE
    args.acoustic_encoder = getattr(args, "acoustic_encoder", "transformer")
    args.adapter = getattr(args, "adapter", "league")
+    args.ctc_compress_strategy = getattr(args, "ctc_compress_strategy", "avg")
    args.temperature = getattr(args, "temperature", 1.0)
    args.text_encoder_layers = getattr(args, "text_encoder_layers", 6)
    args.text_attention_type = getattr(args, "text_attention_type", "selfattn")