Optimize the shell scripts for IWSLT 2022 En-Zh and implement the Efficient Conformer method

0bd92062 · xuchen · 55702466 · 0bd92062 · 0bd92062 · 0bd92062
Commit 0bd92062 authored Feb 28, 2022 by xuchen
--- a/egs/aishell/asr/run.sh
+++ b/egs/aishell/asr/run.sh
@@ -129,8 +129,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    if [[ ${speed_perturb} -eq 1 ]]; then
        feature_zip=fbank80_sp.zip
    fi
-    if [[ ! -f ${data_dir}/${feature_zip} && -f ${data_dir}/../feature_zip ]]; then
-        ln -s ${data_dir}/../feature_zip ${data_dir}
+    if [[ ! -f ${data_dir}/${feature_zip} && -f ${data_dir}/../${feature_zip} ]]; then
+        ln -s ${data_dir}/../${feature_zip} ${data_dir}
    fi

    cmd="python ${code_dir}/examples/speech_to_text/prep_audio_data.py

--- a/egs/iwslt2022/asr/binary.sh
+++ b/egs/iwslt2022/asr/binary.sh
@@ -23,10 +23,10 @@ asr_vocab_prefix=spm_unigram10000_st_share

 src_lang=en
 tgt_lang=zh
-subsets=(train_covost)
+subsets=(train_covost train_eu train_iwslt train_mustc_ende train_voxpopuil train_mustc_enzh dev tst-COMMON)

 mkdir -p $data_dir
-splits=$(echo ${subsets[*]} | sed 's/ /_/g')
+splits=$(echo ${subsets[*]} | sed 's/ /,/g')
 cmd="python ${code_dir}/examples/speech_to_text/prep_audio_data.py
    --data-root ${org_data_dir}
    --output-root ${data_dir}

--- a/egs/iwslt2022/asr/conf/basis.yaml
+++ b/egs/iwslt2022/asr/conf/basis.yaml
-train-subset: train
+#train-subset: train_covost,train_eu,train_iwslt,train_mustc_ende,train_voxpopuil,train_mustc_enzh
+train-subset: train_mustc_enzh
 valid-subset: dev

 max-epoch: 100
-max-update: 100000
+max-update: 1000000
 patience: 20
 best_checkpoint_metric: loss
 maximize_best_checkpoint_metric: False

--- a/egs/iwslt2022/asr/conf/pds_base_8.yaml
+++ b/egs/iwslt2022/asr/conf/pds_base_8.yaml
@@ -2,7 +2,7 @@ arch: pdss2t_transformer_s_8

 encoder-embed-dim: 256
 pds-stages: 4
-ctc-layer: 12
+#ctc-layer: 12
 pds-layers: 3_3_3_3
 pds-ratios: 2_2_1_2
 pds-fusion: True

--- a/egs/iwslt2022/asr/run.sh
+++ b/egs/iwslt2022/asr/run.sh
@@ -30,10 +30,10 @@ pwd_dir=$PWD

 # dataset
 src_lang=en
-tgt_lang=de
+tgt_lang=zh
 lang=${src_lang}-${tgt_lang}

-dataset=mustc
+dataset=iwslt2022
 task=speech_to_text
 vocab_type=unigram
 vocab_size=5000
@@ -42,7 +42,7 @@ lcrm=0
 tokenizer=0
 use_raw_audio=0

-use_specific_dict=1
+use_specific_dict=0
 specific_prefix=st
 specific_dir=${root_dir}/data/mustc/st
 asr_vocab_prefix=spm_unigram10000_st_share
@@ -125,8 +125,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    if [[ ${speed_perturb} -eq 1 ]]; then
        feature_zip=fbank80_sp.zip
    fi
-    if [[ ! -f ${data_dir}/${feature_zip} && -f ${data_dir}/../feature_zip ]]; then
-        ln -s ${data_dir}/../feature_zip ${data_dir}
+    if [[ ! -f ${data_dir}/${feature_zip} && -f ${data_dir}/../${feature_zip} ]]; then
+        ln -s ${data_dir}/../${feature_zip} ${data_dir}
    fi

    cmd="python ${code_dir}/examples/speech_to_text/prep_audio_data.py

--- a/egs/iwslt2022/asr/train.sh
+++ b/egs/iwslt2022/asr/train.sh
@@ -4,7 +4,7 @@

 gpu_num=8
 update_freq=1
-max_tokens=40000
+max_tokens=80000

 extra_tag=
 extra_parameter=
@@ -13,11 +13,11 @@ extra_parameter=

 exp_tag=

-config_list=(base ctc)
-config_list=(purectc)
+#config_list=(base ctc)
+#config_list=(purectc)
 #config_list=(base conformer)

-#config_list=(pds_base_16)
+config_list=(pds_base_8 ctc)
 #config_list=(pds_base_16 conformer rpr)

 # exp full name

--- a/egs/iwslt2022/mt/run.sh
+++ b/egs/iwslt2022/mt/run.sh
@@ -30,15 +30,17 @@ pwd_dir=$PWD

 # dataset
 src_lang=en
-tgt_lang=de
+tgt_lang=zh
 lang=${src_lang}-${tgt_lang}

-dataset=mustc
+dataset=iwslt2022
 task=translation
-vocab_type=unigram
-vocab_size=10000
-share_dict=1
-lcrm=0
+src_vocab_type=unigram
+tgt_vocab_type=unigram
+src_vocab_size=32000
+tgt_vocab_size=32000
+share_dict=0
+lcrm=1
 tokenizer=0

 use_specific_dict=1
@@ -49,7 +51,7 @@ tgt_vocab_prefix=spm_unigram10000_st_share

 org_data_dir=${root_dir}/data/${dataset}
 data_dir=${root_dir}/data/${dataset}/mt
-train_subset=train
+train_subset=train_mustc_enzh
 valid_subset=dev
 trans_subset=tst-COMMON
 test_subset=test
@@ -82,15 +84,23 @@ if [[ ${use_specific_dict} -eq 1 ]]; then
    data_dir=${data_dir}/${specific_prefix}
    mkdir -p ${data_dir}
 else
-    if [[ "${vocab_type}" == "char" ]]; then
-        vocab_name=${vocab_type}
-        exp_prefix=${exp_prefix}_${vocab_type}
+    if [[ "${tgt_vocab_type}" == "char" ]]; then
+        vocab_name=char
+        exp_prefix=${exp_prefix}_char
    else
-        vocab_name=${vocab_type}${vocab_size}
+        # Use separate source/target vocabulary names when either the sizes or
+        # the types differ. The types are strings, so they are compared with
+        # != (inside [[ ]], -ne evaluates its operands arithmetically and would
+        # treat any two type names as equal).
+        if [[ ${src_vocab_size} -ne ${tgt_vocab_size} || "${src_vocab_type}" != "${tgt_vocab_type}" ]]; then
+            src_vocab_name=${src_vocab_type}${src_vocab_size}
+            tgt_vocab_name=${tgt_vocab_type}${tgt_vocab_size}
+            vocab_name=${src_vocab_name}_${tgt_vocab_name}
+        else
+            # Identical settings on both sides: one name serves all three.
+            vocab_name=${tgt_vocab_type}${tgt_vocab_size}
+            src_vocab_name=${vocab_name}
+            tgt_vocab_name=${vocab_name}
+        fi
    fi
    data_dir=${data_dir}/${vocab_name}
-    src_vocab_prefix=spm_${vocab_name}_${src_lang}
-    tgt_vocab_prefix=spm_${vocab_name}_${tgt_lang}
+    src_vocab_prefix=spm_${src_vocab_name}_${src_lang}
+    tgt_vocab_prefix=spm_${tgt_vocab_name}_${tgt_lang}
    if [[ $share_dict -eq 1 ]]; then
        data_dir=${data_dir}_share
        src_vocab_prefix=spm_${vocab_name}_share
@@ -141,8 +151,10 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
                --splits ${train_subset},${valid_subset},${trans_subset}
                --src-lang ${src_lang}
                --tgt-lang ${tgt_lang}
-                --vocab-type ${vocab_type}
-                --vocab-size ${vocab_size}"
+                --src-vocab-type ${src_vocab_type}
+                --tgt-vocab-type ${tgt_vocab_type}
+                --src-vocab-size ${src_vocab_size}
+                --tgt-vocab-size ${tgt_vocab_size}"
            if [[ $share_dict -eq 1 ]]; then
                cmd="$cmd
                --share"

--- a/egs/iwslt2022/st/conf/basis.yaml
+++ b/egs/iwslt2022/st/conf/basis.yaml
-train-subset: train
+#train-subset: train_mustc_enzh,train_covost
+train-subset: train_mustc_enzh
 valid-subset: dev

 max-epoch: 100

--- a/egs/iwslt2022/st/conf/sate_pds.yaml
+++ b/egs/iwslt2022/st/conf/sate_pds.yaml
@@ -29,7 +29,7 @@ acoustic-encoder: pds
 adapter: league

 encoder-embed-dim: 256
-ctc-layer: 12
+#ctc-layer: 12
 pds-stages: 4
 pds-layers: 3_3_3_3
 pds-ratios: 2_2_1_2

--- a/egs/iwslt2022/st/decode.sh
+++ b/egs/iwslt2022/st/decode.sh
@@ -10,8 +10,8 @@ if [ "$#" -eq 1 ]; then
    exp_name=$1
 fi

-sacrebleu=1
-n_average=10
+sacrebleu=0
+n_average=1
 beam_size=5
 len_penalty=1.0
 max_tokens=80000

--- a/egs/iwslt2022/st/run.sh
+++ b/egs/iwslt2022/st/run.sh
@@ -30,29 +30,29 @@ pwd_dir=$PWD

 # dataset
 src_lang=en
-tgt_lang=de
+tgt_lang=zh
 lang=${src_lang}-${tgt_lang}

-dataset=mustc
+dataset=iwslt2022
 task=speech_to_text
 vocab_type=unigram
 asr_vocab_size=5000
 vocab_size=10000
-share_dict=1
+share_dict=0
 speed_perturb=0
-lcrm=0
+lcrm=1
 tokenizer=0
 use_raw_audio=0

-use_specific_dict=0
-specific_prefix=valid
-specific_dir=${root_dir}/data/mustc/st
-asr_vocab_prefix=spm_unigram10000_st_share
-st_vocab_prefix=spm_unigram10000_st_share
+use_specific_dict=1
+specific_prefix=asr
+specific_dir=${root_dir}/data/${dataset}/asr
+asr_vocab_prefix=spm_unigram5000_asr
+st_vocab_prefix=

 org_data_dir=${root_dir}/data/${dataset}
 data_dir=${root_dir}/data/${dataset}/st
-train_split=train
+train_split=train_mustc_enzh
 valid_split=dev
 test_split=tst-COMMON
 test_subset=tst-COMMON
@@ -133,8 +133,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    if [[ ${speed_perturb} -eq 1 ]]; then
        feature_zip=fbank80_sp.zip
    fi
-    if [[ ! -f ${data_dir}/${feature_zip} && -f ${data_dir}/../feature_zip ]]; then
-        ln -s ${data_dir}/../feature_zip ${data_dir}
+    if [[ ! -f ${data_dir}/${feature_zip} && -f ${data_dir}/../${feature_zip} ]]; then
+        ln -s ${data_dir}/../${feature_zip} ${data_dir}
    fi

    # create ASR vocabulary if necessary
@@ -147,8 +147,12 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --splits ${valid_split},${test_split},${train_split}
        --vocab-type ${vocab_type}
        --vocab-size ${asr_vocab_size}"
-    [[ $eval -eq 1 && ${share_dict} -ne 1 && ${use_specific_dict} -ne 1 ]] && (echo -e "\033[34mRun command: \n${cmd} \033[0m" && eval $cmd)
-    asr_prefix=spm_${vocab_type}${asr_vocab_size}_asr
+    if [[ $eval -eq 1 && ${share_dict} -ne 1 && ${use_specific_dict} -ne 1 ]]; then
+        echo -e "\033[34mRun command: \n${cmd} \033[0m"
+        eval $cmd
+        asr_vocab_prefix=spm_${vocab_type}${asr_vocab_size}_asr
+        cp ${data_dir}/asr4st/${asr_vocab_prefix}* ${data_dir}
+    fi

    echo "stage 0: ST Data Preparation"
    cmd="python ${code_dir}/examples/speech_to_text/prep_audio_data.py
@@ -167,25 +171,21 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        cmd="$cmd
        --raw"
    fi
-    if [[ ${use_specific_dict} -eq 1 ]]; then
-        cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}
-        cp -r ${specific_dir}/${st_vocab_prefix}.* ${data_dir}
    if [[ $share_dict -eq 1 ]]; then
            cmd="$cmd
-        --share
-        --st-spm-prefix ${st_vocab_prefix}"
+        --share"
    else
            cmd="$cmd
-        --st-spm-prefix ${st_vocab_prefix}
        --asr-prefix ${asr_vocab_prefix}"
    fi
-    else
-        if [[ $share_dict -eq 1 ]]; then
-            cmd="$cmd
-        --share"
-        else
+    if [[ ${use_specific_dict} -eq 1 ]]; then
+        if [[ ${share_dict} -eq 0 && -n ${asr_vocab_prefix} ]]; then
+            cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}
+        fi
+        if [[ -n ${st_vocab_prefix} ]]; then
+            cp -r ${specific_dir}/${st_vocab_prefix}.* ${data_dir}
            cmd="$cmd
-        --asr-prefix ${asr_prefix}"
+        --st-spm-prefix ${st_vocab_prefix}"
        fi
    fi
    if [[ ${speed_perturb} -eq 1 ]]; then

--- a/egs/iwslt2022/st/train.sh
+++ b/egs/iwslt2022/st/train.sh
@@ -14,13 +14,13 @@ extra_parameter=
 exp_tag=

 #config_list=(base)
-config_list=(ctc)
-#config_list=(sate_ctc)
+#config_list=(sate ctc)
 #config_list=(ctc conformer rpr)
 #config_list=(base sate)

-#config_list=(pds_base)
-#config_list=(pds_base conformer)
+config_list=(sate_pds ctc)
+#config_list=(pds_base_8)
+#config_list=(pds_base_8 conformer)

 # exp full name
 exp_name=

--- a/egs/libri_trans/asr/conf/debug.yaml
+++ b/egs/libri_trans/asr/conf/debug.yaml
@@ -2,6 +2,17 @@ arch: s2t_ctc
 encoder-type: pds
 #arch: pdss2t_transformer_s_8

+#pds-ctc: 0_1_1_0
+#intermedia-adapter: league
+#intermedia-ctc-weight: 1
+
+#encoder-attention-type: transfer
+#relative-pos-enc: True
+
+encoder-attention-type: rel_pos
+#pds-attn-ds-ratios: 4_2_1_1
+#attention-reduced-method: pool
+#attention-reduced-q: True
 encoder-embed-dim: 256
 pds-stages: 4
 ctc-layer: 12

--- a/egs/libri_trans/asr/run.sh
+++ b/egs/libri_trans/asr/run.sh
 #! /bin/bash

-# Processing MuST-C Datasets
+# Processing LibriSpeech En-Fr Datasets

 # Copyright 2021 Natural Language Processing Laboratory 
 # Xu Chen (xuchenneu@163.com)
@@ -124,8 +124,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    if [[ ${speed_perturb} -eq 1 ]]; then
        feature_zip=fbank80_sp.zip
    fi
-    if [[ ! -f ${data_dir}/${feature_zip} && -f ${data_dir}/../feature_zip ]]; then
-        ln -s ${data_dir}/../feature_zip ${data_dir}
+    if [[ ! -f ${data_dir}/${feature_zip} && -f ${data_dir}/../${feature_zip} ]]; then
+        ln -s ${data_dir}/../${feature_zip} ${data_dir}
    fi

    cmd="python ${code_dir}/examples/speech_to_text/prep_audio_data.py

--- a/egs/libri_trans/asr/train.sh
+++ b/egs/libri_trans/asr/train.sh
@@ -6,7 +6,6 @@ gpu_num=2
 update_freq=1
 max_tokens=40000

-
 extra_tag=
 extra_parameter=
 #extra_tag="${extra_tag}"
@@ -15,10 +14,9 @@ extra_parameter=
 exp_tag=

 #config_list=(base)
-#config_list=(ctc)
 #config_list=(base conformer)

-#config_list=(pds_base_16)
+#config_list=(pds_base_8)
 config_list=(pds_base_8 conformer rpr)

 # exp full name

--- a/egs/libri_trans/mt/conf/rpr.yaml
+++ b/egs/libri_trans/mt/conf/rpr.yaml
-#encoder-attention-type: rel_selfattn
 encoder-attention-type: relative
 decoder-attention-type: relative
-max-encoder-relative-length: 20
-max-decoder-relative-length: 20
+max-encoder-relative-length: 8
+max-decoder-relative-length: 8
--- a/egs/libri_trans/mt/conf/base_s.yaml
+++ b/egs/libri_trans/mt/conf/base_s.yaml
--- a/egs/librispeech/asr/conf/purectc_pds_base_16.yaml
+++ b/egs/librispeech/asr/conf/purectc_pds_base_16.yaml
+arch: s2t_ctc
+encoder-type: pds
+
+#pds-ctc: 0_1_1_0
+#intermedia-adapter: league
+#intermedia-ctc-weight: 1
+
+#encoder-attention-type: reduced
+#pds-attn-ds-ratios: 8_4_2_1
+#attention-reduced-method: pool
+#attention-reduced-q: True
+
+encoder-embed-dim: 256
+pds-stages: 4
+ctc-layer: 12
+pds-layers: 2_2_6_2
+pds-ratios: 2_2_2_2
+pds-fusion: True
+pds-fusion-method: all_conv
+pds-embed-dims: 256_256_256_256
+pds-ds-method: conv
+pds-embed-norm: True
+pds-position-embed: 1_1_1_1
+pds-kernel-sizes: 5_5_5_5
+pds-ffn-ratios: 8_8_8_8
+pds-attn-heads: 4_4_4_4
+
+share-decoder-input-output-embed: True
+optimizer: adam
+clip-norm: 10.0
+lr-scheduler: inverse_sqrt
+warmup-init-lr: 1e-7
+warmup-updates: 10000
+lr: 2e-3
+adam_betas: (0.9,0.98)
+
+criterion: ctc
+
+dropout: 0.1
+activation-fn: relu
+encoder-ffn-embed-dim: 2048
+encoder-layers: 12
+decoder-layers: 6
+encoder-attention-heads: 4
+
+decoder-embed-dim: 256
+decoder-ffn-embed-dim: 2048
+decoder-attention-heads: 4
+
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
\ No newline at end of file
--- a/egs/librispeech/asr/conf/purectc_pds_base_8.yaml
+++ b/egs/librispeech/asr/conf/purectc_pds_base_8.yaml
+arch: s2t_ctc
+encoder-type: pds
+
+#pds-ctc: 0_1_1_0
+#intermedia-adapter: league
+#intermedia-ctc-weight: 1
+
+#encoder-attention-type: reduced
+#pds-attn-ds-ratios: 4_2_1_1
+#attention-reduced-method: pool
+#attention-reduced-q: True
+
+encoder-embed-dim: 256
+pds-stages: 4
+ctc-layer: 12
+pds-layers: 3_3_3_3
+pds-ratios: 2_2_1_2
+pds-fusion: True
+pds-fusion-method: all_conv
+pds-embed-dims: 256_256_256_256
+pds-ds-method: conv
+pds-embed-norm: True
+pds-position-embed: 1_1_1_1
+pds-kernel-sizes: 5_5_5_5
+pds-ffn-ratios: 8_8_8_8
+pds-attn-heads: 4_4_4_4
+
+share-decoder-input-output-embed: True
+optimizer: adam
+clip-norm: 10.0
+lr-scheduler: inverse_sqrt
+warmup-init-lr: 1e-7
+warmup-updates: 10000
+lr: 2e-3
+adam_betas: (0.9,0.98)
+
+criterion: ctc
+
+dropout: 0.1
+activation-fn: relu
+encoder-ffn-embed-dim: 2048
+encoder-layers: 12
+decoder-layers: 6
+encoder-attention-heads: 4
+
+decoder-embed-dim: 256
+decoder-ffn-embed-dim: 2048
+decoder-attention-heads: 4
+
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
\ No newline at end of file
--- a/egs/librispeech/asr/conf/purectc_pds_base_8_compare.yaml
+++ b/egs/librispeech/asr/conf/purectc_pds_base_8_compare.yaml
+arch: s2t_ctc
+encoder-type: pds
+
+#pds-ctc: 0_1_1_0
+#intermedia-adapter: league
+#intermedia-ctc-weight: 1
+
+#encoder-attention-type: reduced
+#pds-attn-ds-ratios: 4_2_1_1
+#attention-reduced-method: pool
+#attention-reduced-q: True
+
+encoder-embed-dim: 176
+pds-stages: 4
+ctc-layer: 16
+pds-layers: 4_4_4_4
+pds-ratios: 2_2_1_2
+pds-fusion: True
+pds-fusion-method: all_conv
+pds-embed-dims: 176_176_176_176
+pds-ds-method: conv
+pds-embed-norm: True
+pds-position-embed: 1_1_1_1
+pds-kernel-sizes: 5_5_5_5
+pds-ffn-ratios: 4_4_4_4
+pds-attn-heads: 4_4_4_4
+
+optimizer: adam
+clip-norm: 10.0
+lr-scheduler: inverse_sqrt
+warmup-init-lr: 1e-7
+warmup-updates: 10000
+lr: 0.0015
+adam_betas: (0.9,0.98)
+
+criterion: ctc
+post-process: sentencepiece
+
+dropout: 0.1
+activation-fn: relu
+encoder-ffn-embed-dim: 2048
+encoder-layers: 12
+decoder-layers: 6
+encoder-attention-heads: 4
+
+macaron-style: True
+use-cnn-module: True
+cnn-module-kernel: 31
+encoder-activation-fn: swish
+encoder-attention-type: rel_pos
+
+#load-pretrained-encoder-from:
--- a/egs/librispeech/asr/train.sh
+++ b/egs/librispeech/asr/train.sh
@@ -11,12 +11,12 @@ extra_parameter=
 #extra_tag="${extra_tag}"
 #extra_parameter="${extra_parameter} "

-#exp_tag=
+exp_tag=
 #config_list=(base)
-#config_list=(ctc)
-#config_list=(ctc conformer rpr)
-config_list=(base conformer rpr)
+#config_list=(base conformer)
+#config_list=(ConformerCTCSmall)

+config_list=(purectc_pds_base_16)
 #config_list=(pds_base)
 #config_list=(pds_big)
 #config_list=(pds_deep)

--- a/egs/mustc/asr/conf/pds_base_8.yaml
+++ b/egs/mustc/asr/conf/pds_base_8.yaml
 arch: pdss2t_transformer_s_8

+#pds-ctc: 0_1_1_0
+#intermedia-adapter: league
+#intermedia-ctc-weight: 0.1
+
+#encoder-attention-type: reduced
+#pds-attn-ds-ratios: 4_2_2_1
+#attention-reduced-method: pool
+#attention-reduced-q: True
+
 encoder-embed-dim: 256
 pds-stages: 4
-ctc-layer: 12
+#ctc-layer: 12
 pds-layers: 3_3_3_3
 pds-ratios: 2_2_1_2
 pds-fusion: True

--- a/egs/mustc/asr/conf/purectc_pds_base_8.yaml
+++ b/egs/mustc/asr/conf/purectc_pds_base_8.yaml
 arch: s2t_ctc
 encoder-type: pds

+#pds-ctc: 0_1_1_0
+#intermedia-adapter: league
+#intermedia-ctc-weight: 1
+
+#encoder-attention-type: reduced
+#pds-attn-ds-ratios: 8_4_2_1
+#attention-reduced-method: pool
+#attention-reduced-q: True
+
 encoder-embed-dim: 256
 pds-stages: 4
-ctc-layer: 12
+#ctc-layer: 12
 pds-layers: 3_3_3_3
 pds-ratios: 2_2_1_2
 pds-fusion: True
@@ -26,17 +35,12 @@ lr: 2e-3
 adam_betas: (0.9,0.98)

 criterion: ctc
+post-process: sentencepiece

 dropout: 0.1
 activation-fn: relu
 encoder-ffn-embed-dim: 2048
 encoder-layers: 12
-decoder-layers: 6
 encoder-attention-heads: 4

-decoder-embed-dim: 256
-decoder-ffn-embed-dim: 2048
-decoder-attention-heads: 4
-
 #load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
\ No newline at end of file
--- a/egs/mustc/asr/conf/rpr.yaml
+++ b/egs/mustc/asr/conf/rpr.yaml
-encoder-attention-type: rel_selfattn
+encoder-attention-type: rel_pos
 #encoder-attention-type: relative
 #max-encoder-relative-length: 100
--- a/egs/mustc/asr/run.sh
+++ b/egs/mustc/asr/run.sh
@@ -125,8 +125,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    if [[ ${speed_perturb} -eq 1 ]]; then
        feature_zip=fbank80_sp.zip
    fi
-    if [[ ! -f ${data_dir}/${feature_zip} && -f ${data_dir}/../feature_zip ]]; then
-        ln -s ${data_dir}/../feature_zip ${data_dir}
+    if [[ ! -f ${data_dir}/${feature_zip} && -f ${data_dir}/../${feature_zip} ]]; then
+        ln -s ${data_dir}/../${feature_zip} ${data_dir}
    fi

    cmd="python ${code_dir}/examples/speech_to_text/prep_audio_data.py

--- a/egs/mustc/asr/train.sh
+++ b/egs/mustc/asr/train.sh
@@ -17,8 +17,8 @@ config_list=(base ctc)
 config_list=(purectc)
 #config_list=(base conformer)

-#config_list=(pds_base_16)
-#config_list=(pds_base_16 conformer rpr)
+config_list=(pds_base_8)
+config_list=(purectc_pds_base_8)

 # exp full name
 exp_name=

--- a/egs/mustc/mt/train.sh
+++ b/egs/mustc/mt/train.sh
@@ -7,7 +7,7 @@ update_freq=1
 max_tokens=8192

 exp_tag=baseline
-config_list=(base)
+config_list=(small)

 # exp full name
 exp_name=

--- a/egs/mustc/st/conf/inter.yaml
+++ b/egs/mustc/st/conf/inter.yaml
@@ -2,5 +2,6 @@ ctc-weight: 0.2
 intermedia-ctc-layers: 6,9
 intermedia-adapter: league
 intermedia-ctc-weight: 0.1
+#intermedia-drop-prob: 0.2
 ctc-self-distill-weight: 0
 post-process: sentencepiece
\ No newline at end of file
--- a/egs/mustc/st/conf/pds_base_8.yaml
+++ b/egs/mustc/st/conf/pds_base_8.yaml
 arch: pdss2t_transformer_s_8

-pds-ctc: 1_1_1_1
-intermedia-adapter: league
-intermedia-ctc-weight: 0.15
+#pds-ctc: 0_1_1_0
+#intermedia-adapter: league
+#intermedia-ctc-weight: 0.1
+
+#encoder-attention-type: reduced
+#pds-attn-ds-ratios: 4_2_1_1
+#attention-reduced-method: pool

 encoder-embed-dim: 256
 pds-stages: 4
-ctc-layer: 12
+#ctc-layer: 12
 pds-layers: 3_3_3_3
 pds-ratios: 2_2_1_2
-pds-fusion: True
+#pds-fusion: True
 pds-fusion-method: all_conv
 pds-embed-dims: 256_256_256_256
 pds-ds-method: conv

--- a/egs/mustc/st/conf/rpr.yaml
+++ b/egs/mustc/st/conf/rpr.yaml
-encoder-attention-type: rel_selfattn
+encoder-attention-type: rel_pos
+#encoder-attention-type: rel_pos_legacy
+#encoder-attention-type: rel_selfattn
 #encoder-attention-type: relative
 #decoder-attention-type: relative
 #max-encoder-relative-length: 100

--- a/egs/mustc/st/run.sh
+++ b/egs/mustc/st/run.sh
@@ -133,8 +133,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    if [[ ${speed_perturb} -eq 1 ]]; then
        feature_zip=fbank80_sp.zip
    fi
-    if [[ ! -f ${data_dir}/${feature_zip} && -f ${data_dir}/../feature_zip ]]; then
-        ln -s ${data_dir}/../feature_zip ${data_dir}
+    if [[ ! -f ${data_dir}/${feature_zip} && -f ${data_dir}/../${feature_zip} ]]; then
+        ln -s ${data_dir}/../${feature_zip} ${data_dir}
    fi

    # create ASR vocabulary if necessary

--- a/egs/mustc/st/train.sh
+++ b/egs/mustc/st/train.sh
@@ -14,13 +14,12 @@ extra_parameter=
 exp_tag=

 #config_list=(base)
-config_list=(ctc)
-#config_list=(sate_ctc)
-#config_list=(ctc conformer rpr)
-#config_list=(base sate)
+#config_list=(base ctc conformer)
+#config_list=(sate ctc)

-#config_list=(pds_base)
+#config_list=(pds_base_8)
 #config_list=(pds_base conformer)
+#config_list=(sate_pds ctc)

 # exp full name
 exp_name=

--- a/egs/wmt16/mt/run.sh
+++ b/egs/wmt16/mt/run.sh
@@ -327,14 +327,6 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    export CUDA_VISIBLE_DEVICES=${device}

    log=${model_dir}/train.log
-    if [[ -e ${log} ]]; then
-        for i in `seq 1 100`; do
-            if [ ! -e ${log}.${i} ]; then
-                log=${log}.${i}
-                break
-            fi
-        done
-    fi

    cmd="nohup ${cmd} >> ${log} 2>&1 &"
    if [[ $eval -eq 1 ]]; then

--- a/egs/wmt20/mt/binary.sh
+++ b/egs/wmt20/mt/binary.sh
+set -e
+
+eval=1
+lcrm=0
+
+src_lang=en
+tgt_lang=zh
+tokenize=1
+splits=(tst-COMMON test11)
+
+dataset=wmt20
+root_dir=~/st/Fairseq-S2T
+data_dir=/home/xuchen/st/data/$dataset/data
+vocab_dir=/home/xuchen/st/data/$dataset/mt/unigram32000_tok
+dest_dir=$vocab_dir
+src_vocab_prefix=spm_unigram32000_en
+tgt_vocab_prefix=spm_unigram32000_zh
+
+for split in ${splits[@]}; do
+    src_file=${data_dir}/${split}/${split}.${src_lang}
+    tgt_file=${data_dir}/${split}/${split}.${tgt_lang}
+
+    if [[ ${tokenize} -eq 1 ]]; then
+        src_tok_file=${data_dir}/${split}.tok/${split}.tok.${src_lang}
+        tgt_tok_file=${data_dir}/${split}.tok/${split}.tok.${tgt_lang}
+        if [[ ! -f ${src_tok_file} ]]; then
+            cmd="tokenizer.perl -l ${src_lang} --threads 8 -no-escape < ${src_file} > ${src_tok_file}"
+            echo -e "\033[34mRun command: \n${cmd} \033[0m"
+            [[ $eval -eq 1 ]] && eval ${cmd}
+        fi
+
+        if [[ ! -f ${tgt_tok_file} ]]; then
+            cmd="tokenizer.perl -l ${tgt_lang} --threads 8 -no-escape < ${tgt_file} > ${tgt_tok_file}"
+            echo -e "\033[34mRun command: \n${cmd} \033[0m"
+            [[ $eval -eq 1 ]] && eval ${cmd}
+        fi
+        src_file=${src_tok_file}
+        tgt_file=${tgt_tok_file}
+    fi
+
+    cmd="cat ${src_file}"
+    if [[ ${lcrm} -eq 1 ]]; then
+        cmd="python local/lower_rm.py ${src_file}"
+    fi
+    cmd="${cmd}
+    | spm_encode --model ${vocab_dir}/${src_vocab_prefix}.model
+    --output_format=piece
+    > ${src_file}.spm"
+
+    echo -e "\033[34mRun command: \n${cmd} \033[0m"
+    [[ $eval -eq 1 ]] && eval ${cmd}
+
+    cmd="spm_encode
+    --model ${vocab_dir}/${tgt_vocab_prefix}.model
+    --output_format=piece
+    < ${tgt_file}
+    > ${tgt_file}.spm"
+    echo -e "\033[34mRun command: \n${cmd} \033[0m"
+    [[ $eval -eq 1 ]] && eval ${cmd}
+
+    src_file=${src_file}.spm
+    tgt_file=${tgt_file}.spm
+
+    mkdir -p ${dest_dir}/final
+    cmd="cp ${src_file} ${dest_dir}/final/${split}.${src_lang}"
+    echo -e "\033[34mRun command: \n${cmd} \033[0m"
+    [[ $eval -eq 1 ]] && eval ${cmd}
+
+    cmd="cp ${tgt_file} ${dest_dir}/final/${split}.${tgt_lang}"
+    echo -e "\033[34mRun command: \n${cmd} \033[0m"
+    [[ $eval -eq 1 ]] && eval ${cmd}
+done
+
+n_set=${#splits[*]}
+for ((i=0;i<$n_set;i++)); do
+    dataset[$i]=${dest_dir}/final/${splits[$i]}
+done
+pref=`echo ${dataset[*]} | sed 's/ /,/g'`
+
+cmd="python ${root_dir}/fairseq_cli/preprocess.py
+    --source-lang ${src_lang}
+    --target-lang ${tgt_lang}
+    --testpref ${pref}
+    --destdir ${dest_dir}/data-bin
+    --srcdict ${vocab_dir}/${src_vocab_prefix}.txt
+    --tgtdict ${vocab_dir}/${tgt_vocab_prefix}.txt
+    --workers 64"
+
+echo -e "\033[34mRun command: \n${cmd} \033[0m"
+[[ $eval -eq 1 ]] && eval ${cmd}
--- a/egs/wmt20/mt/conf/base.yaml
+++ b/egs/wmt20/mt/conf/base.yaml
+arch: transformer
+share-decoder-input-output-embed: True
+optimizer: adam
+clip-norm: 10.0
+lr-scheduler: inverse_sqrt
+warmup-init-lr: 1e-7
+warmup-updates: 8000
+lr: 2e-3
+adam_betas: (0.9,0.997)
+
+criterion: label_smoothed_cross_entropy
+label_smoothing: 0.1
+
+dropout: 0.1
+attention-dropout: 0.1
+activation-dropout: 0.1
+
+activation-fn: relu
+encoder-normalize-before: True
+decoder-normalize-before: True
+encoder-embed-dim: 512
+encoder-ffn-embed-dim: 2048
+encoder-layers: 6
+decoder-layers: 6
+encoder-attention-heads: 8
+
+decoder-embed-dim: 512
+decoder-ffn-embed-dim: 2048
+decoder-attention-heads: 8
+
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
\ No newline at end of file
--- a/egs/wmt20/mt/conf/base_postnorm.yaml
+++ b/egs/wmt20/mt/conf/base_postnorm.yaml
+arch: transformer
+share-all-embeddings: True
+optimizer: adam
+clip-norm: 10.0
+lr-scheduler: inverse_sqrt
+warmup-init-lr: 1e-7
+warmup-updates: 4000
+lr: 7e-4
+adam_betas: (0.9,0.98)
+
+criterion: label_smoothed_cross_entropy
+label_smoothing: 0.1
+
+dropout: 0.1
+attention-dropout: 0.1
+activation-dropout: 0.1
+
+activation-fn: relu
+encoder-normalize-before: False
+decoder-normalize-before: False
+encoder-embed-dim: 512
+encoder-ffn-embed-dim: 2048
+encoder-layers: 6
+decoder-layers: 6
+encoder-attention-heads: 8
+
+decoder-embed-dim: 512
+decoder-ffn-embed-dim: 2048
+decoder-attention-heads: 8
+
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
\ No newline at end of file
--- a/egs/wmt20/mt/conf/basis.yaml
+++ b/egs/wmt20/mt/conf/basis.yaml
+train-subset: train
+valid-subset: valid
+
+max-epoch: 20
+max-update: 100000
+patience: 5
+best_checkpoint_metric: loss
+maximize_best_checkpoint_metric: False
+
+no-epoch-checkpoints: True
+#keep-last-epochs: 10
+keep-best-checkpoints: 5
+
+num-workers: 8
+no-progress-bar: True
+log-interval: 100
+seed: 1
+report-accuracy: True
+skip-invalid-size-inputs-valid-test: True
+
+max-source-positions: 512
--- a/egs/wmt20/mt/conf/big.yaml
+++ b/egs/wmt20/mt/conf/big.yaml
+arch: transformer_wmt_en_de_big_t2t
+share-all-embeddings: True
+optimizer: adam
+clip-norm: 10.0
+lr-scheduler: inverse_sqrt
+warmup-init-lr: 1e-7
+warmup-updates: 8000
+lr: 7e-4
+adam_betas: (0.9,0.997)
+
+criterion: label_smoothed_cross_entropy
+label_smoothing: 0.1
+
+dropout: 0.3
+attention-dropout: 0.1
+activation-dropout: 0.1
+
+activation-fn: relu
+encoder-normalize-before: True
+decoder-normalize-before: True
+encoder-embed-dim: 1024
+encoder-ffn-embed-dim: 4096
+encoder-layers: 6
+decoder-layers: 6
+encoder-attention-heads: 16
+
+# Decoder sizes mirror the encoder. share-all-embeddings requires the encoder
+# and decoder embedding dimensions to be equal; the FFN size and head count
+# follow the same "big" setting used in big_postnorm.yaml.
+decoder-embed-dim: 1024
+decoder-ffn-embed-dim: 4096
+decoder-attention-heads: 16
+
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
\ No newline at end of file
--- a/egs/wmt20/mt/conf/big_postnorm.yaml
+++ b/egs/wmt20/mt/conf/big_postnorm.yaml
+arch: transformer_wmt_en_de_big
+share-all-embeddings: True
+optimizer: adam
+clip-norm: 10.0
+lr-scheduler: inverse_sqrt
+warmup-init-lr: 1e-7
+warmup-updates: 4000
+lr: 5e-4
+adam_betas: (0.9,0.98)
+
+criterion: label_smoothed_cross_entropy
+label_smoothing: 0.1
+
+dropout: 0.3
+attention-dropout: 0.1
+activation-dropout: 0.1
+
+activation-fn: relu
+encoder-normalize-before: False
+decoder-normalize-before: False
+encoder-embed-dim: 1024
+encoder-ffn-embed-dim: 4096
+encoder-layers: 6
+decoder-layers: 6
+encoder-attention-heads: 16
+
+decoder-embed-dim: 1024
+decoder-ffn-embed-dim: 4096
+decoder-attention-heads: 16
+
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
\ No newline at end of file
--- a/egs/wmt20/mt/conf/deep.yaml
+++ b/egs/wmt20/mt/conf/deep.yaml
+arch: transformer
+share-decoder-input-output-embed: True
+optimizer: adam
+clip-norm: 10.0
+lr-scheduler: inverse_sqrt
+warmup-init-lr: 1e-7
+warmup-updates: 16000
+lr: 2e-3
+adam_betas: (0.9,0.997)
+
+criterion: label_smoothed_cross_entropy
+label_smoothing: 0.1
+
+dropout: 0.1
+attention-dropout: 0.1
+activation-dropout: 0.1
+
+activation-fn: relu
+encoder-normalize-before: True
+decoder-normalize-before: True
+encoder-embed-dim: 512
+encoder-ffn-embed-dim: 2048
+encoder-layers: 30
+decoder-layers: 6
+encoder-attention-heads: 8
+
+decoder-embed-dim: 512
+decoder-ffn-embed-dim: 2048
+decoder-attention-heads: 8
+
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
\ No newline at end of file
--- a/egs/wmt20/mt/conf/dlcl.yaml
+++ b/egs/wmt20/mt/conf/dlcl.yaml
+use-enc-dlcl: True
+use-dec-dlcl: True
--- a/egs/wmt20/mt/conf/rpr.yaml
+++ b/egs/wmt20/mt/conf/rpr.yaml
+encoder-attention-type: relative
+decoder-attention-type: relative
+max-encoder-relative-length: 8
+max-decoder-relative-length: 8
--- a/egs/wmt20/mt/decode.sh
+++ b/egs/wmt20/mt/decode.sh
+#! /bin/bash
+
+# Run stage 2 of ./run.sh with the settings below.
+# Usage: decode.sh [exp_name]
+
+# Settings forwarded to run.sh.
+gpu_num=1
+data_dir=
+test_subset=(test)
+
+sacrebleu=0
+n_average=5
+beam_size=4
+len_penalty=0.6
+max_tokens=80000
+dec_model=checkpoint_best.pt
+
+# The experiment name comes from the command line only when exactly one
+# argument is given; otherwise it stays empty.
+exp_name=
+[[ $# -eq 1 ]] && exp_name=$1
+
+# Assemble the command line piece by piece.
+cmd="./run.sh --stage 2 --stop_stage 2"
+cmd="$cmd --gpu_num ${gpu_num}"
+cmd="$cmd --exp_name ${exp_name}"
+cmd="$cmd --sacrebleu ${sacrebleu}"
+cmd="$cmd --n_average ${n_average}"
+cmd="$cmd --beam_size ${beam_size}"
+cmd="$cmd --len_penalty ${len_penalty}"
+cmd="$cmd --max_tokens ${max_tokens}"
+cmd="$cmd --dec_model ${dec_model}"
+
+# Optional arguments are appended only when they are set.
+if [[ -n ${data_dir} ]]; then
+    cmd="$cmd --data_dir ${data_dir}"
+fi
+if [[ -n ${test_subset[0]} ]]; then
+    # run.sh receives the subsets as one comma-separated value.
+    subset_list=$(echo ${test_subset[*]} | sed 's/ /,/g')
+    cmd="$cmd --test_subset ${subset_list}"
+fi
+
+echo $cmd
+eval $cmd
--- a/egs/wmt20/mt/local/detokenizer.perl
+++ b/egs/wmt20/mt/local/detokenizer.perl
+#!/usr/bin/env perl
+
+# $Id: detokenizer.perl 4134 2011-08-08 15:30:54Z bgottesman $
+# Sample De-Tokenizer
+# written by Josh Schroeder, based on code by Philipp Koehn
+# further modifications by Ondrej Bojar
+#
+# This file is part of moses.  Its use is licensed under the GNU Lesser General
+# Public License version 2.1 or, at your option, any later version.
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+
+use warnings;
+use strict;
+use utf8; # tell perl this script file is in UTF-8 (see all funny punct below)
+
+# Settings, overridable from the command line (see the option loop below).
+my $language = "en";
+my $QUIET = 0;
+my $HELP = 0;
+my $UPPERCASE_SENT = 0;
+my $PENN = 0;
+
+# Consume the command-line arguments one at a time. -l takes the following
+# argument as its value; arguments matching none of the patterns are ignored.
+while (@ARGV) {
+	$_ = shift;
+	/^-b$/ && ($| = 1, next);
+	/^-l$/ && ($language = shift, next);
+	/^-q$/ && ($QUIET = 1, next);
+	/^-h$/ && ($HELP = 1, next);
+	/^-u$/ && ($UPPERCASE_SENT = 1, next);
+  /^-penn$/ && ($PENN = 1, next);
+}
+
+# -h: print the usage text and stop before reading any input.
+if ($HELP) {
+	print "Usage ./detokenizer.perl (-l [en|fr|it|cs|...]) < tokenizedfile > detokenizedfile\n";
+        print "Options:\n";
+        print "  -u     ... uppercase the first char in the final sentence.\n";
+        print "  -q     ... don't report detokenizer revision.\n";
+        print "  -b     ... disable Perl buffering.\n";
+        print "  -penn  ... assume input is tokenized as per tokenizer.perl's -penn option.\n";
+	exit;
+}
+
+# Unknown languages only produce a warning; processing continues.
+if ($language !~ /^(cs|en|fr|it|fi)$/) {
+  print STDERR "Warning: No built-in rules for language $language.\n"
+}
+
+# NOTE(review): this error path ends with a bare "exit", i.e. exit status 0,
+# so callers cannot detect the failure from the status -- confirm whether that
+# is intended before relying on it.
+if ($PENN && $language ne "en") {
+  print STDERR "Error: -penn option only supported for English text.\n";
+  exit;
+}
+
+# Banner on STDERR unless -q was given.
+if (!$QUIET) {
+	print STDERR "Detokenizer Version ".'$Revision: 4134 $'."\n";
+	print STDERR "Language: $language\n";
+}
+
+# Main loop: lines that are a single <...> tag or contain only whitespace are
+# copied through unchanged; all other lines go through the Penn or the default
+# detokenizer.
+while(<STDIN>) {
+	if (/^<.+>$/ || /^\s*$/) {
+		#don't try to detokenize XML/HTML tag lines
+		print $_;
+  } elsif ($PENN) {
+    print &detokenize_penn($_);
+  } else {
+		print &detokenize($_);
+	}
+}
+
+
+sub ucsecondarg {
+  # Return the first argument followed by the upper-cased second argument.
+  my ($prefix, $tail) = @_;
+  return $prefix . uc($tail);
+}
+
+sub deescape {
+  # Turn the escaped entities back into the special characters they stand for.
+  my ($text) = @_;
+  # Alias $text as the topic so each substitution applies to it directly.
+  # The order is kept as is: "&amp;" is handled last.
+  for ($text) {
+    s/\&bar;/\|/g;   # factor separator (legacy)
+    s/\&#124;/\|/g;  # factor separator
+    s/\&lt;/\</g;    # xml
+    s/\&gt;/\>/g;    # xml
+    s/\&bra;/\[/g;   # syntax non-terminal (legacy)
+    s/\&ket;/\]/g;   # syntax non-terminal (legacy)
+    s/\&quot;/\"/g;  # xml
+    s/\&apos;/\'/g;  # xml
+    s/\&#91;/\[/g;   # syntax non-terminal
+    s/\&#93;/\]/g;   # syntax non-terminal
+    s/\&amp;/\&/g;   # escape escape
+  }
+  return $text;
+}
+
+# Detokenize one line of text.
+# The line is padded with spaces, " @-@ " is turned back into "-", escaped
+# characters are restored, and the words are then glued back together one
+# at a time.  $prependSpace holds what is inserted before the NEXT word:
+# a "right shift" sets it to "" so the next word attaches to the current
+# one, a "left shift" appends the current word without a space before it.
+# Returns the detokenized line, terminated by a newline.
+sub detokenize {
+	my($text) = @_;
+	chomp($text);
+	$text = " $text ";
+  $text =~ s/ \@\-\@ /-/g;
+  $text = &deescape($text);
+
+	my $word;	# NOTE(review): declared but not used in this sub
+	my $i;
+	my @words = split(/ /,$text);
+	$text = "";
+	# number of quotes seen so far, per (normalized) quote character
+	my %quoteCount =  ("\'"=>0,"\""=>0);
+	my $prependSpace = " ";
+	for ($i=0;$i<(scalar(@words));$i++) {		
+		if (&startsWithCJKChar($words[$i])) {
+		    if (($i > 0 && &endsWithCJKChar($words[$i-1])) && ($language ne "ko")) {
+			# perform left shift if this is a second consecutive CJK (Chinese/Japanese/Korean) word
+			$text=$text.$words[$i];
+		    } else {
+			# ... but do nothing special if this is a CJK word that doesn't follow a CJK word
+			$text=$text.$prependSpace.$words[$i];
+		    }
+		    $prependSpace = " ";
+		} elsif ($words[$i] =~ /^[\p{IsSc}\(\[\{\¿\¡]+$/) {
+			#perform right shift on currency and other random punctuation items
+			$text = $text.$prependSpace.$words[$i];
+			$prependSpace = "";
+		} elsif ($words[$i] =~ /^[\,\.\?\!\:\;\\\%\}\]\)]+$/){
+		    if (($language eq "fr") && ($words[$i] =~ /^[\?\!\:\;\\\%]$/)) {
+			#these punctuations are prefixed with a non-breakable space in french
+			# (the closing brace of this inner "if" is at the end of the next line)
+			$text .= " "; }
+			#perform left shift on punctuation items
+			$text=$text.$words[$i];
+			$prependSpace = " ";
+		} elsif (($language eq "en") && ($i>0) && ($words[$i] =~ /^[\'][\p{IsAlpha}]/) && ($words[$i-1] =~ /[\p{IsAlnum}]$/)) {
+			#left-shift the contraction for English
+			$text=$text.$words[$i];
+			$prependSpace = " ";
+		} elsif (($language eq "cs") && ($i>1) && ($words[$i-2] =~ /^[0-9]+$/) && ($words[$i-1] =~ /^[.,]$/) && ($words[$i] =~ /^[0-9]+$/)) {
+			#left-shift floats in Czech
+			$text=$text.$words[$i];
+			$prependSpace = " ";
+		}  elsif ((($language eq "fr") ||($language eq "it")) && ($i<=(scalar(@words)-2)) && ($words[$i] =~ /[\p{IsAlpha}][\']$/) && ($words[$i+1] =~ /^[\p{IsAlpha}]/)) {
+			#right-shift the contraction for French and Italian
+			$text = $text.$prependSpace.$words[$i];
+			$prependSpace = "";
+		} elsif (($language eq "cs") && ($i<(scalar(@words)-3))
+				&& ($words[$i] =~ /[\p{IsAlpha}]$/)
+				&& ($words[$i+1] =~ /^[-–]$/)
+				&& ($words[$i+2] =~ /^li$|^mail.*/i)
+				) {
+			#right-shift "-li" in Czech and a few Czech dashed words (e-mail)
+			$text = $text.$prependSpace.$words[$i].$words[$i+1];
+			$i++; # advance over the dash
+			$prependSpace = "";
+		} elsif ($words[$i] =~ /^[\'\"„“`]+$/) {
+			#combine punctuation smartly
+			# NOTE(review): the character class above does not contain the
+			# closing quote that the normalization test below also accepts.
+                        my $normalized_quo = $words[$i];
+                        $normalized_quo = '"' if $words[$i] =~ /^[„“”]+$/;
+                        $quoteCount{$normalized_quo} = 0
+                                if !defined $quoteCount{$normalized_quo};
+                        if ($language eq "cs" && $words[$i] eq "„") {
+                          # this is always the starting quote in Czech
+                          $quoteCount{$normalized_quo} = 0;
+                        }
+                        if ($language eq "cs" && $words[$i] eq "“") {
+                          # this is usually the ending quote in Czech
+                          $quoteCount{$normalized_quo} = 1;
+                        }
+			# an even count means this quote opens, an odd count means it closes
+			if (($quoteCount{$normalized_quo} % 2) eq 0) {
+				if(($language eq "en") && ($words[$i] eq "'") && ($i > 0) && ($words[$i-1] =~ /[s]$/)) {
+					#single quote for posesssives ending in s... "The Jones' house"
+					#left shift
+					$text=$text.$words[$i];
+					$prependSpace = " ";
+				} else {
+					#right shift
+					$text = $text.$prependSpace.$words[$i];
+					$prependSpace = "";
+					$quoteCount{$normalized_quo} ++;
+
+				}
+			} else {
+				#left shift
+				$text=$text.$words[$i];
+				$prependSpace = " ";
+				$quoteCount{$normalized_quo} ++;
+
+			}
+			
+        # NOTE(review): $words[$i-1] is read without an "$i > 0" check; for
+        # $i == 0 the index -1 refers to the last element of @words.
+        } elsif (($language eq "fi") && ($words[$i-1] =~ /:$/) && ($words[$i] =~ /^(N|n|A|a|Ä|ä|ssa|Ssa|ssä|Ssä|sta|stä|Sta|Stä|hun|Hun|hyn|Hyn|han|Han|hän|Hän|hön|Hön|un|Un|yn|Yn|an|An|än|Än|ön|Ön|seen|Seen|lla|Lla|llä|Llä|lta|Lta|ltä|Ltä|lle|Lle|ksi|Ksi|kse|Kse|tta|Tta|ine|Ine)(ni|si|mme|nne|nsa)?(ko|kö|han|hän|pa|pä|kaan|kään|kin)?$/)) {
+            # Finnish : without intervening space if followed by case suffix
+            # EU:N EU:n EU:ssa EU:sta EU:hun EU:iin ...
+            # (the suffix is lower-cased before it is appended)
+            $text=$text. lc $words[$i];
+            $prependSpace = " ";
+		} else {
+			# default: ordinary word, separated by the pending space
+			$text=$text.$prependSpace.$words[$i];
+			$prependSpace = " ";
+		}
+	}
+	
+	# clean up spaces at head and tail of each line as well as any double-spacing
+	$text =~ s/ +/ /g;
+	$text =~ s/\n /\n/g;
+	$text =~ s/ \n/\n/g;
+	$text =~ s/^ //g;
+	$text =~ s/ $//g;
+	
+	#add trailing break
+	$text .= "\n" unless $text =~ /\n$/;
+
+	# with -u: uppercase the first letter, skipping leading punctuation/space
+        $text =~ s/^([[:punct:]\s]*)([[:alpha:]])/ucsecondarg($1, $2)/e if $UPPERCASE_SENT;
+
+	return $text;
+}
+
+# Detokenize one line that was tokenized in Penn-treebank style (-penn).
+# Merges split contractions, restores the -LRB-/-RRB-/... bracket tokens,
+# then re-joins the words with the same left/right-shift scheme used by
+# detokenize(): $prependSpace is what gets inserted before the next word.
+# Returns the detokenized line, terminated by a newline.
+sub detokenize_penn {
+  my($text) = @_;
+
+  chomp($text);
+  $text = " $text ";
+  # undo the " @-@ " and " @/@ " markers
+  $text =~ s/ \@\-\@ /-/g;
+  $text =~ s/ \@\/\@ /\//g;
+  $text = &deescape($text);
+
+  # merge de-contracted forms except where the second word begins with an
+  # apostrophe (those are handled later)
+  $text =~ s/ n't /n't /g;
+  $text =~ s/ N'T /N'T /g;
+  $text =~ s/ ([Cc])an not / $1annot /g;
+  $text =~ s/ ([Dd])' ye / $1'ye /g;
+  $text =~ s/ ([Gg])im me / $1imme /g;
+  $text =~ s/ ([Gg])on na / $1onna /g;
+  $text =~ s/ ([Gg])ot ta / $1otta /g;
+  $text =~ s/ ([Ll])em me / $1emme /g;
+  $text =~ s/ '([Tt]) is / '$1is /g;
+  $text =~ s/ '([Tt]) was / '$1was /g;
+  $text =~ s/ ([Ww])an na / $1anna /g;
+
+  # restore brackets
+  $text =~ s/-LRB-/\(/g;
+  $text =~ s/-RRB-/\)/g;
+  $text =~ s/-LSB-/\[/g;
+  $text =~ s/-RSB-/\]/g;
+  $text =~ s/-LCB-/{/g;
+  $text =~ s/-RCB-/}/g;
+
+  my $i;
+  my @words = split(/ /,$text);
+  $text = "";
+  my $prependSpace = " ";
+  for ($i=0;$i<(scalar(@words));$i++) {
+    if ($words[$i] =~ /^[\p{IsSc}\(\[\{\¿\¡]+$/) {
+      # perform right shift on currency and other random punctuation items
+      $text = $text.$prependSpace.$words[$i];
+      $prependSpace = "";
+    } elsif ($words[$i] =~ /^[\,\.\?\!\:\;\\\%\}\]\)]+$/){
+      # perform left shift on punctuation items
+      $text=$text.$words[$i];
+      $prependSpace = " ";
+    } elsif (($i>0) && ($words[$i] =~ /^[\'][\p{IsAlpha}]/) && ($words[$i-1] =~ /[\p{IsAlnum}]$/)) {
+      # left-shift the contraction
+      $text=$text.$words[$i];
+      $prependSpace = " ";
+    } elsif ($words[$i] eq "`") { # Assume that punctuation has been normalized and is one of `, ``, ', '' only
+      # opening single quote: convert to straight quote and right-shift
+      $text = $text.$prependSpace."\'";
+      $prependSpace = "";
+    } elsif ($words[$i] eq "``") {
+      # opening double quote: convert to straight quote and right-shift
+      $text = $text.$prependSpace."\"";
+      $prependSpace = "";
+    } elsif ($words[$i] eq "\'") {
+      # closing single quote: convert to straight quote and left shift
+      $text = $text."\'";
+      $prependSpace = " ";
+    } elsif ($words[$i] eq "\'\'") {
+      # closing double quote: convert to straight quote and left shift
+      $text = $text."\"";
+      $prependSpace = " ";
+    } else {
+      # ordinary word: separated by the pending space
+      $text = $text.$prependSpace.$words[$i];
+      $prependSpace = " ";
+    }
+  }
+
+  # clean up spaces at head and tail of each line as well as any double-spacing
+  $text =~ s/ +/ /g;
+  $text =~ s/\n /\n/g;
+  $text =~ s/ \n/\n/g;
+  $text =~ s/^ //g;
+  $text =~ s/ $//g;
+
+  # add trailing break
+  $text .= "\n" unless $text =~ /\n$/;
+
+  # with -u: uppercase the first letter, skipping leading punctuation/space
+  $text =~ s/^([[:punct:]\s]*)([[:alpha:]])/ucsecondarg($1, $2)/e if $UPPERCASE_SENT;
+
+  return $text;
+}
+
+sub startsWithCJKChar {
+    # Report whether the first character of the string is a CJK character;
+    # an empty string yields 0.
+    my ($str) = @_;
+    if (length($str) == 0) {
+        return 0;
+    }
+    return &charIsCJK(substr($str, 0, 1));
+}
+
+sub endsWithCJKChar {
+    # Report whether the last character of the string is a CJK character;
+    # an empty string yields 0.
+    my ($str) = @_;
+    if (length($str) == 0) {
+        return 0;
+    }
+    # a negative offset counts from the end of the string
+    return &charIsCJK(substr($str, -1, 1));
+}
+
+# Given a string consisting of one character, returns true iff the character
+# is a CJK (Chinese/Japanese/Korean) character
+sub charIsCJK {
+    my ($char) = @_;
+    # $char should be a string of length 1
+    my $codepoint = &codepoint_dec($char);
+
+    # Inclusive code point ranges (hexadecimal) that count as CJK.
+    # The following is based on http://en.wikipedia.org/wiki/Basic_Multilingual_Plane#Basic_Multilingual_Plane
+    my @cjk_ranges = (
+        ['1100',  '11FF'],   # Hangul Jamo
+        # One contiguous span from CJK Radicals Supplement (2E80) to
+        # Yi Radicals (A4CF): Kangxi Radicals, Ideographic Description
+        # Characters, CJK Symbols and Punctuation, Hiragana, Katakana,
+        # Bopomofo, Hangul Compatibility Jamo, Kanbun, Bopomofo Extended,
+        # CJK Strokes, Katakana Phonetic Extensions, Enclosed CJK Letters
+        # and Months, CJK Compatibility, CJK Unified Ideographs Extension A,
+        # Yijing Hexagram Symbols, CJK Unified Ideographs, Yi Syllables
+        ['2E80',  'A4CF'],
+        ['A840',  'A87F'],   # Phags-pa
+        ['AC00',  'D7AF'],   # Hangul Syllables
+        ['F900',  'FAFF'],   # CJK Compatibility Ideographs
+        ['FE30',  'FE4F'],   # CJK Compatibility Forms
+        ['FF65',  'FFDC'],   # halfwidth forms of Katakana and Hangul characters
+        ['20000', '2FFFF'],  # Supplementary Ideographic Plane
+    );
+
+    foreach my $range (@cjk_ranges) {
+        return 1 if (&between_hexes($codepoint, $range->[0], $range->[1]));
+    }
+
+    return 0;
+}
+
+# Returns the code point of a Unicode char, represented as a decimal number.
+# Returns nothing (undef in scalar context) when no character is given.
+sub codepoint_dec {
+    my ($char) = @_;
+    # Test for definedness and length instead of truthiness: the character
+    # "0" is a false value in Perl but still has a code point (48).
+    return unless defined($char) && length($char);
+    return unpack('U0U*', $char);
+}
+
+sub between_hexes {
+    # True when $num lies inside the inclusive range whose two bounds are
+    # given as hexadecimal strings.
+    my ($num, $left, $right) = @_;
+    my ($low, $high) = (hex($left), hex($right));
+    return ($num >= $low && $num <= $high);
+}
--- a/egs/wmt20/mt/local/lower_rm.py
+++ b/egs/wmt20/mt/local/lower_rm.py
+import sys
+import string
+
+
+in_file = sys.argv[1]
+
+with open(in_file, "r", encoding="utf-8") as f:
+    for line in f.readlines():
+        line = line.strip().lower()
+        for w in string.punctuation:
+            line = line.replace(w, "")
+        line = line.replace("  ", "")
+        print(line)
+
--- a/egs/wmt20/mt/local/monitor.sh
+++ b/egs/wmt20/mt/local/monitor.sh
+gpu_num=4
+cmd="sh train.sh"
+
+while :
+do
+    record=$(mktemp -t temp.record.XXXXXX)
+    gpustat > $record
+    all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
+
+    count=0
+    for dev in ${all_devices[@]}
+    do
+        line=$((dev + 2))
+        use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
+
+        if [[ $use -lt 100 ]]; then
+            device[$count]=$dev
+            count=$((count + 1))
+            if [[ $count -eq $gpu_num ]]; then
+                break
+            fi
+        fi
+    done
+    if [[ ${#device[@]} -lt $gpu_num ]]; then
+        sleep 60s
+    else
+        echo "Run $cmd"
+        eval $cmd
+        sleep 10s
+        exit
+    fi
+done
--- a/egs/wmt20/mt/local/multi-bleu.perl
+++ b/egs/wmt20/mt/local/multi-bleu.perl
+#!/usr/bin/env perl
+#
+# This file is part of moses.  Its use is licensed under the GNU Lesser General
+# Public License version 2.1 or, at your option, any later version.
+
+# $Id$
+use warnings;
+use strict;
+
+# Optional leading "-lc" switch: score case-insensitively.
+my $lowercase = 0;
+# Make sure an argument exists before comparing it, so that a bare
+# invocation reaches the usage message below without an
+# "uninitialized value" warning.
+if (@ARGV && $ARGV[0] eq "-lc") {
+  $lowercase = 1;
+  shift;
+}
+
+# First remaining argument: the reference file name (or name stem).
+my $stem = $ARGV[0];
+if (!defined $stem) {
+  print STDERR "usage: multi-bleu.pl [-lc] reference < hypothesis\n";
+  print STDERR "Reads the references from reference or reference0, reference1, ...\n";
+  exit(1);
+}
+
+# Fall back to "<stem>.ref" when only "<stem>.ref0" exists on disk.
+$stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0";
+
+# $REF[$s] collects every reference translation of sentence number $s.
+my @REF;
+my $ref=0;
+# Numbered references: <stem>0, <stem>1, ... until one is missing.
+while(-e "$stem$ref") {
+    &add_to_ref("$stem$ref",\@REF);
+    $ref++;
+}
+# The un-numbered file itself, when present.
+&add_to_ref($stem,\@REF) if -e $stem;
+die("ERROR: could not find reference file $stem") unless scalar @REF;
+
+# add additional references explicitly specified on the command line
+shift;
+foreach my $stem (@ARGV) {
+    &add_to_ref($stem,\@REF) if -e $stem;
+}
+
+
+
+# Append the lines of one reference file to the per-sentence reference lists.
+#   $file - path of the reference file; names ending in ".gz" are read
+#           through "gzip -dc"
+#   $REF  - array reference; line number $s of the file is pushed onto
+#           the list stored in $$REF[$s]
+# Dies when the file cannot be opened.
+sub add_to_ref {
+    my ($file,$REF) = @_;
+    my $s=0;
+    my $fh;
+    if ($file =~ /\.gz$/) {
+	# list-form pipe open: the file name is never interpreted by a shell
+	open($fh, "-|", "gzip", "-dc", $file) or die "Can't read $file";
+    } else {
+	# three-argument open: the file name cannot be taken for a mode
+	open($fh, "<", $file) or die "Can't read $file";
+    }
+    while(<$fh>) {
+	# chomp only strips a trailing newline, so a final line without one
+	# keeps its last character
+	chomp;
+	push @{$$REF[$s++]}, $_;
+    }
+    close($fh);
+}
+
+# Accumulate the n-gram statistics over all hypothesis lines read from STDIN.
+#   $CORRECT[$n] - hypothesis n-grams of order $n also found in a reference
+#                  (clipped to the highest count seen in any one reference)
+#   $TOTAL[$n]   - hypothesis n-grams of order $n
+#   $length_translation / $length_reference - summed hypothesis length and
+#                  summed length of the closest reference per sentence
+my(@CORRECT,@TOTAL,$length_translation,$length_reference);
+my $s=0;
+while(<STDIN>) {
+    # chomp only strips a trailing newline, so a final line without one
+    # keeps its last character
+    chomp;
+    $_ = lc if $lowercase;
+    my @WORD = split;
+    my %REF_NGRAM = ();
+    my $length_translation_this_sentence = scalar(@WORD);
+    my ($closest_diff,$closest_length) = (9999,9999);
+    foreach my $reference (@{$REF[$s]}) {
+#      print "$s $_ <=> $reference\n";
+  $reference = lc($reference) if $lowercase;
+	# this inner @WORD (reference tokens) hides the hypothesis @WORD
+	# until the end of the foreach body
+	my @WORD = split(' ',$reference);
+	my $length = scalar(@WORD);
+        my $diff = abs($length_translation_this_sentence-$length);
+	if ($diff < $closest_diff) {
+	    $closest_diff = $diff;
+	    $closest_length = $length;
+	    # print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)." = abs($length_translation_this_sentence-$length), setting len: $closest_length\n";
+	} elsif ($diff == $closest_diff) {
+            $closest_length = $length if $length < $closest_length;
+            # from two references with the same closeness to me
+            # take the *shorter* into account, not the "first" one.
+        }
+	# count the n-grams of this reference; each key is "<n> w1 ... wn"
+	for(my $n=1;$n<=4;$n++) {
+	    my %REF_NGRAM_N = ();
+	    for(my $start=0;$start<=$#WORD-($n-1);$start++) {
+		my $ngram = "$n";
+		for(my $w=0;$w<$n;$w++) {
+		    $ngram .= " ".$WORD[$start+$w];
+		}
+		$REF_NGRAM_N{$ngram}++;
+	    }
+	    # keep, per n-gram, the highest count over all references
+	    foreach my $ngram (keys %REF_NGRAM_N) {
+		if (!defined($REF_NGRAM{$ngram}) ||
+		    $REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) {
+		    $REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram};
+#	    print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}<BR>\n";
+		}
+	    }
+	}
+    }
+    $length_translation += $length_translation_this_sentence;
+    $length_reference += $closest_length;
+    # count the hypothesis n-grams and clip them against the references
+    for(my $n=1;$n<=4;$n++) {
+	my %T_NGRAM = ();
+	for(my $start=0;$start<=$#WORD-($n-1);$start++) {
+	    my $ngram = "$n";
+	    for(my $w=0;$w<$n;$w++) {
+		$ngram .= " ".$WORD[$start+$w];
+	    }
+	    $T_NGRAM{$ngram}++;
+	}
+	foreach my $ngram (keys %T_NGRAM) {
+	    # recover the order from the "<n> " prefix of the key
+	    $ngram =~ /^(\d+) /;
+	    my $n = $1;
+            # my $corr = 0;
+#	print "$i e $ngram $T_NGRAM{$ngram}<BR>\n";
+	    $TOTAL[$n] += $T_NGRAM{$ngram};
+	    if (defined($REF_NGRAM{$ngram})) {
+		if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) {
+		    $CORRECT[$n] += $T_NGRAM{$ngram};
+                    # $corr =  $T_NGRAM{$ngram};
+#	    print "$i e correct1 $T_NGRAM{$ngram}<BR>\n";
+		}
+		else {
+		    $CORRECT[$n] += $REF_NGRAM{$ngram};
+                    # $corr =  $REF_NGRAM{$ngram};
+#	    print "$i e correct2 $REF_NGRAM{$ngram}<BR>\n";
+		}
+	    }
+            # $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram};
+            # print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n"
+	}
+    }
+    $s++;
+}
+# Turn the accumulated counts into the final score and print it.
+my $brevity_penalty = 1;
+my $bleu = 0;
+
+# $bleu[$n]: precision of the n-grams of order $n
+my @bleu=();
+
+for(my $n=1;$n<=4;$n++) {
+  if (defined ($TOTAL[$n])){
+    # $CORRECT[$n] stays undefined when no n-gram of this order matched;
+    # count that as 0 matches
+    $bleu[$n]=($TOTAL[$n])?($CORRECT[$n] || 0)/$TOTAL[$n]:0;
+    # print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n";
+  }else{
+    $bleu[$n]=0;
+  }
+}
+
+# Covers both a reference length of 0 and the case where no hypothesis line
+# was read at all (the length is then still undefined).
+if (!$length_reference){
+  printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n";
+  exit(1);
+}
+
+if (!$length_translation) {
+  # The hypothesis has no tokens at all: the ratio below would divide by
+  # zero, so use a brevity penalty of 0 (the score is 0).
+  $brevity_penalty = 0;
+} elsif ($length_translation<$length_reference) {
+  $brevity_penalty = exp(1-$length_reference/$length_translation);
+}
+# geometric mean of the four precisions, scaled by the brevity penalty
+$bleu = $brevity_penalty * exp((my_log( $bleu[1] ) +
+				my_log( $bleu[2] ) +
+				my_log( $bleu[3] ) +
+				my_log( $bleu[4] ) ) / 4) ;
+printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n",
+    100*$bleu,
+    100*$bleu[1],
+    100*$bleu[2],
+    100*$bleu[3],
+    100*$bleu[4],
+    $brevity_penalty,
+    $length_translation / $length_reference,
+    $length_translation,
+    $length_reference;
+
+sub my_log {
+  # Natural logarithm that yields a large negative constant instead of
+  # failing when the argument is zero (or otherwise false).
+  my ($value) = @_;
+  return $value ? log($value) : -9999999999;
+}
--- a/egs/wmt20/mt/local/parse_options.sh
+++ b/egs/wmt20/mt/local/parse_options.sh
+#!/usr/bin/env bash
+
+# Copyright 2012  Johns Hopkins University (Author: Daniel Povey);
+#                 Arnab Ghoshal, Karel Vesely
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Parse command-line options.
+# To be sourced by another script (as in ". parse_options.sh").
+# Option format is: --option-name arg
+# and shell variable "option_name" gets set to value "arg."
+# The exception is --help, which takes no arguments, but prints the
+# $help_message variable (if defined).
+
+
+###
+### The --config file options have lower priority to command line
+### options, so we need to import them first...
+###
+
+# Now import all the configs specified by command-line, in left-to-right order
+for ((argpos=1; argpos<$#; argpos++)); do
+  if [ "${!argpos}" == "--config" ]; then
+    argpos_plus1=$((argpos+1))
+    config=${!argpos_plus1}
+    [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
+    . $config  # source the config file.
+  fi
+done
+
+
+###
+### Now we process the command line options
+###
+# Consume leading "--name value" pairs from the positional parameters and
+# stop at the first argument that is empty or does not start with "--".
+while true; do
+  [ -z "${1:-}" ] && break;  # break if there are no arguments
+  case "$1" in
+    # If the enclosing script is called with --help option, print the help
+    # message and exit.  Scripts should put help messages in $help_message
+    --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
+      else printf "$help_message\n" 1>&2 ; fi;
+      exit 0 ;;
+    # The "--name=value" form is rejected explicitly.
+    --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
+      exit 1 ;;
+    # If the first command-line argument begins with "--" (e.g. --foo-bar),
+    # then work out the variable name as $name, which will equal "foo_bar".
+    --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
+      # Next we test whether the variable in question is undefined-- if so it's
+      # an invalid option and we die.  Note: $0 evaluates to the name of the
+      # enclosing script.
+      # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
+      # is undefined.  We then have to wrap this test inside "eval" because
+      # foo_bar is itself inside a variable ($name).
+      eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
+
+      # Remember the current value; it is used below to tell whether the
+      # option is Boolean.
+      oldval="`eval echo \\$$name`";
+      # Work out whether we seem to be expecting a Boolean argument.
+      if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
+        was_bool=true;
+      else
+        was_bool=false;
+      fi
+
+      # Set the variable to the right value-- the escaped quotes make it work if
+      # the option had spaces, like --cmd "queue.pl -sync y"
+      eval $name=\"$2\";
+
+      # Check that Boolean-valued arguments are really Boolean.
+      if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
+        echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
+        exit 1;
+      fi
+      # Drop the option name and its value.
+      shift 2;
+      ;;
+  # Anything else ends option processing and stays in the positional parameters.
+  *) break;
+  esac
+done
+
+
+# Check for an empty argument to the --cmd option, which can easily occur as a
+# result of scripting errors.
+[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
+
+
+true; # so this script returns exit code 0.
--- a/egs/wmt20/mt/local/replace-unicode-punctuation.perl
+++ b/egs/wmt20/mt/local/replace-unicode-punctuation.perl
+#!/usr/bin/env perl
+#
+# This file is part of moses.  Its use is licensed under the GNU Lesser General
+# Public License version 2.1 or, at your option, any later version.
+
+use warnings;
+use strict;
+
+#binmode(STDIN, ":utf8");
+#binmode(STDOUT, ":utf8");
+
+# Replace full-width and other Unicode punctuation and digits by their ASCII
+# counterparts, line by line from STDIN to STDOUT.  With the binmode calls
+# above commented out, the patterns are matched against the raw bytes.
+while(<STDIN>) {
+  s/，/,/g;
+  s/。 */. /g;   # also swallows any spaces that follow the full stop
+  s/、/,/g;
+  s/”/"/g;
+  s/“/"/g;
+  s/∶/:/g;
+  s/：/:/g;
+  s/？/\?/g;
+  s/《/"/g;
+  s/》/"/g;
+  s/）/\)/g;
+  s/！/\!/g;
+  s/（/\(/g;
+  s/；/;/g;
+  # full-width digit one becomes ASCII "1", like every other full-width
+  # digit below (it used to be turned into a double quote)
+  s/１/1/g;
+  s/」/"/g;
+  s/「/"/g;
+  s/０/0/g;
+  s/３/3/g;
+  s/２/2/g;
+  s/５/5/g;
+  s/６/6/g;
+  s/９/9/g;
+  s/７/7/g;
+  s/８/8/g;
+  s/４/4/g;
+  s/． */. /g;   # also swallows any spaces that follow the full stop
+  s/～/\~/g;
+  s/’/\'/g;
+  s/…/\.\.\./g;
+  s/━/\-/g;
+  s/〈/\</g;
+  s/〉/\>/g;
+  s/【/\[/g;
+  s/】/\]/g;
+  s/％/\%/g;
+  print $_;
+}
--- a/egs/wmt20/mt/local/tokenizer.perl
+++ b/egs/wmt20/mt/local/tokenizer.perl
+#!/usr/bin/env perl
+#
+# This file is part of moses.  Its use is licensed under the GNU Lesser General
+# Public License version 2.1 or, at your option, any later version.
+
+use warnings;
+
+# Sample Tokenizer
+### Version 1.1
+# written by Pidong Wang, based on the code written by Josh Schroeder and Philipp Koehn
+# Version 1.1 updates:
+#       (1) add multithreading option "-threads NUM_THREADS" (default is 1);
+#       (2) add a timing option "-time" to calculate the average speed of this tokenizer;
+#       (3) add an option "-lines NUM_SENTENCES_PER_THREAD" to set the number of lines for each thread (default is 2000), and this option controls the memory amount needed: the larger this number is, the larger memory is required (the higher tokenization speed);
+### Version 1.0
+# $Id: tokenizer.perl 915 2009-08-10 08:15:49Z philipp $
+# written by Josh Schroeder, based on code by Philipp Koehn
+
+# Read and write the standard streams as UTF-8.
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+
+use warnings;
+use FindBin qw($RealBin);
+use strict;
+use Time::HiRes;
+
+# The Thread module is optional: import it only when it can be loaded.
+if  (eval {require Thread;1;}) {
+  #module loaded
+  Thread->import();
+}
+
+# Directory, next to this script, holding the non-breaking prefix files.
+my $mydir = "$RealBin/nonbreaking_prefixes";
+
+# Global state and defaults for the command-line options parsed below.
+my %NONBREAKING_PREFIX = ();
+my @protected_patterns = ();
+my $protected_patterns_file = "";
+my $language = "en";
+my $QUIET = 0;
+my $HELP = 0;
+my $AGGRESSIVE = 0;
+my $SKIP_XML = 0;
+my $TIMING = 0;
+my $NUM_THREADS = 1;
+my $NUM_SENTENCES_PER_THREAD = 2000;
+my $PENN = 0;
+my $NO_ESCAPING = 0;
+# Consume the arguments one by one; arguments matching none of the patterns
+# are silently dropped.
+while (@ARGV)
+{
+	$_ = shift;
+	/^-b$/ && ($| = 1, next);
+	/^-l$/ && ($language = shift, next);
+	/^-q$/ && ($QUIET = 1, next);
+	/^-h$/ && ($HELP = 1, next);
+	/^-x$/ && ($SKIP_XML = 1, next);
+	/^-a$/ && ($AGGRESSIVE = 1, next);
+	/^-time$/ && ($TIMING = 1, next);
+  # Option to add list of regexps to be protected
+  # (the pattern has no "$" anchor, so any argument starting with
+  # "-protected" is accepted)
+  /^-protected/ && ($protected_patterns_file = shift, next);
+	/^-threads$/ && ($NUM_THREADS = int(shift), next);
+	/^-lines$/ && ($NUM_SENTENCES_PER_THREAD = int(shift), next);
+	/^-penn$/ && ($PENN = 1, next);
+	# (no "$" anchor here either: any argument starting with "-no-escape")
+	/^-no-escape/ && ($NO_ESCAPING = 1, next);
+}
+
+# for time calculation
+my $start_time;
+if ($TIMING)
+{
+    $start_time = [ Time::HiRes::gettimeofday( ) ];
+}
+
+# print help message
+if ($HELP)
+{
+	print "Usage ./tokenizer.perl (-l [en|de|...]) (-threads 4) < textfile > tokenizedfile\n";
+        print "Options:\n";
+        print "  -q     ... quiet.\n";
+        print "  -a     ... aggressive hyphen splitting.\n";
+        print "  -b     ... disable Perl buffering.\n";
+        print "  -time  ... enable processing time calculation.\n";
+        print "  -penn  ... use Penn treebank-like tokenization.\n";
+        print "  -protected FILE  ... specify file with patters to be protected in tokenisation.\n";
+	print "  -no-escape ... don't perform HTML escaping on apostrophy, quotes, etc.\n";
+	exit;
+}
+
+# Banner on STDERR unless -q was given.
+if (!$QUIET)
+{
+	print STDERR "Tokenizer Version 1.1\n";
+	print STDERR "Language: $language\n";
+	print STDERR "Number of threads: $NUM_THREADS\n";
+}
+
+# load the language-specific non-breaking prefix info from files in the directory nonbreaking_prefixes
+# (load_prefixes is not defined in this part of the file; presumably it
+# fills %NONBREAKING_PREFIX -- verify against its definition)
+load_prefixes($language,\%NONBREAKING_PREFIX);
+
+# Warn when the prefix hash is still empty after loading.
+if (scalar(%NONBREAKING_PREFIX) eq 0)
+{
+	print STDERR "Warning: No known abbreviations for language '$language'\n";
+}
+
+# Load protected patterns
+# (one regular expression per line of the file)
+if ($protected_patterns_file)
+{
+  open(PP,$protected_patterns_file) || die "Unable to open $protected_patterns_file";
+  while(<PP>) {
+    chomp;
+    push @protected_patterns, $_;
+  }
+}
+
+# Main processing: tokenize STDIN to STDOUT, with several threads when
+# -threads is greater than 1, otherwise line by line.
+my @batch_sentences = ();
+my @thread_list = ();
+# number of input lines read; used for the -time speed report
+my $count_sentences = 0;
+
+if ($NUM_THREADS > 1)
+{# multi-threading tokenization
+    while(<STDIN>)
+    {
+        $count_sentences = $count_sentences + 1;
+        push(@batch_sentences, $_);
+        # a full batch holds NUM_SENTENCES_PER_THREAD lines for every thread
+        if (scalar(@batch_sentences)>=($NUM_SENTENCES_PER_THREAD*$NUM_THREADS))
+        {
+            # assign each thread work
+            for (my $i=0; $i<$NUM_THREADS; $i++)
+            {
+                my $start_index = $i*$NUM_SENTENCES_PER_THREAD;
+                my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1;
+                my @subbatch_sentences = @batch_sentences[$start_index..$end_index];
+                my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences;
+                push(@thread_list, $new_thread);
+            }
+            # join the threads in creation order so the output keeps the input order
+            foreach (@thread_list)
+            {
+                my $tokenized_list = $_->join;
+                foreach (@$tokenized_list)
+                {
+                    print $_;
+                }
+            }
+            # reset for the new run
+            @thread_list = ();
+            @batch_sentences = ();
+        }
+    }
+    # the last batch
+    if (scalar(@batch_sentences)>0)
+    {
+        # assign each thread work
+        for (my $i=0; $i<$NUM_THREADS; $i++)
+        {
+            my $start_index = $i*$NUM_SENTENCES_PER_THREAD;
+            # no lines left for this and the following threads
+            if ($start_index >= scalar(@batch_sentences))
+            {
+                last;
+            }
+            my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1;
+            # the final slice may be shorter than a full share
+            if ($end_index >= scalar(@batch_sentences))
+            {
+                $end_index = scalar(@batch_sentences)-1;
+            }
+            my @subbatch_sentences = @batch_sentences[$start_index..$end_index];
+            my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences;
+            push(@thread_list, $new_thread);
+        }
+        foreach (@thread_list)
+        {
+            my $tokenized_list = $_->join;
+            foreach (@$tokenized_list)
+            {
+                print $_;
+            }
+        }
+    }
+}
+else
+{# single thread only
+    while(<STDIN>)
+    {
+        # count the line here as well, so that the -time report below has
+        # a line count in single-thread mode too
+        $count_sentences = $count_sentences + 1;
+        if (($SKIP_XML && /^<.+>$/) || /^\s*$/)
+        {
+            #don't try to tokenize XML/HTML tag lines
+            print $_;
+        }
+        else
+        {
+            print &tokenize($_);
+        }
+    }
+}
+
+if ($TIMING)
+{
+    my $duration = Time::HiRes::tv_interval( $start_time );
+    print STDERR ("TOTAL EXECUTION TIME: ".$duration."\n");
+    # the per-line speed is only defined when at least one line was read
+    if ($count_sentences > 0)
+    {
+        print STDERR ("TOKENIZATION SPEED: ".($duration/$count_sentences*1000)." milliseconds/line\n");
+    }
+}
+
+#####################################################################################
+# subroutines afterward
+
+# tokenize a batch of texts saved in an array
+# input: an array containing a batch of texts
+# return: a reference to another array holding, in the same order, the
+#         tokenized text for every element of the input array
+sub tokenize_batch
+{
+    my(@text_list) = @_;
+    my(@tokenized_list) = ();
+    foreach (@text_list)
+    {
+        # XML/HTML tag lines (only with -x) and blank lines are copied through untouched
+        my $passthrough = ($SKIP_XML && /^<.+>$/) || /^\s*$/;
+        push(@tokenized_list, $passthrough ? $_ : &tokenize($_));
+    }
+    return \@tokenized_list;
+}
+
+# the actual tokenize function which tokenizes one input string
+# input: one string
+# return: the tokenized string for the input string
+sub tokenize
+{
+    my($text) = @_;
+
+    if ($PENN) {
+      return tokenize_penn($text);
+    }
+
+    chomp($text);
+    $text = " $text ";
+
+    # remove ASCII junk
+    $text =~ s/\s+/ /g;
+    $text =~ s/[\000-\037]//g;
+
+    # Find protected patterns
+    my @protected = ();
+    foreach my $protected_pattern (@protected_patterns) {
+      my $t = $text;
+      while ($t =~ /(?<PATTERN>$protected_pattern)(?<TAIL>.*)$/) {
+        push @protected, $+{PATTERN};
+        $t = $+{TAIL};
+      }
+    }
+
+    for (my $i = 0; $i < scalar(@protected); ++$i) {
+      my $subst = sprintf("THISISPROTECTED%.3d", $i);
+      $text =~ s,\Q$protected[$i], $subst ,g;
+    }
+    $text =~ s/ +/ /g;
+    $text =~ s/^ //g;
+    $text =~ s/ $//g;
+
+    # separate out all "other" special characters
+    if (($language eq "fi") or ($language eq "sv")) {
+        # in Finnish and Swedish, the colon can be used inside words as an apostrophe-like character:
+        # USA:n, 20:een, EU:ssa, USA:s, S:t
+        $text =~ s/([^\p{IsAlnum}\s\.\:\'\`\,\-])/ $1 /g;
+        # if a colon is not immediately followed by lower-case characters, separate it out anyway
+        $text =~ s/(:)(?=$|[^\p{Ll}])/ $1 /g;
+    }
+    else {
+        $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
+    }
+
+    # aggressive hyphen splitting
+    if ($AGGRESSIVE)
+    {
+        $text =~ s/([\p{IsAlnum}])\-(?=[\p{IsAlnum}])/$1 \@-\@ /g;
+    }
+
+    #multi-dots stay together
+    $text =~ s/\.([\.]+)/ DOTMULTI$1/g;
+    while($text =~ /DOTMULTI\./)
+    {
+        $text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g;
+        $text =~ s/DOTMULTI\./DOTDOTMULTI/g;
+    }
+
+    # seperate out "," except if within numbers (5,300)
+    #$text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+
+    # separate out "," except if within numbers (5,300)
+    # previous "global" application skips some:  A,B,C,D,E > A , B,C , D,E
+    # first application uses up B so rule can't see B,C
+    # two-step version here may create extra spaces but these are removed later
+    # will also space digit,letter or letter,digit forms (redundant with next section)
+    $text =~ s/([^\p{IsN}])[,]/$1 , /g;
+    $text =~ s/[,]([^\p{IsN}])/ , $1/g;
+    
+    # separate "," after a number if it's the end of a sentence
+    $text =~ s/([\p{IsN}])[,]$/$1 ,/g;
+
+    # separate , pre and post number
+    #$text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+    #$text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;
+
+    # turn `into '
+    #$text =~ s/\`/\'/g;
+
+    #turn '' into "
+    #$text =~ s/\'\'/ \" /g;
+
+    if ($language eq "en")
+    {
+        #split contractions right
+        $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+        $text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g;
+        $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+        $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g;
+        #special case for "1990's"
+        $text =~ s/([\p{IsN}])[']([s])/$1 '$2/g;
+    }
+    elsif (($language eq "fr") or ($language eq "it") or ($language eq "ga"))
+    {
+        #split contractions left
+        $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+        $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g;
+        $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+        $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g;
+    }
+    else
+    {
+        $text =~ s/\'/ \' /g;
+    }
+
+    #word token method
+    my @words = split(/\s/,$text);
+    $text = "";
+    for (my $i=0;$i<(scalar(@words));$i++)
+    {
+        my $word = $words[$i];
+        if ( $word =~ /^(\S+)\.$/)
+        {
+            my $pre = $1;
+            if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/)))
+            {
+                #no change
+			}
+            elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/)))
+            {
+                #no change
+            }
+            else
+            {
+                $word = $pre." .";
+            }
+        }
+        $text .= $word." ";
+    }
+
+    # clean up extraneous spaces
+    $text =~ s/ +/ /g;
+    $text =~ s/^ //g;
+    $text =~ s/ $//g;
+
+    # .' at end of sentence is missed
+    $text =~ s/\.\' ?$/ . ' /;
+
+    # restore protected
+    for (my $i = 0; $i < scalar(@protected); ++$i) {
+      my $subst = sprintf("THISISPROTECTED%.3d", $i);
+      $text =~ s/$subst/$protected[$i]/g;
+    }
+
+    #restore multi-dots
+    while($text =~ /DOTDOTMULTI/)
+    {
+        $text =~ s/DOTDOTMULTI/DOTMULTI./g;
+    }
+    $text =~ s/DOTMULTI/./g;
+
+    #escape special chars
+    if (!$NO_ESCAPING)
+      {
+	$text =~ s/\&/\&amp;/g;   # escape escape
+	$text =~ s/\|/\&#124;/g;  # factor separator
+	$text =~ s/\</\&lt;/g;    # xml
+	$text =~ s/\>/\&gt;/g;    # xml
+	$text =~ s/\'/\&apos;/g;  # xml
+	$text =~ s/\"/\&quot;/g;  # xml
+	$text =~ s/\[/\&#91;/g;   # syntax non-terminal
+	$text =~ s/\]/\&#93;/g;   # syntax non-terminal
+      }
+
+    #ensure final line break
+    $text .= "\n" unless $text =~ /\n$/;
+
+    return $text;
+}
+
+sub tokenize_penn
+{
+    # Improved compatibility with Penn Treebank tokenization.  Useful if
+    # the text is to later be parsed with a PTB-trained parser.
+    #
+    # Adapted from Robert MacIntyre's sed script:
+    #   http://www.cis.upenn.edu/~treebank/tokenizer.sed
+    #
+    # Input:  one line of text (first argument); a trailing newline is chomped.
+    # Output: the tokenized line, always terminated by "\n".
+    # Reads the file-level %NONBREAKING_PREFIX hash to decide whether a
+    # token-final period belongs to an abbreviation.
+    # Note: the XML/factor escaping at the end of this sub is applied
+    # unconditionally; it is not guarded by $NO_ESCAPING.
+
+    my($text) = @_;
+    chomp($text);
+
+    # remove ASCII junk
+    $text =~ s/\s+/ /g;
+    $text =~ s/[\000-\037]//g;
+
+    # attempt to get correct directional quotes
+    # (line-initial quotes first, then quotes following a space or an
+    # opening bracket; doubled spaces are collapsed by the clean-up below)
+    $text =~ s/^``/`` /g;
+    $text =~ s/^"/`` /g;
+    $text =~ s/^`([^`])/` $1/g;
+    $text =~ s/^'/`  /g;
+    $text =~ s/([ ([{<])"/$1 `` /g;
+    $text =~ s/([ ([{<])``/$1 `` /g;
+    $text =~ s/([ ([{<])`([^`])/$1 ` $2/g;
+    $text =~ s/([ ([{<])'/$1 ` /g;
+    # close quotes handled at end
+
+    # protect "..." with a placeholder so the period handling below leaves
+    # it alone; it is restored after the word loop
+    $text =~ s=\.\.\.= _ELLIPSIS_ =g;
+
+    # separate out "," except if within numbers (5,300)
+    $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+    # separate , pre and post number
+    $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+    $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;
+
+    # space out ; : @ # $ % & plus currency (Sc) and other (So) symbols;
+    # the commented-out line is the older variant without \p{IsSo}
+    #$text =~ s=([;:@#\$%&\p{IsSc}])= $1 =g;
+$text =~ s=([;:@#\$%&\p{IsSc}\p{IsSo}])= $1 =g;
+
+    # Separate out intra-token slashes.  PTB tokenization doesn't do this, so
+    # the tokens should be merged prior to parsing with a PTB-trained parser
+    # (see syntax-hyphen-splitting.perl).
+    $text =~ s/([\p{IsAlnum}])\/([\p{IsAlnum}])/$1 \@\/\@ $2/g;
+
+    # Assume sentence tokenization has been done first, so split FINAL periods
+    # only.
+    $text =~ s=([^.])([.])([\]\)}>"']*) ?$=$1 $2$3 =g;
+    # however, we may as well split ALL question marks and exclamation points,
+    # since they shouldn't have the abbrev.-marker ambiguity problem
+    $text =~ s=([?!])= $1 =g;
+
+    # parentheses, brackets, etc.
+    # (first spaced out, then replaced by their PTB symbolic names)
+    $text =~ s=([\]\[\(\){}<>])= $1 =g;
+    $text =~ s/\(/-LRB-/g;
+    $text =~ s/\)/-RRB-/g;
+    $text =~ s/\[/-LSB-/g;
+    $text =~ s/\]/-RSB-/g;
+    $text =~ s/{/-LCB-/g;
+    $text =~ s/}/-RCB-/g;
+
+    $text =~ s=--= -- =g;
+
+    # First off, add a space to the beginning and end of each line, to reduce
+    # necessary number of regexps.
+    $text =~ s=$= =;
+    $text =~ s=^= =;
+
+    # any remaining double quote is treated as a closing quote
+    $text =~ s="= '' =g;
+    # possessive or close-single-quote
+    $text =~ s=([^'])' =$1 ' =g;
+    # as in it's, I'm, we'd
+    $text =~ s='([sSmMdD]) = '$1 =g;
+    $text =~ s='ll = 'll =g;
+    $text =~ s='re = 're =g;
+    $text =~ s='ve = 've =g;
+    $text =~ s=n't = n't =g;
+    $text =~ s='LL = 'LL =g;
+    $text =~ s='RE = 'RE =g;
+    $text =~ s='VE = 'VE =g;
+    $text =~ s=N'T = N'T =g;
+
+    # split a fixed list of fused forms (cannot, gimme, gonna, ...) in two
+    $text =~ s= ([Cc])annot = $1an not =g;
+    $text =~ s= ([Dd])'ye = $1' ye =g;
+    $text =~ s= ([Gg])imme = $1im me =g;
+    $text =~ s= ([Gg])onna = $1on na =g;
+    $text =~ s= ([Gg])otta = $1ot ta =g;
+    $text =~ s= ([Ll])emme = $1em me =g;
+    $text =~ s= ([Mm])ore'n = $1ore 'n =g;
+    $text =~ s= '([Tt])is = '$1 is =g;
+    $text =~ s= '([Tt])was = '$1 was =g;
+    $text =~ s= ([Ww])anna = $1an na =g;
+
+    #word token method
+    # For every token ending in ".", decide whether the period stays attached.
+    my @words = split(/\s/,$text);
+    $text = "";
+    for (my $i=0;$i<(scalar(@words));$i++)
+    {
+        my $word = $words[$i];
+        if ( $word =~ /^(\S+)\.$/)
+        {
+            my $pre = $1;
+            # keep the period when the token has an inner period plus a
+            # letter, is a prefix flagged 1, or the next token starts
+            # with a lower-case letter
+            if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/)))
+            {
+                #no change
+            }
+            # prefixes flagged 2 keep the period only before a token that
+            # starts with a digit
+            elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/)))
+            {
+                #no change
+            }
+            else
+            {
+                $word = $pre." .";
+            }
+        }
+        $text .= $word." ";
+    }
+
+    # restore ellipses
+    $text =~ s=_ELLIPSIS_=\.\.\.=g;
+
+    # clean out extra spaces
+    $text =~ s=  *= =g;
+    $text =~ s=^ *==g;
+    $text =~ s= *$==g;
+
+    #escape special chars
+    # ("&" must be escaped first so the entities added below are not re-escaped)
+    $text =~ s/\&/\&amp;/g;   # escape escape
+    $text =~ s/\|/\&#124;/g;  # factor separator
+    $text =~ s/\</\&lt;/g;    # xml
+    $text =~ s/\>/\&gt;/g;    # xml
+    $text =~ s/\'/\&apos;/g;  # xml
+    $text =~ s/\"/\&quot;/g;  # xml
+    $text =~ s/\[/\&#91;/g;   # syntax non-terminal
+    $text =~ s/\]/\&#93;/g;   # syntax non-terminal
+
+    #ensure final line break
+    $text .= "\n" unless $text =~ /\n$/;
+
+    return $text;
+}
+
+sub load_prefixes
+{
+    # Fill the hash referenced by $PREFIX_REF with the non-breaking prefixes
+    # (abbreviations) for $language, read from
+    # "$mydir/nonbreaking_prefix.$language".
+    #
+    # Each non-empty, non-comment line is one prefix.  A prefix followed by
+    # whitespace and "#NUMERIC_ONLY#" is stored with value 2, every other
+    # prefix with value 1.
+    #
+    # Falls back (with a warning on STDERR) to the English file when no file
+    # exists for $language; dies when neither file exists or when the chosen
+    # file cannot be opened.
+    my ($language, $PREFIX_REF) = @_;
+
+    my $prefixfile = "$mydir/nonbreaking_prefix.$language";
+
+    #default back to English if we don't have a language-specific prefix file
+    if (!(-e $prefixfile))
+    {
+        $prefixfile = "$mydir/nonbreaking_prefix.en";
+        print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
+        die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile);
+    }
+
+    # The file is known to exist at this point, so a failing open (e.g. missing
+    # read permission) is a real error: report it instead of silently
+    # tokenizing with an empty abbreviation table.
+    open(my $prefix_fh, "<:utf8", $prefixfile)
+        or die ("ERROR: Cannot open abbreviations file $prefixfile: $!\n");
+    while (my $item = <$prefix_fh>)
+    {
+        chomp($item);
+        if (($item) && (substr($item,0,1) ne "#"))
+        {
+            if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/)
+            {
+                $PREFIX_REF->{$1} = 2;
+            }
+            else
+            {
+                $PREFIX_REF->{$item} = 1;
+            }
+        }
+    }
+    close($prefix_fh);
+}
--- a/egs/wmt20/mt/local/utils.sh
+++ b/egs/wmt20/mt/local/utils.sh
+
+# Pick idle GPUs by parsing the output of `gpustat`.
+# A GPU counts as idle when the number parsed from the third '|'-separated
+# column of its line (the text before the '/') is below 100.
+# Arguments:
+#   $1 - number of GPUs required
+#   $2 - 1: give up and report "-1" when too few GPUs are idle;
+#        otherwise: poll again every 60 seconds until enough are idle
+# Globals:
+#   gpu_num, use_cpu and device are assigned in the calling shell, as before
+# Outputs:
+#   comma-separated device ids (or -1) on stdout
+# Returns:
+#   non-zero when gpustat cannot be run or no temp file can be created
+get_devices(){
+    gpu_num=$1
+    use_cpu=$2
+    device=()
+    local record all_devices count dev line use
+    while :
+    do
+        record=$(mktemp -t temp.record.XXXXXX) || return 1
+        if ! gpustat > "${record}"; then
+            # without this check an empty record would be read as "GPU 0 idle"
+            echo "Error: failed to run gpustat." >&2
+            rm -f "${record}"
+            return 1
+        fi
+        # assumes gpustat prints one header line followed by one line per
+        # GPU, so dropping two lines leaves (number of GPUs - 1) lines,
+        # i.e. the highest device id
+        all_devices=$(seq 0 "$(sed '1,2d' "${record}" | wc -l)")
+
+        device=()
+        count=0
+        for dev in ${all_devices}
+        do
+            line=$((dev + 2))
+            use=$(head -n "${line}" "${record}" | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
+            # skip lines whose usage field is not a plain number instead of
+            # treating them as an idle device
+            [[ ${use} =~ ^[[:space:]]*[0-9]+[[:space:]]*$ ]] || continue
+            if [[ $use -lt 100 ]]; then
+                device[$count]=$dev
+                count=$((count + 1))
+                if [[ $count -eq $gpu_num ]]; then
+                    break
+                fi
+            fi
+        done
+        # the snapshot is no longer needed; remove it so polling does not
+        # leave one temp file behind per iteration
+        rm -f "${record}"
+
+        if [[ ${#device[@]} -lt $gpu_num ]]; then
+            if [[ $use_cpu -eq 1 ]]; then
+                # CPU fallback: stop polling (previously this kept looping
+                # without sleeping whenever more than one GPU was requested)
+                device=(-1)
+                break
+            else
+                sleep 60s
+            fi
+        else
+            break
+        fi
+    done
+
+    echo "${device[*]}" | sed 's/ /,/g'
+    return $?
+}
+
+
--- a/egs/wmt20/mt/local/wmt_en2de_multi_bleu.sh
+++ b/egs/wmt20/mt/local/wmt_en2de_multi_bleu.sh
+#! /bin/bash
+
+# calculate wmt14 en-de multi-bleu score
+
+if [ $# -ne 1 ]; then
+    echo "usage: $0 GENERATE_PY_OUTPUT"
+    exit 1
+fi
+echo -e "\n RUN >> "$0
+
+requirement_scripts=(detokenizer.perl replace-unicode-punctuation.perl tokenizer.perl multi-bleu.perl)
+for script in ${requirement_scripts[@]}; do
+    if ! which ${script} > /dev/null; then
+        echo "Error: it seems that moses is not installed or exported int the environment variables." >&2
+        return 1
+    fi
+done
+
+detokenizer=detokenizer.perl
+replace_unicode_punctuation=replace-unicode-punctuation.perl
+tokenizer=tokenizer.perl
+multi_bleu=multi-bleu.perl
+
+GEN=$1
+SYS=$GEN.sys
+REF=$GEN.ref
+
+cat $GEN | cut -f 3 > $REF
+cat $GEN | cut -f 4 > $SYS
+
+#detokenize the decodes file to format the manner to do tokenize
+$detokenizer -l de < $SYS > $SYS.dtk
+$detokenizer -l de < $REF > $REF.dtk
+
+#replace unicode
+$replace_unicode_punctuation -l de < $SYS.dtk > $SYS.dtk.punc
+$replace_unicode_punctuation -l de < $REF.dtk > $REF.dtk.punc
+
+#tokenize the decodes file by moses tokenizer.perl
+$tokenizer -l de < $SYS.dtk.punc > $SYS.dtk.punc.tok
+$tokenizer -l de < $REF.dtk.punc > $REF.dtk.punc.tok
+
+#"rich-text format" --> rich ##AT##-##AT## text format.
+perl -ple 's{(\S)-(\S)}{$1 ##AT##-##AT## $2}g' < $SYS.dtk.punc.tok > $SYS.dtk.punc.tok.atat
+perl -ple 's{(\S)-(\S)}{$1 ##AT##-##AT## $2}g' < $REF.dtk.punc.tok > $REF.dtk.punc.tok.atat
+
+$multi_bleu $REF.dtk.punc.tok.atat < $SYS.dtk.punc.tok.atat
+
+rm -f $SYS.dtk $SYS.dtk.punc $SYS.dtk.punc.tok $REF.dtk $REF.dtk.punc $REF.dtk.punc.tok
\ No newline at end of file
--- a/egs/wmt20/mt/run.sh
+++ b/egs/wmt20/mt/run.sh
+#! /bin/bash
+
+# Machine translation recipe for the WMT20 En-Zh data:
+# stage 0 prepares the data, stage 1 trains, stage 2 decodes.
+
+# Copyright 2021 Natural Language Processing Laboratory
+# Xu Chen (xuchenneu@163.com)
+
+# Exit on the first failing command (-e) and when any command of a pipeline
+# fails (pipefail). The -u check (undefined variable) is commented out below
+# and -x (print commands) is not enabled.
+set -e
+#set -u
+set -o pipefail
+export PYTHONIOENCODING=UTF-8
+
+# eval=1: run the assembled commands; any other value only prints them
+eval=1
+time=$(date "+%m%d")
+
+# only stages with stage <= N <= stop_stage are executed
+stage=0
+stop_stage=0
+
+######## hardware ########
+# devices
+# (an empty device list makes stages 1 and 2 ask get_devices for GPUs)
+device=()
+gpu_num=8
+update_freq=1
+
+root_dir=~/st
+code_dir=${root_dir}/Fairseq-S2T
+pwd_dir=$PWD
+
+# dataset
+src_lang=en
+tgt_lang=zh
+lang=${src_lang}-${tgt_lang}
+
+dataset=wmt20
+task=translation
+vocab_type=unigram
+vocab_size=32000
+share_dict=0
+# lcrm=1: the source text is passed through local/lower_rm.py in stage 0
+lcrm=1
+# tokenizer=1: stage 0 reads the *.tok.* text files
+tokenizer=1
+
+# use_specific_dict=1: copy an existing vocabulary from specific_dir instead
+# of learning a new one
+use_specific_dict=0
+subword=0
+specific_prefix=subword32000_share
+specific_dir=${root_dir}/data/mustc/st
+src_vocab_prefix=spm_unigram10000_st_share
+tgt_vocab_prefix=spm_unigram10000_st_share
+
+org_data_dir=${root_dir}/data/${dataset}
+data_dir=${root_dir}/data/${dataset}/mt
+train_subset=train
+valid_subset=dev
+# trans_subset is binarized as the test split in stage 0;
+# test_subset (comma-separated) lists the subsets decoded in stage 2
+trans_subset=tst-COMMON
+test_subset=test
+
+# exp
+exp_prefix=${time}
+extra_tag=
+extra_parameter=
+exp_tag=baseline
+exp_name=
+
+# config
+train_config=base_s
+
+# training setting
+fp16=1
+max_tokens=4096
+step_valid=0
+bleu_valid=0
+
+# decoding setting
+sacrebleu=0
+dec_model=checkpoint_best.pt
+n_average=10
+beam_size=5
+len_penalty=1.0
+
+# Derive the data directory, vocabulary prefixes and experiment prefix from
+# the settings above.
+# NOTE(review): this runs before local/parse_options.sh is sourced, so
+# presumably command-line overrides of these settings do not change the
+# derived values - confirm against parse_options.sh.
+if [[ ${use_specific_dict} -eq 1 ]]; then
+    exp_prefix=${exp_prefix}_${specific_prefix}
+    data_dir=${data_dir}/${specific_prefix}
+    mkdir -p ${data_dir}
+else
+    if [[ "${vocab_type}" == "char" ]]; then
+        vocab_name=${vocab_type}
+        exp_prefix=${exp_prefix}_${vocab_type}
+    else
+        vocab_name=${vocab_type}${vocab_size}
+    fi
+    data_dir=${data_dir}/${vocab_name}
+    src_vocab_prefix=spm_${vocab_name}_${src_lang}
+    tgt_vocab_prefix=spm_${vocab_name}_${tgt_lang}
+    if [[ $share_dict -eq 1 ]]; then
+        data_dir=${data_dir}_share
+        src_vocab_prefix=spm_${vocab_name}_share
+        tgt_vocab_prefix=spm_${vocab_name}_share
+    fi
+fi
+if [[ ${lcrm} -eq 1 ]]; then
+    data_dir=${data_dir}_lcrm
+    exp_prefix=${exp_prefix}_lcrm
+fi
+if [[ ${tokenizer} -eq 1 ]]; then
+    data_dir=${data_dir}_tok
+    exp_prefix=${exp_prefix}_tok
+fi
+
+. ./local/parse_options.sh || exit 1;
+
+# full path
+# (unless exp_name is given, it is built from the prefix, the config names
+# joined with "_", the experiment tag and the optional extra tag)
+if [[ -z ${exp_name} ]]; then
+    config_string=${train_config//,/_}
+    exp_name=${exp_prefix}_${config_string}_${exp_tag}
+    if [[ -n ${extra_tag} ]]; then
+        exp_name=${exp_name}_${extra_tag}
+    fi
+fi
+model_dir=${root_dir}/checkpoints/${dataset}/mt/${exp_name}
+
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+    echo "stage -1: Data Download"
+    # pass
+fi
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    ### Task dependent. You have to make data the following preparation part by yourself.
+    echo "stage 0: MT Data Preparation"
+    if [[ ! -e ${data_dir} ]]; then
+        mkdir -p ${data_dir}
+    fi
+
+    if [[ ! -f ${data_dir}/${src_vocab_prefix}.txt || ! -f ${data_dir}/${tgt_vocab_prefix}.txt ]]; then
+        if [[ ${use_specific_dict} -eq 0 ]]; then
+            cmd="python ${code_dir}/examples/speech_to_text/prep_mt_data.py
+                --data-root ${org_data_dir}
+                --output-root ${data_dir}
+                --splits ${train_subset},${valid_subset},${trans_subset}
+                --src-lang ${src_lang}
+                --tgt-lang ${tgt_lang}
+                --vocab-type ${vocab_type}
+                --vocab-size ${vocab_size}"
+            if [[ $share_dict -eq 1 ]]; then
+                cmd="$cmd
+                --share"
+            fi
+            if [[ ${tokenizer} -eq 1 ]]; then
+                cmd="$cmd
+                --tokenizer"
+            fi
+
+            echo -e "\033[34mRun command: \n${cmd} \033[0m"
+            [[ $eval -eq 1 ]] && eval ${cmd}
+        else
+            cp -r ${specific_dir}/${src_vocab_prefix}.* ${data_dir}
+            cp ${specific_dir}/${tgt_vocab_prefix}.* ${data_dir}
+        fi
+    fi
+
+    mkdir -p ${data_dir}/data
+    for split in ${train_subset} ${valid_subset} ${trans_subset}; do
+    {
+        if [[ -d ${org_data_dir}/data/${split}/txt ]]; then
+            text_dir=${org_data_dir}/data/${split}/txt
+        else
+            text_dir=${org_data_dir}/data/${split}
+        fi
+        src_text=${text_dir}/${split}.${src_lang}
+        tgt_text=${text_dir}/${split}.${tgt_lang}
+        if [[ ${tokenizer} -eq 1 ]]; then
+            src_text=${text_dir}/${split}.tok.${src_lang}
+            tgt_text=${text_dir}/${split}.tok.${tgt_lang}
+        fi
+        cmd="cat ${src_text}"
+        if [[ ${lcrm} -eq 1 ]]; then
+            cmd="python local/lower_rm.py ${src_text}"
+        fi
+        cmd="${cmd}
+        | spm_encode --model ${data_dir}/${src_vocab_prefix}.model
+        --output_format=piece
+        > ${data_dir}/data/${split}.${src_lang}"
+
+        echo -e "\033[34mRun command: \n${cmd} \033[0m"
+        [[ $eval -eq 1 ]] && eval ${cmd}
+
+        cmd="spm_encode
+        --model ${data_dir}/${tgt_vocab_prefix}.model
+        --output_format=piece
+        < ${tgt_text}
+        > ${data_dir}/data/${split}.${tgt_lang}"
+
+        echo -e "\033[34mRun command: \n${cmd} \033[0m"
+        [[ $eval -eq 1 ]] && eval ${cmd}
+    }&
+    done
+    wait
+
+    cmd="python ${code_dir}/fairseq_cli/preprocess.py
+        --source-lang ${src_lang} --target-lang ${tgt_lang}
+        --trainpref ${data_dir}/data/${train_subset}
+        --validpref ${data_dir}/data/${valid_subset}
+        --testpref ${data_dir}/data/${trans_subset}
+        --destdir ${data_dir}/data-bin
+        --srcdict ${data_dir}/${src_vocab_prefix}.txt
+        --tgtdict ${data_dir}/${tgt_vocab_prefix}.txt
+        --workers 64"
+
+    echo -e "\033[34mRun command: \n${cmd} \033[0m"
+    [[ $eval -eq 1 ]] && eval ${cmd}
+fi
+
+data_dir=${data_dir}/data-bin
+
+# Stage 1: assemble the fairseq training command, start it in the background
+# with nohup and follow its log.
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    echo "stage 1: MT Network Training"
+    [[ ! -d ${data_dir} ]] && echo "The data dir ${data_dir} is not existing!" && exit 1;
+
+    # no device given: use none when gpu_num is 0, otherwise ask get_devices
+    if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
+        if [[ ${gpu_num} -eq 0 ]]; then
+            device=""
+        else
+            source ./local/utils.sh
+            device=$(get_devices $gpu_num 0)
+        fi
+    fi
+
+    echo -e "dev=${device} data=${data_dir} model=${model_dir}"
+
+    if [[ ! -d ${model_dir} ]]; then
+        mkdir -p ${model_dir}
+    else
+        echo "${model_dir} exists."
+    fi
+
+    # keep a copy of the scripts and configs next to the checkpoints
+    cp ${BASH_SOURCE[0]} ${model_dir}
+    cp ${PWD}/train.sh ${model_dir}
+
+    extra_parameter="${extra_parameter}
+        --train-config ${pwd_dir}/conf/basis.yaml"
+    cp ${pwd_dir}/conf/basis.yaml ${model_dir}
+    config_list="${train_config//,/ }"
+    idx=1
+    for config in ${config_list[@]}
+    do
+        config_path=${pwd_dir}/conf/${config}.yaml
+        if [[ ! -f ${config_path} ]]; then
+            echo "No config file ${config_path}"
+            # a bare exit would report success (the status of echo)
+            exit 1
+        fi
+        cp ${config_path} ${model_dir}
+
+        extra_parameter="${extra_parameter}
+        --train-config${idx} ${config_path}"
+        idx=$((idx + 1))
+    done
+
+    cmd="python3 -u ${code_dir}/fairseq_cli/train.py
+        ${data_dir}
+        --source-lang ${src_lang}
+        --target-lang ${tgt_lang}
+        --task ${task}
+        --max-tokens ${max_tokens}
+        --skip-invalid-size-inputs-valid-test
+        --update-freq ${update_freq}
+        --log-interval 100
+        --save-dir ${model_dir}
+        --tensorboard-logdir ${model_dir}"
+
+    if [[ -n ${extra_parameter} ]]; then
+        cmd="${cmd}
+        ${extra_parameter}"
+    fi
+    if [[ ${gpu_num} -gt 0 ]]; then
+        cmd="${cmd}
+        --distributed-world-size $gpu_num
+        --ddp-backend no_c10d"
+    fi
+    if [[ $fp16 -eq 1 ]]; then
+        cmd="${cmd}
+        --fp16"
+    fi
+    # step_valid=1 switches to validation and checkpointing by update count
+    if [[ $step_valid -eq 1 ]]; then
+        validate_interval=1
+        save_interval=1
+        no_epoch_checkpoints=0
+        save_interval_updates=500
+        keep_interval_updates=10
+    fi
+    if [[ $bleu_valid -eq 1 ]]; then
+        cmd="$cmd
+        --eval-bleu
+        --eval-bleu-args '{\"beam\": 1}'
+        --eval-tokenized-bleu
+        --eval-bleu-remove-bpe
+        --best-checkpoint-metric bleu
+        --maximize-best-checkpoint-metric"
+    fi
+    if [[ -n $no_epoch_checkpoints && $no_epoch_checkpoints -eq 1 ]]; then
+        cmd="$cmd
+        --no-epoch-checkpoints"
+    fi
+    if [[ -n $validate_interval ]]; then
+        cmd="${cmd}
+        --validate-interval $validate_interval "
+    fi
+    if [[ -n $save_interval ]]; then
+        cmd="${cmd}
+        --save-interval $save_interval "
+    fi
+    if [[ -n $save_interval_updates ]]; then
+        cmd="${cmd}
+        --save-interval-updates $save_interval_updates"
+        if [[ -n $keep_interval_updates ]]; then
+        cmd="${cmd}
+        --keep-interval-updates $keep_interval_updates"
+        fi
+    fi
+
+    echo -e "\033[34mRun command: \n${cmd} \033[0m"
+
+    # save info
+    # (history.log keeps only the 50 most recent runs)
+    log=./history.log
+    echo "${time} | ${device} | ${data_dir} | ${exp_name} | ${model_dir} " >> $log
+    tail -n 50 ${log} > tmp.log
+    mv tmp.log $log
+    export CUDA_VISIBLE_DEVICES=${device}
+
+    log=${model_dir}/train.log
+
+    cmd="nohup ${cmd} >> ${log} 2>&1 &"
+    if [[ $eval -eq 1 ]]; then
+        eval $cmd
+        sleep 2s
+        # follow the training log from its current end
+        tail -n "$(wc -l ${log} | awk '{print $1+1}')" -f ${log}
+    fi
+    wait
+    echo -e " >> finish training \n"
+fi
+
+# Stage 2: optionally average the best checkpoints, decode every subset in
+# test_subset and collect the last line of each generate log.
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    echo "stage 2: MT Decoding"
+    if [[ ${n_average} -ne 1 ]]; then
+        # Average models
+        dec_model=avg_${n_average}_checkpoint.pt
+
+        # reuse an averaged checkpoint that already exists
+        if [[ ! -f ${model_dir}/${dec_model} ]]; then
+            cmd="python ${code_dir}/scripts/average_checkpoints.py
+            --inputs ${model_dir}
+            --num-best-checkpoints ${n_average}
+            --output ${model_dir}/${dec_model}"
+            echo -e "\033[34mRun command: \n${cmd} \033[0m"
+            [[ $eval -eq 1 ]] && eval $cmd
+        fi
+    fi
+
+    # no device given: use none when gpu_num is 0, otherwise ask get_devices
+    if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
+        if [[ ${gpu_num} -eq 0 ]]; then
+            device=""
+        else
+            source ./local/utils.sh
+            device=$(get_devices $gpu_num 0)
+        fi
+    fi
+    export CUDA_VISIBLE_DEVICES=${device}
+
+    # start from an empty result file
+    result_file=${model_dir}/decode_result
+    rm -f ${result_file}
+
+    test_subset=(${test_subset//,/ })
+    for subset in ${test_subset[@]}; do
+        cmd="python ${code_dir}/fairseq_cli/generate.py
+        ${data_dir}
+        --source-lang ${src_lang}
+        --target-lang ${tgt_lang}
+        --gen-subset ${subset}
+        --task ${task}
+        --path ${model_dir}/${dec_model}
+        --results-path ${model_dir}
+        --max-tokens ${max_tokens}
+        --beam ${beam_size}
+        --lenpen ${len_penalty}"
+
+        if [[ ${subword} -eq 1 ]]; then
+            cmd="${cmd}
+        --post-process subword_nmt"
+        else
+            cmd="${cmd}
+        --post-process sentencepiece"
+        fi
+
+        if [[ ${sacrebleu} -eq 1 ]]; then
+            cmd="${cmd}
+        --scoring sacrebleu"
+            if [[ ${tokenizer} -eq 1 ]]; then
+                cmd="${cmd}
+        --tokenizer moses
+        --moses-source-lang ${src_lang}
+        --moses-target-lang ${tgt_lang}"
+            fi
+        fi
+
+        echo -e "\033[34mRun command: \n${cmd} \033[0m"
+
+        if [[ $eval -eq 1 ]]; then
+            eval $cmd
+            tail -n 1 ${model_dir}/generate-${subset}.txt >> ${result_file}
+        fi
+    done
+    # in a dry run (eval != 1) nothing is written, so there is no file to show
+    if [[ -f ${result_file} ]]; then
+        cat ${result_file}
+    fi
+fi
--- a/egs/wmt20/mt/train.sh
+++ b/egs/wmt20/mt/train.sh
+#! /bin/bash
+
+# Training launcher: builds the ./run.sh command line for stage 1 from the
+# settings below, prints it and executes it.
+
+# settings forwarded to run.sh
+gpu_num=8
+update_freq=2
+max_tokens=8192
+
+exp_tag=baseline
+#config_list=(base)
+config_list=(deep)
+
+# exp full name
+exp_name=
+
+extra_tag=
+extra_parameter=
+#extra_tag="${extra_tag}"
+#extra_parameter="${extra_parameter} "
+
+# join the config names with commas
+joined_configs="${config_list[*]}"
+train_config=${joined_configs// /,}
+
+# mandatory options
+cmd="./run.sh --stage 1 --stop_stage 1"
+cmd="${cmd} --gpu_num ${gpu_num} --update_freq ${update_freq}"
+cmd="${cmd} --train_config ${train_config} --max_tokens ${max_tokens}"
+
+# optional options, appended only when set
+[[ -z ${exp_name} ]] || cmd="${cmd} --exp_name ${exp_name}"
+[[ -z ${exp_tag} ]] || cmd="${cmd} --exp_tag ${exp_tag}"
+[[ -z ${extra_tag} ]] || cmd="${cmd} --extra_tag ${extra_tag}"
+[[ -z ${extra_parameter} ]] || cmd="${cmd} --extra_parameter \"${extra_parameter}\""
+
+echo ${cmd}
+eval ${cmd}
--- a/examples/speech_to_text/prep_audio_data.py
+++ b/examples/speech_to_text/prep_audio_data.py
@@ -112,7 +112,7 @@ class AudioDataset(Dataset):
        if self.mode == "easy":
            real_idx = 0
            for idx, v in segments.items():
-                audio_name = v["audio"]
+                audio_name = f"{split}_{v['audio']}"
                v["audio"] = (wav_root / v["audio"].strip()).as_posix() + ".wav"
                if self.speed_perturb is not None:
                    for perturb in self.speed_perturb:
@@ -137,8 +137,8 @@ class AudioDataset(Dataset):
                for i, segment in enumerate(seg_group):
                    offset = int(float(segment["offset"]) * sample_rate)
                    n_frames = int(float(segment["duration"]) * sample_rate)
-                    # _id = f"{split}_{wav_path.stem}_{i}"
-                    _id = f"{wav_path.stem}_{i}"
+                    _id = f"{split}_{wav_path.stem}_{i}"
+                    # _id = f"{wav_path.stem}_{i}"

                    item = dict()
                    item["audio"] = wav_path.as_posix()
@@ -237,7 +237,7 @@ def process(args):
        if not Path.exists(zip_path) or args.overwrite:
            gen_feature_flag = True

-    if True and gen_feature_flag:
+    if gen_feature_flag:
        if args.speed_perturb:
            feature_root = output_root / "fbank80_sp"
        else:
@@ -264,12 +264,8 @@ def process(args):

                utt_id = item['id']
                features_path = (feature_root / f"{utt_id}.npy").as_posix()
-                tag_features_path = (feature_root / f"{split}_{utt_id}.npy").as_posix()

-                if os.path.exists(tag_features_path):
-                    continue
-                if os.path.exists(features_path) and not os.path.exists(tag_features_path):
-                    shutil.move(features_path, tag_features_path)
+                if os.path.exists(features_path):
                    continue

                waveform, sample_rate, _ = dataset.get(idx, need_waveform=True)

--- a/examples/speech_to_text/prep_mt_data.py
+++ b/examples/speech_to_text/prep_mt_data.py
@@ -96,16 +96,19 @@ def process(args):
            tgt_train_text.extend(manifest["tgt_text"])

    # Generate vocab and yaml
-    v_size_str = "" if args.vocab_type == "char" else str(args.vocab_size)
-    spm_filename_prefix = f"spm_{args.vocab_type}{v_size_str}"
+    tgt_v_size_str = "" if args.tgt_vocab_type == "char" else str(args.tgt_vocab_size)
+    tgt_spm_filename_prefix = f"spm_{args.tgt_vocab_type}{tgt_v_size_str}"

    if args.share:
        tgt_train_text.extend(src_train_text)
-        src_spm_filename_prefix = spm_filename_prefix + "_share"
-        tgt_spm_filename_prefix = src_spm_filename_prefix
+        tgt_spm_filename_prefix = tgt_spm_filename_prefix + "_share"
+        src_spm_filename_prefix = tgt_spm_filename_prefix
    else:
-        src_spm_filename_prefix = spm_filename_prefix + "_" + src_lang
-        tgt_spm_filename_prefix = spm_filename_prefix + "_" + tgt_lang
+        src_v_size_str = "" if args.src_vocab_type == "char" else str(args.src_vocab_size)
+        src_spm_filename_prefix = f"spm_{args.src_vocab_type}{src_v_size_str}"
+
+        src_spm_filename_prefix = src_spm_filename_prefix + "_" + src_lang
+        tgt_spm_filename_prefix = tgt_spm_filename_prefix + "_" + tgt_lang

    with NamedTemporaryFile(mode="w") as f:
        for t in tgt_train_text:
@@ -113,8 +116,8 @@ def process(args):
        gen_vocab(
            Path(f.name),
            output_root / tgt_spm_filename_prefix,
-            args.vocab_type,
-            args.vocab_size,
+            args.tgt_vocab_type,
+            args.tgt_vocab_size,
            normalization_rule_name="identity" if tgt_lang == "zh" else None
        )

@@ -125,8 +128,8 @@ def process(args):
            gen_vocab(
                Path(f.name),
                output_root / src_spm_filename_prefix,
-                args.vocab_type,
-                args.vocab_size,
+                args.src_vocab_type,
+                args.src_vocab_size,
                normalization_rule_name="identity" if tgt_lang == "zh" else None
            )

@@ -135,7 +138,7 @@ def process(args):
    if args.share:
        yaml_filename = f"config_share.yaml"

-    conf = {}
+    conf = dict()
    conf["src_vocab_filename"] = src_spm_filename_prefix + ".txt"
    conf["tgt_vocab_filename"] = tgt_spm_filename_prefix + ".txt"
    conf["src_bpe_tokenizer"] = {
@@ -157,13 +160,21 @@ def main():
    parser.add_argument("--data-root", "-d", required=True, type=str)
    parser.add_argument("--output-root", "-o", default=None, type=str)
    parser.add_argument(
-        "--vocab-type",
+        "--src-vocab-type",
+        default="unigram",
+        required=True,
+        type=str,
+        choices=["bpe", "unigram", "char"],
+    )
+    parser.add_argument(
+        "--tgt-vocab-type",
        default="unigram",
        required=True,
        type=str,
        choices=["bpe", "unigram", "char"],
-    ),
-    parser.add_argument("--vocab-size", default=10000, type=int)
+    )
+    parser.add_argument("--src-vocab-size", default=10000, type=int)
+    parser.add_argument("--tgt-vocab-size", default=10000, type=int)
    parser.add_argument("--size", default=-1, type=int)
    parser.add_argument("--splits", default="train,dev,test", type=str)
    parser.add_argument("--lowercase-src", action="store_true", help="lowercase the source text")

--- a/fairseq/checkpoint_utils.py
+++ b/fairseq/checkpoint_utils.py
@@ -704,6 +704,8 @@ def load_pretrained_component_from_model(
        if key.startswith(component_type):
            # encoder.input_layers.0.0.weight --> input_layers.0.0.weight
            component_subkey = key[len(component_type) + 1:]
+            if component_subkey.startswith(component_type):
+                component_subkey = component_subkey[len(component_type) + 1:]
            component_state_dict[component_subkey] = state["model"][key]

    mismatch_keys = []

--- a/fairseq/models/speech_to_text/modules/adapter.py
+++ b/fairseq/models/speech_to_text/modules/adapter.py
@@ -91,7 +91,7 @@ class Adapter(nn.Module):
            logger.info("CTC Compress Strategy: %s" % strategy)
        elif self.adapter_type == "league":
            self.distribution_cutoff = strategy
-            if self.distribution_cutoff != -1:
+            if self.distribution_cutoff is not None:
                logger.info("Distribution cutoff: %d" % int(strategy))

    def forward(self, x, padding):
@@ -112,7 +112,7 @@ class Adapter(nn.Module):

        elif self.adapter_type == "league":
            linear_out = self.linear_adapter(representation)
-            if self.distribution_cutoff != -1:
+            if self.distribution_cutoff is not None:
                cutoff = min(int(self.distribution_cutoff), distribution.size(-1) - 1)
                threshold = distribution.sort(dim=-1, descending=True)[0][:, :, cutoff:cutoff+1]
                distribution = torch.where(distribution > threshold, distribution, torch.zeros_like(distribution))

--- a/fairseq/models/speech_to_text/pdss2t_transformer.py
+++ b/fairseq/models/speech_to_text/pdss2t_transformer.py
@@ -192,9 +192,34 @@ class PDSS2TTransformerModel(S2TTransformerModel):
                "rel_pos",
                "rope",
                "abs",
+                "transfer",
            ],
            help="transformer encoder self-attention layer type"
        )
+        # transfer
+        parser.add_argument(
+            "--relative-pos-enc",
+            action="store_true",
+            help="use relative position encoding for attention",
+        )
+        parser.add_argument(
+            "--linear-att",
+            action="store_true",
+            help="use linear attention",
+        )
+
+        # reduced attention
+        parser.add_argument(
+            "--attention-reduced-method",
+            type=str,
+            default="conv",
+            help="reduction method for attention",
+        )
+        parser.add_argument(
+            "--attention-reduced-q",
+            action="store_true",
+            help="use reduction for query or not"
+        )
        parser.add_argument(
            "--encoder-attention-heads",
            type=int,
@@ -450,9 +475,9 @@ class PDSS2TTransformerModel(S2TTransformerModel):
            help="the number of the attention heads in each stage",
        )
        parser.add_argument(
-            "--pds-attn-ds-ratio",
+            "--pds-attn-ds-ratios",
            type=str,
-            help="the ratio of the down-sampling in the self attention module",
+            help="the ratios of the down-sampling in the self attention module",
        )
        parser.add_argument(
            "--pds-ffn-ratios",
@@ -495,7 +520,7 @@ class PDSS2TTransformerModel(S2TTransformerModel):
        )
        parser.add_argument(
            "--intermedia-distribution-cutoff",
-            default=-1,
+            default=None,
            type=int,
            help="cutoff of the distribution",
        )
@@ -641,7 +666,7 @@ class PDSS2TTransformerEncoder(FairseqEncoder):
                pos_embed = None

            stage = nn.ModuleList([
-                PDSTransformerEncoderLayer(args, embed_dim, embed_dim * ffn_ratio, num_head, attn_ds_ratio)
+                PDSTransformerEncoderLayer(args, embed_dim, ffn_ratio, num_head, attn_ds_ratio)
                for _ in range(num_layers)])

            # representation fusion
@@ -735,9 +760,12 @@ class PDSS2TTransformerEncoder(FairseqEncoder):
                       (("ctc" in getattr(args, "criterion", "")) and
                        (getattr(args, "ctc_weight", False) > 0))
        if self.use_ctc:
-            self.ctc_layer = (args.ctc_layer + args.encoder_layers) % args.encoder_layers
-            self.ctc_layer = args.encoder_layers if self.ctc_layer == 0 else self.ctc_layer
-            self.inter_ctc = True if self.ctc_layer != args.encoder_layers or self.fusion_stages_num != 0 else False
+            # self.ctc_layer = (args.ctc_layer + args.encoder_layers) % args.encoder_layers
+            # self.ctc_layer = args.encoder_layers if self.ctc_layer == 0 else self.ctc_layer
+            # self.inter_ctc = True if self.ctc_layer != args.encoder_layers or self.fusion_stages_num != 0 else False
+
+            self.ctc_layer = args.ctc_layer
+            self.inter_ctc = True if self.ctc_layer != 0 else False
            if self.inter_ctc:
                logger.info("Intermedia CTC loss in layer %d" % self.ctc_layer)

@@ -1027,7 +1055,7 @@ def base_architecture(args):
    # intermedia CTC
    args.pds_ctc = getattr(args, "pds_ctc", "0_0_0_0")
    args.intermedia_adapter = getattr(args, "intermedia_adapter", "none")
-    args.ctc_self_distill = getattr(args, "ctc_self_distill", False)
+    args.intermedia_drop_prob = getattr(args, "intermedia_drop_prob", 0)


 def set_pds_base_8(args):

--- a/fairseq/models/speech_to_text/s2t_ctc.py
+++ b/fairseq/models/speech_to_text/s2t_ctc.py
@@ -131,11 +131,34 @@ class S2TCTCModel(FairseqEncoderModel):
                "relative",
                "rel_pos",
                "rope",
-                "abs"
+                "abs",
+                "transfer",
            ],
            help="transformer encoder self-attention layer type"
        )
        parser.add_argument(
+            "--relative-pos-enc",
+            action="store_true",
+            help="use relative position encoding for attention",
+        )
+        parser.add_argument(
+            "--linear-att",
+            action="store_true",
+            help="use linear attention",
+        )
+
+        parser.add_argument(
+            "--attention-reduced-method",
+            type=str,
+            default="conv",
+            help="reduction method for attention",
+        )
+        parser.add_argument(
+            "--attention-reduced-q",
+            action="store_true",
+            help="use reduction for query or not",
+        )
+        parser.add_argument(
            "--encoder-attention-heads",
            type=int,
            metavar="N",
@@ -412,7 +435,7 @@ class S2TCTCModel(FairseqEncoderModel):
            help="the number of the attention heads in each stage",
        )
        parser.add_argument(
-            "--pds-attn-ds-ratio",
+            "--pds-attn-ds-ratios",
            type=str,
            help="the ratio of the down-sampling in the self attention module",
        )
@@ -457,7 +480,7 @@ class S2TCTCModel(FairseqEncoderModel):
        )
        parser.add_argument(
            "--intermedia-distribution-cutoff",
-            default=-1,
+            default=None,
            type=int,
            help="cutoff of the distribution",
        )
@@ -931,6 +954,26 @@ def base_architecture(args):
    args.cl_dropout_epoch = getattr(args, "cl_dropout_epoch", None)
    args.cl_dropout_strategy = getattr(args, "cl_dropout_strategy", "linear")

+    # PDS
+    args.pds_stages = getattr(args, "pds_stages", None)
+    args.pds_layers = getattr(args, "pds_layers", None)
+    args.pds_ratios = getattr(args, "pds_ratios", None)
+
+    args.pds_ds_method = getattr(args, "pds_ds_method", "conv")
+    args.pds_embed_dims = getattr(args, "pds_embed_dims", None)
+    args.pds_embed_norm = getattr(args, "pds_embed_norm", True)
+    args.pds_position_embed = getattr(args, "pds_position_embed", None)
+
+    args.pds_attn_heads = getattr(args, "pds_attn_heads", None)
+    args.pds_attn_ds_ratios = getattr(args, "pds_attn_ds_ratios", None)
+    args.pds_ffn_ratios = getattr(args, "pds_ffn_ratios", None)
+
+    args.ctc_layer = getattr(args, "ctc_layer", 0)
+    args.pds_dropout = getattr(args, "pds_dropout", args.dropout)
+
+    args.pds_fusion = getattr(args, "pds_fusion", False)
+    args.pds_fusion_method = getattr(args, "pds_fusion_method", "all_conv")
+
    # intermedia CTC
    args.intermedia_ctc_layers = getattr(args, "intermedia_ctc_layers", None)
    args.intermedia_adapter = getattr(args, "intermedia_adapter", None)

--- a/fairseq/models/speech_to_text/s2t_sate.py
+++ b/fairseq/models/speech_to_text/s2t_sate.py
@@ -5,13 +5,11 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from fairseq import checkpoint_utils
-from fairseq.data.data_utils import lengths_to_padding_mask
 from fairseq.models import (
    FairseqEncoder,
    register_model,
    register_model_architecture,
 )
-from fairseq.models.transformer import Embedding, TransformerDecoder
 from fairseq.models.speech_to_text import (
    S2TTransformerModel,
    S2TTransformerEncoder,
@@ -314,12 +312,12 @@ class S2TSATEEncoder(FairseqEncoder):
        if args.adapter == "shrink":
            strategy = getattr(args, "ctc_compress_strategy", "avg")
        elif args.adapter == "league":
-            strategy = getattr(args, "intermedia_distribution_cutoff", -1)
+            strategy = getattr(args, "intermedia_distribution_cutoff", None)

        self.adapter = Adapter(args.encoder_embed_dim,
                               args.adapter,
                               task.source_dictionary,
-                               embed_tokens,
+                               embed_tokens if task.source_dictionary == task.target_dictionary else None,
                               strategy=strategy)

        if args.share_ctc_and_adapter and hasattr(self.adapter, "embed_adapter"):

--- a/fairseq/models/speech_to_text/s2t_transformer.py
+++ b/fairseq/models/speech_to_text/s2t_transformer.py
@@ -385,7 +385,7 @@ class S2TTransformerModel(FairseqEncoderDecoderModel):
        )
        parser.add_argument(
            "--intermedia-distribution-cutoff",
-            default=-1,
+            default=None,
            type=int,
            help="cutoff of the distribution",
        )
@@ -581,7 +581,7 @@ class S2TTransformerEncoder(FairseqEncoder):
            if args.intermedia_adapter == "shrink":
                strategy = getattr(args, "ctc_compress_strategy", None)
            elif args.intermedia_adapter == "league":
-                strategy = getattr(args, "intermedia_distribution_cutoff", -1)
+                strategy = getattr(args, "intermedia_distribution_cutoff", None)
            self.adapter = Adapter(dim, args.intermedia_adapter,
                                   task.source_dictionary, strategy=strategy)
            self.intermedia_drop_prob = getattr(args, "intermedia_drop_prob", 0)

--- a/fairseq/modules/__init__.py
+++ b/fairseq/modules/__init__.py
@@ -8,6 +8,7 @@ from .squeeze_excitation import SEAttention
 from .activations import swish, Swish
 from .adaptive_input import AdaptiveInput
 from .adaptive_softmax import AdaptiveSoftmax
+from .attention import MultiHeadSelfAttentionModule
 from .beamable_mm import BeamableMM
 from .character_token_embedder import CharacterTokenEmbedder
 from .downsample_convolution import DownSampleConvolutionModule
@@ -91,6 +92,7 @@ __all__ = [
    "LinearizedConvolution",
    "LocalMultiheadAttention",
    "MultiheadAttention",
+    "MultiHeadSelfAttentionModule",
    "PositionalEmbedding",
    "PDSTransformerEncoderLayer",
    "ReducedMultiheadAttention",

--- a/fairseq/modules/activations.py
+++ b/fairseq/modules/activations.py
@@ -43,6 +43,7 @@ def get_activation_class(activation: str, dim=None):
    else:
        raise RuntimeError("activation function {} not supported".format(activation))

+
 def swish(x: torch.Tensor) -> torch.Tensor:
    return x * torch.sigmoid(x)


--- a/fairseq/modules/attention.py
+++ b/fairseq/modules/attention.py
@@ -281,7 +281,7 @@ class StridedMultiHeadAttention(MultiHeadAttention):
    """Strided Multi-Head Attention Layer

    Strided multi-head attention performs global sequence downsampling by striding
-    the attention query before aplying scaled dot-product attention. This results in
+    the attention query before applying scaled dot-product attention. This results in
    strided attention maps where query positions can attend to the entire sequence
    context to perform downsampling.

@@ -1321,7 +1321,7 @@ class MultiHeadSelfAttentionModule(nn.Module):
    Args:
        dim_model: model feature dimension
        num_heads: number of attention heads
-        Pdrop: residual dropout probability
+        dropout: residual dropout probability
        max_pos_encoding: maximum position
        relative_pos_enc: whether to use relative postion embedding
        causal: True for causal attention with masked future context
@@ -1335,14 +1335,14 @@ class MultiHeadSelfAttentionModule(nn.Module):
    def __init__(self,
                 dim_model,
                 num_heads,
-                 Pdrop,
+                 dropout,
                 max_pos_encoding,
-                 relative_pos_enc,
-                 causal,
-                 group_size,
-                 kernel_size,
-                 stride,
-                 linear_att):
+                 relative_pos_enc=False,
+                 causal=False,
+                 group_size=1,
+                 kernel_size=None,
+                 stride=1,
+                 linear_att=False):
        super(MultiHeadSelfAttentionModule, self).__init__()

        # Assert
@@ -1351,7 +1351,7 @@ class MultiHeadSelfAttentionModule(nn.Module):
        assert not (linear_att and relative_pos_enc), "Linear attention requires absolute positional encodings"

        # Pre Norm
-        self.norm = nn.LayerNorm(dim_model, eps=1e-6)
+        # self.norm = nn.LayerNorm(dim_model, eps=1e-6)

        # Multi-Head Linear Attention
        if linear_att:
@@ -1394,7 +1394,7 @@ class MultiHeadSelfAttentionModule(nn.Module):
                self.mhsa = MultiHeadAttention(dim_model, num_heads)

        # Dropout
-        self.dropout = nn.Dropout(Pdrop)
+        # self.dropout = nn.Dropout(Pdrop)

        # Module Params
        self.rel_pos_enc = relative_pos_enc
@@ -1402,8 +1402,9 @@ class MultiHeadSelfAttentionModule(nn.Module):

    def forward(self, x, mask=None, hidden=None):

-        # Pre Norm
-        x = self.norm(x)
+        x = x.transpose(0, 1)
+        if mask is not None:
+            mask = mask.view(mask.size(0), 1, 1, mask.size(-1))

        # Multi-Head Self-Attention
        if self.linear_att:
@@ -1414,6 +1415,7 @@ class MultiHeadSelfAttentionModule(nn.Module):
            x, attention = self.mhsa(x, x, x, mask)

        # Dropout
-        x = self.dropout(x)
+        # x = self.dropout(x)

-        return x, attention, hidden
+        x = x.transpose(0, 1)
+        return x, attention
--- a/fairseq/modules/convolution.py
+++ b/fairseq/modules/convolution.py
@@ -10,17 +10,18 @@ class ConvolutionModule(nn.Module):
    def __init__(
        self,
        embed_dim,
-        channels,
+        expand_embed_dim,
        depthwise_kernel_size,
        dropout,
        activation_fn="swish",
        bias=False,
+        stride=1,
        export=False,
    ):
        """
        Args:
            embed_dim: Embedding dimension
-            channels: Number of channels in depthwise conv layers
+            expand_embed_dim: Number of output embedding dimension
            depthwise_kernel_size: Depthwise conv layer kernel size
            dropout: dropout value
            activation_fn: Activation function to use after depthwise convolution kernel
@@ -33,7 +34,7 @@ class ConvolutionModule(nn.Module):
        ) % 2 == 0, "kernel_size should be a odd number for 'SAME' padding"
        self.pointwise_conv1 = torch.nn.Conv1d(
            embed_dim,
-            2 * channels,
+            2 * expand_embed_dim,
            kernel_size=1,
            stride=1,
            padding=0,
@@ -41,19 +42,19 @@ class ConvolutionModule(nn.Module):
        )
        self.glu = torch.nn.GLU(dim=1)
        self.depthwise_conv = torch.nn.Conv1d(
-            channels,
-            channels,
+            expand_embed_dim,
+            expand_embed_dim,
            depthwise_kernel_size,
-            stride=1,
+            stride=stride,
            padding=(depthwise_kernel_size - 1) // 2,
-            groups=channels,
+            groups=expand_embed_dim,
            bias=bias,
        )
-        self.batch_norm = nn.BatchNorm1d(channels)
+        self.batch_norm = nn.BatchNorm1d(expand_embed_dim)
        self.activation = get_activation_class(activation_fn)
        self.pointwise_conv2 = torch.nn.Conv1d(
-            channels,
-            embed_dim,
+            expand_embed_dim,
+            expand_embed_dim,
            kernel_size=1,
            stride=1,
            padding=0,
@@ -72,8 +73,8 @@ class ConvolutionModule(nn.Module):
        x = x.transpose(1, 2)

        # GLU mechanism
-        x = self.pointwise_conv1(x)  # (batch, 2*channel, dim)
-        x = self.glu(x)  # (batch, channel, dim)
+        x = self.pointwise_conv1(x)  # (batch, 2*expand_embed_dim, dim)
+        x = self.glu(x)  # (batch, expand_embed_dim, dim)

        # 1D Depthwise Conv
        x = self.depthwise_conv(x)
@@ -81,10 +82,13 @@ class ConvolutionModule(nn.Module):
        x = self.activation(x)

        x = self.pointwise_conv2(x)
+
+        x = x.transpose(1, 2)
        x = self.dropout(x)
-        return x.transpose(1, 2)

-#
+        return x
+
+
 # class ConvolutionModule(nn.Module):
 #     """ConvolutionModule in Conformer model."""
 #     def __init__(self,

--- a/fairseq/modules/pds_layer.py
+++ b/fairseq/modules/pds_layer.py
 from typing import Optional

 import torch
+from torch import Tensor
 import torch.nn as nn
+
 from fairseq.modules import (
    LayerNorm,
    MultiheadAttention,
@@ -14,10 +16,11 @@ from fairseq.modules import (
    LocalMultiheadAttention,
    ReducedMultiheadAttention,
    RotaryPositionMultiHeadedAttention,
+    MultiHeadSelfAttentionModule,
 )
 from fairseq.modules.s2t_transformer_layer import FeedForwardModule
 from fairseq.modules.fairseq_dropout import FairseqDropout
-from torch import Tensor
+from .utils import Transpose, Permute3D


 class PDSTransformerEncoderLayer(nn.Module):
@@ -35,29 +38,48 @@ class PDSTransformerEncoderLayer(nn.Module):
        args (argparse.Namespace): parsed command-line arguments
    """

-    def __init__(self, args, embed_dim, ffn_embed_dim, num_head, att_sample_ratio=1):
+    def __init__(self, args,
+                 embed_dim,
+                 ffn_ratio,
+                 num_head,
+                 attn_sample_ratio=1,
+                 attn_stride=1,
+                 conv_stride=1,
+                 expand_embed_dim=None):
        super().__init__()
        self.args = args

        embed_dim = embed_dim
-        ffn_dim = args.encoder_ffn_embed_dim
        dropout = args.dropout

-        self.quant_noise = getattr(args, 'quant_noise_pq', 0)
-        self.quant_noise_block_size = getattr(args, 'quant_noise_pq_block_size', 8) or 8
-        self.attn_type = getattr(args, "encoder_attention_type", "selfattn")
-        self.self_attn = self.build_self_attention(args, embed_dim, num_head, att_sample_ratio)
-        self.self_attn_layer_norm = LayerNorm(embed_dim)
+        if expand_embed_dim is None:
+            expand_embed_dim = embed_dim
+
        self.dropout_module = FairseqDropout(
            dropout, module_name=self.__class__.__name__
        )
+        self.quant_noise = getattr(args, 'quant_noise_pq', 0)
+        self.quant_noise_block_size = getattr(args, 'quant_noise_pq_block_size', 8) or 8
+
        self.normalize_before = args.encoder_normalize_before
        activation = getattr(args, 'encoder_activation_fn', 'relu')

+        # attention
+        self.attn_type = getattr(args, "encoder_attention_type", "selfattn")
+        self.self_attn = self.build_self_attention(args, embed_dim, num_head, attn_sample_ratio)
+        self.self_attn_layer_norm = LayerNorm(embed_dim)
+
+        # Attention Residual
+        self.attn_res = nn.Sequential(
+            Permute3D(1, 2, 0),
+            nn.MaxPool1d(kernel_size=1, stride=attn_stride),
+            Permute3D(2, 0, 1)
+        ) if attn_stride > 1 else nn.Identity()
+
        if args.macaron_style:
            self.macaron_ffn = FeedForwardModule(
                embed_dim,
-                ffn_dim,
+                embed_dim * ffn_ratio,
                dropout,
                dropout,
                activation
@@ -73,24 +95,37 @@ class PDSTransformerEncoderLayer(nn.Module):
            self.conv_norm = LayerNorm(embed_dim)
            self.conv_module = ConvolutionModule(
                embed_dim,
-                embed_dim,
+                expand_embed_dim,
                depthwise_kernel_size=args.cnn_module_kernel,
                dropout=args.dropout,
-                activation_fn=getattr(args, 'activation_fn', 'swish'))
-            self.final_norm = LayerNorm(embed_dim)
+                activation_fn=activation,
+                stride=conv_stride
+            )
+            self.final_norm = LayerNorm(expand_embed_dim)
+
+            # Convolution Residual
+            self.conv_res = nn.Sequential(
+                Permute3D(1, 2, 0),
+                nn.Conv1d(embed_dim, expand_embed_dim, kernel_size=1, stride=conv_stride),
+                Permute3D(2, 0, 1)
+            ) if embed_dim != expand_embed_dim else nn.Sequential(
+                Permute3D(1, 2, 0),
+                nn.MaxPool1d(kernel_size=1, stride=conv_stride),
+                Permute3D(2, 0, 1)
+            ) if conv_stride > 1 else nn.Identity()
        else:
            self.conv_norm = None
            self.conv_module = None
            self.final_norm = None

        self.ffn = FeedForwardModule(
-                embed_dim,
-                ffn_dim,
+                expand_embed_dim,
+                expand_embed_dim * ffn_ratio,
                dropout,
                dropout,
                activation
            )
-        self.ffn_norm = LayerNorm(embed_dim)
+        self.ffn_norm = LayerNorm(expand_embed_dim)

    def build_self_attention(self, args, embed_dim, num_head, sample_ratio=1):
        attention_heads = num_head
@@ -165,6 +200,17 @@ class PDSTransformerEncoderLayer(nn.Module):
                q_noise=self.quant_noise,
                qn_block_size=self.quant_noise_block_size,
                sample_ratio=sample_ratio,
+                reduced_method=getattr(args, "attention_reduced_method", "conv"),
+                reduced_q=getattr(args, "attention_reduced_q", False)
+            )
+        elif self.attn_type == "transfer":
+            return MultiHeadSelfAttentionModule(
+                embed_dim,
+                attention_heads,
+                dropout,
+                max_pos_encoding=args.max_source_positions,
+                relative_pos_enc=getattr(args, "relative_pos_enc", False),
+                linear_att=getattr(args, "linear_att", False),
            )
        else:
            print("The encoder attention type %s is not supported!" % self.attn_type)
@@ -248,6 +294,10 @@ class PDSTransformerEncoderLayer(nn.Module):
                attn_mask=attn_mask,
                pos_emb=pos_emb
            )
+        elif self.attn_type == "transfer":
+            x, _ = self.self_attn(
+                x, encoder_padding_mask
+            )
        else:
            x, _ = self.self_attn(
                query=x,
@@ -258,7 +308,7 @@ class PDSTransformerEncoderLayer(nn.Module):
                attn_mask=attn_mask,
            )
        x = self.dropout_module(x)
-        x = self.residual_connection(x, residual)
+        x = self.residual_connection(self.attn_res(x), residual)
        if not self.normalize_before:
            x = self.self_attn_layer_norm(x)


--- a/fairseq/modules/reduced_multihead_attention.py
+++ b/fairseq/modules/reduced_multihead_attention.py
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
-
 import math
 from typing import Dict, Optional, Tuple

 import torch
-import torch.nn.functional as F
 from fairseq import utils
 from fairseq.incremental_decoding_utils import with_incremental_state
 from fairseq.modules.fairseq_dropout import FairseqDropout
+from fairseq.modules.layer_norm import LayerNorm
 from fairseq.modules.quant_noise import quant_noise
 from torch import Tensor, nn
 from torch.nn import Parameter
@@ -38,6 +33,8 @@ class ReducedMultiheadAttention(nn.Module):
        q_noise=0.0,
        qn_block_size=8,
        sample_ratio=1,
+        reduced_method="conv",
+        reduced_q=False,
    ):
        super().__init__()
        self.embed_dim = embed_dim
@@ -85,13 +82,25 @@ class ReducedMultiheadAttention(nn.Module):

        self.add_zero_attn = add_zero_attn
        self.sample_ratio = sample_ratio
+        self.reduced_method = reduced_method
+        self.reduced_q = reduced_q
+        if reduced_q:
+            assert self.reduced_method == 'group', "only support grouped method for query reduction"
+
        if self.sample_ratio > 1:
+            if reduced_method == "conv":
                self.sr = nn.Conv1d(embed_dim, embed_dim,
                                    kernel_size=sample_ratio,
                                    stride=sample_ratio,
                                    # padding=(sample_ratio - 1) // 2
                                    )
-            self.norm = nn.LayerNorm(embed_dim)
+                self.norm = LayerNorm(embed_dim)
+            elif reduced_method == "pool":
+                self.linear = nn.Linear(embed_dim, embed_dim)
+                self.norm = LayerNorm(embed_dim)
+                self.act = nn.GELU()
+            elif reduced_method == "group":
+                pass

        self.reset_parameters()

@@ -159,41 +168,6 @@ class ReducedMultiheadAttention(nn.Module):
        assert embed_dim == self.embed_dim
        assert list(query.size()) == [tgt_len, bsz, embed_dim]

-        if (
-            self.sample_ratio == 1 and
-            not self.onnx_trace
-            and not is_tpu  # don't use PyTorch version on TPUs
-            and incremental_state is None
-            and not static_kv
-            # A workaround for quantization to work. Otherwise JIT compilation
-            # treats bias in linear module as method.
-            and not torch.jit.is_scripting()
-        ):
-            assert key is not None and value is not None
-            return F.multi_head_attention_forward(
-                query,
-                key,
-                value,
-                self.embed_dim,
-                self.num_heads,
-                torch.empty([0]),
-                torch.cat((self.q_proj.bias, self.k_proj.bias, self.v_proj.bias)),
-                self.bias_k,
-                self.bias_v,
-                self.add_zero_attn,
-                self.dropout_module.p,
-                self.out_proj.weight,
-                self.out_proj.bias,
-                self.training or self.dropout_module.apply_during_inference,
-                key_padding_mask,
-                need_weights,
-                attn_mask,
-                use_separate_proj_weight=True,
-                q_proj_weight=self.q_proj.weight,
-                k_proj_weight=self.k_proj.weight,
-                v_proj_weight=self.v_proj.weight,
-            )
-
        if incremental_state is not None:
            saved_state = self._get_input_buffer(incremental_state)
            if saved_state is not None and "prev_key" in saved_state:
@@ -205,16 +179,41 @@ class ReducedMultiheadAttention(nn.Module):
        else:
            saved_state = None

-        q = self.q_proj(query)
+        # only support self attention
        if self.self_attention:
+            query_ = query
            if self.sample_ratio > 1:
-                query_ = query.permute(1, 2, 0)    # bsz, dim, seq_len:
+                assert tgt_len % self.sample_ratio == 0, \
+                    ("sample ratio %d is mismatched with length %d" % (self.sample_ratio, tgt_len))
+                if self.reduced_method == "conv":
+                    query_ = query.permute(1, 2, 0)    # bsz, dim, seq_len
                    query_ = self.sr(query_).permute(2, 0, 1)  # seq_len, bsz, dim
-                query = self.norm(query_)
+                    query_ = self.norm(query_)
+                elif self.reduced_method == "pool":
+                    query_ = query.permute(1, 2, 0)    # bsz, dim, seq_len:
+                    pool_length = int(tgt_len / self.sample_ratio)
+                    query_ = nn.functional.adaptive_max_pool1d(query_, pool_length).permute(2, 0, 1)
+                    query_ = self.act(self.norm(query_))
+
+            if self.reduced_q:
+                q = self.q_proj(query_)
+                tgt_len = int(tgt_len / self.sample_ratio)
+            else:
+                q = self.q_proj(query)
+
+            k = self.k_proj(query_)
+            v = self.v_proj(query_)
+
+            if self.sample_ratio > 1 and self.reduced_method == "group":
+                assert self.reduced_q is True
+                self.head_dim *= self.sample_ratio
+                q = q.transpose(0, 1).contiguous().view(bsz, -1, self.embed_dim * self.sample_ratio).transpose(0, 1)
+
+                k = q.transpose(0, 1).view(bsz, -1, self.embed_dim * self.sample_ratio).transpose(0, 1)
+                v = q.transpose(0, 1).view(bsz, -1, self.embed_dim * self.sample_ratio).transpose(0, 1)

-            k = self.k_proj(query)
-            v = self.v_proj(query)
        elif self.encoder_decoder_attention:
+            q = self.q_proj(query)
            # encoder-decoder attention
            if key is None:
                assert value is None
@@ -224,10 +223,12 @@ class ReducedMultiheadAttention(nn.Module):
                v = self.v_proj(key)

        else:
+            q = self.q_proj(query)
            assert key is not None and value is not None
            k = self.k_proj(key)
            v = self.v_proj(value)
-        q *= self.scaling
+        # q *= self.scaling
+        q *= (self.head_dim ** -0.5)

        if self.bias_k is not None:
            assert self.bias_v is not None
@@ -313,13 +314,15 @@ class ReducedMultiheadAttention(nn.Module):
        if key_padding_mask is not None:

            if self.sample_ratio > 1:
-                lengths = (~key_padding_mask).sum(-1)
-                lengths = (lengths / self.sample_ratio).long()
-                # lengths = ((lengths.float() - 1) / self.sample_ratio + 1).floor().long()
-                max_length = src_len
-                assert max_length >= max(lengths), (max_length, max(lengths))
-                mask = torch.arange(max_length).to(lengths.device).view(1, max_length)
-                key_padding_mask = mask.expand(bsz, -1) >= lengths.view(bsz, 1).expand(-1, max_length)
+                key_padding_mask = key_padding_mask[:, ::self.sample_ratio]
+
+                # lengths = (~key_padding_mask).sum(-1)
+                # lengths = (lengths / self.sample_ratio).long()
+                # # lengths = ((lengths.float() - 1) / self.sample_ratio + 1).floor().long()
+                # max_length = src_len
+                # assert max_length >= max(lengths), (max_length, max(lengths))
+                # mask = torch.arange(max_length).to(lengths.device).view(1, max_length)
+                # key_padding_mask = mask.expand(bsz, -1) >= lengths.view(bsz, 1).expand(-1, max_length)

            assert key_padding_mask.size(0) == bsz
            assert key_padding_mask.size(1) == src_len
@@ -380,6 +383,10 @@ class ReducedMultiheadAttention(nn.Module):

        assert v is not None
        attn = torch.bmm(attn_probs, v)
+        if self.sample_ratio > 1 and self.reduced_q:
+            tgt_len = attn.size(1) * self.sample_ratio
+            self.head_dim = int(self.head_dim / self.sample_ratio)
+            attn = attn.view(bsz * self.num_heads, tgt_len, self.head_dim)
        assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]
        if self.onnx_trace and attn.size(1) == 1:
            # when ONNX tracing a single decoder step (sequence length == 1)

--- a/fairseq/modules/utils.py
+++ b/fairseq/modules/utils.py
+import torch
+from torch import nn as nn
+
+
+class Transpose(nn.Module):
+    """Module form of ``Tensor.transpose``.
+
+    Wrapping the call in an ``nn.Module`` lets the axis swap be used as a
+    layer, e.g. inside a container such as ``nn.Sequential``.
+
+    Args:
+        dim0: first axis to swap
+        dim1: second axis to swap
+    """
+
+    def __init__(self, dim0, dim1):
+        super(Transpose, self).__init__()
+        # Remember the two axes; they are applied on every forward call.
+        self.dim0 = dim0
+        self.dim1 = dim1
+
+    def forward(self, x):
+        """Return ``x`` with axes ``dim0`` and ``dim1`` exchanged."""
+        return x.transpose(self.dim0, self.dim1)
+
+
+class Permute3D(nn.Module):
+    """Module form of ``Tensor.permute`` for three axes.
+
+    In ``pds_layer.py`` it is placed around ``nn.MaxPool1d`` / ``nn.Conv1d``
+    inside ``nn.Sequential``: ``Permute3D(1, 2, 0)`` before the 1-D op and
+    ``Permute3D(2, 0, 1)`` after it, which restores the original axis order.
+    NOTE(review): presumably the input there is (seq_len, batch, dim) and the
+    1-D op sees (batch, dim, seq_len) -- confirm against the caller.
+
+    Args:
+        dim0: source axis that becomes output axis 0
+        dim1: source axis that becomes output axis 1
+        dim2: source axis that becomes output axis 2
+    """
+
+    def __init__(self, dim0, dim1, dim2):
+        super(Permute3D, self).__init__()
+        # Target axis order, applied on every forward call.
+        self.dim0 = dim0
+        self.dim1 = dim1
+        self.dim2 = dim2
+
+    def forward(self, x):
+        """Return ``x`` with its axes reordered to ``(dim0, dim1, dim2)``."""
+        return x.permute(self.dim0, self.dim1, self.dim2)