Cumulative updates. I mainly optimize the shell scripts and support the new…

Cumulative updates. I mainly optimized the shell scripts and added support for the new benchmarks, which makes the toolkit friendlier to MT researchers (myself included). I also improved the code. Of course, old problems still remain and new problems will arise. Just keep coding.

Cumulative updates. I mainly optimize the shell scripts and support the new…
Cumulative updates. I mainly optimized the shell scripts and added support for the new benchmarks, which makes the toolkit friendlier to MT researchers (myself included). I also improved the code. Of course, old problems still remain and new problems will arise. Just keep coding.
2215ade0 · xuchen · a2353895 · 2215ade0 · 2215ade0 · 2215ade0
Commit 2215ade0 authored Jan 08, 2022 by xuchen
--- a/egs/aishell/asr/conf/base.yaml
+++ b/egs/aishell/asr/conf/base.yaml
@@ -26,3 +26,6 @@ decoder-ffn-embed-dim: 2048
 decoder-attention-heads: 4
 attention-dropout: 0.1
 activation-dropout: 0.1
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
\ No newline at end of file
--- a/egs/aishell/asr/conf/big.yaml
+++ b/egs/aishell/asr/conf/big.yaml
+arch: s2t_transformer_m
+share-decoder-input-output-embed: True
+optimizer: adam
+clip-norm: 10.0
+lr-scheduler: inverse_sqrt
+warmup-init-lr: 1e-7
+warmup-updates: 10000
+lr: 1e-3
+#adam_betas: (0.9,0.98)
+criterion: label_smoothed_cross_entropy_with_ctc
+label_smoothing: 0.1
+conv-kernel-sizes: 5,5
+conv-channels: 1024
+dropout: 0.15
+activation-fn: relu
+encoder-embed-dim: 512
+encoder-ffn-embed-dim: 2048
+encoder-layers: 12
+decoder-layers: 6
+encoder-attention-heads: 8
+decoder-embed-dim: 512
+decoder-ffn-embed-dim: 2048
+decoder-attention-heads: 8
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
--- a/egs/aishell/asr/conf/ctc.yaml
+++ b/egs/aishell/asr/conf/ctc.yaml
 ctc-weight: 0.3
 post-process: sentencepiece
\ No newline at end of file
--- a/egs/aishell/asr/conf/pds_base.yaml
+++ b/egs/aishell/asr/conf/pds_base.yaml
@@ -24,3 +24,6 @@ encoder-attention-heads: 4
 decoder-embed-dim: 256
 decoder-ffn-embed-dim: 2048
 decoder-attention-heads: 4
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
\ No newline at end of file
--- a/egs/aishell/asr/conf/pds_base_16.yaml
+++ b/egs/aishell/asr/conf/pds_base_16.yaml
@@ -37,3 +37,6 @@ encoder-attention-heads: 4
 decoder-embed-dim: 256
 decoder-ffn-embed-dim: 2048
 decoder-attention-heads: 4
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
\ No newline at end of file
--- a/egs/aishell/asr/conf/pds_base_32.yaml
+++ b/egs/aishell/asr/conf/pds_base_32.yaml
@@ -37,3 +37,6 @@ encoder-attention-heads: 4
 decoder-embed-dim: 256
 decoder-ffn-embed-dim: 2048
 decoder-attention-heads: 4
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
\ No newline at end of file
--- a/egs/aishell/asr/conf/pds_base_8.yaml
+++ b/egs/aishell/asr/conf/pds_base_8.yaml
@@ -37,3 +37,6 @@ encoder-attention-heads: 4
 decoder-embed-dim: 256
 decoder-ffn-embed-dim: 2048
 decoder-attention-heads: 4
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
--- a/egs/librispeech/asr/conf/debug.yaml
+++ b/egs/librispeech/asr/conf/debug.yaml
-#arch: pdss2t_transformer_s
+arch: pdss2t_transformer_m_8
-#arch: s2t_transformer_s
-arch: s2t_sate
+encoder-embed-dim: 512
-encoder-embed-dim: 256
 pds-stages: 4
-#pds-dropout: 0
+ctc-layer: 12
-pds-layers: 2_2_6_2
+pds-layers: 3_3_3_3
-pds-ratios: 2_2_2_2
+pds-ratios: 2_2_1_2
 pds-fusion: True
 pds-fusion-method: all_conv
-pds-embed-dims: 256_256_256_256
+pds-embed-dims: 512_512_512_512
 pds-ds-method: conv
 pds-embed-norm: True
 pds-position-embed: 1_1_1_1
 pds-kernel-sizes: 5_5_5_5
-pds-ffn-ratios: 8_8_8_8
+pds-ffn-ratios: 4_4_4_4
-pds-attn-heads: 4_4_4_4
+pds-attn-heads: 8_8_8_8
-cl-dropout: True
-cl-dropout-epoch: 50
-train-subset: train-clean-100
-valid-subset: dev-clean
-max-epoch: 100
-max-update: 300000
-num-workers: 8
-patience: 20
-no-progress-bar: True
-log-interval: 100
-seed: 1
-report-accuracy: True
-#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
 share-decoder-input-output-embed: True
 optimizer: adam
@@ -45,19 +25,18 @@ lr: 2e-3
 #adam_betas: (0.9,0.98)
 criterion: label_smoothed_cross_entropy_with_ctc
-ctc-weight: 0.3
 label_smoothing: 0.1
-conv-channels: 1024
 dropout: 0.1
 activation-fn: relu
 encoder-ffn-embed-dim: 2048
 encoder-layers: 12
 decoder-layers: 6
-encoder-attention-heads: 4
+encoder-attention-heads: 8
-decoder-embed-dim: 256
+decoder-embed-dim: 512
 decoder-ffn-embed-dim: 2048
-decoder-attention-heads: 4
+decoder-attention-heads: 8
-attention-dropout: 0.1
-activation-dropout: 0.1
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
--- a/egs/aishell/asr/run.sh
+++ b/egs/aishell/asr/run.sh
@@ -44,7 +44,7 @@ use_raw_audio=0
 use_specific_dict=0
 specific_prefix=st
-specific_dir=${root_dir}/data/mustc/st/en-de
+specific_dir=${root_dir}/data/mustc/st
 asr_vocab_prefix=spm_unigram10000_st_share
 org_data_dir=${root_dir}/data/${dataset}
@@ -111,7 +111,7 @@ if [[ -z ${exp_name} ]]; then
        exp_name=${exp_name}_${extra_tag}
    fi
 fi
-model_dir=$code_dir/../checkpoints/$dataset/asr/${exp_name}
+model_dir=${root_dir}/checkpoints/${dataset}/asr/${exp_name}
 if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
    echo "stage -1: Data Download"
@@ -125,11 +125,12 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    if [[ ! -e ${data_dir} ]]; then
        mkdir -p ${data_dir}
    fi
-    if [[ ! -f ${data_dir}/fbank80.zip && -f ${data_dir}/../fbank80.zip ]]; then
+    feature_zip=fbank80.zip
-        ln -s ${data_dir}/../fbank80.zip ${data_dir}
+    if [[ ${speed_perturb} -eq 1 ]]; then
+        feature_zip=fbank80_sp.zip
    fi
-    if [[ ! -f ${data_dir}/fbank80_sp.zip && -f ${data_dir}/../fbank80_sp.zip ]]; then
+    if [[ ! -f ${data_dir}/${feature_zip} && -f ${data_dir}/../${feature_zip} ]]; then
-        ln -s ${data_dir}/../fbank80_sp.zip ${data_dir}
+        ln -s ${data_dir}/../${feature_zip} ${data_dir}
    fi
    cmd="python ${code_dir}/examples/speech_to_text/prep_audio_data.py
@@ -167,13 +168,9 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    echo -e "\033[34mRun command: \n${cmd} \033[0m"
    [[ $eval -eq 1 ]] && eval ${cmd}
-    if [[ ! -f ${data_dir}/../fbank80.zip ]]; then
+    if [[ -f ${data_dir}/${feature_zip} && ! -f ${data_dir}/../${feature_zip} ]]; then
-        mv ${data_dir}/fbank80.zip ${data_dir}/..
+        mv ${data_dir}/${feature_zip} ${data_dir}/..
-        ln -s ${data_dir}/../fbank80.zip ${data_dir}
+        ln -s ${data_dir}/../${feature_zip} ${data_dir}
-    fi
-    if [[ ! -f ${data_dir}/../fbank80_sp.zip ]]; then
-        mv ${data_dir}/fbank80_sp.zip ${data_dir}/..
-        ln -s ${data_dir}/../fbank80_sp.zip ${data_dir}
    fi
 fi

--- a/egs/iwslt14/mt/conf/base.yaml
+++ b/egs/iwslt14/mt/conf/base.yaml
 arch: transformer
-share-decoder-input-output-embed: True
+share-all-embeddings: True
 optimizer: adam
 clip-norm: 10.0
 lr-scheduler: inverse_sqrt
@@ -28,5 +28,5 @@ decoder-embed-dim: 512
 decoder-ffn-embed-dim: 1024
 decoder-attention-heads: 4
-load-pretrained-encoder-from:
+#load-pretrained-encoder-from:
-load-pretrained-decoder-from:
+#load-pretrained-decoder-from:
\ No newline at end of file
--- a/egs/iwslt14/mt/conf/base_fair.yaml
+++ b/egs/iwslt14/mt/conf/base_fair.yaml
 arch: transformer_iwslt_de_en
-share-decoder-input-output-embed: True
+share-all-embeddings: True
 optimizer: adam
 #clip-norm: 10.0
 lr-scheduler: inverse_sqrt
@@ -27,5 +27,5 @@ decoder-embed-dim: 512
 decoder-ffn-embed-dim: 1024
 decoder-attention-heads: 4
-load-pretrained-encoder-from:
+#load-pretrained-encoder-from:
-load-pretrained-decoder-from:
+#load-pretrained-decoder-from:
\ No newline at end of file
--- a/egs/iwslt14/mt/run.sh
+++ b/egs/iwslt14/mt/run.sh
 #! /bin/bash
-# Processing MuST-C Datasets
+# Processing IWSLT2016 De-En Datasets
 # Copyright 2021 Natural Language Processing Laboratory 
 # Xu Chen (xuchenneu@163.com)
@@ -43,7 +43,7 @@ tokenizer=1
 use_specific_dict=0
 specific_prefix=st
-specific_dir=${root_dir}/data/mustc/st/en-de/
+specific_dir=${root_dir}/data/mustc/st
 src_vocab_prefix=spm_unigram10000_st_share
 tgt_vocab_prefix=spm_unigram10000_st_share
@@ -78,7 +78,7 @@ beam_size=5
 len_penalty=1.0
 if [[ ${use_specific_dict} -eq 1 ]]; then
-    exp_prefix=${specific_prefix}_${exp_prefix}
+    exp_prefix=${exp_prefix}_${specific_prefix}
    data_dir=${data_dir}/${specific_prefix}
    mkdir -p ${data_dir}
 else
@@ -119,7 +119,7 @@ if [[ -z ${exp_name} ]]; then
        exp_name=${exp_name}_${extra_tag}
    fi
 fi
-model_dir=$code_dir/../checkpoints/$dataset/mt/${exp_name}
+model_dir=${root_dir}/checkpoints/${dataset}/mt/${exp_name}
 if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
    echo "stage -1: Data Download"
@@ -332,12 +332,14 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
        # Average models
 		dec_model=avg_${n_average}_checkpoint.pt
-		cmd="python ${code_dir}/scripts/average_checkpoints.py
+        if [[ ! -f ${model_dir}/${dec_model} ]]; then
-        --inputs ${model_dir}
+            cmd="python ${code_dir}/scripts/average_checkpoints.py
-        --num-best-checkpoints ${n_average}
+            --inputs ${model_dir}
-        --output ${model_dir}/${dec_model}"
+            --num-best-checkpoints ${n_average}
-    	echo -e "\033[34mRun command: \n${cmd} \033[0m"
+            --output ${model_dir}/${dec_model}"
-    	[[ $eval -eq 1 ]] && eval $cmd
+            echo -e "\033[34mRun command: \n${cmd} \033[0m"
+            [[ $eval -eq 1 ]] && eval $cmd
+        fi
 	else
 		dec_model=${dec_model}
 	fi

--- a/egs/libri_trans/asr/conf/debug.yaml
+++ b/egs/libri_trans/asr/conf/debug.yaml
-arch: s2t_sate
+arch: multi_ctc_s2t_transformer_s
+multi-ctc-layers: 6,8,10,12
 share-decoder-input-output-embed: True
 optimizer: adam
 clip-norm: 10.0
@@ -12,47 +13,18 @@ ctc-weight: 0.3
 criterion: label_smoothed_cross_entropy_with_ctc
 label_smoothing: 0.1
-encoder-normalize-before: True
-decoder-normalize-before: True
 conv-kernel-sizes: 5,5
 conv-channels: 1024
 dropout: 0.1
 activation-fn: relu
 encoder-embed-dim: 256
 encoder-ffn-embed-dim: 2048
-encoder-layers: 2
+encoder-layers: 12
-text-encoder-layers: 2
+decoder-layers: 6
-decoder-layers: 2
 encoder-attention-heads: 4
-#load-pretrained-encoder-from:
-#load-pretrained-acoustic-encoder-from:
-#load-pretrained-text-encoder-from:
-#load-pretrained-decoder-from:
-#macaron-style: True
-#use-cnn-module: True
-#cnn-module-kernel: 31
-#acoustic-encoder: pds
-acoustic-encoder: transformer
-adapter: shrink
-encoder-embed-dim: 256
-pds-stages: 4
-#pds-dropout: 0
-pds-layers: 3_3_3_3
-pds-ratios: 2_2_1_2
-pds-fusion: True
-pds-fusion-method: all_conv
-pds-embed-dims: 256_256_256_256
-pds-ds-method: conv
-pds-embed-norm: True
-pds-position-embed: 1_1_1_1
-pds-kernel-sizes: 5_5_5_5
-pds-ffn-ratios: 8_8_8_8
-pds-attn-heads: 4_4_4_4
 decoder-embed-dim: 256
 decoder-ffn-embed-dim: 2048
 decoder-attention-heads: 4
\ No newline at end of file
+attention-dropout: 0.1
+activation-dropout: 0.1
--- a/egs/libri_trans/asr/run.sh
+++ b/egs/libri_trans/asr/run.sh
@@ -106,7 +106,7 @@ if [[ -z ${exp_name} ]]; then
        exp_name=${exp_name}_${extra_tag}
    fi
 fi
-model_dir=${code_dir}/../checkpoints/${dataset}/asr/${exp_name}
+model_dir=${root_dir}/checkpoints/${dataset}/asr/${exp_name}
 if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
    echo "stage -1: Data Download"

--- a/egs/libri_trans/mt/run.sh
+++ b/egs/libri_trans/mt/run.sh
@@ -78,7 +78,7 @@ beam_size=5
 len_penalty=1.0
 if [[ ${use_specific_dict} -eq 1 ]]; then
-    exp_prefix=${specific_prefix}_${exp_prefix}
+    exp_prefix=${exp_prefix}_${specific_prefix}
    data_dir=${data_dir}/${specific_prefix}
    mkdir -p ${data_dir}
 else
@@ -113,7 +113,7 @@ if [[ -z ${exp_name} ]]; then
        exp_name=${exp_name}_${extra_tag}
    fi
 fi
-model_dir=$root_dir/../checkpoints/$dataset/mt/${exp_name}
+model_dir=${root_dir}/../checkpoints/${dataset}/mt/${exp_name}
 if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
    echo "stage -1: Data Download"

--- a/egs/libri_trans/st/run.sh
+++ b/egs/libri_trans/st/run.sh
@@ -115,7 +115,7 @@ if [[ -z ${exp_name} ]]; then
        exp_name=${exp_name}_${extra_tag}
    fi
 fi
-model_dir=$code_dir/../checkpoints/$dataset/st/${exp_name}
+model_dir=${root_dir}/checkpoints/${dataset}/st/${exp_name}
 if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
    echo "stage -1: Data Download"

--- a/egs/librispeech/asr/conf/ctc.yaml
+++ b/egs/librispeech/asr/conf/ctc.yaml
 ctc-weight: 0.3
+post-process: sentencepiece
\ No newline at end of file
--- a/egs/librispeech/asr/conf/pds_base.yaml
+++ b/egs/librispeech/asr/conf/pds_base.yaml
 arch: pdss2t_transformer_s_8
+#arch: pdss2t_transformer_s_16
+#arch: pdss2t_transformer_s_32
+pds-fusion: True
+ctc-layer: 12
 share-decoder-input-output-embed: True
 optimizer: adam
 clip-norm: 10.0

--- a/egs/librispeech/asr/conf/pds_base_16.yaml
+++ b/egs/librispeech/asr/conf/pds_base_16.yaml
@@ -2,7 +2,7 @@ arch: pdss2t_transformer_s_16
 encoder-embed-dim: 256
 pds-stages: 4
-#pds-dropout: 0
+ctc-layer: 12
 pds-layers: 2_2_6_2
 pds-ratios: 2_2_2_2
 pds-fusion: True

--- a/egs/librispeech/asr/conf/pds_base_32.yaml
+++ b/egs/librispeech/asr/conf/pds_base_32.yaml
@@ -2,7 +2,7 @@ arch: pdss2t_transformer_s_32
 encoder-embed-dim: 256
 pds-stages: 5
-#pds-dropout: 0
+ctc-layer: 12
 pds-layers: 2_2_3_3_2
 pds-ratios: 2_2_2_2_2
 pds-fusion: True

--- a/egs/librispeech/asr/conf/pds_base_8.yaml
+++ b/egs/librispeech/asr/conf/pds_base_8.yaml
@@ -2,7 +2,7 @@ arch: pdss2t_transformer_s_8
 encoder-embed-dim: 256
 pds-stages: 4
-#pds-dropout: 0
+ctc-layer: 12
 pds-layers: 3_3_3_3
 pds-ratios: 2_2_1_2
 pds-fusion: True

--- a/egs/librispeech/asr/conf/pds_big.yaml
+++ b/egs/librispeech/asr/conf/pds_big.yaml
@@ -2,6 +2,9 @@ arch: pdss2t_transformer_m_8
 #arch: pdss2t_transformer_m_16
 #arch: pdss2t_transformer_m_32
+pds-fusion: True
+ctc-layer: 12
 share-decoder-input-output-embed: True
 optimizer: adam
 clip-norm: 10.0
@@ -14,7 +17,7 @@ lr: 2e-3
 criterion: label_smoothed_cross_entropy_with_ctc
 label_smoothing: 0.1
-dropout: 0.1
+dropout: 0.15
 activation-fn: relu
 encoder-ffn-embed-dim: 2048
 encoder-layers: 12

--- a/egs/librispeech/asr/conf/pds_big_16.yaml
+++ b/egs/librispeech/asr/conf/pds_big_16.yaml
@@ -27,7 +27,7 @@ lr: 2e-3
 criterion: label_smoothed_cross_entropy_with_ctc
 label_smoothing: 0.1
-dropout: 0.1
+dropout: 0.15
 activation-fn: relu
 encoder-ffn-embed-dim: 2048
 encoder-layers: 12

--- a/egs/librispeech/asr/conf/pds_big_32.yaml
+++ b/egs/librispeech/asr/conf/pds_big_32.yaml
@@ -27,7 +27,7 @@ lr: 2e-3
 criterion: label_smoothed_cross_entropy_with_ctc
 label_smoothing: 0.1
-dropout: 0.1
+dropout: 0.15
 activation-fn: relu
 encoder-ffn-embed-dim: 2048
 encoder-layers: 12

--- a/egs/librispeech/asr/conf/pds_big_8.yaml
+++ b/egs/librispeech/asr/conf/pds_big_8.yaml
@@ -2,7 +2,6 @@ arch: pdss2t_transformer_m_8
 encoder-embed-dim: 512
 pds-stages: 4
-#pds-dropout: 0
 pds-layers: 3_3_3_3
 pds-ratios: 2_2_1_2
 pds-fusion: True
@@ -27,7 +26,7 @@ lr: 2e-3
 criterion: label_smoothed_cross_entropy_with_ctc
 label_smoothing: 0.1
-dropout: 0.1
+dropout: 0.15
 activation-fn: relu
 encoder-ffn-embed-dim: 2048
 encoder-layers: 12

--- a/egs/librispeech/asr/conf/pds_deep.yaml
+++ b/egs/librispeech/asr/conf/pds_deep.yaml
@@ -2,6 +2,9 @@ arch: pdss2t_transformer_sd_8
 #arch: pdss2t_transformer_sd_16
 #arch: pdss2t_transformer_sd_32
+pds-fusion: True
+ctc-layer: 12
 share-decoder-input-output-embed: True
 optimizer: adam
 clip-norm: 10.0

--- a/egs/librispeech/asr/conf/pds_deep_16.yaml
+++ b/egs/librispeech/asr/conf/pds_deep_16.yaml
@@ -2,7 +2,7 @@ arch: pdss2t_transformer_sd_16
 encoder-embed-dim: 256
 pds-stages: 4
-#pds-dropout: 0
+ctc-layer: 12
 pds-layers: 5_5_12_8
 pds-ratios: 2_2_2_2
 pds-fusion: True

--- a/egs/librispeech/asr/conf/pds_deep_32.yaml
+++ b/egs/librispeech/asr/conf/pds_deep_32.yaml
@@ -2,7 +2,7 @@ arch: pdss2t_transformer_sd_32
 encoder-embed-dim: 256
 pds-stages: 5
-#pds-dropout: 0
+ctc-layer: 12
 pds-layers: 5_5_7_7_6
 pds-ratios: 2_2_2_2_2
 pds-fusion: True

--- a/egs/librispeech/asr/conf/pds_deep_8.yaml
+++ b/egs/librispeech/asr/conf/pds_deep_8.yaml
@@ -2,7 +2,7 @@ arch: pdss2t_transformer_sd_8
 encoder-embed-dim: 256
 pds-stages: 4
-#pds-dropout: 0
+ctc-layer: 12
 pds-layers: 7_7_7_9
 pds-ratios: 2_2_1_2
 pds-fusion: True

--- a/egs/librispeech/asr/run.sh
+++ b/egs/librispeech/asr/run.sh
@@ -44,7 +44,7 @@ specific_dir=/home/xuchen/st/data/mustc/st_lcrm/en-de
 asr_vocab_prefix=spm_unigram10000_st_share
 org_data_dir=${root_dir}/data/${dataset}
-data_dir=${root_dir}/data/${dataset}
+data_dir=${root_dir}/data/${dataset}/asr
 test_subset=dev-clean,dev-other,test-clean,test-other
 # exp
@@ -87,7 +87,7 @@ if [[ -z ${exp_name} ]]; then
        exp_name=${exp_name}_${extra_tag}
    fi
 fi
-model_dir=$code_dir/../checkpoints/$dataset/asr/${exp_name}
+model_dir=${root_dir}/checkpoints/${dataset}/asr/${exp_name}
 if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
    echo "stage -1: Data Download"
@@ -110,7 +110,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
        --vocab-size ${vocab_size}"
    if [[ ${use_specific_dict} -eq 1 ]]; then
-        cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}/${lang}
+        cp -r ${specific_dir}/${asr_vocab_prefix}.* ${data_dir}
        cmd="$cmd
        --asr-prefix ${asr_vocab_prefix}"
    fi

--- a/egs/mustc/asr/conf/base.yaml
+++ b/egs/mustc/asr/conf/base.yaml
@@ -26,3 +26,6 @@ decoder-ffn-embed-dim: 2048
 decoder-attention-heads: 4
 attention-dropout: 0.1
 activation-dropout: 0.1
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
\ No newline at end of file
--- a/egs/mustc/asr/conf/big.yaml
+++ b/egs/mustc/asr/conf/big.yaml
+arch: s2t_transformer_m
+share-decoder-input-output-embed: True
+optimizer: adam
+clip-norm: 10.0
+lr-scheduler: inverse_sqrt
+warmup-init-lr: 1e-7
+warmup-updates: 10000
+lr: 2e-3
+#adam_betas: (0.9,0.98)
+criterion: label_smoothed_cross_entropy_with_ctc
+label_smoothing: 0.1
+conv-kernel-sizes: 5,5
+conv-channels: 1024
+dropout: 0.15
+activation-fn: relu
+encoder-embed-dim: 512
+encoder-ffn-embed-dim: 2048
+encoder-layers: 12
+decoder-layers: 6
+encoder-attention-heads: 8
+decoder-embed-dim: 512
+decoder-ffn-embed-dim: 2048
+decoder-attention-heads: 8
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
--- a/egs/mustc/asr/conf/pds_base.yaml
+++ b/egs/mustc/asr/conf/pds_base.yaml
@@ -24,3 +24,6 @@ encoder-attention-heads: 4
 decoder-embed-dim: 256
 decoder-ffn-embed-dim: 2048
 decoder-attention-heads: 4
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
\ No newline at end of file
--- a/egs/mustc/asr/conf/pds_base_16.yaml
+++ b/egs/mustc/asr/conf/pds_base_16.yaml
@@ -37,3 +37,6 @@ encoder-attention-heads: 4
 decoder-embed-dim: 256
 decoder-ffn-embed-dim: 2048
 decoder-attention-heads: 4
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
\ No newline at end of file
--- a/egs/mustc/asr/conf/pds_base_32.yaml
+++ b/egs/mustc/asr/conf/pds_base_32.yaml
@@ -37,3 +37,6 @@ encoder-attention-heads: 4
 decoder-embed-dim: 256
 decoder-ffn-embed-dim: 2048
 decoder-attention-heads: 4
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
\ No newline at end of file
--- a/egs/mustc/asr/conf/pds_base_8.yaml
+++ b/egs/mustc/asr/conf/pds_base_8.yaml
@@ -37,3 +37,6 @@ encoder-attention-heads: 4
 decoder-embed-dim: 256
 decoder-ffn-embed-dim: 2048
 decoder-attention-heads: 4
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
--- a/egs/mustc/asr/conf/pds_big_8.yaml
+++ b/egs/mustc/asr/conf/pds_big_8.yaml
+arch: pdss2t_transformer_m_8
+encoder-embed-dim: 512
+pds-stages: 4
+ctc-layer: 12
+pds-layers: 3_3_3_3
+pds-ratios: 2_2_1_2
+pds-fusion: True
+pds-fusion-method: all_conv
+pds-embed-dims: 512_512_512_512
+pds-ds-method: conv
+pds-embed-norm: True
+pds-position-embed: 1_1_1_1
+pds-kernel-sizes: 5_5_5_5
+pds-ffn-ratios: 4_4_4_4
+pds-attn-heads: 8_8_8_8
+share-decoder-input-output-embed: True
+optimizer: adam
+clip-norm: 10.0
+lr-scheduler: inverse_sqrt
+warmup-init-lr: 1e-7
+warmup-updates: 10000
+lr: 2e-3
+#adam_betas: (0.9,0.98)
+criterion: label_smoothed_cross_entropy_with_ctc
+label_smoothing: 0.1
+dropout: 0.15
+activation-fn: relu
+encoder-ffn-embed-dim: 2048
+encoder-layers: 12
+decoder-layers: 6
+encoder-attention-heads: 8
+decoder-embed-dim: 512
+decoder-ffn-embed-dim: 2048
+decoder-attention-heads: 8
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
--- a/egs/mustc/asr/decode.sh
+++ b/egs/mustc/asr/decode.sh
@@ -3,13 +3,14 @@
 gpu_num=1
 data_dir=
-test_subset=(tst-COMMON)
+test_subset=(dev tst-COMMON)
 exp_name=
 if [ "$#" -eq 1 ]; then
    exp_name=$1
 fi
+cer=0
 n_average=10
 beam_size=5
 len_penalty=1.0
@@ -22,6 +23,7 @@ cmd="./run.sh
    --gpu_num ${gpu_num}
    --exp_name ${exp_name}
    --n_average ${n_average}
+    --cer ${cer}
    --beam_size ${beam_size}
    --len_penalty ${len_penalty}
    --max_tokens ${max_tokens}

--- a/egs/mustc/asr/run.sh
+++ b/egs/mustc/asr/run.sh
@@ -71,6 +71,7 @@ max_tokens=40000
 step_valid=0
 # decoding setting
+cer=0
 dec_model=checkpoint_best.pt
 n_average=10
 beam_size=5
@@ -106,7 +107,7 @@ if [[ -z ${exp_name} ]]; then
        exp_name=${exp_name}_${extra_tag}
    fi
 fi
-model_dir=${code_dir}/../checkpoints/${dataset}/asr/${exp_name}
+model_dir=${root_dir}/checkpoints/${dataset}/asr/${exp_name}
 if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
    echo "stage -1: Data Download"
@@ -339,6 +340,12 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
        --wer-lowercase
        --wer-remove-punct
        "
+        if [[ ${cer} -eq 1 ]]; then
+            cmd="${cmd}
+        --wer-char-level"
+        fi
    	echo -e "\033[34mRun command: \n${cmd} \033[0m"
        if [[ $eval -eq 1 ]]; then
@@ -346,5 +353,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    	    tail -n 1 ${model_dir}/generate-${subset}.txt >> ${result_file}
        fi
 	done
    cat ${result_file}
 fi
--- a/egs/mustc/mt/conf/base.yaml
+++ b/egs/mustc/mt/conf/base.yaml
 arch: transformer
-share-decoder-input-output-embed: True
+share-all-embeddings: True
 optimizer: adam
 clip-norm: 10.0
 lr-scheduler: inverse_sqrt
@@ -27,3 +27,6 @@ encoder-attention-heads: 8
 decoder-embed-dim: 512
 decoder-ffn-embed-dim: 2048
 decoder-attention-heads: 8
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
\ No newline at end of file
--- a/egs/mustc/mt/conf/base_s.yaml
+++ b/egs/mustc/mt/conf/base_s.yaml
 arch: transformer
-share-decoder-input-output-embed: True
+share-all-embeddings: True
 optimizer: adam
 clip-norm: 10.0
 lr-scheduler: inverse_sqrt
@@ -27,3 +27,6 @@ encoder-attention-heads: 4
 decoder-embed-dim: 256
 decoder-ffn-embed-dim: 2048
 decoder-attention-heads: 4
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
\ No newline at end of file
--- a/egs/mustc/mt/run.sh
+++ b/egs/mustc/mt/run.sh
@@ -41,7 +41,7 @@ share_dict=1
 lcrm=0
 tokenizer=0
-use_specific_dict=0
+use_specific_dict=1
 specific_prefix=st
 specific_dir=${root_dir}/data/mustc/st
 src_vocab_prefix=spm_unigram10000_st_share
@@ -78,17 +78,23 @@ beam_size=5
 len_penalty=1.0
 if [[ ${use_specific_dict} -eq 1 ]]; then
-    exp_prefix=${specific_prefix}_${exp_prefix}
+    exp_prefix=${exp_prefix}_${specific_prefix}
    data_dir=${data_dir}/${specific_prefix}
    mkdir -p ${data_dir}
 else
-    data_dir=${data_dir}/${vocab_type}${vocab_size}
+    if [[ "${vocab_type}" == "char" ]]; then
-    src_vocab_prefix=spm_${vocab_type}${vocab_size}_${src_lang}
+        vocab_name=${vocab_type}
-    tgt_vocab_prefix=spm_${vocab_type}${vocab_size}_${tgt_lang}
+        exp_prefix=${exp_prefix}_${vocab_type}
+    else
+        vocab_name=${vocab_type}${vocab_size}
+    fi
+    data_dir=${data_dir}/${vocab_name}
+    src_vocab_prefix=spm_${vocab_name}_${src_lang}
+    tgt_vocab_prefix=spm_${vocab_name}_${tgt_lang}
    if [[ $share_dict -eq 1 ]]; then
        data_dir=${data_dir}_share
-        src_vocab_prefix=spm_${vocab_type}${vocab_size}_share
+        src_vocab_prefix=spm_${vocab_name}_share
-        tgt_vocab_prefix=spm_${vocab_type}${vocab_size}_share
+        tgt_vocab_prefix=spm_${vocab_name}_share
    fi
 fi
 if [[ ${lcrm} -eq 1 ]]; then
@@ -113,7 +119,7 @@ if [[ -z ${exp_name} ]]; then
        exp_name=${exp_name}_${extra_tag}
    fi
 fi
-model_dir=$root_dir/../checkpoints/$dataset/mt/${exp_name}
+model_dir=${root_dir}/checkpoints/${dataset}/mt/${exp_name}
 if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
    echo "stage -1: Data Download"
@@ -152,7 +158,11 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    mkdir -p ${data_dir}/data
    for split in ${train_subset} ${valid_subset} ${trans_subset}; do
    {
-        txt_dir=${org_data_dir}/data/${split}/txt
+        if [[ -d ${org_data_dir}/data/${split}/txt ]]; then
+            txt_dir=${org_data_dir}/data/${split}/txt
+        else
+            txt_dir=${org_data_dir}/data/${split}
+        fi
        cmd="cat ${txt_dir}/${split}.${src_lang}"
        if [[ ${lcrm} -eq 1 ]]; then
            cmd="python local/lower_rm.py ${org_data_dir}/data/${split}.${src_lang}"
@@ -264,13 +274,9 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    if [[ $step_valid -eq 1 ]]; then
        validate_interval=1
        save_interval=1
-        keep_last_epochs=10
        no_epoch_checkpoints=0
        save_interval_updates=500
        keep_interval_updates=10
-    else
-        validate_interval=1
-        keep_last_epochs=10
    fi
    if [[ $bleu_valid -eq 1 ]]; then
        cmd="$cmd
@@ -293,10 +299,6 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        cmd="${cmd}
        --save-interval $save_interval "
    fi
-    if [[ -n $keep_last_epochs ]]; then
-        cmd="${cmd}
-        --keep-last-epochs $keep_last_epochs "
-    fi
    if [[ -n $save_interval_updates ]]; then
        cmd="${cmd}
        --save-interval-updates $save_interval_updates"
@@ -374,7 +376,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
            cmd="${cmd}
        --scoring sacrebleu"
            if [[ ${tokenizer} -eq 1 ]]; then
-            cmd="${cmd}
+                cmd="${cmd}
        --tokenizer moses
        --moses-source-lang ${src_lang}
        --moses-target-lang ${tgt_lang}"

--- a/egs/mustc/st/conf/sate.yaml
+++ b/egs/mustc/st/conf/sate.yaml
+arch: s2t_sate
+share-decoder-input-output-embed: True
+optimizer: adam
+clip-norm: 10.0
+lr-scheduler: inverse_sqrt
+warmup-init-lr: 1e-7
+warmup-updates: 10000
+lr: 2e-3
+#adam_betas: (0.9,0.98)
+criterion: label_smoothed_cross_entropy_with_ctc
+label_smoothing: 0.1
+encoder-normalize-before: True
+decoder-normalize-before: True
+conv-kernel-sizes: 5,5
+conv-channels: 1024
+dropout: 0.1
+activation-fn: relu
+encoder-embed-dim: 256
+encoder-ffn-embed-dim: 2048
+encoder-layers: 12
+text-encoder-layers: 6
+decoder-layers: 6
+encoder-attention-heads: 4
+decoder-embed-dim: 256
+decoder-ffn-embed-dim: 2048
+decoder-attention-heads: 4
+acoustic-encoder: transformer
+adapter: league
+#load-pretrained-encoder-from:
+#load-pretrained-acoustic-encoder-from:
+#load-pretrained-text-encoder-from:
+#load-pretrained-decoder-from:
\ No newline at end of file
--- a/egs/mustc/st/conf/sate_big.yaml
+++ b/egs/mustc/st/conf/sate_big.yaml
+arch: s2t_sate
+share-decoder-input-output-embed: True
+optimizer: adam
+clip-norm: 10.0
+lr-scheduler: inverse_sqrt
+warmup-init-lr: 1e-7
+warmup-updates: 10000
+lr: 1e-3
+#adam_betas: (0.9,0.98)
+criterion: label_smoothed_cross_entropy_with_ctc
+label_smoothing: 0.1
+encoder-normalize-before: True
+decoder-normalize-before: True
+conv-kernel-sizes: 5,5
+conv-channels: 1024
+dropout: 0.15
+activation-fn: relu
+encoder-embed-dim: 512
+encoder-ffn-embed-dim: 2048
+encoder-layers: 12
+text-encoder-layers: 6
+decoder-layers: 6
+encoder-attention-heads: 8
+decoder-embed-dim: 512
+decoder-ffn-embed-dim: 2048
+decoder-attention-heads: 8
+acoustic-encoder: transformer
+adapter: league
+#load-pretrained-encoder-from:
+#load-pretrained-acoustic-encoder-from:
+#load-pretrained-text-encoder-from:
+#load-pretrained-decoder-from:
\ No newline at end of file
--- a/egs/mustc/st/conf/sate_big_pds.yaml
+++ b/egs/mustc/st/conf/sate_big_pds.yaml
+arch: s2t_sate
+share-decoder-input-output-embed: True
+optimizer: adam
+clip-norm: 10.0
+lr-scheduler: inverse_sqrt
+warmup-init-lr: 1e-7
+warmup-updates: 10000
+lr: 1e-3
+#adam_betas: (0.9,0.98)
+criterion: label_smoothed_cross_entropy_with_ctc
+label_smoothing: 0.1
+encoder-normalize-before: True
+decoder-normalize-before: True
+conv-kernel-sizes: 5,5
+conv-channels: 1024
+dropout: 0.15
+activation-fn: relu
+encoder-ffn-embed-dim: 2048
+encoder-layers: 12
+text-encoder-layers: 6
+decoder-layers: 6
+encoder-attention-heads: 8
+decoder-embed-dim: 512
+decoder-ffn-embed-dim: 2048
+decoder-attention-heads: 8
+acoustic-encoder: pds
+adapter: league
+encoder-embed-dim: 512
+ctc-layer: 12
+pds-stages: 4
+pds-layers: 3_3_3_3
+pds-ratios: 2_2_1_2
+pds-fusion: True
+pds-fusion-method: all_conv
+pds-embed-dims: 512_512_512_512
+pds-ds-method: conv
+pds-embed-norm: True
+pds-position-embed: 1_1_1_1
+pds-kernel-sizes: 5_5_5_5
+pds-ffn-ratios: 4_4_4_4
+pds-attn-heads: 8_8_8_8
+#load-pretrained-encoder-from:
+#load-pretrained-acoustic-encoder-from:
+#load-pretrained-text-encoder-from:
+#load-pretrained-decoder-from:
\ No newline at end of file
--- a/egs/mustc/st/conf/sate_ctc.yaml
+++ b/egs/mustc/st/conf/sate_ctc.yaml
@@ -8,7 +8,6 @@ warmup-updates: 10000
 lr: 2e-3
 #adam_betas: (0.9,0.98)
-ctc-weight: 0.3
 criterion: label_smoothed_cross_entropy_with_ctc
 label_smoothing: 0.1
@@ -18,29 +17,22 @@ conv-kernel-sizes: 5,5
 conv-channels: 1024
 dropout: 0.1
 activation-fn: relu
-encoder-embed-dim: 256
 encoder-ffn-embed-dim: 2048
 encoder-layers: 12
 text-encoder-layers: 6
 decoder-layers: 6
 encoder-attention-heads: 4
-#load-pretrained-encoder-from:
+decoder-embed-dim: 256
-#load-pretrained-acoustic-encoder-from:
+decoder-ffn-embed-dim: 2048
-#load-pretrained-text-encoder-from:
+decoder-attention-heads: 4
-#load-pretrained-decoder-from:
-#macaron-style: True
-#use-cnn-module: True
-#cnn-module-kernel: 31
-#acoustic-encoder: pds
+acoustic-encoder: pds
-acoustic-encoder: transformer
 adapter: league
 encoder-embed-dim: 256
+ctc-layer: 12
 pds-stages: 4
-#pds-dropout: 0
 pds-layers: 3_3_3_3
 pds-ratios: 2_2_1_2
 pds-fusion: True
@@ -53,6 +45,7 @@ pds-kernel-sizes: 5_5_5_5
 pds-ffn-ratios: 8_8_8_8
 pds-attn-heads: 4_4_4_4
-decoder-embed-dim: 256
+#load-pretrained-encoder-from:
-decoder-ffn-embed-dim: 2048
+#load-pretrained-acoustic-encoder-from:
-decoder-attention-heads: 4
+#load-pretrained-text-encoder-from:
\ No newline at end of file
+#load-pretrained-decoder-from:
\ No newline at end of file
--- a/egs/mustc/st/run.sh
+++ b/egs/mustc/st/run.sh
@@ -115,7 +115,7 @@ if [[ -z ${exp_name} ]]; then
        exp_name=${exp_name}_${extra_tag}
    fi
 fi
-model_dir=$code_dir/../checkpoints/$dataset/st/${exp_name}
+model_dir=${root_dir}/checkpoints/${dataset}/st/${exp_name}
 if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
    echo "stage -1: Data Download"

--- a/egs/wav2vec/conf/basis.yaml
+++ b/egs/wav2vec/conf/basis.yaml
+max-epoch: 100
+max-update: 400000
+best-checkpoint-metric: loss
+maximize-best-checkpoint-metric: False
+save-interval: 1
+no-epoch-checkpoints: True
+num-workers: 8
+no-progress-bar: True
+log-interval: 100
+seed: 1
+report-accuracy: True
+skip-invalid-size-inputs-valid-test: True
--- a/egs/wav2vec/conf/vq_wav2vec.yaml
+++ b/egs/wav2vec/conf/vq_wav2vec.yaml
+arch: wav2vec
+min-lr: 1e-06
+stop-min-lr: 1e-09
+optimizer: adam
+lr: 0.005
+lr-scheduler: cosine
+warmup-updates: 1000
+warmup-init-lr: 1e-07
+criterion: wav2vec
+num-negatives: 10
+cross-sample-negatives: 0
+max-sample-size: 150000
+max-tokens: 300000
+update-freq: 1
+conv-feature-layers: (512, 10, 5), (512, 8, 4), (512, 4, 2), (512, 4, 2), (512, 4, 2), (512, 1, 1), (512, 1, 1)
+conv-aggregator-layers: (512, 2, 1), (512, 3, 1), (512, 4, 1), (512, 5, 1), (512, 6, 1), (512, 7, 1), (512, 8, 1), (512, 9, 1), (512, 10, 1), (512, 11, 1), (512, 12, 1), (512, 13, 1)
+skip-connections-agg: True
+residual-scale: 0.5
+log-compression: True
+activation: gelu
+offset: auto
+log-keys: "prob_perplexity","code_perplexity","temp"
+vq-type: gumbel
+#vq-type: kmeans
+#loss-weights: 1
+vq-groups: 2
+vq-depth: 2
+combine-groups: True
+vq-vars: 320
+vq-temp: (2,0.5,0.999995)
+prediction-steps: 12
--- a/egs/wav2vec/conf/wav2vec.yaml
+++ b/egs/wav2vec/conf/wav2vec.yaml
+arch: wav2vec
+min-lr: 1e-06
+stop-min-lr: 1e-09
+optimizer: adam
+lr: 0.005
+lr-scheduler: cosine
+warmup-updates: 500
+warmup-init-lr: 1e-07
+criterion: wav2vec
+num-negatives: 10
+conv-feature-layers: (512, 10, 5), (512, 8, 4), (512, 4, 2), (512, 4, 2), (512, 4, 2), (512, 1, 1), (512, 1, 1)
+conv-aggregator-layers: (512, 2, 1), (512, 3, 1), (512, 4, 1), (512, 5, 1), (512, 6, 1), (512, 7, 1), (512, 8, 1), (512, 9, 1), (512, 10, 1), (512, 11, 1), (512, 12, 1), (512, 13, 1)
+skip-connections-agg: True
+residual-scale: 0.5
+log-compression: True
+max-sample-size: 150000
+max-tokens: 1500000
\ No newline at end of file
--- a/egs/wav2vec/conf/wav2vec2.yaml
+++ b/egs/wav2vec/conf/wav2vec2.yaml
+# @package _group_
+common:
+  fp16: true
+  log_format: json
+  log_interval: 200
+checkpoint:
+  save_interval_updates: 25000
+  keep_interval_updates: 1
+  no_epoch_checkpoints: true
+task:
+  _name: audio_pretraining
+  data: ???
+  max_sample_size: 250000
+  min_sample_size: 32000
+  normalize: false
+dataset:
+  num_workers: 6
+  max_tokens: 1400000
+  skip_invalid_size_inputs_valid_test: true
+distributed_training:
+  distributed_world_size: 64
+  ddp_backend: legacy_ddp
+criterion:
+  _name: wav2vec
+  infonce: true
+  log_keys: ["prob_perplexity","code_perplexity","temp"]
+  loss_weights: [0.1, 10]
+optimization:
+  max_update: 400000
+  lr: [0.0005]
+optimizer:
+  _name: adam
+  adam_betas: (0.9,0.98)
+  adam_eps: 1e-06
+  weight_decay: 0.01
+lr_scheduler:
+  _name: polynomial_decay
+  warmup_updates: 32000
+model:
+  _name: wav2vec2
+  quantize_targets: true
+  final_dim: 256
+  encoder_layerdrop: 0.05
+  dropout_input: 0.1
+  dropout_features: 0.1
+  feature_grad_mult: 0.1
+  encoder_embed_dim: 768
--- a/egs/wav2vec/decode.sh
+++ b/egs/wav2vec/decode.sh
+#! /bin/bash
+gpu_num=1
+data_dir=
+test_subset=(dev-clean dev-other test-clean test-other)
+exp_name=
+if [ "$#" -eq 1 ]; then
+    exp_name=$1
+fi
+n_average=10
+beam_size=5
+len_penalty=1.0
+max_tokens=80000
+dec_model=checkpoint_best.pt
+cmd="./run.sh
+    --stage 2
+    --stop_stage 2
+    --gpu_num ${gpu_num}
+    --exp_name ${exp_name}
+    --n_average ${n_average}
+    --beam_size ${beam_size}
+    --len_penalty ${len_penalty}
+    --max_tokens ${max_tokens}
+    --dec_model ${dec_model}
+    "
+if [[ -n ${data_dir} ]]; then
+    cmd="$cmd --data_dir ${data_dir}"
+fi
+if [[ ${#test_subset[@]} -ne 0 ]]; then
+    subsets=$(echo ${test_subset[*]} | sed 's/ /,/g')
+    cmd="$cmd --test_subset ${subsets}"
+fi
+echo $cmd
+eval $cmd
--- a/egs/wav2vec/local/monitor.sh
+++ b/egs/wav2vec/local/monitor.sh
+gpu_num=4
+cmd="sh train.sh"
+while :
+do
+    record=$(mktemp -t temp.record.XXXXXX)
+    gpustat > $record
+    all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
+    count=0
+    for dev in ${all_devices[@]}
+    do
+        line=$((dev + 2))
+        use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
+        if [[ $use -lt 100 ]]; then
+            device[$count]=$dev
+            count=$((count + 1))
+            if [[ $count -eq $gpu_num ]]; then
+                break
+            fi
+        fi
+    done
+    if [[ ${#device[@]} -lt $gpu_num ]]; then
+        sleep 60s
+    else
+        echo "Run $cmd"
+        eval $cmd
+        sleep 10s
+        exit
+    fi
+done
--- a/egs/wav2vec/local/parse_options.sh
+++ b/egs/wav2vec/local/parse_options.sh
+#!/usr/bin/env bash
+# Copyright 2012  Johns Hopkins University (Author: Daniel Povey);
+#                 Arnab Ghoshal, Karel Vesely
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+# Parse command-line options.
+# To be sourced by another script (as in ". parse_options.sh").
+# Option format is: --option-name arg
+# and shell variable "option_name" gets set to value "arg."
+# The exception is --help, which takes no arguments, but prints the
+# $help_message variable (if defined).
+###
+### The --config file options have lower priority to command line
+### options, so we need to import them first...
+###
+# Now import all the configs specified by command-line, in left-to-right order
+for ((argpos=1; argpos<$#; argpos++)); do
+  if [ "${!argpos}" == "--config" ]; then
+    argpos_plus1=$((argpos+1))
+    config=${!argpos_plus1}
+    [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
+    . $config  # source the config file.
+  fi
+done
+###
+### Now we process the command line options
+###
+while true; do
+  [ -z "${1:-}" ] && break;  # break if there are no arguments
+  case "$1" in
+    # If the enclosing script is called with --help option, print the help
+    # message and exit.  Scripts should put help messages in $help_message
+    --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
+      else printf "$help_message\n" 1>&2 ; fi;
+      exit 0 ;;
+    --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
+      exit 1 ;;
+    # If the first command-line argument begins with "--" (e.g. --foo-bar),
+    # then work out the variable name as $name, which will equal "foo_bar".
+    --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
+      # Next we test whether the variable in question is undefined-- if so it's
+      # an invalid option and we die.  Note: $0 evaluates to the name of the
+      # enclosing script.
+      # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
+      # is undefined.  We then have to wrap this test inside "eval" because
+      # foo_bar is itself inside a variable ($name).
+      eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
+      oldval="`eval echo \\$$name`";
+      # Work out whether we seem to be expecting a Boolean argument.
+      if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
+        was_bool=true;
+      else
+        was_bool=false;
+      fi
+      # Set the variable to the right value-- the escaped quotes make it work if
+      # the option had spaces, like --cmd "queue.pl -sync y"
+      eval $name=\"$2\";
+      # Check that Boolean-valued arguments are really Boolean.
+      if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
+        echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
+        exit 1;
+      fi
+      shift 2;
+      ;;
+  *) break;
+  esac
+done
+# Check for an empty argument to the --cmd option, which can easily occur as a
+# result of scripting errors.
+[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
+true; # so this script returns exit code 0.
--- a/egs/wav2vec/local/utils.sh
+++ b/egs/wav2vec/local/utils.sh
+# Print a comma-separated list of GPU indices that currently look idle.
+#   $1 (gpu_num)  number of devices requested.
+#   $2 (use_cpu)  when 1, print -1 right away instead of waiting if fewer
+#                 than gpu_num devices are idle.
+# Otherwise polls `gpustat` every 60 seconds until enough devices are found.
+# gpu_num, use_cpu and device are left as globals, as before.
+get_devices(){
+    gpu_num=$1
+    use_cpu=$2
+    device=()
+    local record all_devices count dev line use
+    while :
+    do
+        # Snapshot the current GPU table into a temporary file.
+        record=$(mktemp -t temp.record.XXXXXX)
+        gpustat > $record
+        all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
+        # Restart the selection on every poll so a previous partial match
+        # cannot leak into this one.
+        device=()
+        count=0
+        for dev in ${all_devices[@]}
+        do
+            line=$((dev + 2))
+            # Third '|' field up to the '/': presumably used memory in MB as
+            # printed by gpustat -- TODO confirm against the installed version.
+            use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
+            if [[ $use -lt 100 ]]; then
+                device[$count]=$dev
+                count=$((count + 1))
+                if [[ $count -eq $gpu_num ]]; then
+                    break
+                fi
+            fi
+        done
+        # Drop the snapshot; otherwise every poll leaves a temp file behind.
+        rm -f ${record}
+        if [[ ${#device[@]} -lt $gpu_num ]]; then
+            if [[ $use_cpu -eq 1 ]]; then
+                device=(-1)
+                # Leave the loop now: (-1) has length 1, so for gpu_num > 1
+                # the size check above would never pass and the loop would
+                # spin forever without sleeping.
+                break
+            else
+                sleep 60s
+            fi
+        else
+            break
+        fi
+    done
+    echo ${device[*]} | sed 's/ /,/g'
+    return $?
+}
--- a/egs/wav2vec/run.sh
+++ b/egs/wav2vec/run.sh
+#! /bin/bash
+# Pre-training wav2vec systems based on the LibriSpeech Datasets
+# Copyright 2021 Natural Language Processing Laboratory 
+# Xu Chen (xuchenneu@163.com)
+# Set bash to 'debug' mode, it will exit on :
+# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
+set -e
+#set -u
+set -o pipefail
+export PYTHONIOENCODING=UTF-8
+eval=1
+time=$(date "+%m%d_%H%M")
+stage=0
+stop_stage=0
+######## hardware ########
+# devices
+device=()
+gpu_num=8
+update_freq=1
+root_dir=~/st
+code_dir=${root_dir}/Fairseq-S2T
+pwd_dir=$PWD
+# dataset
+src_lang=en
+lang=${src_lang}
+dataset=librispeech
+task=audio_pretraining
+org_data_dir=${root_dir}/data/${dataset}
+data_dir=${root_dir}/data/${dataset}/wav2vec
+test_subset=dev-clean,dev-other,test-clean,test-other
+# exp
+exp_prefix=$(date "+%m%d")
+extra_tag=
+extra_parameter=
+exp_tag=baseline
+exp_name=
+# config
+train_config=ctc
+data_config=config.yaml
+# training setting
+fp16=1
+max_tokens=40000
+step_valid=0
+# decoding setting
+dec_model=checkpoint_best.pt
+n_average=10
+beam_size=5
+len_penalty=1.0
+. ./local/parse_options.sh || exit 1;
+if [[ -z ${exp_name} ]]; then
+    config_string=${train_config//,/_}
+    exp_name=${exp_prefix}_${config_string}_${exp_tag}
+    if [[ -n ${extra_tag} ]]; then
+        exp_name=${exp_name}_${extra_tag}
+    fi
+fi
+model_dir=${root_dir}/checkpoints/${dataset}/wav2vec/${exp_name}
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+    echo "stage -1: Data Download"
+    # pass
+fi
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    ### Task dependent. You have to write the following data preparation part yourself.
+    ### But you can utilize fairseq recipes in most cases.
+    echo "stage 0: Data Preparation"
+    if [[ ! -e ${data_dir} ]]; then
+        mkdir -p ${data_dir}
+    fi
+	cmd="python ${code_dir}/examples/wav2vec/wav2vec_manifest.py 
+		${org_data_dir}/LibriSpeech
+		--dest ${data_dir}
+		--ext flac"
+    echo -e "\033[34mRun command: \n${cmd} \033[0m"
+    [[ $eval -eq 1 ]] && eval ${cmd}
+fi
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    echo "stage 1: Audio Pre-training"
+    [[ ! -d ${data_dir} ]] && echo "The data dir ${data_dir} is not existing!" && exit 1;
+    if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
+		if [[ ${gpu_num} -eq 0 ]]; then
+			device=""
+		else
+        	source ./local/utils.sh
+        	device=$(get_devices $gpu_num 0)
+		fi
+    fi
+    echo -e "dev=${device} data=${data_dir} model=${model_dir}"
+    if [[ ! -d ${model_dir} ]]; then
+        mkdir -p ${model_dir}
+    else
+        echo "${model_dir} exists."
+    fi
+    cp ${BASH_SOURCE[0]} ${model_dir}
+    cp ${PWD}/train.sh ${model_dir}
+    extra_parameter="${extra_parameter}
+        --train-config ${pwd_dir}/conf/basis.yaml"
+    cp ${pwd_dir}/conf/basis.yaml ${model_dir}
+    # Turn every comma-separated entry of ${train_config} into an extra
+    # --train-config<N> argument and keep a copy of it next to the checkpoints.
+    config_list="${train_config//,/ }"
+    idx=1
+    for config in ${config_list[@]}
+    do
+        config_path=${pwd_dir}/conf/${config}.yaml
+        if [[ ! -f ${config_path} ]]; then
+            echo "No config file ${config_path}"
+            # A bare `exit` would return echo's status (0) and make the
+            # caller believe training was launched successfully.
+            exit 1
+        fi
+        cp ${config_path} ${model_dir}
+        extra_parameter="${extra_parameter}
+        --train-config${idx} ${config_path}"
+        idx=$((idx + 1))
+    done
+    cmd="python3 -u ${code_dir}/fairseq_cli/train.py
+        ${data_dir}
+        --task ${task}
+        --max-tokens ${max_tokens}
+        --update-freq ${update_freq}
+        --log-interval 100
+        --save-dir ${model_dir}
+        --tensorboard-logdir ${model_dir}"
+	if [[ -n ${extra_parameter} ]]; then
+        cmd="${cmd}
+        ${extra_parameter}"
+    fi
+	if [[ ${gpu_num} -gt 0 ]]; then
+		cmd="${cmd}
+        --distributed-world-size $gpu_num
+        --ddp-backend no_c10d"
+	fi
+    if [[ $fp16 -eq 1 ]]; then
+        cmd="${cmd}
+        --fp16"
+    fi
+    echo -e "\033[34mRun command: \n${cmd} \033[0m"
+    # save info
+    log=./history.log
+    echo "${time} | ${device} | ${data_dir} | ${exp_name} | ${model_dir} " >> $log
+    tail -n 50 ${log} > tmp.log
+    mv tmp.log $log
+    export CUDA_VISIBLE_DEVICES=${device}
+    cmd="nohup ${cmd} >> ${model_dir}/train.log 2>&1 &"
+    if [[ $eval -eq 1 ]]; then
+		eval $cmd
+		sleep 2s
+		tail -n "$(wc -l ${model_dir}/train.log | awk '{print $1+1}')" -f ${model_dir}/train.log
+	fi
+fi
+wait
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    echo "stage 2: ASR Decoding"
+    if [[ ${n_average} -ne 1 ]]; then
+        # Average models
+		dec_model=avg_${n_average}_checkpoint.pt
+        if [[ ! -f ${model_dir}/${dec_model} ]]; then
+            cmd="python ${code_dir}/scripts/average_checkpoints.py
+            --inputs ${model_dir}
+            --num-best-checkpoints ${n_average}
+            --output ${model_dir}/${dec_model}"
+            echo -e "\033[34mRun command: \n${cmd} \033[0m"
+            [[ $eval -eq 1 ]] && eval $cmd
+        fi
+	else
+		dec_model=${dec_model}
+	fi
+    if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
+		if [[ ${gpu_num} -eq 0 ]]; then
+			device=""
+		else
+        	source ./local/utils.sh
+        	device=$(get_devices $gpu_num 0)
+		fi
+    fi
+    export CUDA_VISIBLE_DEVICES=${device}
+	result_file=${model_dir}/decode_result
+	[[ -f ${result_file} ]] && rm ${result_file}
+    test_subset=(${test_subset//,/ })
+	for subset in ${test_subset[@]}; do
+        subset=${subset}
+  		cmd="python ${code_dir}/fairseq_cli/generate.py
+        ${data_dir}
+        --config-yaml ${data_config}
+        --gen-subset ${subset}
+        --task speech_to_text
+        --path ${model_dir}/${dec_model}
+        --results-path ${model_dir}
+        --max-tokens ${max_tokens}
+        --beam ${beam_size}
+        --lenpen ${len_penalty}
+        --scoring wer"
+    	echo -e "\033[34mRun command: \n${cmd} \033[0m"
+        if [[ $eval -eq 1 ]]; then
+    	    eval $cmd
+    	    tail -n 1 ${model_dir}/generate-${subset}.txt >> ${result_file}
+        fi
+	done
+    cat ${result_file}
+fi
--- a/egs/wav2vec/train.sh
+++ b/egs/wav2vec/train.sh
+#! /bin/bash
+# training the model
+# Thin launcher: assembles a "./run.sh --stage 1 --stop_stage 1 ..." command
+# line from the settings below, prints it and evaluates it.
+gpu_num=1
+update_freq=1
+max_tokens=1500000
+# Optional suffix and extra arguments; only forwarded when non-empty.
+extra_tag=
+extra_parameter=
+#extra_tag="${extra_tag}"
+#extra_parameter="${extra_parameter} "
+#exp_tag=
+# Config names, joined with commas below and passed as --train_config.
+config_list=(wav2vec)
+# exp full name
+exp_name=
+train_config=$(echo ${config_list[*]} | sed 's/ /,/g')
+cmd="./run.sh
+    --stage 1
+    --stop_stage 1
+    --gpu_num ${gpu_num}
+    --update_freq ${update_freq}
+    --train_config ${train_config}
+    --max_tokens ${max_tokens}
+    "
+# Each optional flag is appended only when its variable is non-empty.
+if [[ -n ${exp_name} ]]; then
+    cmd="$cmd --exp_name ${exp_name}"
+fi
+# NOTE(review): exp_tag is only assigned in a commented-out line above, so
+# this branch fires only if the variable comes from the environment.
+if [[ -n ${exp_tag} ]]; then
+    cmd="$cmd --exp_tag ${exp_tag}"
+fi
+if [[ -n ${extra_tag} ]]; then
+    cmd="$cmd --extra_tag ${extra_tag}"
+fi
+# The escaped quotes keep a multi-word value together as one argument
+# when the command string is evaluated.
+if [[ -n ${extra_parameter} ]]; then
+    cmd="$cmd --extra_parameter \"${extra_parameter}\""
+fi
+echo ${cmd}
+eval ${cmd}
--- a/egs/wmt16/mt/binary.sh
+++ b/egs/wmt16/mt/binary.sh
+set -e
+eval=1
+lcrm=0
+root_dir=~/st/Fairseq-S2T
+data_dir=/home/xuchen/st/data/wmt/test
+vocab_dir=/home/xuchen/st/data/wmt/mt/en-de/unigram32000_share
+src_vocab_prefix=spm_unigram32000_share
+tgt_vocab_prefix=spm_unigram32000_share
+src_lang=en
+tgt_lang=de
+tokenize=1
+splits=(newstest2014 newstest2016)
+for split in ${splits[@]}; do
+    src_file=${data_dir}/${split}.${src_lang}
+    tgt_file=${data_dir}/${split}.${tgt_lang}
+    if [[ ${tokenize} -eq 1 ]]; then
+        cmd="tokenizer.perl -l ${src_lang} --threads 8 -no-escape < ${src_file} > ${src_file}.tok"
+        echo -e "\033[34mRun command: \n${cmd} \033[0m"
+        [[ $eval -eq 1 ]] && eval ${cmd}
+        cmd="tokenizer.perl -l ${tgt_lang} --threads 8 -no-escape < ${tgt_file} > ${tgt_file}.tok"
+        echo -e "\033[34mRun command: \n${cmd} \033[0m"
+        [[ $eval -eq 1 ]] && eval ${cmd}
+        src_file=${src_file}.tok
+        tgt_file=${tgt_file}.tok
+    fi
+    cmd="cat ${src_file}"
+    if [[ ${lcrm} -eq 1 ]]; then
+        cmd="python local/lower_rm.py ${src_file}"
+    fi
+    cmd="${cmd}
+    | spm_encode --model ${vocab_dir}/${src_vocab_prefix}.model
+    --output_format=piece
+    > ${src_file}.spm"
+    echo -e "\033[34mRun command: \n${cmd} \033[0m"
+    [[ $eval -eq 1 ]] && eval ${cmd}
+    cmd="spm_encode
+    --model ${vocab_dir}/${tgt_vocab_prefix}.model
+    --output_format=piece
+    < ${tgt_file}
+    > ${tgt_file}.spm"
+    echo -e "\033[34mRun command: \n${cmd} \033[0m"
+    [[ $eval -eq 1 ]] && eval ${cmd}
+    src_file=${src_file}.spm
+    tgt_file=${tgt_file}.spm
+    mkdir -p ${data_dir}/final
+    cmd="cp ${src_file} ${data_dir}/final/${split}.${src_lang}"
+    echo -e "\033[34mRun command: \n${cmd} \033[0m"
+    [[ $eval -eq 1 ]] && eval ${cmd}
+    cmd="cp ${tgt_file} ${data_dir}/final/${split}.${tgt_lang}"
+    echo -e "\033[34mRun command: \n${cmd} \033[0m"
+    [[ $eval -eq 1 ]] && eval ${cmd}
+done
+n_set=${#splits[*]}
+for ((i=0;i<$n_set;i++)); do
+    dataset[$i]=${data_dir}/final/${splits[$i]}
+done
+pref=`echo ${dataset[*]} | sed 's/ /,/g'`
+cmd="python ${root_dir}/fairseq_cli/preprocess.py
+    --source-lang ${src_lang}
+    --target-lang ${tgt_lang}
+    --testpref ${pref}
+    --destdir ${data_dir}/data-bin
+    --srcdict ${vocab_dir}/${src_vocab_prefix}.txt
+    --tgtdict ${vocab_dir}/${tgt_vocab_prefix}.txt
+    --workers 64"
+echo -e "\033[34mRun command: \n${cmd} \033[0m"
+[[ $eval -eq 1 ]] && eval ${cmd}
\ No newline at end of file
--- a/egs/iwslt14/mt/conf/base_iwslt.yaml
+++ b/egs/iwslt14/mt/conf/base_iwslt.yaml
 arch: transformer
-share-decoder-input-output-embed: True
+share-all-embeddings: True
 optimizer: adam
 clip-norm: 10.0
 lr-scheduler: inverse_sqrt
@@ -11,22 +11,22 @@ adam_betas: (0.9,0.997)
 criterion: label_smoothed_cross_entropy
 label_smoothing: 0.1
-dropout: 0.3
+dropout: 0.1
-attention-dropout: 0.0
+attention-dropout: 0.1
-activation-dropout: 0.0
+activation-dropout: 0.1
 activation-fn: relu
 encoder-normalize-before: True
 decoder-normalize-before: True
 encoder-embed-dim: 512
-encoder-ffn-embed-dim: 1024
+encoder-ffn-embed-dim: 2048
 encoder-layers: 6
 decoder-layers: 6
-encoder-attention-heads: 4
+encoder-attention-heads: 8
 decoder-embed-dim: 512
-decoder-ffn-embed-dim: 1024
+decoder-ffn-embed-dim: 2048
-decoder-attention-heads: 4
+decoder-attention-heads: 8
-load-pretrained-encoder-from:
+#load-pretrained-encoder-from:
-load-pretrained-decoder-from:
+#load-pretrained-decoder-from:
\ No newline at end of file
--- a/egs/wmt16/mt/conf/base_postnorm.yaml
+++ b/egs/wmt16/mt/conf/base_postnorm.yaml
+arch: transformer
+share-all-embeddings: True
+optimizer: adam
+clip-norm: 10.0
+lr-scheduler: inverse_sqrt
+warmup-init-lr: 1e-7
+warmup-updates: 4000
+lr: 7e-4
+adam_betas: (0.9,0.98)
+criterion: label_smoothed_cross_entropy
+label_smoothing: 0.1
+dropout: 0.1
+attention-dropout: 0.1
+activation-dropout: 0.1
+activation-fn: relu
+encoder-normalize-before: False
+decoder-normalize-before: False
+encoder-embed-dim: 512
+encoder-ffn-embed-dim: 2048
+encoder-layers: 6
+decoder-layers: 6
+encoder-attention-heads: 8
+decoder-embed-dim: 512
+decoder-ffn-embed-dim: 2048
+decoder-attention-heads: 8
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
\ No newline at end of file
--- a/egs/wmt16/mt/conf/basis.yaml
+++ b/egs/wmt16/mt/conf/basis.yaml
+train-subset: train
+valid-subset: valid
+max-epoch: 20
+max-update: 100000
+patience: 5
+best_checkpoint_metric: loss
+maximize_best_checkpoint_metric: False
+no-epoch-checkpoints: True
+#keep-last-epochs: 10
+keep-best-checkpoints: 5
+num-workers: 8
+no-progress-bar: True
+log-interval: 100
+seed: 1
+report-accuracy: True
+skip-invalid-size-inputs-valid-test: True
\ No newline at end of file
--- a/egs/wmt16/mt/conf/big.yaml
+++ b/egs/wmt16/mt/conf/big.yaml
+# Pre-norm Transformer-big configuration for WMT16 En-De.
+arch: transformer_wmt_en_de_big_t2t
+share-all-embeddings: True
+optimizer: adam
+clip-norm: 10.0
+lr-scheduler: inverse_sqrt
+warmup-init-lr: 1e-7
+warmup-updates: 8000
+lr: 7e-4
+adam_betas: (0.9,0.997)
+criterion: label_smoothed_cross_entropy
+label_smoothing: 0.1
+dropout: 0.3
+attention-dropout: 0.1
+activation-dropout: 0.1
+activation-fn: relu
+encoder-normalize-before: True
+decoder-normalize-before: True
+encoder-embed-dim: 1024
+encoder-ffn-embed-dim: 4096
+encoder-layers: 6
+decoder-layers: 6
+encoder-attention-heads: 16
+# The decoder sizes must mirror the encoder: share-all-embeddings requires
+# encoder-embed-dim to equal decoder-embed-dim (same values as big_postnorm.yaml).
+decoder-embed-dim: 1024
+decoder-ffn-embed-dim: 4096
+decoder-attention-heads: 16
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
\ No newline at end of file
--- a/egs/wmt16/mt/conf/big_postnorm.yaml
+++ b/egs/wmt16/mt/conf/big_postnorm.yaml
+arch: transformer_wmt_en_de_big
+share-all-embeddings: True
+optimizer: adam
+clip-norm: 10.0
+lr-scheduler: inverse_sqrt
+warmup-init-lr: 1e-7
+warmup-updates: 4000
+lr: 5e-4
+adam_betas: (0.9,0.98)
+criterion: label_smoothed_cross_entropy
+label_smoothing: 0.1
+dropout: 0.3
+attention-dropout: 0.1
+activation-dropout: 0.1
+activation-fn: relu
+encoder-normalize-before: False
+decoder-normalize-before: False
+encoder-embed-dim: 1024
+encoder-ffn-embed-dim: 4096
+encoder-layers: 6
+decoder-layers: 6
+encoder-attention-heads: 16
+decoder-embed-dim: 1024
+decoder-ffn-embed-dim: 4096
+decoder-attention-heads: 16
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
\ No newline at end of file
--- a/egs/wmt16/mt/conf/deep.yaml
+++ b/egs/wmt16/mt/conf/deep.yaml
+arch: transformer
+share-all-embeddings: True
+optimizer: adam
+clip-norm: 10.0
+lr-scheduler: inverse_sqrt
+warmup-init-lr: 1e-7
+warmup-updates: 16000
+lr: 2e-3
+adam_betas: (0.9,0.997)
+criterion: label_smoothed_cross_entropy
+label_smoothing: 0.1
+dropout: 0.1
+attention-dropout: 0.1
+activation-dropout: 0.1
+activation-fn: relu
+encoder-normalize-before: True
+decoder-normalize-before: True
+encoder-embed-dim: 512
+encoder-ffn-embed-dim: 2048
+encoder-layers: 30
+decoder-layers: 6
+encoder-attention-heads: 8
+decoder-embed-dim: 512
+decoder-ffn-embed-dim: 2048
+decoder-attention-heads: 8
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
\ No newline at end of file
--- a/egs/wmt16/mt/conf/dlcl.yaml
+++ b/egs/wmt16/mt/conf/dlcl.yaml
+use-enc-dlcl: True
+use-dec-dlcl: True
--- a/egs/wmt16/mt/conf/rpr.yaml
+++ b/egs/wmt16/mt/conf/rpr.yaml
+#encoder-attention-type: rel_selfattn
+encoder-attention-type: relative
+decoder-attention-type: relative
+max-encoder-relative-length: 20
+max-decoder-relative-length: 20
--- a/egs/wmt16/mt/decode.sh
+++ b/egs/wmt16/mt/decode.sh
+#! /bin/bash
+gpu_num=1
+data_dir=
+test_subset=(test)
+exp_name=
+if [ "$#" -eq 1 ]; then
+    exp_name=$1
+fi
+sacrebleu=0
+n_average=5
+beam_size=4
+len_penalty=0.6
+max_tokens=80000
+dec_model=checkpoint_best.pt
+cmd="./run.sh
+    --stage 2
+    --stop_stage 2
+    --gpu_num ${gpu_num}
+    --exp_name ${exp_name}
+    --sacrebleu ${sacrebleu}
+    --n_average ${n_average}
+    --beam_size ${beam_size}
+    --len_penalty ${len_penalty}
+    --max_tokens ${max_tokens}
+    --dec_model ${dec_model}
+    "
+if [[ -n ${data_dir} ]]; then
+    cmd="$cmd --data_dir ${data_dir}"
+fi
+if [[ -n ${test_subset} ]]; then
+    test_subset=`echo ${test_subset[*]} | sed 's/ /,/g'`
+    cmd="$cmd --test_subset ${test_subset}"
+fi
+echo $cmd
+eval $cmd
--- a/egs/wmt16/mt/local/detokenizer.perl
+++ b/egs/wmt16/mt/local/detokenizer.perl
+#!/usr/bin/env perl
+# $Id: detokenizer.perl 4134 2011-08-08 15:30:54Z bgottesman $
+# Sample De-Tokenizer
+# written by Josh Schroeder, based on code by Philipp Koehn
+# further modifications by Ondrej Bojar
+#
+# This file is part of moses.  Its use is licensed under the GNU Lesser General
+# Public License version 2.1 or, at your option, any later version.
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+use warnings;
+use strict;
+use utf8; # tell perl this script file is in UTF-8 (see all funny punct below)
+my $language = "en";
+my $QUIET = 0;
+my $HELP = 0;
+my $UPPERCASE_SENT = 0;
+my $PENN = 0;
+while (@ARGV) {
+	$_ = shift;
+	/^-b$/ && ($| = 1, next);
+	/^-l$/ && ($language = shift, next);
+	/^-q$/ && ($QUIET = 1, next);
+	/^-h$/ && ($HELP = 1, next);
+	/^-u$/ && ($UPPERCASE_SENT = 1, next);
+  /^-penn$/ && ($PENN = 1, next);
+}
+if ($HELP) {
+	print "Usage ./detokenizer.perl (-l [en|fr|it|cs|...]) < tokenizedfile > detokenizedfile\n";
+        print "Options:\n";
+        print "  -u     ... uppercase the first char in the final sentence.\n";
+        print "  -q     ... don't report detokenizer revision.\n";
+        print "  -b     ... disable Perl buffering.\n";
+        print "  -penn  ... assume input is tokenized as per tokenizer.perl's -penn option.\n";
+	exit;
+}
+if ($language !~ /^(cs|en|fr|it|fi)$/) {
+  print STDERR "Warning: No built-in rules for language $language.\n"
+}
+# -penn detokenization is only implemented for English; refuse anything else.
+if ($PENN && $language ne "en") {
+  print STDERR "Error: -penn option only supported for English text.\n";
+  # Exit non-zero so calling pipelines can detect the failure.
+  exit 1;
+}
+if (!$QUIET) {
+	print STDERR "Detokenizer Version ".'$Revision: 4134 $'."\n";
+	print STDERR "Language: $language\n";
+}
+while(<STDIN>) {
+	if (/^<.+>$/ || /^\s*$/) {
+		#don't try to detokenize XML/HTML tag lines
+		print $_;
+  } elsif ($PENN) {
+    print &detokenize_penn($_);
+  } else {
+		print &detokenize($_);
+	}
+}
+sub ucsecondarg {
+  # Return the first argument followed by the upper-cased second argument.
+  my ($prefix, $letter) = @_;
+  return $prefix . uc($letter);
+}
+sub deescape {
+  # Replace the escaped entities listed below by their literal characters.
+  # The table is applied top to bottom; "&amp;" stays last so that an escaped
+  # ampersand cannot turn into the start of another entity during this pass.
+  my ($text) = @_;
+  my @entity_table = (
+    [ '&bar;',  '|' ],   # factor separator (legacy)
+    [ '&#124;', '|' ],   # factor separator
+    [ '&lt;',   '<' ],   # xml
+    [ '&gt;',   '>' ],   # xml
+    [ '&bra;',  '[' ],   # syntax non-terminal (legacy)
+    [ '&ket;',  ']' ],   # syntax non-terminal (legacy)
+    [ '&quot;', '"' ],   # xml
+    [ '&apos;', "'" ],   # xml
+    [ '&#91;',  '[' ],   # syntax non-terminal
+    [ '&#93;',  ']' ],   # syntax non-terminal
+    [ '&amp;',  '&' ],   # escape escape
+  );
+  foreach my $entry (@entity_table) {
+    my ($entity, $literal) = @{$entry};
+    $text =~ s/\Q$entity\E/$literal/g;
+  }
+  return $text;
+}
+sub detokenize {
+	my($text) = @_;
+	chomp($text);
+	$text = " $text ";
+  $text =~ s/ \@\-\@ /-/g;
+  $text = &deescape($text);
+	my $word;
+	my $i;
+	my @words = split(/ /,$text);
+	$text = "";
+	my %quoteCount =  ("\'"=>0,"\""=>0);
+	my $prependSpace = " ";
+	for ($i=0;$i<(scalar(@words));$i++) {		
+		if (&startsWithCJKChar($words[$i])) {
+		    if (($i > 0 && &endsWithCJKChar($words[$i-1])) && ($language ne "ko")) {
+			# perform left shift if this is a second consecutive CJK (Chinese/Japanese/Korean) word
+			$text=$text.$words[$i];
+		    } else {
+			# ... but do nothing special if this is a CJK word that doesn't follow a CJK word
+			$text=$text.$prependSpace.$words[$i];
+		    }
+		    $prependSpace = " ";
+		} elsif ($words[$i] =~ /^[\p{IsSc}\(\[\{\¿\¡]+$/) {
+			#perform right shift on currency and other random punctuation items
+			$text = $text.$prependSpace.$words[$i];
+			$prependSpace = "";
+		} elsif ($words[$i] =~ /^[\,\.\?\!\:\;\\\%\}\]\)]+$/){
+		    if (($language eq "fr") && ($words[$i] =~ /^[\?\!\:\;\\\%]$/)) {
+			#these punctuations are prefixed with a non-breakable space in french
+			$text .= " "; }
+			#perform left shift on punctuation items
+			$text=$text.$words[$i];
+			$prependSpace = " ";
+		} elsif (($language eq "en") && ($i>0) && ($words[$i] =~ /^[\'][\p{IsAlpha}]/) && ($words[$i-1] =~ /[\p{IsAlnum}]$/)) {
+			#left-shift the contraction for English
+			$text=$text.$words[$i];
+			$prependSpace = " ";
+		} elsif (($language eq "cs") && ($i>1) && ($words[$i-2] =~ /^[0-9]+$/) && ($words[$i-1] =~ /^[.,]$/) && ($words[$i] =~ /^[0-9]+$/)) {
+			#left-shift floats in Czech
+			$text=$text.$words[$i];
+			$prependSpace = " ";
+		}  elsif ((($language eq "fr") ||($language eq "it")) && ($i<=(scalar(@words)-2)) && ($words[$i] =~ /[\p{IsAlpha}][\']$/) && ($words[$i+1] =~ /^[\p{IsAlpha}]/)) {
+			#right-shift the contraction for French and Italian
+			$text = $text.$prependSpace.$words[$i];
+			$prependSpace = "";
+		} elsif (($language eq "cs") && ($i<(scalar(@words)-3))
+				&& ($words[$i] =~ /[\p{IsAlpha}]$/)
+				&& ($words[$i+1] =~ /^[-–]$/)
+				&& ($words[$i+2] =~ /^li$|^mail.*/i)
+				) {
+			#right-shift "-li" in Czech and a few Czech dashed words (e-mail)
+			$text = $text.$prependSpace.$words[$i].$words[$i+1];
+			$i++; # advance over the dash
+			$prependSpace = "";
+		} elsif ($words[$i] =~ /^[\'\"„“`]+$/) {
+			#combine punctuation smartly
+                        my $normalized_quo = $words[$i];
+                        $normalized_quo = '"' if $words[$i] =~ /^[„“”]+$/;
+                        $quoteCount{$normalized_quo} = 0
+                                if !defined $quoteCount{$normalized_quo};
+                        if ($language eq "cs" && $words[$i] eq "„") {
+                          # this is always the starting quote in Czech
+                          $quoteCount{$normalized_quo} = 0;
+                        }
+                        if ($language eq "cs" && $words[$i] eq "“") {
+                          # this is usually the ending quote in Czech
+                          $quoteCount{$normalized_quo} = 1;
+                        }
+			if (($quoteCount{$normalized_quo} % 2) eq 0) {
+				if(($language eq "en") && ($words[$i] eq "'") && ($i > 0) && ($words[$i-1] =~ /[s]$/)) {
+					#single quote for possessives ending in s... "The Jones' house"
+					#left shift
+					$text=$text.$words[$i];
+					$prependSpace = " ";
+				} else {
+					#right shift
+					$text = $text.$prependSpace.$words[$i];
+					$prependSpace = "";
+					$quoteCount{$normalized_quo} ++;
+				}
+			} else {
+				#left shift
+				$text=$text.$words[$i];
+				$prependSpace = " ";
+				$quoteCount{$normalized_quo} ++;
+			}
+        } elsif (($language eq "fi") && ($words[$i-1] =~ /:$/) && ($words[$i] =~ /^(N|n|A|a|Ä|ä|ssa|Ssa|ssä|Ssä|sta|stä|Sta|Stä|hun|Hun|hyn|Hyn|han|Han|hän|Hän|hön|Hön|un|Un|yn|Yn|an|An|än|Än|ön|Ön|seen|Seen|lla|Lla|llä|Llä|lta|Lta|ltä|Ltä|lle|Lle|ksi|Ksi|kse|Kse|tta|Tta|ine|Ine)(ni|si|mme|nne|nsa)?(ko|kö|han|hän|pa|pä|kaan|kään|kin)?$/)) {
+            # Finnish : without intervening space if followed by case suffix
+            # EU:N EU:n EU:ssa EU:sta EU:hun EU:iin ...
+            $text=$text. lc $words[$i];
+            $prependSpace = " ";
+		} else {
+			$text=$text.$prependSpace.$words[$i];
+			$prependSpace = " ";
+		}
+	}
+	# clean up spaces at head and tail of each line as well as any double-spacing
+	$text =~ s/ +/ /g;
+	$text =~ s/\n /\n/g;
+	$text =~ s/ \n/\n/g;
+	$text =~ s/^ //g;
+	$text =~ s/ $//g;
+	#add trailing break
+	$text .= "\n" unless $text =~ /\n$/;
+        $text =~ s/^([[:punct:]\s]*)([[:alpha:]])/ucsecondarg($1, $2)/e if $UPPERCASE_SENT;
+	return $text;
+}
+# Detokenize one line that was tokenized in Penn-Treebank style.
+#   input:  one tokenized line (a trailing newline is chomped)
+#   return: the detokenized line, always terminated by "\n"
+# Uses deescape(), ucsecondarg() and $UPPERCASE_SENT, none of which are
+# defined inside this sub.
+sub detokenize_penn {
+  my($text) = @_;
+  chomp($text);
+  # pad with spaces so the " token " patterns below also match at the line edges
+  $text = " $text ";
+  # collapse the "@-@" and "@/@" marker tokens back into a bare '-' and '/'
+  $text =~ s/ \@\-\@ /-/g;
+  $text =~ s/ \@\/\@ /\//g;
+  # NOTE(review): presumably reverses the tokenizer's entity escaping -- confirm against deescape()
+  $text = &deescape($text);
+  # merge de-contracted forms except where the second word begins with an
+  # apostrophe (those are handled later)
+  $text =~ s/ n't /n't /g;
+  $text =~ s/ N'T /N'T /g;
+  $text =~ s/ ([Cc])an not / $1annot /g;
+  $text =~ s/ ([Dd])' ye / $1'ye /g;
+  $text =~ s/ ([Gg])im me / $1imme /g;
+  $text =~ s/ ([Gg])on na / $1onna /g;
+  $text =~ s/ ([Gg])ot ta / $1otta /g;
+  $text =~ s/ ([Ll])em me / $1emme /g;
+  $text =~ s/ '([Tt]) is / '$1is /g;
+  $text =~ s/ '([Tt]) was / '$1was /g;
+  $text =~ s/ ([Ww])an na / $1anna /g;
+  # restore brackets
+  $text =~ s/-LRB-/\(/g;
+  $text =~ s/-RRB-/\)/g;
+  $text =~ s/-LSB-/\[/g;
+  $text =~ s/-RSB-/\]/g;
+  $text =~ s/-LCB-/{/g;
+  $text =~ s/-RCB-/}/g;
+  my $i;
+  my @words = split(/ /,$text);
+  $text = "";
+  # separator emitted before the next token: "" right after a right-shifted
+  # token (which glues to what follows), " " otherwise
+  my $prependSpace = " ";
+  for ($i=0;$i<(scalar(@words));$i++) {
+    if ($words[$i] =~ /^[\p{IsSc}\(\[\{\¿\¡]+$/) {
+      # perform right shift on currency and other random punctuation items
+      $text = $text.$prependSpace.$words[$i];
+      $prependSpace = "";
+    } elsif ($words[$i] =~ /^[\,\.\?\!\:\;\\\%\}\]\)]+$/){
+      # perform left shift on punctuation items
+      $text=$text.$words[$i];
+      $prependSpace = " ";
+    } elsif (($i>0) && ($words[$i] =~ /^[\'][\p{IsAlpha}]/) && ($words[$i-1] =~ /[\p{IsAlnum}]$/)) {
+      # left-shift the contraction
+      $text=$text.$words[$i];
+      $prependSpace = " ";
+    } elsif ($words[$i] eq "`") { # Assume that punctuation has been normalized and is one of `, ``, ', '' only
+      # opening single quote: convert to straight quote and right-shift
+      $text = $text.$prependSpace."\'";
+      $prependSpace = "";
+    } elsif ($words[$i] eq "``") {
+      # opening double quote: convert to straight quote and right-shift
+      $text = $text.$prependSpace."\"";
+      $prependSpace = "";
+    } elsif ($words[$i] eq "\'") {
+      # closing single quote: convert to straight quote and left shift
+      $text = $text."\'";
+      $prependSpace = " ";
+    } elsif ($words[$i] eq "\'\'") {
+      # closing double quote: convert to straight quote and left shift
+      $text = $text."\"";
+      $prependSpace = " ";
+    } else {
+      # ordinary token: keep it separated from its neighbours
+      $text = $text.$prependSpace.$words[$i];
+      $prependSpace = " ";
+    }
+  }
+  # clean up spaces at head and tail of each line as well as any double-spacing
+  $text =~ s/ +/ /g;
+  $text =~ s/\n /\n/g;
+  $text =~ s/ \n/\n/g;
+  $text =~ s/^ //g;
+  $text =~ s/ $//g;
+  # add trailing break
+  $text .= "\n" unless $text =~ /\n$/;
+  # when $UPPERCASE_SENT is set, rewrite the first alphabetic character (and any
+  # punctuation/space before it) through ucsecondarg()
+  # NOTE(review): presumably upper-cases its second argument -- confirm
+  $text =~ s/^([[:punct:]\s]*)([[:alpha:]])/ucsecondarg($1, $2)/e if $UPPERCASE_SENT;
+  return $text;
+}
+# Returns the charIsCJK() verdict for the first character of the given string;
+# an empty string yields 0.
+sub startsWithCJKChar {
+    my ($str) = @_;
+    if (length($str) == 0) {
+        return 0;
+    }
+    return &charIsCJK(substr($str, 0, 1));
+}
+# Returns the charIsCJK() verdict for the last character of the given string;
+# an empty string yields 0.
+sub endsWithCJKChar {
+    my ($str) = @_;
+    if (length($str) == 0) {
+        return 0;
+    }
+    # a negative offset counts from the end of the string
+    return &charIsCJK(substr($str, -1, 1));
+}
+# Given a string consisting of one character, returns 1 if its code point
+# falls in one of the CJK (Chinese/Japanese/Korean) ranges below, else 0.
+sub charIsCJK {
+    my ($char) = @_;
+    # $char should be a string of length 1
+    my $cp = &codepoint_dec($char);
+    # The following is based on http://en.wikipedia.org/wiki/Basic_Multilingual_Plane#Basic_Multilingual_Plane
+    # Each entry is an inclusive [first, last] pair of hexadecimal code points.
+    my @cjk_ranges = (
+        # Hangul Jamo (1100–11FF)
+        ['1100', '11FF'],
+        # One contiguous run covering:
+        #   CJK Radicals Supplement (2E80–2EFF)
+        #   Kangxi Radicals (2F00–2FDF)
+        #   Ideographic Description Characters (2FF0–2FFF)
+        #   CJK Symbols and Punctuation (3000–303F)
+        #   Hiragana (3040–309F)
+        #   Katakana (30A0–30FF)
+        #   Bopomofo (3100–312F)
+        #   Hangul Compatibility Jamo (3130–318F)
+        #   Kanbun (3190–319F)
+        #   Bopomofo Extended (31A0–31BF)
+        #   CJK Strokes (31C0–31EF)
+        #   Katakana Phonetic Extensions (31F0–31FF)
+        #   Enclosed CJK Letters and Months (3200–32FF)
+        #   CJK Compatibility (3300–33FF)
+        #   CJK Unified Ideographs Extension A (3400–4DBF)
+        #   Yijing Hexagram Symbols (4DC0–4DFF)
+        #   CJK Unified Ideographs (4E00–9FFF)
+        #   Yi Syllables (A000–A48F)
+        #   Yi Radicals (A490–A4CF)
+        ['2E80', 'A4CF'],
+        # Phags-pa (A840–A87F)
+        ['A840', 'A87F'],
+        # Hangul Syllables (AC00–D7AF)
+        ['AC00', 'D7AF'],
+        # CJK Compatibility Ideographs (F900–FAFF)
+        ['F900', 'FAFF'],
+        # CJK Compatibility Forms (FE30–FE4F)
+        ['FE30', 'FE4F'],
+        # Range U+FF65–FFDC encodes halfwidth forms, of Katakana and Hangul characters
+        ['FF65', 'FFDC'],
+        # Supplementary Ideographic Plane 20000–2FFFF
+        ['20000', '2FFFF'],
+    );
+    foreach my $range (@cjk_ranges) {
+        my ($first, $last) = @$range;
+        return 1 if (&between_hexes($cp, $first, $last));
+    }
+    return 0;
+}
+# Returns the code point of a Unicode char, represented as a decimal number.
+# The unpack only runs when the argument is true in Perl's sense, so a missing
+# argument, "" or "0" skips the explicit return below.
+# NOTE(review): the 'U0U*' template presumably decodes the string's UTF-8
+# representation into code points -- confirm against the perlfunc pack docs.
+sub codepoint_dec {
+    if (my $char = shift) {
+	return unpack('U0U*', $char);
+    }
+}
+# True iff $num lies in the inclusive range whose bounds are given as
+# hexadecimal strings (e.g. '2E80', 'A4CF').
+sub between_hexes {
+    my ($num, $left, $right) = @_;
+    my $lower = hex($left);
+    my $upper = hex($right);
+    return ($num >= $lower && $num <= $upper);
+}
--- a/egs/wmt16/mt/local/lower_rm.py
+++ b/egs/wmt16/mt/local/lower_rm.py
+import sys
+import string
+in_file = sys.argv[1]
+with open(in_file, "r", encoding="utf-8") as f:
+    for line in f.readlines():
+        line = line.strip().lower()
+        for w in string.punctuation:
+            line = line.replace(w, "")
+        line = line.replace("  ", "")
+        print(line)
--- a/egs/wmt16/mt/local/monitor.sh
+++ b/egs/wmt16/mt/local/monitor.sh
+gpu_num=4
+cmd="sh train.sh"
+while :
+do
+    record=$(mktemp -t temp.record.XXXXXX)
+    gpustat > $record
+    all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
+    count=0
+    for dev in ${all_devices[@]}
+    do
+        line=$((dev + 2))
+        use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
+        if [[ $use -lt 100 ]]; then
+            device[$count]=$dev
+            count=$((count + 1))
+            if [[ $count -eq $gpu_num ]]; then
+                break
+            fi
+        fi
+    done
+    if [[ ${#device[@]} -lt $gpu_num ]]; then
+        sleep 60s
+    else
+        echo "Run $cmd"
+        eval $cmd
+        sleep 10s
+        exit
+    fi
+done
--- a/egs/wmt16/mt/local/multi-bleu.perl
+++ b/egs/wmt16/mt/local/multi-bleu.perl
+#!/usr/bin/env perl
+#
+# This file is part of moses.  Its use is licensed under the GNU Lesser General
+# Public License version 2.1 or, at your option, any later version.
+# $Id$
+use warnings;
+use strict;
+# Command line: [-lc] reference-stem [extra-reference ...]
+# The hypothesis is read from STDIN further below.
+my $lowercase = 0;
+# Check @ARGV first: with no arguments at all, comparing $ARGV[0] would emit
+# an "uninitialized value" warning before the usage message is printed.
+if (@ARGV && $ARGV[0] eq "-lc") {
+  $lowercase = 1;
+  shift;
+}
+my $stem = $ARGV[0];
+if (!defined $stem) {
+  print STDERR "usage: multi-bleu.pl [-lc] reference < hypothesis\n";
+  print STDERR "Reads the references from reference or reference0, reference1, ...\n";
+  exit(1);
+}
+# Fall back to "<stem>.ref" when only "<stem>.ref0" exists on disk.
+$stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0";
+# $REF[$sentence] is a reference to the list of reference strings for that sentence.
+my @REF;
+my $ref=0;
+# Numbered references: <stem>0, <stem>1, ... until the first missing file.
+while(-e "$stem$ref") {
+    &add_to_ref("$stem$ref",\@REF);
+    $ref++;
+}
+# The bare stem counts as a reference too when it exists.
+&add_to_ref($stem,\@REF) if -e $stem;
+die("ERROR: could not find reference file $stem") unless scalar @REF;
+# add additional references explicitly specified on the command line
+shift;
+foreach my $stem (@ARGV) {
+    &add_to_ref($stem,\@REF) if -e $stem;
+}
+# Append the lines of one reference file to the per-sentence reference lists.
+#   $file: path of the reference file; names ending in ".gz" are read
+#          through "gzip -dc"
+#   $REF:  reference to an array; line number i of the file is pushed onto
+#          the list referenced by element i
+# Dies when the file (or the gzip pipe) cannot be opened.
+sub add_to_ref {
+    my ($file,$REF) = @_;
+    my $s=0;
+    my $fh;
+    if ($file =~ /\.gz$/) {
+        # list-form pipe open: the file name never passes through a shell
+        open($fh, "-|", "gzip", "-dc", $file) or die "Can't read $file";
+    } else {
+        # three-argument open: characters such as '>' or '|' in the name are
+        # treated as part of the name, not as an open mode
+        open($fh, "<", $file) or die "Can't read $file";
+    }
+    while(my $line = <$fh>) {
+        # chomp only strips a trailing newline; chop would also eat the last
+        # real character of a final line that has no newline
+        chomp($line);
+        push @{$$REF[$s++]}, $line;
+    }
+    close($fh);
+}
+# Accumulate clipped n-gram statistics over the hypothesis read from STDIN and
+# print corpus-level BLEU.  @CORRECT and @TOTAL are indexed by n-gram order
+# (1..4).  Counters that are read unconditionally later on start at 0 so that
+# empty input does not trigger "uninitialized value" warnings.
+my @CORRECT = (0) x 5;
+my @TOTAL;
+my ($length_translation,$length_reference) = (0,0);
+my $s=0;
+while(<STDIN>) {
+    # chomp (not chop): keep the last character of a final line without newline
+    chomp;
+    $_ = lc if $lowercase;
+    my @WORD = split;
+    # maximum count of every n-gram over all references of this sentence
+    my %REF_NGRAM = ();
+    my $length_translation_this_sentence = scalar(@WORD);
+    my ($closest_diff,$closest_length) = (9999,9999);
+    foreach my $reference (@{$REF[$s]}) {
+        $reference = lc($reference) if $lowercase;
+        my @WORD = split(' ',$reference);
+        my $length = scalar(@WORD);
+        my $diff = abs($length_translation_this_sentence-$length);
+        if ($diff < $closest_diff) {
+            $closest_diff = $diff;
+            $closest_length = $length;
+        } elsif ($diff == $closest_diff) {
+            # from two references with the same closeness to me
+            # take the *shorter* into account, not the "first" one.
+            $closest_length = $length if $length < $closest_length;
+        }
+        for(my $n=1;$n<=4;$n++) {
+            my %REF_NGRAM_N = ();
+            for(my $start=0;$start<=$#WORD-($n-1);$start++) {
+                # keys look like "<order> w1 w2 ..." so orders never collide
+                my $ngram = "$n";
+                for(my $w=0;$w<$n;$w++) {
+                    $ngram .= " ".$WORD[$start+$w];
+                }
+                $REF_NGRAM_N{$ngram}++;
+            }
+            foreach my $ngram (keys %REF_NGRAM_N) {
+                if (!defined($REF_NGRAM{$ngram}) ||
+                    $REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) {
+                    $REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram};
+                }
+            }
+        }
+    }
+    $length_translation += $length_translation_this_sentence;
+    $length_reference += $closest_length;
+    for(my $n=1;$n<=4;$n++) {
+        my %T_NGRAM = ();
+        for(my $start=0;$start<=$#WORD-($n-1);$start++) {
+            my $ngram = "$n";
+            for(my $w=0;$w<$n;$w++) {
+                $ngram .= " ".$WORD[$start+$w];
+            }
+            $T_NGRAM{$ngram}++;
+        }
+        # every key built above starts with "$n ", so $n is the n-gram order
+        foreach my $ngram (keys %T_NGRAM) {
+            $TOTAL[$n] += $T_NGRAM{$ngram};
+            next unless defined($REF_NGRAM{$ngram});
+            # clip the hypothesis count by the maximum reference count
+            $CORRECT[$n] += ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram})
+                ? $T_NGRAM{$ngram}
+                : $REF_NGRAM{$ngram};
+        }
+    }
+    $s++;
+}
+my $brevity_penalty = 1;
+my $bleu = 0;
+my @bleu=();
+# per-order precision; 0 when no n-gram of that order was seen
+for(my $n=1;$n<=4;$n++) {
+  if (defined ($TOTAL[$n])){
+    $bleu[$n]=($TOTAL[$n])?$CORRECT[$n]/$TOTAL[$n]:0;
+  }else{
+    $bleu[$n]=0;
+  }
+}
+if ($length_reference==0){
+  printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n";
+  exit(1);
+}
+if ($length_translation==0) {
+  # An empty hypothesis against a non-empty reference: the penalty
+  # exp(1 - ref/hyp) tends to 0; set it directly instead of dividing by zero.
+  $brevity_penalty = 0;
+} elsif ($length_translation<$length_reference) {
+  $brevity_penalty = exp(1-$length_reference/$length_translation);
+}
+# geometric mean of the four precisions, scaled by the brevity penalty
+$bleu = $brevity_penalty * exp((my_log( $bleu[1] ) +
+				my_log( $bleu[2] ) +
+				my_log( $bleu[3] ) +
+				my_log( $bleu[4] ) ) / 4) ;
+printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n",
+    100*$bleu,
+    100*$bleu[1],
+    100*$bleu[2],
+    100*$bleu[3],
+    100*$bleu[4],
+    $brevity_penalty,
+    $length_translation / $length_reference,
+    $length_translation,
+    $length_reference;
+# Natural logarithm with a floor: a false argument (0, "", undef) yields
+# -9999999999 instead of calling log().
+sub my_log {
+  my ($value) = @_;
+  return $value ? log($value) : -9999999999;
+}
--- a/egs/wmt16/mt/local/parse_options.sh
+++ b/egs/wmt16/mt/local/parse_options.sh
+#!/usr/bin/env bash
+# Copyright 2012  Johns Hopkins University (Author: Daniel Povey);
+#                 Arnab Ghoshal, Karel Vesely
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+# Parse command-line options.
+# To be sourced by another script (as in ". parse_options.sh").
+# Option format is: --option-name arg
+# and shell variable "option_name" gets set to value "arg."
+# The exception is --help, which takes no arguments, but prints the
+# $help_message variable (if defined).
+###
+### The --config file options have lower priority to command line
+### options, so we need to import them first...
+###
+# Now import all the configs specified by command-line, in left-to-right order
+for ((argpos=1; argpos<$#; argpos++)); do
+  if [ "${!argpos}" == "--config" ]; then
+    argpos_plus1=$((argpos+1))
+    config=${!argpos_plus1}
+    # Quote the path: unquoted, a name containing spaces splits into several
+    # words, and an empty name makes the test succeed vacuously.
+    [ ! -r "$config" ] && echo "$0: missing config '$config'" && exit 1
+    . "$config"  # source the config file.
+  fi
+done
+###
+### Now we process the command line options
+###
+while true; do
+  [ -z "${1:-}" ] && break;  # break if there are no arguments
+  case "$1" in
+    # If the enclosing script is called with --help option, print the help
+    # message and exit.  Scripts should put help messages in $help_message
+    --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
+      else printf "$help_message\n" 1>&2 ; fi;
+      exit 0 ;;
+    --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
+      exit 1 ;;
+    # If the first command-line argument begins with "--" (e.g. --foo-bar),
+    # then work out the variable name as $name, which will equal "foo_bar".
+    --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
+      # Next we test whether the variable in question is undefined-- if so it's
+      # an invalid option and we die.  Note: $0 evaluates to the name of the
+      # enclosing script.
+      # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
+      # is undefined.  We then have to wrap this test inside "eval" because
+      # foo_bar is itself inside a variable ($name).
+      eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
+      oldval="`eval echo \\$$name`";
+      # Work out whether we seem to be expecting a Boolean argument.
+      if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
+        was_bool=true;
+      else
+        was_bool=false;
+      fi
+      # Set the variable to the right value-- the escaped quotes make it work if
+      # the option had spaces, like --cmd "queue.pl -sync y"
+      eval $name=\"$2\";
+      # Check that Boolean-valued arguments are really Boolean.
+      if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
+        echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
+        exit 1;
+      fi
+      shift 2;
+      ;;
+  *) break;
+  esac
+done
+# Check for an empty argument to the --cmd option, which can easily occur as a
+# result of scripting errors.
+[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
+true; # so this script returns exit code 0.
--- a/egs/wmt16/mt/local/replace-unicode-punctuation.perl
+++ b/egs/wmt16/mt/local/replace-unicode-punctuation.perl
+#!/usr/bin/env perl
+#
+# This file is part of moses.  Its use is licensed under the GNU Lesser General
+# Public License version 2.1 or, at your option, any later version.
+use warnings;
+use strict;
+#binmode(STDIN, ":utf8");
+#binmode(STDOUT, ":utf8");
+# Filter STDIN to STDOUT, replacing fullwidth/CJK punctuation and fullwidth
+# digits with ASCII counterparts.  The binmode calls above are commented out,
+# so the substitutions run on the lines exactly as read.
+while(<STDIN>) {
+  s/，/,/g;
+  s/。 */. /g;
+  s/、/,/g;
+  s/”/"/g;
+  s/“/"/g;
+  s/∶/:/g;
+  s/：/:/g;
+  s/？/\?/g;
+  s/《/"/g;
+  s/》/"/g;
+  s/）/\)/g;
+  s/！/\!/g;
+  s/（/\(/g;
+  s/；/;/g;
+  # fullwidth digit one becomes ASCII "1", like every other fullwidth digit
+  # below (it used to be turned into a double quote)
+  s/１/1/g;
+  s/」/"/g;
+  s/「/"/g;
+  s/０/0/g;
+  s/３/3/g;
+  s/２/2/g;
+  s/５/5/g;
+  s/６/6/g;
+  s/９/9/g;
+  s/７/7/g;
+  s/８/8/g;
+  s/４/4/g;
+  s/． */. /g;
+  s/～/\~/g;
+  s/’/\'/g;
+  s/…/\.\.\./g;
+  s/━/\-/g;
+  s/〈/\</g;
+  s/〉/\>/g;
+  s/【/\[/g;
+  s/】/\]/g;
+  s/％/\%/g;
+  print $_;
+}
--- a/egs/wmt16/mt/local/tokenizer.perl
+++ b/egs/wmt16/mt/local/tokenizer.perl
+#!/usr/bin/env perl
+#
+# This file is part of moses.  Its use is licensed under the GNU Lesser General
+# Public License version 2.1 or, at your option, any later version.
+use warnings;
+# Sample Tokenizer
+### Version 1.1
+# written by Pidong Wang, based on the code written by Josh Schroeder and Philipp Koehn
+# Version 1.1 updates:
+#       (1) add multithreading option "-threads NUM_THREADS" (default is 1);
+#       (2) add a timing option "-time" to calculate the average speed of this tokenizer;
+#       (3) add an option "-lines NUM_SENTENCES_PER_THREAD" to set the number of lines for each thread (default is 2000), and this option controls the memory amount needed: the larger this number is, the larger memory is required (the higher tokenization speed);
+### Version 1.0
+# $Id: tokenizer.perl 915 2009-08-10 08:15:49Z philipp $
+# written by Josh Schroeder, based on code by Philipp Koehn
+# Script setup: I/O layers, option parsing, help text, and loading of the
+# non-breaking prefixes and the optional protected patterns.
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+use warnings;
+use FindBin qw($RealBin);
+use strict;
+use Time::HiRes;
+# Thread is optional: import it only when it can be loaded
+if  (eval {require Thread;1;}) {
+  #module loaded
+  Thread->import();
+}
+my $mydir = "$RealBin/nonbreaking_prefixes";
+my %NONBREAKING_PREFIX = ();
+my @protected_patterns = ();
+my $protected_patterns_file = "";
+my $language = "en";
+my $QUIET = 0;
+my $HELP = 0;
+my $AGGRESSIVE = 0;
+my $SKIP_XML = 0;
+my $TIMING = 0;
+my $NUM_THREADS = 1;
+my $NUM_SENTENCES_PER_THREAD = 2000;
+my $PENN = 0;
+my $NO_ESCAPING = 0;
+# Each line tests the current argument; on a match it records the option
+# (taking the following argument as its value where needed) and moves on.
+# Arguments that match nothing are ignored.
+while (@ARGV)
+{
+	$_ = shift;
+	/^-b$/ && ($| = 1, next);
+	/^-l$/ && ($language = shift, next);
+	/^-q$/ && ($QUIET = 1, next);
+	/^-h$/ && ($HELP = 1, next);
+	/^-x$/ && ($SKIP_XML = 1, next);
+	/^-a$/ && ($AGGRESSIVE = 1, next);
+	/^-time$/ && ($TIMING = 1, next);
+  # Option to add list of regexps to be protected
+  /^-protected/ && ($protected_patterns_file = shift, next);
+	/^-threads$/ && ($NUM_THREADS = int(shift), next);
+	/^-lines$/ && ($NUM_SENTENCES_PER_THREAD = int(shift), next);
+	/^-penn$/ && ($PENN = 1, next);
+	/^-no-escape/ && ($NO_ESCAPING = 1, next);
+}
+# for time calculation
+my $start_time;
+if ($TIMING)
+{
+    $start_time = [ Time::HiRes::gettimeofday( ) ];
+}
+# print help message
+if ($HELP)
+{
+	print "Usage ./tokenizer.perl (-l [en|de|...]) (-threads 4) < textfile > tokenizedfile\n";
+        print "Options:\n";
+        print "  -q     ... quiet.\n";
+        print "  -a     ... aggressive hyphen splitting.\n";
+        print "  -b     ... disable Perl buffering.\n";
+        print "  -time  ... enable processing time calculation.\n";
+        print "  -penn  ... use Penn treebank-like tokenization.\n";
+        print "  -protected FILE  ... specify file with patters to be protected in tokenisation.\n";
+	print "  -no-escape ... don't perform HTML escaping on apostrophy, quotes, etc.\n";
+	exit;
+}
+if (!$QUIET)
+{
+	print STDERR "Tokenizer Version 1.1\n";
+	print STDERR "Language: $language\n";
+	print STDERR "Number of threads: $NUM_THREADS\n";
+}
+# load the language-specific non-breaking prefix info from files in the directory nonbreaking_prefixes
+load_prefixes($language,\%NONBREAKING_PREFIX);
+if (scalar(%NONBREAKING_PREFIX) eq 0)
+{
+	print STDERR "Warning: No known abbreviations for language '$language'\n";
+}
+# Load protected patterns (one regular expression per line)
+if ($protected_patterns_file)
+{
+  # Three-argument open on a lexical handle: mode characters in the file name
+  # are not interpreted, and the handle is closed once the patterns are read.
+  open(my $pp_fh, "<", $protected_patterns_file) || die "Unable to open $protected_patterns_file";
+  while(<$pp_fh>) {
+    chomp;
+    push @protected_patterns, $_;
+  }
+  close($pp_fh);
+}
+# Main loop: read STDIN, tokenize (optionally in worker threads), print the
+# result in input order, and report timing when -time was given.
+my @batch_sentences = ();
+my @thread_list = ();
+my $count_sentences = 0;
+# Tokenize whatever is queued in @batch_sentences: hand each thread a slice of
+# up to $NUM_SENTENCES_PER_THREAD lines, print the results in thread order
+# (which keeps the input order), then empty both lists for the next batch.
+my $process_batch = sub
+{
+    for (my $i=0; $i<$NUM_THREADS; $i++)
+    {
+        my $start_index = $i*$NUM_SENTENCES_PER_THREAD;
+        # a short (final) batch may not have work for every thread
+        last if ($start_index >= scalar(@batch_sentences));
+        my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1;
+        if ($end_index >= scalar(@batch_sentences))
+        {
+            $end_index = scalar(@batch_sentences)-1;
+        }
+        my @subbatch_sentences = @batch_sentences[$start_index..$end_index];
+        push(@thread_list, Thread->new(\&tokenize_batch, @subbatch_sentences));
+    }
+    foreach my $thread (@thread_list)
+    {
+        my $tokenized_list = $thread->join;
+        foreach my $tokenized (@$tokenized_list)
+        {
+            print $tokenized;
+        }
+    }
+    # reset for the new run
+    @thread_list = ();
+    @batch_sentences = ();
+};
+if ($NUM_THREADS > 1)
+{# multi-threading tokenization
+    while(<STDIN>)
+    {
+        $count_sentences = $count_sentences + 1;
+        push(@batch_sentences, $_);
+        if (scalar(@batch_sentences)>=($NUM_SENTENCES_PER_THREAD*$NUM_THREADS))
+        {
+            $process_batch->();
+        }
+    }
+    # the last batch
+    if (scalar(@batch_sentences)>0)
+    {
+        $process_batch->();
+    }
+}
+else
+{# single thread only
+    while(<STDIN>)
+    {
+        # count here too: the timing report below divides by this number
+        $count_sentences = $count_sentences + 1;
+        if (($SKIP_XML && /^<.+>$/) || /^\s*$/)
+        {
+            #don't try to tokenize XML/HTML tag lines
+            print $_;
+        }
+        else
+        {
+            print &tokenize($_);
+        }
+    }
+}
+if ($TIMING)
+{
+    my $duration = Time::HiRes::tv_interval( $start_time );
+    print STDERR ("TOTAL EXECUTION TIME: ".$duration."\n");
+    if ($count_sentences > 0)
+    {
+        print STDERR ("TOKENIZATION SPEED: ".($duration/$count_sentences*1000)." milliseconds/line\n");
+    }
+    else
+    {
+        # empty input: there is no per-line speed to report
+        print STDERR ("TOKENIZATION SPEED: n/a (no input lines)\n");
+    }
+}
+#####################################################################################
+# subroutines afterward
+# Tokenize a list of lines.
+# input: the lines to process, passed as a flat argument list
+# return: reference to an array holding one output line per input line, in
+#         the same order; blank lines (and, with $SKIP_XML, lines that look
+#         like a single XML/HTML tag) are copied through untouched
+sub tokenize_batch
+{
+    my @tokenized_list;
+    for my $line (@_)
+    {
+        #don't try to tokenize XML/HTML tag lines
+        my $pass_through = ($SKIP_XML && $line =~ /^<.+>$/) || $line =~ /^\s*$/;
+        push(@tokenized_list, ($pass_through ? $line : &tokenize($line)));
+    }
+    return \@tokenized_list;
+}
+# the actual tokenize function which tokenizes one input string
+# input: one string
+# return: the tokenized string for the input string, terminated by "\n"
+# Reads the script-level settings $PENN, $language, $AGGRESSIVE, $NO_ESCAPING,
+# @protected_patterns and %NONBREAKING_PREFIX.
+sub tokenize
+{
+    my($text) = @_;
+    # Penn Treebank mode is handled entirely by tokenize_penn
+    if ($PENN) {
+      return tokenize_penn($text);
+    }
+    chomp($text);
+    $text = " $text ";
+    # remove ASCII junk
+    $text =~ s/\s+/ /g;
+    $text =~ s/[\000-\037]//g;
+    # Find protected patterns
+    # (every match is collected, then swapped for a THISISPROTECTEDnnn
+    # placeholder so the rules below cannot split it; restored further down)
+    my @protected = ();
+    foreach my $protected_pattern (@protected_patterns) {
+      my $t = $text;
+      while ($t =~ /(?<PATTERN>$protected_pattern)(?<TAIL>.*)$/) {
+        push @protected, $+{PATTERN};
+        $t = $+{TAIL};
+      }
+    }
+    for (my $i = 0; $i < scalar(@protected); ++$i) {
+      my $subst = sprintf("THISISPROTECTED%.3d", $i);
+      $text =~ s,\Q$protected[$i], $subst ,g;
+    }
+    $text =~ s/ +/ /g;
+    $text =~ s/^ //g;
+    $text =~ s/ $//g;
+    # separate out all "other" special characters
+    if (($language eq "fi") or ($language eq "sv")) {
+        # in Finnish and Swedish, the colon can be used inside words as an apostrophe-like character:
+        # USA:n, 20:een, EU:ssa, USA:s, S:t
+        $text =~ s/([^\p{IsAlnum}\s\.\:\'\`\,\-])/ $1 /g;
+        # if a colon is not immediately followed by lower-case characters, separate it out anyway
+        $text =~ s/(:)(?=$|[^\p{Ll}])/ $1 /g;
+    }
+    else {
+        $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
+    }
+    # aggressive hyphen splitting
+    # (a hyphen between two alphanumerics becomes the separate token "@-@")
+    if ($AGGRESSIVE)
+    {
+        $text =~ s/([\p{IsAlnum}])\-(?=[\p{IsAlnum}])/$1 \@-\@ /g;
+    }
+    #multi-dots stay together
+    # (runs of dots are rewritten to DOTMULTI/DOTDOTMULTI placeholders here
+    # and turned back into dots after the period handling below)
+    $text =~ s/\.([\.]+)/ DOTMULTI$1/g;
+    while($text =~ /DOTMULTI\./)
+    {
+        $text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g;
+        $text =~ s/DOTMULTI\./DOTDOTMULTI/g;
+    }
+    # separate out "," except if within numbers (5,300)
+    #$text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+    # separate out "," except if within numbers (5,300)
+    # previous "global" application skips some:  A,B,C,D,E > A , B,C , D,E
+    # first application uses up B so rule can't see B,C
+    # two-step version here may create extra spaces but these are removed later
+    # will also space digit,letter or letter,digit forms (redundant with next section)
+    $text =~ s/([^\p{IsN}])[,]/$1 , /g;
+    $text =~ s/[,]([^\p{IsN}])/ , $1/g;
+    # separate "," after a number if it's the end of a sentence
+    $text =~ s/([\p{IsN}])[,]$/$1 ,/g;
+    # separate , pre and post number
+    #$text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+    #$text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;
+    # turn `into '
+    #$text =~ s/\`/\'/g;
+    #turn '' into "
+    #$text =~ s/\'\'/ \" /g;
+    if ($language eq "en")
+    {
+        #split contractions right
+        # (letter'letter keeps the apostrophe with the right-hand part)
+        $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+        $text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g;
+        $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+        $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g;
+        #special case for "1990's"
+        $text =~ s/([\p{IsN}])[']([s])/$1 '$2/g;
+    }
+    elsif (($language eq "fr") or ($language eq "it") or ($language eq "ga"))
+    {
+        #split contractions left
+        # (letter'letter keeps the apostrophe with the left-hand part)
+        $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+        $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g;
+        $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+        $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g;
+    }
+    else
+    {
+        # all other languages: every apostrophe becomes its own token
+        $text =~ s/\'/ \' /g;
+    }
+    #word token method
+    # (decide word by word whether a trailing period is split off)
+    my @words = split(/\s/,$text);
+    $text = "";
+    for (my $i=0;$i<(scalar(@words));$i++)
+    {
+        my $word = $words[$i];
+        if ( $word =~ /^(\S+)\.$/)
+        {
+            my $pre = $1;
+            # keep the period attached when the word contains another period
+            # plus a letter, when the word has value 1 in %NONBREAKING_PREFIX,
+            # or when the next word starts with a lower-case letter
+            if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/)))
+            {
+                #no change
+			}
+            # value 2 in %NONBREAKING_PREFIX keeps the period only before a digit
+            elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/)))
+            {
+                #no change
+            }
+            else
+            {
+                $word = $pre." .";
+            }
+        }
+        $text .= $word." ";
+    }
+    # clean up extraneous spaces
+    $text =~ s/ +/ /g;
+    $text =~ s/^ //g;
+    $text =~ s/ $//g;
+    # .' at end of sentence is missed
+    $text =~ s/\.\' ?$/ . ' /;
+    # restore protected
+    for (my $i = 0; $i < scalar(@protected); ++$i) {
+      my $subst = sprintf("THISISPROTECTED%.3d", $i);
+      $text =~ s/$subst/$protected[$i]/g;
+    }
+    #restore multi-dots
+    while($text =~ /DOTDOTMULTI/)
+    {
+        $text =~ s/DOTDOTMULTI/DOTMULTI./g;
+    }
+    $text =~ s/DOTMULTI/./g;
+    #escape special chars
+    # ('&' is escaped first so the '&' of the entities added below stays intact)
+    if (!$NO_ESCAPING)
+      {
+	$text =~ s/\&/\&amp;/g;   # escape escape
+	$text =~ s/\|/\&#124;/g;  # factor separator
+	$text =~ s/\</\&lt;/g;    # xml
+	$text =~ s/\>/\&gt;/g;    # xml
+	$text =~ s/\'/\&apos;/g;  # xml
+	$text =~ s/\"/\&quot;/g;  # xml
+	$text =~ s/\[/\&#91;/g;   # syntax non-terminal
+	$text =~ s/\]/\&#93;/g;   # syntax non-terminal
+      }
+    #ensure final line break
+    $text .= "\n" unless $text =~ /\n$/;
+    return $text;
+}
+# Penn-Treebank-style tokenization of one input string.
+# input: one string
+# return: the tokenized string, terminated by "\n"
+# Reads the script-level %NONBREAKING_PREFIX.  Note that the entity escaping
+# at the end is applied unconditionally: this sub never checks $NO_ESCAPING.
+sub tokenize_penn
+{
+    # Improved compatibility with Penn Treebank tokenization.  Useful if
+    # the text is to later be parsed with a PTB-trained parser.
+    #
+    # Adapted from Robert MacIntyre's sed script:
+    #   http://www.cis.upenn.edu/~treebank/tokenizer.sed
+    my($text) = @_;
+    chomp($text);
+    # remove ASCII junk
+    $text =~ s/\s+/ /g;
+    $text =~ s/[\000-\037]//g;
+    # attempt to get correct directional quotes
+    # (a quote at line start, or right after a space or opening bracket, is
+    # treated as an opening quote and written with backticks)
+    $text =~ s/^``/`` /g;
+    $text =~ s/^"/`` /g;
+    $text =~ s/^`([^`])/` $1/g;
+    $text =~ s/^'/`  /g;
+    $text =~ s/([ ([{<])"/$1 `` /g;
+    $text =~ s/([ ([{<])``/$1 `` /g;
+    $text =~ s/([ ([{<])`([^`])/$1 ` $2/g;
+    $text =~ s/([ ([{<])'/$1 ` /g;
+    # close quotes handled at end
+    # shield "..." behind a placeholder; it is restored after the period handling
+    $text =~ s=\.\.\.= _ELLIPSIS_ =g;
+    # separate out "," except if within numbers (5,300)
+    $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+    # separate , pre and post number
+    $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+    $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;
+    #$text =~ s=([;:@#\$%&\p{IsSc}])= $1 =g;
+    # the active rule below also splits off \p{IsSo} characters
+$text =~ s=([;:@#\$%&\p{IsSc}\p{IsSo}])= $1 =g;
+    # Separate out intra-token slashes.  PTB tokenization doesn't do this, so
+    # the tokens should be merged prior to parsing with a PTB-trained parser
+    # (see syntax-hyphen-splitting.perl).
+    $text =~ s/([\p{IsAlnum}])\/([\p{IsAlnum}])/$1 \@\/\@ $2/g;
+    # Assume sentence tokenization has been done first, so split FINAL periods
+    # only.
+    $text =~ s=([^.])([.])([\]\)}>"']*) ?$=$1 $2$3 =g;
+    # however, we may as well split ALL question marks and exclamation points,
+    # since they shouldn't have the abbrev.-marker ambiguity problem
+    $text =~ s=([?!])= $1 =g;
+    # parentheses, brackets, etc.
+    # (split them off first, then rename them to the -LRB-/-RRB- style tokens)
+    $text =~ s=([\]\[\(\){}<>])= $1 =g;
+    $text =~ s/\(/-LRB-/g;
+    $text =~ s/\)/-RRB-/g;
+    $text =~ s/\[/-LSB-/g;
+    $text =~ s/\]/-RSB-/g;
+    $text =~ s/{/-LCB-/g;
+    $text =~ s/}/-RCB-/g;
+    $text =~ s=--= -- =g;
+    # First off, add a space to the beginning and end of each line, to reduce
+    # necessary number of regexps.
+    $text =~ s=$= =;
+    $text =~ s=^= =;
+    # any double quote still present is written as the closing form ''
+    $text =~ s="= '' =g;
+    # possessive or close-single-quote
+    $text =~ s=([^'])' =$1 ' =g;
+    # as in it's, I'm, we'd
+    $text =~ s='([sSmMdD]) = '$1 =g;
+    $text =~ s='ll = 'll =g;
+    $text =~ s='re = 're =g;
+    $text =~ s='ve = 've =g;
+    $text =~ s=n't = n't =g;
+    $text =~ s='LL = 'LL =g;
+    $text =~ s='RE = 'RE =g;
+    $text =~ s='VE = 'VE =g;
+    $text =~ s=N'T = N'T =g;
+    # split fixed multi-word contractions into their parts
+    $text =~ s= ([Cc])annot = $1an not =g;
+    $text =~ s= ([Dd])'ye = $1' ye =g;
+    $text =~ s= ([Gg])imme = $1im me =g;
+    $text =~ s= ([Gg])onna = $1on na =g;
+    $text =~ s= ([Gg])otta = $1ot ta =g;
+    $text =~ s= ([Ll])emme = $1em me =g;
+    $text =~ s= ([Mm])ore'n = $1ore 'n =g;
+    $text =~ s= '([Tt])is = '$1 is =g;
+    $text =~ s= '([Tt])was = '$1 was =g;
+    $text =~ s= ([Ww])anna = $1an na =g;
+    #word token method
+    # (decide word by word whether a trailing period is split off)
+    my @words = split(/\s/,$text);
+    $text = "";
+    for (my $i=0;$i<(scalar(@words));$i++)
+    {
+        my $word = $words[$i];
+        if ( $word =~ /^(\S+)\.$/)
+        {
+            my $pre = $1;
+            # keep the period attached when the word contains another period
+            # plus a letter, when the word has value 1 in %NONBREAKING_PREFIX,
+            # or when the next word starts with a lower-case letter
+            if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/)))
+            {
+                #no change
+            }
+            # value 2 in %NONBREAKING_PREFIX keeps the period only before a digit
+            elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/)))
+            {
+                #no change
+            }
+            else
+            {
+                $word = $pre." .";
+            }
+        }
+        $text .= $word." ";
+    }
+    # restore ellipses
+    $text =~ s=_ELLIPSIS_=\.\.\.=g;
+    # clean out extra spaces
+    $text =~ s=  *= =g;
+    $text =~ s=^ *==g;
+    $text =~ s= *$==g;
+    #escape special chars
+    # ('&' is escaped first so the '&' of the entities added below stays intact)
+    $text =~ s/\&/\&amp;/g;   # escape escape
+    $text =~ s/\|/\&#124;/g;  # factor separator
+    $text =~ s/\</\&lt;/g;    # xml
+    $text =~ s/\>/\&gt;/g;    # xml
+    $text =~ s/\'/\&apos;/g;  # xml
+    $text =~ s/\"/\&quot;/g;  # xml
+    $text =~ s/\[/\&#91;/g;   # syntax non-terminal
+    $text =~ s/\]/\&#93;/g;   # syntax non-terminal
+    #ensure final line break
+    $text .= "\n" unless $text =~ /\n$/;
+    return $text;
+}
+# Load the non-breaking prefix (abbreviation) list for a language.
+#
+# Arguments:
+#   $language   - language code used as the suffix of the prefix file name
+#                 ("$mydir/nonbreaking_prefix.$language").
+#   $PREFIX_REF - hash reference that is filled in place: plain entries are
+#                 stored with value 1, entries tagged "#NUMERIC_ONLY#" with
+#                 value 2.
+#
+# Falls back to the English file (with a warning on STDERR) when no file exists
+# for $language, and dies when neither file exists or the chosen file cannot
+# be opened.
+sub load_prefixes
+{
+    my ($language, $PREFIX_REF) = @_;
+    my $prefixfile = "$mydir/nonbreaking_prefix.$language";
+    #default back to English if we don't have a language-specific prefix file
+    if (!(-e $prefixfile))
+    {
+        $prefixfile = "$mydir/nonbreaking_prefix.en";
+        print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
+        die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile);
+    }
+    if (-e "$prefixfile")
+    {
+        # Use a lexical handle and fail loudly: an existing but unreadable file
+        # would otherwise leave the prefix table silently empty.
+        open(my $prefix_fh, "<:utf8", $prefixfile)
+            or die ("ERROR: Cannot open abbreviations file $prefixfile: $!\n");
+        # Read into a lexical so the caller's $_ is not clobbered.
+        while (my $item = <$prefix_fh>)
+        {
+            chomp($item);
+            # Skip empty lines and comment lines (first character is "#").
+            if (($item) && (substr($item,0,1) ne "#"))
+            {
+                if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/)
+                {
+                    # Prefix tagged as numeric-only: remember it with value 2.
+                    $PREFIX_REF->{$1} = 2;
+                }
+                else
+                {
+                    $PREFIX_REF->{$item} = 1;
+                }
+            }
+        }
+        close($prefix_fh);
+    }
+}
--- a/egs/wmt16/mt/local/utils.sh
+++ b/egs/wmt16/mt/local/utils.sh
+# Print a comma-separated list of GPU indices that look idle.
+#
+# Usage: get_devices GPU_NUM USE_CPU
+#   GPU_NUM - number of devices requested.
+#   USE_CPU - when 1, print "-1" instead of waiting if fewer than GPU_NUM
+#             devices qualify; otherwise poll again every 60 seconds.
+#
+# A device qualifies when the third '|'-separated field of its gpustat line,
+# cut before the '/', is below 100 (presumably used memory in MB in gpustat's
+# default layout -- TODO confirm against the installed gpustat version).
+# All working variables are local so that sourcing this file and calling the
+# function does not overwrite same-named variables of the caller.
+get_devices(){
+    local gpu_num=$1
+    local use_cpu=$2
+    local device=()
+    local record all_devices count dev line use
+    while :
+    do
+        record=$(mktemp -t temp.record.XXXXXX)
+        gpustat > "${record}"
+        # Dropping two lines and counting the rest yields the last device
+        # index when the report has one header line followed by one line per
+        # device, so "seq 0 N" enumerates every device.
+        all_devices=$(seq 0 "$(sed '1,2d' "${record}" | wc -l)");
+        # Start every poll from an empty selection so that no index from an
+        # earlier, unsuccessful poll survives.
+        device=()
+        count=0
+        for dev in ${all_devices}
+        do
+            # Device N is reported on line N+2 of the gpustat output.
+            line=$((dev + 2))
+            use=$(head -n $line "${record}" | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
+            if [[ $use -lt 100 ]]; then
+                device[$count]=$dev
+                count=$((count + 1))
+                if [[ $count -eq $gpu_num ]]; then
+                    break
+                fi
+            fi
+        done
+        # The snapshot is only needed for this poll; remove it so repeated
+        # polling does not pile up temporary files.
+        rm -f "${record}"
+        if [[ ${#device[@]} -lt $gpu_num ]]; then
+            if [[ $use_cpu -eq 1 ]]; then
+                # CPU fallback: report -1 and stop polling. Without the break
+                # the loop would spin on gpustat without ever returning when
+                # more than one device was requested.
+                device=(-1)
+                break
+            else
+                sleep 60s
+            fi
+        else
+            break
+        fi
+    done
+    echo ${device[*]} | sed 's/ /,/g'
+    return $?
+}
--- a/egs/wmt16/mt/local/wmt_en2de_multi_bleu.sh
+++ b/egs/wmt16/mt/local/wmt_en2de_multi_bleu.sh
+#! /bin/bash
+# calculate wmt14 en-de multi-bleu score
+if [ $# -ne 1 ]; then
+    echo "usage: $0 GENERATE_PY_OUTPUT"
+    exit 1
+fi
+echo -e "\n RUN >> "$0
+requirement_scripts=(detokenizer.perl replace-unicode-punctuation.perl tokenizer.perl multi-bleu.perl)
+for script in ${requirement_scripts[@]}; do
+    if ! which ${script} > /dev/null; then
+        echo "Error: it seems that moses is not installed or exported int the environment variables." >&2
+        return 1
+    fi
+done
+detokenizer=detokenizer.perl
+replace_unicode_punctuation=replace-unicode-punctuation.perl
+tokenizer=tokenizer.perl
+multi_bleu=multi-bleu.perl
+GEN=$1
+SYS=$GEN.sys
+REF=$GEN.ref
+cat $GEN | cut -f 3 > $REF
+cat $GEN | cut -f 4 > $SYS
+#detokenize the decodes file to format the manner to do tokenize
+perl $detokenizer -l de < $SYS > $SYS.dtk
+perl $detokenizer -l de < $REF > $REF.dtk
+#replace unicode
+perl $replace_unicode_punctuation -l de < $SYS.dtk > $SYS.dtk.punc
+perl $replace_unicode_punctuation -l de < $REF.dtk > $REF.dtk.punc
+#tokenize the decodes file by moses tokenizer.perl
+perl $tokenizer -l de < $SYS.dtk.punc > $SYS.dtk.punc.tok
+perl $tokenizer -l de < $REF.dtk.punc > $REF.dtk.punc.tok
+#"rich-text format" --> rich ##AT##-##AT## text format.
+perl -ple 's{(\S)-(\S)}{$1 ##AT##-##AT## $2}g' < $SYS.dtk.punc.tok > $SYS.dtk.punc.tok.atat
+perl -ple 's{(\S)-(\S)}{$1 ##AT##-##AT## $2}g' < $REF.dtk.punc.tok > $REF.dtk.punc.tok.atat
+perl $multi_bleu $REF.dtk.punc.tok.atat < $SYS.dtk.punc.tok.atat
+rm -f $SYS.dtk $SYS.dtk.punc $SYS.dtk.punc.tok $REF.dtk $REF.dtk.punc $REF.dtk.punc.tok
\ No newline at end of file
--- a/egs/wmt16/mt/run.sh
+++ b/egs/wmt16/mt/run.sh
+#! /bin/bash
+# Processing WMT16 En-De Datasets
+# Copyright 2021 Natural Language Processing Laboratory 
+# Xu Chen (xuchenneu@163.com)
+# Shell safety flags. Only -e (exit on error) and -o pipefail (a pipeline
+# fails if any command in it fails) are active; -u (undefined variable) is
+# commented out below and -x (print commands) is not set.
+set -e
+#set -u
+set -o pipefail
+export PYTHONIOENCODING=UTF-8
+# eval=1 executes the assembled commands; any other value only prints them.
+eval=1
+# Month and day (mmdd), used as the default experiment prefix.
+time=$(date "+%m%d")
+# Stages whose number lies in [stage, stop_stage] are run.
+stage=0
+stop_stage=0
+######## hardware ########
+# devices
+# Leave device empty to have the training/decoding stages pick devices
+# through get_devices from local/utils.sh.
+device=()
+gpu_num=8
+update_freq=1
+root_dir=~/st
+code_dir=${root_dir}/Fairseq-S2T
+pwd_dir=$PWD
+# dataset
+src_lang=en
+tgt_lang=de
+lang=${src_lang}-${tgt_lang}
+dataset=wmt16.en-de
+task=translation
+vocab_type=unigram
+vocab_size=32000
+share_dict=1
+lcrm=0
+tokenizer=1
+# use_specific_dict=1 copies an existing vocabulary/model from specific_dir
+# in stage 0 instead of building one with prep_mt_data.py.
+use_specific_dict=1
+# subword=1 decodes with "--post-process subword_nmt", otherwise sentencepiece.
+subword=1
+specific_prefix=subword32000_share_tok
+specific_dir=${root_dir}/data/mustc/st
+src_vocab_prefix=spm_unigram10000_st_share
+tgt_vocab_prefix=spm_unigram10000_st_share
+org_data_dir=${root_dir}/data/${dataset}
+data_dir=${root_dir}/data/${dataset}/mt
+train_subset=train
+valid_subset=dev
+trans_subset=newstest2014
+test_subset=test
+# exp
+exp_prefix=${time}
+extra_tag=
+extra_parameter=
+exp_tag=baseline
+exp_name=
+# config
+# Comma-separated list of config names; each maps to conf/<name>.yaml.
+train_config=base_s
+# training setting
+fp16=1
+max_tokens=4096
+step_valid=0
+bleu_valid=0
+# decoding setting
+sacrebleu=0
+dec_model=checkpoint_best.pt
+n_average=10
+beam_size=5
+len_penalty=1.0
+# Derive data_dir / exp_prefix / vocabulary prefixes from the settings above.
+# NOTE(review): this runs before local/parse_options.sh is sourced, so the
+# derived values reflect the defaults written in this file -- confirm that
+# this ordering is intended.
+if [[ ${use_specific_dict} -eq 1 ]]; then
+    exp_prefix=${exp_prefix}_${specific_prefix}
+    data_dir=${data_dir}/${specific_prefix}
+    mkdir -p ${data_dir}
+else
+    if [[ "${vocab_type}" == "char" ]]; then
+        vocab_name=${vocab_type}
+        exp_prefix=${exp_prefix}_${vocab_type}
+    else
+        vocab_name=${vocab_type}${vocab_size}
+    fi
+    data_dir=${data_dir}/${vocab_name}
+    src_vocab_prefix=spm_${vocab_name}_${src_lang}
+    tgt_vocab_prefix=spm_${vocab_name}_${tgt_lang}
+    if [[ $share_dict -eq 1 ]]; then
+        data_dir=${data_dir}_share
+        src_vocab_prefix=spm_${vocab_name}_share
+        tgt_vocab_prefix=spm_${vocab_name}_share
+    fi
+fi
+if [[ ${lcrm} -eq 1 ]]; then
+    data_dir=${data_dir}_lcrm
+    exp_prefix=${exp_prefix}_lcrm
+fi
+if [[ ${tokenizer} -eq 1 ]]; then
+    data_dir=${data_dir}_tok
+    exp_prefix=${exp_prefix}_tok
+fi
+# Presumably overrides the variables above from "--name value" command-line
+# options (train.sh calls this script that way) -- the parser is not shown here.
+. ./local/parse_options.sh || exit 1;
+# full path
+# Unless exp_name was given, build it as
+# <exp_prefix>_<configs joined by "_">_<exp_tag>[_<extra_tag>].
+if [[ -z ${exp_name} ]]; then
+    config_string=${train_config//,/_}
+    exp_name=${exp_prefix}_${config_string}_${exp_tag}
+    if [[ -n ${extra_tag} ]]; then
+        exp_name=${exp_name}_${extra_tag}
+    fi
+fi
+model_dir=${root_dir}/checkpoints/${dataset}/mt/${exp_name}
+# Stage -1 is a placeholder: it only prints a banner, nothing is downloaded.
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+    echo "stage -1: Data Download"
+    # pass
+fi
+# Stage 0: obtain the vocabulary, subword-encode every split and binarize the
+# result into ${data_dir}/data-bin.
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    ### Task dependent. You have to make data the following preparation part by yourself.
+    echo "stage 0: MT Data Preparation"
+    if [[ ! -e ${data_dir} ]]; then
+        mkdir -p ${data_dir}
+    fi
+    # Only (re)create the vocabulary when one of the dictionary files is missing.
+    if [[ ! -f ${data_dir}/${src_vocab_prefix}.txt || ! -f ${data_dir}/${tgt_vocab_prefix}.txt ]]; then
+        if [[ ${use_specific_dict} -eq 0 ]]; then
+            cmd="python ${code_dir}/examples/speech_to_text/prep_mt_data.py
+                --data-root ${org_data_dir}
+                --output-root ${data_dir}
+                --splits ${train_subset},${valid_subset},${trans_subset}
+                --src-lang ${src_lang}
+                --tgt-lang ${tgt_lang}
+                --vocab-type ${vocab_type}
+                --vocab-size ${vocab_size}"
+            if [[ $share_dict -eq 1 ]]; then
+                cmd="$cmd
+                --share"
+            fi
+            if [[ ${tokenizer} -eq 1 ]]; then
+                cmd="$cmd
+                --tokenizer"
+            fi
+            echo -e "\033[34mRun command: \n${cmd} \033[0m"
+            [[ $eval -eq 1 ]] && eval ${cmd}
+        else
+            # Reuse an existing vocabulary: copy every file that starts with
+            # the source/target vocabulary prefix from specific_dir.
+            cp -r ${specific_dir}/${src_vocab_prefix}.* ${data_dir}
+            cp ${specific_dir}/${tgt_vocab_prefix}.* ${data_dir}
+        fi
+    fi
+    mkdir -p ${data_dir}/data
+    # Encode the splits in parallel: each loop body runs as a background job.
+    for split in ${train_subset} ${valid_subset} ${trans_subset}; do
+    {
+        # Text may live in <split>/txt or directly in <split>.
+        if [[ -d ${org_data_dir}/data/${split}/txt ]]; then
+            text_dir=${org_data_dir}/data/${split}/txt
+        else
+            text_dir=${org_data_dir}/data/${split}
+        fi
+        src_text=${text_dir}/${split}.${src_lang}
+        tgt_text=${text_dir}/${split}.${tgt_lang}
+        # With tokenizer=1 the pre-tokenized files <split>.tok.<lang> are used.
+        if [[ ${tokenizer} -eq 1 ]]; then
+            src_text=${text_dir}/${split}.tok.${src_lang}
+            tgt_text=${text_dir}/${split}.tok.${tgt_lang}
+        fi
+        cmd="cat ${src_text}"
+        # With lcrm=1 the source side is piped through local/lower_rm.py
+        # instead of being read as is.
+        if [[ ${lcrm} -eq 1 ]]; then
+            cmd="python local/lower_rm.py ${src_text}"
+        fi
+        cmd="${cmd}
+        | spm_encode --model ${data_dir}/${src_vocab_prefix}.model
+        --output_format=piece
+        > ${data_dir}/data/${split}.${src_lang}"
+        echo -e "\033[34mRun command: \n${cmd} \033[0m"
+        [[ $eval -eq 1 ]] && eval ${cmd}
+        cmd="spm_encode
+        --model ${data_dir}/${tgt_vocab_prefix}.model
+        --output_format=piece
+        < ${tgt_text}
+        > ${data_dir}/data/${split}.${tgt_lang}"
+        echo -e "\033[34mRun command: \n${cmd} \033[0m"
+        [[ $eval -eq 1 ]] && eval ${cmd}
+    }&
+    done
+    # NOTE(review): a bare "wait" does not report the exit status of the
+    # background jobs, so a failed spm_encode is presumably only noticed by the
+    # preprocess step below -- confirm whether that is acceptable.
+    wait
+    cmd="python ${code_dir}/fairseq_cli/preprocess.py
+        --source-lang ${src_lang} --target-lang ${tgt_lang}
+        --trainpref ${data_dir}/data/${train_subset}
+        --validpref ${data_dir}/data/${valid_subset}
+        --testpref ${data_dir}/data/${trans_subset}
+        --destdir ${data_dir}/data-bin
+        --srcdict ${data_dir}/${src_vocab_prefix}.txt
+        --tgtdict ${data_dir}/${tgt_vocab_prefix}.txt
+        --workers 64"
+    echo -e "\033[34mRun command: \n${cmd} \033[0m"
+    [[ $eval -eq 1 ]] && eval ${cmd}
+fi
+# From here on data_dir refers to the binarized data used by stages 1 and 2.
+data_dir=${data_dir}/data-bin
+# Stage 1: assemble the fairseq training command, snapshot the scripts and
+# configs into the model directory, and launch training in the background
+# while following its log.
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    echo "stage 1: MT Network Training"
+    [[ ! -d ${data_dir} ]] && echo "The data dir ${data_dir} is not existing!" && exit 1;
+    # Pick devices automatically unless they were given explicitly; with
+    # gpu_num=0 no device is selected at all.
+    if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
+        if [[ ${gpu_num} -eq 0 ]]; then
+            device=""
+        else
+            source ./local/utils.sh
+            device=$(get_devices $gpu_num 0)
+        fi
+    fi
+    echo -e "dev=${device} data=${data_dir} model=${model_dir}"
+    if [[ ! -d ${model_dir} ]]; then
+        mkdir -p ${model_dir}
+    else
+        echo "${model_dir} exists."
+    fi
+    # Keep a copy of the launching scripts and configs next to the checkpoints.
+    cp ${BASH_SOURCE[0]} ${model_dir}
+    cp ${PWD}/train.sh ${model_dir}
+    extra_parameter="${extra_parameter}
+        --train-config ${pwd_dir}/conf/basis.yaml"
+    cp ${pwd_dir}/conf/basis.yaml ${model_dir}
+    # Every name in train_config adds one "--train-config<N> conf/<name>.yaml"
+    # option, numbered from 1 in the order given.
+    config_list="${train_config//,/ }"
+    idx=1
+    for config in ${config_list[@]}
+    do
+        config_path=${pwd_dir}/conf/${config}.yaml
+        if [[ ! -f ${config_path} ]]; then
+            echo "No config file ${config_path}"
+            # Report the failure to the caller: a bare "exit" would return the
+            # status of the echo above, i.e. success.
+            exit 1
+        fi
+        cp ${config_path} ${model_dir}
+        extra_parameter="${extra_parameter}
+        --train-config${idx} ${config_path}"
+        idx=$((idx + 1))
+    done
+    cmd="python3 -u ${code_dir}/fairseq_cli/train.py
+        ${data_dir}
+        --source-lang ${src_lang}
+        --target-lang ${tgt_lang}
+        --task ${task}
+        --max-tokens ${max_tokens}
+        --skip-invalid-size-inputs-valid-test
+        --update-freq ${update_freq}
+        --log-interval 100
+        --save-dir ${model_dir}
+        --tensorboard-logdir ${model_dir}"
+    if [[ -n ${extra_parameter} ]]; then
+        cmd="${cmd}
+        ${extra_parameter}"
+    fi
+    if [[ ${gpu_num} -gt 0 ]]; then
+        cmd="${cmd}
+        --distributed-world-size $gpu_num
+        --ddp-backend no_c10d"
+    fi
+    if [[ $fp16 -eq 1 ]]; then
+        cmd="${cmd}
+        --fp16"
+    fi
+    # step_valid=1 switches on per-epoch validation/saving plus update-based
+    # checkpoints through the variables consumed below.
+    if [[ $step_valid -eq 1 ]]; then
+        validate_interval=1
+        save_interval=1
+        no_epoch_checkpoints=0
+        save_interval_updates=500
+        keep_interval_updates=10
+    fi
+    if [[ $bleu_valid -eq 1 ]]; then
+        cmd="$cmd
+        --eval-bleu
+        --eval-bleu-args '{\"beam\": 1}'
+        --eval-tokenized-bleu
+        --eval-bleu-remove-bpe
+        --best-checkpoint-metric bleu
+        --maximize-best-checkpoint-metric"
+    fi
+    if [[ -n $no_epoch_checkpoints && $no_epoch_checkpoints -eq 1 ]]; then
+        cmd="$cmd
+        --no-epoch-checkpoints"
+    fi
+    if [[ -n $validate_interval ]]; then
+        cmd="${cmd}
+        --validate-interval $validate_interval "
+    fi
+    if [[ -n $save_interval ]]; then
+        cmd="${cmd}
+        --save-interval $save_interval "
+    fi
+    if [[ -n $save_interval_updates ]]; then
+        cmd="${cmd}
+        --save-interval-updates $save_interval_updates"
+        if [[ -n $keep_interval_updates ]]; then
+        cmd="${cmd}
+        --keep-interval-updates $keep_interval_updates"
+        fi
+    fi
+    echo -e "\033[34mRun command: \n${cmd} \033[0m"
+    # save info
+    # Append this run to ./history.log and keep only its last 50 lines.
+    log=./history.log
+    echo "${time} | ${device} | ${data_dir} | ${exp_name} | ${model_dir} " >> $log
+    tail -n 50 ${log} > tmp.log
+    mv tmp.log $log
+    export CUDA_VISIBLE_DEVICES=${device}
+    # Never overwrite an existing train.log: use the first free
+    # train.log.<i> (i = 1..100) instead.
+    log=${model_dir}/train.log
+    if [[ -e ${log} ]]; then
+        for i in `seq 1 100`; do
+            if [ ! -e ${log}.${i} ]; then
+                log=${log}.${i}
+                break
+            fi
+        done
+    fi
+    # Training runs detached in the background with all output going to the log.
+    cmd="nohup ${cmd} >> ${log} 2>&1 &"
+    if [[ $eval -eq 1 ]]; then
+        eval $cmd
+        sleep 2s
+        # Show the log from its beginning and keep following it.
+        tail -n "$(wc -l ${log} | awk '{print $1+1}')" -f ${log}
+    fi
+    wait
+    echo -e " >> finish training \n"
+fi
+# Stage 2: optionally average the best checkpoints, decode every subset in
+# test_subset and score the output with local/wmt_en2de_multi_bleu.sh.
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    echo "stage 2: MT Decoding"
+    # With n_average=1 the configured dec_model is decoded as is.
+    if [[ ${n_average} -ne 1 ]]; then
+        # Average models
+        dec_model=avg_${n_average}_checkpoint.pt
+        if [[ ! -f ${model_dir}/${dec_model} ]]; then
+            cmd="python ${code_dir}/scripts/average_checkpoints.py
+            --inputs ${model_dir}
+            --num-best-checkpoints ${n_average}
+            --output ${model_dir}/${dec_model}"
+            echo -e "\033[34mRun command: \n${cmd} \033[0m"
+            [[ $eval -eq 1 ]] && eval $cmd
+        fi
+    fi
+    # Pick devices automatically unless they were given explicitly.
+    if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
+        if [[ ${gpu_num} -eq 0 ]]; then
+            device=""
+        else
+            source ./local/utils.sh
+            device=$(get_devices $gpu_num 0)
+        fi
+    fi
+    export CUDA_VISIBLE_DEVICES=${device}
+    # Make the scripts in local/ reachable for the scoring script. Done once
+    # here so PATH does not grow with every decoded subset.
+    export PATH=$PATH:${pwd_dir}/local
+    # Start from a fresh result file.
+    result_file=${model_dir}/decode_result
+    rm -f ${result_file}
+    test_subset=(${test_subset//,/ })
+    for subset in ${test_subset[@]}; do
+        cmd="python ${code_dir}/fairseq_cli/generate.py
+        ${data_dir}
+        --source-lang ${src_lang}
+        --target-lang ${tgt_lang}
+        --gen-subset ${subset}
+        --task ${task}
+        --path ${model_dir}/${dec_model}
+        --results-path ${model_dir}
+        --max-tokens ${max_tokens}
+        --beam ${beam_size}
+        --lenpen ${len_penalty}"
+        if [[ ${subword} -eq 1 ]]; then
+            cmd="${cmd}
+        --post-process subword_nmt"
+        else
+            cmd="${cmd}
+        --post-process sentencepiece"
+        fi
+        if [[ ${sacrebleu} -eq 1 ]]; then
+            cmd="${cmd}
+        --scoring sacrebleu"
+            if [[ ${tokenizer} -eq 1 ]]; then
+                cmd="${cmd}
+        --tokenizer moses
+        --moses-source-lang ${src_lang}
+        --moses-target-lang ${tgt_lang}"
+            fi
+        fi
+        echo -e "\033[34mRun command: \n${cmd} \033[0m"
+        if [[ $eval -eq 1 ]]; then
+            eval $cmd
+            # The last line of the generate output is collected as the summary.
+            tail -n 1 ${model_dir}/generate-${subset}.txt >> ${result_file}
+            # The scoring script uses bash arrays, so it must be run with bash;
+            # "sh" fails with a syntax error where /bin/sh is not bash.
+            bash local/wmt_en2de_multi_bleu.sh ${model_dir}/translation-${subset}.txt
+        fi
+    done
+    # Nothing is written in print-only mode (eval != 1); skip the summary then
+    # instead of aborting on a missing file under "set -e".
+    if [[ -f ${result_file} ]]; then
+        cat ${result_file}
+    fi
+fi
--- a/egs/wmt16/mt/train.sh
+++ b/egs/wmt16/mt/train.sh
+#! /bin/bash
+# training the model
+gpu_num=8
+update_freq=1
+max_tokens=8192
+exp_tag=baseline
+config_list=(base)
+# exp full name
+exp_name=
+extra_tag=
+extra_parameter=
+#extra_tag="${extra_tag}"
+#extra_parameter="${extra_parameter} "
+train_config=$(echo ${config_list[*]} | sed 's/ /,/g')
+cmd="./run.sh
+    --stage 1
+    --stop_stage 1
+    --gpu_num ${gpu_num}
+    --update_freq ${update_freq}
+    --train_config ${train_config}
+    --max_tokens ${max_tokens}
+    "
+if [[ -n ${exp_name} ]]; then
+    cmd="$cmd --exp_name ${exp_name}"
+fi
+if [[ -n ${exp_tag} ]]; then
+    cmd="$cmd --exp_tag ${exp_tag}"
+fi
+if [[ -n ${extra_tag} ]]; then
+    cmd="$cmd --extra_tag ${extra_tag}"
+fi
+if [[ -n ${extra_parameter} ]]; then
+    cmd="$cmd --extra_parameter \"${extra_parameter}\""
+fi
+echo ${cmd}
+eval ${cmd}
--- a/examples/speech_to_text/prep_audio_data.py
+++ b/examples/speech_to_text/prep_audio_data.py
@@ -95,7 +95,7 @@ class AudioDataset(Dataset):
                continue
            txt_path = txt_root / f"{split}.{_lang}"
            if tokenizer:
-                txt_path = txt_root / f"{split}.{_lang}.tok"
+                txt_path = txt_root / f"{split}.tok.{_lang}"
            if Path.exists(txt_path):
                if _lang == src_lang:

--- a/examples/speech_to_text/prep_mt_data.py
+++ b/examples/speech_to_text/prep_mt_data.py
@@ -32,14 +32,18 @@ class MTDataset(Dataset):
    utterance_id
    """
-    def __init__(self, root: str, src_lang, tgt_lang: str, split: str) -> None:
+    def __init__(self, root: str, src_lang, tgt_lang: str, split: str, tokenizer: bool = False) -> None:
        _root = Path(root) / "data" / split
        txt_root = _root / "txt" if (_root / "txt").is_dir() else _root
        assert _root.is_dir() and txt_root.is_dir(), (_root, txt_root)
        # Load source and target text
        self.data = []
        for _lang in [src_lang, tgt_lang]:
-            with open(txt_root / f"{split}.{_lang}") as f:
+            txt_path = txt_root / f"{split}.{_lang}"
+            if tokenizer:
+                txt_path = txt_root / f"{split}.tok.{_lang}"
+            with open(txt_path) as f:
                texts = [r.strip() for r in f]
                self.data.append(texts)
        self.data = list(zip(self.data[0], self.data[1]))
@@ -72,7 +76,7 @@ def process(args):
        is_train_split = split.startswith("train")
        manifest = {c: [] for c in MANIFEST_COLUMNS}
-        dataset = MTDataset(args.data_root, src_lang, tgt_lang, split)
+        dataset = MTDataset(args.data_root, src_lang, tgt_lang, split, args.tokenizer)
        for src_text, tgt_text in tqdm(dataset):
            if args.lowercase_src:
                src_text = src_text.lower()
@@ -165,6 +169,7 @@ def main():
    parser.add_argument("--src-lang", required=True, type=str)
    parser.add_argument("--tgt-lang", required=True, type=str)
    parser.add_argument("--share", action="store_true", help="share the source and target vocabulary")
+    parser.add_argument("--tokenizer", action="store_true", help="use tokenizer txt")
    args = parser.parse_args()
    process(args)

--- a/fairseq/criterions/label_smoothed_cross_entropy_with_ctc.py
+++ b/fairseq/criterions/label_smoothed_cross_entropy_with_ctc.py
@@ -101,12 +101,6 @@ class LabelSmoothedCrossEntropyCriterionWithCTC(
    def compute_ctc_loss(self, model, sample, encoder_out, logging_output):
        transcript = sample["transcript"]
-        ctc_logit = encoder_out["ctc_logit"][0]
-        lprobs = model.get_normalized_probs(
-            [ctc_logit], log_probs=True
-        ).contiguous()  # (T, B, C) from the encoder
-        lprobs.batch_first = False
        if "ctc_padding_mask" in encoder_out:
            non_padding_mask = ~encoder_out["ctc_padding_mask"][0]
        else:
@@ -119,15 +113,27 @@ class LabelSmoothedCrossEntropyCriterionWithCTC(
        targets_flat = transcript["tokens"].masked_select(pad_mask)
        transcript_lengths = pad_mask.sum(-1)
-        with torch.backends.cudnn.flags(enabled=False):
+        ctc_loss = 0
-            loss = self.ctc_loss(
+        ctc_num = len(encoder_out["ctc_logit"])
-                lprobs,
+        assert ctc_num != 0, "No ctc logit for loss!"
-                targets_flat,
+        for i in range(ctc_num):
-                input_lengths,
-                transcript_lengths,
+            ctc_logit = encoder_out["ctc_logit"][0]
-            )
+            lprobs = model.get_normalized_probs(
+                [ctc_logit], log_probs=True
-        logging_output["ctc_loss"] = utils.item(loss.data)
+            ).contiguous()  # (T, B, C) from the encoder
+            lprobs.batch_first = False
+            with torch.backends.cudnn.flags(enabled=False):
+                loss = self.ctc_loss(
+                    lprobs,
+                    targets_flat,
+                    input_lengths,
+                    transcript_lengths,
+                )
+            ctc_loss += loss
+        ctc_loss /= ctc_num
+        logging_output["ctc_loss"] = utils.item(ctc_loss.data)
        if not model.training:
            import editdistance
@@ -142,7 +148,7 @@ class LabelSmoothedCrossEntropyCriterionWithCTC(
                wv_errs = 0
                for lp, t, inp_l in zip(
                    lprobs_t,
-                    sample["target_label"] if "target_label" in sample else sample["target"],
+                    sample["transcript"]["tokens"] if "transcript" in sample else sample["target"],
                    input_lengths,
                ):
                    lp = lp[:inp_l].unsqueeze(0)
@@ -183,7 +189,7 @@ class LabelSmoothedCrossEntropyCriterionWithCTC(
                logging_output["c_errors"] = c_err
                logging_output["c_total"] = c_len
-        return loss, logging_output
+        return ctc_loss, logging_output
    @staticmethod
    def reduce_metrics(logging_outputs) -> None:

--- a/fairseq/models/dlcl_transformer.py
+++ b/fairseq/models/dlcl_transformer.py
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
-from typing import Any, Dict, List, Optional, Tuple
-import logging
-import torch
-from fairseq import checkpoint_utils, utils
-from fairseq.models import (
-    register_model,
-    register_model_architecture,
-)
-from fairseq.models.transformer import (
-    TransformerModel,
-    TransformerEncoder,
-    TransformerDecoder
-)
-from fairseq.modules.layer_history import CreateLayerHistory
-from torch import Tensor
-DEFAULT_MAX_SOURCE_POSITIONS = 1024
-DEFAULT_MAX_TARGET_POSITIONS = 1024
-logger = logging.getLogger(__name__)
-# Model registered under the name "dlcl_transformer". It reuses
-# TransformerModel and only swaps in DLCLTransformerEncoder /
-# DLCLTransformerDecoder, adds two integration-type options, and can
-# initialise either component from a pretrained checkpoint.
-# NOTE(review): the class docstring below still describes the plain
-# Transformer; it says nothing about the layer-history mechanism.
-@register_model("dlcl_transformer")
-class DLCLTransformerModel(TransformerModel):
-    """
-    Transformer model from `"Attention Is All You Need" (Vaswani, et al, 2017)
-    <https://arxiv.org/abs/1706.03762>`_.
-    Args:
-        encoder (TransformerEncoder): the encoder
-        decoder (TransformerDecoder): the decoder
-    The Transformer model provides the following named architectures and
-    command-line arguments:
-    .. argparse::
-        :ref: fairseq.models.dlcl_transformer_parser
-        :prog:
-    """
-    def __init__(self, args, encoder, decoder):
-        super().__init__(args, encoder, decoder)
-    @staticmethod
-    def add_args(parser):
-        """Add model-specific arguments to the parser."""
-        # Start from every TransformerModel option, then add the two
-        # integration-type choices ('avg' or 'sum').
-        TransformerModel.add_args(parser)
-        # dense layer parameters
-        # parser.add_argument('--encoder-history-type',
-        #                     default="learnable_dense",
-        #                     help='encoder layer history type')
-        # parser.add_argument('--decoder-history-type',
-        #                     default="learnable_dense",
-        #                     help='decoder layer history type')
-        parser.add_argument('--encoder-integration-type', choices=['avg', 'sum'],
-                            help='encoder layer integration type')
-        parser.add_argument('--decoder-integration-type', choices=['avg', 'sum'],
-                            help='decoder layer integration type')
-    @classmethod
-    def build_encoder(cls, args, src_dict, embed_tokens):
-        # Build the encoder; when args.load_pretrained_encoder_from is set,
-        # load that checkpoint into it with strict=False.
-        encoder = DLCLTransformerEncoder(args, src_dict, embed_tokens)
-        if getattr(args, "load_pretrained_encoder_from", None):
-            # The message is logged before the load actually happens.
-            logger.info(
-                f"loaded pretrained encoder from: "
-                f"{args.load_pretrained_encoder_from}"
-            )
-            encoder = checkpoint_utils.load_pretrained_component_from_model(
-                component=encoder, checkpoint=args.load_pretrained_encoder_from, strict=False
-            )
-        return encoder
-    @classmethod
-    def build_decoder(cls, args, tgt_dict, embed_tokens):
-        # Build the decoder; encoder attention is disabled when
-        # args.no_cross_attention is set. A checkpoint named by
-        # args.load_pretrained_decoder_from is loaded with strict=False.
-        decoder = DLCLTransformerDecoder(
-            args,
-            tgt_dict,
-            embed_tokens,
-            no_encoder_attn=getattr(args, "no_cross_attention", False),
-        )
-        if getattr(args, "load_pretrained_decoder_from", None):
-            # The message is logged before the load actually happens.
-            logger.info(
-                f"loaded pretrained decoder from: "
-                f"{args.load_pretrained_decoder_from}"
-            )
-            decoder = checkpoint_utils.load_pretrained_component_from_model(
-                component=decoder, checkpoint=args.load_pretrained_decoder_from, strict=False
-            )
-        return decoder
-# TransformerEncoder variant that routes layer inputs through a layer-history
-# object: the embedding and every layer output are added to the history, and
-# each layer (and the final output) reads its input via history.pop().
-# How the stored states are combined is decided by the history object returned
-# by CreateLayerHistory, which is not shown here.
-class DLCLTransformerEncoder(TransformerEncoder):
-    """
-    Transformer encoder consisting of *args.encoder_layers* layers. Each layer
-    is a :class:`TransformerEncoderLayer`.
-    Args:
-        args (argparse.Namespace): parsed command-line arguments
-        dictionary (~fairseq.data.Dictionary): encoding dictionary
-        embed_tokens (torch.nn.Embedding): input embedding
-    """
-    def __init__(self, args, dictionary, embed_tokens):
-        # args is stored before the base initializer runs.
-        self.args = args
-        super().__init__(args, dictionary, embed_tokens)
-        # NOTE(review): forward_scriptable checks "self.history is not None",
-        # so CreateLayerHistory presumably may return None -- confirm.
-        self.history = CreateLayerHistory(args, is_encoder=True)
-    def forward(
-        self,
-        src_tokens,
-        src_lengths: Optional[torch.Tensor] = None,
-        return_all_hiddens: bool = False,
-        token_embeddings: Optional[torch.Tensor] = None,
-    ):
-        """
-        Args:
-            src_tokens (LongTensor): tokens in the source language of shape
-                `(batch, src_len)`
-            src_lengths (torch.LongTensor): lengths of each source sentence of
-                shape `(batch)`
-            return_all_hiddens (bool, optional): also return all of the
-                intermediate hidden states (default: False).
-            token_embeddings (torch.Tensor, optional): precomputed embeddings
-                default `None` will recompute embeddings
-        Returns:
-            dict:
-                - **encoder_out** (Tensor): the last encoder layer's output of
-                  shape `(src_len, batch, embed_dim)`
-                - **encoder_padding_mask** (ByteTensor): the positions of
-                  padding elements of shape `(batch, src_len)`
-                - **encoder_embedding** (Tensor): the (scaled) embedding lookup
-                  of shape `(batch, src_len, embed_dim)`
-                - **encoder_states** (List[Tensor]): all intermediate
-                  hidden states of shape `(src_len, batch, embed_dim)`.
-                  Only populated if *return_all_hiddens* is True.
-        """
-        # Pure delegation; the arguments are passed through unchanged.
-        return self.forward_scriptable(src_tokens,
-                                       src_lengths,
-                                       return_all_hiddens,
-                                       token_embeddings)
-    # TorchScript doesn't support super() method so that the scriptable Subclass
-    # can't access the base class model in Torchscript.
-    # Current workaround is to add a helper function with different name and
-    # call the helper function from scriptable Subclass.
-    def forward_scriptable(
-        self,
-        src_tokens,
-        src_lengths: Optional[torch.Tensor] = None,
-        return_all_hiddens: bool = False,
-        token_embeddings: Optional[torch.Tensor] = None,
-    ):
-        """
-        Args:
-            src_tokens (LongTensor): tokens in the source language of shape
-                `(batch, src_len)`
-            src_lengths (torch.LongTensor): lengths of each source sentence of
-                shape `(batch)`
-            return_all_hiddens (bool, optional): also return all of the
-                intermediate hidden states (default: False).
-            token_embeddings (torch.Tensor, optional): precomputed embeddings
-                default `None` will recompute embeddings
-        Returns:
-            dict:
-                - **encoder_out** (Tensor): the last encoder layer's output of
-                  shape `(src_len, batch, embed_dim)`
-                - **encoder_padding_mask** (ByteTensor): the positions of
-                  padding elements of shape `(batch, src_len)`
-                - **encoder_embedding** (Tensor): the (scaled) embedding lookup
-                  of shape `(batch, src_len, embed_dim)`
-                - **encoder_states** (List[Tensor]): all intermediate
-                  hidden states of shape `(src_len, batch, embed_dim)`.
-                  Only populated if *return_all_hiddens* is True.
-        """
-        # Reset the history so states from a previous call are not reused.
-        if self.history is not None:
-            self.history.clean()
-        # compute padding mask
-        encoder_padding_mask = src_tokens.eq(self.padding_idx)
-        # The mask is only handed to the layers on XLA devices or when the
-        # batch actually contains padding.
-        has_pads = (src_tokens.device.type == "xla" or encoder_padding_mask.any())
-        x, encoder_embedding = self.forward_embedding(src_tokens, token_embeddings)
-        # account for padding while computing the representation
-        if encoder_padding_mask is not None:
-            x = x * (1 - encoder_padding_mask.unsqueeze(-1).type_as(x))
-        # B x T x C -> T x B x C
-        x = x.transpose(0, 1)
-        encoder_states = []
-        if return_all_hiddens:
-            encoder_states.append(x)
-        # add emb into history
-        if self.history is not None:
-            self.history.add(x)
-        # encoder layers
-        for layer in self.layers:
-            # The layer input comes from the history rather than directly from
-            # the previous layer's output.
-            if self.history is not None:
-                x = self.history.pop()
-            x = layer(
-                x, encoder_padding_mask=encoder_padding_mask if has_pads else None
-            )
-            if return_all_hiddens:
-                assert encoder_states is not None
-                encoder_states.append(x)
-            if self.history is not None:
-                self.history.add(x)
-        # The final representation is also read from the history, then
-        # normalized if the encoder has a final layer norm.
-        if self.history is not None:
-            x = self.history.pop()
-        if self.layer_norm is not None:
-            x = self.layer_norm(x)
-        # The Pytorch Mobile lite interpreter does not supports returning NamedTuple in
-        # `forward` so we use a dictionary instead.
-        # TorchScript does not support mixed values so the values are all lists.
-        # The empty list is equivalent to None.
-        return {
-            "encoder_out": [x],  # T x B x C
-            "encoder_padding_mask": [encoder_padding_mask],  # B x T
-            "encoder_embedding": [encoder_embedding],  # B x T x C
-            "encoder_states": encoder_states,  # List[T x B x C]
-            "src_tokens": [],
-            "src_lengths": [],
-        }
-class DLCLTransformerDecoder(TransformerDecoder):
-    """
-    Transformer decoder consisting of *args.decoder_layers* layers. Each layer
-    is a :class:`TransformerDecoderLayer`.
-    Args:
-        args (argparse.Namespace): parsed command-line arguments
-        dictionary (~fairseq.data.Dictionary): decoding dictionary
-        embed_tokens (torch.nn.Embedding): output embedding
-        no_encoder_attn (bool, optional): whether to attend to encoder outputs
-            (default: False).
-    """
-    def __init__(self, args, dictionary, embed_tokens, no_encoder_attn=False):
-        self.args = args
-        super().__init__(args, dictionary, embed_tokens, no_encoder_attn)
-        self.history = CreateLayerHistory(args, is_encoder=False)
-    def forward(
-        self,
-        prev_output_tokens,
-        encoder_out: Optional[Dict[str, List[Tensor]]] = None,
-        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
-        features_only: bool = False,
-        full_context_alignment: bool = False,
-        alignment_layer: Optional[int] = None,
-        alignment_heads: Optional[int] = None,
-        src_lengths: Optional[Any] = None,
-        return_all_hiddens: bool = False,
-    ):
-        """
-        Args:
-            prev_output_tokens (LongTensor): previous decoder outputs of shape
-                `(batch, tgt_len)`, for teacher forcing
-            encoder_out (optional): output from the encoder, used for
-                encoder-side attention
-            incremental_state (dict): dictionary used for storing state during
-                :ref:`Incremental decoding`
-            features_only (bool, optional): only return features without
-                applying output layer (default: False).
-            full_context_alignment (bool, optional): don't apply
-                auto-regressive mask to self-attention (default: False).
-        Returns:
-            tuple:
-                - the decoder's output of shape `(batch, tgt_len, vocab)`
-                - a dictionary with any model-specific outputs
-        """
-        x, extra = self.extract_features(
-            prev_output_tokens,
-            encoder_out=encoder_out,
-            incremental_state=incremental_state,
-            full_context_alignment=full_context_alignment,
-            alignment_layer=alignment_layer,
-            alignment_heads=alignment_heads,
-        )
-        if not features_only:
-            x = self.output_layer(x)
-        return x, extra
-    def extract_features(
-        self,
-        prev_output_tokens,
-        encoder_out: Optional[Dict[str, List[Tensor]]],
-        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
-        full_context_alignment: bool = False,
-        alignment_layer: Optional[int] = None,
-        alignment_heads: Optional[int] = None,
-    ):
-        return self.extract_features_scriptable(
-            prev_output_tokens,
-            encoder_out,
-            incremental_state,
-            full_context_alignment,
-            alignment_layer,
-            alignment_heads,
-        )
-    """
-    A scriptable subclass of this class has an extract_features method and calls
-    super().extract_features, but super() is not supported in torchscript. A copy of
-    this function is made to be used in the subclass instead.
-    """
-    def extract_features_scriptable(
-        self,
-        prev_output_tokens,
-        encoder_out: Optional[Dict[str, List[Tensor]]],
-        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
-        full_context_alignment: bool = False,
-        alignment_layer: Optional[int] = None,
-        alignment_heads: Optional[int] = None,
-    ):
-        """
-        Similar to *forward* but only return features.
-        Includes several features from "Jointly Learning to Align and
-        Translate with Transformer Models" (Garg et al., EMNLP 2019).
-        Args:
-            full_context_alignment (bool, optional): don't apply
-                auto-regressive mask to self-attention (default: False).
-            alignment_layer (int, optional): return mean alignment over
-                heads at this layer (default: last layer).
-            alignment_heads (int, optional): only average alignment over
-                this many heads (default: all heads).
-        Returns:
-            tuple:
-                - the decoder's features of shape `(batch, tgt_len, embed_dim)`
-                - a dictionary with any model-specific outputs
-        """
-        if self.history is not None:
-            self.history.clean()
-        if alignment_layer is None:
-            alignment_layer = self.num_layers - 1
-        # embed positions
-        positions = None
-        if self.embed_positions is not None:
-            positions = self.embed_positions(
-                prev_output_tokens, incremental_state=incremental_state
-            )
-        if incremental_state is not None:
-            prev_output_tokens = prev_output_tokens[:, -1:]
-            if positions is not None:
-                positions = positions[:, -1:]
-        # embed tokens and positions
-        x = self.embed_scale * self.embed_tokens(prev_output_tokens)
-        if self.quant_noise is not None:
-            x = self.quant_noise(x)
-        if self.project_in_dim is not None:
-            x = self.project_in_dim(x)
-        if positions is not None and self.attn_type != "rel_selfattn":
-            x += positions
-        if self.layernorm_embedding is not None:
-            x = self.layernorm_embedding(x)
-        x = self.dropout_module(x)
-        # B x T x C -> T x B x C
-        x = x.transpose(0, 1)
-        # add emb into history
-        if self.history is not None:
-            self.history.add(x)
-        self_attn_padding_mask: Optional[Tensor] = None
-        if self.cross_self_attention or prev_output_tokens.eq(self.padding_idx).any():
-            self_attn_padding_mask = prev_output_tokens.eq(self.padding_idx)
-        # decoder layers
-        attn: Optional[Tensor] = None
-        inner_states: List[Optional[Tensor]] = [x]
-        for idx, layer in enumerate(self.layers):
-            if incremental_state is None and not full_context_alignment:
-                self_attn_mask = self.buffered_future_mask(x)
-            else:
-                self_attn_mask = None
-            if self.history is not None:
-                x = self.history.pop()
-            x, layer_attn, _ = layer(
-                x,
-                encoder_out["encoder_out"][0]
-                if (encoder_out is not None and len(encoder_out["encoder_out"]) > 0)
-                else None,
-                encoder_out["encoder_padding_mask"][0]
-                if (
-                    encoder_out is not None
-                    and len(encoder_out["encoder_padding_mask"]) > 0
-                )
-                else None,
-                incremental_state,
-                self_attn_mask=self_attn_mask,
-                self_attn_padding_mask=self_attn_padding_mask,
-                need_attn=bool((idx == alignment_layer)),
-                need_head_weights=bool((idx == alignment_layer)),
-                pos_emb=positions
-            )
-            inner_states.append(x)
-            if self.history is not None:
-                self.history.add(x)
-            if layer_attn is not None and idx == alignment_layer:
-                attn = layer_attn.float().to(x)
-        if attn is not None:
-            if alignment_heads is not None:
-                attn = attn[:alignment_heads]
-            # average probabilities over heads
-            attn = attn.mean(dim=0)
-        if self.history is not None:
-            x = self.history.pop()
-        if self.layer_norm is not None:
-            x = self.layer_norm(x)
-        # T x B x C -> B x T x C
-        x = x.transpose(0, 1)
-        if self.project_out_dim is not None:
-            x = self.project_out_dim(x)
-        return x, {"attn": [attn], "inner_states": inner_states}
-@register_model_architecture("dlcl_transformer", "dlcl_transformer_tiny")
-def tiny_architecture(args):
-    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 64)
-    args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 64)
-    args.encoder_layers = getattr(args, "encoder_layers", 2)
-    args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 2)
-    args.decoder_layers = getattr(args, "decoder_layers", 2)
-    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 2)
-    return base_architecture(args)
-@register_model_architecture("dlcl_transformer", "dlcl_transformer")
-def base_architecture(args):
-    args.encoder_embed_path = getattr(args, "encoder_embed_path", None)
-    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)
-    args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048)
-    args.encoder_layers = getattr(args, "encoder_layers", 6)
-    args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8)
-    args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False)
-    args.encoder_learned_pos = getattr(args, "encoder_learned_pos", False)
-    args.decoder_embed_path = getattr(args, "decoder_embed_path", None)
-    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim)
-    args.decoder_ffn_embed_dim = getattr(
-        args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim
-    )
-    args.decoder_layers = getattr(args, "decoder_layers", 6)
-    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8)
-    args.decoder_normalize_before = getattr(args, "decoder_normalize_before", False)
-    args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False)
-    args.attention_dropout = getattr(args, "attention_dropout", 0.0)
-    args.activation_dropout = getattr(args, "activation_dropout", 0.0)
-    args.activation_fn = getattr(args, "activation_fn", "relu")
-    args.dropout = getattr(args, "dropout", 0.1)
-    args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None)
-    args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0)
-    args.share_decoder_input_output_embed = getattr(
-        args, "share_decoder_input_output_embed", False
-    )
-    args.share_all_embeddings = getattr(args, "share_all_embeddings", False)
-    args.no_token_positional_embeddings = getattr(
-        args, "no_token_positional_embeddings", False
-    )
-    args.adaptive_input = getattr(args, "adaptive_input", False)
-    args.no_cross_attention = getattr(args, "no_cross_attention", False)
-    args.cross_self_attention = getattr(args, "cross_self_attention", False)
-    args.decoder_output_dim = getattr(
-        args, "decoder_output_dim", args.decoder_embed_dim
-    )
-    args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim)
-    args.no_scale_embedding = getattr(args, "no_scale_embedding", False)
-    args.layernorm_embedding = getattr(args, "layernorm_embedding", False)
-    args.tie_adaptive_weights = getattr(args, "tie_adaptive_weights", False)
-    args.checkpoint_activations = getattr(args, "checkpoint_activations", False)
-    args.offload_activations = getattr(args, "offload_activations", False)
-    if args.offload_activations:
-        args.checkpoint_activations = True
-    args.encoder_layers_to_keep = getattr(args, "encoder_layers_to_keep", None)
-    args.decoder_layers_to_keep = getattr(args, "decoder_layers_to_keep", None)
-    args.encoder_layerdrop = getattr(args, "encoder_layerdrop", 0)
-    args.decoder_layerdrop = getattr(args, "decoder_layerdrop", 0)
-    args.quant_noise_pq = getattr(args, "quant_noise_pq", 0)
-    args.quant_noise_pq_block_size = getattr(args, "quant_noise_pq_block_size", 8)
-    args.quant_noise_scalar = getattr(args, "quant_noise_scalar", 0)
-    args.encoder_history_type = getattr(args, 'encoder_history_type', 'learnable_dense')
-    args.decoder_history_type = getattr(args, 'decoder_history_type', 'learnable_dense')
-    args.encoder_integration_type = getattr(args, 'encoder_integration_type', 'avg')
-    args.decoder_integration_type = getattr(args, 'decoder_integration_type', 'avg')
-    args.max_encoder_relative_length = getattr(args, 'max_encoder_relative_length', -1)
-    args.max_decoder_relative_length = getattr(args, 'max_decoder_relative_length', -1)
-    args.k_only = getattr(args, 'k_only', True)
-@register_model_architecture("dlcl_transformer", "dlcl_transformer_relative")
-def dlcl_transformer_relative(args):
-    args.max_encoder_relative_length = 20
-    args.max_decoder_relative_length = 20
-    args.k_only = True
-    base_architecture(args)
-@register_model_architecture("dlcl_transformer", "dlcl_transformer_iwslt_de_en")
-def dlcl_transformer_iwslt_de_en(args):
-    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)
-    args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 1024)
-    args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 4)
-    args.encoder_layers = getattr(args, "encoder_layers", 6)
-    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 512)
-    args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 1024)
-    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 4)
-    args.decoder_layers = getattr(args, "decoder_layers", 6)
-    base_architecture(args)
-@register_model_architecture("dlcl_transformer", "dlcl_transformer_wmt_en_de")
-def dlcl_transformer_wmt_en_de(args):
-    base_architecture(args)
-# parameters used in the "Attention Is All You Need" paper (Vaswani et al., 2017)
-@register_model_architecture("dlcl_transformer", "dlcl_transformer_vaswani_wmt_en_de_big")
-def dlcl_transformer_vaswani_wmt_en_de_big(args):
-    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024)
-    args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096)
-    args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16)
-    args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False)
-    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1024)
-    args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 4096)
-    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16)
-    args.dropout = getattr(args, "dropout", 0.3)
-    base_architecture(args)
-@register_model_architecture("dlcl_transformer", "dlcl_transformer_vaswani_wmt_en_fr_big")
-def dlcl_transformer_vaswani_wmt_en_fr_big(args):
-    args.dropout = getattr(args, "dropout", 0.1)
-    dlcl_transformer_vaswani_wmt_en_de_big(args)
-@register_model_architecture("dlcl_transformer", "dlcl_transformer_wmt_en_de_big")
-def dlcl_transformer_wmt_en_de_big(args):
-    args.attention_dropout = getattr(args, "attention_dropout", 0.1)
-    dlcl_transformer_vaswani_wmt_en_de_big(args)
-# default parameters used in tensor2tensor implementation
-@register_model_architecture("dlcl_transformer", "dlcl_transformer_wmt_en_de_big_t2t")
-def dlcl_transformer_wmt_en_de_big_t2t(args):
-    args.encoder_normalize_before = getattr(args, "encoder_normalize_before", True)
-    args.decoder_normalize_before = getattr(args, "decoder_normalize_before", True)
-    args.attention_dropout = getattr(args, "attention_dropout", 0.1)
-    args.activation_dropout = getattr(args, "activation_dropout", 0.1)
-    dlcl_transformer_vaswani_wmt_en_de_big(args)
--- a/fairseq/models/speech_to_text/__init__.py
+++ b/fairseq/models/speech_to_text/__init__.py
@@ -7,6 +7,7 @@ from .berard import *  # noqa
 from .ctc import *  # noqa
 from .convtransformer import *  # noqa
 from .s2t_transformer import *  # noqa
+from .multi_ctc_s2t_transformer import *  # noqa
 from .s2t_conformer import *  # noqa
 from .pdss2t_transformer import *  # noqa
 from .s2t_sate import *  # noqa
--- a/fairseq/models/speech_to_text/multi_ctc_s2t_transformer.py
+++ b/fairseq/models/speech_to_text/multi_ctc_s2t_transformer.py
+#!/usr/bin/env python3
+import logging
+import torch
+import torch.nn as nn
+from fairseq import checkpoint_utils, utils
+from fairseq.data.data_utils import lengths_to_padding_mask
+from fairseq.models import (
+    register_model,
+    register_model_architecture,
+)
+from fairseq.models.transformer import Embedding
+from fairseq.models.speech_to_text import (
+    S2TTransformerModel,
+    S2TTransformerEncoder,
+    CTC,
+    CTCCompressStrategy,
+)
+from fairseq.modules import (
+    LayerNorm
+)
+logger = logging.getLogger(__name__)
+class Adapter(nn.Module):
+    """Adapter applied to an encoder representation next to an intermediate CTC layer.
+    The behaviour is selected by ``args.adapter``:
+        - ``none``: return the representation unchanged
+        - ``linear`` / ``linear2``: apply ``linear_adapter`` to the representation
+        - ``context``: CTC distribution multiplied by the ``embed_adapter`` weight
+        - ``league``: sum of the linear output and the soft (context) output
+        - ``gated_league``: sigmoid-gated mix of the linear and soft outputs
+        - ``shrink``: merge consecutive frames that share the same CTC argmax
+    Args:
+        args (argparse.Namespace): reads ``encoder_embed_dim``, ``adapter`` and,
+            for ``shrink``, ``ctc_compress_strategy``
+        dictionary: vocabulary; only used to size a fresh embedding when
+            *embed_tokens* is None
+        embed_tokens: module whose ``weight`` is reused for the soft output;
+            a new ``Embedding`` is created when it is None
+    """
+    def __init__(self, args, dictionary, embed_tokens):
+        super().__init__()
+        embed_dim = args.encoder_embed_dim
+        self.adapter_type = args.adapter
+        if self.adapter_type in ["linear", "league", "gated_league", "gated_league2"]:
+            self.linear_adapter = nn.Sequential(
+                nn.Linear(embed_dim, embed_dim),
+                LayerNorm(args.encoder_embed_dim),
+                nn.ReLU(),
+            )
+        elif self.adapter_type == "linear2":
+            # Plain projection, kept inside a Sequential so parameter names match "linear".
+            self.linear_adapter = nn.Sequential(
+                nn.Linear(embed_dim, embed_dim),
+            )
+        if self.adapter_type in ["embed", "context", "league", "gated_league", "gated_league2"]:
+            if embed_tokens is None:
+                # This module has no ``padding_idx`` attribute of its own, so the
+                # padding index is taken from the dictionary.
+                self.embed_adapter = Embedding(len(dictionary), embed_dim, dictionary.pad())
+            else:
+                self.embed_adapter = embed_tokens
+        if self.adapter_type == "gated_league":
+            self.gate_linear = nn.Linear(2 * embed_dim, embed_dim)
+        elif self.adapter_type == "gated_league2":
+            self.gate_linear1 = nn.Linear(embed_dim, embed_dim)
+            self.gate_linear2 = nn.Linear(embed_dim, embed_dim)
+        if self.adapter_type == "shrink":
+            self.ctc_compress_method = getattr(CTCCompressStrategy, args.ctc_compress_strategy)
+    def _soft_output(self, flat_distribution, dim0, dim1):
+        """Project the flattened distribution with the ``embed_adapter`` weight and restore the layout."""
+        return torch.mm(flat_distribution, self.embed_adapter.weight).view(dim0, dim1, -1)
+    def forward(self, x, padding):
+        """Adapt a representation using its CTC distribution.
+        Args:
+            x: pair ``(representation, distribution)``; *representation* is a
+                3-D tensor and *distribution* shares its first two dimensions
+                (may be None for adapter types that do not use it)
+            padding: boolean padding mask; only read by the ``shrink`` adapter
+        Returns:
+            tuple: the adapted representation and the padding mask (rebuilt from
+            the compressed lengths for ``shrink``, otherwise passed through)
+        Raises:
+            ValueError: if the adapter type has no forward implementation
+        """
+        representation, distribution = x
+        # Only the two leading sizes are needed, to undo the flattening done for the matmul.
+        dim0, dim1, _ = representation.size()
+        org_distribution = distribution
+        if distribution is not None:
+            distribution = distribution.view(-1, distribution.size(-1))
+        if self.adapter_type in ("linear", "linear2"):
+            out = self.linear_adapter(representation)
+        elif self.adapter_type == "context":
+            out = self._soft_output(distribution, dim0, dim1)
+        elif self.adapter_type == "league":
+            linear_out = self.linear_adapter(representation)
+            soft_out = self._soft_output(distribution, dim0, dim1)
+            out = linear_out + soft_out
+        elif self.adapter_type == "gated_league":
+            linear_out = self.linear_adapter(representation)
+            soft_out = self._soft_output(distribution, dim0, dim1)
+            coef = (self.gate_linear(torch.cat([linear_out, soft_out], dim=-1))).sigmoid()
+            out = coef * linear_out + (1 - coef) * soft_out
+        elif self.adapter_type == "none":
+            out = representation
+        elif self.adapter_type == "shrink":
+            from itertools import groupby
+            lengths = (~padding).long().sum(-1)
+            # The compression weights are built from argmax decisions, so no gradient is tracked here.
+            with torch.no_grad():
+                batch_predicted = []
+                prob_ctc = org_distribution.transpose(0, 1)  # T x B x D -> B x T x D
+                for b in range(prob_ctc.shape[0]):
+                    predicted = prob_ctc[b][: lengths[b]].argmax(-1).tolist()
+                    # Run-length encode the prediction: (token, number of repeated frames).
+                    batch_predicted.append([(p[0], len(list(p[1]))) for p in groupby(predicted)])
+                new_lengths = [len(p) for p in batch_predicted]
+                weights_matrix = self.ctc_compress_method(prob_ctc, batch_predicted, new_lengths,
+                                                          representation.dtype, representation.device)
+            # x is T x B x C -> B x C x T; weights_matrix is B x T x T'
+            compressed_output = representation.permute(1, 2, 0).bmm(weights_matrix)  # B x C x T'
+            out = compressed_output.permute(2, 0, 1)
+            out_lengths = lengths.new(new_lengths)
+            padding = lengths_to_padding_mask(out_lengths)
+        else:
+            # Fail here with a clear message instead of handing ``None`` to the next layer.
+            raise ValueError("Unsupported adapter type: {}.".format(self.adapter_type))
+        return out, padding
+@register_model("multi_ctc_s2t_transformer")
+class MultiCTCS2TTransformerModel(S2TTransformerModel):
+    """Speech-to-Text Transformer with multiple CTC Loss in different layers"""
+    def __init__(self, encoder, decoder):
+        super().__init__(encoder, decoder)
+    @staticmethod
+    def add_args(parser):
+        """Add the multi-CTC options on top of the ``S2TTransformerModel`` arguments."""
+        S2TTransformerModel.add_args(parser)
+        parser.add_argument(
+            "--multi-ctc-layers",
+            default=None,
+            type=str,
+            help="the position of the ctc loss, separated by commas",
+        )
+        parser.add_argument(
+            "--adapter",
+            default="league",
+            type=str,
+            help="adapter type",
+        )
+        parser.add_argument(
+            "--ctc-compress-strategy",
+            default="avg",
+            type=str,
+            help="compress strategy, such as avg, weighted, and softmax",
+        )
+    @classmethod
+    def build_encoder(cls, args, task=None, embed_tokens=None):
+        """Build the multi-CTC encoder and optionally initialise it from a checkpoint.
+        Args:
+            args (argparse.Namespace): model arguments; a non-empty
+                ``load_pretrained_encoder_from`` triggers checkpoint loading
+            task: task object forwarded to the encoder
+            embed_tokens: embedding module forwarded to the encoder
+        Returns:
+            the constructed ``S2TMultiCTCTransformerEncoder``
+        """
+        encoder = S2TMultiCTCTransformerEncoder(args, task, embed_tokens)
+        if getattr(args, "load_pretrained_encoder_from", None):
+            # strict=False is passed to the loader so a checkpoint whose parameters
+            # do not line up exactly with this encoder can still be used.
+            encoder = checkpoint_utils.load_pretrained_component_from_model(
+                component=encoder, checkpoint=args.load_pretrained_encoder_from, strict=False
+            )
+            logger.info(
+                "loaded pretrained encoder from: "
+                f"{args.load_pretrained_encoder_from}"
+            )
+        return encoder
+class S2TMultiCTCTransformerEncoder(S2TTransformerEncoder):
+    """Speech-to-text Transformer encoder with CTC projections attached to several layers.
+    ``args.multi_ctc_layers`` is a comma-separated list of 1-based layer indices;
+    non-positive values are offset by ``args.encoder_layers`` (so ``0`` is the
+    last layer). Each listed layer gets its own ``CTC`` module (``ctc<idx>``)
+    and every listed layer except the last one also gets an ``Adapter``
+    (``adapter<idx>``)."""
+    def __init__(self, args, task=None, embed_tokens=None):
+        super().__init__(args, task, embed_tokens)
+        # The per-layer CTC modules registered below are used instead of the parent's single one.
+        if self.use_ctc:
+            del self.ctc
+        self.multi_ctc_layers = []
+        if args.multi_ctc_layers is not None:
+            for layer_idx in args.multi_ctc_layers.split(","):
+                layer_idx = int(layer_idx)
+                if layer_idx <= 0:
+                    layer_idx += args.encoder_layers
+                self.multi_ctc_layers.append(layer_idx)
+                # Every position except the final layer is an intermediate CTC.
+                inter_ctc = layer_idx != args.encoder_layers
+                if inter_ctc:
+                    logger.info("Intermedia CTC loss in layer %d" % layer_idx)
+                ctc = CTC(args.encoder_embed_dim,
+                          dictionary_size=len(task.source_dictionary),
+                          dropout=args.dropout,
+                          need_layernorm=inter_ctc)
+                # Share the projection with the token embedding when both sides use one vocabulary.
+                if task.source_dictionary == task.target_dictionary and embed_tokens is not None:
+                    ctc.ctc_projection.weight = embed_tokens.weight
+                setattr(self, f"ctc{layer_idx}", ctc)
+                if inter_ctc:
+                    adapter = Adapter(args, task.source_dictionary, ctc.ctc_projection)
+                    setattr(self, f"adapter{layer_idx}", adapter)
+    def forward(self, src_tokens, src_lengths):
+        """Encode the input and collect the CTC logits of the configured layers.
+        Args:
+            src_tokens: input passed to the subsampler
+            src_lengths: lengths passed to the subsampler
+        Returns:
+            dict: ``encoder_out``, ``ctc_logit`` (one entry per active CTC layer,
+            in layer order) and ``encoder_padding_mask``; the remaining keys are
+            empty lists.
+        """
+        if self.history is not None:
+            self.history.clean()
+        # down-sampling
+        x, input_lengths = self.subsample(src_tokens, src_lengths)
+        # The subsampler may return a list of outputs; only the last one is encoded.
+        if isinstance(x, list):
+            x = x[-1]
+        # embedding scaling
+        x = self.embed_scale * x
+        # padding and position embedding
+        encoder_padding_mask = lengths_to_padding_mask(input_lengths)
+        positions = self.embed_positions(encoder_padding_mask).transpose(0, 1)
+        if self.attn_type != "rel_selfattn":
+            x += positions
+        x = self.dropout_module(x)
+        positions = self.dropout_module(positions)
+        # add emb into history
+        if self.history is not None:
+            self.history.push(x)
+        num_layers = len(self.layers)
+        ctc_logit = []
+        for layer_idx, layer in enumerate(self.layers, start=1):
+            if self.history is not None:
+                x = self.history.pop()
+            # encoder layer
+            x = layer(x, encoder_padding_mask, pos_emb=positions)
+            # interleave CTC
+            if self.use_ctc and layer_idx in self.multi_ctc_layers and layer_idx != num_layers:
+                ctc = getattr(self, f"ctc{layer_idx}")
+                adapter = getattr(self, f"adapter{layer_idx}")
+                logit = ctc(x)
+                prob = ctc.softmax(logit)
+                # Keep the logit that produced ``prob``; running the CTC module a second
+                # time on the adapter output would score a different representation.
+                ctc_logit.append(logit)
+                # NOTE(review): the "shrink" adapter returns a new padding mask while
+                # ``positions`` is left as is - confirm this combination is intended.
+                x, encoder_padding_mask = adapter([x, prob], encoder_padding_mask)
+            if layer_idx != num_layers \
+                    and self.interleaved_dropout is not None \
+                    and layer_idx % self.interleaved_dropout == 0:
+                x = self.dropout_module(x)
+            if self.history is not None:
+                self.history.push(x)
+        if self.history is not None:
+            x = self.history.pop()
+        if self.layer_norm is not None:
+            x = self.layer_norm(x)
+        # The CTC of the last layer is applied after the final layer norm.
+        if self.use_ctc and num_layers in self.multi_ctc_layers:
+            ctc = getattr(self, f"ctc{num_layers}")
+            ctc_logit.append(ctc(x))
+        return {
+            "encoder_out": [x],  # T x B x C
+            "ctc_logit": ctc_logit,  # B x T x C
+            "encoder_padding_mask": [encoder_padding_mask],  # B x T
+            "encoder_embedding": [],  # B x T x C
+            "encoder_states": [],  # List[T x B x C]
+            "src_tokens": [],
+            "src_lengths": [],
+        }
+@register_model_architecture(model_name="multi_ctc_s2t_transformer", arch_name="multi_ctc_s2t_transformer")
+def base_architecture(args):
+    """Fill in the default hyper-parameters of the base multi-CTC S2T Transformer.
+    Every field is only set when it is missing from *args*, so values given on
+    the command line or by a more specific architecture are kept.
+    Args:
+        args (argparse.Namespace): model arguments, updated in place
+    """
+    # Convolutional subsampler
+    args.conv_kernel_sizes = getattr(args, "conv_kernel_sizes", "5,5")
+    args.conv_channels = getattr(args, "conv_channels", 1024)
+    # Transformer
+    args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 512)
+    args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 2048)
+    args.encoder_layers = getattr(args, "encoder_layers", 12)
+    args.encoder_attention_type = getattr(args, "encoder_attention_type", "selfattn")
+    args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 8)
+    args.encoder_normalize_before = getattr(args, "encoder_normalize_before", True)
+    # Decoder sizes fall back to the encoder sizes resolved above.
+    args.decoder_embed_dim = getattr(args, "decoder_embed_dim", args.encoder_embed_dim)
+    args.decoder_ffn_embed_dim = getattr(
+        args, "decoder_ffn_embed_dim", args.encoder_ffn_embed_dim
+    )
+    args.decoder_layers = getattr(args, "decoder_layers", 6)
+    args.decoder_attention_type = getattr(args, "decoder_attention_type", "selfattn")
+    args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 8)
+    args.decoder_normalize_before = getattr(args, "decoder_normalize_before", True)
+    args.decoder_learned_pos = getattr(args, "decoder_learned_pos", False)
+    args.dropout = getattr(args, "dropout", 0.1)
+    # Attention and activation dropout default to the general dropout rate.
+    args.attention_dropout = getattr(args, "attention_dropout", args.dropout)
+    args.activation_dropout = getattr(args, "activation_dropout", args.dropout)
+    args.activation_fn = getattr(args, "activation_fn", "relu")
+    args.adaptive_softmax_cutoff = getattr(args, "adaptive_softmax_cutoff", None)
+    args.adaptive_softmax_dropout = getattr(args, "adaptive_softmax_dropout", 0)
+    args.tie_adaptive_weights = getattr(args, "tie_adaptive_weights", False)
+    args.tie_adaptive_proj = getattr(args, "tie_adaptive_proj", False)
+    args.adaptive_softmax_factor = getattr(args, "adaptive_softmax_factor", 4)
+    args.share_decoder_input_output_embed = getattr(
+        args, "share_decoder_input_output_embed", False
+    )
+    args.share_all_embeddings = getattr(args, "share_all_embeddings", False)
+    args.no_token_positional_embeddings = getattr(
+        args, "no_token_positional_embeddings", False
+    )
+    args.adaptive_input = getattr(args, "adaptive_input", False)
+    args.decoder_layerdrop = getattr(args, "decoder_layerdrop", 0.0)
+    args.decoder_output_dim = getattr(
+        args, "decoder_output_dim", args.decoder_embed_dim
+    )
+    args.decoder_input_dim = getattr(args, "decoder_input_dim", args.decoder_embed_dim)
+    args.no_scale_embedding = getattr(args, "no_scale_embedding", False)
+    args.quant_noise_pq = getattr(args, "quant_noise_pq", 0)
+    # CTC
+    # The encoder splits this value on "," whenever it is not None, so the
+    # fallback must be None (same as the parser default) rather than a number.
+    args.multi_ctc_layers = getattr(args, "multi_ctc_layers", None)
+    # Conformer
+    args.macaron_style = getattr(args, "macaron_style", False)
+    args.use_cnn_module = getattr(args, "use_cnn_module", False)
+    args.cnn_module_kernel = getattr(args, "cnn_module_kernel", 31)
+    # settings for DLCL
+    args.use_enc_dlcl = getattr(args, "use_enc_dlcl", False)
+    args.use_dec_dlcl = getattr(args, "use_dec_dlcl", False)
+    args.init_value = getattr(args, 'init_value', 'avg')
+    args.weight_type = getattr(args, 'weight_type', 'scalar')
+    args.encoder_learnable = getattr(args, 'encoder_learnable', True)
+    args.decoder_learnable = getattr(args, 'decoder_learnable', True)
+    args.normalize_embed = getattr(args, 'normalize_embed', False)
+    args.history_dropout = getattr(args, 'history_dropout', 0.0)
+    args.history_window_size = getattr(args, 'history_window_size', -1)
+    # Relative position encoding
+    args.max_encoder_relative_length = getattr(args, 'max_encoder_relative_length', -1)
+    args.max_decoder_relative_length = getattr(args, 'max_decoder_relative_length', -1)
+    args.k_only = getattr(args, 'k_only', True)
+    # local modeling
+    args.hard_mask_window = getattr(args, 'hard_mask_window', 0)
+    args.gauss_mask_sigma = getattr(args, 'gauss_mask_sigma', 0)
+    args.init_mask_weight = getattr(args, 'init_mask_weight', 0)
+    # interleaved dropout
+    args.interleave_dropout = getattr(args, "interleave_dropout", None)
+    args.cl_dropout = getattr(args, "cl_dropout", False)
+    args.cl_dropout_epoch = getattr(args, "cl_dropout_epoch", None)
+    args.cl_dropout_strategy = getattr(args, "cl_dropout_strategy", "linear")
+@register_model_architecture("multi_ctc_s2t_transformer", "multi_ctc_s2t_transformer_s")
+def multi_ctc_s2t_transformer_s(args):
+    """Small preset: set the fields below only when they are missing, then apply the base defaults."""
+    small_defaults = (
+        ("encoder_embed_dim", 256),
+        ("encoder_ffn_embed_dim", 256 * 8),
+        ("encoder_attention_heads", 4),
+        ("decoder_attention_heads", 4),
+        ("dropout", 0.1),
+        ("multi_ctc_layers", None),
+    )
+    for field, fallback in small_defaults:
+        # Values already present on args win over the preset.
+        setattr(args, field, getattr(args, field, fallback))
+    base_architecture(args)
--- a/fairseq/models/speech_to_text/pdss2t_transformer.py
+++ b/fairseq/models/speech_to_text/pdss2t_transformer.py
@@ -317,17 +317,22 @@ class PDSS2TTransformerModel(S2TTransformerModel):
            action='store_true',
            help="use dlcl encoder",
        )
-        parser.add_argument(
+        parser.add_argument('--init-value', type=str, default='avg', choices=['avg', 'one'],
-            '--encoder-history-type',
+                            help='how to init the learned weight matrix')
-            default="learnable_dense",
+        parser.add_argument('--weight-type', type=str, default='scalar',
-            help='encoder layer history type'
+                            help='type of learned weight [scalar, scalar_n(n>1), vector]')
-        )
+        parser.add_argument('--encoder-learnable', type=eval, default='True',
-        parser.add_argument(
+                            help='enable to learn weights for encoder')
-            '--decoder-history-type',
+        parser.add_argument('--decoder-learnable', type=eval, default='True',
-            default="learnable_dense",
+                            help='enable to learn weights for decoder')
-            help='decoder layer history type'
+        parser.add_argument('--normalize-learned-weight', type=eval, default='False',
-        )
+                            help='normalize learned weight by softmax')
+        parser.add_argument('--normalize-embedding', type=eval, default='False',
+                            help='normalize the input of embedding')
+        parser.add_argument('--history-dropout', type=float, default=0.0, metavar='D',
+                            help='dropout for history output')
+        parser.add_argument('--history-window-size', type=int, default='-1',
+                            help='how many past layers are considered. -1 means all')
        # local modeling
        parser.add_argument(
            '--hard-mask-window',
@@ -375,7 +380,14 @@ class PDSS2TTransformerModel(S2TTransformerModel):
                 "The legacy relative positional encoding will be deprecated in the future."
                 "More Details can be found in https://github.com/espnet/espnet/pull/2816.",
        )
-        # CNN module
+        # CTC
+        parser.add_argument(
+            "--ctc-layer",
+            default=0,
+            type=int,
+            help="the position of the ctc loss",
+        )
+        # Conformer module
        parser.add_argument(
            "--use-cnn-module",
            default=False,
@@ -463,11 +475,6 @@ class PDSS2TTransformerModel(S2TTransformerModel):
            type=float,
            help="dropout in each stage",
        )
-        parser.add_argument(
-            "--ctc-layer",
-            type=int,
-            help="the layer of ctc",
-        )
        pass
    @classmethod

--- a/fairseq/models/speech_to_text/s2t_conformer.py
+++ b/fairseq/models/speech_to_text/s2t_conformer.py
@@ -299,7 +299,7 @@ class S2TConformerModel(S2TTransformerModel):
 class S2TConformerEncoder(S2TTransformerEncoder):
    """Speech-to-text Conformer encoder that consists of input subsampler and
-    Transformer encoder."""
+    Conformer encoder."""
    def __init__(self, args, task=None, embed_tokens=None):
        super().__init__(args, task, embed_tokens)

--- a/fairseq/models/speech_to_text/s2t_sate.py
+++ b/fairseq/models/speech_to_text/s2t_sate.py
@@ -16,7 +16,6 @@ from fairseq.models.transformer import Embedding, TransformerDecoder
 from fairseq.models.speech_to_text import (
    S2TTransformerModel,
    S2TTransformerEncoder,
-    S2TConformerEncoder,
    PDSS2TTransformerModel,
    PDSS2TTransformerEncoder,
    CTCCompressStrategy
@@ -27,7 +26,7 @@ from fairseq.modules import (
    LayerNorm,
    PositionalEmbedding,
    TransformerEncoderLayer,
-    LearnableDenseLayerHistory
+    DynamicLinearCombination
 )
 logger = logging.getLogger(__name__)
@@ -287,7 +286,7 @@ class TextEncoder(FairseqEncoder):
                x = history.pop()
            x = layer(x, encoder_padding_mask, pos_emb=positions)
            if history is not None:
-                history.add(x)
+                history.push(x)
        if history is not None:
            x = history.pop()
@@ -331,9 +330,7 @@ class S2TSATEEncoder(FairseqEncoder):
        if getattr(args, "use_enc_dlcl", False):
            layer_num = args.encoder_layers + args.text_encoder_layers + 1
-            self.history = LearnableDenseLayerHistory(
+            self.history = DynamicLinearCombination(args, is_encoder=True, layer_num=layer_num)
-                args.encoder_normalize_before, layer_num, args.encoder_embed_dim, True
-            )
        else:
            self.history = None
@@ -496,8 +493,8 @@ def base_architecture(args):
    args.ctc_layer = getattr(args, "ctc_layer", 0)
    args.pds_dropout = getattr(args, "pds_dropout", args.dropout)
-    args.fusion = getattr(args, "fusion", False)
+    args.pds_fusion = getattr(args, "pds_fusion", False)
-    args.fusion_method = getattr(args, "fusion_method", "all_conv")
+    args.pds_fusion_method = getattr(args, "pds_fusion_method", "all_conv")
 @register_model_architecture("s2t_sate", "s2t_sate_s")

--- a/fairseq/models/speech_to_text/s2t_transformer.py
+++ b/fairseq/models/speech_to_text/s2t_transformer.py
@@ -21,11 +21,10 @@ from fairseq.modules import (
    PositionalEmbedding,
    TransformerEncoderLayer,
    ConformerEncoderLayer,
-    CreateLayerHistory,
+    DynamicLinearCombination,
 )
 from torch import Tensor
 logger = logging.getLogger(__name__)
@@ -42,11 +41,11 @@ class Conv1dSubsampler(nn.Module):
    """
    def __init__(
-        self,
+            self,
-        in_channels: int,
+            in_channels: int,
-        mid_channels: int,
+            mid_channels: int,
-        out_channels: int,
+            out_channels: int,
-        kernel_sizes: List[int] = (3, 3),
+            kernel_sizes: List[int] = (3, 3),
    ):
        super(Conv1dSubsampler, self).__init__()
        self.n_layers = len(kernel_sizes)
@@ -277,16 +276,22 @@ class S2TTransformerModel(FairseqEncoderDecoderModel):
            action='store_true',
            help="use dlcl encoder",
        )
-        parser.add_argument(
+        parser.add_argument('--init-value', type=str, default='avg', choices=['avg', 'one'],
-            '--encoder-history-type',
+                            help='how to init the learned weight matrix')
-            default="learnable_dense",
+        parser.add_argument('--weight-type', type=str, default='scalar',
-            help='encoder layer history type'
+                            help='type of learned weight [scalar, scalar_n(n>1), vector]')
-        )
+        parser.add_argument('--encoder-learnable', type=eval, default='True',
-        parser.add_argument(
+                            help='enable to learn weights for encoder')
-            '--decoder-history-type',
+        parser.add_argument('--decoder-learnable', type=eval, default='True',
-            default="learnable_dense",
+                            help='enable to learn weights for decoder')
-            help='decoder layer history type'
+        parser.add_argument('--normalize-learned-weight', type=eval, default='False',
-        )
+                            help='normalize learned weight by softmax')
+        parser.add_argument('--normalize-embedding', type=eval, default='False',
+                            help='normalize the input of embedding')
+        parser.add_argument('--history-dropout', type=float, default=0.0, metavar='D',
+                            help='dropout for history output')
+        parser.add_argument('--history-window-size', type=int, default='-1',
+                            help='how many past layers are considered. -1 means all')
        # CTC
        parser.add_argument(
            "--ctc-layer",
@@ -444,10 +449,10 @@ class S2TTransformerModel(FairseqEncoderDecoderModel):
        return cls(encoder, decoder)
    def get_normalized_probs(
-        self,
+            self,
-        net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]],
+            net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]],
-        log_probs: bool,
+            log_probs: bool,
-        sample: Optional[Dict[str, Tensor]] = None,
+            sample: Optional[Dict[str, Tensor]] = None,
    ):
        # net_output['encoder_out'] is a (B, T, D) tensor
        lprobs = self.get_normalized_probs_scriptable(net_output, log_probs, sample)
@@ -503,8 +508,8 @@ class S2TTransformerEncoder(FairseqEncoder):
        else:
            self.layer_norm = None
-        if getattr(args, "use_enc_dlcl", False):
+        if args.use_enc_dlcl:
-            self.history = CreateLayerHistory(args, is_encoder=True)
+            self.history = DynamicLinearCombination(args, is_encoder=True)
        else:
            self.history = None
@@ -588,7 +593,7 @@ class S2TTransformerEncoder(FairseqEncoder):
        # add emb into history
        if self.history is not None:
-            self.history.add(x)
+            self.history.push(x)
        # gather cosine similarity
        cos_sim_idx = (cos_sim_idx + 10) // 10 * 10 - 1
@@ -618,7 +623,7 @@ class S2TTransformerEncoder(FairseqEncoder):
                self.add_to_dict(x, dis, cos_sim_idx)
            if self.history is not None:
-                self.history.add(x)
+                self.history.push(x)
        if self.history is not None:
            x = self.history.pop()
@@ -631,7 +636,7 @@ class S2TTransformerEncoder(FairseqEncoder):
        return {
            "encoder_out": [x],  # T x B x C
-            "ctc_logit": [] if ctc_logit is None else [ctc_logit],    # B x T x C
+            "ctc_logit": [] if ctc_logit is None else [ctc_logit],  # B x T x C
            "encoder_padding_mask": [encoder_padding_mask],  # B x T
            "encoder_embedding": [],  # B x T x C
            "encoder_states": [],  # List[T x B x C]
@@ -678,13 +683,13 @@ class S2TTransformerEncoder(FairseqEncoder):
 class TransformerDecoderScriptable(TransformerDecoder):
    def extract_features(
-        self,
+            self,
-        prev_output_tokens,
+            prev_output_tokens,
-        encoder_out: Optional[Dict[str, List[Tensor]]] = None,
+            encoder_out: Optional[Dict[str, List[Tensor]]] = None,
-        incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+            incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
-        full_context_alignment: bool = False,
+            full_context_alignment: bool = False,
-        alignment_layer: Optional[int] = None,
+            alignment_layer: Optional[int] = None,
-        alignment_heads: Optional[int] = None,
+            alignment_heads: Optional[int] = None,
    ):
        # call scriptable method from parent class
        x, _ = self.extract_features_scriptable(
@@ -698,10 +703,10 @@ class TransformerDecoderScriptable(TransformerDecoder):
        return x, None
    def get_normalized_probs_scriptable(
-        self,
+            self,
-        net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]],
+            net_output: Tuple[Tensor, Optional[Dict[str, List[Optional[Tensor]]]]],
-        log_probs: bool,
+            log_probs: bool,
-        sample: Optional[Dict[str, Tensor]] = None,
+            sample: Optional[Dict[str, Tensor]] = None,
    ):
        """Get normalized probabilities (or log probs) from a net's output."""
@@ -777,6 +782,17 @@ def base_architecture(args):
    args.use_cnn_module = getattr(args, "use_cnn_module", False)
    args.cnn_module_kernel = getattr(args, "cnn_module_kernel", 31)
+    # settings for DLCL
+    args.use_enc_dlcl = getattr(args, "use_enc_dlcl", False)
+    args.use_dec_dlcl = getattr(args, "use_dec_dlcl", False)
+    args.init_value = getattr(args, 'init_value', 'avg')
+    args.weight_type = getattr(args, 'weight_type', 'scalar')
+    args.encoder_learnable = getattr(args, 'encoder_learnable', True)
+    args.decoder_learnable = getattr(args, 'decoder_learnable', True)
+    args.normalize_embed = getattr(args, 'normalize_embed', False)
+    args.history_dropout = getattr(args, 'history_dropout', 0.0)
+    args.history_window_size = getattr(args, 'history_window_size', -1)
    # Relative position encoding
    args.max_encoder_relative_length = getattr(args, 'max_encoder_relative_length', -1)
    args.max_decoder_relative_length = getattr(args, 'max_decoder_relative_length', -1)

--- a/fairseq/models/transformer.py
+++ b/fairseq/models/transformer.py
@@ -27,7 +27,7 @@ from fairseq.modules import (
    SinusoidalPositionalEmbedding,
    TransformerDecoderLayer,
    TransformerEncoderLayer,
-    CreateLayerHistory
+    DynamicLinearCombination
 )
 from fairseq.modules.checkpoint_activations import checkpoint_wrapper
 from fairseq.modules.quant_noise import quant_noise as apply_quant_noise_
@@ -218,6 +218,7 @@ class TransformerModel(FairseqEncoderDecoderModel):
            ],
            help="transformer decoder self-attention layer type"
        )
+        # DLCL parameters
        parser.add_argument(
            "--use-enc-dlcl",
            default=False,
@@ -230,16 +231,23 @@ class TransformerModel(FairseqEncoderDecoderModel):
            action='store_true',
            help="use dlcl encoder",
        )
-        parser.add_argument(
+        parser.add_argument('--init-value', type=str, default='avg', choices=['avg', 'one'],
-            '--encoder-history-type',
+                            help='how to init the learned weight matrix')
-            default="learnable_dense",
+        parser.add_argument('--weight-type', type=str, default='scalar',
-            help='encoder layer history type'
+                            help='type of learned weight [scalar, scalar_n(n>1), vector]')
-        )
+        parser.add_argument('--encoder-learnable', type=eval, default='True',
-        parser.add_argument(
+                            help='enable to learn weights for encoder')
-            '--decoder-history-type',
+        parser.add_argument('--decoder-learnable', type=eval, default='True',
-            default="learnable_dense",
+                            help='enable to learn weights for decoder')
-            help='decoder layer history type'
+        parser.add_argument('--normalize-learned-weight', type=eval, default='False',
-        )
+                            help='normalize learned weight by softmax')
+        parser.add_argument('--normalize-embedding', type=eval, default='False',
+                            help='normalize the input of embedding')
+        parser.add_argument('--history-dropout', type=float, default=0.0, metavar='D',
+                            help='dropout for history output')
+        parser.add_argument('--history-window-size', type=int, default='-1',
+                            help='how many past layers are considered. -1 means all')
+        # relative position representation
        parser.add_argument('--max-encoder-relative-length', type=int, default=-1,
                            help='the max encoder relative length')
        parser.add_argument('--max-decoder-relative-length', type=int, default=-1,
@@ -271,8 +279,16 @@ class TransformerModel(FairseqEncoderDecoderModel):
            metavar="STR",
            help="freeze the module of the decoder",
        )
        parser.add_argument('--interleave-dropout', default=0, type=float, metavar='D',
                            help='interleaved dropout probability')
+        parser.add_argument(
+            "--squeeze-excitation",
+            default=False,
+            action='store_true',
+            help="use squeeze and excitation method",
+        )
        # fmt: on
    @classmethod
@@ -496,8 +512,8 @@ class TransformerEncoder(FairseqEncoder):
        else:
            self.layer_norm = None
-        if getattr(args, "use_enc_dlcl", False):
+        if args.use_enc_dlcl:
-            self.history = CreateLayerHistory(args, is_encoder=True)
+            self.history = DynamicLinearCombination(args, is_encoder=True)
        else:
            self.history = None
@@ -617,7 +633,7 @@ class TransformerEncoder(FairseqEncoder):
        # add emb into history
        if self.history is not None:
-            self.history.add(x)
+            self.history.push(x)
        # encoder layers
        for layer in self.layers:
@@ -632,7 +648,7 @@ class TransformerEncoder(FairseqEncoder):
                encoder_states.append(x)
            if self.history is not None:
-                self.history.add(x)
+                self.history.push(x)
        if self.history is not None:
            x = self.history.pop()
@@ -826,8 +842,8 @@ class TransformerDecoder(FairseqIncrementalDecoder):
        else:
            self.layer_norm = None
-        if getattr(args, "use_dec_dlcl", False):
+        if args.use_dec_dlcl:
-            self.history = CreateLayerHistory(args, is_encoder=False)
+            self.history = DynamicLinearCombination(args, is_encoder=False)
        else:
            self.history = None
@@ -1010,7 +1026,7 @@ class TransformerDecoder(FairseqIncrementalDecoder):
        # add emb into history
        if self.history is not None:
-            self.history.add(x)
+            self.history.push(x)
        self_attn_padding_mask: Optional[Tensor] = None
        if self.cross_self_attention or prev_output_tokens.eq(self.padding_idx).any():
@@ -1051,7 +1067,7 @@ class TransformerDecoder(FairseqIncrementalDecoder):
            if layer_attn is not None and idx == alignment_layer:
                attn = layer_attn.float().to(x)
            if self.history is not None:
-                self.history.add(x)
+                self.history.push(x)
            if self.gather_attn_weight:
                if avg_attn is None:
                    avg_attn = layer_attn
@@ -1265,6 +1281,18 @@ def base_architecture(args):
    args.encoder_attention_type = getattr(args, "encoder_attention_type", "selfattn")
    args.decoder_attention_type = getattr(args, "decoder_attention_type", "selfattn")
+    # settings for DLCL
+    args.use_enc_dlcl = getattr(args, "use_enc_dlcl", False)
+    args.use_dec_dlcl = getattr(args, "use_dec_dlcl", False)
+    args.init_value = getattr(args, 'init_value', 'avg')
+    args.weight_type = getattr(args, 'weight_type', 'scalar')
+    args.encoder_learnable = getattr(args, 'encoder_learnable', True)
+    args.decoder_learnable = getattr(args, 'decoder_learnable', True)
+    args.normalize_embed = getattr(args, 'normalize_embed', False)
+    args.history_dropout = getattr(args, 'history_dropout', 0.0)
+    args.history_window_size = getattr(args, 'history_window_size', -1)
+    # settings for RPR
    args.max_encoder_relative_length = getattr(args, 'max_encoder_relative_length', -1)
    args.max_decoder_relative_length = getattr(args, 'max_decoder_relative_length', -1)
    args.k_only = getattr(args, 'k_only', True)

--- a/fairseq/modules/__init__.py
+++ b/fairseq/modules/__init__.py
@@ -4,6 +4,7 @@
 # LICENSE file in the root directory of this source tree.
 """isort:skip_file"""
+from .squeeze_excitation import SEAttention
 from .adaptive_input import AdaptiveInput
 from .adaptive_softmax import AdaptiveSoftmax
 from .beamable_mm import BeamableMM
@@ -13,6 +14,7 @@ from .downsample_convolution import DownSampleConvolutionModule
 from .conv_tbc import ConvTBC
 from .cross_entropy import cross_entropy
 from .downsampled_multihead_attention import DownsampledMultiHeadAttention
+from .dlcl import DynamicLinearCombination
 from .dynamic_convolution import DynamicConv, DynamicConv1dTBC
 from .dynamic_crf_layer import DynamicCRF
 from .fairseq_dropout import FairseqDropout
@@ -22,7 +24,6 @@ from .grad_multiply import GradMultiply
 from .gumbel_vector_quantizer import GumbelVectorQuantizer
 from .kmeans_vector_quantizer import KmeansVectorQuantizer
 from .layer_drop import LayerDropModuleList
-from .layer_history import CreateLayerHistory, LearnableDenseLayerHistory
 from .layer_norm import Fp32LayerNorm, LayerNorm
 from .learned_positional_embedding import LearnedPositionalEmbedding
 from .lightweight_convolution import LightweightConv, LightweightConv1dTBC
@@ -46,6 +47,7 @@ from .conformer_layer import ConformerEncoderLayer
 from .pds_layer import PDSTransformerEncoderLayer
 __all__ = [
+    "DynamicLinearCombination",
    "AdaptiveInput",
    "AdaptiveSoftmax",
    "BeamableMM",
@@ -53,7 +55,6 @@ __all__ = [
    "ConformerEncoderLayer",
    "ConvolutionModule",
    "ConvTBC",
-    "CreateLayerHistory",
    "cross_entropy",
    "DownSampleConvolutionModule",
    "DownsampledMultiHeadAttention",
@@ -70,7 +71,6 @@ __all__ = [
    "KmeansVectorQuantizer",
    "LayerDropModuleList",
    "LayerNorm",
-    "LearnableDenseLayerHistory",
    "LearnedPositionalEmbedding",
    "LightweightConv1dTBC",
    "LightweightConv",
@@ -84,6 +84,7 @@ __all__ = [
    "RelativeMultiheadAttention",
    "SamePad",
    "ScalarBias",
+    "SEAttention",
    "SinusoidalPositionalEmbedding",
    "TransformerSentenceEncoderLayer",
    "TransformerSentenceEncoder",

--- a/fairseq/modules/dlcl.py
+++ b/fairseq/modules/dlcl.py
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+class DynamicLinearCombination(nn.Module):
+    r"""Implementation of Dynamic Linear Combination of Layers (DLCL).
+
+    The module stores every tensor handed to ``push`` and ``pop`` returns a
+    weighted sum of everything stored so far:
+
+        for pre-norm,  x_{l+1} = \sum_{k=0}^{l}{W_k^{l+1}LN(y_k)}
+        for post-norm, x_{l+1} = LN(\sum_{k=0}^{l}{W_k^{l+1}y_k})
+
+    State (``count`` and ``layers``) accumulates across ``push`` calls until
+    ``clean`` is called.
+    """
+    def __init__(self, args, is_encoder, include_sublayer=False, layer_num=None):
+        """
+        :param args: namespace providing ``weight_type``, ``history_dropout``,
+            ``init_value``, ``history_window_size``, and the
+            ``encoder_*``/``decoder_*`` variants of ``normalize_before``,
+            ``embed_dim``, ``layers`` and ``learnable``
+        :param is_encoder: select the encoder (True) or decoder (False) settings
+        :param include_sublayer: count 2 (encoder) or 3 (decoder) entries per
+            layer instead of 1 when ``layer_num`` is not given
+        :param layer_num: explicit number of history entries; overrides the
+            value derived from ``args``
+        """
+        super(DynamicLinearCombination, self).__init__()
+        # The architecture defaults do not set this attribute, so fall back to
+        # False instead of failing when it is absent from ``args``.
+        self.normalize_learned_weight = getattr(args, 'normalize_learned_weight', False)
+        self.normalized_weight = None
+        self.weight_type = args.weight_type
+        self.out_dropout = args.history_dropout
+        self.normalize_before = args.encoder_normalize_before if is_encoder else args.decoder_normalize_before
+        self.dim = args.encoder_embed_dim if is_encoder else args.decoder_embed_dim
+        # transformer encoder has 2 sub-layers, decoder has 3 sub-layers
+        if layer_num is None:
+            if include_sublayer:
+                layer_num = 1 + (2 * args.encoder_layers if is_encoder else 3 * args.decoder_layers)
+            else:
+                layer_num = 1 + (args.encoder_layers if is_encoder else args.decoder_layers)
+        # init weights and corresponding masks
+        learnable = args.encoder_learnable if is_encoder else args.decoder_learnable
+        self.weight, self.weight_mask = self._init(layer_num, args.init_value, args.weight_type,
+                                                   args.history_window_size, learnable)
+        # The command line flag is spelled ``--normalize-embedding`` (stored as
+        # ``normalize_embedding``) while the architecture defaults use
+        # ``normalize_embed``; honour both so the flag is not silently ignored.
+        normalize_embed = getattr(args, 'normalize_embed', False) or getattr(args, 'normalize_embedding', False)
+        # init triangular layer norm; without embedding normalization the
+        # first entry is an identity (empty ``nn.Sequential``)
+        if normalize_embed:
+            self.layer_norms = nn.ModuleList([nn.LayerNorm(self.dim) for _ in range(layer_num)])
+        else:
+            self.layer_norms = nn.ModuleList([nn.Sequential()] + [nn.LayerNorm(self.dim) for _ in range(layer_num-1)])
+        # states
+        self.count = 0
+        self.layers = []
+    @staticmethod
+    def _init_mask(n_layer, window_size):
+        """Build the ``n_layer x n_layer`` float mask of usable history entries.
+
+        Row ``i`` is 1 for columns ``0..i`` when ``window_size`` is -1,
+        otherwise for the last ``window_size`` columns ending at ``i``.
+        """
+        mask = np.zeros([n_layer, n_layer], dtype=np.float32)
+        for i in range(n_layer):
+            # -1 means all preceding layers
+            start = 0 if window_size == -1 else max(0, i + 1 - window_size)
+            mask[i, start: (i+1)] = 1
+        return torch.from_numpy(mask)
+    @staticmethod
+    def _init_weight(np_mask, dim=1, init_value='avg', learnable=True):
+        """Create the weight parameter from a numpy mask.
+
+        :param np_mask: L x L numpy mask
+        :param dim: size of the trailing weight dimension
+        :param init_value: 'avg' divides each mask row by its sum, 'one' sets
+            every entry to 1
+        :param learnable: value used for ``requires_grad``
+        :return: ``torch.nn.Parameter`` of shape L x L x dim
+        :raises ValueError: if ``init_value`` is neither 'avg' nor 'one'
+        """
+        np_weight = np.copy(np_mask)
+        if init_value == 'avg':
+            np_weight = np_weight / np.sum(np_weight, axis=1, keepdims=True)
+        elif init_value == 'one':
+            np_weight[:, :] = 1.
+        else:
+            raise ValueError('unknown init_value:{}'.format(init_value))
+        weight_tensor = torch.from_numpy(np_weight).unsqueeze(2)
+        if dim > 1:
+            weight_tensor = weight_tensor.repeat(1, 1, dim)
+        weight_tensor = torch.nn.Parameter(weight_tensor, requires_grad=learnable)
+        return weight_tensor
+    def _init(self, layer_num, init_value, weight_type, window_size=-1, learnable=True):
+        """
+        Also sets ``self.last_dim``, the trailing dimension of the weight.
+
+        :param layer_num: total layers
+        :param init_value: initial weight value
+        :param weight_type: granularity of learned weights (scalar, scalar_X, vector)
+        :param window_size: past windows size of layers
+        :param learnable: if allow to learn weights
+        :return:
+            weight_tensor:
+                1. L x L x 1 if weight type='scalar'
+                2. L x L x X if weight type='scalar_X'
+                3. L x L x H if weight type='vector'
+            weight_mask: L x L, 0 means padding
+        :raises ValueError: for an unknown ``weight_type`` or when X does not
+            divide the embedding dimension
+        """
+        # L x L
+        mask_tensor = self._init_mask(layer_num, window_size)
+        if weight_type == 'scalar':
+            self.last_dim = 1
+        elif weight_type == 'vector':
+            self.last_dim = self.dim
+        elif weight_type.startswith('scalar_'):
+            n = int(weight_type.split('_')[1])
+            # explicit check instead of ``assert`` so it survives ``python -O``
+            if self.dim % n != 0:
+                raise ValueError('embedding dim {} is not divisible by {} (weight_type:{})'.format(
+                    self.dim, n, weight_type))
+            self.last_dim = n
+        else:
+            raise ValueError('unknown weight_type:{}'.format(weight_type))
+        weight_tensor = self._init_weight(mask_tensor.numpy(), self.last_dim, init_value,
+                                          learnable=learnable)
+        return weight_tensor, mask_tensor
+    def push(self, layer):
+        """Store ``layer`` as the next history entry.
+
+        The first entry goes through ``layer_norms[0]``; later entries are
+        normalized here only in the pre-norm setting.
+        """
+        self.count += 1
+        # first layer
+        if self.count == 1:
+            self.layers.append(self.layer_norms[0](layer))
+            # ``weight_mask`` is a plain tensor, so it does not follow the
+            # module across devices; keep it next to the weights it masks
+            # (works for CPU as well as any CUDA device).
+            if self.weight_mask.device != self.weight.device:
+                self.weight_mask = self.weight_mask.to(self.weight.device)
+            if self.normalize_learned_weight:
+                # masked entries get -inf so that softmax assigns them zero weight
+                weight = self.weight.masked_fill((self.weight_mask == 0).unsqueeze(2), float('-inf'))
+                self.normalized_weight = F.softmax(weight, dim=1)
+            return
+        # following layer
+        if self.normalize_before:
+            layer = self.layer_norms[self.count-1](layer)
+        self.layers.append(layer)
+    def _pick_weights(self):
+        """Return the weights of the current row, shaped ``count x 1 x 1 x last_dim``."""
+        weight = self.normalized_weight if self.normalize_learned_weight else self.weight
+        weight = weight[self.count - 1, : self.count, :].view(-1, 1, 1, self.last_dim)
+        return weight
+    def pop(self):
+        """Return the weighted combination of all stored entries.
+
+        In the post-norm setting the sum is passed through
+        ``layer_norms[count - 1]``; dropout with ``history_dropout`` is applied
+        last when it is positive.
+        """
+        assert len(self.layers) > 0
+        # D x 1 x 1 x [1, H/G, H]
+        weights = self._pick_weights()
+        # D x T x B x H
+        layers = torch.stack(self.layers, 0)
+        # linear combination
+        if self.weight_type in ['scalar', 'vector']:
+            ret = (layers * weights).sum(0)
+        else:
+            # scalar_X: split the last dimension into chunks of size X
+            D, T, B, H = layers.size()
+            layers = layers.view(D, T, B, -1, weights.size(-1))
+            weights = weights.unsqueeze(3)
+            ret = (layers * weights).sum(0).view(T, B, H)
+        if not self.normalize_before:
+            ret = self.layer_norms[self.count-1](ret)
+        if self.out_dropout > 0:
+            ret = F.dropout(ret, p=self.out_dropout, training=self.training)
+        return ret
+    def clean(self):
+        """Reset the push counter and drop all stored entries."""
+        self.count = 0
+        self.layers = []
+    def forward(self):
+        """No-op; the module is driven through ``push`` and ``pop``."""
+        pass
--- a/fairseq/modules/layer_history.py
+++ b/fairseq/modules/layer_history.py
-import torch
-import torch.nn as nn
-from fairseq.modules.layer_norm import LayerNorm
-import queue
-import numpy as np
-def CreateLayerHistory(args, is_encoder):
-    history_type = args.encoder_history_type if is_encoder else args.decoder_history_type
-    normalize_before = args.encoder_normalize_before if is_encoder else args.decoder_normalize_before
-    layer_num = args.encoder_layers if is_encoder else args.decoder_layers
-    dim = args.encoder_embed_dim if is_encoder else args.decoder_embed_dim
-    if history_type is None:
-        return None
-    elif history_type == "residual":
-        return ResidualLayerHistory(normalize_before, layer_num, dim, is_encoder)
-    elif history_type == "dense":
-        integration_type = getattr(args, 'encoder_integration_type', 'avg') if is_encoder else \
-            getattr(args, 'decoder_integration_type', 'avg')
-        windows_size = getattr(args, 'encoder_windows_size', -1) if is_encoder else \
-            getattr(args, 'decoder_windows_size', -1)
-        return DenseLayerHistory(normalize_before, layer_num, dim, is_encoder, integration_type, windows_size)
-    elif history_type == "learnable_dense":
-        return LearnableDenseLayerHistory(normalize_before, layer_num, dim, is_encoder)
-    elif history_type == "learnable_dense_mask":
-        return LearnableDenseMaskLayerHistory(normalize_before, layer_num, dim, is_encoder)
-    elif history_type == "learnable_dense_nonorm":
-        return LearnableDenseNoNormLayerHistory(normalize_before, layer_num, dim, is_encoder)
-    elif history_type == "gru":
-        return GruLayerHistory(normalize_before, layer_num, dim, is_encoder)
-    else:
-        raise ValueError
-class BaseLayerHistory(nn.Module):
-    def __init__(self, normalize_before, layer_num, dim, is_encoder):
-        super(BaseLayerHistory, self).__init__()
-        self.is_encoder = is_encoder
-        self.normalize_before = normalize_before
-        # the first layer (aka. embedding layer) does not have layer normalization
-        self.layer_norms = nn.ModuleList(LayerNorm(dim) for _ in range(layer_num))
-    def add(self, layer):
-        raise NotImplemented
-    def pop(self):
-        raise NotImplemented
-    def clean(self):
-        raise NotImplemented
-class ResidualLayerHistory(BaseLayerHistory):
-    """
-    x_n = x_{n-1} + y_{n-1}
-    """
-    def __init__(self, normalize_before, layer_num, dim, is_encoder):
-        super(ResidualLayerHistory, self).__init__(normalize_before, layer_num, dim, is_encoder)
-        self.count = 0
-        self.x = None
-        self.y = None
-    def add(self, layer):
-        if self.x is None:
-            self.x = layer
-            self.count += 1
-            return
-        self.count += 1
-        if self.normalize_before:
-            self.y = self.layer_norms[self.count - 2](layer)
-        else:
-            self.y = layer
-    def pop(self):
-        assert self.x is not None
-        if self.y is None:
-            return self.x
-        ret = self.x + self.y
-        if not self.normalize_before:
-            ret = self.layer_norms[self.count - 2](ret)
-        self.x = ret
-        return ret
-    def clean(self):
-        self.x = None
-        self.y = None
-        self.count = 0
-class DenseLayerHistory(BaseLayerHistory):
-    """
-    x_n = (x_1 + y_1 + y_2 + ... y_{n-1}) / n
-
-    Keeps a running sum of every tensor passed to add(). pop() returns that
-    sum directly when ``integration_type == 'sum'`` and otherwise divides it
-    by the number of contributing entries. When ``windows_size > 0`` only the
-    most recent ``windows_size`` entries contribute: older ones are
-    subtracted from the sum as new ones arrive.
-    """
-    def __init__(self, normalize_before, layer_num, dim, is_encoder, integration_type, windows_size):
-        super(DenseLayerHistory, self).__init__(normalize_before, layer_num, dim, is_encoder)
-        self.sum = None
-        self.count = 0
-        self.individuals = None  # store past individual value, used for windows_size > 0
-        self.integration_type = integration_type
-        # windows = 1 means not use residual connection
-        self.windows_size = windows_size
-        # NOTE(review): only windows_size == -1 (no window) and windows_size > 0
-        # are handled consistently; with 0 or a value below -1 no queue is
-        # created, yet add() would still call self.individuals.get() -- confirm
-        # that callers never pass such values.
-        if self.windows_size > 0:
-            assert self.windows_size <= 1 + layer_num
-            # Bounded FIFO of the entries currently inside the window.
-            self.individuals = queue.Queue(self.windows_size)
-    def add(self, layer):
-        """Add ``layer`` to the running sum, evicting the oldest entry if the window is full."""
-        self.count += 1
-        # first layer
-        if self.sum is None:
-            self.sum = layer
-            if self.individuals is not None:
-                self.individuals.put(layer)
-            return
-        # following layer
-        if self.normalize_before:
-            layer = self.layer_norms[self.count - 2](layer)
-        self.sum = self.sum + layer
-        # Evict before inserting so the bounded queue never exceeds its size
-        # (put() on a full queue.Queue would block).
-        if self.windows_size != -1 and self.count > self.windows_size:
-            self.sum = self.sum - self.individuals.get()
-        if self.individuals is not None:
-            self.individuals.put(layer)
-    def pop(self):
-        """Return the integrated history: the raw sum, or the sum averaged over its entries."""
-        assert self.sum is not None
-        if self.integration_type == 'sum':
-            ret = self.sum
-        else:
-            if self.windows_size == -1:
-                ret = self.sum / self.count
-            else:
-                # At most windows_size entries are inside the sum at any time.
-                ret = self.sum / min(self.count, self.windows_size)
-        # The first entry and the pre-norm configuration are returned without
-        # a layer norm; otherwise the norm is applied to the integrated value.
-        if self.count == 1 or self.normalize_before:
-            return ret
-        return self.layer_norms[self.count - 2](ret)
-    def clean(self):
-        """Reset the sum, the counter and (if present) the window queue."""
-        self.sum = None
-        self.count = 0
-        if self.individuals is not None:
-            # Empties the underlying deque of the queue.Queue in place.
-            self.individuals.queue.clear()
-class LearnableDenseLayerHistory(BaseLayerHistory):
-    """Dense history whose mixing coefficients are learned.
-
-    ``weight`` is a lower-triangular (layer_num + 1) x (layer_num + 1)
-    parameter. Row ``n - 1`` holds the coefficients pop() applies to the
-    first ``n`` stored entries, and every row starts out as a uniform
-    average (each row sums to 1).
-    """
-    def __init__(self, normalize_before, layer_num, dim, is_encoder):
-        super(LearnableDenseLayerHistory, self).__init__(normalize_before, layer_num, dim, is_encoder)
-        self.sum = None
-        self.count = 0
-        self.layer_num = 1 + layer_num
-        # Lower-triangular ones, then row-normalised so each row sums to 1.
-        tril_ones = torch.Tensor(self.layer_num, self.layer_num).fill_(1.0).tril()
-        self.weight = nn.Parameter(tril_ones)
-        row_totals = self.weight.data.sum(1, keepdim=True)
-        self.weight.data = self.weight.data / row_totals
-        self.layers = []
-    def extra_repr(self):
-        """Describe the number of mixed entries for the module repr."""
-        return 'n_layers={layer_num}, '.format(layer_num=self.layer_num)
-    def add(self, layer):
-        """Append ``layer`` to the stored entries (layer-normed first in pre-norm mode, except the first)."""
-        self.count += 1
-        if self.sum is None:
-            # ``sum`` only serves as the "first entry seen" marker here.
-            self.sum = layer
-        elif self.normalize_before:
-            layer = self.layer_norms[self.count - 2](layer)
-        self.layers.append(layer)
-    def pop(self):
-        """Return the weighted combination of all stored entries."""
-        assert len(self.layers) > 0
-        stacked = torch.stack(self.layers, 0)
-        # One scalar per stored entry, broadcast over the remaining three dims.
-        coeffs = self.weight[self.count - 1, : self.count].view(-1, 1, 1, 1)
-        mixed = (stacked * coeffs).sum(0)
-        if self.count != 1 and not self.normalize_before:
-            return self.layer_norms[self.count - 2](mixed)
-        return mixed
-    def clean(self):
-        """Drop all stored entries and reset the counter."""
-        self.layers = []
-        self.count = 0
-        self.sum = None
-    def get_loss(self):
-        """Penalty that is zero when every row of ``weight`` sums to 1."""
-        row_sums = self.weight.sum(1)
-        return (0.5 * (row_sums - 1.0) ** 2).mean()
-class LearnableDenseMaskLayerHistory(BaseLayerHistory):
-    """Learnable dense history that also loads a mask matrix from disk.
-
-    ``weight`` is a lower-triangular (layer_num + 1) x (layer_num + 1)
-    parameter whose rows start as uniform averages. Row ``n - 1`` gives the
-    coefficients pop() applies to the first ``n`` stored entries.
-
-    NOTE(review): ``weight_mask`` is loaded but never applied anywhere in
-    this class as written -- confirm whether it is consumed elsewhere.
-    """
-    def __init__(self, normalize_before, layer_num, dim, is_encoder):
-        super(LearnableDenseMaskLayerHistory, self).__init__(normalize_before, layer_num, dim, is_encoder)
-        self.sum = None
-        self.count = 0
-        self.layer_num = 1 + layer_num
-        # The mask files are resolved relative to the current working directory.
-        if is_encoder:
-            self.weight_mask = np.loadtxt("encoder_mask.txt", dtype=float, delimiter=' ')
-        else:
-            self.weight_mask = np.loadtxt("decoder_mask.txt", dtype=float, delimiter=' ')
-        self.weight = nn.Parameter(torch.Tensor(self.layer_num, self.layer_num).fill_(1.0).tril())
-        self.weight.data = self.weight.data / self.weight.data.sum(1, keepdim=True)
-        # add() and pop() both use this buffer. It used to be created only by
-        # clean(), so the first add() on a fresh instance raised AttributeError.
-        self.layers = []
-    def add(self, layer):
-        """Append ``layer`` to the stored entries (layer-normed first in pre-norm mode, except the first)."""
-        self.count += 1
-        # first layer
-        if self.sum is None:
-            self.sum = layer
-            self.layers.append(layer)
-            return
-        # following layer
-        if self.normalize_before:
-            layer = self.layer_norms[self.count - 2](layer)
-        self.layers.append(layer)
-    def pop(self):
-        """Return the weighted combination of all stored entries."""
-        assert len(self.layers) > 0
-        ret = (torch.stack(self.layers, 0) * self.weight[self.count - 1, : self.count].view(-1, 1, 1, 1)).sum(0)
-        if self.count == 1 or self.normalize_before:
-            return ret
-        return self.layer_norms[self.count - 2](ret)
-    def clean(self):
-        """Drop all stored entries and reset the counter."""
-        self.sum = None
-        self.count = 0
-        self.layers = []
-    def get_loss(self):
-        """Penalty that is zero when every row of ``weight`` sums to 1."""
-        return (0.5 * (self.weight.sum(1) - 1.0) ** 2).mean()
-class LearnableDenseNoNormLayerHistory(BaseLayerHistory):
-    """Learnable dense history without any layer normalisation.
-
-    ``weight`` is a lower-triangular (layer_num + 1) x (layer_num + 1)
-    parameter whose rows start as uniform averages. Row ``n - 1`` gives the
-    coefficients pop() applies to the first ``n`` stored entries. The layer
-    norms are discarded in ``__init__``, so neither add() nor pop()
-    normalises anything.
-    """
-    def __init__(self, normalize_before, layer_num, dim, is_encoder):
-        super(LearnableDenseNoNormLayerHistory, self).__init__(normalize_before, layer_num, dim, is_encoder)
-        self.sum = None
-        self.count = 0
-        self.layer_num = 1 + layer_num
-        self.weight = nn.Parameter(torch.Tensor(self.layer_num, self.layer_num).fill_(1.0).tril())
-        self.weight.data = self.weight.data / self.weight.data.sum(1, keepdim=True)
-        self.layers = []
-        # This variant deliberately carries no layer norms.
-        self.layer_norms = None
-    def add(self, layer):
-        """Append ``layer`` unchanged to the stored entries."""
-        self.count += 1
-        if self.sum is None:
-            # ``sum`` only serves as the "first entry seen" marker here.
-            self.sum = layer
-        self.layers.append(layer)
-    def pop(self):
-        """Return the weighted combination of all stored entries, un-normalised."""
-        assert len(self.layers) > 0
-        # layer_norms is None in this class. The old post-norm branch indexed
-        # it and raised TypeError whenever normalize_before was False and more
-        # than one entry was stored, so the combination is returned as-is.
-        return (torch.stack(self.layers, 0) * self.weight[self.count - 1, : self.count].view(-1, 1, 1, 1)).sum(0)
-    def clean(self):
-        """Drop all stored entries and reset the counter."""
-        self.sum = None
-        self.count = 0
-        self.layers = []
-class GruLayerHistory(BaseLayerHistory):
-    """Layer history that fuses successive layer outputs through one shared GRU cell.
-
-    compute_gru() treats each new layer output as the GRU input and the
-    previously stored state as the hidden state. The state after the final
-    layer is stored but not returned.
-    """
-    def __init__(self, normalize_before, layer_num, dim, is_encoder):
-        super(GruLayerHistory, self).__init__(normalize_before, layer_num, dim, is_encoder)
-        self.count = 0
-        # GRUCell requires both input_size and hidden_size; omitting the second
-        # raised TypeError at construction. compute_gru() flattens the input
-        # and the previous state to the same trailing size, so both are ``dim``.
-        self.gru = nn.GRUCell(dim, dim)
-        self.gru_cells = []
-        self.layer_norms = nn.ModuleList(LayerNorm(dim) for _ in range(layer_num + 1))
-        self.decoder_layers = layer_num
-    def compute_gru(self, layer_output):
-        """Fold ``layer_output`` into the GRU state and return its layer-normed value.
-
-        The first call only stores ``layer_output`` and returns it through
-        ``layer_norms[0]``. Each later call runs one GRU step, stores the new
-        state and returns it through ``layer_norms[count]``, except that
-        ``None`` is returned once ``count`` reaches ``layer_num``.
-        """
-        if len(self.gru_cells) == 0:
-            self.gru_cells.append(layer_output)
-            return self.layer_norms[self.count](layer_output)
-        self.count += 1
-        prev_h = self.gru_cells[-1]
-        # GRUCell works on 2-D (N, H) tensors: merge the two leading dims,
-        # then restore them on the result.
-        L, B, H = layer_output.size()
-        layer_output = torch.reshape(layer_output, (-1, H))
-        prev_h = torch.reshape(prev_h, (-1, H))
-        h = self.gru(layer_output, prev_h).view(L, B, H)
-        self.gru_cells.append(h)
-        if self.count != self.decoder_layers:
-            return self.layer_norms[self.count](h)
-        else:
-            return None
-    def clean(self):
-        """Drop all stored GRU states and reset the layer counter."""
-        self.gru_cells = []
-        self.count = 0
--- a/fairseq/modules/squeeze_excitation.py
+++ b/fairseq/modules/squeeze_excitation.py
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import torch.nn as nn
+class SEAttention(nn.Module):
+    """Squeeze-and-excitation gate over the last input dimension.
+
+    forward() averages the input over its first dimension, maps the result
+    through a bottleneck MLP ending in a sigmoid, and rescales the input with
+    the per-channel gate. The last input dimension must equal ``channel``.
+    The input is presumably laid out as (seq_len, batch, channel) -- confirm
+    against the caller.
+    """
+    def __init__(self, channel=512, reduction=16):
+        super().__init__()
+        bottleneck = channel // reduction
+        self.avg_pool = nn.AdaptiveAvgPool1d(1)
+        self.fc = nn.Sequential(
+            nn.Linear(channel, bottleneck, bias=False),
+            nn.ReLU(inplace=True),
+            nn.Linear(bottleneck, channel, bias=False),
+            nn.Sigmoid(),
+        )
+        self.init_weights()
+    def init_weights(self):
+        """Xavier-initialise every linear weight and zero any bias that exists."""
+        linear_layers = (m for m in self.modules() if isinstance(m, nn.Linear))
+        for linear in linear_layers:
+            nn.init.xavier_uniform_(linear.weight)
+            if linear.bias is not None:
+                nn.init.constant_(linear.bias, 0)
+    def forward(self, x):
+        """Return ``x`` rescaled by its channel gate, in the same dimension order as the input."""
+        # (d0, d1, d2) -> (d1, d2, d0) so the pooled axis comes last.
+        feats = x.permute(1, 2, 0)
+        batch, channels, _ = feats.size()
+        squeezed = self.avg_pool(feats).view(batch, channels)
+        gate = self.fc(squeezed).view(batch, channels, 1)
+        gated = feats * gate.expand_as(feats)
+        # Restore the original dimension order.
+        return gated.permute(2, 0, 1)
--- a/fairseq/modules/transformer_layer.py
+++ b/fairseq/modules/transformer_layer.py
@@ -14,6 +14,7 @@ from fairseq.modules import (
    RelPositionMultiheadAttention,
    RelativeMultiheadAttention,
    LocalMultiheadAttention,
+    SEAttention,
 )
 from fairseq.modules.fairseq_dropout import FairseqDropout
 from fairseq.modules.quant_noise import quant_noise
@@ -73,6 +74,10 @@ class TransformerEncoderLayer(nn.Module):
        self.final_layer_norm = LayerNorm(self.embed_dim)
+        self.use_se = getattr(args, "squeeze_excitation", False)
+        if self.use_se:
+            self.se_attn = SEAttention(self.embed_dim, 16)
    def build_fc1(self, input_dim, output_dim, q_noise, qn_block_size):
        return quant_noise(
            nn.Linear(input_dim, output_dim), p=q_noise, block_size=qn_block_size
@@ -211,6 +216,11 @@ class TransformerEncoderLayer(nn.Module):
        x = self.activation_fn(self.fc1(x))
        x = self.activation_dropout_module(x)
        x = self.fc2(x)
+        # use squeeze-and-excitation method
+        if self.use_se:
+            x = self.se_attn(x)
        x = self.dropout_module(x)
        x = self.residual_connection(x, residual)
        if not self.normalize_before: