Commit d4255246 by xuchen

optimize the implementation of the Efficient Conformer

parent 0bd92062
File mode changed from 100644 to 100755
@@ -12,7 +12,7 @@ criterion: label_smoothed_cross_entropy_with_ctc
 label_smoothing: 0.1
 subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
 subsampling-filter: 2048
 subsampling-kernel: 5
 subsampling-stride: 2
...
@@ -12,7 +12,7 @@ criterion: label_smoothed_cross_entropy_with_ctc
 label_smoothing: 0.1
 subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
 subsampling-filter: 2048
 subsampling-kernel: 5
 subsampling-stride: 2
...
@@ -29,7 +29,7 @@ criterion: label_smoothed_cross_entropy_with_ctc
 label_smoothing: 0.1
 subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
 subsampling-filter: 2048
 subsampling-kernel: 5
 subsampling-stride: 2
...
File mode changed from 100644 to 100755
@@ -12,7 +12,7 @@ criterion: label_smoothed_cross_entropy_with_ctc
 label_smoothing: 0.1
 subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
 subsampling-filter: 1024
 subsampling-kernel: 5
 subsampling-stride: 2
...
@@ -12,7 +12,7 @@ criterion: label_smoothed_cross_entropy_with_ctc
 label_smoothing: 0.1
 subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
 subsampling-filter: 2048
 subsampling-kernel: 5
 subsampling-stride: 2
...
@@ -12,7 +12,7 @@ zero_infinity: True
 post-process: sentencepiece
 subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
 subsampling-filter: 1024
 subsampling-kernel: 5
 subsampling-stride: 2
...
 arch: transformer
-share-all-embeddings: True
+share-decoder-input-output-embed: True
 optimizer: adam
 clip-norm: 10.0
 lr-scheduler: inverse_sqrt
...
 arch: transformer
-share-all-embeddings: True
+share-decoder-input-output-embed: True
 optimizer: adam
 clip-norm: 10.0
 lr-scheduler: inverse_sqrt
...
@@ -44,10 +44,10 @@ lcrm=1
 tokenizer=0
 use_specific_dict=1
-specific_prefix=st
-specific_dir=${root_dir}/data/mustc/st
-src_vocab_prefix=spm_unigram10000_st_share
-tgt_vocab_prefix=spm_unigram10000_st_share
+specific_prefix=asr5k_st10k
+specific_dir=${root_dir}/data/${dataset}/st_lcrm_asr
+src_vocab_prefix=spm_unigram5000_asr
+tgt_vocab_prefix=spm_unigram10000_st
 org_data_dir=${root_dir}/data/${dataset}
 data_dir=${root_dir}/data/${dataset}/mt
@@ -82,7 +82,6 @@ len_penalty=1.0
 if [[ ${use_specific_dict} -eq 1 ]]; then
     exp_prefix=${exp_prefix}_${specific_prefix}
     data_dir=${data_dir}/${specific_prefix}
-    mkdir -p ${data_dir}
 else
     if [[ "${tgt_vocab_type}" == "char" ]]; then
         vocab_name=char
@@ -159,6 +158,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         cmd="$cmd
         --share"
     fi
+
     echo -e "\033[34mRun command: \n${cmd} \033[0m"
     [[ $eval -eq 1 ]] && eval ${cmd}
 else
@@ -171,13 +171,15 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     for split in ${train_subset} ${valid_subset} ${trans_subset}; do
     {
         if [[ -d ${org_data_dir}/data/${split}/txt ]]; then
-            txt_dir=${org_data_dir}/data/${split}/txt
+            text_dir=${org_data_dir}/data/${split}/txt
         else
-            txt_dir=${org_data_dir}/data/${split}
+            text_dir=${org_data_dir}/data/${split}
         fi
-        cmd="cat ${txt_dir}/${split}.${src_lang}"
+        src_text=${text_dir}/${split}.${src_lang}
+        tgt_text=${text_dir}/${split}.${tgt_lang}
+        cmd="cat ${src_text}"
         if [[ ${lcrm} -eq 1 ]]; then
-            cmd="python local/lower_rm.py ${org_data_dir}/data/${split}.${src_lang}"
+            cmd="python local/lower_rm.py ${src_text}"
         fi
         cmd="${cmd}
         | spm_encode --model ${data_dir}/${src_vocab_prefix}.model
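
Note: `local/lower_rm.py` itself is not part of this diff. Judging by the `lcrm` flag (lowercase + remove punctuation), its behavior is plausibly something like the sketch below; this implementation is an assumption, not the repository's actual script.

```python
# Hypothetical sketch of what local/lower_rm.py does (assumption):
# lowercase the text and strip punctuation, as the lcrm flag suggests.
import string
import sys

def lower_rm(line: str) -> str:
    # Lowercase, then remove ASCII punctuation.
    return line.lower().translate(str.maketrans("", "", string.punctuation))

if __name__ == "__main__":
    with open(sys.argv[1], encoding="utf-8") as f:
        for line in f:
            print(lower_rm(line.rstrip("\n")))
```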
@@ -190,7 +192,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         cmd="spm_encode
             --model ${data_dir}/${tgt_vocab_prefix}.model
             --output_format=piece
-            < ${txt_dir}/${split}.${tgt_lang}
+            < ${tgt_text}
             > ${data_dir}/data/${split}.${tgt_lang}"
         echo -e "\033[34mRun command: \n${cmd} \033[0m"
@@ -329,11 +331,12 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         mv tmp.log $log
     export CUDA_VISIBLE_DEVICES=${device}
-    cmd="nohup ${cmd} >> ${model_dir}/train.log 2>&1 &"
+    log=${model_dir}/train.log
+    cmd="nohup ${cmd} >> ${log} 2>&1 &"
     if [[ $eval -eq 1 ]]; then
         eval $cmd
         sleep 2s
-        tail -n "$(wc -l ${model_dir}/train.log | awk '{print $1+1}')" -f ${model_dir}/train.log
+        tail -n "$(wc -l ${log} | awk '{print $1+1}')" -f ${log}
     fi
 fi
 wait
...
@@ -6,17 +6,17 @@ gpu_num=1
 update_freq=1
 max_tokens=8192

-extra_tag=
-extra_parameter=
-#extra_tag="${extra_tag}"
-#extra_parameter="${extra_parameter} "

 exp_tag=baseline
 config_list=(base)

 # exp full name
 exp_name=

+extra_tag=
+extra_parameter=
+#extra_tag="${extra_tag}"
+#extra_parameter="${extra_parameter} "

 train_config=$(echo ${config_list[*]} | sed 's/ /,/g')

 cmd="./run.sh
...
File mode changed from 100644 to 100755
@@ -12,7 +12,7 @@ criterion: label_smoothed_cross_entropy_with_ctc
 label_smoothing: 0.1
 subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
 subsampling-filter: 1024
 subsampling-kernel: 5
 subsampling-stride: 2
...
@@ -15,7 +15,7 @@ encoder-normalize-before: True
 decoder-normalize-before: True
 subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
 subsampling-filter: 1024
 subsampling-kernel: 5
 subsampling-stride: 2
...
@@ -15,7 +15,7 @@ encoder-normalize-before: True
 decoder-normalize-before: True
 subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
 subsampling-filter: 2048
 subsampling-kernel: 5
 subsampling-stride: 2
...
@@ -14,7 +14,7 @@ label_smoothing: 0.1
 encoder-normalize-before: True
 decoder-normalize-before: True
 subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
 subsampling-filter: 2048
 subsampling-kernel: 5
 subsampling-stride: 2
...
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
@@ -13,7 +13,7 @@ criterion: label_smoothed_cross_entropy_with_ctc
 label_smoothing: 0.1
 subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
 subsampling-filter: 1024
 subsampling-kernel: 5
 subsampling-stride: 2
...
 arch: s2t_ctc
 encoder-type: pds

-#arch: pdss2t_transformer_s_8
 #pds-ctc: 0_1_1_0
 #intermedia-adapter: league
 #intermedia-ctc-weight: 1
-#encoder-attention-type: transfer
-#relative-pos-enc: True
-encoder-attention-type: rel_pos
+#encoder-attention-type: reduced
 #pds-attn-ds-ratios: 4_2_1_1
 #attention-reduced-method: pool
 #attention-reduced-q: True

-encoder-embed-dim: 256
-pds-stages: 4
-ctc-layer: 12
-pds-layers: 3_3_3_3
-pds-ratios: 2_2_1_2
-pds-fusion: True
+encoder-embed-dim: 240
+pds-stages: 3
+#ctc-layer: 15
+pds-layers: 4_5_6
+pds-ratios: 2_2_2
+pds-fusion: False
 pds-fusion-method: all_conv
-pds-embed-dims: 256_256_256_256
+pds-embed-dims: 120_168_240
 pds-ds-method: conv
 pds-embed-norm: True
-pds-position-embed: 1_1_1_1
-pds-kernel-sizes: 5_5_5_5
-pds-ffn-ratios: 8_8_8_8
-pds-attn-heads: 4_4_4_4
+pds-position-embed: 1_1_1
+pds-kernel-sizes: 3_3_3
+pds-ffn-ratios: 4_4_4
+pds-attn-heads: 4_4_4

-share-decoder-input-output-embed: True
 optimizer: adam
 clip-norm: 10.0
 lr-scheduler: inverse_sqrt
 warmup-init-lr: 1e-7
 warmup-updates: 10000
-lr: 2e-3
+lr: 0.0015
 adam_betas: (0.9,0.98)

 criterion: ctc
+post-process: sentencepiece

 dropout: 0.1
 activation-fn: relu
-encoder-ffn-embed-dim: 2048
-encoder-layers: 12
-decoder-layers: 6
-encoder-attention-heads: 4
-decoder-embed-dim: 256
-decoder-ffn-embed-dim: 2048
-decoder-attention-heads: 4
+encoder-layers: 15
+macaron-style: True
+use-cnn-module: True
+cnn-module-kernel: 15
+encoder-activation-fn: swish
+encoder-attention-type: rel_pos

+#load-pretrained-encoder-from:
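
Reviewer note: a quick arithmetic check of the new 3-stage schedule. `pds-ratios: 2_2_2` keeps the same overall 8x temporal downsampling as the old 4-stage `2_2_1_2`, the stage depths sum to the new `encoder-layers: 15`, and the last stage width equals `encoder-embed-dim: 240`:

```python
# Sanity-check the new progressive-downsampling schedule.
ratios = [2, 2, 2]        # pds-ratios
layers = [4, 5, 6]        # pds-layers
dims = [120, 168, 240]    # pds-embed-dims

total_ds = 1
for r in ratios:
    total_ds *= r
print(f"total temporal downsampling: {total_ds}x")  # 8x, same as old 2_2_1_2
print(f"encoder layers: {sum(layers)}")             # 15, matching encoder-layers
assert dims[-1] == 240                              # matches encoder-embed-dim
```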
@@ -13,7 +13,7 @@ zero_infinity: True
 post-process: sentencepiece
 subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
 subsampling-filter: 1024
 subsampling-kernel: 5
 subsampling-stride: 2
...
File mode changed from 100644 to 100755
@@ -13,7 +13,7 @@ criterion: label_smoothed_cross_entropy_with_ctc
 label_smoothing: 0.1
 subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
 subsampling-filter: 1024
 subsampling-kernel: 5
 subsampling-stride: 2
...
@@ -15,7 +15,7 @@ encoder-normalize-before: True
 decoder-normalize-before: True
 subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
 subsampling-filter: 1024
 subsampling-kernel: 5
 subsampling-stride: 2
...
@@ -15,7 +15,7 @@ encoder-normalize-before: True
 decoder-normalize-before: True
 subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
 subsampling-filter: 2048
 subsampling-kernel: 5
 subsampling-stride: 2
...
@@ -14,7 +14,7 @@ label_smoothing: 0.1
 encoder-normalize-before: True
 decoder-normalize-before: True
 subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
 subsampling-filter: 2048
 subsampling-kernel: 5
 subsampling-stride: 2
...
File mode changed from 100644 to 100755
 arch: s2t_ctc
+encoder-type: transformer

 optimizer: adam
 #clip-norm: 10.0
 lr-scheduler: inverse_sqrt
@@ -12,7 +14,7 @@ criterion: ctc
 post-process: sentencepiece
 subsampling-type: conv2d
-subsmapling-layers: 2
+subsampling-layers: 2
 subsampling-filter: 176
 subsampling-kernel: 3
 subsampling-stride: 2
...
arch: s2t_ctc
encoder-type: pds
#pds-ctc: 0_1_1_0
#intermedia-adapter: league
#intermedia-ctc-weight: 1
#encoder-attention-type: reduced
#pds-attn-ds-ratios: 4_2_1_1
#attention-reduced-method: pool
#attention-reduced-q: True
pds-stages: 3
pds-layers: 4_6_6
pds-ratios: -1_0_0
pds-conv-strides: 2_2_1
pds-fusion: False
pds-fusion-method: all_conv
pds-embed-dims: 180_256_360
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1
pds-kernel-sizes: 3_3_3
pds-ffn-ratios: 4_4_4
pds-attn-heads: 4_4_4
subsampling-type: conv2d
subsampling-layers: 1
subsampling-filter: 180
subsampling-kernel: 3
subsampling-stride: 2
subsampling-norm: batch2d
subsampling-activation: swish
optimizer: adam
#clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
weight-decay: 1e-6
lr: 0.0015
adam_betas: (0.9,0.98)
criterion: ctc
post-process: sentencepiece
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 360
encoder-layers: 15
encoder-attention-heads: 4
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-activation-fn: swish
encoder-attention-type: rel_pos
\ No newline at end of file
arch: s2t_ctc
encoder-type: pds
#pds-ctc: 0_1_1_0
#intermedia-adapter: league
#intermedia-ctc-weight: 1
#encoder-attention-type: reduced
#pds-attn-ds-ratios: 4_2_1_1
#attention-reduced-method: pool
#attention-reduced-q: True
pds-stages: 3
pds-layers: 5_5_5
pds-ratios: -1_0_0
pds-conv-strides: 2_2_1
pds-fusion: False
pds-fusion-method: all_conv
pds-embed-dims: 120_168_240
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1
pds-kernel-sizes: 3_3_3
pds-ffn-ratios: 4_4_4
pds-attn-heads: 4_4_4
subsampling-type: conv2d
subsampling-layers: 1
subsampling-filter: 120
subsampling-kernel: 3
subsampling-stride: 2
subsampling-norm: batch2d
subsampling-activation: swish
optimizer: adam
#clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
weight-decay: 1e-6
lr: 0.0015
adam_betas: (0.9,0.98)
criterion: ctc
post-process: sentencepiece
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 240
encoder-layers: 15
encoder-attention-heads: 4
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-activation-fn: swish
encoder-attention-type: rel_pos
\ No newline at end of file
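
Reviewer note: both new conv2d configs combine a single stride-2 `conv2d` front-end with `pds-conv-strides: 2_2_1`. Assuming each stride-2 step updates sequence lengths with the same floor formula the subsampling modules use below, that is again roughly an 8x temporal reduction. A small illustrative sketch (the stride interpretation is an assumption):

```python
import torch

def halve(x_len: torch.Tensor) -> torch.Tensor:
    # Length update for one stride-2 convolution, matching the
    # torch.div(x_len - 1, 2, rounding_mode='floor') + 1 used in this diff.
    return torch.div(x_len - 1, 2, rounding_mode="floor") + 1

x_len = torch.tensor([1000])
for _ in range(3):  # conv2d front-end stride 2, then stage strides 2_2 (last stage stride 1)
    x_len = halve(x_len)
print(x_len)  # tensor([125]) -> ~8x shorter
```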
@@ -12,7 +12,7 @@ criterion: label_smoothed_cross_entropy_with_ctc
 label_smoothing: 0.1
 subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
 subsampling-filter: 1024
 subsampling-kernel: 5
 subsampling-stride: 2
...
@@ -12,7 +12,7 @@ criterion: label_smoothed_cross_entropy_with_ctc
 label_smoothing: 0.1
 subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
 subsampling-filter: 2048
 subsampling-kernel: 5
 subsampling-stride: 2
...
@@ -11,7 +11,7 @@ criterion: ctc
 post-process: sentencepiece
 subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
 subsampling-filter: 1024
 subsampling-kernel: 5
 subsampling-stride: 2
...
@@ -38,14 +38,7 @@ criterion: ctc
 dropout: 0.1
 activation-fn: relu
-encoder-ffn-embed-dim: 2048
 encoder-layers: 12
-decoder-layers: 6
-encoder-attention-heads: 4
-
-decoder-embed-dim: 256
-decoder-ffn-embed-dim: 2048
-decoder-attention-heads: 4
 #load-pretrained-encoder-from:
 #load-pretrained-decoder-from:
\ No newline at end of file
@@ -38,10 +38,7 @@ post-process: sentencepiece
 dropout: 0.1
 activation-fn: relu
-encoder-ffn-embed-dim: 2048
 encoder-layers: 12
-decoder-layers: 6
-encoder-attention-heads: 4
 macaron-style: True
 use-cnn-module: True
...
arch: s2t_ctc
encoder-type: pds
#pds-ctc: 0_1_1_0
#intermedia-adapter: league
#intermedia-ctc-weight: 1
#encoder-attention-type: reduced
#pds-attn-ds-ratios: 4_2_1_1
#attention-reduced-method: pool
#attention-reduced-q: True
encoder-embed-dim: 240
pds-stages: 3
#ctc-layer: 15
pds-layers: 4_5_6
pds-ratios: 2_2_2
pds-fusion: False
pds-fusion-method: all_conv
pds-embed-dims: 120_168_240
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1
pds-kernel-sizes: 3_3_3
pds-ffn-ratios: 4_4_4
pds-attn-heads: 4_4_4
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 0.0015
adam_betas: (0.9,0.98)
criterion: ctc
post-process: sentencepiece
dropout: 0.1
activation-fn: relu
encoder-layers: 15
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-activation-fn: swish
encoder-attention-type: rel_pos
#load-pretrained-encoder-from:
File mode changed from 100644 to 100755
@@ -12,7 +12,7 @@ criterion: label_smoothed_cross_entropy_with_ctc
 label_smoothing: 0.1
 subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
 subsampling-filter: 1024
 subsampling-kernel: 5
 subsampling-stride: 2
...
@@ -12,7 +12,7 @@ criterion: label_smoothed_cross_entropy_with_ctc
 label_smoothing: 0.1
 subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
 subsampling-filter: 2048
 subsampling-kernel: 5
 subsampling-stride: 2
...
@@ -12,7 +12,7 @@ zero_infinity: True
 post-process: sentencepiece
 subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
 subsampling-filter: 1024
 subsampling-kernel: 5
 subsampling-stride: 2
...
File mode changed from 100644 to 100755
@@ -12,7 +12,7 @@ criterion: label_smoothed_cross_entropy_with_ctc
 label_smoothing: 0.1
 subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
 subsampling-filter: 1024
 subsampling-kernel: 5
 subsampling-stride: 2
...
@@ -15,7 +15,7 @@ encoder-normalize-before: True
 decoder-normalize-before: True
 subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
 subsampling-filter: 1024
 subsampling-kernel: 5
 subsampling-stride: 2
...
@@ -15,7 +15,7 @@ encoder-normalize-before: True
 decoder-normalize-before: True
 subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
 subsampling-filter: 2048
 subsampling-kernel: 5
 subsampling-stride: 2
...
@@ -14,7 +14,7 @@ label_smoothing: 0.1
 encoder-normalize-before: True
 decoder-normalize-before: True
 subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
 subsampling-filter: 2048
 subsampling-kernel: 5
 subsampling-stride: 2
...
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
@@ -2,21 +2,21 @@
 # training the model
-gpu_num=8
-update_freq=1
+gpu_num=4
+update_freq=4
 max_tokens=8192

-exp_tag=baseline
-config_list=(base)
-
-# exp full name
-exp_name=
-
 extra_tag=
 extra_parameter=
 #extra_tag="${extra_tag}"
 #extra_parameter="${extra_parameter} "

+exp_tag=baseline
+config_list=(deep)
+
+# exp full name
+exp_name=
+
 train_config=$(echo ${config_list[*]} | sed 's/ /,/g')

 cmd="./run.sh
...
File mode changed from 100644 to 100755
 #! /bin/bash
-# Processing WMT16 En-De Datasets
+# Processing WMT20 En-Zh Datasets
 # Copyright 2021 Natural Language Processing Laboratory
 # Xu Chen (xuchenneu@163.com)
@@ -35,18 +35,19 @@ lang=${src_lang}-${tgt_lang}
 dataset=wmt20
 task=translation
-vocab_type=unigram
-vocab_size=32000
+src_vocab_type=unigram
+tgt_vocab_type=unigram
+src_vocab_size=32000
+tgt_vocab_size=32000
 share_dict=0
 lcrm=1
 tokenizer=1

-use_specific_dict=0
-subword=0
-specific_prefix=subword32000_share
-specific_dir=${root_dir}/data/mustc/st
-src_vocab_prefix=spm_unigram10000_st_share
-tgt_vocab_prefix=spm_unigram10000_st_share
+use_specific_dict=1
+specific_prefix=asr5k_st10k
+specific_dir=${root_dir}/data/iwslt2022/st_lcrm_asr
+src_vocab_prefix=spm_unigram5000_asr
+tgt_vocab_prefix=spm_unigram10000_st

 org_data_dir=${root_dir}/data/${dataset}
 data_dir=${root_dir}/data/${dataset}/mt
@@ -81,17 +82,24 @@ len_penalty=1.0
 if [[ ${use_specific_dict} -eq 1 ]]; then
     exp_prefix=${exp_prefix}_${specific_prefix}
     data_dir=${data_dir}/${specific_prefix}
-    mkdir -p ${data_dir}
 else
-    if [[ "${vocab_type}" == "char" ]]; then
-        vocab_name=${vocab_type}
-        exp_prefix=${exp_prefix}_${vocab_type}
+    if [[ "${tgt_vocab_type}" == "char" ]]; then
+        vocab_name=char
+        exp_prefix=${exp_prefix}_char
     else
-        vocab_name=${vocab_type}${vocab_size}
+        if [[ ${src_vocab_size} -ne ${tgt_vocab_size} || "${src_vocab_type}" -ne "${tgt_vocab_type}" ]]; then
+            src_vocab_name=${src_vocab_type}${src_vocab_size}
+            tgt_vocab_name=${tgt_vocab_type}${tgt_vocab_size}
+            vocab_name=${src_vocab_name}_${tgt_vocab_name}
+        else
+            vocab_name=${tgt_vocab_type}${tgt_vocab_size}
+            src_vocab_name=${vocab_name}
+            tgt_vocab_name=${vocab_name}
+        fi
     fi
     data_dir=${data_dir}/${vocab_name}
-    src_vocab_prefix=spm_${vocab_name}_${src_lang}
-    tgt_vocab_prefix=spm_${vocab_name}_${tgt_lang}
+    src_vocab_prefix=spm_${src_vocab_name}_${src_lang}
+    tgt_vocab_prefix=spm_${tgt_vocab_name}_${tgt_lang}
     if [[ $share_dict -eq 1 ]]; then
         data_dir=${data_dir}_share
         src_vocab_prefix=spm_${vocab_name}_share
@@ -103,6 +111,9 @@ if [[ ${lcrm} -eq 1 ]]; then
     exp_prefix=${exp_prefix}_lcrm
 fi
 if [[ ${tokenizer} -eq 1 ]]; then
+    train_subset=${train_subset}.tok
+    valid_subset=${valid_subset}.tok
+    trans_subset=${trans_subset}.tok
     data_dir=${data_dir}_tok
     exp_prefix=${exp_prefix}_tok
 fi
@@ -139,16 +150,14 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --splits ${train_subset},${valid_subset},${trans_subset}
         --src-lang ${src_lang}
         --tgt-lang ${tgt_lang}
-        --vocab-type ${vocab_type}
-        --vocab-size ${vocab_size}"
+        --src-vocab-type ${src_vocab_type}
+        --tgt-vocab-type ${tgt_vocab_type}
+        --src-vocab-size ${src_vocab_size}
+        --tgt-vocab-size ${tgt_vocab_size}"
     if [[ $share_dict -eq 1 ]]; then
         cmd="$cmd
         --share"
     fi
-    if [[ ${tokenizer} -eq 1 ]]; then
-        cmd="$cmd
-        --tokenizer"
-    fi
     echo -e "\033[34mRun command: \n${cmd} \033[0m"
     [[ $eval -eq 1 ]] && eval ${cmd}
@@ -168,10 +177,6 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         fi
         src_text=${text_dir}/${split}.${src_lang}
         tgt_text=${text_dir}/${split}.${tgt_lang}
-        if [[ ${tokenizer} -eq 1 ]]; then
-            src_text=${text_dir}/${split}.tok.${src_lang}
-            tgt_text=${text_dir}/${split}.tok.${tgt_lang}
-        fi
         cmd="cat ${src_text}"
         if [[ ${lcrm} -eq 1 ]]; then
             cmd="python local/lower_rm.py ${src_text}"
@@ -327,16 +332,14 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     export CUDA_VISIBLE_DEVICES=${device}
     log=${model_dir}/train.log
     cmd="nohup ${cmd} >> ${log} 2>&1 &"
     if [[ $eval -eq 1 ]]; then
         eval $cmd
         sleep 2s
         tail -n "$(wc -l ${log} | awk '{print $1+1}')" -f ${log}
     fi
-    wait
-    echo -e " >> finish training \n"
 fi
+wait

 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     echo "stage 2: MT Decoding"
@@ -381,15 +384,8 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
         --results-path ${model_dir}
         --max-tokens ${max_tokens}
         --beam ${beam_size}
-        --lenpen ${len_penalty}"
-    if [[ ${subword} -eq 1 ]]; then
-        cmd="${cmd}
-        --post-process subword_nmt"
-    else
-        cmd="${cmd}
-        --post-process sentencepiece"
-    fi
+        --lenpen ${len_penalty}
+        --post-process sentencepiece"
     if [[ ${sacrebleu} -eq 1 ]]; then
         cmd="${cmd}
...
@@ -2,8 +2,8 @@
 # training the model
-gpu_num=8
-update_freq=2
+gpu_num=4
+update_freq=4
 max_tokens=8192

 exp_tag=baseline
...
@@ -620,9 +620,9 @@ class S2TTransformerEncoder(FairseqEncoder):
             self.add_to_dict(src_tokens.transpose(0, 1), dis, cos_sim_idx)

         # down-sampling
-        x, input_lengths = self.subsample(src_tokens, src_lengths)
         # (B, T, D) -> (T, B, D)
-        x = x.transpose(0, 1)
+        x = src_tokens.transpose(0, 1)
+        x, input_lengths = self.subsample(x, src_lengths)

         # embedding scaling
         x = self.embed_scale * x
...
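
Reviewer note: after this change the encoder transposes to time-first before subsampling, matching the updated `Conv1dSubsampling`/`Conv2dSubsampling` modules below, which now consume and return `(T, B, D)`. A toy illustration of the new calling convention (shapes are placeholders):

```python
import torch

# Illustrative shapes only; self.subsample is the Conv1dSubsampling /
# Conv2dSubsampling module updated below to take time-first input.
B, T, D = 4, 100, 80
src_tokens = torch.randn(B, T, D)

x = src_tokens.transpose(0, 1)   # (B, T, D) -> (T, B, D)
assert x.shape == (T, B, D)
# x, input_lengths = self.subsample(x, src_lengths)  # now expects (T, B, D)
```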
@@ -205,7 +205,7 @@ class LegacyRelPositionMultiHeadedAttention(RelPositionMultiHeadedAttention):
     Args:
         n_head (int): The number of heads.
         n_feat (int): The number of features.
-        dropout_rate (float): Dropout rate.
+        dropout (float): Dropout rate.
         zero_triu (bool): Whether to zero the upper triangular part of attention matrix.
     """

     def __init__(self, n_feat, n_head, dropout, zero_triu=False):
...
@@ -104,6 +104,7 @@ class PDSTransformerEncoderLayer(nn.Module):
             self.final_norm = LayerNorm(expand_embed_dim)

         # Convolution Residual
+        self.conv_stride = conv_stride
         self.conv_res = nn.Sequential(
             Permute3D(1, 2, 0),
             nn.Conv1d(embed_dim, expand_embed_dim, kernel_size=1, stride=conv_stride),
@@ -322,7 +323,7 @@ class PDSTransformerEncoderLayer(nn.Module):
             x = self.conv_module(x)
             x = x.transpose(0, 1)

-            x = residual + x
+            x = self.conv_res(residual) + x

             if not self.normalize_before:
                 x = self.conv_norm(x)
...
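
Reviewer note: since a PDS stage may change both the temporal stride and the embedding width, the identity residual around the convolution module is replaced by `conv_res`, a stride-matched 1x1 projection. A self-contained sketch of the idea (class and variable names here are illustrative, following the Permute3D/Conv1d pattern in the diff):

```python
import torch
import torch.nn as nn

class ConvResidual(nn.Module):
    """Project a (T, B, C_in) residual to (T', B, C_out) with a strided 1x1 conv."""
    def __init__(self, embed_dim: int, expand_embed_dim: int, conv_stride: int):
        super().__init__()
        self.proj = nn.Conv1d(embed_dim, expand_embed_dim,
                              kernel_size=1, stride=conv_stride)

    def forward(self, residual: torch.Tensor) -> torch.Tensor:
        # (T, B, C) -> (B, C, T) for Conv1d, then back to time-first.
        y = self.proj(residual.permute(1, 2, 0))
        return y.permute(2, 0, 1)

residual = torch.randn(100, 4, 120)   # (T, B, C)
out = ConvResidual(120, 168, 2)(residual)
print(out.shape)                      # torch.Size([50, 4, 168])
```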
@@ -144,8 +144,8 @@ class Conv1dSubsampling(nn.Module):

     def forward(self, x, x_len):
-        # (B, T, D) -> (B, D, T)
-        x = x.transpose(1, 2)
+        # (T, B, D) -> (B, D, T)
+        x = x.permute(1, 2, 0)

         # Layers
         for layer in self.layers:
             x = layer(x)
@@ -153,7 +153,9 @@ class Conv1dSubsampling(nn.Module):
         # Update Sequence Lengths
         if x_len is not None:
             x_len = torch.div(x_len - 1, 2, rounding_mode='floor') + 1
-        x = x.transpose(1, 2)
+
+        # (B, D, T) -> (T, B, D)
+        x = x.permute(2, 0, 1)

         return x, x_len
@@ -168,8 +170,8 @@ class Conv2dSubsampling(nn.Module):
         act: activation function

     Shape:
-        Input: (batch_size, in_length, in_dim)
-        Output: (batch_size, out_length, out_dim)
+        Input: (in_length, batch_size, in_dim)
+        Output: (out_length, batch_size, out_dim)
     """
@@ -199,8 +201,8 @@ class Conv2dSubsampling(nn.Module):

     def forward(self, x, x_len):
-        # (B, T, D) -> (B, D, T) -> (B, 1, D, T)
-        x = x.transpose(1, 2).unsqueeze(dim=1)
+        # (T, B, D) -> (B, D, T) -> (B, 1, D, T)
+        x = x.permute(1, 2, 0).unsqueeze(dim=1)

         # Layers
         for layer in self.layers:
@@ -212,17 +214,17 @@ class Conv2dSubsampling(nn.Module):
         # (B, C, D // S, T // S) -> (B, C * D // S, T // S)
         batch_size, channels, subsampled_dim, subsampled_length = x.size()
-        x = x.reshape(batch_size, channels * subsampled_dim, subsampled_length).transpose(1, 2)
+        x = x.reshape(batch_size, channels * subsampled_dim, subsampled_length).permute(2, 0, 1)
         x = self.linear(x)

         return x, x_len


-def subsampling(args):
+def subsampling(args, out_dim=None):
     subsampling_type = getattr(args, "subsampling_type", "conv1d")
     layers = getattr(args, "subsampling_layers", 2)
     in_dim = args.input_feat_per_channel * args.input_channels
-    filters = [getattr(args, "subsampling_filter")] + [args.encoder_embed_dim]
+    filters = [getattr(args, "subsampling_filter")] + [args.encoder_embed_dim if out_dim is None else out_dim]
     kernel_size = getattr(args, "subsampling_kernel", 5)
     stride = getattr(args, "subsampling_stride", 2)
     norm = getattr(args, "subsampling_norm", "none")
...
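
Reviewer note: the new `out_dim` argument lets a PDS encoder build its front-end at the first stage's width (e.g. 120 for `pds-embed-dims: 120_168_240`) instead of `encoder-embed-dim`. A minimal stand-in showing the changed filter computation (the helper name is hypothetical):

```python
from argparse import Namespace

def stage0_filters(args: Namespace, out_dim=None):
    # Mirrors the changed line: the last filter is encoder_embed_dim
    # unless an explicit out_dim is requested (used by PDS stages).
    return [args.subsampling_filter] + \
           [args.encoder_embed_dim if out_dim is None else out_dim]

args = Namespace(subsampling_filter=1024, encoder_embed_dim=240)
print(stage0_filters(args))               # [1024, 240] -> default behavior
print(stage0_filters(args, out_dim=120))  # [1024, 120] -> first PDS stage width
```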