Commit d4255246 by xuchen

optimize the implementation of the Efficient Conformer

parent 0bd92062
File mode changed from 100644 to 100755
@@ -12,7 +12,7 @@ criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
......
@@ -12,7 +12,7 @@ criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
......
@@ -29,7 +29,7 @@ criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
......
File mode changed from 100644 to 100755
@@ -12,7 +12,7 @@ criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
......
@@ -12,7 +12,7 @@ criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
......
@@ -12,7 +12,7 @@ zero_infinity: True
post-process: sentencepiece
subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
......
arch: transformer
share-all-embeddings: True
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
......
arch: transformer
share-all-embeddings: True
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
......
@@ -44,10 +44,10 @@ lcrm=1
tokenizer=0
use_specific_dict=1
-specific_prefix=st
-specific_dir=${root_dir}/data/mustc/st
-src_vocab_prefix=spm_unigram10000_st_share
-tgt_vocab_prefix=spm_unigram10000_st_share
+specific_prefix=asr5k_st10k
+specific_dir=${root_dir}/data/${dataset}/st_lcrm_asr
+src_vocab_prefix=spm_unigram5000_asr
+tgt_vocab_prefix=spm_unigram10000_st
org_data_dir=${root_dir}/data/${dataset}
data_dir=${root_dir}/data/${dataset}/mt
@@ -82,7 +82,6 @@ len_penalty=1.0
if [[ ${use_specific_dict} -eq 1 ]]; then
exp_prefix=${exp_prefix}_${specific_prefix}
data_dir=${data_dir}/${specific_prefix}
-mkdir -p ${data_dir}
else
if [[ "${tgt_vocab_type}" == "char" ]]; then
vocab_name=char
@@ -159,6 +158,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
cmd="$cmd
--share"
fi
+echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
else
@@ -171,13 +171,15 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
for split in ${train_subset} ${valid_subset} ${trans_subset}; do
{
if [[ -d ${org_data_dir}/data/${split}/txt ]]; then
-txt_dir=${org_data_dir}/data/${split}/txt
+text_dir=${org_data_dir}/data/${split}/txt
else
-txt_dir=${org_data_dir}/data/${split}
+text_dir=${org_data_dir}/data/${split}
fi
-cmd="cat ${txt_dir}/${split}.${src_lang}"
+src_text=${text_dir}/${split}.${src_lang}
+tgt_text=${text_dir}/${split}.${tgt_lang}
+cmd="cat ${src_text}"
if [[ ${lcrm} -eq 1 ]]; then
-cmd="python local/lower_rm.py ${org_data_dir}/data/${split}.${src_lang}"
+cmd="python local/lower_rm.py ${src_text}"
fi
cmd="${cmd}
| spm_encode --model ${data_dir}/${src_vocab_prefix}.model
@@ -190,7 +192,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
cmd="spm_encode
--model ${data_dir}/${tgt_vocab_prefix}.model
--output_format=piece
-< ${txt_dir}/${split}.${tgt_lang}
+< ${tgt_text}
> ${data_dir}/data/${split}.${tgt_lang}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
@@ -329,11 +331,12 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
mv tmp.log $log
export CUDA_VISIBLE_DEVICES=${device}
-cmd="nohup ${cmd} >> ${model_dir}/train.log 2>&1 &"
+log=${model_dir}/train.log
+cmd="nohup ${cmd} >> ${log} 2>&1 &"
if [[ $eval -eq 1 ]]; then
eval $cmd
sleep 2s
-tail -n "$(wc -l ${model_dir}/train.log | awk '{print $1+1}')" -f ${model_dir}/train.log
+tail -n "$(wc -l ${log} | awk '{print $1+1}')" -f ${log}
fi
fi
wait
......
@@ -6,17 +6,17 @@ gpu_num=1
update_freq=1
max_tokens=8192
-extra_tag=
-extra_parameter=
-#extra_tag="${extra_tag}"
-#extra_parameter="${extra_parameter} "
exp_tag=baseline
config_list=(base)
# exp full name
exp_name=
+extra_tag=
+extra_parameter=
+#extra_tag="${extra_tag}"
+#extra_parameter="${extra_parameter} "
train_config=$(echo ${config_list[*]} | sed 's/ /,/g')
cmd="./run.sh
......
File mode changed from 100644 to 100755
@@ -12,7 +12,7 @@ criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
......
@@ -15,7 +15,7 @@ encoder-normalize-before: True
decoder-normalize-before: True
subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
......
@@ -15,7 +15,7 @@ encoder-normalize-before: True
decoder-normalize-before: True
subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
......
@@ -14,7 +14,7 @@ label_smoothing: 0.1
encoder-normalize-before: True
decoder-normalize-before: True
subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
......
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
@@ -13,7 +13,7 @@ criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
......
arch: s2t_ctc
encoder-type: pds
#arch: pdss2t_transformer_s_8
#pds-ctc: 0_1_1_0
#intermedia-adapter: league
#intermedia-ctc-weight: 1
-#encoder-attention-type: transfer
-#relative-pos-enc: True
+encoder-attention-type: rel_pos
#encoder-attention-type: reduced
#pds-attn-ds-ratios: 4_2_1_1
#attention-reduced-method: pool
#attention-reduced-q: True
-encoder-embed-dim: 256
-pds-stages: 4
-ctc-layer: 12
-pds-layers: 3_3_3_3
-pds-ratios: 2_2_1_2
-pds-fusion: True
+encoder-embed-dim: 240
+pds-stages: 3
+#ctc-layer: 15
+pds-layers: 4_5_6
+pds-ratios: 2_2_2
+pds-fusion: False
pds-fusion-method: all_conv
-pds-embed-dims: 256_256_256_256
+pds-embed-dims: 120_168_240
pds-ds-method: conv
pds-embed-norm: True
-pds-position-embed: 1_1_1_1
-pds-kernel-sizes: 5_5_5_5
-pds-ffn-ratios: 8_8_8_8
-pds-attn-heads: 4_4_4_4
+pds-position-embed: 1_1_1
+pds-kernel-sizes: 3_3_3
+pds-ffn-ratios: 4_4_4
+pds-attn-heads: 4_4_4
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
-lr: 2e-3
+lr: 0.0015
adam_betas: (0.9,0.98)
criterion: ctc
post-process: sentencepiece
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
-encoder-layers: 12
-decoder-layers: 6
-encoder-attention-heads: 4
-decoder-embed-dim: 256
-decoder-ffn-embed-dim: 2048
-decoder-attention-heads: 4
+encoder-layers: 15
+macaron-style: True
+use-cnn-module: True
+cnn-module-kernel: 15
+encoder-activation-fn: swish
+encoder-attention-type: rel_pos
#load-pretrained-encoder-from:
@@ -13,7 +13,7 @@ zero_infinity: True
post-process: sentencepiece
subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
......
File mode changed from 100644 to 100755
@@ -13,7 +13,7 @@ criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
......
@@ -15,7 +15,7 @@ encoder-normalize-before: True
decoder-normalize-before: True
subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
......
@@ -15,7 +15,7 @@ encoder-normalize-before: True
decoder-normalize-before: True
subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
......
@@ -14,7 +14,7 @@ label_smoothing: 0.1
encoder-normalize-before: True
decoder-normalize-before: True
subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
......
File mode changed from 100644 to 100755
+arch: s2t_ctc
+encoder-type: transformer
optimizer: adam
#clip-norm: 10.0
lr-scheduler: inverse_sqrt
@@ -12,7 +14,7 @@ criterion: ctc
post-process: sentencepiece
subsampling-type: conv2d
-subsmapling-layers: 2
+subsampling-layers: 2
subsampling-filter: 176
subsampling-kernel: 3
subsampling-stride: 2
......
arch: s2t_ctc
encoder-type: pds
#pds-ctc: 0_1_1_0
#intermedia-adapter: league
#intermedia-ctc-weight: 1
#encoder-attention-type: reduced
#pds-attn-ds-ratios: 4_2_1_1
#attention-reduced-method: pool
#attention-reduced-q: True
pds-stages: 3
pds-layers: 4_6_6
pds-ratios: -1_0_0
pds-conv-strides: 2_2_1
pds-fusion: False
pds-fusion-method: all_conv
pds-embed-dims: 180_256_360
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1
pds-kernel-sizes: 3_3_3
pds-ffn-ratios: 4_4_4
pds-attn-heads: 4_4_4
subsampling-type: conv2d
subsampling-layers: 1
subsampling-filter: 180
subsampling-kernel: 3
subsampling-stride: 2
subsampling-norm: batch2d
subsampling-activation: swish
optimizer: adam
#clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
weight-decay: 1e-6
lr: 0.0015
adam_betas: (0.9,0.98)
criterion: ctc
post-process: sentencepiece
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 360
encoder-layers: 15
encoder-attention-heads: 4
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-activation-fn: swish
encoder-attention-type: rel_pos
\ No newline at end of file
arch: s2t_ctc
encoder-type: pds
#pds-ctc: 0_1_1_0
#intermedia-adapter: league
#intermedia-ctc-weight: 1
#encoder-attention-type: reduced
#pds-attn-ds-ratios: 4_2_1_1
#attention-reduced-method: pool
#attention-reduced-q: True
pds-stages: 3
pds-layers: 5_5_5
pds-ratios: -1_0_0
pds-conv-strides: 2_2_1
pds-fusion: False
pds-fusion-method: all_conv
pds-embed-dims: 120_168_240
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1
pds-kernel-sizes: 3_3_3
pds-ffn-ratios: 4_4_4
pds-attn-heads: 4_4_4
subsampling-type: conv2d
subsampling-layers: 1
subsampling-filter: 120
subsampling-kernel: 3
subsampling-stride: 2
subsampling-norm: batch2d
subsampling-activation: swish
optimizer: adam
#clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
weight-decay: 1e-6
lr: 0.0015
adam_betas: (0.9,0.98)
criterion: ctc
post-process: sentencepiece
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 240
encoder-layers: 15
encoder-attention-heads: 4
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-activation-fn: swish
encoder-attention-type: rel_pos
\ No newline at end of file
@@ -12,7 +12,7 @@ criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
......
@@ -12,7 +12,7 @@ criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
......
@@ -11,7 +11,7 @@ criterion: ctc
post-process: sentencepiece
subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
......
@@ -38,14 +38,7 @@ criterion: ctc
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
-encoder-layers: 12
-decoder-layers: 6
-encoder-attention-heads: 4
-decoder-embed-dim: 256
-decoder-ffn-embed-dim: 2048
-decoder-attention-heads: 4
#load-pretrained-encoder-from:
-#load-pretrained-decoder-from:
\ No newline at end of file
@@ -38,10 +38,7 @@ post-process: sentencepiece
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
-encoder-layers: 12
-decoder-layers: 6
-encoder-attention-heads: 4
macaron-style: True
use-cnn-module: True
......
arch: s2t_ctc
encoder-type: pds
#pds-ctc: 0_1_1_0
#intermedia-adapter: league
#intermedia-ctc-weight: 1
#encoder-attention-type: reduced
#pds-attn-ds-ratios: 4_2_1_1
#attention-reduced-method: pool
#attention-reduced-q: True
encoder-embed-dim: 240
pds-stages: 3
#ctc-layer: 15
pds-layers: 4_5_6
pds-ratios: 2_2_2
pds-fusion: False
pds-fusion-method: all_conv
pds-embed-dims: 120_168_240
pds-ds-method: conv
pds-embed-norm: True
pds-position-embed: 1_1_1
pds-kernel-sizes: 3_3_3
pds-ffn-ratios: 4_4_4
pds-attn-heads: 4_4_4
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 0.0015
adam_betas: (0.9,0.98)
criterion: ctc
post-process: sentencepiece
dropout: 0.1
activation-fn: relu
encoder-layers: 15
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-activation-fn: swish
encoder-attention-type: rel_pos
#load-pretrained-encoder-from:
File mode changed from 100644 to 100755
@@ -12,7 +12,7 @@ criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
......
@@ -12,7 +12,7 @@ criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
......
@@ -12,7 +12,7 @@ zero_infinity: True
post-process: sentencepiece
subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
......
File mode changed from 100644 to 100755
@@ -12,7 +12,7 @@ criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
......
@@ -15,7 +15,7 @@ encoder-normalize-before: True
decoder-normalize-before: True
subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
......
@@ -15,7 +15,7 @@ encoder-normalize-before: True
decoder-normalize-before: True
subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
......
@@ -14,7 +14,7 @@ label_smoothing: 0.1
encoder-normalize-before: True
decoder-normalize-before: True
subsampling-type: conv1d
-subsmapling-layers: 2
+subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
......
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
@@ -2,21 +2,21 @@
# training the model
-gpu_num=8
-update_freq=1
+gpu_num=4
+update_freq=4
max_tokens=8192
-exp_tag=baseline
-config_list=(base)
-# exp full name
-exp_name=
extra_tag=
extra_parameter=
#extra_tag="${extra_tag}"
#extra_parameter="${extra_parameter} "
+exp_tag=baseline
+config_list=(deep)
+# exp full name
+exp_name=
train_config=$(echo ${config_list[*]} | sed 's/ /,/g')
cmd="./run.sh
......
File mode changed from 100644 to 100755
#! /bin/bash
-# Processing WMT16 En-De Datasets
+# Processing WMT20 En-Zh Datasets
# Copyright 2021 Natural Language Processing Laboratory
# Xu Chen (xuchenneu@163.com)
@@ -35,18 +35,19 @@ lang=${src_lang}-${tgt_lang}
dataset=wmt20
task=translation
-vocab_type=unigram
-vocab_size=32000
+src_vocab_type=unigram
+tgt_vocab_type=unigram
+src_vocab_size=32000
+tgt_vocab_size=32000
share_dict=0
lcrm=1
tokenizer=1
-use_specific_dict=0
-subword=0
-specific_prefix=subword32000_share
-specific_dir=${root_dir}/data/mustc/st
-src_vocab_prefix=spm_unigram10000_st_share
-tgt_vocab_prefix=spm_unigram10000_st_share
+use_specific_dict=1
+specific_prefix=asr5k_st10k
+specific_dir=${root_dir}/data/iwslt2022/st_lcrm_asr
+src_vocab_prefix=spm_unigram5000_asr
+tgt_vocab_prefix=spm_unigram10000_st
org_data_dir=${root_dir}/data/${dataset}
data_dir=${root_dir}/data/${dataset}/mt
@@ -81,17 +82,24 @@ len_penalty=1.0
if [[ ${use_specific_dict} -eq 1 ]]; then
exp_prefix=${exp_prefix}_${specific_prefix}
data_dir=${data_dir}/${specific_prefix}
+mkdir -p ${data_dir}
else
-if [[ "${vocab_type}" == "char" ]]; then
-vocab_name=${vocab_type}
-exp_prefix=${exp_prefix}_${vocab_type}
+if [[ "${tgt_vocab_type}" == "char" ]]; then
+vocab_name=char
+exp_prefix=${exp_prefix}_char
else
-vocab_name=${vocab_type}${vocab_size}
+if [[ ${src_vocab_size} -ne ${tgt_vocab_size} || "${src_vocab_type}" != "${tgt_vocab_type}" ]]; then
+src_vocab_name=${src_vocab_type}${src_vocab_size}
+tgt_vocab_name=${tgt_vocab_type}${tgt_vocab_size}
+vocab_name=${src_vocab_name}_${tgt_vocab_name}
+else
+vocab_name=${tgt_vocab_type}${tgt_vocab_size}
+src_vocab_name=${vocab_name}
+tgt_vocab_name=${vocab_name}
+fi
fi
data_dir=${data_dir}/${vocab_name}
-src_vocab_prefix=spm_${vocab_name}_${src_lang}
-tgt_vocab_prefix=spm_${vocab_name}_${tgt_lang}
+src_vocab_prefix=spm_${src_vocab_name}_${src_lang}
+tgt_vocab_prefix=spm_${tgt_vocab_name}_${tgt_lang}
if [[ $share_dict -eq 1 ]]; then
data_dir=${data_dir}_share
src_vocab_prefix=spm_${vocab_name}_share
@@ -103,6 +111,9 @@ if [[ ${lcrm} -eq 1 ]]; then
exp_prefix=${exp_prefix}_lcrm
fi
if [[ ${tokenizer} -eq 1 ]]; then
+train_subset=${train_subset}.tok
+valid_subset=${valid_subset}.tok
+trans_subset=${trans_subset}.tok
data_dir=${data_dir}_tok
exp_prefix=${exp_prefix}_tok
fi
@@ -139,16 +150,14 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--splits ${train_subset},${valid_subset},${trans_subset}
--src-lang ${src_lang}
--tgt-lang ${tgt_lang}
---vocab-type ${vocab_type}
---vocab-size ${vocab_size}"
+--src-vocab-type ${src_vocab_type}
+--tgt-vocab-type ${tgt_vocab_type}
+--src-vocab-size ${src_vocab_size}
+--tgt-vocab-size ${tgt_vocab_size}"
if [[ $share_dict -eq 1 ]]; then
cmd="$cmd
--share"
fi
-if [[ ${tokenizer} -eq 1 ]]; then
-cmd="$cmd
---tokenizer"
-fi
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
@@ -168,10 +177,6 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
fi
src_text=${text_dir}/${split}.${src_lang}
tgt_text=${text_dir}/${split}.${tgt_lang}
-if [[ ${tokenizer} -eq 1 ]]; then
-src_text=${text_dir}/${split}.tok.${src_lang}
-tgt_text=${text_dir}/${split}.tok.${tgt_lang}
-fi
cmd="cat ${src_text}"
if [[ ${lcrm} -eq 1 ]]; then
cmd="python local/lower_rm.py ${src_text}"
@@ -327,16 +332,14 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
export CUDA_VISIBLE_DEVICES=${device}
log=${model_dir}/train.log
cmd="nohup ${cmd} >> ${log} 2>&1 &"
if [[ $eval -eq 1 ]]; then
eval $cmd
sleep 2s
tail -n "$(wc -l ${log} | awk '{print $1+1}')" -f ${log}
fi
-wait
-echo -e " >> finish training \n"
fi
wait
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "stage 2: MT Decoding"
@@ -381,15 +384,8 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--results-path ${model_dir}
--max-tokens ${max_tokens}
--beam ${beam_size}
---lenpen ${len_penalty}"
-if [[ ${subword} -eq 1 ]]; then
-cmd="${cmd}
---post-process subword_nmt"
-else
-cmd="${cmd}
+--lenpen ${len_penalty}
--post-process sentencepiece"
-fi
if [[ ${sacrebleu} -eq 1 ]]; then
cmd="${cmd}
......
@@ -2,8 +2,8 @@
# training the model
-gpu_num=8
-update_freq=2
+gpu_num=4
+update_freq=4
max_tokens=8192
exp_tag=baseline
......
@@ -620,9 +620,9 @@ class S2TTransformerEncoder(FairseqEncoder):
self.add_to_dict(src_tokens.transpose(0, 1), dis, cos_sim_idx)
# down-sampling
-x, input_lengths = self.subsample(src_tokens, src_lengths)
# (B, T, D) -> (T, B, D)
-x = x.transpose(0, 1)
+x = src_tokens.transpose(0, 1)
+x, input_lengths = self.subsample(x, src_lengths)
# embedding scaling
x = self.embed_scale * x
......
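Reviewer note: the encoder change above standardizes the subsampling interface on time-first tensors. The encoder now transposes to (T, B, D) *before* calling `subsample`, and (per the `Conv1dSubsampling` changes further down) the module returns (T, B, D) as well, so the old post-call transpose disappears. A minimal shape-flow sketch with invented dimensions and a stand-in conv stack rather than the repo's `Conv1dSubsampling` class:

```python
import torch
import torch.nn as nn

B, T, D, D_out = 4, 100, 80, 256   # invented sizes, not taken from the configs
conv = nn.Sequential(              # stand-in for two stride-2 conv1d layers
    nn.Conv1d(D, D_out, kernel_size=5, stride=2, padding=2),
    nn.Conv1d(D_out, D_out, kernel_size=5, stride=2, padding=2),
)

src_tokens = torch.randn(B, T, D)  # features arrive batch-first
x = src_tokens.transpose(0, 1)     # (B, T, D) -> (T, B, D), as in the new code
x = x.permute(1, 2, 0)             # (T, B, D) -> (B, D, T) inside the module
x = conv(x)
x = x.permute(2, 0, 1)             # (B, D, T) -> (T, B, D) back to the encoder
print(x.shape)                     # torch.Size([25, 4, 256])
```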
@@ -205,7 +205,7 @@ class LegacyRelPositionMultiHeadedAttention(RelPositionMultiHeadedAttention):
Args:
n_head (int): The number of heads.
n_feat (int): The number of features.
-dropout_rate (float): Dropout rate.
+dropout (float): Dropout rate.
zero_triu (bool): Whether to zero the upper triangular part of attention matrix.
"""
def __init__(self, n_feat, n_head, dropout, zero_triu=False):
......
@@ -104,6 +104,7 @@ class PDSTransformerEncoderLayer(nn.Module):
self.final_norm = LayerNorm(expand_embed_dim)
# Convolution Residual
+self.conv_stride = conv_stride
self.conv_res = nn.Sequential(
Permute3D(1, 2, 0),
nn.Conv1d(embed_dim, expand_embed_dim, kernel_size=1, stride=conv_stride),
@@ -322,7 +323,7 @@ class PDSTransformerEncoderLayer(nn.Module):
x = self.conv_module(x)
x = x.transpose(0, 1)
-x = residual + x
+x = self.conv_res(residual) + x
if not self.normalize_before:
x = self.conv_norm(x)
......
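Reviewer note: the `conv_res` branch fixes the residual connection around the convolution module when a stage downsamples (`conv_stride > 1`) or widens (`embed_dim != expand_embed_dim`): a plain `residual + x` would no longer match shapes, so the residual is first passed through a strided pointwise convolution. A sketch of the idea with invented dimensions; the repo's `Permute3D` helper is replaced here by explicit `permute` calls:

```python
import torch
import torch.nn as nn

T, B = 50, 4
embed_dim, expand_embed_dim, conv_stride = 120, 168, 2   # invented stage sizes

# Pointwise conv with the same stride as the main conv branch, so the
# residual is downsampled and widened to match.
proj = nn.Conv1d(embed_dim, expand_embed_dim, kernel_size=1, stride=conv_stride)

residual = torch.randn(T, B, embed_dim)
main_out = torch.randn((T - 1) // conv_stride + 1, B, expand_embed_dim)

r = proj(residual.permute(1, 2, 0))   # (T, B, C) -> (B, C, T) -> (B, C', T')
r = r.permute(2, 0, 1)                # (B, C', T') -> (T', B, C')
x = r + main_out                      # shapes now agree
print(x.shape)                        # torch.Size([25, 4, 168])
```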
@@ -144,8 +144,8 @@ class Conv1dSubsampling(nn.Module):
def forward(self, x, x_len):
-# (B, T, D) -> (B, D, T)
-x = x.transpose(1, 2)
+# (T, B, D) -> (B, D, T)
+x = x.permute(1, 2, 0)
# Layers
for layer in self.layers:
x = layer(x)
@@ -153,7 +153,9 @@ class Conv1dSubsampling(nn.Module):
# Update Sequence Lengths
if x_len is not None:
x_len = torch.div(x_len - 1, 2, rounding_mode='floor') + 1
-x = x.transpose(1, 2)
+# (B, D, T) -> (T, B, D)
+x = x.permute(2, 0, 1)
return x, x_len
@@ -168,8 +170,8 @@ class Conv2dSubsampling(nn.Module):
act: activation function
Shape:
-Input: (batch_size, in_length, in_dim)
-Output: (batch_size, out_length, out_dim)
+Input: (in_length, batch_size, in_dim)
+Output: (out_length, batch_size, out_dim)
"""
@@ -199,8 +201,8 @@ class Conv2dSubsampling(nn.Module):
def forward(self, x, x_len):
-# (B, T, D) -> (B, D, T) -> (B, 1, D, T)
-x = x.transpose(1, 2).unsqueeze(dim=1)
+# (T, B, D) -> (B, D, T) -> (B, 1, D, T)
+x = x.permute(1, 2, 0).unsqueeze(dim=1)
# Layers
for layer in self.layers:
@@ -212,17 +214,17 @@ class Conv2dSubsampling(nn.Module):
# (B, C, D // S, T // S) -> (B, C * D // S, T // S)
batch_size, channels, subsampled_dim, subsampled_length = x.size()
-x = x.reshape(batch_size, channels * subsampled_dim, subsampled_length).transpose(1, 2)
+x = x.reshape(batch_size, channels * subsampled_dim, subsampled_length).permute(2, 0, 1)
x = self.linear(x)
return x, x_len
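Reviewer note: after the 2D convolutions the tensor is flattened and, with this change, emitted time-first as well. A small check of the reshape/permute step with invented post-conv sizes:

```python
import torch

B, C, D_sub, T_sub = 4, 176, 20, 25   # invented post-conv dimensions
x = torch.randn(B, C, D_sub, T_sub)

# (B, C, D // S, T // S) -> (B, C * D // S, T // S) -> (T // S, B, C * D // S)
x = x.reshape(B, C * D_sub, T_sub).permute(2, 0, 1)
print(x.shape)                        # torch.Size([25, 4, 3520])
```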
-def subsampling(args):
+def subsampling(args, out_dim=None):
subsampling_type = getattr(args, "subsampling_type", "conv1d")
layers = getattr(args, "subsampling_layers", 2)
in_dim = args.input_feat_per_channel * args.input_channels
-filters = [getattr(args, "subsampling_filter")] + [args.encoder_embed_dim]
+filters = [getattr(args, "subsampling_filter")] + [args.encoder_embed_dim if out_dim is None else out_dim]
kernel_size = getattr(args, "subsampling_kernel", 5)
stride = getattr(args, "subsampling_stride", 2)
norm = getattr(args, "subsampling_norm", "none")
......
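Reviewer note: the new optional `out_dim` lets a caller (e.g. a PDS encoder whose first stage is narrower than `encoder_embed_dim`) override the output width of the subsampling stack; with `out_dim=None` the behaviour is unchanged. A hedged usage sketch; the `Namespace` fields mirror the `subsampling-*` yaml keys above, and the calls are commented out because `subsampling` lives in the repo:

```python
from argparse import Namespace

args = Namespace(
    subsampling_type="conv2d",
    subsampling_layers=1,
    subsampling_filter=120,
    subsampling_kernel=3,
    subsampling_stride=2,
    subsampling_norm="batch2d",
    subsampling_activation="swish",
    input_feat_per_channel=80,
    input_channels=1,
    encoder_embed_dim=256,
)

# sub = subsampling(args)               # projects to encoder_embed_dim (256)
# sub = subsampling(args, out_dim=120)  # projects to the first PDS stage width
```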