Commit e7422a42 by xuchen

add yaml files for reproduction

parent cf64b587
......@@ -37,4 +37,4 @@ attention-dropout: 0.1
activation-dropout: 0.1
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
#load-pretrained-decoder-from:
inter-ctc-weight: 0.2
inter-ctc-layers: 6,9
share-inter-ctc: True
ctc-pae: none
# ctc-pae: inter_league
......
encoder-attention-type: local
hard-mask-window: 0
gauss-mask-sigma: 3
init-mask-weight: 0
\ No newline at end of file
......@@ -3,10 +3,11 @@ inter-mixup-layer: -1
inter-mixup-decoder-layer: 0
inter-mixup-prob: 1.0
inter-mixup-ratio: 1.0
inter-mixup-beta: 0.5
inter-mixup-beta: 0.2
inter-mixup-keep-org: False
inter-mixup-decoder-emb: False
ctc-mixup-consistent-weight: 0
inter-ctc-mixup-consistent-weight: 0
mixup-consistent-weight: 0
cal-mixup-loss: True
no-specaugment: False
......
arch: pdss2t_transformer_s_8
pds-fusion: True
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
activation-fn: relu
encoder-ffn-embed-dim: 2048
encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
......@@ -40,4 +40,4 @@ decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
......@@ -40,4 +40,4 @@ decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
encoder-attention-type: rel_selfattn
encoder-attention-type: rel_pos
# encoder-attention-type: relative
# max-encoder-relative-length: 100
\ No newline at end of file
# decoder-attention-type: relative
# max-encoder-relative-length: 100
# max-decoder-relative-length: 20
\ No newline at end of file
#!/usr/bin/env bash
gpu_num=0
gpu_num=1
data_dir=
data_tag=
test_subset=(dev tst-COMMON)
exp_name=
......@@ -14,8 +14,11 @@ cer=0
ctc_infer=0
n_average=10
beam_size=5
infer_ctc_weight=0.1
len_penalty=1.0
max_tokens=50000
batch_size=1
infer_debug=0
dec_model=checkpoint_best.pt
cmd="./run.sh
......@@ -28,12 +31,16 @@ cmd="./run.sh
--ctc_infer ${ctc_infer}
--beam_size ${beam_size}
--len_penalty ${len_penalty}
--batch_size ${batch_size}
--max_tokens ${max_tokens}
--dec_model ${dec_model}
--ctc_infer ${ctc_infer}
--infer_ctc_weight ${infer_ctc_weight}
--infer_debug ${infer_debug}
"
if [[ -n ${data_dir} ]]; then
cmd="$cmd --data_dir ${data_dir}"
if [[ -n ${data_tag} ]]; then
cmd="$cmd --data_tag ${data_tag}"
fi
if [[ ${#test_subset[@]} -ne 0 ]]; then
subsets=$(echo ${test_subset[*]} | sed 's/ /,/g')
......
......@@ -12,6 +12,7 @@ wer_standardize = tr.Compose(
tr.SubstituteRegexes({r"<<unk>>": r"@"}),
tr.ToLowerCase(),
tr.RemovePunctuation(),
tr.Strip(),
tr.ExpandCommonEnglishContractions(),
tr.RemoveKaldiNonWords(),
tr.RemoveWhiteSpace(replace_by_space=True),
......
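The hunk above adds tr.Strip() to the WER text-standardization chain. These transformations correspond to the jiwer normalization API; a minimal scoring sketch under that assumption (not code from this commit) is:

import jiwer as tr

wer_standardize = tr.Compose([
    tr.SubstituteRegexes({r"<<unk>>": r"@"}),
    tr.ToLowerCase(),
    tr.RemovePunctuation(),
    tr.Strip(),
    tr.ExpandCommonEnglishContractions(),
    tr.RemoveKaldiNonWords(),
    tr.RemoveWhiteSpace(replace_by_space=True),
    tr.ReduceToListOfListOfWords(),
])

# apply the same normalization to both sides before computing WER
score = tr.wer(
    "the reference transcript",
    "the hypothesis transcript",
    truth_transform=wer_standardize,
    hypothesis_transform=wer_standardize,
)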
......@@ -78,10 +78,12 @@ data_config=config.yaml
# Decoding Settings
cer=0
ctc_infer=0
ctc_self_ensemble=0
ctc_inter_logit=0
batch_size=0
dec_model=checkpoint_best.pt
ctc_infer=0
infer_ctc_weight=0
n_average=10
beam_size=5
len_penalty=1.0
......@@ -113,13 +115,6 @@ if [[ "${vocab_type}" == "char" ]]; then
data_dir=${data_dir}_char
exp_prefix=${exp_prefix}_char
fi
if [[ ! -d /mnt/bd/data-model && -d /mnt/bd/data-model2 ]]; then
sudo ln -s /mnt/bd/data-model2/ /mnt/bd/data-model
fi
if [[ ! -d ${data_dir} ]]; then
echo "No feature dir ${data_dir}"
exit
fi
export PATH=$PATH:${code_dir}/scripts
. ./local/parse_options.sh || exit 1;
......@@ -300,8 +295,8 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
mv tmp.log $log
log=${model_dir}/train.log
cmd="${cmd} 2>&1 | tee -a ${log}"
#cmd="nohup ${cmd} >> ${log} 2>&1 &"
# cmd="${cmd} 2>&1 | tee -a ${log}"
cmd="${cmd} >> ${log} 2>&1 "
if [[ $eval -eq 1 ]]; then
# tensorboard
port=6666
......@@ -343,34 +338,63 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
fi
suffix=beam${beam_size}_alpha${len_penalty}_tokens${max_tokens}
if [[ ${n_average} -ne 1 ]]; then
suffix=${suffix}_${n_average}
fi
suffix=alpha${len_penalty}
model_str=`echo $dec_model | sed -e "s#checkpoint##" | sed "s#.pt##"`
suffix=${suffix}_${model_str}
if [[ -n ${cer} && ${cer} -eq 1 ]]; then
suffix=${suffix}_cer
else
suffix=${suffix}_wer
fi
suffix=${suffix}_beam${beam_size}
if [[ ${batch_size} -ne 0 ]]; then
suffix=${suffix}_batch${batch_size}
else
suffix=${suffix}_tokens${max_tokens}
fi
if [[ ${ctc_infer} -eq 1 ]]; then
suffix=${suffix}_ctc
fi
if [[ ${ctc_self_ensemble} -eq 1 ]]; then
suffix=${suffix}_ensemble
fi
if [[ ${ctc_inter_logit} -ne 0 ]]; then
suffix=${suffix}_logit${ctc_inter_logit}
fi
if (( $(echo "${infer_ctc_weight} > 0" | bc -l) )); then
suffix=${suffix}_ctc${infer_ctc_weight}
fi
if [[ ${infer_score} -eq 1 ]]; then
suffix=${suffix}_score
fi
suffix=`echo $suffix | sed -e "s#__#_#"`
result_file=${model_dir}/decode_result_${suffix}
[[ -f ${result_file} ]] && rm ${result_file}
test_subset=${test_subset//,/ }
for subset in ${test_subset[@]}; do
subset=${subset}
cmd="python3 ${code_dir}/fairseq_cli/generate.py
subset=${subset}
if [[ ${infer_debug} -ne 0 ]]; then
cmd="python3 -m debugpy --listen 0.0.0.0:5678 --wait-for-client"
else
cmd="python3 "
fi
cmd="$cmd ${code_dir}/fairseq_cli/generate.py
${data_dir}
--config-yaml ${data_config}
--gen-subset ${subset}
--task speech_to_text
--path ${model_dir}/${dec_model}
--results-path ${model_dir}
--batch-size ${batch_size}
--max-tokens ${max_tokens}
--beam ${beam_size}
--skip-invalid-size-inputs-valid-test
--infer-ctc-weight ${infer_ctc_weight}
--lenpen ${len_penalty}
--lenpen ${len_penalty}
--batch-size 1
--scoring wer
--wer-tokenizer 13a
--wer-lowercase
......@@ -385,6 +409,14 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
cmd="${cmd}
--ctc-infer"
fi
if [[ ${ctc_self_ensemble} -eq 1 ]]; then
cmd="${cmd}
--ctc-self-ensemble"
fi
if [[ ${ctc_inter_logit} -ne 0 ]]; then
cmd="${cmd}
--ctc-inter-logit ${ctc_inter_logit}"
fi
if [[ ${infer_score} -eq 1 ]]; then
cmd="${cmd}
--score-reference"
......
......@@ -17,8 +17,8 @@ eval-bleu-print-samples: True
best_checkpoint_metric: bleu
maximize_best_checkpoint_metric: True
no-epoch-checkpoints: True
#keep-last-epochs: 10
# no-epoch-checkpoints: True
keep-last-epochs: 1
keep-best-checkpoints: 10
num-workers: 8
......
......@@ -3,20 +3,6 @@ criterion: label_smoothed_cross_entropy_with_ctc
# ctc-layer: 6
ctc-weight: 0.3
interleaved-ctc-weight: 0.2
interleaved-ctc-layers: 4
sae-ctc-temperature: 1
interleaved-ctc-drop-prob: 0
ctc-upsampling-ratio: 3
ctc-out-downsampling: False
ctc-out-downsampling-method: maxpooling
share-interleaved-ctc: True
sae-adapter: inter_league
sae-drop-prob: 0.0
# sae-distribution-cutoff: 10
#share-ctc-and-sae: True
#sae-ground-truth-ratio: 0.3
#ctc-self-distill-weight: 1
ctc-out-downsampling-method: maxpooling
\ No newline at end of file
arch: transformer_ctc
share-all-embeddings: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 8000
lr: 1e-3
adam_betas: (0.9,0.997)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
dropout: 0.1
attention-dropout: 0.1
activation-dropout: 0.1
activation-fn: relu
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-dim: 64
encoder-ffn-embed-dim: 64
encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 64
decoder-ffn-embed-dim: 64
decoder-attention-heads: 4
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
#ctc-layer:
#ctc-weight: 0.2
interleaved-ctc-weight: 0.3
interleaved-ctc-layers: 6,9
sae-ctc-temperature: 1.0
interleaved-ctc-drop-prob: 0
interleaved-ctc-upsampling-ratio: 3
sae-adapter: league
sae-drop-prob: 0.0
#sae-distribution-cutoff: 10
share-ctc-and-sae: True
sae-ground-truth-ratio: 0.3
ctc-self-distill-weight: 0
\ No newline at end of file
......@@ -9,7 +9,7 @@ share_dict=1
lcrm=0
tokenizer=0
data_dir=
data_tag=
test_subset=(valid test)
exp_name=
......@@ -22,6 +22,8 @@ n_average=10
beam_size=5
len_penalty=1.0
max_tokens=50000
batch_size=1
infer_debug=0
dec_model=checkpoint_best.pt
cmd="./run.sh
......@@ -38,16 +40,18 @@ cmd="./run.sh
--n_average ${n_average}
--beam_size ${beam_size}
--len_penalty ${len_penalty}
--batch_size ${batch_size}
--max_tokens ${max_tokens}
--dec_model ${dec_model}
--infer_debug ${infer_debug}
"
if [[ -n ${data_dir} ]]; then
cmd="$cmd --data_dir ${data_dir}"
if [[ -n ${data_tag} ]]; then
cmd="$cmd --data_tag ${data_tag}"
fi
if [[ -n ${test_subset} ]]; then
test_subset=`echo ${test_subset[*]} | sed 's/ /,/g'`
cmd="$cmd --test_subset ${test_subset}"
if [[ ${#test_subset[@]} -ne 0 ]]; then
subsets=$(echo ${test_subset[*]} | sed 's/ /,/g')
cmd="$cmd --test_subset ${subsets}"
fi
echo $cmd
......
import sys
import string

# Lowercase each input line, strip all punctuation, and delete whitespace
# entirely (e.g., to prepare text for character-level scoring).
in_file = sys.argv[1]

with open(in_file, "r", encoding="utf-8") as f:
    for line in f.readlines():
        line = line.strip().lower()
        for w in string.punctuation:
            line = line.replace(w, "")
        line = line.replace(" ", "")
        print(line)
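A usage sketch for this helper (assumed file names; the script reads the path in sys.argv[1] and writes the normalized text to stdout):

python3 local/lower_rm.py train.en > train.lc.rm.en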
#!/usr/bin/env bash
gpu_num=4
cmd="sh train.sh"
......
......@@ -14,7 +14,7 @@ get_devices(){
do
line=$((dev + 2))
use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
if [[ $use -lt 100 ]]; then
if [[ $use -lt 1000 ]]; then
device[$count]=$dev
count=$((count + 1))
if [[ $count -eq $gpu_num ]]; then
......
#./run.sh --stage 0 --stop_stage 0 --tgt_lang fr
./run.sh --stage 0 --stop_stage 0 --tgt_lang es
./run.sh --stage 0 --stop_stage 0 --tgt_lang de-v2
#./run.sh --stage 0 --stop_stage 0 --tgt_lang ja
......@@ -84,6 +84,7 @@ dec_model=checkpoint_best.pt
n_average=10
beam_size=5
len_penalty=1.0
infer_debug=0
infer_score=0
# infer_parameters="--cal-monotonic-cross-attn-weights --cal-localness --localness-window 0.1 --cal-topk-cross-attn-weights --topk-cross-attn-weights 15 --cal-entropy"
......@@ -336,14 +337,14 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo -e "\033[34mRun command: \n${cmd} \033[0m"
# save info
log=./history.log
log=${ckpt_dir}/history.log
echo "${time} | ${data_dir} | ${exp_name} | ${model_dir} " >> $log
tail -n 50 ${log} > tmp.log
mv tmp.log $log
log=${model_dir}/train.log
cmd="${cmd} 2>&1 | tee -a ${log}"
#cmd="nohup ${cmd} >> ${log} 2>&1 &"
# cmd="${cmd} 2>&1 | tee -a ${log}"
cmd="${cmd} >> ${log} 2>&1 "
if [[ $eval -eq 1 ]]; then
# tensorboard
port=6666
......@@ -384,24 +385,37 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
export CUDA_VISIBLE_DEVICES=${device}
fi
suffix=beam${beam_size}_alpha${len_penalty}_tokens${max_tokens}
if [[ ${n_average} -ne 1 ]]; then
suffix=${suffix}_${n_average}
fi
suffix=alpha${len_penalty}
model_str=`echo $dec_model | sed -e "s#checkpoint##" | sed "s#.pt##"`
suffix=${suffix}_${model_str}
if [[ ${sacrebleu} -eq 1 ]]; then
suffix=${suffix}_sacrebleu
else
suffix=${suffix}_multibleu
fi
suffix=${suffix}_beam${beam_size}
if [[ ${batch_size} -ne 0 ]]; then
suffix=${suffix}_batch${batch_size}
else
suffix=${suffix}_tokens${max_tokens}
fi
if [[ ${infer_score} -eq 1 ]]; then
suffix=${suffix}_score
fi
suffix=`echo $suffix | sed -e "s#__#_#"`
result_file=${model_dir}/decode_result_${suffix}
[[ -f ${result_file} ]] && rm ${result_file}
test_subset=${test_subset//,/ }
for subset in ${test_subset[@]}; do
cmd="python3 ${code_dir}/fairseq_cli/generate.py
subset=${subset}
if [[ ${infer_debug} -ne 0 ]]; then
cmd="python3 -m debugpy --listen 0.0.0.0:5678 --wait-for-client"
else
cmd="python3 "
fi
cmd="$cmd ${code_dir}/fairseq_cli/generate.py
${data_dir}
--source-lang ${src_lang}
--target-lang ${tgt_lang}
......@@ -409,9 +423,10 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--task ${task}
--path ${model_dir}/${dec_model}
--results-path ${model_dir}
--batch-size ${batch_size}
--max-tokens ${max_tokens}
--beam ${beam_size}
--batch-size 1
--skip-invalid-size-inputs-valid-test
--lenpen ${len_penalty}
--post-process sentencepiece"
......
......@@ -14,7 +14,6 @@ extra_parameter=
exp_tag=baseline
config_list=(small)
config_list=(small inter)
# exp full name
exp_name=
......@@ -23,7 +22,7 @@ train_config=$(echo ${config_list[*]} | sed 's/ /,/g')
cmd="./run.sh
--stage 1
--stop_stage 4
--stop_stage 2
--gpu_num ${gpu_num}
--update_freq ${update_freq}
--train_config ${train_config}
......
# Append-based Interpolation Augmentation
inter-mixup: True
inter-mixup-layer: -1
inter-mixup-decoder-layer: 0
inter-mixup-prob: 1.0
inter-mixup-ratio: 1.0
inter-mixup-beta: 0.2
inter-mixup-keep-org: True
inter-mixup-decoder-emb: True
ctc-mixup-consistent-weight: 0
inter-ctc-mixup-consistent-weight: 0
mixup-consistent-weight: 0
cal-mixup-loss: True
no-specaugment: False
layer-out-norm: False
inter-mixup-ratio-decay: False
inter-mixup-ratio-decay-params: 20000,40000,0
\ No newline at end of file
inter-mixup: True
inter-mixup-layer: -1
inter-mixup-decoder-layer: 0
inter-mixup-prob: 1.0
inter-mixup-ratio: 1.0
inter-mixup-beta: 0.2
inter-mixup-keep-org: False
inter-mixup-decoder-emb: False
ctc-mixup-consistent-weight: 0
inter-ctc-mixup-consistent-weight: 0
mixup-consistent-weight: 0
cal-mixup-loss: True
no-specaugment: False
layer-out-norm: False
inter-mixup-ratio-decay: False
inter-mixup-ratio-decay-params: 20000,40000,0
\ No newline at end of file
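The inter-mixup options above configure interpolation-based (mixup) augmentation. A minimal sketch of the idea they control, assuming a PyTorch batch (the helper name mixup_batch is hypothetical, not the repository's implementation):

import torch

def mixup_batch(x, beta=0.2, keep_org=False):
    # inter-mixup-beta: the mixing coefficient is drawn from Beta(beta, beta)
    lam = torch.distributions.Beta(beta, beta).sample()
    # pair every example with a random partner from the same batch
    perm = torch.randperm(x.size(0))
    mixed = lam * x + (1.0 - lam) * x[perm]
    # inter-mixup-keep-org: True appends the mixed examples to the originals
    # instead of replacing them
    return torch.cat([x, mixed], dim=0) if keep_org else mixed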
......@@ -26,4 +26,4 @@ no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
skip-invalid-size-inputs-valid-test: True
\ No newline at end of file
......@@ -48,7 +48,6 @@ pds-layers: 3_3_3_3
pds-ratios: 2_2_1_2
pds-fusion: False
pds-fusion-method: all_conv2
pds-dropout: 0
pds-fusion-layers: 0_1_1_1
pds-fusion-weight: 0.2_0.3_0.5
pds-embed-dims: 512_512_512_512
......
inter-ctc-weight: 0.2
inter-ctc-layers: 6,9
share-inter-ctc: True
ctc-pae: none
# ctc-pae: inter_league
......
encoder-attention-type: local
hard-mask-window: 0
gauss-mask-sigma: 3
init-mask-weight: 0
\ No newline at end of file
inter-mixup: True
inter-mixup-layer: -1
inter-mixup-decoder-layer: 0
inter-mixup-prob: 1.0
inter-mixup-ratio: 1.0
inter-mixup-beta: 0.2
inter-mixup-keep-org: False
inter-mixup-decoder-emb: False
ctc-mixup-consistent-weight: 0
inter-ctc-mixup-consistent-weight: 0
mixup-consistent-weight: 0
cal-mixup-loss: True
no-specaugment: False
layer-out-norm: False
......
arch: s2t_ctc
encoder-type: sate
criterion: ctc
zero_infinity: True
xctc-weight: 1.0
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
encoder-normalize-before: True
decoder-normalize-before: True
textual-encoder-embed-norm: True
textual-encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 0
text-encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
acoustic-encoder: transformer
adapter: none
\ No newline at end of file
# Append-based Interpolation Augmentation
inter-mixup: True
inter-mixup-layer: -1
inter-mixup-decoder-layer: 0
inter-mixup-prob: 1.0
inter-mixup-ratio: 1.0
inter-mixup-beta: 0.2
inter-mixup-keep-org: True
inter-mixup-decoder-emb: True
cal-mixup-loss: True
no-specaugment: False
layer-out-norm: False
inter-mixup-ratio-decay: False
inter-mixup-ratio-decay-params: 20000,40000,0
# MTL
ctc-weight: 0.3
inter-ctc-weight: 0.2
inter-ctc-layers: 6,9
share-inter-ctc: True
share-ctc-and-embed: True
ctc-pae: inter_league
pae-unnorm-input: True
ctc-mixup-consistent-weight: 0.15
inter-ctc-mixup-consistent-weight: 0.1
mixup-consistent-weight: 0.5
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
layer-padding-mask: True
\ No newline at end of file
# Append-based Interpolation Augmentation
inter-mixup: True
inter-mixup-layer: -1
inter-mixup-decoder-layer: 0
inter-mixup-prob: 1.0
inter-mixup-ratio: 1.0
inter-mixup-beta: 0.2
inter-mixup-keep-org: True
inter-mixup-decoder-emb: True
cal-mixup-loss: False
no-specaugment: False
layer-out-norm: False
inter-mixup-ratio-decay: False
inter-mixup-ratio-decay-params: 20000,40000,0
# MTL
ctc-weight: 0.3
share-ctc-and-embed: True
inter-ctc-weight: 0.2
inter-ctc-layers: 6,9
share-inter-ctc: True
ctc-pae: inter_league
pae-unnorm-input: True
ctc-mixup-consistent-weight: 0.15
inter-ctc-mixup-consistent-weight: 0.1
mixup-consistent-weight: 0.5
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
layer-padding-mask: True
\ No newline at end of file
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
textual-encoder-embed-norm: True
textual-encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
acoustic-encoder: transformer
adapter: inter_league
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
# Bilingual CTC
share-ctc-and-embed: True
share-xctc-and-embed: True
ctc-weight: 0.2
xctc-weight: 0.1
# InterCTC
inter-ctc-weight: 0.1
inter-ctc-layers: 6,9
share-inter-ctc: True
inter-xctc-weight: 0.05
inter-xctc-layers: 4
# Prediction-aware encoding
ctc-pae: inter_league
xctc-pae: inter_league
pae-unnorm-input: True
# Curriculum learning mixing
xctc-pae-ground-truth-ratio: 0.1
xctc-pae-ground-truth-only-mistake: True
pae-oracle-smooth: True
arch: s2t_transformer_m
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
encoder-embed-norm: True
encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 18
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
# Bilingual CTC
share-ctc-and-embed: True
share-xctc-and-embed: True
ctc-weight: 0.2
ctc-layer: 12
xctc-weight: 0.1
# InterCTC
inter-ctc-weight: 0.1
inter-ctc-layers: 6,9
share-inter-ctc: True
inter-xctc-weight: 0.05
inter-xctc-layers: 16
# Prediction-aware encoding
ctc-pae: inter_league
xctc-pae: inter_league
pae-unnorm-input: True
# Curriculum learning mixing
xctc-pae-ground-truth-ratio: 0.1
xctc-pae-ground-truth-only-mistake: True
pae-oracle-smooth: True
arch: s2t_transformer_m
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
encoder-embed-norm: True
encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 18
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
# Bilingual CTC
share-ctc-and-embed: True
share-xctc-and-embed: True
ctc-weight: 0.2
xctc-weight: 0.1
# InterCTC
inter-ctc-weight: 0.1
inter-ctc-layers: 6,9,12,15
share-inter-ctc: True
inter-xctc-weight: 0.05
inter-xctc-layers: 6,9,12,15
# Prediction-aware encoding
ctc-pae: inter_league
xctc-pae: inter_league
pae-unnorm-input: True
# Curriculum learning mixing
xctc-pae-ground-truth-ratio: 0.1
xctc-pae-ground-truth-only-mistake: True
pae-oracle-smooth: True
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 1e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
text-no-pos-emb: True
textual-encoder-embed-norm: False
textual-encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
acoustic-encoder: transformer
adapter: inter_league
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
# CTC & XCTC
share-ctc-and-embed: True
share-xctc-and-embed: True
ctc-weight: 0.2
xctc-weight: 0.2
# InterCTC
share-inter-ctc: True
inter-ctc-weight: 0.1
inter-ctc-layers: 6,9
inter-xctc-weight: 0.1
inter-xctc-layers: 4
# Prediction-aware encoding
ctc-pae: inter_league
xctc-pae: inter_league
# Cross-layer attn
xctc-cross-attn: True
cross-attn-start-layer: 3
cross-attn-layer: 2
cross-attn-collaboration-mode: serial
cross-attn-league-drop-net: True
cross-attn-league-drop-net-prob: 0.1
# Curriculum learning mixing
xctc-pae-ground-truth-ratio: 0.5
xctc-pae-ground-truth-only-mistake: True
pae-oracle-smooth: True
arch: s2t_ctc
encoder-type: sate
criterion: ctc
zero_infinity: True
xctc-weight: 1.0
ctc-weight: 1.0
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
lr: 1e-3
adam_betas: (0.9,0.98)
criterion: ctc
zero_infinity: True
ctc-weight: 1.0
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-norm: True
encoder-no-scale-embedding: True
textual-encoder-embed-norm: True
text-no-pos-emb: True
textual-encoder-embed-norm: False
textual-encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 1024
subsampling-filter: 2048
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
dropout: 0.15
activation-fn: relu
encoder-embed-dim: 256
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
encoder-attention-heads: 8
acoustic-encoder: transformer
adapter: inter_league
#adapter: none
#adapter-embed-norm: True
#adapter-out-norm: True
#share-adapter-and-ctc: True
#share-adapter-and-embed: True
# Conformer
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 15
encoder-attention-type: rel_pos
encoder-activation-fn: swish
# InterCTC
share-inter-ctc: True
inter-ctc-weight: 1.0
inter-ctc-layers: 6,9
inter-xctc-weight: 1.0
inter-xctc-layers: 6,9
# Prediction-aware encoding
ctc-pae: inter_league
xctc-pae: inter_league
# Cross-layer attn
xctc-cross-attn: True
cross-attn-start-layer: 4
cross-attn-layer: 3
cross-attn-collaboration-mode: serial
cross-attn-league-drop-net: True
cross-attn-league-drop-net-prob: 0.1
# Curriculum learning mixing
xctc-pae-ground-truth-ratio: 0.8
xctc-pae-ground-truth-only-mistake: True
pae-oracle-smooth: True
encoder-attention-type: rel_pos
#encoder-attention-type: rel_pos_legacy
#encoder-attention-type: rel_selfattn
#encoder-attention-type: relative
#decoder-attention-type: relative
#max-encoder-relative-length: 100
#max-decoder-relative-length: 20
# encoder-attention-type: relative
# decoder-attention-type: relative
# max-encoder-relative-length: 100
# max-decoder-relative-length: 20
\ No newline at end of file
arch: s2t_sate
encoder-embed-norm: True
encoder-no-scale-embedding: True
textual-encoder-embed-norm: False
textual-encoder-no-scale-embedding: True
text-no-pos-emb: True
encoder-normalize-before: True
decoder-normalize-before: True
text-encoder-layers: 0
acoustic-encoder: pds
adapter: none
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
# ctc-weight: 0.3
share-ctc-and-embed: True
share-xctc-and-embed: True
share-inter-ctc: True
xctc-weight: 0.3
# xctc-layer: 12
# inter-ctc-weight: 0.2
# inter-ctc-layers: 6,9
# inter-xctc-weight: 0.2
# inter-xctc-layers: 6,9
ctc-pae: none
# xctc-pae: none
xctc-cross-attn: False
cross-attn-start-layer: 7
cross-attn-layer: 6
cross-attn-collaboration-mode: parallel
cross-attn-league-s1-ratio: 0.5
cross-attn-league-s2-ratio: 0.5
cross-attn-league-out-norm: False
cross-attn-league-gated: False
cross-attn-league-drop-net: False
cross-attn-league-drop-net-prob: 0.2
cross-attn-league-drop-net-mix: False
# ctc-pae-ground-truth-ratio: 0.3
# xctc-pae-ground-truth-ratio: 0.3
# adapter-pae-ground-truth-ratio: 0.3
# pae-ctc-temperature: 1
# adapter-temperature: 1
#pae-gumbel: True
#pae-distribution-hard: True
#pae-drop-prob: 0.0
#pae-distribution-cutoff: 10
#share-pae-and-ctc: True
#share-pae-and-xctc: True
#pae-embed-norm: True
#pae-out-norm: True
#ctc-self-distill-weight: 1
#target-ctc-self-distill-weight: 1
#ctc-self-distill-prob: 0.1
#cal-all-ctc: True
use-additional-ctc-text: True
\ No newline at end of file
# ctc-weight: 0.3
share-ctc-and-embed: True
share-xctc-and-embed: True
share-inter-ctc: True
xctc-weight: 0.3
xctc-layer: 12
axctc-weight: 0.3
axctc-layer: 6
inter-ctc-weight: 0.2
# inter-ctc-layers: 6,9
inter-xctc-weight: 0.2
# inter-xctc-layers: 10
inter-axctc-weight: 0.2
# inter-axctc-layers: 4
ctc-pae: none
# xctc-pae: none
# axctc-pae: none
xctc-cross-attn: False
cross-attn-start-layer: 7
cross-attn-layer: 6
cross-attn-collaboration-mode: parallel
cross-attn-league-s1-ratio: 0.5
cross-attn-league-s2-ratio: 0.5
cross-attn-league-out-norm: False
cross-attn-league-gated: False
cross-attn-league-drop-net: False
cross-attn-league-drop-net-prob: 0.2
cross-attn-league-drop-net-mix: False
# ctc-pae-ground-truth-ratio: 0.3
# axctc-pae-ground-truth-ratio: 0.3
# xctc-pae-ground-truth-ratio: 0.3
# adapter-pae-ground-truth-ratio: 0.3
# pae-ctc-temperature: 1
# adapter-temperature: 1
#pae-gumbel: True
#pae-distribution-hard: True
#pae-drop-prob: 0.0
#pae-distribution-cutoff: 10
#share-pae-and-ctc: True
#share-pae-and-xctc: True
#pae-embed-norm: True
#pae-out-norm: True
#ctc-self-distill-weight: 1
#target-ctc-self-distill-weight: 1
#ctc-self-distill-prob: 0.1
#cal-all-ctc: True
use-additional-ctc-text: True
pds-fusion-method: none
\ No newline at end of file
xctc-weight: 0.3
share-xctc-and-embed: True
# inter-xctc-weight: 0.2
# inter-xctc-layers: 6,9
xctc-pae: none
\ No newline at end of file
share-xctc-and-embed: True
\ No newline at end of file
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
encoder-normalize-before: True
decoder-normalize-before: True
textual-encoder-embed-norm: True
textual-encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 1024
subsampling-kernel: 5
subsampling-stride: 2
subsampling-norm: none
subsampling-activation: glu
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 0
text-encoder-layers: 12
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
text-use-s2t-layer: False
acoustic-encoder: transformer
# adapter: inter_league
adapter: none
#adapter-embed-norm: True
#adapter-out-norm: True
#share-adapter-and-ctc: True
#share-adapter-and-embed: True
#load-pretrained-encoder-from:
#load-pretrained-acoustic-encoder-from:
#load-pretrained-text-encoder-from:
#load-pretrained-decoder-from:
inter-xctc-weight: 0.2
#inter-xctc-layers: 4
inter-xctc-layers: 6,9
xctc-pae: none
xctc-pae: none
# xctc-pae: inter_league
xctc-cross-attn: False
......
......@@ -11,13 +11,13 @@ if [ "$#" -eq 1 ]; then
fi
sacrebleu=1
ctc_infer=1
ctc_infer=0
n_average=10
beam_size=5
infer_ctc_weight=0.1
infer_ctc_weight=0
len_penalty=1.0
max_tokens=50000
batch_size=1
batch_size=0
infer_debug=0
dec_model=checkpoint_best.pt
......
dir=/xuchen/st/checkpoints/must_c/en-de/st/JointCTC/big
tag=JointCTC/big
for d in `ls $dir`; do
echo $d
./run.sh --stage 2 --max_tokens 10000 --batch_size 1 --ctc_infer 1 --infer_ctc_weight 0.1 --exp_name $tag/$d
./run.sh --stage 2 --max_tokens 10000 --batch_size 1 --ctc_infer 1 --infer_ctc_weight 0.2 --exp_name $tag/$d
./run.sh --stage 2 --max_tokens 10000 --batch_size 1 --ctc_infer 1 --infer_ctc_weight 0.3 --exp_name $tag/$d
./run.sh --stage 2 --max_tokens 10000 --batch_size 1 --ctc_infer 1 --infer_ctc_weight 0.4 --exp_name $tag/$d
./run.sh --stage 2 --max_tokens 10000 --batch_size 1 --ctc_infer 1 --infer_ctc_weight 0.5 --exp_name $tag/$d
done
\ No newline at end of file
./run.sh --stage 2 --tgt_lang fr --infer_ctc_weight 0.1 --batch_size 1 --exp_name big_conformer_ctc_inter_xctc_xinter_enc18_wsum0.45_unnormpae_oracle0.1
./run.sh --stage 2 --tgt_lang es --infer_ctc_weight 0.1 --batch_size 1 --exp_name big_conformer_ctc_inter_xctc_xinter_enc18_wsum0.45_unnormpae_oracle0.1
./run.sh --stage 2 --tgt_lang it --infer_ctc_weight 0.1 --batch_size 1 --exp_name big_conformer_ctc_inter_xctc_xinter_enc18_wsum0.45_unnormpae_oracle0.1
./run.sh --stage 2 --tgt_lang nl --infer_ctc_weight 0.1 --batch_size 1 --exp_name big_conformer_ctc_inter_xctc_xinter_enc18_wsum0.45_unnormpae_oracle0.1
./run.sh --stage 2 --tgt_lang pt --infer_ctc_weight 0.1 --batch_size 1 --exp_name big_conformer_ctc_inter_xctc_xinter_enc18_wsum0.45_unnormpae_oracle0.1
./run.sh --stage 2 --tgt_lang ro --infer_ctc_weight 0.1 --batch_size 1 --exp_name big_conformer_ctc_inter_xctc_xinter_enc18_wsum0.45_unnormpae_oracle0.1
./run.sh --stage 2 --tgt_lang ru --infer_ctc_weight 0.1 --batch_size 1 --exp_name big_conformer_ctc_inter_xctc_xinter_enc18_wsum0.45_unnormpae_oracle0.1
./run.sh --stage 0 --stop_stage 0 --tgt_lang es
./run.sh --stage 0 --stop_stage 0 --tgt_lang fr
./run.sh --stage 0 --stop_stage 0 --tgt_lang it
./run.sh --stage 0 --stop_stage 0 --tgt_lang nl
./run.sh --stage 0 --stop_stage 0 --tgt_lang pt
./run.sh --stage 0 --stop_stage 0 --tgt_lang ro
./run.sh --stage 0 --stop_stage 0 --tgt_lang ru
......@@ -14,23 +14,13 @@ extra_parameter=
exp_tag=
# Base
# config_list=(base dynamic ctc)
# config_list=(multibranch ctc)
#config_list=(base mixup ctc)
#config_list=(base conformer ctc)
# SATE
config_list=(sate ctc)
# config_list=(sate inter ctc)
#config_list=(sate conformer ctc)
# SAE
#config_list=(sate inter)
config_list=(sate conformer ctc)
# PDS
#config_list=(pds_base_8 ctc)
#config_list=(pds_base_8 conformer ctc)
#config_list=(sate_pds ctc)
# exp full name
exp_name=
......
......@@ -17,9 +17,9 @@ eval-bleu-print-samples: True
best_checkpoint_metric: bleu
maximize_best_checkpoint_metric: True
no-epoch-checkpoints: True
#keep-last-epochs: 10
keep-best-checkpoints: 5
# no-epoch-checkpoints: True
keep-last-epochs: 1
keep-best-checkpoints: 10
num-workers: 8
no-progress-bar: True
......
......@@ -3,20 +3,6 @@ criterion: label_smoothed_cross_entropy_with_ctc
# ctc-layer: 6
ctc-weight: 0.3
interleaved-ctc-weight: 0.2
interleaved-ctc-layers: 4
sae-ctc-temperature: 1.0
interleaved-ctc-drop-prob: 0
ctc-upsampling-ratio: 3
ctc-out-downsampling: False
ctc-out-downsampling-method: maxpooling
share-interleaved-ctc: True
sae-adapter: inter_league
sae-drop-prob: 0.0
#sae-distribution-cutoff: 10
# share-ctc-and-sae: True
# share-ctc-and-embed: True
ctc-self-distill-weight: 0
......@@ -15,6 +15,8 @@ n_average=5
beam_size=4
len_penalty=0.6
max_tokens=4000
batch_size=1
infer_debug=0
dec_model=checkpoint_best.pt
cmd="./run.sh
......@@ -26,16 +28,18 @@ cmd="./run.sh
--n_average ${n_average}
--beam_size ${beam_size}
--len_penalty ${len_penalty}
--batch_size ${batch_size}
--max_tokens ${max_tokens}
--dec_model ${dec_model}
--infer_debug ${infer_debug}
"
if [[ -n ${data_dir} ]]; then
cmd="$cmd --data_dir ${data_dir}"
fi
if [[ -n ${test_subset} ]]; then
test_subset=`echo ${test_subset[*]} | sed 's/ /,/g'`
cmd="$cmd --test_subset ${test_subset}"
if [[ ${#test_subset[@]} -ne 0 ]]; then
subsets=$(echo ${test_subset[*]} | sed 's/ /,/g')
cmd="$cmd --test_subset ${subsets}"
fi
echo $cmd
......
......@@ -22,7 +22,7 @@ train_config=$(echo ${config_list[*]} | sed 's/ /,/g')
cmd="./run.sh
--stage 1
--stop_stage 1
--stop_stage 2
--gpu_num ${gpu_num}
--update_freq ${update_freq}
--train_config ${train_config}
......
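# Prepare binarized MT test sets: Moses tokenization, optional lowercasing +
# punctuation removal (lcrm, via local/lower_rm.py), SentencePiece encoding,
# then fairseq_cli/preprocess.py binarization.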
set -e
eval=1
lcrm=0
src_lang=en
tgt_lang=zh
tokenize=1
splits=(tst-COMMON test11)
dataset=wmt20
root_dir=~/st/Fairseq-S2T
data_dir=/home/xuchen/st/data/$dataset/data
vocab_dir=/home/xuchen/st/data/$dataset/mt/unigram32000_tok
dest_dir=$vocab_dir
src_vocab_prefix=spm_unigram32000_en
tgt_vocab_prefix=spm_unigram32000_zh
for split in ${splits[@]}; do
src_file=${data_dir}/${split}/${split}.${src_lang}
tgt_file=${data_dir}/${split}/${split}.${tgt_lang}
if [[ ${tokenize} -eq 1 ]]; then
src_tok_file=${data_dir}/${split}.tok/${split}.tok.${src_lang}
tgt_tok_file=${data_dir}/${split}.tok/${split}.tok.${tgt_lang}
if [[ ! -f ${src_tok_file} ]]; then
cmd="tokenizer.perl -l ${src_lang} --threads 8 -no-escape < ${src_file} > ${src_tok_file}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
fi
if [[ ! -f ${tgt_tok_file} ]]; then
cmd="tokenizer.perl -l ${tgt_lang} --threads 8 -no-escape < ${tgt_file} > ${tgt_tok_file}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
fi
src_file=${src_tok_file}
tgt_file=${tgt_tok_file}
fi
cmd="cat ${src_file}"
if [[ ${lcrm} -eq 1 ]]; then
cmd="python local/lower_rm.py ${src_file}"
fi
cmd="${cmd}
| spm_encode --model ${vocab_dir}/${src_vocab_prefix}.model
--output_format=piece
> ${src_file}.spm"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
cmd="spm_encode
--model ${vocab_dir}/${tgt_vocab_prefix}.model
--output_format=piece
< ${tgt_file}
> ${tgt_file}.spm"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
src_file=${src_file}.spm
tgt_file=${tgt_file}.spm
mkdir -p ${dest_dir}/final
cmd="cp ${src_file} ${dest_dir}/final/${split}.${src_lang}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
cmd="cp ${tgt_file} ${dest_dir}/final/${split}.${tgt_lang}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
done
n_set=${#splits[*]}
for ((i=0;i<$n_set;i++)); do
dataset[$i]=${dest_dir}/final/${splits[$i]}
done
pref=`echo ${dataset[*]} | sed 's/ /,/g'`
cmd="python ${root_dir}/fairseq_cli/preprocess.py
--source-lang ${src_lang}
--target-lang ${tgt_lang}
--testpref ${pref}
--destdir ${dest_dir}/data-bin
--srcdict ${vocab_dir}/${src_vocab_prefix}.txt
--tgtdict ${vocab_dir}/${tgt_vocab_prefix}.txt
--workers 64"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
[[ $eval -eq 1 ]] && eval ${cmd}
arch: transformer
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 8000
lr: 2e-3
adam_betas: (0.9,0.997)
criterion: label_smoothed_cross_entropy
label_smoothing: 0.1
dropout: 0.1
attention-dropout: 0.1
activation-dropout: 0.1
activation-fn: relu
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: transformer
share-all-embeddings: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 4000
lr: 7e-4
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy
label_smoothing: 0.1
dropout: 0.1
attention-dropout: 0.1
activation-dropout: 0.1
activation-fn: relu
encoder-normalize-before: False
decoder-normalize-before: False
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
train-subset: train
valid-subset: valid
max-epoch: 20
max-update: 100000
patience: 5
best_checkpoint_metric: loss
maximize_best_checkpoint_metric: False
no-epoch-checkpoints: True
#keep-last-epochs: 10
keep-best-checkpoints: 5
num-workers: 8
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
max-source-positions: 512
arch: transformer_wmt_en_de_big_t2t
share-all-embeddings: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 8000
lr: 7e-4
adam_betas: (0.9,0.997)
criterion: label_smoothed_cross_entropy
label_smoothing: 0.1
dropout: 0.3
attention-dropout: 0.1
activation-dropout: 0.1
activation-fn: relu
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-dim: 1024
encoder-ffn-embed-dim: 4096
encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 16
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: transformer_wmt_en_de_big
share-all-embeddings: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 4000
lr: 5e-4
adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy
label_smoothing: 0.1
dropout: 0.3
attention-dropout: 0.1
activation-dropout: 0.1
activation-fn: relu
encoder-normalize-before: False
decoder-normalize-before: False
encoder-embed-dim: 1024
encoder-ffn-embed-dim: 4096
encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 16
decoder-embed-dim: 1024
decoder-ffn-embed-dim: 4096
decoder-attention-heads: 16
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
arch: transformer
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 16000
lr: 2e-3
adam_betas: (0.9,0.997)
criterion: label_smoothed_cross_entropy
label_smoothing: 0.1
dropout: 0.1
attention-dropout: 0.1
activation-dropout: 0.1
activation-fn: relu
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 30
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
\ No newline at end of file
use-enc-dlcl: True
use-dec-dlcl: True
#ctc-weight: 0.2
intermedia-ctc-weight: 0.3
intermedia-ctc-layers: 10,20
#target-ctc-weight: 0.3
#target-ctc-layer: 6
#target-intermedia-ctc-weight: 0.1
#target-intermedia-ctc-layers: 2,4
intermedia-adapter: league
#intermedia-drop-prob: 0.2
#intermedia-temperature: 5
post-process: sentencepiece
\ No newline at end of file
encoder-attention-type: relative
decoder-attention-type: relative
max-encoder-relative-length: 8
max-decoder-relative-length: 8
#!/usr/bin/env bash
gpu_num=1
data_dir=
test_subset=(test)
exp_name=
if [ "$#" -eq 1 ]; then
exp_name=$1
fi
sacrebleu=0
n_average=5
beam_size=4
len_penalty=0.6
max_tokens=80000
dec_model=checkpoint_best.pt
cmd="./run.sh
--stage 2
--stop_stage 2
--gpu_num ${gpu_num}
--exp_name ${exp_name}
--sacrebleu ${sacrebleu}
--n_average ${n_average}
--beam_size ${beam_size}
--len_penalty ${len_penalty}
--max_tokens ${max_tokens}
--dec_model ${dec_model}
"
if [[ -n ${data_dir} ]]; then
cmd="$cmd --data_dir ${data_dir}"
fi
if [[ -n ${test_subset} ]]; then
test_subset=`echo ${test_subset[*]} | sed 's/ /,/g'`
cmd="$cmd --test_subset ${test_subset}"
fi
echo $cmd
eval $cmd
import sys
import string

# Lowercase each input line and remove punctuation except apostrophes, then
# collapse the double spaces that punctuation removal can leave behind.
in_file = sys.argv[1]

with open(in_file, "r", encoding="utf-8") as f:
    for line in f.readlines():
        line = line.strip().lower()
        for w in string.punctuation:
            if w != "'":
                line = line.replace(w, "")
        line = line.replace("  ", " ")
        print(line)
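# Poll gpustat until ${gpu_num} GPUs each report less than 100 MiB of memory
# in use, then run the training command.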
gpu_num=4
cmd="sh train.sh"
while :
do
record=$(mktemp -t temp.record.XXXXXX)
gpustat > $record
all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
count=0
for dev in ${all_devices[@]}
do
line=$((dev + 2))
use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
if [[ $use -lt 100 ]]; then
device[$count]=$dev
count=$((count + 1))
if [[ $count -eq $gpu_num ]]; then
break
fi
fi
done
if [[ ${#device[@]} -lt $gpu_num ]]; then
sleep 60s
else
echo "Run $cmd"
eval $cmd
sleep 10s
exit
fi
done
#!/usr/bin/env perl
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
# $Id$
use warnings;
use strict;
my $lowercase = 0;
if ($ARGV[0] eq "-lc") {
$lowercase = 1;
shift;
}
my $stem = $ARGV[0];
if (!defined $stem) {
print STDERR "usage: multi-bleu.pl [-lc] reference < hypothesis\n";
print STDERR "Reads the references from reference or reference0, reference1, ...\n";
exit(1);
}
$stem .= ".ref" if !-e $stem && !-e $stem."0" && -e $stem.".ref0";
my @REF;
my $ref=0;
while(-e "$stem$ref") {
&add_to_ref("$stem$ref",\@REF);
$ref++;
}
&add_to_ref($stem,\@REF) if -e $stem;
die("ERROR: could not find reference file $stem") unless scalar @REF;
# add additional references explicitly specified on the command line
shift;
foreach my $stem (@ARGV) {
&add_to_ref($stem,\@REF) if -e $stem;
}
sub add_to_ref {
my ($file,$REF) = @_;
my $s=0;
if ($file =~ /.gz$/) {
open(REF,"gzip -dc $file|") or die "Can't read $file";
} else {
open(REF,$file) or die "Can't read $file";
}
while(<REF>) {
chop;
push @{$$REF[$s++]}, $_;
}
close(REF);
}
my(@CORRECT,@TOTAL,$length_translation,$length_reference);
my $s=0;
while(<STDIN>) {
chop;
$_ = lc if $lowercase;
my @WORD = split;
my %REF_NGRAM = ();
my $length_translation_this_sentence = scalar(@WORD);
my ($closest_diff,$closest_length) = (9999,9999);
foreach my $reference (@{$REF[$s]}) {
# print "$s $_ <=> $reference\n";
$reference = lc($reference) if $lowercase;
my @WORD = split(' ',$reference);
my $length = scalar(@WORD);
my $diff = abs($length_translation_this_sentence-$length);
if ($diff < $closest_diff) {
$closest_diff = $diff;
$closest_length = $length;
# print STDERR "$s: closest diff ".abs($length_translation_this_sentence-$length)." = abs($length_translation_this_sentence-$length), setting len: $closest_length\n";
} elsif ($diff == $closest_diff) {
$closest_length = $length if $length < $closest_length;
# from two references with the same closeness to me
# take the *shorter* into account, not the "first" one.
}
for(my $n=1;$n<=4;$n++) {
my %REF_NGRAM_N = ();
for(my $start=0;$start<=$#WORD-($n-1);$start++) {
my $ngram = "$n";
for(my $w=0;$w<$n;$w++) {
$ngram .= " ".$WORD[$start+$w];
}
$REF_NGRAM_N{$ngram}++;
}
foreach my $ngram (keys %REF_NGRAM_N) {
if (!defined($REF_NGRAM{$ngram}) ||
$REF_NGRAM{$ngram} < $REF_NGRAM_N{$ngram}) {
$REF_NGRAM{$ngram} = $REF_NGRAM_N{$ngram};
# print "$i: REF_NGRAM{$ngram} = $REF_NGRAM{$ngram}<BR>\n";
}
}
}
}
$length_translation += $length_translation_this_sentence;
$length_reference += $closest_length;
for(my $n=1;$n<=4;$n++) {
my %T_NGRAM = ();
for(my $start=0;$start<=$#WORD-($n-1);$start++) {
my $ngram = "$n";
for(my $w=0;$w<$n;$w++) {
$ngram .= " ".$WORD[$start+$w];
}
$T_NGRAM{$ngram}++;
}
foreach my $ngram (keys %T_NGRAM) {
$ngram =~ /^(\d+) /;
my $n = $1;
# my $corr = 0;
# print "$i e $ngram $T_NGRAM{$ngram}<BR>\n";
$TOTAL[$n] += $T_NGRAM{$ngram};
if (defined($REF_NGRAM{$ngram})) {
if ($REF_NGRAM{$ngram} >= $T_NGRAM{$ngram}) {
$CORRECT[$n] += $T_NGRAM{$ngram};
# $corr = $T_NGRAM{$ngram};
# print "$i e correct1 $T_NGRAM{$ngram}<BR>\n";
}
else {
$CORRECT[$n] += $REF_NGRAM{$ngram};
# $corr = $REF_NGRAM{$ngram};
# print "$i e correct2 $REF_NGRAM{$ngram}<BR>\n";
}
}
# $REF_NGRAM{$ngram} = 0 if !defined $REF_NGRAM{$ngram};
# print STDERR "$ngram: {$s, $REF_NGRAM{$ngram}, $T_NGRAM{$ngram}, $corr}\n"
}
}
$s++;
}
my $brevity_penalty = 1;
my $bleu = 0;
my @bleu=();
for(my $n=1;$n<=4;$n++) {
if (defined ($TOTAL[$n])){
$bleu[$n]=($TOTAL[$n])?$CORRECT[$n]/$TOTAL[$n]:0;
# print STDERR "CORRECT[$n]:$CORRECT[$n] TOTAL[$n]:$TOTAL[$n]\n";
}else{
$bleu[$n]=0;
}
}
if ($length_reference==0){
printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n";
exit(1);
}
if ($length_translation<$length_reference) {
$brevity_penalty = exp(1-$length_reference/$length_translation);
}
$bleu = $brevity_penalty * exp((my_log( $bleu[1] ) +
my_log( $bleu[2] ) +
my_log( $bleu[3] ) +
my_log( $bleu[4] ) ) / 4) ;
printf "BLEU = %.2f, %.1f/%.1f/%.1f/%.1f (BP=%.3f, ratio=%.3f, hyp_len=%d, ref_len=%d)\n",
100*$bleu,
100*$bleu[1],
100*$bleu[2],
100*$bleu[3],
100*$bleu[4],
$brevity_penalty,
$length_translation / $length_reference,
$length_translation,
$length_reference;
sub my_log {
return -9999999999 unless $_[0];
return log($_[0]);
}
#!/usr/bin/env bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey);
# Arnab Ghoshal, Karel Vesely
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Parse command-line options.
# To be sourced by another script (as in ". parse_options.sh").
# Option format is: --option-name arg
# and shell variable "option_name" gets set to value "arg."
# The exception is --help, which takes no arguments, but prints the
# $help_message variable (if defined).
###
### The --config file options have lower priority to command line
### options, so we need to import them first...
###
# Now import all the configs specified by command-line, in left-to-right order
for ((argpos=1; argpos<$#; argpos++)); do
if [ "${!argpos}" == "--config" ]; then
argpos_plus1=$((argpos+1))
config=${!argpos_plus1}
[ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
. $config # source the config file.
fi
done
###
### Now we process the command line options
###
while true; do
[ -z "${1:-}" ] && break; # break if there are no arguments
case "$1" in
# If the enclosing script is called with --help option, print the help
# message and exit. Scripts should put help messages in $help_message
--help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
else printf "$help_message\n" 1>&2 ; fi;
exit 0 ;;
--*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
exit 1 ;;
# If the first command-line argument begins with "--" (e.g. --foo-bar),
# then work out the variable name as $name, which will equal "foo_bar".
--*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
# Next we test whether the variable in question is undefined -- if so it's
# an invalid option and we die. Note: $0 evaluates to the name of the
# enclosing script.
# The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
# is undefined. We then have to wrap this test inside "eval" because
# foo_bar is itself inside a variable ($name).
eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
oldval="`eval echo \\$$name`";
# Work out whether we seem to be expecting a Boolean argument.
if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
was_bool=true;
else
was_bool=false;
fi
# Set the variable to the right value-- the escaped quotes make it work if
# the option had spaces, like --cmd "queue.pl -sync y"
eval $name=\"$2\";
# Check that Boolean-valued arguments are really Boolean.
if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
exit 1;
fi
shift 2;
;;
*) break;
esac
done
# Check for an empty argument to the --cmd option, which can easily occur as a
# result of scripting errors.
[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
true; # so this script returns exit code 0.
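# Usage sketch (illustrative, following the header comments above): a calling
# script defines its defaults, sources this file, and --name value pairs on
# its command line override them, e.g.
#   beam_size=5
#   . ./local/parse_options.sh || exit 1
#   # invoking the caller with "--beam_size 10" sets beam_size=10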
#!/usr/bin/env perl
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
use warnings;
use strict;
#binmode(STDIN, ":utf8");
#binmode(STDOUT, ":utf8");
while(<STDIN>) {
s/,/,/g;
s/。 */. /g;
s/、/,/g;
s/”/"/g;
s/“/"/g;
s/∶/:/g;
s/:/:/g;
s/?/\?/g;
s/《/"/g;
s/》/"/g;
s/)/\)/g;
s/!/\!/g;
s/(/\(/g;
s/;/;/g;
s/1/"/g;
s/」/"/g;
s/「/"/g;
s/0/0/g;
s/3/3/g;
s/2/2/g;
s/5/5/g;
s/6/6/g;
s/9/9/g;
s/7/7/g;
s/8/8/g;
s/4/4/g;
s/. */. /g;
s/~/\~/g;
s/’/\'/g;
s/…/\.\.\./g;
s/━/\-/g;
s/〈/\</g;
s/〉/\>/g;
s/【/\[/g;
s/】/\]/g;
s/%/\%/g;
print $_;
}
get_devices(){
gpu_num=$1
use_cpu=$2
device=()
while :
do
record=$(mktemp -t temp.record.XXXXXX)
gpustat > $record
all_devices=$(seq 0 "$(sed '1,2d' ${record} | wc -l)");
count=0
for dev in ${all_devices[@]}
do
line=$((dev + 2))
use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
if [[ $use -lt 100 ]]; then
device[$count]=$dev
count=$((count + 1))
if [[ $count -eq $gpu_num ]]; then
break
fi
fi
done
if [[ ${#device[@]} -lt $gpu_num ]]; then
if [[ $use_cpu -eq 1 ]]; then
device=(-1)
else
sleep 60s
fi
else
break
fi
done
echo ${device[*]} | sed 's/ /,/g'
return $?
}
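# Usage sketch (an assumption based on the run scripts): select ${gpu_num}
# idle GPUs and expose them to training/decoding:
#   device=$(get_devices ${gpu_num} 0)
#   export CUDA_VISIBLE_DEVICES=${device}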
#!/usr/bin/env bash
# calculate wmt14 en-de multi-bleu score
if [ $# -ne 1 ]; then
echo "usage: $0 GENERATE_PY_OUTPUT"
exit 1
fi
echo -e "\n RUN >> "$0
requirement_scripts=(detokenizer.perl replace-unicode-punctuation.perl tokenizer.perl multi-bleu.perl)
for script in ${requirement_scripts[@]}; do
if ! which ${script} > /dev/null; then
echo "Error: it seems that moses is not installed or exported int the environment variables." >&2
return 1
fi
done
detokenizer=detokenizer.perl
replace_unicode_punctuation=replace-unicode-punctuation.perl
tokenizer=tokenizer.perl
multi_bleu=multi-bleu.perl
GEN=$1
SYS=$GEN.sys
REF=$GEN.ref
cat $GEN | cut -f 3 > $REF
cat $GEN | cut -f 4 > $SYS
# detokenize the decoded files so they can be re-tokenized in a consistent manner
$detokenizer -l de < $SYS > $SYS.dtk
$detokenizer -l de < $REF > $REF.dtk
# replace unicode punctuation
$replace_unicode_punctuation -l de < $SYS.dtk > $SYS.dtk.punc
$replace_unicode_punctuation -l de < $REF.dtk > $REF.dtk.punc
# tokenize the detokenized files with the Moses tokenizer.perl
$tokenizer -l de < $SYS.dtk.punc > $SYS.dtk.punc.tok
$tokenizer -l de < $REF.dtk.punc > $REF.dtk.punc.tok
#"rich-text format" --> rich ##AT##-##AT## text format.
perl -ple 's{(\S)-(\S)}{$1 ##AT##-##AT## $2}g' < $SYS.dtk.punc.tok > $SYS.dtk.punc.tok.atat
perl -ple 's{(\S)-(\S)}{$1 ##AT##-##AT## $2}g' < $REF.dtk.punc.tok > $REF.dtk.punc.tok.atat
$multi_bleu $REF.dtk.punc.tok.atat < $SYS.dtk.punc.tok.atat
rm -f $SYS.dtk $SYS.dtk.punc $SYS.dtk.punc.tok $REF.dtk $REF.dtk.punc $REF.dtk.punc.tok
\ No newline at end of file
#!/usr/bin/env bash
# training the model
gpu_num=4
update_freq=4
max_tokens=8192
exp_tag=baseline
#config_list=(base)
config_list=(deep)
# exp full name
exp_name=
extra_tag=
extra_parameter=
#extra_tag="${extra_tag}"
#extra_parameter="${extra_parameter} "
train_config=$(echo ${config_list[*]} | sed 's/ /,/g')
cmd="./run.sh
--stage 1
--stop_stage 1
--gpu_num ${gpu_num}
--update_freq ${update_freq}
--train_config ${train_config}
--max_tokens ${max_tokens}
"
if [[ -n ${exp_name} ]]; then
cmd="$cmd --exp_name ${exp_name}"
fi
if [[ -n ${exp_tag} ]]; then
cmd="$cmd --exp_tag ${exp_tag}"
fi
if [[ -n ${extra_tag} ]]; then
cmd="$cmd --extra_tag ${extra_tag}"
fi
if [[ -n ${extra_parameter} ]]; then
cmd="$cmd --extra_parameter \"${extra_parameter}\""
fi
echo ${cmd}
eval ${cmd}