Commit 31e61da0 by xuchen

modify the shell scripts

parent 0e2452b9
@@ -36,10 +36,10 @@ dataset=mustc
 task=speech_to_text
 vocab_type=unigram
 vocab_size=5000
-speed_perturb=1
+speed_perturb=0
 org_data_dir=/media/data/${dataset}
-data_dir=~/st/data/${dataset}/asr
+data_dir=~/st/data/${dataset}/asr_lcrm
 test_subset=tst-COMMON
 # exp
@@ -51,6 +51,7 @@ exp_name=
 # config
 train_config=train_ctc.yaml
 data_config=config_asr.yaml
+data_config=config_st_share.yaml
 # training setting
 fp16=1
@@ -98,6 +99,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --data-root ${org_data_dir}
         --output-root ${data_dir}
         --task asr
+        --lowercase-src
+        --rm-punc-src
         --vocab-type ${vocab_type}
         --vocab-size ${vocab_size}"
     if [[ ${speed_perturb} -eq 1 ]]; then
...
@@ -6,7 +6,7 @@ gpu_num=8
 update_freq=2
 max_tokens=20000
-extra_tag=
+extra_tag=lcrm
 extra_parameter=
 #extra_tag="${extra_tag}"
...
@@ -13,6 +13,7 @@ report-accuracy: True
 skip-invalid-size-inputs-valid-test: True
 #load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
 arch: transformer
 share-decoder-input-output-embed: True
@@ -21,22 +22,25 @@ clip-norm: 10.0
 lr-scheduler: inverse_sqrt
 warmup-init-lr: 1e-7
 warmup-updates: 8000
-lr: 5e-4
-adam_betas: (0.9,0.98)
+lr: 1e-3
+adam_betas: (0.9,0.997)
 criterion: label_smoothed_cross_entropy
 label_smoothing: 0.1
 dropout: 0.1
+attention-dropout: 0.1
+activation-dropout: 0.1
 activation-fn: relu
 encoder-normalize-before: True
 decoder-normalize-before: True
-encoder-embed-dim: 256
+encoder-embed-dim: 512
 encoder-ffn-embed-dim: 2048
 encoder-layers: 6
 decoder-layers: 6
-encoder-attention-heads: 4
-decoder-embed-dim: 256
+encoder-attention-heads: 8
+decoder-embed-dim: 512
 decoder-ffn-embed-dim: 2048
-decoder-attention-heads: 4
+decoder-attention-heads: 8
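The optimizer change in this hunk (lr 5e-4 to 1e-3, Adam beta2 0.98 to 0.997) accompanies the wider model (embed 256 to 512, heads 4 to 8). For reference, the inverse_sqrt scheduler these configs select warms the learning rate up linearly and then decays it with the inverse square root of the update number; below is a minimal sketch of that formula with the new values, not the fairseq source itself:

# Sketch of the inverse_sqrt schedule with the values from this commit
# (warmup-init-lr: 1e-7, warmup-updates: 8000, lr: 1e-3).
def inverse_sqrt_lr(step, peak_lr=1e-3, warmup_init_lr=1e-7, warmup_updates=8000):
    if step < warmup_updates:
        # linear warmup from warmup_init_lr to peak_lr
        return warmup_init_lr + step * (peak_lr - warmup_init_lr) / warmup_updates
    # afterwards, decay with the inverse square root of the step number
    return peak_lr * (warmup_updates / step) ** 0.5

for step in (1000, 8000, 32000, 128000):
    print(step, inverse_sqrt_lr(step))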
@@ -13,21 +13,25 @@ report-accuracy: True
 skip-invalid-size-inputs-valid-test: True
 #load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
-arch: transformer
+arch: dlcl_transformer
 share-decoder-input-output-embed: True
 optimizer: adam
 clip-norm: 10.0
 lr-scheduler: inverse_sqrt
 warmup-init-lr: 1e-7
 warmup-updates: 8000
-lr: 5e-4
-adam_betas: (0.9,0.98)
+lr: 1e-3
+adam_betas: (0.9,0.997)
 criterion: label_smoothed_cross_entropy
 label_smoothing: 0.1
 dropout: 0.1
+attention-dropout: 0.1
+activation-dropout: 0.1
 activation-fn: relu
 encoder-normalize-before: True
 decoder-normalize-before: True
...
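The arch switch above selects dlcl_transformer, which I take to be the dynamic linear combination of layers (DLCL) of Wang et al. (2019), where each layer reads a learned weighted sum of all preceding layers' outputs rather than only the last one. A toy sketch of that combination step, under that assumption (names and weights are illustrative):

import numpy as np

# Toy DLCL combination: the input to layer l is a learned weighted sum
# of the outputs of layers 0..l-1 (random stand-in tensors here).
def dlcl_combine(layer_outputs, weights):
    return sum(w * y for w, y in zip(weights, layer_outputs))

outputs = [np.random.randn(5, 512) for _ in range(3)]  # three previous layers
weights = [0.2, 0.3, 0.5]                              # learned in practice
print(dlcl_combine(outputs, weights).shape)            # (5, 512)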
+train-subset: train
+valid-subset: valid
+max-epoch: 50
+max-update: 100000
+num-workers: 8
+patience: 10
+no-progress-bar: True
+log-interval: 100
+seed: 1
+report-accuracy: True
+skip-invalid-size-inputs-valid-test: True
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
+arch: dlcl_transformer
+share-decoder-input-output-embed: True
+optimizer: adam
+clip-norm: 10.0
+lr-scheduler: inverse_sqrt
+warmup-init-lr: 1e-7
+warmup-updates: 8000
+lr: 1e-3
+adam_betas: (0.9,0.997)
+criterion: label_smoothed_cross_entropy
+label_smoothing: 0.1
+dropout: 0.1
+attention-dropout: 0.1
+activation-dropout: 0.1
+activation-fn: relu
+encoder-normalize-before: True
+decoder-normalize-before: True
+encoder-embed-dim: 512
+encoder-ffn-embed-dim: 2048
+encoder-layers: 6
+decoder-layers: 6
+encoder-attention-heads: 8
+decoder-embed-dim: 512
+decoder-ffn-embed-dim: 2048
+decoder-attention-heads: 8
+encoder-attention-type: relative
+decoder-attention-type: relative
+max-relative-length: 20
\ No newline at end of file
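The new file above enables relative position representations in self-attention (encoder/decoder-attention-type: relative) with distances clipped at max-relative-length: 20, in the style of Shaw et al. (2018). A sketch of the index computation this implies (a hypothetical helper, not the repository's code):

# Relative-position indexing with max-relative-length k = 20:
# clip the signed distance j - i into [-k, k], then shift to [0, 2k]
# so it can index a (2k + 1)-entry embedding table.
def relative_position_bucket(i, j, k=20):
    rel = max(-k, min(k, j - i))
    return rel + k

print(relative_position_bucket(0, 5))    # 25
print(relative_position_bucket(0, 100))  # clipped to 40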
+train-subset: train
+valid-subset: valid
+max-epoch: 50
+max-update: 100000
+num-workers: 8
+patience: 10
+no-progress-bar: True
+log-interval: 100
+seed: 1
+report-accuracy: True
+skip-invalid-size-inputs-valid-test: True
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
+arch: transformer
+share-decoder-input-output-embed: True
+optimizer: adam
+clip-norm: 10.0
+lr-scheduler: inverse_sqrt
+warmup-init-lr: 1e-7
+warmup-updates: 8000
+lr: 1e-3
+adam_betas: (0.9,0.997)
+criterion: label_smoothed_cross_entropy
+label_smoothing: 0.1
+dropout: 0.1
+attention-dropout: 0.1
+activation-dropout: 0.1
+activation-fn: relu
+encoder-normalize-before: True
+decoder-normalize-before: True
+encoder-embed-dim: 512
+encoder-ffn-embed-dim: 2048
+encoder-layers: 6
+decoder-layers: 6
+encoder-attention-heads: 8
+decoder-embed-dim: 512
+decoder-ffn-embed-dim: 2048
+decoder-attention-heads: 8
+encoder-attention-type: relative
+decoder-attention-type: relative
+max-relative-length: 20
 train-subset: train
 valid-subset: valid
-max-epoch: 20
-max-update: 1000000
+max-epoch: 50
+max-update: 100000
 num-workers: 8
 patience: 10
@@ -13,6 +13,7 @@ report-accuracy: True
 skip-invalid-size-inputs-valid-test: True
 #load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
 arch: transformer
 share-decoder-input-output-embed: True
@@ -21,21 +22,24 @@ clip-norm: 10.0
 lr-scheduler: inverse_sqrt
 warmup-init-lr: 1e-7
 warmup-updates: 8000
-lr: 5e-4
-adam_betas: (0.9,0.98)
+lr: 1e-3
+adam_betas: (0.9,0.997)
 criterion: label_smoothed_cross_entropy
 label_smoothing: 0.1
 dropout: 0.1
+attention-dropout: 0.1
+activation-dropout: 0.1
 activation-fn: relu
 encoder-normalize-before: True
 decoder-normalize-before: True
-encoder-embed-dim: 512
+encoder-embed-dim: 256
 encoder-ffn-embed-dim: 2048
 encoder-layers: 6
 decoder-layers: 6
-encoder-attention-heads: 8
+encoder-attention-heads: 4
 decoder-embed-dim: 256
 decoder-ffn-embed-dim: 2048
...
+# local/lower_rm.py: lowercase each line and strip ASCII punctuation
+# (the "lcrm" normalization applied to the MT source side).
+import string
+import sys
+
+in_file = sys.argv[1]
+with open(in_file, "r", encoding="utf-8") as f:
+    for line in f:
+        line = line.strip().lower()
+        # remove every ASCII punctuation character
+        for w in string.punctuation:
+            line = line.replace(w, "")
+        print(line)
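One caveat worth knowing about this helper: string.punctuation contains only the 32 ASCII punctuation characters, so curly quotes and other Unicode punctuation pass through unchanged. A quick check:

import string
# Only ASCII punctuation is removed; Unicode marks survive.
print(string.punctuation)  # !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~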
@@ -37,10 +37,11 @@ task=translation
 vocab_type=unigram
 vocab_size=10000
 share_dict=1
+lc_rm=1
 use_specific_dict=1
-specific_prefix=st_share10k
-specific_dir=/home/xuchen/st/data/mustc/st/en-de
+specific_prefix=st_share10k_lcrm
+specific_dir=/home/xuchen/st/data/mustc/st_lcrm/en-de
 src_vocab_prefix=spm_unigram10000_st_share
 tgt_vocab_prefix=spm_unigram10000_st_share
@@ -48,7 +49,8 @@ org_data_dir=/media/data/${dataset}
 data_dir=~/st/data/${dataset}/mt/${lang}
 train_subset=train
 valid_subset=dev
-test_subset=test
+test_subset=tst-COMMON
+trans_set=test
 # exp
 extra_tag=
@@ -132,10 +134,13 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     mkdir -p ${data_dir}/data
     for split in ${train_subset} ${valid_subset} ${test_subset}; do
-        cmd="spm_encode
-            --model ${data_dir}/${src_vocab_prefix}.model
+        cmd="cat ${org_data_dir}/${lang}/data/${split}.${src_lang}"
+        if [[ ${lc_rm} -eq 1 ]]; then
+            cmd="python local/lower_rm.py ${org_data_dir}/${lang}/data/${split}.${src_lang}"
+        fi
+        cmd="${cmd}
+        | spm_encode --model ${data_dir}/${src_vocab_prefix}.model
             --output_format=piece
-            < ${org_data_dir}/${lang}/data/${split}.${src_lang}
             > ${data_dir}/data/${split}.${src_lang}"
        echo -e "\033[34mRun command: \n${cmd} \033[0m"
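For illustration, a rough Python equivalent of the pipeline built above (optionally lowercase and strip punctuation, then SentencePiece-encode), assuming the sentencepiece Python package is installed; the file and model names are placeholders:

import string
import sentencepiece as spm

sp = spm.SentencePieceProcessor(model_file="spm_unigram10000_st_share.model")

with open("train.en", encoding="utf-8") as src, \
     open("train.spm.en", "w", encoding="utf-8") as out:
    for line in src:
        line = line.strip().lower()  # --lowercase-src / lower_rm.py
        line = line.translate(str.maketrans("", "", string.punctuation))  # --rm-punc-src
        # encode to subword pieces, one space-joined line per sentence
        out.write(" ".join(sp.encode(line, out_type=str)) + "\n")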
@@ -311,8 +316,8 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     result_file=${model_dir}/decode_result
     [[ -f ${result_file} ]] && rm ${result_file}
-    test_subset=(${test_subset//,/ })
-    for subset in ${test_subset[@]}; do
+    trans_set=(${trans_set//,/ })
+    for subset in ${trans_set[@]}; do
         cmd="python ${root_dir}/fairseq_cli/generate.py
         ${data_dir}
         --source-lang ${src_lang}
...
@@ -11,7 +11,7 @@ log-interval: 100
 seed: 1
 report-accuracy: True
-#load-pretrained-acoustic-encoder-from: /home/xuchen/st/checkpoints/mustc/asr/train_ctc_baseline/avg_10_checkpoint.pt
+#load-pretrained-acoustic-encoder-from: /home/xuchen/st/checkpoints/mustc/asr/train_ctc_st_vocab/avg_10_checkpoint.pt
 #load-pretrained-text-encoder-from: /home/xuchen/st/checkpoints/mustc/mt/st_share10k_train_baseline/avg_10_checkpoint.pt
 #load-pretrained-decoder-from: /home/xuchen/st/checkpoints/mustc/mt/st_share10k_train_baseline/avg_10_checkpoint.pt
...
+train-subset: train_st
+valid-subset: dev_st
+max-epoch: 50
+max-update: 100000
+num-workers: 8
+patience: 10
+no-progress-bar: True
+log-interval: 100
+seed: 1
+report-accuracy: True
+load-pretrained-acoustic-encoder-from: /home/xuchen/st/checkpoints/mustc/asr/train_ctc_st_vocab/avg_10_checkpoint.pt
+load-pretrained-text-encoder-from: /home/xuchen/st/checkpoints/mustc/mt/st_share10k_train_baseline/avg_10_checkpoint.pt
+load-pretrained-decoder-from: /home/xuchen/st/checkpoints/mustc/mt/st_share10k_train_baseline/avg_10_checkpoint.pt
+arch: s2t_sate
+share-decoder-input-output-embed: True
+optimizer: adam
+clip-norm: 10.0
+lr-scheduler: inverse_sqrt
+warmup-init-lr: 1e-7
+warmup-updates: 10000
+lr: 2e-3
+#adam_betas: (0.9,0.98)
+criterion: label_smoothed_cross_entropy
+label_smoothing: 0.1
+encoder-normalize-before: True
+decoder-normalize-before: True
+conv-kernel-sizes: 5,5
+conv-channels: 1024
+dropout: 0.1
+activation-fn: relu
+encoder-embed-dim: 256
+encoder-ffn-embed-dim: 2048
+encoder-layers: 12
+text-encoder-layers: 6
+decoder-layers: 6
+encoder-attention-heads: 4
+macaron-style: True
+use-cnn-module: True
+cnn-module-kernel: 31
+acoustic-encoder: transformer
+adapter: league
+#decoder-embed-dim: 256
+#decoder-ffn-embed-dim: 2048
+#decoder-attention-heads: 4
+#attention-dropout: 0.1
+#activation-dropout: 0.1
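This new s2t_sate config wires up a stacked acoustic-and-textual encoding: a 12-layer Conformer-flavored acoustic encoder (macaron-style, use-cnn-module, cnn-module-kernel: 31), a 6-layer text encoder initialized from the MT model, and an adapter between them (adapter: league), each initialized from the pretrained checkpoints listed above. A very rough sketch of the data flow, with illustrative names only (not the repository's classes):

# Sketch of stacked acoustic-and-textual encoding; all modules are stubs.
def sate_forward(speech_feats, acoustic_encoder, adapter, text_encoder, decoder, prev_tokens):
    acoustic_out = acoustic_encoder(speech_feats)  # 12-layer Conformer-style encoder
    text_in = adapter(acoustic_out)                # e.g. the "league" adapter
    encoder_out = text_encoder(text_in)            # 6-layer MT-style text encoder
    return decoder(prev_tokens, encoder_out)       # standard Transformer decoder

# Stub call to show the wiring; real modules come from the training framework.
out = sate_forward("feats", lambda x: x, lambda x: x, lambda x: x,
                   lambda toks, enc: (toks, enc), "tokens")
print(out)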
@@ -38,10 +38,10 @@ vocab_type=unigram
 asr_vocab_size=5000
 vocab_size=10000
 share_dict=1
-speed_perturb=1
+speed_perturb=0
 org_data_dir=/media/data/${dataset}
-data_dir=~/st/data/${dataset}/st
+data_dir=~/st/data/${dataset}/st_lcrm
 test_subset=tst-COMMON
 # exp
@@ -121,6 +121,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --output-root ${data_dir}
         --task st
         --add-src
+        --lowercase-src
+        --rm-punc-src
         --cmvn-type utterance
         --vocab-type ${vocab_type}
         --vocab-size ${vocab_size}"
...
@@ -6,14 +6,15 @@ gpu_num=8
 update_freq=2
 max_tokens=20000
-extra_tag=
+extra_tag=lcrm
 extra_parameter=
 #extra_tag="${extra_tag}"
 #extra_parameter="${extra_parameter} "
-exp_tag=
-train_config=train_ctc.yaml
+exp_tag=baseline
+train_config=train_ctc_sate.yaml
+#train_config=train_ctc.yaml
 cmd="./run.sh
     --stage 1
...
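With gpu_num=8, max_tokens=20000, and update_freq=2 as above, each optimizer step aggregates roughly 320k tokens, since fairseq multiplies the per-GPU token budget by the number of GPUs and the gradient-accumulation factor:

# Back-of-envelope effective batch size for the settings above.
gpu_num, max_tokens, update_freq = 8, 20000, 2
print(gpu_num * max_tokens * update_freq)  # 320000 tokens per optimizer step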
 train-subset: train
 valid-subset: valid
-max-epoch: 20
+max-epoch: 50
 max-update: 1000000
 num-workers: 8
@@ -13,6 +13,7 @@ report-accuracy: True
 skip-invalid-size-inputs-valid-test: True
 #load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
 arch: transformer
 share-decoder-input-output-embed: True
@@ -21,22 +22,25 @@ clip-norm: 10.0
 lr-scheduler: inverse_sqrt
 warmup-init-lr: 1e-7
 warmup-updates: 8000
-lr: 5e-4
-adam_betas: (0.9,0.98)
+lr: 1e-3
+adam_betas: (0.9,0.997)
 criterion: label_smoothed_cross_entropy
 label_smoothing: 0.1
 dropout: 0.1
+attention-dropout: 0.1
+activation-dropout: 0.1
 activation-fn: relu
 encoder-normalize-before: True
 decoder-normalize-before: True
-encoder-embed-dim: 256
+encoder-embed-dim: 512
 encoder-ffn-embed-dim: 2048
 encoder-layers: 6
 decoder-layers: 6
-encoder-attention-heads: 4
-decoder-embed-dim: 256
+encoder-attention-heads: 8
+decoder-embed-dim: 512
 decoder-ffn-embed-dim: 2048
-decoder-attention-heads: 4
+decoder-attention-heads: 8
+train-subset: train
+valid-subset: valid
+max-epoch: 50
+max-update: 100000
+num-workers: 8
+patience: 10
+no-progress-bar: True
+log-interval: 100
+seed: 1
+report-accuracy: True
+skip-invalid-size-inputs-valid-test: True
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
+arch: dlcl_transformer
+share-decoder-input-output-embed: True
+optimizer: adam
+clip-norm: 10.0
+lr-scheduler: inverse_sqrt
+warmup-init-lr: 1e-7
+warmup-updates: 16000
+lr: 2e-3
+adam_betas: (0.9,0.997)
+criterion: label_smoothed_cross_entropy
+label_smoothing: 0.1
+dropout: 0.1
+attention-dropout: 0.1
+activation-dropout: 0.1
+activation-fn: relu
+encoder-normalize-before: True
+decoder-normalize-before: True
+encoder-embed-dim: 512
+encoder-ffn-embed-dim: 2048
+encoder-layers: 6
+decoder-layers: 6
+encoder-attention-heads: 8
+decoder-embed-dim: 512
+decoder-ffn-embed-dim: 2048
+decoder-attention-heads: 8
+train-subset: train
+valid-subset: valid
+max-epoch: 50
+max-update: 100000
+num-workers: 8
+patience: 10
+no-progress-bar: True
+log-interval: 100
+seed: 1
+report-accuracy: True
+skip-invalid-size-inputs-valid-test: True
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
+arch: dlcl_transformer
+share-decoder-input-output-embed: True
+optimizer: adam
+clip-norm: 10.0
+lr-scheduler: inverse_sqrt
+warmup-init-lr: 1e-7
+warmup-updates: 16000
+lr: 2e-3
+adam_betas: (0.9,0.997)
+criterion: label_smoothed_cross_entropy
+label_smoothing: 0.1
+dropout: 0.1
+attention-dropout: 0.1
+activation-dropout: 0.1
+activation-fn: relu
+encoder-normalize-before: True
+decoder-normalize-before: True
+encoder-embed-dim: 512
+encoder-ffn-embed-dim: 2048
+encoder-layers: 6
+decoder-layers: 6
+encoder-attention-heads: 8
+decoder-embed-dim: 512
+decoder-ffn-embed-dim: 2048
+decoder-attention-heads: 8
+encoder-attention-type: relative
+decoder-attention-type: relative
+max-relative-length: 20
\ No newline at end of file
+train-subset: train
+valid-subset: valid
+max-epoch: 50
+max-update: 100000
+num-workers: 8
+patience: 10
+no-progress-bar: True
+log-interval: 100
+seed: 1
+report-accuracy: True
+skip-invalid-size-inputs-valid-test: True
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
+arch: transformer
+share-decoder-input-output-embed: True
+optimizer: adam
+clip-norm: 10.0
+lr-scheduler: inverse_sqrt
+warmup-init-lr: 1e-7
+warmup-updates: 8000
+lr: 1e-3
+adam_betas: (0.9,0.997)
+criterion: label_smoothed_cross_entropy
+label_smoothing: 0.1
+dropout: 0.1
+attention-dropout: 0.1
+activation-dropout: 0.1
+activation-fn: relu
+encoder-normalize-before: True
+decoder-normalize-before: True
+encoder-embed-dim: 512
+encoder-ffn-embed-dim: 2048
+encoder-layers: 6
+decoder-layers: 6
+encoder-attention-heads: 8
+decoder-embed-dim: 512
+decoder-ffn-embed-dim: 2048
+decoder-attention-heads: 8
+encoder-attention-type: relative
+decoder-attention-type: relative
+max-relative-length: 20
+train-subset: train
+valid-subset: valid
+max-epoch: 50
+max-update: 100000
+num-workers: 8
+patience: 10
+no-progress-bar: True
+log-interval: 100
+seed: 1
+report-accuracy: True
+skip-invalid-size-inputs-valid-test: True
+#load-pretrained-encoder-from:
+#load-pretrained-decoder-from:
+arch: transformer
+share-decoder-input-output-embed: True
+optimizer: adam
+clip-norm: 10.0
+lr-scheduler: inverse_sqrt
+warmup-init-lr: 1e-7
+warmup-updates: 8000
+lr: 1e-3
+adam_betas: (0.9,0.997)
+criterion: label_smoothed_cross_entropy
+label_smoothing: 0.1
+dropout: 0.1
+attention-dropout: 0.1
+activation-dropout: 0.1
+activation-fn: relu
+encoder-normalize-before: True
+decoder-normalize-before: True
+encoder-embed-dim: 256
+encoder-ffn-embed-dim: 2048
+encoder-layers: 6
+decoder-layers: 6
+encoder-attention-heads: 4
+decoder-embed-dim: 256
+decoder-ffn-embed-dim: 2048
+decoder-attention-heads: 4
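All of these configs keep criterion: label_smoothed_cross_entropy with label_smoothing: 0.1: the gold token keeps probability mass 1 - eps and the remaining eps is spread over the rest of the vocabulary. A minimal numpy sketch of the standard formulation (fairseq's criterion normalizes eps slightly differently):

import numpy as np

def label_smoothed_nll(log_probs, target, eps=0.1):
    # log_probs: (vocab,) log-probabilities for one position; target: gold index
    nll = -log_probs[target]      # standard cross-entropy term
    smooth = -log_probs.mean()    # uniform term over the vocabulary
    return (1.0 - eps) * nll + eps * smooth

logits = np.random.randn(1000)
log_probs = logits - np.log(np.exp(logits).sum())  # normalize to log-probs
print(label_smoothed_nll(log_probs, target=3))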