Commit 31e61da0 by xuchen

modify the shell scripts

parent 0e2452b9
......@@ -36,10 +36,10 @@ dataset=mustc
task=speech_to_text
vocab_type=unigram
vocab_size=5000
speed_perturb=1
speed_perturb=0
org_data_dir=/media/data/${dataset}
data_dir=~/st/data/${dataset}/asr
data_dir=~/st/data/${dataset}/asr_lcrm
test_subset=tst-COMMON
# exp
......@@ -51,6 +51,7 @@ exp_name=
# config
train_config=train_ctc.yaml
data_config=config_asr.yaml
data_config=config_st_share.yaml
# training setting
fp16=1
......@@ -98,6 +99,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--data-root ${org_data_dir}
--output-root ${data_dir}
--task asr
--lowercase-src
--rm-punc-src
--vocab-type ${vocab_type}
--vocab-size ${vocab_size}"
if [[ ${speed_perturb} -eq 1 ]]; then
......
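(Note: the lcrm suffix introduced throughout this commit, as in asr_lcrm and extra_tag=lcrm, marks data whose source side is lowercased with punctuation removed; the --lowercase-src/--rm-punc-src flags above and the local/lower_rm.py script added below implement this normalization.)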
......@@ -6,7 +6,7 @@ gpu_num=8
update_freq=2
max_tokens=20000
extra_tag=
extra_tag=lcrm
extra_parameter=
#extra_tag="${extra_tag}"
......
......@@ -13,6 +13,7 @@ report-accuracy: True
skip-invalid-size-inputs-valid-test: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: transformer
share-decoder-input-output-embed: True
......@@ -21,22 +22,25 @@ clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 8000
lr: 5e-4
adam_betas: (0.9,0.98)
lr: 1e-3
adam_betas: (0.9,0.997)
criterion: label_smoothed_cross_entropy
label_smoothing: 0.1
dropout: 0.1
attention-dropout: 0.1
activation-dropout: 0.1
activation-fn: relu
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-dim: 256
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
encoder-attention-heads: 8
decoder-embed-dim: 256
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
decoder-attention-heads: 8
......@@ -13,21 +13,25 @@ report-accuracy: True
skip-invalid-size-inputs-valid-test: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: transformer
arch: dlcl_transformer
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 8000
lr: 5e-4
adam_betas: (0.9,0.98)
lr: 1e-3
adam_betas: (0.9,0.997)
criterion: label_smoothed_cross_entropy
label_smoothing: 0.1
dropout: 0.1
attention-dropout: 0.1
activation-dropout: 0.1
activation-fn: relu
encoder-normalize-before: True
decoder-normalize-before: True
......
train-subset: train
valid-subset: valid
max-epoch: 50
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: dlcl_transformer
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 8000
lr: 1e-3
adam_betas: (0.9,0.997)
criterion: label_smoothed_cross_entropy
label_smoothing: 0.1
dropout: 0.1
attention-dropout: 0.1
activation-dropout: 0.1
activation-fn: relu
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
encoder-attention-type: relative
decoder-attention-type: relative
max-relative-length: 20
\ No newline at end of file
......
train-subset: train
valid-subset: valid
max-epoch: 50
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: transformer
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 8000
lr: 1e-3
adam_betas: (0.9,0.997)
criterion: label_smoothed_cross_entropy
label_smoothing: 0.1
dropout: 0.1
attention-dropout: 0.1
activation-dropout: 0.1
activation-fn: relu
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
encoder-attention-type: relative
decoder-attention-type: relative
max-relative-length: 20
......
train-subset: train
valid-subset: valid
max-epoch: 20
max-update: 1000000
max-epoch: 50
max-update: 100000
num-workers: 8
patience: 10
......@@ -13,6 +13,7 @@ report-accuracy: True
skip-invalid-size-inputs-valid-test: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: transformer
share-decoder-input-output-embed: True
......@@ -21,21 +22,24 @@ clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 8000
lr: 5e-4
adam_betas: (0.9,0.98)
lr: 1e-3
adam_betas: (0.9,0.997)
criterion: label_smoothed_cross_entropy
label_smoothing: 0.1
dropout: 0.1
attention-dropout: 0.1
activation-dropout: 0.1
activation-fn: relu
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-dim: 512
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 8
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
......
import sys
import string

# Normalize source text for the "lcrm" setting: lowercase each line and
# strip all ASCII punctuation, writing the result to stdout.
in_file = sys.argv[1]
with open(in_file, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip().lower()
        for w in string.punctuation:
            line = line.replace(w, "")
        print(line)
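A minimal usage sketch (the file names here are illustrative, not from the commit): the script takes one input path and writes the normalized text to stdout, so it can head a pipe or a redirect.

# hypothetical invocation: lowercase and strip punctuation from the English source
python local/lower_rm.py data/train.en > data/train.lc.en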
......@@ -37,10 +37,11 @@ task=translation
vocab_type=unigram
vocab_size=10000
share_dict=1
lc_rm=1
use_specific_dict=1
specific_prefix=st_share10k
specific_dir=/home/xuchen/st/data/mustc/st/en-de
specific_prefix=st_share10k_lcrm
specific_dir=/home/xuchen/st/data/mustc/st_lcrm/en-de
src_vocab_prefix=spm_unigram10000_st_share
tgt_vocab_prefix=spm_unigram10000_st_share
......@@ -48,7 +49,8 @@ org_data_dir=/media/data/${dataset}
data_dir=~/st/data/${dataset}/mt/${lang}
train_subset=train
valid_subset=dev
test_subset=test
test_subset=tst-COMMON
trans_set=test
# exp
extra_tag=
......@@ -132,10 +134,13 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
mkdir -p ${data_dir}/data
for split in ${train_subset} ${valid_subset} ${test_subset}; do
cmd="spm_encode
--model ${data_dir}/${src_vocab_prefix}.model
cmd="cat ${org_data_dir}/${lang}/data/${split}.${src_lang}"
if [[ ${lc_rm} -eq 1 ]]; then
cmd="python local/lower_rm.py ${org_data_dir}/${lang}/data/${split}.${src_lang}"
fi
cmd="${cmd}
| spm_encode --model ${data_dir}/${src_vocab_prefix}.model
--output_format=piece
< ${org_data_dir}/${lang}/data/${split}.${src_lang}
> ${data_dir}/data/${split}.${src_lang}"
echo -e "\033[34mRun command: \n${cmd} \033[0m"
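For clarity, with lc_rm=1 the command assembled above is equivalent to this pipeline (variables as defined in the script):

python local/lower_rm.py ${org_data_dir}/${lang}/data/${split}.${src_lang} \
    | spm_encode --model ${data_dir}/${src_vocab_prefix}.model --output_format=piece \
    > ${data_dir}/data/${split}.${src_lang}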
......@@ -311,8 +316,8 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
result_file=${model_dir}/decode_result
[[ -f ${result_file} ]] && rm ${result_file}
test_subset=(${test_subset//,/ })
for subset in ${test_subset[@]}; do
trans_set=(${trans_set//,/ })
for subset in ${trans_set[@]}; do
cmd="python ${root_dir}/fairseq_cli/generate.py
${data_dir}
--source-lang ${src_lang}
......
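The ${trans_set//,/ } expansion replaces each comma with a space and re-splits the result into a bash array, so a comma-separated trans_set decodes every listed subset in turn. A hypothetical two-set example:

trans_set=tst-COMMON,tst-HE
trans_set=(${trans_set//,/ })      # array: tst-COMMON tst-HE
for subset in ${trans_set[@]}; do echo "decoding ${subset}"; done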
......@@ -11,7 +11,7 @@ log-interval: 100
seed: 1
report-accuracy: True
#load-pretrained-acoustic-encoder-from: /home/xuchen/st/checkpoints/mustc/asr/train_ctc_baseline/avg_10_checkpoint.pt
#load-pretrained-acoustic-encoder-from: /home/xuchen/st/checkpoints/mustc/asr/train_ctc_st_vocab/avg_10_checkpoint.pt
#load-pretrained-text-encoder-from: /home/xuchen/st/checkpoints/mustc/mt/st_share10k_train_baseline/avg_10_checkpoint.pt
#load-pretrained-decoder-from: /home/xuchen/st/checkpoints/mustc/mt/st_share10k_train_baseline/avg_10_checkpoint.pt
......
train-subset: train_st
valid-subset: dev_st
max-epoch: 50
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
load-pretrained-acoustic-encoder-from: /home/xuchen/st/checkpoints/mustc/asr/train_ctc_st_vocab/avg_10_checkpoint.pt
load-pretrained-text-encoder-from: /home/xuchen/st/checkpoints/mustc/mt/st_share10k_train_baseline/avg_10_checkpoint.pt
load-pretrained-decoder-from: /home/xuchen/st/checkpoints/mustc/mt/st_share10k_train_baseline/avg_10_checkpoint.pt
arch: s2t_sate
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 10000
lr: 2e-3
#adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy
label_smoothing: 0.1
encoder-normalize-before: True
decoder-normalize-before: True
conv-kernel-sizes: 5,5
conv-channels: 1024
dropout: 0.1
activation-fn: relu
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 12
text-encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
macaron-style: True
use-cnn-module: True
cnn-module-kernel: 31
acoustic-encoder: transformer
adapter: league
#decoder-embed-dim: 256
#decoder-ffn-embed-dim: 2048
#decoder-attention-heads: 4
#attention-dropout: 0.1
#activation-dropout: 0.1
......@@ -38,10 +38,10 @@ vocab_type=unigram
asr_vocab_size=5000
vocab_size=10000
share_dict=1
speed_perturb=1
speed_perturb=0
org_data_dir=/media/data/${dataset}
data_dir=~/st/data/${dataset}/st
data_dir=~/st/data/${dataset}/st_lcrm
test_subset=tst-COMMON
# exp
......@@ -121,6 +121,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--output-root ${data_dir}
--task st
--add-src
--lowercase-src
--rm-punc-src
--cmvn-type utterance
--vocab-type ${vocab_type}
--vocab-size ${vocab_size}"
......
......@@ -6,14 +6,15 @@ gpu_num=8
update_freq=2
max_tokens=20000
extra_tag=
extra_tag=lcrm
extra_parameter=
#extra_tag="${extra_tag}"
#extra_parameter="${extra_parameter} "
exp_tag=
train_config=train_ctc.yaml
exp_tag=baseline
train_config=train_ctc_sate.yaml
#train_config=train_ctc.yaml
cmd="./run.sh
--stage 1
......
train-subset: train
valid-subset: valid
max-epoch: 20
max-epoch: 50
max-update: 1000000
num-workers: 8
......@@ -13,6 +13,7 @@ report-accuracy: True
skip-invalid-size-inputs-valid-test: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: transformer
share-decoder-input-output-embed: True
......@@ -21,22 +22,25 @@ clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 8000
lr: 5e-4
adam_betas: (0.9,0.98)
lr: 1e-3
adam_betas: (0.9,0.997)
criterion: label_smoothed_cross_entropy
label_smoothing: 0.1
dropout: 0.1
attention-dropout: 0.1
activation-dropout: 0.1
activation-fn: relu
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-dim: 256
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
encoder-attention-heads: 8
decoder-embed-dim: 256
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4
decoder-attention-heads: 8
......
train-subset: train
valid-subset: valid
max-epoch: 50
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: dlcl_transformer
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 16000
lr: 2e-3
adam_betas: (0.9,0.997)
criterion: label_smoothed_cross_entropy
label_smoothing: 0.1
dropout: 0.1
attention-dropout: 0.1
activation-dropout: 0.1
activation-fn: relu
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
......
train-subset: train
valid-subset: valid
max-epoch: 50
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: dlcl_transformer
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 16000
lr: 2e-3
adam_betas: (0.9,0.997)
criterion: label_smoothed_cross_entropy
label_smoothing: 0.1
dropout: 0.1
attention-dropout: 0.1
activation-dropout: 0.1
activation-fn: relu
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
encoder-attention-type: relative
decoder-attention-type: relative
max-relative-length: 20
\ No newline at end of file
......
train-subset: train
valid-subset: valid
max-epoch: 50
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: transformer
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 8000
lr: 1e-3
adam_betas: (0.9,0.997)
criterion: label_smoothed_cross_entropy
label_smoothing: 0.1
dropout: 0.1
attention-dropout: 0.1
activation-dropout: 0.1
activation-fn: relu
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-dim: 512
encoder-ffn-embed-dim: 2048
encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 8
decoder-embed-dim: 512
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 8
encoder-attention-type: relative
decoder-attention-type: relative
max-relative-length: 20
......
train-subset: train
valid-subset: valid
max-epoch: 50
max-update: 100000
num-workers: 8
patience: 10
no-progress-bar: True
log-interval: 100
seed: 1
report-accuracy: True
skip-invalid-size-inputs-valid-test: True
#load-pretrained-encoder-from:
#load-pretrained-decoder-from:
arch: transformer
share-decoder-input-output-embed: True
optimizer: adam
clip-norm: 10.0
lr-scheduler: inverse_sqrt
warmup-init-lr: 1e-7
warmup-updates: 8000
lr: 1e-3
adam_betas: (0.9,0.997)
criterion: label_smoothed_cross_entropy
label_smoothing: 0.1
dropout: 0.1
attention-dropout: 0.1
activation-dropout: 0.1
activation-fn: relu
encoder-normalize-before: True
decoder-normalize-before: True
encoder-embed-dim: 256
encoder-ffn-embed-dim: 2048
encoder-layers: 6
decoder-layers: 6
encoder-attention-heads: 4
decoder-embed-dim: 256
decoder-ffn-embed-dim: 2048
decoder-attention-heads: 4