Commit e1d3d2ed by xuchen

update shell scripts

parent de9ef921
...@@ -6,6 +6,7 @@ max-update: 100000 ...@@ -6,6 +6,7 @@ max-update: 100000
patience: 20 patience: 20
best_checkpoint_metric: loss best_checkpoint_metric: loss
maximize_best_checkpoint_metric: False maximize_best_checkpoint_metric: False
post-process: sentencepiece
no-epoch-checkpoints: True no-epoch-checkpoints: True
#keep-last-epochs: 10 #keep-last-epochs: 10
......
...@@ -11,6 +11,9 @@ adam_betas: (0.9,0.98) ...@@ -11,6 +11,9 @@ adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1 label_smoothing: 0.1
encoder-embed-norm: True
encoder-no-scale-embedding: True
subsampling-type: conv1d subsampling-type: conv1d
subsampling-layers: 2 subsampling-layers: 2
subsampling-filter: 2048 subsampling-filter: 2048
......
ctc-weight: 0.3 ctc-weight: 0.3
post-process: sentencepiece share-ctc-and-embed: True
\ No newline at end of file
...@@ -4,27 +4,12 @@ share-target-ctc-and-embed: True ...@@ -4,27 +4,12 @@ share-target-ctc-and-embed: True
interleaved-ctc-weight: 0.2 interleaved-ctc-weight: 0.2
interleaved-ctc-layers: 6,9 interleaved-ctc-layers: 6,9
sae-ctc-temperature: 1.0
interleaved-ctc-drop-prob: 0 interleaved-ctc-drop-prob: 0
#target-ctc-weight: 0.3
#target-ctc-layer: 6
#target-interleaved-ctc-weight: 0.2
#target-interleaved-ctc-layers: 2,4
#sae-ground-truth-ratio: 0.1
sae-adapter: inter_league sae-adapter: inter_league
sae-ctc-temperature: 1 sae-drop-prob: 0.0
#sae-gumbel: True sae-distribution-cutoff: 0
#sae-distribution-hard: True share-ctc-and-sae: False
#sae-drop-prob: 0.0
#sae-distribution-cutoff: 10
#share-sae-and-ctc: True
#share-target-sae-and-ctc: True
#sae-embed-norm: True
#sae-out-norm: True
#ctc-self-distill-weight: 1 ctc-self-distill-weight: 0
#target-ctc-self-distill-weight: 1
#ctc-self-distill-prob: 0.1
#cal-all-ctc: True
...@@ -14,7 +14,7 @@ get_devices(){ ...@@ -14,7 +14,7 @@ get_devices(){
do do
line=$((dev + 2)) line=$((dev + 2))
use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1) use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
if [[ $use -lt 100 ]]; then if [[ $use -lt 1000 ]]; then
device[$count]=$dev device[$count]=$dev
count=$((count + 1)) count=$((count + 1))
if [[ $count -eq $gpu_num ]]; then if [[ $count -eq $gpu_num ]]; then
......
#! /bin/bash #! /bin/bash
# Processing MuST-C Datasets # Processing Tibetan ASR Dataset
# Copyright 2021 Natural Language Processing Laboratory # Copyright 2021 Natural Language Processing Laboratory
# Xu Chen (xuchenneu@163.com) # Xu Chen (xuchenneu@163.com)
...@@ -30,8 +30,6 @@ pwd_dir=$PWD ...@@ -30,8 +30,6 @@ pwd_dir=$PWD
# dataset # dataset
src_lang=ti src_lang=ti
tgt_lang=de
lang=${src_lang}-${tgt_lang}
dataset=tibetan dataset=tibetan
subset=seda subset=seda
...@@ -39,10 +37,10 @@ task=speech_to_text ...@@ -39,10 +37,10 @@ task=speech_to_text
vocab_type=unigram vocab_type=unigram
vocab_type=char vocab_type=char
#vocab_type=word #vocab_type=word
vocab_size=1700 vocab_size=5000
speed_perturb=0 speed_perturb=0
lcrm=0 lcrm=0
tokenizer=1 tokenizer=0
use_raw_audio=0 use_raw_audio=0
use_specific_dict=0 use_specific_dict=0
...@@ -54,9 +52,9 @@ if [[ -n ${subset} ]]; then ...@@ -54,9 +52,9 @@ if [[ -n ${subset} ]]; then
dataset=${dataset}/${subset} dataset=${dataset}/${subset}
fi fi
org_data_dir=${root_dir}/data/${dataset} org_data_dir=${root_dir}/data/${dataset}
data_dir=${root_dir}/data/${dataset}/asr_char #data_dir=${root_dir}/data/${dataset}/asr_char
data_dir=${root_dir}/data/${dataset}/asr_word #data_dir=${root_dir}/data/${dataset}/asr_word
#data_dir=${root_dir}/data/${dataset}/asr data_dir=${root_dir}/data/${dataset}/asr
train_split=train train_split=train
valid_split=dev valid_split=dev
test_split=test test_split=test
...@@ -105,6 +103,10 @@ if [[ ${use_raw_audio} -eq 1 ]]; then ...@@ -105,6 +103,10 @@ if [[ ${use_raw_audio} -eq 1 ]]; then
data_dir=${data_dir}_raw data_dir=${data_dir}_raw
exp_prefix=${exp_prefix}_raw exp_prefix=${exp_prefix}_raw
fi fi
if [[ "${vocab_type}" == "char" ]]; then
data_dir=${data_dir}_char
exp_prefix=${exp_prefix}_char
fi
. ./local/parse_options.sh || exit 1; . ./local/parse_options.sh || exit 1;
...@@ -268,10 +270,6 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ...@@ -268,10 +270,6 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
cmd="${cmd} cmd="${cmd}
--save-interval $save_interval " --save-interval $save_interval "
fi fi
if [[ -n $keep_last_epochs ]]; then
cmd="${cmd}
--keep-last-epochs $keep_last_epochs "
fi
if [[ -n $save_interval_updates ]]; then if [[ -n $save_interval_updates ]]; then
cmd="${cmd} cmd="${cmd}
--save-interval-updates $save_interval_updates" --save-interval-updates $save_interval_updates"
...@@ -290,11 +288,12 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ...@@ -290,11 +288,12 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
mv tmp.log $log mv tmp.log $log
export CUDA_VISIBLE_DEVICES=${device} export CUDA_VISIBLE_DEVICES=${device}
cmd="nohup ${cmd} >> ${model_dir}/train.log 2>&1 &" log=${model_dir}/train.log
cmd="nohup ${cmd} >> ${log} 2>&1 &"
if [[ $eval -eq 1 ]]; then if [[ $eval -eq 1 ]]; then
eval $cmd eval $cmd
sleep 2s sleep 2s
tail -n "$(wc -l ${model_dir}/train.log | awk '{print $1+1}')" -f ${model_dir}/train.log tail -n "$(wc -l ${log} | awk '{print $1+1}')" -f ${log}
fi fi
fi fi
wait wait
...@@ -327,7 +326,16 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then ...@@ -327,7 +326,16 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
fi fi
export CUDA_VISIBLE_DEVICES=${device} export CUDA_VISIBLE_DEVICES=${device}
result_file=${model_dir}/decode_result suffix=beam${beam_size}_alpha${len_penalty}_tokens${max_tokens}
if [[ -z ${cer} && ${cer} -eq 1 ]]; then
suffix=${suffix}_cer
else
suffix=${suffix}_wer
fi
if [[ ${n_average} -ne 1 ]]; then
suffix=${suffix}_${n_average}
fi
result_file=${model_dir}/decode_result_${suffix}
[[ -f ${result_file} ]] && rm ${result_file} [[ -f ${result_file} ]] && rm ${result_file}
test_subset=${test_subset//,/ } test_subset=${test_subset//,/ }
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论