Commit e1d3d2ed by xuchen

update shell scripts

parent de9ef921
......@@ -6,6 +6,7 @@ max-update: 100000
patience: 20
best_checkpoint_metric: loss
maximize_best_checkpoint_metric: False
post-process: sentencepiece
no-epoch-checkpoints: True
#keep-last-epochs: 10
......
......@@ -11,6 +11,9 @@ adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
encoder-embed-norm: True
encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
......
ctc-weight: 0.3
post-process: sentencepiece
share-ctc-and-embed: True
\ No newline at end of file
......@@ -4,27 +4,12 @@ share-target-ctc-and-embed: True
interleaved-ctc-weight: 0.2
interleaved-ctc-layers: 6,9
sae-ctc-temperature: 1.0
interleaved-ctc-drop-prob: 0
#target-ctc-weight: 0.3
#target-ctc-layer: 6
#target-interleaved-ctc-weight: 0.2
#target-interleaved-ctc-layers: 2,4
#sae-ground-truth-ratio: 0.1
sae-adapter: inter_league
sae-ctc-temperature: 1
#sae-gumbel: True
#sae-distribution-hard: True
#sae-drop-prob: 0.0
#sae-distribution-cutoff: 10
#share-sae-and-ctc: True
#share-target-sae-and-ctc: True
#sae-embed-norm: True
#sae-out-norm: True
sae-drop-prob: 0.0
sae-distribution-cutoff: 0
share-ctc-and-sae: False
#ctc-self-distill-weight: 1
#target-ctc-self-distill-weight: 1
#ctc-self-distill-prob: 0.1
#cal-all-ctc: True
ctc-self-distill-weight: 0
......@@ -14,7 +14,7 @@ get_devices(){
do
line=$((dev + 2))
use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
if [[ $use -lt 100 ]]; then
if [[ $use -lt 1000 ]]; then
device[$count]=$dev
count=$((count + 1))
if [[ $count -eq $gpu_num ]]; then
......
#! /bin/bash
# Processing MuST-C Datasets
# Processing Tibetan ASR Dataset
# Copyright 2021 Natural Language Processing Laboratory
# Xu Chen (xuchenneu@163.com)
......@@ -30,8 +30,6 @@ pwd_dir=$PWD
# dataset
src_lang=ti
tgt_lang=de
lang=${src_lang}-${tgt_lang}
dataset=tibetan
subset=seda
......@@ -39,10 +37,10 @@ task=speech_to_text
vocab_type=unigram
vocab_type=char
#vocab_type=word
vocab_size=1700
vocab_size=5000
speed_perturb=0
lcrm=0
tokenizer=1
tokenizer=0
use_raw_audio=0
use_specific_dict=0
......@@ -54,9 +52,9 @@ if [[ -n ${subset} ]]; then
dataset=${dataset}/${subset}
fi
org_data_dir=${root_dir}/data/${dataset}
data_dir=${root_dir}/data/${dataset}/asr_char
data_dir=${root_dir}/data/${dataset}/asr_word
#data_dir=${root_dir}/data/${dataset}/asr
#data_dir=${root_dir}/data/${dataset}/asr_char
#data_dir=${root_dir}/data/${dataset}/asr_word
data_dir=${root_dir}/data/${dataset}/asr
train_split=train
valid_split=dev
test_split=test
......@@ -105,6 +103,10 @@ if [[ ${use_raw_audio} -eq 1 ]]; then
data_dir=${data_dir}_raw
exp_prefix=${exp_prefix}_raw
fi
# Character-level vocabularies get their own data directory and
# experiment prefix so they do not collide with subword runs.
case "${vocab_type}" in
    char)
        data_dir=${data_dir}_char
        exp_prefix=${exp_prefix}_char
        ;;
esac
. ./local/parse_options.sh || exit 1;
......@@ -268,10 +270,6 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
cmd="${cmd}
--save-interval $save_interval "
fi
if [[ -n $keep_last_epochs ]]; then
cmd="${cmd}
--keep-last-epochs $keep_last_epochs "
fi
if [[ -n $save_interval_updates ]]; then
cmd="${cmd}
--save-interval-updates $save_interval_updates"
......@@ -290,11 +288,12 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
mv tmp.log $log
export CUDA_VISIBLE_DEVICES=${device}
cmd="nohup ${cmd} >> ${model_dir}/train.log 2>&1 &"
log=${model_dir}/train.log
cmd="nohup ${cmd} >> ${log} 2>&1 &"
if [[ $eval -eq 1 ]]; then
eval $cmd
sleep 2s
tail -n "$(wc -l ${model_dir}/train.log | awk '{print $1+1}')" -f ${model_dir}/train.log
tail -n "$(wc -l ${log} | awk '{print $1+1}')" -f ${log}
fi
fi
wait
......@@ -327,7 +326,16 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
fi
export CUDA_VISIBLE_DEVICES=${device}
result_file=${model_dir}/decode_result
suffix=beam${beam_size}_alpha${len_penalty}_tokens${max_tokens}
# Choose the scoring-metric tag of the decode-result suffix.
# BUG FIX: the original condition was `[[ -z ${cer} && ${cer} -eq 1 ]]`,
# which can never be true — `-z` holds only when ${cer} is empty/unset,
# and an empty value compares as 0, so `${cer} -eq 1` then fails and the
# suffix was always `_wer`. The intended check is "cer is set AND equals 1".
if [[ -n ${cer} && ${cer} -eq 1 ]]; then
    suffix=${suffix}_cer
else
    suffix=${suffix}_wer
fi
# Tag the suffix with the checkpoint-averaging count whenever more than
# a single checkpoint is averaged (n_average != 1).
if (( n_average != 1 )); then
    suffix=${suffix}_${n_average}
fi
result_file=${model_dir}/decode_result_${suffix}
[[ -f ${result_file} ]] && rm ${result_file}
test_subset=${test_subset//,/ }
......
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论