xuchen / Fairseq-S2T / Commits

Commit e1d3d2ed, authored Jul 25, 2022 by xuchen

    update shell scripts

Parent: de9ef921
Showing 6 changed files with 35 additions and 37 deletions (+35, -37).
egs/tibetan/asr/conf/basis.yaml    +1   -0
egs/tibetan/asr/conf/big.yaml      +3   -0
egs/tibetan/asr/conf/ctc.yaml      +2   -1
egs/tibetan/asr/conf/inter.yaml    +5   -20
egs/tibetan/asr/local/utils.sh     +1   -1
egs/tibetan/asr/run.sh             +23  -15
egs/tibetan/asr/conf/basis.yaml
@@ -6,6 +6,7 @@ max-update: 100000
patience: 20
best_checkpoint_metric: loss
maximize_best_checkpoint_metric: False
post-process: sentencepiece
no-epoch-checkpoints: True
#keep-last-epochs: 10
egs/tibetan/asr/conf/big.yaml
@@ -11,6 +11,9 @@ adam_betas: (0.9,0.98)
criterion: label_smoothed_cross_entropy_with_ctc
label_smoothing: 0.1
encoder-embed-norm: True
encoder-no-scale-embedding: True
subsampling-type: conv1d
subsampling-layers: 2
subsampling-filter: 2048
egs/tibetan/asr/conf/ctc.yaml
ctc-weight: 0.3
post-process: sentencepiece
share-ctc-and-embed: True
\ No newline at end of file
egs/tibetan/asr/conf/inter.yaml
@@ -4,27 +4,12 @@ share-target-ctc-and-embed: True
interleaved-ctc-weight: 0.2
interleaved-ctc-layers: 6,9
sae-ctc-temperature: 1.0
interleaved-ctc-drop-prob: 0
#target-ctc-weight: 0.3
#target-ctc-layer: 6
#target-interleaved-ctc-weight: 0.2
#target-interleaved-ctc-layers: 2,4
#sae-ground-truth-ratio: 0.1
sae-adapter: inter_league
sae-ctc-temperature: 1
#sae-gumbel: True
#sae-distribution-hard: True
#sae-drop-prob: 0.0
#sae-distribution-cutoff: 10
#share-sae-and-ctc: True
#share-target-sae-and-ctc: True
#sae-embed-norm: True
#sae-out-norm: True
sae-drop-prob: 0.0
sae-distribution-cutoff: 0
share-ctc-and-sae: False
#ctc-self-distill-weight: 1
#target-ctc-self-distill-weight: 1
#ctc-self-distill-prob: 0.1
#cal-all-ctc: True
ctc-self-distill-weight: 0
egs/tibetan/asr/local/utils.sh
@@ -14,7 +14,7 @@ get_devices(){
        do
            line=$((dev + 2))
            use=$(head -n $line ${record} | tail -1 | cut -d '|' -f3 | cut -d '/' -f1)
            if [[ $use -lt 100 ]]; then
            if [[ $use -lt 1000 ]]; then
                device[$count]=$dev
                count=$((count + 1))
                if [[ $count -eq $gpu_num ]]; then
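The get_devices() hunk above only touches the memory-usage bound the loop compares against (100 vs. 1000). For context, here is a minimal standalone sketch, not taken from the repository, of the same idea: pick GPUs whose reported memory usage is below a threshold. It assumes nvidia-smi is on PATH; the function name pick_free_gpus, the 1000 MiB threshold, and the exact query flags are illustrative choices.

    # Sketch only: select up to $1 GPUs with used memory below a threshold.
    pick_free_gpus() {
        local gpu_num=$1
        local threshold=1000      # MiB; assumed, mirrors the bound in utils.sh
        local count=0
        local dev=0
        local devices=()
        while read -r used; do
            if [[ ${used} -lt ${threshold} ]]; then
                devices[count]=${dev}
                count=$((count + 1))
                [[ ${count} -eq ${gpu_num} ]] && break
            fi
            dev=$((dev + 1))
        done < <(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits)
        # Print a comma-separated list suitable for CUDA_VISIBLE_DEVICES.
        (IFS=,; echo "${devices[*]}")
    }

    # Hypothetical usage: export CUDA_VISIBLE_DEVICES=$(pick_free_gpus 2)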
egs/tibetan/asr/run.sh
#! /bin/bash

# Processing MuST-C Datasets
# Processing Tibetan ASR Dataset
# Copyright 2021 Natural Language Processing Laboratory
# Xu Chen (xuchenneu@163.com)
@@ -30,8 +30,6 @@ pwd_dir=$PWD
# dataset
src_lang=ti
tgt_lang=de
lang=${src_lang}-${tgt_lang}
dataset=tibetan
subset=seda
@@ -39,10 +37,10 @@ task=speech_to_text
vocab_type=unigram
vocab_type=char
#vocab_type=word
vocab_size=1700
vocab_size=5000
speed_perturb=0
lcrm=0
tokenizer=1
tokenizer=0
use_raw_audio=0
use_specific_dict=0
@@ -54,9 +52,9 @@ if [[ -n ${subset} ]]; then
    dataset=${dataset}/${subset}
fi
org_data_dir=${root_dir}/data/${dataset}
data_dir=${root_dir}/data/${dataset}/asr_char
data_dir=${root_dir}/data/${dataset}/asr_word
# data_dir=${root_dir}/data/${dataset}/asr
# data_dir=${root_dir}/data/${dataset}/asr_char
# data_dir=${root_dir}/data/${dataset}/asr_word
data_dir=${root_dir}/data/${dataset}/asr
train_split=train
valid_split=dev
test_split=test
@@ -105,6 +103,10 @@ if [[ ${use_raw_audio} -eq 1 ]]; then
    data_dir=${data_dir}_raw
    exp_prefix=${exp_prefix}_raw
fi
if [[ "${vocab_type}" == "char" ]]; then
    data_dir=${data_dir}_char
    exp_prefix=${exp_prefix}_char
fi

. ./local/parse_options.sh || exit 1;
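For context, a small illustration, with assumed values that are not from the repository, of how the block above rewrites the paths when vocab_type is set to char:

    # Illustration only; root_dir and exp_prefix values are hypothetical.
    root_dir=/path/to/Fairseq-S2T
    dataset=tibetan/seda
    data_dir=${root_dir}/data/${dataset}/asr
    exp_prefix=baseline
    vocab_type=char
    use_raw_audio=0

    if [[ ${use_raw_audio} -eq 1 ]]; then
        data_dir=${data_dir}_raw
        exp_prefix=${exp_prefix}_raw
    fi
    if [[ "${vocab_type}" == "char" ]]; then
        data_dir=${data_dir}_char
        exp_prefix=${exp_prefix}_char
    fi
    echo ${data_dir}    # /path/to/Fairseq-S2T/data/tibetan/seda/asr_char
    echo ${exp_prefix}  # baseline_char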
@@ -268,10 +270,6 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        cmd="${cmd} --save-interval $save_interval"
    fi
    if [[ -n $keep_last_epochs ]]; then
        cmd="${cmd} --keep-last-epochs $keep_last_epochs"
    fi
    if [[ -n $save_interval_updates ]]; then
        cmd="${cmd} --save-interval-updates $save_interval_updates"
@@ -290,11 +288,12 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
            mv tmp.log $log
        export CUDA_VISIBLE_DEVICES=${device}
        cmd="nohup ${cmd} >> ${model_dir}/train.log 2>&1 &"
        log=${model_dir}/train.log
        cmd="nohup ${cmd} >> ${log} 2>&1 &"
        if [[ $eval -eq 1 ]]; then
            eval $cmd
            sleep 2s
            tail -n "$(wc -l ${model_dir}/train.log | awk '{print $1+1}')" -f ${model_dir}/train.log
            tail -n "$(wc -l ${log} | awk '{print $1+1}')" -f ${log}
        fi
    fi
    wait
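The hunk above mainly factors the training log path into a log variable before the command is wrapped with nohup. A runnable sketch of the same launch-and-follow pattern, with a placeholder command standing in for the assembled fairseq-train invocation:

    # Sketch only; "sleep 30" stands in for the real training command string.
    model_dir=./launch_demo              # hypothetical directory
    mkdir -p ${model_dir}
    log=${model_dir}/train.log
    touch ${log}

    cmd="sleep 30"
    cmd="nohup ${cmd} >> ${log} 2>&1 &"
    eval $cmd
    sleep 2s
    # wc -l prints "N filename"; tail -n $((N+1)) therefore prints the existing
    # log in full, and -f keeps following as the background job appends to it.
    tail -n "$(wc -l ${log} | awk '{print $1+1}')" -f ${log}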
@@ -327,7 +326,16 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    fi
    export CUDA_VISIBLE_DEVICES=${device}
    result_file=${model_dir}/decode_result
    suffix=beam${beam_size}_alpha${len_penalty}_tokens${max_tokens}
    if [[ -z ${cer} && ${cer} -eq 1 ]]; then
        suffix=${suffix}_cer
    else
        suffix=${suffix}_wer
    fi
    if [[ ${n_average} -ne 1 ]]; then
        suffix=${suffix}_${n_average}
    fi
    result_file=${model_dir}/decode_result_${suffix}
    [[ -f ${result_file} ]] && rm ${result_file}

    test_subset=${test_subset//,/ }
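With this change the decode settings are encoded in the result file name. A short illustration under assumed parameter values (all values below are hypothetical); note that the sketch tests -n where the committed script tests -z:

    # Illustration only; parameter values are assumptions.
    model_dir=checkpoints/tibetan/asr/example
    beam_size=5
    len_penalty=1.0
    max_tokens=20000
    cer=0
    n_average=5

    suffix=beam${beam_size}_alpha${len_penalty}_tokens${max_tokens}
    if [[ -n ${cer} && ${cer} -eq 1 ]]; then
        suffix=${suffix}_cer
    else
        suffix=${suffix}_wer
    fi
    if [[ ${n_average} -ne 1 ]]; then
        suffix=${suffix}_${n_average}
    fi
    result_file=${model_dir}/decode_result_${suffix}
    echo ${result_file}
    # checkpoints/tibetan/asr/example/decode_result_beam5_alpha1.0_tokens20000_wer_5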