Modify the LibriSpeech preprocessing

eff10263 · xuchen · 02ec8720 · eff10263 · eff10263 · eff10263
Commit eff10263 authored Apr 04, 2021 by xuchen
--- a/egs/librispeech/asr/conf/train.yaml
+++ b/egs/librispeech/asr/conf/train.yaml
+train-subset: train-clean-100,train-clean-360,train-other-500
+#train-subset: train-clean-100
+valid-subset: dev-clean
+
+max-epoch: 100
+max-update: 300000
+
+num-workers: 8
+patience: 10
+no-progress-bar: True
+log-interval: 100
+seed: 1
+report-accuracy: True
+
+arch: s2t_conformer_s
+share-decoder-input-output-embed: True
+optimizer: adam
+clip-norm: 10.0
+lr-scheduler: inverse_sqrt
+warmup-init-lr: 1e-7
+warmup-updates: 10000
+lr: 2e-3
+#adam_betas: (0.9,0.98)
+
+criterion: label_smoothed_cross_entropy
+label_smoothing: 0.1
+
+conv-kernel-sizes: 5,5
+conv-channels: 1024
+dropout: 0.1
+activation-fn: relu
+encoder-embed-dim: 256
+encoder-ffn-embed-dim: 2048
+encoder-layers: 12
+decoder-layers: 6
+encoder-attention-heads: 4
+
+#decoder-embed-dim: 256
+#decoder-ffn-embed-dim: 2048
+#decoder-attention-heads: 4
+#attention-dropout: 0.1
+#activation-dropout: 0.1
--- a/egs/librispeech/asr/run.sh
+++ b/egs/librispeech/asr/run.sh
@@ -35,8 +35,9 @@ dataset=librispeech
 task=speech_to_text
 vocab_type=unigram
 vocab_size=10000
+speed_perturb=0

-org_data_dir=/meida/data/${dataset}
+org_data_dir=/media/data/${dataset}
 data_dir=~/st/data/${dataset}
 test_subset=(dev-clean dev-other test-clean test-other)

@@ -79,8 +80,14 @@ if [[ -z ${exp_name} ]]; then
    if [[ -n ${extra_tag} ]]; then
        exp_name=${exp_name}_${extra_tag}
    fi
+    if [[ ${speed_perturb} -eq 1 ]]; then
+        exp_name=sp_${exp_name}
+    fi
 fi

+if [[ ${speed_perturb} -eq 1 ]]; then
+    data_dir=${data_dir}_sp
+fi
 model_dir=$root_dir/../checkpoints/$dataset/asr/${exp_name}

 if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
@@ -92,18 +99,28 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    ### Task dependent. You have to make data the following preparation part by yourself.
    ### But you can utilize Kaldi recipes in most cases
    echo "stage 0: Data Preparation"
+
+    if [[ ! -e ${data_dir} ]]; then
+        mkdir -p ${data_dir}
+    fi
+    source ~/tools/audio/bin/activate
+
    cmd="python ${root_dir}/examples/speech_to_text/prep_librispeech_data.py
        --data-root ${org_data_dir}
        --output-root ${data_dir}
        --vocab-type ${vocab_type}
        --vocab-size ${vocab_size}"
+    if [[ ${speed_perturb} -eq 1 ]]; then
+        cmd="$cmd
+        --speed-perturb"
+    fi
    echo -e "\033[34mRun command: \n${cmd} \033[0m"
    [[ $eval -eq 1 ]] && eval $cmd
 fi

 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    echo "stage 1: ASR Network Training"
-    [[ ! -d ${data_dir} ]] && echo "The data dir $data_dir is not existing!" && exit 1;
+    [[ ! -d ${data_dir} ]] && echo "The data dir ${data_dir} is not existing!" && exit 1;

    if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
 		if [[ ${gpu_num} -eq 0 ]]; then

--- a/egs/mustc/asr/run.sh
+++ b/egs/mustc/asr/run.sh
@@ -123,7 +123,7 @@ data_dir=${data_dir}/${lang}

 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    echo "stage 1: ASR Network Training"
-    [[ ! -d $data_dir ]] && echo "The data dir $data_dir is not existing!" && exit 1;
+    [[ ! -d ${data_dir} ]] && echo "The data dir ${data_dir} is not existing!" && exit 1;

    if [[ -z ${device} || ${#device[@]} -eq 0 ]]; then
 		if [[ ${gpu_num} -eq 0 ]]; then

--- a/examples/speech_to_text/prep_librispeech_data.py
+++ b/examples/speech_to_text/prep_librispeech_data.py
@@ -39,6 +39,7 @@ MANIFEST_COLUMNS = ["id", "audio", "n_frames", "tgt_text", "speaker"]


 def process(args):
+    data_root = Path(args.data_root).absolute()
    out_root = Path(args.output_root).absolute()
    out_root.mkdir(exist_ok=True)
    # Extract features
@@ -48,7 +49,7 @@ def process(args):
    if args.overwrite or not Path.exists(zip_path):
        for split in SPLITS:
            print(f"Fetching split {split}...")
-            dataset = LIBRISPEECH(out_root.as_posix(), url=split, download=True)
+            dataset = LIBRISPEECH(data_root.as_posix(), url=split, download=True)
            print("Extracting log mel filter bank features...")
            for wav, sample_rate, _, spk_id, chapter_no, utt_no in tqdm(dataset):
                sample_id = f"{spk_id}-{chapter_no}-{utt_no}"
@@ -96,7 +97,7 @@ def process(args):
            print("Loading the training text...")
            for split in SPLITS:
                if split.startswith("train"):
-                    dataset = LIBRISPEECH(out_root.as_posix(), url=split)
+                    dataset = LIBRISPEECH(data_root.as_posix(), url=split)
                    for wav, sample_rate, utt, spk_id, chapter_no, utt_no in dataset:
                        train_text.append(utt.lower())
        for t in train_text:
@@ -119,6 +120,7 @@ def process(args):

 def main():
    parser = argparse.ArgumentParser()
+    parser.add_argument("--data-root", "-d", required=True, type=str)
    parser.add_argument("--output-root", "-o", required=True, type=str)
    parser.add_argument(
        "--vocab-type",