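Fix bugs in audio data preparation: treat an offset of 0 as a valid offset, log a warning when fbank feature extraction raises AssertionError, and skip utterances that are missing from the zip manifest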

21734086 · xuchen · e1d3d2ed · 21734086
Commit 21734086 authored Jul 26, 2022 by xuchen
--- a/examples/speech_to_text/prep_audio_data.py
+++ b/examples/speech_to_text/prep_audio_data.py
@@ -185,7 +185,7 @@ class AudioDataset(Dataset):

        if need_waveform:
            offset = item.get('offset', False)
-            if offset:
+            if offset is not False:
                waveform, sample_rate = torchaudio.load(audio,
                                                        frame_offset=offset,
                                                        num_frames=item["n_frames"])
@@ -272,7 +272,11 @@ def process(args):
                waveform, sample_rate, _ = dataset.get(idx, need_waveform=True)
                if waveform.shape[1] == 0:
                    continue
-                features = extract_fbank_features(waveform, sample_rate, Path(features_path))
+
+                try:
+                    features = extract_fbank_features(waveform, sample_rate, Path(features_path))
+                except AssertionError:
+                    logger.warning("Extract file %s failed." % utt_id)

                if split == 'train' and args.cmvn_type == "global" and not utt_id.startswith("sp"):
                    if len(gcmvn_feature_list) < args.gcmvn_max_num:
@@ -326,16 +330,21 @@ def process(args):
                _, sample_rate, n_frames = dataset.get(idx, need_waveform=False)
                utt_id = item["id"]

-                manifest["id"].append(utt_id)
                if use_raw:
                    audio_path = item["audio"]

                    # add offset and frames info
-                    if item.get("offset", False):
+                    if item.get("offset", False) is not False:
                        audio_path = f"{audio_path}:{item['offset']}:{n_frames}"
                    manifest["audio"].append(audio_path)
                else:
-                    manifest["audio"].append(zip_manifest[utt_id])
+                    if utt_id in zip_manifest:
+                        manifest["audio"].append(zip_manifest[utt_id])
+                    else:
+                        logger.warning("%s is not in the zip" % utt_id)
+                        continue
+
+                manifest["id"].append(utt_id)
                duration_ms = int(n_frames / sample_rate * 1000)
                manifest["n_frames"].append(int(1 + (duration_ms - 25) / 10))