Skip to content

Commit 7c19693

Browse files
authored
Add oto_speech dataset recipe (#1552)
* Add HuggingFace audio and GDrive pseudo-label downloads
* Add tar extraction caching and lazy 16kHz resampling
* Add data validation to drop 0-duration segments and word alignments
* Register `oto_speech` commands in Lhotse CLI
* Add `prepare_oto_speech.sh` script for end-to-end cutset generation
1 parent 393b908 commit 7c19693

File tree

5 files changed

+322
-0
lines changed

5 files changed

+322
-0
lines changed

docs/corpus.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,8 @@ a CLI tool that create the manifests given a corpus directory.
179179
- :func:`lhotse.recipes.prepare_notsofar1`
180180
* - National Speech Corpus (Singaporean English)
181181
- :func:`lhotse.recipes.prepare_nsc`
182+
* - otoSpeech
183+
- :func:`lhotse.recipes.prepare_oto_speech`
182184
* - People's Speech
183185
- :func:`lhotse.recipes.prepare_peoples_speech`
184186
* - ReazonSpeech

lhotse/bin/modes/recipes/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@
7070
from .must_c import *
7171
from .notsofar1 import *
7272
from .nsc import *
73+
from .oto_speech import *
7374
from .peoples_speech import *
7475
from .primewords import *
7576
from .radio import *
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
import click
2+
3+
from lhotse.bin.modes import download, prepare
4+
from lhotse.recipes.oto_speech import download_oto_speech, prepare_oto_speech
5+
from lhotse.utils import Pathlike
6+
7+
__all__ = ["oto_speech"]
8+
9+
10+
@prepare.command(context_settings={"show_default": True})
@click.argument("corpus_dir", type=click.Path(exists=True, dir_okay=True))
@click.argument("output_dir", type=click.Path())
@click.option(
    "-p", "--dataset-parts", type=str, multiple=True,
    default=("train",), help="Dataset parts to prepare.",
)
@click.option(
    "--target-sr", type=int, default=16000,
    help="Target sampling rate for lazy resampling.",
)
def oto_speech(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
    dataset_parts: tuple,
    target_sr: int,
):
    """otoSpeech data preparation."""
    # Thin CLI shim: click has already validated the paths, so we just
    # forward everything to the recipe-level preparation function.
    prepare_oto_speech(
        corpus_dir,
        output_dir,
        parts=dataset_parts,
        target_sr=target_sr,
    )
40+
41+
42+
@download.command(context_settings={"show_default": True})
@click.argument("target_dir", type=click.Path())
@click.option(
    "--force-download", is_flag=True, default=False,
    help="Force download of audio and pseudo-labels.",
)
@click.option(
    "-p", "--dataset-parts", type=str, multiple=True, default=("train",),
    help="Dataset parts to download (otoSpeech standard release only provides 'train').",
)
@click.option(
    "--version", type=str, default="full-duplex-processed-141h",
    help="Dataset version suffix on HuggingFace.",
)
def oto_speech(
    target_dir: Pathlike,
    force_download: bool,
    dataset_parts: tuple,
    version: str,
):
    """otoSpeech dataset download."""
    # Thin CLI shim: hand all options straight to the recipe-level helper.
    download_oto_speech(
        target_dir,
        parts=dataset_parts,
        version=version,
        force_download=force_download,
    )

lhotse/recipes/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@
7070
from .mtedx import download_mtedx, prepare_mtedx
7171
from .musan import download_musan, prepare_musan
7272
from .nsc import prepare_nsc
73+
from .oto_speech import download_oto_speech, prepare_oto_speech
7374
from .peoples_speech import prepare_peoples_speech
7475
from .radio import prepare_radio
7576
from .reazonspeech import download_reazonspeech, prepare_reazonspeech

lhotse/recipes/oto_speech.py

Lines changed: 242 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,242 @@
1+
"""
2+
otoSpeech Dataset Preparation Recipe for Lhotse
3+
4+
Dataset Details:
5+
- URL: https://huggingface.co/datasets/otoearth/otoSpeech-full-duplex-processed-141h
6+
- Content: Full-duplex, spontaneous multi-speaker conversations.
7+
- Purpose: Designed for training and benchmarking S2S (speech-to-speech) or dialogue models.
8+
- Splits: This dataset provides ONLY the `train` split.
9+
10+
Pseudo Labels:
11+
- The `seglst.json` labels downloaded from Google Drive are pseudo labels generated
12+
using the Parakeet v3 model.
13+
"""
14+
15+
import json
16+
import logging
17+
import os
18+
import tarfile
19+
from collections import defaultdict
20+
from pathlib import Path
21+
from typing import Dict, Tuple, Union
22+
23+
from tqdm import tqdm
24+
25+
from lhotse import (
26+
Recording,
27+
RecordingSet,
28+
SupervisionSegment,
29+
SupervisionSet,
30+
fix_manifests,
31+
)
32+
from lhotse.supervision import AlignmentItem
33+
from lhotse.utils import Pathlike
34+
35+
# Set up the logger
36+
logger = logging.getLogger(__name__)
37+
38+
39+
def download_oto_speech(
    target_dir: Pathlike = ".",
    parts: Tuple[str, ...] = ("train",),
    version: str = "full-duplex-processed-141h",
    force_download: bool = False,
) -> Path:
    """
    Downloads the otoSpeech audio dataset from HuggingFace and pseudo labels from Google Drive.

    Requires the ``HF_TOKEN`` environment variable to hold a valid HuggingFace
    access token, and the optional dependencies ``huggingface_hub`` and ``gdown``.

    Args:
        target_dir: Path to the directory where the dataset will be stored.
        parts: Which splits to download (Note: only "train" is officially provided).
        version: The dataset version suffix.
        force_download: Whether to force re-download from HuggingFace and GDrive.

    Returns:
        The path to the target directory.

    Raises:
        RuntimeError: If ``huggingface_hub`` or ``gdown`` is not installed, or
            if ``HF_TOKEN`` is not set.
    """
    # Optional dependencies are imported lazily so this module can be imported
    # (e.g. by the CLI registry) without them installed.
    try:
        from huggingface_hub import snapshot_download
    except ImportError as import_error:
        raise RuntimeError("Install via: pip install huggingface_hub") from import_error

    try:
        import gdown
    except ImportError as e:
        raise RuntimeError("Install via: pip install gdown") from e

    hugging_face_token = os.getenv("HF_TOKEN")
    if not hugging_face_token:
        raise RuntimeError("HF_TOKEN environment variable not found.")

    target_dir = Path(target_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # 1. Download HuggingFace Dataset
    for part in parts:
        if part != "train":
            # Non-"train" parts are allowed through (a future release might add
            # splits), but we warn since the request may simply match nothing.
            logger.warning(
                f"Dataset only provides a 'train' split. Downloading '{part}' may fail."
            )

        logger.info(f"Downloading dataset shard for: {part}")
        snapshot_download(
            repo_id=f"otoearth/otoSpeech-{version}",
            repo_type="dataset",
            local_dir=target_dir,
            force_download=force_download,
            # Restrict the snapshot to this part's shards only.
            allow_patterns=[f"data/{part}/*"],
            token=hugging_face_token,
        )

    # 2. Download Pseudo Labels from Google Drive
    labels_path = target_dir / "seglst.json"
    if not labels_path.exists() or force_download:
        logger.info(
            "Downloading Parakeet v3 pseudo labels (seglst.json) from Google Drive..."
        )
        url = "https://drive.google.com/file/d/16htmj5O14D51C-EjOUMF_cXOxo6vruui/view?usp=sharing"
        # fuzzy=True lets gdown resolve the file id from a "view" sharing URL.
        gdown.download(url, str(labels_path), quiet=False, fuzzy=True)
    else:
        logger.info(
            "Parakeet v3 pseudo labels (seglst.json) already exist. Skipping download."
        )

    return target_dir
105+
106+
107+
def extract_and_flatten_tar(tar_path: Path, extract_dir: Path):
    """
    Extract a tar shard into ``extract_dir``, flatten its contents, and cache the result.

    A ``<tar-name>.done`` marker file inside ``extract_dir`` records that a shard
    was already unpacked, so repeated calls are cheap no-ops.

    Args:
        tar_path: Path to the ``.tar`` shard to extract.
        extract_dir: Directory where the (flattened) files end up.

    Raises:
        RuntimeError: If a tar member would be written outside ``extract_dir``
            (path traversal, a.k.a. "tar-slip").
    """
    marker_file = extract_dir / f"{tar_path.name}.done"

    # Cache check: if the marker exists, we already unpacked this shard.
    if marker_file.exists():
        return

    base = extract_dir.resolve()
    with tarfile.open(tar_path) as tar:
        # Guard against path traversal: reject any member whose resolved
        # destination escapes the extraction directory (e.g. "../x" or "/etc/x").
        for member in tar.getmembers():
            destination = (base / member.name).resolve()
            try:
                destination.relative_to(base)
            except ValueError:
                raise RuntimeError(
                    f"Refusing to extract {member.name!r} from {tar_path}: "
                    "member path escapes the extraction directory."
                )
        tar.extractall(path=extract_dir)

    # Flatten structure and ignore marker files; on a name clash the file
    # already present in extract_dir wins and the nested copy is left in place.
    for p in extract_dir.rglob("*"):
        if p.is_file() and p.parent != extract_dir and p.suffix != ".done":
            target_path = extract_dir / p.name
            if not target_path.exists():
                p.rename(target_path)

    # Create the marker file to register this tar as "done".
    marker_file.touch()
127+
128+
129+
def prepare_oto_speech(
    corpus_dir: Pathlike,
    output_dir: Pathlike,
    parts: Tuple[str, ...] = ("train",),
    target_sr: int = 16000,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """Prepares the dataset, utilizing Lhotse's lazy resampling and extraction caching.

    Args:
        corpus_dir: Root of the downloaded corpus; must contain ``data/<part>/``
            tar shards and the ``seglst.json`` pseudo-label file produced by
            :func:`download_oto_speech`.
        output_dir: Directory where the ``.jsonl.gz`` manifests are written.
        parts: Dataset splits to prepare (the standard release only has "train").
        target_sr: Sampling rate the recordings are lazily resampled to.

    Returns:
        A dict mapping each part name to
        ``{"recordings": RecordingSet, "supervisions": SupervisionSet}``.

    Raises:
        FileNotFoundError: If ``seglst.json`` is missing from ``corpus_dir``.
    """
    corpus_dir = Path(corpus_dir)
    data_dir = corpus_dir / "data"
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    labels_path = corpus_dir / "seglst.json"
    if not labels_path.exists():
        raise FileNotFoundError(
            f"Labels not found at {labels_path}. Please run download_oto_speech() first."
        )

    # Labels are loaded once and reused across all parts; segments are matched
    # to a part implicitly by whether their recording exists in that part.
    with open(labels_path, "r", encoding="utf-8") as f:
        logger.info(f"Loading Parakeet v3 pseudo metadata from {labels_path}...")
        label_data = json.load(f)

    manifests = defaultdict(dict)

    for part in parts:
        if part != "train":
            logger.warning(
                f"Preparing split '{part}', but standard otoSpeech only guarantees 'train'."
            )

        part_dir = data_dir / part
        unpacked_dir = part_dir / "unpacked"
        unpacked_dir.mkdir(parents=True, exist_ok=True)

        # 1. Untar the downloaded shards (cached via .done marker files).
        logger.info(f"--- [1/3] Extracting {part} ---")
        tar_files = list(part_dir.glob("*.tar"))
        for tar_path in tqdm(tar_files, desc="Extracting tar files"):
            extract_and_flatten_tar(tar_path, unpacked_dir)

        # 2. Create RecordingSet and apply lazy resampling (no audio is
        # rewritten here; resampling happens on load).
        logger.info(
            f"--- [2/3] Building RecordingSet (with lazy {target_sr}Hz resampling) ---"
        )
        audio_paths = list(unpacked_dir.glob("*.flac"))

        recordings = RecordingSet.from_recordings(
            Recording.from_file(p) for p in tqdm(audio_paths, desc="Parsing audio")
        )
        recordings = recordings.resample(target_sr)

        # 3. Create SupervisionSet from the GDrive JSON.
        # NOTE(review): each seglst entry is assumed to carry "session_id",
        # "start_time", "end_time", "words", "speaker" and optionally
        # "word_alignment" as [(word, start, end), ...] — schema of the
        # Parakeet v3 pseudo-label dump; confirm against the actual file.
        logger.info("--- [3/3] Building SupervisionSet ---")
        supervisions = []

        for idx, seg in tqdm(
            enumerate(label_data), total=len(label_data), desc="Parsing labels"
        ):
            rec_id = seg["session_id"]

            # Skip labels whose recording is not part of this split.
            if rec_id not in recordings:
                continue

            start = seg["start_time"]
            end = seg["end_time"]
            duration = round(end - start, 4)

            # Drop zero/negative-duration segments outright.
            if duration <= 0:
                logger.warning(
                    f"Skipped segment for rec: {rec_id} at {start} due to 0 duration"
                )
                continue

            alignments = []
            if "word_alignment" in seg:
                for w_text, w_start, w_end in seg["word_alignment"]:
                    # NOTE(review): word starts are stored relative to the
                    # segment start here — confirm this matches how downstream
                    # consumers expect AlignmentItem timing.
                    alignments.append(
                        AlignmentItem(
                            symbol=w_text,
                            start=round(w_start - start, 4),
                            duration=round(w_end - w_start, 4),
                        )
                    )

            supervisions.append(
                SupervisionSegment(
                    # idx (global position in label_data) keeps ids unique even
                    # when one recording has many segments.
                    id=f"{rec_id}-{idx}",
                    recording_id=rec_id,
                    start=start,
                    duration=duration,
                    channel=0,
                    text=seg["words"],
                    speaker=seg["speaker"],
                    language="en",
                    alignment={"word": alignments} if alignments else None,
                )
            )

        supervision_set = SupervisionSet.from_segments(supervisions)

        # Trim/drop supervisions so they agree with recording bounds.
        logger.info("Fixing and validating manifests...")
        recordings, supervision_set = fix_manifests(recordings, supervision_set)

        recordings_path = output_dir / f"oto_recordings_{part}.jsonl.gz"
        supervisions_path = output_dir / f"oto_supervisions_{part}.jsonl.gz"

        recordings.to_file(recordings_path)
        supervision_set.to_file(supervisions_path)

        logger.info(f"Saved to:\n - {recordings_path}\n - {supervisions_path}")

        manifests[part] = {"recordings": recordings, "supervisions": supervision_set}

    return dict(manifests)

0 commit comments

Comments
 (0)