Skip to content

Commit 5e563d9

Browse files
authored
Notsofar ihm recipe (#1551)
Added support for NOTSOFAR-1 IHM (individual headset microphone / close-talk) data preparation.
1 parent f2d2411 commit 5e563d9

File tree

2 files changed

+89
-5
lines changed

2 files changed

+89
-5
lines changed

lhotse/bin/modes/recipes/notsofar1.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def notsofar1(
4242
@click.option(
4343
"--mic",
4444
type=click.Choice(
45-
["sdm", "mdm"],
45+
["ihm", "sdm", "mdm"],
4646
case_sensitive=False,
4747
),
4848
default="sdm",

lhotse/recipes/notsofar1.py

Lines changed: 88 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,10 @@ def download_notsofar1(
6565
download_patterns.append(
6666
f"benchmark-datasets/{subset_name}/{version}/MTG/*/mc_*"
6767
)
68-
68+
elif mic == "ihm":
69+
download_patterns.append(
70+
f"benchmark-datasets/{subset_name}/{version}/MTG/*/close_talk*"
71+
)
6972
snapshot_download(
7073
repo_id="microsoft/NOTSOFAR",
7174
repo_type="dataset",
@@ -97,7 +100,7 @@ def prepare_notsofar1(
97100

98101
for version in _listdir_safe(part_dir):
99102
version_dir = part_dir / version / "MTG"
100-
sc_cuts, mc_cuts = process_data(
103+
sc_cuts, mc_cuts, ihm_cuts = process_data(
101104
version_dir, word_level=False, create_word_alignment=True
102105
)
103106
manifests[part][version] = defaultdict(dict)
@@ -126,6 +129,17 @@ def prepare_notsofar1(
126129
"supervisions": mc_sups,
127130
}
128131

132+
if ihm_cuts:
133+
ihm_recs, ihm_sups = fix_manifests(
134+
*CutSet.from_cuts(ihm_cuts).decompose()[:2]
135+
)
136+
tag = f"notsofar1_ihm_{part}_{version}"
137+
ihm_recs.to_file(output_dir / f"{tag}_recordings.jsonl.gz")
138+
ihm_sups.to_file(output_dir / f"{tag}_supervisions.jsonl.gz")
139+
manifests[part][version]["close_talk"] = {
140+
"recordings": ihm_recs,
141+
"supervisions": ihm_sups,
142+
}
129143
return manifests
130144

131145

@@ -139,26 +153,96 @@ def process_data(
139153
meetings = sorted(_listdir_safe(dataset_path))
140154
sc_cuts = []
141155
mc_cuts = []
156+
ihm_cuts = []
142157

143158
for meeting in tqdm(meetings):
144159
meeting_root = dataset_path / meeting
145160
transcription_path = meeting_root / "gt_transcription.json"
146161
devices = sorted(
147162
list(
148163
filter(
149-
lambda x: x != "close_talk" and os.path.isdir(meeting_root / x),
164+
lambda x: os.path.isdir(meeting_root / x),
150165
_listdir_safe(meeting_root),
151166
)
152167
)
153168
)
154169

170+
metadata_path = meeting_root / "gt_meeting_metadata.json"
171+
ct_device_to_speaker = {}
172+
if metadata_path.exists():
173+
with open(metadata_path, "r") as f:
174+
metadata = json.load(f)
175+
alias_to_ct = metadata.get("ParticipantAliasToCtDevice", {})
176+
ct_device_to_speaker = {v: k for k, v in alias_to_ct.items()}
177+
155178
with open(transcription_path, "r") as f:
156179
transcription_json = json.load(f)
157180

158181
for device in devices:
159182
device_path = meeting_root / device
160183
device_id = f"{meeting}_{device}"
161184
is_multi_channel = "mc" in device
185+
is_close_talk = "close_talk" in device
186+
187+
if is_close_talk:
188+
ct_wav_files = sorted(
189+
f for f in _listdir_safe(device_path) if f.endswith(".wav")
190+
)
191+
for ct_wav in ct_wav_files:
192+
ct_device_id = Path(ct_wav).stem # e.g. "CT_21"
193+
speaker = ct_device_to_speaker.get(ct_device_id, ct_device_id)
194+
ct_recording = Recording.from_file(device_path / ct_wav)
195+
ct_recording_id = f"{meeting}_close_talk_{ct_device_id}"
196+
ct_recording.id = ct_recording_id
197+
198+
speaker_supervisions = []
199+
for segment in transcription_json:
200+
if segment["speaker_id"] != speaker:
201+
continue
202+
start_time = float(segment["start_time"])
203+
end_time = float(segment["end_time"])
204+
alignment = None
205+
206+
if create_word_alignment:
207+
alignment = {"word": []}
208+
for alig_text, alig_start_time, alig_end_time in segment[
209+
"word_timing"
210+
]:
211+
if "<" in alig_text or ">" in alig_text:
212+
continue
213+
alignment["word"].append(
214+
AlignmentItem(
215+
symbol=alig_text,
216+
start=float(alig_start_time),
217+
duration=float(alig_end_time)
218+
- float(alig_start_time),
219+
)
220+
)
221+
222+
speaker_supervisions.append(
223+
SupervisionSegment(
224+
id=f"{ct_recording_id}_{str(int(start_time * 100)).zfill(6)}_{str(int(end_time * 100)).zfill(6)}",
225+
recording_id=ct_recording_id,
226+
start=start_time,
227+
duration=end_time - start_time,
228+
channel=0,
229+
text=segment["text"],
230+
speaker=speaker,
231+
alignment=alignment,
232+
)
233+
)
234+
235+
ihm_cuts.append(
236+
MonoCut(
237+
id=ct_recording_id,
238+
start=0,
239+
duration=ct_recording.duration,
240+
channel=0,
241+
supervisions=speaker_supervisions,
242+
recording=ct_recording,
243+
)
244+
)
245+
continue # skip sc/mc append logic for ihm devices
162246
if is_multi_channel:
163247
# We assume the channel numbers range from 0 to num_channels - 1.
164248
num_channels = len(_listdir_safe(device_path))
@@ -242,4 +326,4 @@ def process_data(
242326
)
243327
)
244328

245-
return sc_cuts, mc_cuts
329+
return sc_cuts, mc_cuts, ihm_cuts

0 commit comments

Comments (0)