@@ -65,7 +65,10 @@ def download_notsofar1(
6565 download_patterns .append (
6666 f"benchmark-datasets/{ subset_name } /{ version } /MTG/*/mc_*"
6767 )
68-
68+ elif mic == "ihm" :
69+ download_patterns .append (
70+ f"benchmark-datasets/{ subset_name } /{ version } /MTG/*/close_talk*"
71+ )
6972 snapshot_download (
7073 repo_id = "microsoft/NOTSOFAR" ,
7174 repo_type = "dataset" ,
@@ -97,7 +100,7 @@ def prepare_notsofar1(
97100
98101 for version in _listdir_safe (part_dir ):
99102 version_dir = part_dir / version / "MTG"
100- sc_cuts , mc_cuts = process_data (
103+ sc_cuts , mc_cuts , ihm_cuts = process_data (
101104 version_dir , word_level = False , create_word_alignment = True
102105 )
103106 manifests [part ][version ] = defaultdict (dict )
@@ -126,6 +129,17 @@ def prepare_notsofar1(
126129 "supervisions" : mc_sups ,
127130 }
128131
132+ if ihm_cuts :
133+ ihm_recs , ihm_sups = fix_manifests (
134+ * CutSet .from_cuts (ihm_cuts ).decompose ()[:2 ]
135+ )
136+ tag = f"notsofar1_ihm_{ part } _{ version } "
137+ ihm_recs .to_file (output_dir / f"{ tag } _recordings.jsonl.gz" )
138+ ihm_sups .to_file (output_dir / f"{ tag } _supervisions.jsonl.gz" )
139+ manifests [part ][version ]["close_talk" ] = {
140+ "recordings" : ihm_recs ,
141+ "supervisions" : ihm_sups ,
142+ }
129143 return manifests
130144
131145
@@ -139,26 +153,96 @@ def process_data(
139153 meetings = sorted (_listdir_safe (dataset_path ))
140154 sc_cuts = []
141155 mc_cuts = []
156+ ihm_cuts = []
142157
143158 for meeting in tqdm (meetings ):
144159 meeting_root = dataset_path / meeting
145160 transcription_path = meeting_root / "gt_transcription.json"
146161 devices = sorted (
147162 list (
148163 filter (
149- lambda x : x != "close_talk" and os .path .isdir (meeting_root / x ),
164+ lambda x : os .path .isdir (meeting_root / x ),
150165 _listdir_safe (meeting_root ),
151166 )
152167 )
153168 )
154169
170+ metadata_path = meeting_root / "gt_meeting_metadata.json"
171+ ct_device_to_speaker = {}
172+ if metadata_path .exists ():
173+ with open (metadata_path , "r" ) as f :
174+ metadata = json .load (f )
175+ alias_to_ct = metadata .get ("ParticipantAliasToCtDevice" , {})
176+ ct_device_to_speaker = {v : k for k , v in alias_to_ct .items ()}
177+
155178 with open (transcription_path , "r" ) as f :
156179 transcription_json = json .load (f )
157180
158181 for device in devices :
159182 device_path = meeting_root / device
160183 device_id = f"{ meeting } _{ device } "
161184 is_multi_channel = "mc" in device
185+ is_close_talk = "close_talk" in device
186+
187+ if is_close_talk :
188+ ct_wav_files = sorted (
189+ f for f in _listdir_safe (device_path ) if f .endswith (".wav" )
190+ )
191+ for ct_wav in ct_wav_files :
192+ ct_device_id = Path (ct_wav ).stem # e.g. "CT_21"
193+ speaker = ct_device_to_speaker .get (ct_device_id , ct_device_id )
194+ ct_recording = Recording .from_file (device_path / ct_wav )
195+ ct_recording_id = f"{ meeting } _close_talk_{ ct_device_id } "
196+ ct_recording .id = ct_recording_id
197+
198+ speaker_supervisions = []
199+ for segment in transcription_json :
200+ if segment ["speaker_id" ] != speaker :
201+ continue
202+ start_time = float (segment ["start_time" ])
203+ end_time = float (segment ["end_time" ])
204+ alignment = None
205+
206+ if create_word_alignment :
207+ alignment = {"word" : []}
208+ for alig_text , alig_start_time , alig_end_time in segment [
209+ "word_timing"
210+ ]:
211+ if "<" in alig_text or ">" in alig_text :
212+ continue
213+ alignment ["word" ].append (
214+ AlignmentItem (
215+ symbol = alig_text ,
216+ start = float (alig_start_time ),
217+ duration = float (alig_end_time )
218+ - float (alig_start_time ),
219+ )
220+ )
221+
222+ speaker_supervisions .append (
223+ SupervisionSegment (
224+ id = f"{ ct_recording_id } _{ str (int (start_time * 100 )).zfill (6 )} _{ str (int (end_time * 100 )).zfill (6 )} " ,
225+ recording_id = ct_recording_id ,
226+ start = start_time ,
227+ duration = end_time - start_time ,
228+ channel = 0 ,
229+ text = segment ["text" ],
230+ speaker = speaker ,
231+ alignment = alignment ,
232+ )
233+ )
234+
235+ ihm_cuts .append (
236+ MonoCut (
237+ id = ct_recording_id ,
238+ start = 0 ,
239+ duration = ct_recording .duration ,
240+ channel = 0 ,
241+ supervisions = speaker_supervisions ,
242+ recording = ct_recording ,
243+ )
244+ )
245+ continue # skip sc/mc append logic for ihm devices
162246 if is_multi_channel :
163247 # We assume the channel numbers range from 0 to num_channels - 1.
164248 num_channels = len (_listdir_safe (device_path ))
@@ -242,4 +326,4 @@ def process_data(
242326 )
243327 )
244328
245- return sc_cuts , mc_cuts
329+ return sc_cuts , mc_cuts , ihm_cuts
0 commit comments