diff --git a/audio/ffmpeg.py b/audio/ffmpeg.py
index 9809b8eb2..b5ce0331a 100644
--- a/audio/ffmpeg.py
+++ b/audio/ffmpeg.py
@@ -159,8 +159,9 @@ def run_recover_duration(self):
         c.load("temp_corpus.xml.gz")
 
         for r in c.all_recordings():
-            assert len(r.segments) == 1, "needs to be a single segment recording"
-            segment = r.segments[0]
+            recording_segments = list(r.segments)
+            assert len(recording_segments) == 1, "needs to be a single segment recording"
+            segment = recording_segments[0]
             old_duration = segment.end
             assert r.audio is not None
             data, sample_rate = soundfile.read(open(r.audio, "rb"))
diff --git a/corpus/data_augmentation.py b/corpus/data_augmentation.py
index 0be616aad..7de1a7820 100644
--- a/corpus/data_augmentation.py
+++ b/corpus/data_augmentation.py
@@ -77,7 +77,8 @@ def run(self):
             r.max_seg_end = max_seg_end
 
         # select noise files for each recording
-        for i, r in enumerate(c.recordings):
+        recording_list = list(c.recordings)
+        for i, r in enumerate(recording_list):
             audio_name = r.audio
             target_length = r.max_seg_end
             reverbed_audio_name = "noised_" + audio_name.split("/")[-1]
@@ -91,11 +92,11 @@ def run(self):
 
             noise_audios = []
             while noise_length < target_length:
-                random_index = rng.randint(0, len(c.recordings) - 1)
+                random_index = rng.randint(0, len(recording_list) - 1)
                 while random_index == i:
-                    random_index = random.randint(0, len(c.recordings) - 1)
-                noise_audios.append(c.recordings[random_index])
-                noise_length += c.recordings[random_index].max_seg_end
+                    random_index = random.randint(0, len(recording_list) - 1)
+                noise_audios.append(recording_list[random_index])
+                noise_length += recording_list[random_index].max_seg_end
 
             # create temp noise file
             temp_noise_track_file = "/dev/shm/{id}/tmp_concat_%i.wav" % n
@@ -134,15 +135,14 @@ def run(self):
             command = ffmpeg_head + noise_inputs + filter_head + volume_reduction + mixer + filter_tail
             self.sh(command)
 
-            nr = corpus.Recording()
+            nr = corpus.Recording(name=r.name, corpus=nc)
             nr.name = r.name
-            nr.segments = r.segments
             nr.speaker_name = r.speaker_name
             nr.default_speaker = r.default_speaker
             nr.speakers = r.speakers
             nr.audio = str(self.out_audio_folder) + "/" + reverbed_audio_name
-            nc.add_recording(nr)
-            for s in nr.segments:
+            for s in r.segments:
+                nr.add_segment(s)
                 segment_file_names.append(nc.name + "/" + nr.name + "/" + s.name + "\n")
 
         nc.dump(self.out_corpus.get_path())
@@ -203,15 +203,14 @@ def run(self):
                 "-ar {base_frequency} '{audio_out}/%s'" % (r.audio, perturbed_audio_name)
             )
 
-            pr = corpus.Recording()
+            pr = corpus.Recording(name=r.name, corpus=nc)
             pr.name = r.name
-            pr.segments = r.segments
             pr.speaker_name = r.speaker_name
             pr.speakers = r.speakers
             pr.default_speaker = r.default_speaker
             pr.audio = str(self.out_audio_folder) + "/" + perturbed_audio_name
-            nc.add_recording(pr)
-            for s in pr.segments:
+            for s in r.segments:
+                pr.add_segment(s)
                 segment_file_names.append(nc.name + "/" + pr.name + "/" + s.name)
                 s.start /= self.speed_factor
                 s.end /= self.speed_factor
diff --git a/corpus/filter.py b/corpus/filter.py
index fc374af17..054bd49a1 100644
--- a/corpus/filter.py
+++ b/corpus/filter.py
@@ -32,14 +32,12 @@ def _delete_empty_recordings(corpus: corpus.Corpus, removed_recordings_file: str
     :param c: Corpus for which to delete the empty recordings.
     :param removed_recordings_file: File in which to dump all recordings that have been deleted.
     """
-    to_delete = []
-    for rec in corpus.all_recordings():
-        if not rec.segments:
-            to_delete.append(rec)
-
-    corpus.remove_recordings(to_delete)
     with open(removed_recordings_file, "w") as f:
-        f.write("\n".join(rec.fullname() for rec in to_delete))
+        # Materialize first: removing a recording mutates the dict that all_recordings() iterates over.
+        for rec in list(corpus.all_recordings()):
+            if not rec.segments:
+                corpus.remove_recording(rec)
+                f.write(f"{rec.fullname()}\n")
 
 
 class FilterSegmentsByListJob(Job):
diff --git a/corpus/transform.py b/corpus/transform.py
index 348ac98df..9605329d3 100644
--- a/corpus/transform.py
+++ b/corpus/transform.py
@@ -70,22 +70,22 @@ def run(self):
             words = [s[1] for s in transcriptions[recording.name]]
 
             if len(words) == 0 and self.remove_empty_segments:
-                recordings_to_delete = recording
+                recordings_to_delete = recording  # TODO: this does nothing.
                 continue
 
             segments_to_delete = []
-            for idx, segment in enumerate(recording.segments):
+            for segment in recording.segments:
                 left_idx = bisect.bisect_left(times, segment.start)
                 right_idx = bisect.bisect_left(times, segment.end)
 
                 if left_idx == right_idx and self.remove_empty_segments:
-                    segments_to_delete.append(idx)
+                    segments_to_delete.append(segment)
                     continue
 
                 segment.orth = " ".join(words[left_idx:right_idx]).replace("&", "&")
 
-            for sidx in reversed(segments_to_delete):
-                del recording.segments[sidx]
+            for segment in reversed(segments_to_delete):
+                recording.remove_segment(segment)
 
         c.dump(self.output_corpus_path.get_path())
 
@@ -223,7 +223,7 @@ def run(self):
                     sm_entry.value = "/".join([c.name, split_name, segment.name])
                     sm.map_entries.append(sm_entry)
 
-                    new_recording_element.segments.append(segment)
+                    new_recording_element.add_segment(segment)
                     segment_count += 1
 
                 # update the time stamp with the recording length and add to ffmpeg merge list
@@ -547,15 +547,14 @@ def run(self):
         nc.speaker_name = c.speaker_name
         # store index of last segment
         for r in c.recordings:
-            sr = corpus.Recording()
+            sr = corpus.Recording(name=r.name, corpus=nc)
             sr.name = r.name
-            sr.segments = r.segments
             sr.speaker_name = r.speaker_name
             sr.speakers = r.speakers
             sr.default_speaker = r.default_speaker
             sr.audio = r.audio
-            nc.add_recording(sr)
-            for s in sr.segments:
+            for s in r.segments:
+                sr.add_segment(s)
                 segment_file_names.append(nc.name + "/" + sr.name + "/" + s.name)
                 s.start += self.shift
 
diff --git a/datasets/librispeech.py b/datasets/librispeech.py
index fca330ecb..2b8840ce5 100644
--- a/datasets/librispeech.py
+++ b/datasets/librispeech.py
@@ -136,22 +136,19 @@ def run(self):
         for transcript in self._transcripts:
             name = "{0}-{1}-{2:04d}".format(transcript["speaker_id"], transcript["chapter"], transcript["segment"])
 
-            recording = corpus.Recording()
+            recording = corpus.Recording(name=name, corpus=c)
             recording.name = name
             recording.speaker_name = transcript["speaker_id"]
             recording.audio = "{}/{}.flac".format(transcript["path"], name)
 
             used_speaker_ids.add(transcript["speaker_id"])
 
-            segment = corpus.Segment()
+            segment = corpus.Segment(name=name, recording=recording)
             segment.name = name
             segment.start = 0
             segment.end = float("inf")
             segment.orth = transcript["orth"].strip()
 
-            recording.segments.append(segment)
-            c.recordings.append(recording)
-
         for speaker_id, speaker_info in sorted(self._speakers.items()):
             if speaker_id not in used_speaker_ids:
                 continue
diff --git a/datasets/switchboard.py b/datasets/switchboard.py
index d66177509..1361375fa 100644
--- a/datasets/switchboard.py
+++ b/datasets/switchboard.py
@@ -262,7 +262,7 @@ def run(self):
             rec_to_segs.pop("sw02167B")
 
         for rec_name, segs in sorted(rec_to_segs.items()):
-            recording = corpus.Recording()
+            recording = corpus.Recording(name=rec_name, corpus=c)
             recording.name = rec_name
             recording.audio = os.path.join(self.audio_dir.get_path(), rec_name + ".wav")
 
@@ -272,7 +272,7 @@ def run(self):
             rec_speaker_id = rec_to_speaker[rec_name]["speaker_id"]
 
             for seg in segs:
-                segment = corpus.Segment()
+                segment = corpus.Segment(name=seg[0])
                 segment.name = seg[0]
                 segment.start = float(seg[1])
                 segment.end = float(seg[2])
@@ -281,9 +281,8 @@ def run(self):
                 if len(segment.orth) == 0:
                     continue
 
-                recording.segments.append(segment)
-            c.recordings.append(recording)
+                recording.add_segment(segment)
 
         # add speakers to corpus
         for speaker_info in rec_to_speaker.values():
             speaker = corpus.Speaker()
diff --git a/lib/corpus.py b/lib/corpus.py
index 6e0b6c37e..61b73c09a 100644
--- a/lib/corpus.py
+++ b/lib/corpus.py
@@ -21,9 +21,12 @@
 
 
 class NamedEntity:
-    def __init__(self):
+    def __init__(self, name: Optional[str] = None):
+        """
+        :param name: Name of the entity.
+        """
         super().__init__()
-        self.name: Optional[str] = None
+        self.name = name
 
     def __repr__(self):
         if self.name is None:
@@ -45,7 +48,7 @@ class CorpusParser(sax.handler.ContentHandler):
     """
     This classes methods are called by the sax-parser whenever it encounters an event in the xml-file
     (tags/characters/namespaces/...). It uses a stack of elements to remember the part of the corpus that
-    is currently beeing read.
+    is currently being read.
     """
 
     def __init__(self, corpus: Corpus, path: str, *, reformat_orth: bool = True):
@@ -74,33 +77,24 @@ def startElement(self, name: str, attrs: Dict[str, str]):
             e.name = attrs["name"]
         elif name == "subcorpus":
             assert isinstance(e, Corpus), " may only occur within a or element"
-            subcorpus = Corpus()
-            subcorpus.name = attrs["name"]
-            subcorpus.parent_corpus = e
-            e.subcorpora.append(subcorpus)
+            subcorpus = Corpus(name=attrs["name"])
+            e.add_subcorpus(subcorpus)
             self.elements.append(subcorpus)
         elif name == "include":
             assert isinstance(e, Corpus), " may only occur within a or element"
-            path = os.path.join(os.path.dirname(self.path), attrs["file"])
-            c = Corpus()
-            c.load(path)
+            c = Corpus(load_from=os.path.join(os.path.dirname(self.path), attrs["file"]))
             if c.name != e.name:
                 print(
                     "Warning: included corpus (%s) has a different name than the current corpus (%s)" % (c.name, e.name)
                 )
             for sc in c.subcorpora:
-                sc.parent_corpus = e.parent_corpus
+                e.add_subcorpus(sc)
             for r in c.recordings:
-                r.corpus = e
-            e.subcorpora.extend(c.subcorpora)
-            e.recordings.extend(c.recordings)
+                e.add_recording(r)
             e.speakers.update(c.speakers)
         elif name == "recording":
             assert isinstance(e, Corpus), " may only occur within a or element"
-            rec = Recording()
-            rec.name = attrs["name"]
-            rec.audio = attrs["audio"]
-            e.add_recording(rec)
+            rec = Recording(name=attrs["name"], audio=attrs["audio"], corpus=e)
             self.elements.append(rec)
         elif name == "segment":
             assert isinstance(e, Recording), " may only occur within a element"
@@ -168,13 +162,67 @@ class Corpus(NamedEntity, CorpusSection):
     attribute is set. Corpora with include statements can be read but are written back as a single file.
     """
 
-    def __init__(self):
-        super().__init__()
+    name: Optional[str]
+    parent_corpus: Optional[Corpus]
+    _recordings: Dict[str, Recording]  # recording-name: Recording
+    _subcorpora: Dict[str, Corpus]  # corpus-name: Corpus
+
+    def __init__(
+        self, name: Optional[str] = None, *, parent_corpus: Optional[Corpus] = None, load_from: Optional[str] = None
+    ):
+        """
+        :param name: Corpus name.
+        :param parent_corpus: If provided, `self` will be directly linked as a subcorpus of :param:`parent_corpus`.
+        :param load_from: If provided, :func:`Corpus.load` will be directly run with this parameter.
+        """
+        super().__init__(name=name)
+
+        self._subcorpora = {}
+        self._recordings = {}
+
+        if parent_corpus is not None:
+            parent_corpus.add_subcorpus(self)
+        else:
+            self.parent_corpus = None
+
+        if load_from:
+            self.load(load_from)
+
+    @property
+    def subcorpora(self) -> Iterable[Corpus]:
+        """
+        :return: Iterable of all top-level subcorpora.
+        """
+        return self._subcorpora.values()
+
+    @subcorpora.setter
+    def subcorpora(self, value: List[Corpus]):
+        """
+        :param value: List of subcorpora that the recording must hold.
+            The previous subcorpora will be overwritten.
+        """
+        assert isinstance(value, list) and all(isinstance(c, Corpus) for c in value), (
+            f"Can only set Corpus.subcorpora to a list, but tried setting it to {type(value)}."
+        )
+        self._subcorpora = {c.name: c for c in value}
 
-        self.parent_corpus: Optional[Corpus] = None
+    @property
+    def recordings(self) -> Iterable[Recording]:
+        """
+        :return: Iterable of all top-level recordings.
+        """
+        return self._recordings.values()
 
-        self.subcorpora: List[Corpus] = []
-        self.recordings: List[Recording] = []
+    @recordings.setter
+    def recordings(self, value: List[Recording]):
+        """
+        :param value: List of recordings that the corpus must hold.
+            The previous recordings will be overwritten.
+        """
+        assert isinstance(value, list) and all(isinstance(r, Recording) for r in value), (
+            f"Can only set Corpus.recordings to a list, but tried setting it to {type(value)}."
+        )
+        self._recordings = {r.name: r for r in value}
 
     def segments(self) -> Iterable[Segment]:
         """
@@ -187,21 +235,47 @@ def segments(self) -> Iterable[Segment]:
 
     def get_recording_by_name(self, name: str) -> Recording:
         """
-        :return: the recording specified by its name
+        :return: the recording specified by its name relative to `self`.
         """
-        for rec in self.all_recordings():
-            if rec.fullname() == name:
-                return rec
-        assert False, f"Recording '{name}' was not found in corpus"
+        if "/" not in name:
+            assert name in self._recordings, f"Recording '{name}' was not found in corpus."
+            return self._recordings[name]
+        else:
+            subcorpus_name, recording_relative_name = name.split("/", maxsplit=1)
+            return self.get_subcorpus_by_name(subcorpus_name).get_recording_by_name(recording_relative_name)
+
+    def get_subcorpus_by_name(self, name: str) -> Corpus:
+        """
+        :return: The corpus specified by its name relative to `self`.
+        """
+        if "/" not in name:
+            assert name in self._subcorpora, f"Subcorpus '{name}' was not found in corpus."
+            return self._subcorpora[name]
+        else:
+            subcorpus_name, subcorpus_relative_name = name.split("/", maxsplit=1)
+            return self.get_subcorpus_by_name(subcorpus_name).get_subcorpus_by_name(subcorpus_relative_name)
 
     def get_segment_by_name(self, name: str) -> Segment:
         """
-        :return: the segment specified by its name
+        :param name: Segment name relative to the corpus.
+            Note that it must be at least two levels deep, to also include the recording name.
+            Example: `my_recording/my_segment`.
+        :return: the segment specified by its name relative to `self`.
         """
-        for seg in self.segments():
-            if seg.fullname() == name:
-                return seg
-        assert False, f"Segment '{name}' was not found in corpus"
+        assert "/" in name, (
+            "When running Corpus.get_segment_by_name(), at least two levels of depth 'recording/segment' "
+            "must be provided, separated with '/'."
+        )
+        recording_name, segment_name = name.split("/", maxsplit=1)
+        if recording_name in self._recordings:
+            return self.get_recording_by_name(recording_name).get_segment_by_name(segment_name)
+        else:
+            # The first part is the subcorpus, and the second is the rest of the segment.
+            subcorpus_name, segment_relative_name = recording_name, segment_name
+            assert subcorpus_name in self._subcorpora, (
+                f"When searching for segment '{name}', recording '{recording_name}' was not found in corpus."
+            )
+            return self.get_subcorpus_by_name(subcorpus_name).get_segment_by_name(segment_relative_name)
 
     def all_recordings(self) -> Iterable[Recording]:
         yield from self.recordings
@@ -223,37 +297,38 @@ def top_level_speakers(self) -> Iterable[Speaker]:
         yield from self.speakers.values()
 
     def remove_recording(self, recording: Recording):
-        to_delete = []
-        for idx, r in enumerate(self.recordings):
-            if r is recording or r == recording or r.name == recording:
-                to_delete.append(idx)
-        for idx in reversed(to_delete):
-            del self.recordings[idx]
-        for sc in self.subcorpora:
+        if self._recordings.get(recording.name) is recording:
+            del self._recordings[recording.name]
+        for sc in self.subcorpora:
             sc.remove_recording(recording)
 
     def remove_recordings(self, recordings: List[Recording]):
-        recording_fullnames = {recording.fullname() for recording in recordings}
-        to_delete = []
-        for idx, r in enumerate(self.recordings):
-            if r.fullname() in recording_fullnames:
-                to_delete.append(idx)
-        for idx in reversed(to_delete):
-            del self.recordings[idx]
-        for sc in self.subcorpora:
-            sc.remove_recordings(recordings)
+        for r in recordings:
+            self.remove_recording(r)
 
     def add_recording(self, recording: Recording):
+        assert recording.name not in self._recordings, (
+            f"Tried to add recording {recording.name} to corpus {self.fullname()}, "
+            "but the recording is already contained in the corpus."
+        )
         assert isinstance(recording, Recording)
         recording.corpus = self
-        self.recordings.append(recording)
+        self._recordings[recording.name] = recording
 
     def add_subcorpus(self, corpus: Corpus):
+        assert corpus.name not in self._subcorpora, (
+            f"Tried to add subcorpus {corpus.name} to corpus {self.fullname()}, "
+            "but the subcorpus is already contained in the corpus."
+        )
         assert isinstance(corpus, Corpus)
         corpus.parent_corpus = self
-        self.subcorpora.append(corpus)
+        self._subcorpora[corpus.name] = corpus
 
     def add_speaker(self, speaker: Speaker):
+        assert speaker.name not in self.speakers, (
+            f"Tried to add speaker {speaker.name} to corpus {self.fullname()}, "
+            "but the speaker is already contained in the corpus."
+        )
         assert isinstance(speaker, Speaker)
         self.speakers[speaker.name] = speaker
 
@@ -281,10 +356,10 @@ def filter_segments(self, filter_function: FilterFunction):
         filter all segments (including in subcorpora) using filter_function
         :param filter_function: takes arguments corpus, recording and segment, returns True if segment should be kept
         """
-        for r in self.recordings:
-            r.segments = [s for s in r.segments if filter_function(self, r, s)]
-        for sc in self.subcorpora:
-            sc.filter_segments(filter_function)
+        for r in self._recordings.values():
+            r._segments = {s.name: s for s in r.segments if filter_function(self, r, s)}
+        for sc in self._subcorpora.values():
+            sc.filter_segments(filter_function)
 
     def load(self, path: str, *, reformat_orth: bool = True):
         """
@@ -350,13 +425,53 @@ def __repr__(self):
 
 
 class Recording(NamedEntity, CorpusSection):
-    def __init__(self):
-        super().__init__()
-        self.audio: Optional[str] = None
-        self.corpus: Optional[Corpus] = None
-        self.segments: List[Segment] = []
+    """
+    This class represents a recording in Bliss format.
+    """
+
+    name: Optional[str]
+    audio: Optional[str]
+    _segments: Dict[str, Segment]
+    corpus: Optional[Corpus]
+
+    def __init__(self, name: Optional[str] = None, *, audio: Optional[str] = None, corpus: Optional[Corpus] = None):
+        """
+        :param name: Recording name.
+        :param audio: Actual path to the audio file which contains the playable media.
+        :param corpus: If provided, `self` will be directly linked as a recording of :param:`corpus`.
+        """
+        super().__init__(name=name)
+
+        self.audio = audio
+        self._segments = {}
+
+        if corpus is not None:
+            corpus.add_recording(self)
+        else:
+            self.corpus = None
+
+    @property
+    def segments(self) -> Iterable[Segment]:
+        """
+        :return: Iterable of all segments in a recording.
+        """
+        return self._segments.values()
+
+    @segments.setter
+    def segments(self, value: List[Segment]):
+        """
+        :param value: List of segments that the recording must hold.
+            The previous segments will be overwritten.
+        """
+        assert isinstance(value, list) and all(isinstance(s, Segment) for s in value), (
+            f"Can only set Recording.segments to a list, but tried setting it to {type(value)}."
+        )
+        self._segments = {s.name: s for s in value}
 
     def fullname(self) -> str:
+        assert self.corpus is not None, (
+            "Please add the recording to a corpus via Corpus.add_recording() before triggering fullname()."
+        )
         return self.corpus.fullname() + "/" + self.name
 
     def speaker(self, speaker_name: Optional[str] = None) -> Speaker:
@@ -380,24 +495,60 @@ def dump(self, out: TextIO, indentation: str = ""):
 
         out.write("%s\n" % indentation)
 
+    def get_segment_by_name(self, name: str) -> Segment:
+        """
+        :param name: Name of the segment relative to this recording. It must not contain "/".
+
+        :return: Segment stored in this recording under :param:`name`.
+        """
+        assert "/" not in name, (
+            "Depth levels 'recording/segment' are not supported for Recording.get_segment_by_name(). "
+            "Use Corpus.get_segment_by_name() instead."
+        )
+        assert name in self._segments, f"Segment '{name}' was not found in recording '{self.name}'"
+        return self._segments[name]
+
     def add_segment(self, segment: Segment):
+        assert segment.name not in self._segments, (
+            f"Tried to add segment {segment.name} to recording {self.fullname()}, "
+            "but the segment is already contained in the recording."
+        )
         assert isinstance(segment, Segment)
         segment.recording = self
-        self.segments.append(segment)
+        self._segments[segment.name] = segment
+
+    def remove_segment(self, segment: Segment):
+        assert segment.name in self._segments, f"Segment '{segment.name}' was not found in recording '{self.name}'"
+        del self._segments[segment.name]
 
     def get_segment_mapping(self) -> Dict[str, Segment]:
         """
         :return: Mapping from segment fullnames to actual segments.
         """
         return {seg.fullname(): seg for seg in self.segments}
 
     def __repr__(self):
         return f"<{self.__class__.__name__} {self.fullname()}>"
 
 
 class Segment(NamedEntity):
+    """
+    This class represents a segment in Bliss format.
+    """
+
+    name: Optional[str]
+    start: Optional[float]
+    end: Optional[float]
+    track: Optional[int]
+    orth: Optional[str]
+    left_context_orth: Optional[str]
+    right_context_orth: Optional[str]
+    speaker_name: Optional[str]
+    recording: Optional[Recording]
+
     def __init__(
         self,
+        name: Optional[str] = None,
         *,
         start: float = 0.0,
         end: float = 0.0,
@@ -409,6 +560,7 @@ def __init__(
         recording: Optional[Recording] = None,
     ):
         """
+        :param name: Segment name.
         :param start: Segment start.
         :param end: Segment end.
         :param track: Segment track/channel.
@@ -416,9 +568,9 @@ def __init__(
         :param left_context_orth: Optional left context when aligning (specific for RASR alignment).
         :param right_context_orth: Optional right context when aligning (specific for RASR alignment).
         :param speaker_name: Speaker name.
-        :param recording: Recording in which the segment is embedded.
+        :param recording: If provided, `self` will be directly added to the segments in :param:`recording`.
         """
-        super().__init__()
+        super().__init__(name=name)
 
         self.start = start
         self.end = end
@@ -428,7 +580,10 @@ def __init__(
         self.right_context_orth = right_context_orth
         self.speaker_name = speaker_name
 
-        self.recording = recording
+        if recording is not None:
+            recording.add_segment(self)
+        else:
+            self.recording = None
 
     def full_orth(self) -> str:
         """
@@ -437,6 +592,9 @@ def full_orth(self) -> str:
         return " ".join([s for s in [self.left_context_orth, self.orth, self.right_context_orth] if s])
 
     def fullname(self) -> str:
+        assert self.recording is not None, (
+            "Please add the segment to a recording via Recording.add_segment() before triggering Segment.fullname()."
+        )
         return self.recording.fullname() + "/" + self.name
 
     def speaker(self) -> Speaker:
diff --git a/tests/job_tests/corpus/test_transform.py b/tests/job_tests/corpus/test_transform.py
index 9b71e54c6..036d0b442 100644
--- a/tests/job_tests/corpus/test_transform.py
+++ b/tests/job_tests/corpus/test_transform.py
@@ -26,17 +26,13 @@ def _create_corpus_with_structure(corpus_dict: Dict[str, Any]) -> libcorpus.Corp
         :param corpus_dict: Definition of a corpus in dictionary form.
         :return: Corpus object defined by the corpus dictionary provided.
         """
-        corpus = libcorpus.Corpus()
-        corpus.name = corpus_dict["name"]
+        corpus = libcorpus.Corpus(name=corpus_dict["name"])
         for recording_dict in corpus_dict.get("recordings", []):
-            recording = libcorpus.Recording()
-            recording.name = recording_dict["name"]
+            recording = libcorpus.Recording(name=recording_dict["name"])
+            corpus.add_recording(recording)
             for segment_name in recording_dict.get("segments", []):
-                segment = libcorpus.Segment()
-                segment.name = segment_name
-                segment.orth = ""
+                segment = libcorpus.Segment(name=segment_name, orth="")
                 recording.add_segment(segment)
-            corpus.add_recording(recording)
 
         for subcorpus_dict in corpus_dict.get("subcorpora", []):
             corpus.add_subcorpus(_CorpusCreatorHelper._create_corpus_with_structure(subcorpus_dict))